From 539855b7dea1403b03fa0bd7d4130cf1b1e3c083 Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 25 Aug 2025 02:37:09 +0800 Subject: [PATCH 1/6] feat: add fp8 optimization support for transformer model - Implement fp8 quantization utilities for linear layers - Add fp8 optimization option to gradio demo interface - Modify worker function to handle fp8 optimized state dict - Include monkey patching for fp8 linear layer forward pass --- .gitignore | 1 + demo_gradio_f1.py | 44 +++-- utils/fp8_optimization_utils.py | 277 ++++++++++++++++++++++++++++++++ 3 files changed, 312 insertions(+), 10 deletions(-) create mode 100644 utils/fp8_optimization_utils.py diff --git a/.gitignore b/.gitignore index e83dd2c0..57fe59e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +hf_download hf_download/ outputs/ repo/ diff --git a/demo_gradio_f1.py b/demo_gradio_f1.py index e9120edc..47bf98a4 100644 --- a/demo_gradio_f1.py +++ b/demo_gradio_f1.py @@ -30,6 +30,7 @@ from diffusers_helper.clip_vision import hf_clip_vision_encode from diffusers_helper.bucket_tools import find_nearest_bucket from utils.lora_utils import merge_lora_to_state_dict +from utils.fp8_optimization_utils import optimize_state_dict_with_fp8, apply_fp8_monkey_patch parser = argparse.ArgumentParser() @@ -68,6 +69,7 @@ transformer_dtype = torch.bfloat16 previous_lora_file = None previous_lora_multiplier = None +previous_fp8_optimization = None vae.eval() text_encoder.eval() @@ -104,12 +106,13 @@ @torch.no_grad() -def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): - global transformer, previous_lora_file, previous_lora_multiplier +def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): + global transformer, 
previous_lora_file, previous_lora_multiplier, previous_fp8_optimization model_changed = transformer is None or ( lora_file != previous_lora_file or lora_multiplier != previous_lora_multiplier + or fp8_optimization != previous_fp8_optimization ) total_latent_sections = (total_second_length * 24) / (latent_window_size * 4) @@ -195,6 +198,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind previous_lora_file = lora_file previous_lora_multiplier = lora_multiplier + previous_fp8_optimization = fp8_optimization transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu() transformer.eval() @@ -204,13 +208,32 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind transformer.to(dtype=torch.bfloat16) transformer.requires_grad_(False) - if lora_file is not None: + if lora_file is not None or fp8_optimization: state_dict = transformer.state_dict() - print(f"Merging LoRA file {os.path.basename(lora_file)} ...") - state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) - gc.collect() + + # LoRA should be merged before fp8 optimization + if lora_file is not None: + # TODO It would be better to merge the LoRA into the state dict before creating the transformer instance. + # Use from_config() instead of from_pretrained to make the instance without loading. 
+ + print(f"Merging LoRA file {os.path.basename(lora_file)} ...") + state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) + gc.collect() + + if fp8_optimization: + TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"] + EXCLUDE_KEYS = ["norm"] # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8 + + # inplace optimization + print("Optimizing for fp8") + state_dict = optimize_state_dict_with_fp8(state_dict, gpu, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=False) + + # apply monkey patching + apply_fp8_monkey_patch(transformer, state_dict, use_scaled_mm=False) + gc.collect() + info = transformer.load_state_dict(state_dict, strict=True, assign=True) - print(f"LoRA applied: {info}") + print(f"LoRA and/or fp8 optimization applied: {info}") if not high_vram: DynamicSwapInstaller.install_model(transformer, device=gpu) @@ -341,7 +364,7 @@ def callback(d): return -def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): +def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): global stream assert input_image is not None, 'No input image!' 
@@ -349,7 +372,7 @@ def process(input_image, prompt, n_prompt, seed, total_second_length, latent_win stream = AsyncStream() - async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier) + async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization) output_filename = None @@ -418,6 +441,7 @@ def end_process(): with gr.Group(): lora_file = gr.File(label="LoRA File", file_count="single", type="filepath") lora_multiplier = gr.Slider(label="LoRA Multiplier", minimum=0.0, maximum=1.0, value=0.8, step=0.1) + fp8_optimization = gr.Checkbox(label="FP8 Optimization", value=False) with gr.Column(): preview_image = gr.Image(label="Next Latents", height=200, visible=False) @@ -427,7 +451,7 @@ def end_process(): gr.HTML('
Share your results and find ideas at the FramePack Twitter (X) thread
') - ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier] + ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization] start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) end_button.click(fn=end_process) diff --git a/utils/fp8_optimization_utils.py b/utils/fp8_optimization_utils.py new file mode 100644 index 00000000..27b47e4e --- /dev/null +++ b/utils/fp8_optimization_utils.py @@ -0,0 +1,277 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from tqdm import tqdm + + +def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1): + """ + Calculate the maximum representable value in FP8 format. + Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign). + + Args: + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + sign_bits (int): Number of sign bits (0 or 1) + + Returns: + float: Maximum value representable in FP8 format + """ + assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8" + + # Calculate exponent bias + bias = 2 ** (exp_bits - 1) - 1 + + # Calculate maximum mantissa value + mantissa_max = 1.0 + for i in range(mantissa_bits - 1): + mantissa_max += 2 ** -(i + 1) + + # Calculate maximum value + max_value = mantissa_max * (2 ** (2**exp_bits - 1 - bias)) + + return max_value + + +def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None): + """ + Quantize a tensor to FP8 format. 
+ + Args: + tensor (torch.Tensor): Tensor to quantize + scale (float or torch.Tensor): Scale factor + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + sign_bits (int): Number of sign bits + + Returns: + tuple: (quantized_tensor, scale_factor) + """ + # Create scaled tensor + scaled_tensor = tensor / scale + + # Calculate FP8 parameters + bias = 2 ** (exp_bits - 1) - 1 + + if max_value is None: + # Calculate max and min values + max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits) + min_value = -max_value if sign_bits > 0 else 0.0 + + # Clamp tensor to range + clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value) + + # Quantization process + abs_values = torch.abs(clamped_tensor) + nonzero_mask = abs_values > 0 + + # Calculate logF scales (only for non-zero elements) + log_scales = torch.zeros_like(clamped_tensor) + if nonzero_mask.any(): + log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach() + + # Limit log scales and calculate quantization factor + log_scales = torch.clamp(log_scales, min=1.0) + quant_factor = 2.0 ** (log_scales - mantissa_bits - bias) + + # Quantize and dequantize + quantized = torch.round(clamped_tensor / quant_factor) * quant_factor + + return quantized, scale + + +def optimize_state_dict_with_fp8( + state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False +): + """ + Optimize Linear layer weights in a model's state dict to FP8 format. 
+ + Args: + state_dict (dict): State dict to optimize, replaced in-place + calc_device (str): Device to quantize tensors on + target_layer_keys (list, optional): Layer key patterns to target (None for all Linear layers) + exclude_layer_keys (list, optional): Layer key patterns to exclude + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + move_to_device (bool): Move optimized tensors to the calculating device + + Returns: + dict: FP8 optimized state dict + """ + if exp_bits == 4 and mantissa_bits == 3: + fp8_dtype = torch.float8_e4m3fn + elif exp_bits == 5 and mantissa_bits == 2: + fp8_dtype = torch.float8_e5m2 + else: + raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}") + + # Calculate FP8 max value + max_value = calculate_fp8_maxval(exp_bits, mantissa_bits) + min_value = -max_value # this function supports only signed FP8 + + # Create optimized state dict + optimized_count = 0 + + # Enumerate target keys + target_state_dict_keys = [] + for key in state_dict.keys(): + # Check if it's a weight key and matches target patterns + is_target = (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys)) and key.endswith(".weight") + is_excluded = exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys) + is_target = is_target and not is_excluded + + if is_target and isinstance(state_dict[key], torch.Tensor): + target_state_dict_keys.append(key) + + # Process each key + for key in tqdm(target_state_dict_keys): + value = state_dict[key] + + # Save original device and dtype + original_device = value.device + original_dtype = value.dtype + + # Move to calculation device + if calc_device is not None: + value = value.to(calc_device) + + # Calculate scale factor + scale = torch.max(torch.abs(value.flatten())) / max_value + # print(f"Optimizing {key} with scale: {scale}") + + # Quantize weight to FP8 + quantized_weight, _ = quantize_tensor_to_fp8(value, scale, 
exp_bits, mantissa_bits, 1, max_value, min_value) + + # Add to state dict using original key for weight and new key for scale + fp8_key = key # Maintain original key + scale_key = key.replace(".weight", ".scale_weight") + + quantized_weight = quantized_weight.to(fp8_dtype) + + if not move_to_device: + quantized_weight = quantized_weight.to(original_device) + + scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device) + + state_dict[fp8_key] = quantized_weight + state_dict[scale_key] = scale_tensor + + optimized_count += 1 + + if calc_device is not None: # optimized_count % 10 == 0 and + # free memory on calculation device + torch.cuda.empty_cache() # TODO check device type + + print(f"Number of optimized Linear layers: {optimized_count}") + return state_dict + + +def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None): + """ + Patched forward method for Linear layers with FP8 weights. + + Args: + self: Linear layer instance + x (torch.Tensor): Input tensor + use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series) + max_value (float): Maximum value for FP8 quantization. If None, no quantization is applied for input tensor. 
+ + Returns: + torch.Tensor: Result of linear transformation + """ + if use_scaled_mm: + input_dtype = x.dtype + original_weight_dtype = self.scale_weight.dtype + weight_dtype = self.weight.dtype + target_dtype = torch.float8_e5m2 + assert weight_dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported" + assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)" + + if max_value is None: + # no input quantization + scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device) + else: + # calculate scale factor for input tensor + scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32) + + # quantize input tensor to FP8: this seems to consume a lot of memory + x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value) + + original_shape = x.shape + x = x.reshape(-1, x.shape[2]).to(target_dtype) + + weight = self.weight.t() + scale_weight = self.scale_weight.to(torch.float32) + + if self.bias is not None: + # float32 is not supported with bias in scaled_mm + o = torch._scaled_mm(x, weight, out_dtype=original_weight_dtype, bias=self.bias, scale_a=scale_x, scale_b=scale_weight) + else: + o = torch._scaled_mm(x, weight, out_dtype=input_dtype, scale_a=scale_x, scale_b=scale_weight) + + return o.reshape(original_shape[0], original_shape[1], -1).to(input_dtype) + + else: + # Dequantize the weight + original_dtype = self.scale_weight.dtype + dequantized_weight = self.weight.to(original_dtype) * self.scale_weight + + # Perform linear transformation + if self.bias is not None: + output = F.linear(x, dequantized_weight, self.bias) + else: + output = F.linear(x, dequantized_weight) + + return output + + +def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False): + """ + Apply monkey patching to a model using FP8 optimized state dict. 
+ + Args: + model (nn.Module): Model instance to patch + optimized_state_dict (dict): FP8 optimized state dict + use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series) + + Returns: + nn.Module: The patched model (same instance, modified in-place) + """ + # # Calculate FP8 float8_e5m2 max value + # max_value = calculate_fp8_maxval(5, 2) + max_value = None # do not quantize input tensor + + # Find all scale keys to identify FP8-optimized layers + scale_keys = [k for k in optimized_state_dict.keys() if k.endswith(".scale_weight")] + + # Enumerate patched layers + patched_module_paths = set() + for scale_key in scale_keys: + # Extract module path from scale key (remove .scale_weight) + module_path = scale_key.rsplit(".scale_weight", 1)[0] + patched_module_paths.add(module_path) + + patched_count = 0 + + # Apply monkey patch to each layer with FP8 weights + for name, module in model.named_modules(): + # Check if this module has a corresponding scale_weight + has_scale = name in patched_module_paths + + # Apply patch if it's a Linear layer with FP8 scale + if isinstance(module, nn.Linear) and has_scale: + # register the scale_weight as a buffer to load the state_dict + module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype)) + + # Create a new forward method with the patched version. + def new_forward(self, x): + return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value) + + # Bind method to module + module.forward = new_forward.__get__(module, type(module)) + + patched_count += 1 + + print(f"Number of monkey-patched Linear layers: {patched_count}") + return model From 49505e7feec97e262a939a5eee5892eba7492c7a Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 25 Aug 2025 03:39:02 +0800 Subject: [PATCH 2/6] feat(demo): add offline mode support for model loading Add --offline flag to load models from local cache instead of downloading from HuggingFace hub. 
This enables usage in environments with restricted internet access. --- demo_gradio.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/demo_gradio.py b/demo_gradio.py index 114aa323..4a877d95 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -38,6 +38,7 @@ parser.add_argument("--port", type=int, required=False) parser.add_argument("--inbrowser", action='store_true') parser.add_argument("--output_dir", type=str, default='./outputs') +parser.add_argument("--offline", action='store_true') args = parser.parse_args() # for win desktop probably use --server 127.0.0.1 --inbrowser @@ -54,14 +55,31 @@ print(f'Free VRAM {free_mem_gb} GB') print(f'High-VRAM Mode: {high_vram}') -text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() -text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() -tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') -tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2') -vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() - -feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') -image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() +if args.offline: + HF_CACHE_HUB_PATH = os.path.join(os.environ['HF_HOME'], 'hub') + HUNYUAN_VIDEO_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--hunyuanvideo-community--HunyuanVideo') + FLUX_REDUX_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--lllyasviel--flux_redux_bfl') + FRAMEPACK_I2V_HY_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 
'models--lllyasviel--FramePackI2V_HY') + + text_encoder = LlamaModel.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'text_encoder'), torch_dtype=torch.float16).cpu() + text_encoder_2 = CLIPTextModel.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'text_encoder_2'), torch_dtype=torch.float16).cpu() + tokenizer = LlamaTokenizerFast.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'tokenizer')) + tokenizer_2 = CLIPTokenizer.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'tokenizer_2')) + vae = AutoencoderKLHunyuanVideo.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'vae'), torch_dtype=torch.float16).cpu() + + feature_extractor = SiglipImageProcessor.from_pretrained(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots'))[0], 'feature_extractor')) + image_encoder = SiglipVisionModel.from_pretrained(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots'))[0], 'image_encoder'), torch_dtype=torch.float16).cpu() + + transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(os.path.join(FRAMEPACK_I2V_HY_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FRAMEPACK_I2V_HY_LOCAL_PATH, 'snapshots'))[0]), torch_dtype=torch.bfloat16).cpu() +else: + text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() + text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() + 
tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') + tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2') + vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() + + feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') + image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() transformer = None # load later transformer_dtype = torch.bfloat16 From f27bd4e1d306afebb45bd3f150990ac2b8b67367 Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 25 Aug 2025 03:53:58 +0800 Subject: [PATCH 3/6] fix: default offline --- demo_gradio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo_gradio.py b/demo_gradio.py index 4a877d95..5b5e4612 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -38,7 +38,7 @@ parser.add_argument("--port", type=int, required=False) parser.add_argument("--inbrowser", action='store_true') parser.add_argument("--output_dir", type=str, default='./outputs') -parser.add_argument("--offline", action='store_true') +parser.add_argument("--offline", default=True, action='store_true') args = parser.parse_args() # for win desktop probably use --server 127.0.0.1 --inbrowser From cc8735dfde6a7b292fff7e28840db2f6f1f33f20 Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 25 Aug 2025 04:05:44 +0800 Subject: [PATCH 4/6] fix: demo_gradio.py fp8 --- demo_gradio.py | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/demo_gradio.py b/demo_gradio.py index 5b5e4612..8496cfce 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -30,6 +30,7 @@ from diffusers_helper.clip_vision import hf_clip_vision_encode from diffusers_helper.bucket_tools import 
find_nearest_bucket from utils.lora_utils import merge_lora_to_state_dict +from utils.fp8_optimization_utils import optimize_state_dict_with_fp8, apply_fp8_monkey_patch parser = argparse.ArgumentParser() @@ -85,6 +86,7 @@ transformer_dtype = torch.bfloat16 previous_lora_file = None previous_lora_multiplier = None +previous_fp8_optimization = None vae.eval() text_encoder.eval() @@ -121,12 +123,13 @@ @torch.no_grad() -def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): - global transformer, previous_lora_file, previous_lora_multiplier +def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): + global transformer, previous_lora_file, previous_lora_multiplier, previous_fp8_optimization model_changed = transformer is None or ( lora_file != previous_lora_file or lora_multiplier != previous_lora_multiplier + or fp8_optimization != previous_fp8_optimization ) total_latent_sections = (total_second_length * 24) / (latent_window_size * 4) @@ -212,6 +215,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind previous_lora_file = lora_file previous_lora_multiplier = lora_multiplier + previous_fp8_optimization = fp8_optimization transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu() transformer.eval() @@ -221,13 +225,32 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind transformer.to(dtype=torch.bfloat16) transformer.requires_grad_(False) - if lora_file is not None: + if lora_file is not None or fp8_optimization: state_dict = transformer.state_dict() - print(f"Merging LoRA file {os.path.basename(lora_file)} ...") - state_dict = 
merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) - gc.collect() + + # LoRA should be merged before fp8 optimization + if lora_file is not None: + # TODO It would be better to merge the LoRA into the state dict before creating the transformer instance. + # Use from_config() instead of from_pretrained to make the instance without loading. + + print(f"Merging LoRA file {os.path.basename(lora_file)} ...") + state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) + gc.collect() + + if fp8_optimization: + TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"] + EXCLUDE_KEYS = ["norm"] # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8 + + # inplace optimization + print("Optimizing for fp8") + state_dict = optimize_state_dict_with_fp8(state_dict, gpu, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=False) + + # apply monkey patching + apply_fp8_monkey_patch(transformer, state_dict, use_scaled_mm=False) + gc.collect() + info = transformer.load_state_dict(state_dict, strict=True, assign=True) - print(f"LoRA applied: {info}") + print(f"LoRA and/or fp8 optimization applied: {info}") if not high_vram: DynamicSwapInstaller.install_model(transformer, device=gpu) @@ -371,7 +394,7 @@ def callback(d): return -def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): +def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): global stream assert input_image is not None, 'No input image!' 
@@ -379,7 +402,7 @@ def process(input_image, prompt, n_prompt, seed, total_second_length, latent_win stream = AsyncStream() - async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier) + async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization) output_filename = None @@ -448,6 +471,7 @@ def end_process(): with gr.Group(): lora_file = gr.File(label="LoRA File", file_count="single", type="filepath") lora_multiplier = gr.Slider(label="LoRA Multiplier", minimum=0.0, maximum=1.0, value=0.8, step=0.1) + fp8_optimization = gr.Checkbox(label="FP8 Optimization", value=False) with gr.Column(): preview_image = gr.Image(label="Next Latents", height=200, visible=False) @@ -458,7 +482,7 @@ def end_process(): gr.HTML('
Share your results and find ideas at the FramePack Twitter (X) thread
') - ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier] + ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization] start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) end_button.click(fn=end_process) From a6490ae56b724b72eb2c3c80ffbac973a769fb16 Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 25 Aug 2025 04:26:43 +0800 Subject: [PATCH 5/6] fix: correct argument parser and adjust gpu memory slider range Fix the --offline argument to remove incorrect store_true action and adjust the minimum value of gpu_memory_preservation slider from 6 to 0 for better flexibility in low-memory scenarios --- demo_gradio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demo_gradio.py b/demo_gradio.py index 8496cfce..844ebdaf 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -39,7 +39,7 @@ parser.add_argument("--port", type=int, required=False) parser.add_argument("--inbrowser", action='store_true') parser.add_argument("--output_dir", type=str, default='./outputs') -parser.add_argument("--offline", default=True, action='store_true') +parser.add_argument("--offline", default=True) args = parser.parse_args() # for win desktop probably use --server 127.0.0.1 --inbrowser @@ -61,13 +61,13 @@ HUNYUAN_VIDEO_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--hunyuanvideo-community--HunyuanVideo') FLUX_REDUX_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--lllyasviel--flux_redux_bfl') FRAMEPACK_I2V_HY_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--lllyasviel--FramePackI2V_HY') - + text_encoder = LlamaModel.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 
'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'text_encoder'), torch_dtype=torch.float16).cpu() text_encoder_2 = CLIPTextModel.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'text_encoder_2'), torch_dtype=torch.float16).cpu() tokenizer = LlamaTokenizerFast.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'tokenizer')) tokenizer_2 = CLIPTokenizer.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'tokenizer_2')) vae = AutoencoderKLHunyuanVideo.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'vae'), torch_dtype=torch.float16).cpu() - + feature_extractor = SiglipImageProcessor.from_pretrained(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots'))[0], 'feature_extractor')) image_encoder = SiglipVisionModel.from_pretrained(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots'))[0], 'image_encoder'), torch_dtype=torch.float16).cpu() @@ -464,7 +464,7 @@ def end_process(): rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change # This is only used when high_vram is False - gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. 
Larger value causes slower speed.", visible=not high_vram) + gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=0, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.", visible=not high_vram) mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ") From e5f2ed90d3e896506e5365dab88b7e6afd8e9341 Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 25 Aug 2025 04:30:57 +0800 Subject: [PATCH 6/6] fix(demo_gradio): enable FP8 optimization by default The FP8 optimization checkbox was disabled by default, which may lead to suboptimal performance for users who are unaware of this setting. Enabling it by default ensures better performance out of the box. --- demo_gradio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo_gradio.py b/demo_gradio.py index 844ebdaf..e04ffe23 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -471,7 +471,7 @@ def end_process(): with gr.Group(): lora_file = gr.File(label="LoRA File", file_count="single", type="filepath") lora_multiplier = gr.Slider(label="LoRA Multiplier", minimum=0.0, maximum=1.0, value=0.8, step=0.1) - fp8_optimization = gr.Checkbox(label="FP8 Optimization", value=False) + fp8_optimization = gr.Checkbox(label="FP8 Optimization", value=True) with gr.Column(): preview_image = gr.Image(label="Next Latents", height=200, visible=False)