diff --git a/.gitignore b/.gitignore index e83dd2c0..57fe59e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +hf_download hf_download/ outputs/ repo/ diff --git a/demo_gradio.py b/demo_gradio.py index 114aa323..e04ffe23 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -30,6 +30,7 @@ from diffusers_helper.clip_vision import hf_clip_vision_encode from diffusers_helper.bucket_tools import find_nearest_bucket from utils.lora_utils import merge_lora_to_state_dict +from utils.fp8_optimization_utils import optimize_state_dict_with_fp8, apply_fp8_monkey_patch parser = argparse.ArgumentParser() @@ -38,6 +39,7 @@ parser.add_argument("--port", type=int, required=False) parser.add_argument("--inbrowser", action='store_true') parser.add_argument("--output_dir", type=str, default='./outputs') +parser.add_argument("--offline", default=True) args = parser.parse_args() # for win desktop probably use --server 127.0.0.1 --inbrowser @@ -54,19 +56,37 @@ print(f'Free VRAM {free_mem_gb} GB') print(f'High-VRAM Mode: {high_vram}') -text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() -text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() -tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') -tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2') -vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() +if args.offline: + HF_CACHE_HUB_PATH = os.path.join(os.environ['HF_HOME'], 'hub') + HUNYUAN_VIDEO_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--hunyuanvideo-community--HunyuanVideo') + FLUX_REDUX_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--lllyasviel--flux_redux_bfl') + FRAMEPACK_I2V_HY_LOCAL_PATH = os.path.join(HF_CACHE_HUB_PATH, 'models--lllyasviel--FramePackI2V_HY') -feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') -image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() + text_encoder = LlamaModel.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'text_encoder'), torch_dtype=torch.float16).cpu() + text_encoder_2 = CLIPTextModel.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'text_encoder_2'), torch_dtype=torch.float16).cpu() + tokenizer = LlamaTokenizerFast.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'tokenizer')) + tokenizer_2 = CLIPTokenizer.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'tokenizer_2')) + vae = AutoencoderKLHunyuanVideo.from_pretrained(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(HUNYUAN_VIDEO_LOCAL_PATH, 'snapshots'))[0], 'vae'), torch_dtype=torch.float16).cpu() + + feature_extractor = SiglipImageProcessor.from_pretrained(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots'))[0], 'feature_extractor')) + image_encoder = SiglipVisionModel.from_pretrained(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FLUX_REDUX_LOCAL_PATH, 'snapshots'))[0], 'image_encoder'), torch_dtype=torch.float16).cpu() + + transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(os.path.join(FRAMEPACK_I2V_HY_LOCAL_PATH, 'snapshots', os.listdir(os.path.join(FRAMEPACK_I2V_HY_LOCAL_PATH, 'snapshots'))[0]), torch_dtype=torch.bfloat16).cpu() +else: + text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() + text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() + tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') + tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2') + vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() + + feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') + image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() transformer = None # load later transformer_dtype = torch.bfloat16 previous_lora_file = None previous_lora_multiplier = None +previous_fp8_optimization = None vae.eval() text_encoder.eval() @@ -103,12 +123,13 @@ @torch.no_grad() -def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): - global transformer, previous_lora_file, previous_lora_multiplier +def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): + global transformer, previous_lora_file, previous_lora_multiplier, previous_fp8_optimization model_changed = transformer is None or ( lora_file != previous_lora_file or lora_multiplier != previous_lora_multiplier + or fp8_optimization != previous_fp8_optimization ) total_latent_sections = (total_second_length * 24) / (latent_window_size * 4) @@ -194,6 +215,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind previous_lora_file = lora_file previous_lora_multiplier = lora_multiplier + previous_fp8_optimization = fp8_optimization transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu() transformer.eval() @@ -203,13 +225,32 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind transformer.to(dtype=torch.bfloat16) transformer.requires_grad_(False) - if lora_file is not None: + if lora_file is not None or fp8_optimization: state_dict = transformer.state_dict() - print(f"Merging LoRA file {os.path.basename(lora_file)} ...") - state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) - gc.collect() + + # LoRA should be merged before fp8 optimization + if lora_file is not None: + # TODO It would be better to merge the LoRA into the state dict before creating the transformer instance. + # Use from_config() instead of from_pretrained to make the instance without loading. + + print(f"Merging LoRA file {os.path.basename(lora_file)} ...") + state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) + gc.collect() + + if fp8_optimization: + TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"] + EXCLUDE_KEYS = ["norm"] # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8 + + # inplace optimization + print("Optimizing for fp8") + state_dict = optimize_state_dict_with_fp8(state_dict, gpu, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=False) + + # apply monkey patching + apply_fp8_monkey_patch(transformer, state_dict, use_scaled_mm=False) + gc.collect() + info = transformer.load_state_dict(state_dict, strict=True, assign=True) - print(f"LoRA applied: {info}") + print(f"LoRA and/or fp8 optimization applied: {info}") if not high_vram: DynamicSwapInstaller.install_model(transformer, device=gpu) @@ -353,7 +394,7 @@ def callback(d): return -def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): +def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): global stream assert input_image is not None, 'No input image!' @@ -361,7 +402,7 @@ def process(input_image, prompt, n_prompt, seed, total_second_length, latent_win stream = AsyncStream() - async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier) + async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization) output_filename = None @@ -423,13 +464,14 @@ def end_process(): rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change # This is only used when high_vram is False - gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.", visible=not high_vram) + gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=0, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.", visible=not high_vram) mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ") with gr.Group(): lora_file = gr.File(label="LoRA File", file_count="single", type="filepath") lora_multiplier = gr.Slider(label="LoRA Multiplier", minimum=0.0, maximum=1.0, value=0.8, step=0.1) + fp8_optimization = gr.Checkbox(label="FP8 Optimization", value=True) with gr.Column(): preview_image = gr.Image(label="Next Latents", height=200, visible=False) @@ -440,7 +482,7 @@ def end_process(): gr.HTML('
Share your results and find ideas at the FramePack Twitter (X) thread
') - ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier] + ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization] start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) end_button.click(fn=end_process) diff --git a/demo_gradio_f1.py b/demo_gradio_f1.py index e9120edc..47bf98a4 100644 --- a/demo_gradio_f1.py +++ b/demo_gradio_f1.py @@ -30,6 +30,7 @@ from diffusers_helper.clip_vision import hf_clip_vision_encode from diffusers_helper.bucket_tools import find_nearest_bucket from utils.lora_utils import merge_lora_to_state_dict +from utils.fp8_optimization_utils import optimize_state_dict_with_fp8, apply_fp8_monkey_patch parser = argparse.ArgumentParser() @@ -68,6 +69,7 @@ transformer_dtype = torch.bfloat16 previous_lora_file = None previous_lora_multiplier = None +previous_fp8_optimization = None vae.eval() text_encoder.eval() @@ -104,12 +106,13 @@ @torch.no_grad() -def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): - global transformer, previous_lora_file, previous_lora_multiplier +def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): + global transformer, previous_lora_file, previous_lora_multiplier, previous_fp8_optimization model_changed = transformer is None or ( lora_file != previous_lora_file or lora_multiplier != previous_lora_multiplier + or fp8_optimization != previous_fp8_optimization ) total_latent_sections = (total_second_length * 24) / (latent_window_size * 4) @@ -195,6 +198,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind previous_lora_file = lora_file previous_lora_multiplier = lora_multiplier + previous_fp8_optimization = fp8_optimization transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu() transformer.eval() @@ -204,13 +208,32 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind transformer.to(dtype=torch.bfloat16) transformer.requires_grad_(False) - if lora_file is not None: + if lora_file is not None or fp8_optimization: state_dict = transformer.state_dict() - print(f"Merging LoRA file {os.path.basename(lora_file)} ...") - state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) - gc.collect() + + # LoRA should be merged before fp8 optimization + if lora_file is not None: + # TODO It would be better to merge the LoRA into the state dict before creating the transformer instance. + # Use from_config() instead of from_pretrained to make the instance without loading. + + print(f"Merging LoRA file {os.path.basename(lora_file)} ...") + state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu) + gc.collect() + + if fp8_optimization: + TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"] + EXCLUDE_KEYS = ["norm"] # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8 + + # inplace optimization + print("Optimizing for fp8") + state_dict = optimize_state_dict_with_fp8(state_dict, gpu, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=False) + + # apply monkey patching + apply_fp8_monkey_patch(transformer, state_dict, use_scaled_mm=False) + gc.collect() + info = transformer.load_state_dict(state_dict, strict=True, assign=True) - print(f"LoRA applied: {info}") + print(f"LoRA and/or fp8 optimization applied: {info}") if not high_vram: DynamicSwapInstaller.install_model(transformer, device=gpu) @@ -341,7 +364,7 @@ def callback(d): return -def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier): +def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization): global stream assert input_image is not None, 'No input image!' @@ -349,7 +372,7 @@ def process(input_image, prompt, n_prompt, seed, total_second_length, latent_win stream = AsyncStream() - async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier) + async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization) output_filename = None @@ -418,6 +441,7 @@ def end_process(): with gr.Group(): lora_file = gr.File(label="LoRA File", file_count="single", type="filepath") lora_multiplier = gr.Slider(label="LoRA Multiplier", minimum=0.0, maximum=1.0, value=0.8, step=0.1) + fp8_optimization = gr.Checkbox(label="FP8 Optimization", value=False) with gr.Column(): preview_image = gr.Image(label="Next Latents", height=200, visible=False) @@ -427,7 +451,7 @@ def end_process(): gr.HTML('
Share your results and find ideas at the FramePack Twitter (X) thread
') - ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier] + ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, resolution, lora_file, lora_multiplier, fp8_optimization] start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) end_button.click(fn=end_process) diff --git a/utils/fp8_optimization_utils.py b/utils/fp8_optimization_utils.py new file mode 100644 index 00000000..27b47e4e --- /dev/null +++ b/utils/fp8_optimization_utils.py @@ -0,0 +1,277 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from tqdm import tqdm + + +def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1): + """ + Calculate the maximum representable value in FP8 format. + Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign). + + Args: + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + sign_bits (int): Number of sign bits (0 or 1) + + Returns: + float: Maximum value representable in FP8 format + """ + assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8" + + # Calculate exponent bias + bias = 2 ** (exp_bits - 1) - 1 + + # Calculate maximum mantissa value + mantissa_max = 1.0 + for i in range(mantissa_bits - 1): + mantissa_max += 2 ** -(i + 1) + + # Calculate maximum value + max_value = mantissa_max * (2 ** (2**exp_bits - 1 - bias)) + + return max_value + + +def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None): + """ + Quantize a tensor to FP8 format. + + Args: + tensor (torch.Tensor): Tensor to quantize + scale (float or torch.Tensor): Scale factor + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + sign_bits (int): Number of sign bits + + Returns: + tuple: (quantized_tensor, scale_factor) + """ + # Create scaled tensor + scaled_tensor = tensor / scale + + # Calculate FP8 parameters + bias = 2 ** (exp_bits - 1) - 1 + + if max_value is None: + # Calculate max and min values + max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits) + min_value = -max_value if sign_bits > 0 else 0.0 + + # Clamp tensor to range + clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value) + + # Quantization process + abs_values = torch.abs(clamped_tensor) + nonzero_mask = abs_values > 0 + + # Calculate logF scales (only for non-zero elements) + log_scales = torch.zeros_like(clamped_tensor) + if nonzero_mask.any(): + log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach() + + # Limit log scales and calculate quantization factor + log_scales = torch.clamp(log_scales, min=1.0) + quant_factor = 2.0 ** (log_scales - mantissa_bits - bias) + + # Quantize and dequantize + quantized = torch.round(clamped_tensor / quant_factor) * quant_factor + + return quantized, scale + + +def optimize_state_dict_with_fp8( + state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False +): + """ + Optimize Linear layer weights in a model's state dict to FP8 format. + + Args: + state_dict (dict): State dict to optimize, replaced in-place + calc_device (str): Device to quantize tensors on + target_layer_keys (list, optional): Layer key patterns to target (None for all Linear layers) + exclude_layer_keys (list, optional): Layer key patterns to exclude + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + move_to_device (bool): Move optimized tensors to the calculating device + + Returns: + dict: FP8 optimized state dict + """ + if exp_bits == 4 and mantissa_bits == 3: + fp8_dtype = torch.float8_e4m3fn + elif exp_bits == 5 and mantissa_bits == 2: + fp8_dtype = torch.float8_e5m2 + else: + raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}") + + # Calculate FP8 max value + max_value = calculate_fp8_maxval(exp_bits, mantissa_bits) + min_value = -max_value # this function supports only signed FP8 + + # Create optimized state dict + optimized_count = 0 + + # Enumerate tarket keys + target_state_dict_keys = [] + for key in state_dict.keys(): + # Check if it's a weight key and matches target patterns + is_target = (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys)) and key.endswith(".weight") + is_excluded = exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys) + is_target = is_target and not is_excluded + + if is_target and isinstance(state_dict[key], torch.Tensor): + target_state_dict_keys.append(key) + + # Process each key + for key in tqdm(target_state_dict_keys): + value = state_dict[key] + + # Save original device and dtype + original_device = value.device + original_dtype = value.dtype + + # Move to calculation device + if calc_device is not None: + value = value.to(calc_device) + + # Calculate scale factor + scale = torch.max(torch.abs(value.flatten())) / max_value + # print(f"Optimizing {key} with scale: {scale}") + + # Quantize weight to FP8 + quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value) + + # Add to state dict using original key for weight and new key for scale + fp8_key = key # Maintain original key + scale_key = key.replace(".weight", ".scale_weight") + + quantized_weight = quantized_weight.to(fp8_dtype) + + if not move_to_device: + quantized_weight = quantized_weight.to(original_device) + + scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device) + + state_dict[fp8_key] = quantized_weight + state_dict[scale_key] = scale_tensor + + optimized_count += 1 + + if calc_device is not None: # optimized_count % 10 == 0 and + # free memory on calculation device + torch.cuda.empty_cache() # TODO check device typ + + print(f"Number of optimized Linear layers: {optimized_count}") + return state_dict + + +def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None): + """ + Patched forward method for Linear layers with FP8 weights. + + Args: + self: Linear layer instance + x (torch.Tensor): Input tensor + use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series) + max_value (float): Maximum value for FP8 quantization. If None, no quantization is applied for input tensor. + + Returns: + torch.Tensor: Result of linear transformation + """ + if use_scaled_mm: + input_dtype = x.dtype + original_weight_dtype = self.scale_weight.dtype + weight_dtype = self.weight.dtype + target_dtype = torch.float8_e5m2 + assert weight_dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported" + assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)" + + if max_value is None: + # no input quantization + scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device) + else: + # calculate scale factor for input tensor + scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32) + + # quantize input tensor to FP8: this seems to consume a lot of memory + x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value) + + original_shape = x.shape + x = x.reshape(-1, x.shape[2]).to(target_dtype) + + weight = self.weight.t() + scale_weight = self.scale_weight.to(torch.float32) + + if self.bias is not None: + # float32 is not supported with bias in scaled_mm + o = torch._scaled_mm(x, weight, out_dtype=original_weight_dtype, bias=self.bias, scale_a=scale_x, scale_b=scale_weight) + else: + o = torch._scaled_mm(x, weight, out_dtype=input_dtype, scale_a=scale_x, scale_b=scale_weight) + + return o.reshape(original_shape[0], original_shape[1], -1).to(input_dtype) + + else: + # Dequantize the weight + original_dtype = self.scale_weight.dtype + dequantized_weight = self.weight.to(original_dtype) * self.scale_weight + + # Perform linear transformation + if self.bias is not None: + output = F.linear(x, dequantized_weight, self.bias) + else: + output = F.linear(x, dequantized_weight) + + return output + + +def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False): + """ + Apply monkey patching to a model using FP8 optimized state dict. + + Args: + model (nn.Module): Model instance to patch + optimized_state_dict (dict): FP8 optimized state dict + use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series) + + Returns: + nn.Module: The patched model (same instance, modified in-place) + """ + # # Calculate FP8 float8_e5m2 max value + # max_value = calculate_fp8_maxval(5, 2) + max_value = None # do not quantize input tensor + + # Find all scale keys to identify FP8-optimized layers + scale_keys = [k for k in optimized_state_dict.keys() if k.endswith(".scale_weight")] + + # Enumerate patched layers + patched_module_paths = set() + for scale_key in scale_keys: + # Extract module path from scale key (remove .scale_weight) + module_path = scale_key.rsplit(".scale_weight", 1)[0] + patched_module_paths.add(module_path) + + patched_count = 0 + + # Apply monkey patch to each layer with FP8 weights + for name, module in model.named_modules(): + # Check if this module has a corresponding scale_weight + has_scale = name in patched_module_paths + + # Apply patch if it's a Linear layer with FP8 scale + if isinstance(module, nn.Linear) and has_scale: + # register the scale_weight as a buffer to load the state_dict + module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype)) + + # Create a new forward method with the patched version. + def new_forward(self, x): + return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value) + + # Bind method to module + module.forward = new_forward.__get__(module, type(module)) + + patched_count += 1 + + print(f"Number of monkey-patched Linear layers: {patched_count}") + return model