diff --git a/README.md b/README.md index 43c41538..00b1db27 100644 --- a/README.md +++ b/README.md @@ -36,18 +36,21 @@ About speed, on my RTX 4090 desktop it generates at a speed of 2.5 seconds/frame In any case, you will directly see the generated frames since it is next-frame(-section) prediction. So you will get lots of visual feedback before the entire video is generated. -# Installation +# Installation (NVIDIA) -**Windows**: +For CUDA (NVIDIA GPUs): +```bash +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 +pip install -r requirements.txt +``` -One-click-package will be released soon. Please come back tomorrow. +The software supports PyTorch attention, xformers, flash-attn, sage-attention. By default, it will just use PyTorch attention. You can install those attention kernels if you know how. -**Linux**: +For example, to install sage-attention (Linux): -We recommend having an independent Python 3.10. + pip install sageattention==1.0.6 - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 - pip install -r requirements.txt +However, it is highly recommended that you first try without sage-attention, since it will influence results, though the influence is minimal. To start the GUI, run: @@ -55,13 +58,38 @@ To start the GUI, run: Note that it supports `--share`, `--port`, `--server`, and so on. -The software supports PyTorch attention, xformers, flash-attn, sage-attention. By default, it will just use PyTorch attention. You can install those attention kernels if you know how. +# Installation (macOS) -For example, to install sage-attention (linux): +FramePack recommends using Python 3.10. If you have [Homebrew](https://brew.sh/) installed, you can use it to install Python 3.10: 
+```bash +brew install python@3.10 +``` - pip install sageattention==1.0.6 +To install dependencies: +```bash +pip3.10 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu +pip3.10 install -r requirements.txt +``` -However, you are highly recommended to first try without sage-attention since it will influence results, though the influence is minimal. +# Starting FramePack on macOS +To start the GUI, run: + + python3.10 demo_gradio.py + +Some additional arguments you can specify are: +* `--share`: Enable sharing the Gradio interface via a public URL +* `--server`: Specify the server address (default: '0.0.0.0') +* `--port`: Specify the port number to run the server on (default: 7860) +* `--output_dir`: Set the directory for saving generated outputs (default: './outputs') +* `--resolution`: Set the target resolution for video generation (default: 416) +* `--fp32`: Use float32 precision instead of float16/bfloat16 (helpful for some M1/M2 processors) + +When running on M1 or M2 processors, you may need to pass `--fp32`, as they may not fully support float16 and bfloat16. + +You can also adjust the resolution of the generated videos using `--resolution`. +In the original FramePack version, the resolution was set to 640. However, as of the time this documentation was written, the MPS backend in PyTorch has issues with the large tensors needed to generate videos at that resolution. This version defaults to 416, but slightly higher resolutions such as 480 may also work. Feel free to experiment with the setting. + +(NOTE: A value of 640 means it will generate a video that is roughly 640x640 for a square image. If the image has a different aspect ratio, the output dimensions are chosen to match that ratio while keeping roughly the same number of pixels.) 
# GUI diff --git a/demo_gradio.py b/demo_gradio.py index b150c59f..a9a6c288 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -3,6 +3,7 @@ import os os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))) +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" import gradio as gr import torch @@ -32,13 +33,19 @@ parser.add_argument('--share', action='store_true') parser.add_argument("--server", type=str, default='0.0.0.0') parser.add_argument("--port", type=int, default=7860) +parser.add_argument("--output_dir", type=str, default='./outputs') +parser.add_argument("--resolution", type=int, default=416) +parser.add_argument("--fp32", action='store_true', default=False) args = parser.parse_args() print(args) -free_mem_gb = get_cuda_free_memory_gb(gpu) -high_vram = free_mem_gb > 40 +if torch.cuda.is_available(): + free_mem_gb = get_cuda_free_memory_gb(gpu) +else: + free_mem_gb = torch.mps.recommended_max_memory() / 1024 / 1024 / 1024 +high_vram = free_mem_gb > 40 print(f'Free VRAM {free_mem_gb} GB') print(f'High-VRAM Mode: {high_vram}') @@ -66,11 +73,20 @@ transformer.high_quality_fp32_output_for_inference = True print('transformer.high_quality_fp32_output_for_inference = True') -transformer.to(dtype=torch.bfloat16) -vae.to(dtype=torch.float16) -image_encoder.to(dtype=torch.float16) -text_encoder.to(dtype=torch.float16) -text_encoder_2.to(dtype=torch.float16) +# For MPS, some processors like M1/M2 may need to use float32 +if args.fp32: + print('Using float32 for transformer and encoder models') + transformer.to(dtype=torch.float32) + vae.to(dtype=torch.float32) + image_encoder.to(dtype=torch.float32) + text_encoder.to(dtype=torch.float32) + text_encoder_2.to(dtype=torch.float32) +else: + transformer.to(dtype=torch.bfloat16) + vae.to(dtype=torch.float16) + image_encoder.to(dtype=torch.float16) + text_encoder.to(dtype=torch.float16) + text_encoder_2.to(dtype=torch.float16) vae.requires_grad_(False) 
text_encoder.requires_grad_(False) @@ -91,13 +107,13 @@ stream = AsyncStream() -outputs_folder = './outputs/' +outputs_folder = args.output_dir os.makedirs(outputs_folder, exist_ok=True) @torch.no_grad() def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache): - total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) + total_latent_sections = (total_second_length * 24) / (latent_window_size * 4) total_latent_sections = int(max(round(total_latent_sections), 1)) job_id = generate_timestamp() @@ -116,7 +132,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))) if not high_vram: - fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. 
+ fake_diffusers_current_device(text_encoder, gpu) load_model_as_complete(text_encoder_2, target_device=gpu) llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) @@ -134,7 +150,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))) H, W, C = input_image.shape - height, width = find_nearest_bucket(H, W, resolution=640) + height, width = find_nearest_bucket(H, W, resolution=args.resolution) input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png')) @@ -183,10 +199,6 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind latent_paddings = reversed(range(total_latent_sections)) if total_latent_sections > 4: - # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some - # items looks better than expanding it when total_latent_sections > 4 - # One can try to remove below trick and just - # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0] for latent_padding in latent_paddings: @@ -230,7 +242,7 @@ def callback(d): current_step = d['i'] + 1 percentage = int(100.0 * current_step / steps) hint = f'Sampling {current_step}/{steps}' - desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...' + desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 24) :.2f} seconds (FPS-24). The video is being extended now ...' 
stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint)))) return @@ -243,7 +255,6 @@ def callback(d): real_guidance_scale=cfg, distilled_guidance_scale=gs, guidance_rescale=rs, - # shift=3.0, num_inference_steps=steps, generator=rnd, prompt_embeds=llama_vec, @@ -253,7 +264,7 @@ def callback(d): negative_prompt_embeds_mask=llama_attention_mask_n, negative_prompt_poolers=clip_l_pooler_n, device=gpu, - dtype=torch.bfloat16, + dtype=transformer.dtype, image_embeddings=image_encoder_last_hidden_state, latent_indices=latent_indices, clean_latents=clean_latents, @@ -291,7 +302,7 @@ def callback(d): output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4') - save_bcthw_as_mp4(history_pixels, output_filename, fps=30) + save_bcthw_as_mp4(history_pixels, output_filename, fps=24) print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}') @@ -379,7 +390,8 @@ def end_process(): gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.') rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change - gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.") + # This is only used when high_vram is False + gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. 
Larger value causes slower speed.", visible=not high_vram) with gr.Column(): preview_image = gr.Image(label="Next Latents", height=200, visible=False) @@ -396,4 +408,5 @@ def end_process(): server_name=args.server, server_port=args.port, share=args.share, + allowed_paths=[outputs_folder], ) diff --git a/diffusers_helper/bucket_tools.py b/diffusers_helper/bucket_tools.py index dc13fdeb..9c288228 100644 --- a/diffusers_helper/bucket_tools.py +++ b/diffusers_helper/bucket_tools.py @@ -1,30 +1,36 @@ bucket_options = { - 640: [ - (416, 960), - (448, 864), - (480, 832), - (512, 768), - (544, 704), - (576, 672), - (608, 640), - (640, 608), - (672, 576), - (704, 544), - (768, 512), - (832, 480), - (864, 448), - (960, 416), - ], + (416, 960), + (448, 864), + (480, 832), + (512, 768), + (544, 704), + (576, 672), + (608, 640), + (640, 608), + (672, 576), + (704, 544), + (768, 512), + (832, 480), + (864, 448), + (960, 416), } def find_nearest_bucket(h, w, resolution=640): min_metric = float('inf') best_bucket = None - for (bucket_h, bucket_w) in bucket_options[resolution]: + for (bucket_h, bucket_w) in bucket_options: metric = abs(h * bucket_w - w * bucket_h) if metric <= min_metric: min_metric = metric best_bucket = (bucket_h, bucket_w) + + if resolution != 640: + scale_factor = resolution / 640.0 + scaled_height = round(best_bucket[0] * scale_factor / 16) * 16 + scaled_width = round(best_bucket[1] * scale_factor / 16) * 16 + best_bucket = (scaled_height, scaled_width) + print(f'Resolution: {best_bucket[1]} x {best_bucket[0]}') + return best_bucket diff --git a/diffusers_helper/memory.py b/diffusers_helper/memory.py index 3380c538..4dc303c2 100644 --- a/diffusers_helper/memory.py +++ b/diffusers_helper/memory.py @@ -4,8 +4,14 @@ import torch +# Detect available devices cpu = torch.device('cpu') -gpu = torch.device(f'cuda:{torch.cuda.current_device()}') +if torch.cuda.is_available(): + gpu = torch.device(f'cuda:{torch.cuda.current_device()}') +elif 
torch.backends.mps.is_available(): + gpu = torch.device('mps') +else: + raise RuntimeError("No GPU device available. Please use a system with CUDA or MPS support.") gpu_complete_modules = [] @@ -72,44 +78,60 @@ def get_cuda_free_memory_gb(device=None): if device is None: device = gpu - memory_stats = torch.cuda.memory_stats(device) - bytes_active = memory_stats['active_bytes.all.current'] - bytes_reserved = memory_stats['reserved_bytes.all.current'] - bytes_free_cuda, _ = torch.cuda.mem_get_info(device) - bytes_inactive_reserved = bytes_reserved - bytes_active - bytes_total_available = bytes_free_cuda + bytes_inactive_reserved - return bytes_total_available / (1024 ** 3) + if device.type == 'cuda': + memory_stats = torch.cuda.memory_stats(device) + bytes_active = memory_stats['active_bytes.all.current'] + bytes_reserved = memory_stats['reserved_bytes.all.current'] + bytes_free_cuda, _ = torch.cuda.mem_get_info(device) + bytes_inactive_reserved = bytes_reserved - bytes_active + bytes_total_available = bytes_free_cuda + bytes_inactive_reserved + return bytes_total_available / (1024 ** 3) + elif device.type == 'mps': + # MPS doesn't provide detailed memory stats, return a fixed value + return 16.0 # Assuming 16GB available for MPS + else: + return 0.0 def move_model_to_device_with_memory_preservation(model, target_device, preserved_memory_gb=0): print(f'Moving {model.__class__.__name__} to {target_device} with preserved memory: {preserved_memory_gb} GB') - for m in model.modules(): - if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb: - torch.cuda.empty_cache() - return + if target_device.type == 'cuda': + for m in model.modules(): + if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb: + torch.cuda.empty_cache() + return - if hasattr(m, 'weight'): - m.to(device=target_device) + if hasattr(m, 'weight'): + m.to(device=target_device) + else: + # For MPS, just move the model directly + model.to(device=target_device) 
model.to(device=target_device) - torch.cuda.empty_cache() + if target_device.type == 'cuda': + torch.cuda.empty_cache() return def offload_model_from_device_for_memory_preservation(model, target_device, preserved_memory_gb=0): print(f'Offloading {model.__class__.__name__} from {target_device} to preserve memory: {preserved_memory_gb} GB') - for m in model.modules(): - if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb: - torch.cuda.empty_cache() - return + if target_device.type == 'cuda': + for m in model.modules(): + if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb: + torch.cuda.empty_cache() + return - if hasattr(m, 'weight'): - m.to(device=cpu) + if hasattr(m, 'weight'): + m.to(device=cpu) + else: + # For MPS, just move the model directly + model.to(device=cpu) model.to(device=cpu) - torch.cuda.empty_cache() + if target_device.type == 'cuda': + torch.cuda.empty_cache() return @@ -119,7 +141,8 @@ def unload_complete_models(*args): print(f'Unloaded {m.__class__.__name__} as complete.') gpu_complete_modules.clear() - torch.cuda.empty_cache() + if gpu.type == 'cuda': + torch.cuda.empty_cache() return diff --git a/diffusers_helper/models/hunyuan_video_packed.py b/diffusers_helper/models/hunyuan_video_packed.py index f8797990..5c847d9b 100644 --- a/diffusers_helper/models/hunyuan_video_packed.py +++ b/diffusers_helper/models/hunyuan_video_packed.py @@ -84,7 +84,9 @@ def get_cu_seqlens(text_mask, img_len): text_len = text_mask.sum(dim=1) max_len = text_mask.shape[1] + img_len - cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda") + # Use the same device as the input tensor + device = text_mask.device + cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=device) for i in range(batch_size): s = text_len[i] + img_len