From a00ad08b0257f0fa820db946eea390b64280d11e Mon Sep 17 00:00:00 2001 From: Brandon Cook Date: Fri, 18 Apr 2025 14:00:44 +0900 Subject: [PATCH 1/6] Enhance device compatibility and memory management for CUDA and MPS. Updated demo_gradio.py to handle memory checks and model precision based on device type. Modified memory.py to support MPS with fixed memory estimates. Adjusted hunyuan_video_packed.py to create tensors on the same device as input. Updated README.md for clearer installation instructions for CUDA and MPS. --- README.md | 21 +++--- demo_gradio.py | 39 +++++++---- diffusers_helper/memory.py | 69 ++++++++++++------- .../models/hunyuan_video_packed.py | 4 +- 4 files changed, 84 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 43c41538..fa1f423c 100644 --- a/README.md +++ b/README.md @@ -38,16 +38,17 @@ In any case, you will directly see the generated frames since it is next-frame(- # Installation -**Windows**: - -One-click-package will be released soon. Please come back tomorrow. - -**Linux**: - -We recommend having an independent Python 3.10. 
- - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 - pip install -r requirements.txt +For CUDA (NVIDIA GPUs): +```bash +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 +pip install -r requirements.txt +``` + +For MPS (Apple Silicon Macs): +```bash +pip install torch torchvision torchaudio +pip install -r requirements.txt +``` To start the GUI, run: diff --git a/demo_gradio.py b/demo_gradio.py index b150c59f..2e26e429 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -36,10 +36,16 @@ print(args) -free_mem_gb = get_cuda_free_memory_gb(gpu) -high_vram = free_mem_gb > 40 +if torch.cuda.is_available(): + free_mem_gb = get_cuda_free_memory_gb(gpu) + high_vram = free_mem_gb > 40 + print(f'Free VRAM {free_mem_gb} GB') +else: + # For MPS, we'll use a fixed value since we can't get memory stats + free_mem_gb = 100.0 + high_vram = True + print(f'Using MPS device with estimated {free_mem_gb} GB available') -print(f'Free VRAM {free_mem_gb} GB') print(f'High-VRAM Mode: {high_vram}') text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() @@ -66,11 +72,19 @@ transformer.high_quality_fp32_output_for_inference = True print('transformer.high_quality_fp32_output_for_inference = True') -transformer.to(dtype=torch.bfloat16) -vae.to(dtype=torch.float16) -image_encoder.to(dtype=torch.float16) -text_encoder.to(dtype=torch.float16) -text_encoder_2.to(dtype=torch.float16) +# For MPS, we need to use float32 instead of bfloat16 +if gpu.type == 'mps': + transformer.to(dtype=torch.float32) + vae.to(dtype=torch.float32) + image_encoder.to(dtype=torch.float32) + text_encoder.to(dtype=torch.float32) + text_encoder_2.to(dtype=torch.float32) +else: + transformer.to(dtype=torch.bfloat16) + vae.to(dtype=torch.float16) + image_encoder.to(dtype=torch.float16) + text_encoder.to(dtype=torch.float16) + 
text_encoder_2.to(dtype=torch.float16) vae.requires_grad_(False) text_encoder.requires_grad_(False) @@ -116,7 +130,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))) if not high_vram: - fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. + fake_diffusers_current_device(text_encoder, gpu) load_model_as_complete(text_encoder_2, target_device=gpu) llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) @@ -183,10 +197,6 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind latent_paddings = reversed(range(total_latent_sections)) if total_latent_sections > 4: - # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some - # items looks better than expanding it when total_latent_sections > 4 - # One can try to remove below trick and just - # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0] for latent_padding in latent_paddings: @@ -243,7 +253,6 @@ def callback(d): real_guidance_scale=cfg, distilled_guidance_scale=gs, guidance_rescale=rs, - # shift=3.0, num_inference_steps=steps, generator=rnd, prompt_embeds=llama_vec, @@ -253,7 +262,7 @@ def callback(d): negative_prompt_embeds_mask=llama_attention_mask_n, negative_prompt_poolers=clip_l_pooler_n, device=gpu, - dtype=torch.bfloat16, + dtype=transformer.dtype, image_embeddings=image_encoder_last_hidden_state, latent_indices=latent_indices, clean_latents=clean_latents, diff --git a/diffusers_helper/memory.py b/diffusers_helper/memory.py index 3380c538..4dc303c2 100644 --- a/diffusers_helper/memory.py +++ b/diffusers_helper/memory.py 
@@ -4,8 +4,14 @@ import torch +# Detect available devices cpu = torch.device('cpu') -gpu = torch.device(f'cuda:{torch.cuda.current_device()}') +if torch.cuda.is_available(): + gpu = torch.device(f'cuda:{torch.cuda.current_device()}') +elif torch.backends.mps.is_available(): + gpu = torch.device('mps') +else: + raise RuntimeError("No GPU device available. Please use a system with CUDA or MPS support.") gpu_complete_modules = [] @@ -72,44 +78,60 @@ def get_cuda_free_memory_gb(device=None): if device is None: device = gpu - memory_stats = torch.cuda.memory_stats(device) - bytes_active = memory_stats['active_bytes.all.current'] - bytes_reserved = memory_stats['reserved_bytes.all.current'] - bytes_free_cuda, _ = torch.cuda.mem_get_info(device) - bytes_inactive_reserved = bytes_reserved - bytes_active - bytes_total_available = bytes_free_cuda + bytes_inactive_reserved - return bytes_total_available / (1024 ** 3) + if device.type == 'cuda': + memory_stats = torch.cuda.memory_stats(device) + bytes_active = memory_stats['active_bytes.all.current'] + bytes_reserved = memory_stats['reserved_bytes.all.current'] + bytes_free_cuda, _ = torch.cuda.mem_get_info(device) + bytes_inactive_reserved = bytes_reserved - bytes_active + bytes_total_available = bytes_free_cuda + bytes_inactive_reserved + return bytes_total_available / (1024 ** 3) + elif device.type == 'mps': + # MPS doesn't provide detailed memory stats, return a fixed value + return 16.0 # Assuming 16GB available for MPS + else: + return 0.0 def move_model_to_device_with_memory_preservation(model, target_device, preserved_memory_gb=0): print(f'Moving {model.__class__.__name__} to {target_device} with preserved memory: {preserved_memory_gb} GB') - for m in model.modules(): - if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb: - torch.cuda.empty_cache() - return + if target_device.type == 'cuda': + for m in model.modules(): + if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb: + 
torch.cuda.empty_cache() + return - if hasattr(m, 'weight'): - m.to(device=target_device) + if hasattr(m, 'weight'): + m.to(device=target_device) + else: + # For MPS, just move the model directly + model.to(device=target_device) model.to(device=target_device) - torch.cuda.empty_cache() + if target_device.type == 'cuda': + torch.cuda.empty_cache() return def offload_model_from_device_for_memory_preservation(model, target_device, preserved_memory_gb=0): print(f'Offloading {model.__class__.__name__} from {target_device} to preserve memory: {preserved_memory_gb} GB') - for m in model.modules(): - if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb: - torch.cuda.empty_cache() - return + if target_device.type == 'cuda': + for m in model.modules(): + if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb: + torch.cuda.empty_cache() + return - if hasattr(m, 'weight'): - m.to(device=cpu) + if hasattr(m, 'weight'): + m.to(device=cpu) + else: + # For MPS, just move the model directly + model.to(device=cpu) model.to(device=cpu) - torch.cuda.empty_cache() + if target_device.type == 'cuda': + torch.cuda.empty_cache() return @@ -119,7 +141,8 @@ def unload_complete_models(*args): print(f'Unloaded {m.__class__.__name__} as complete.') gpu_complete_modules.clear() - torch.cuda.empty_cache() + if gpu.type == 'cuda': + torch.cuda.empty_cache() return diff --git a/diffusers_helper/models/hunyuan_video_packed.py b/diffusers_helper/models/hunyuan_video_packed.py index f8797990..5c847d9b 100644 --- a/diffusers_helper/models/hunyuan_video_packed.py +++ b/diffusers_helper/models/hunyuan_video_packed.py @@ -84,7 +84,9 @@ def get_cu_seqlens(text_mask, img_len): text_len = text_mask.sum(dim=1) max_len = text_mask.shape[1] + img_len - cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda") + # Use the same device as the input tensor + device = text_mask.device + cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=device) for i 
in range(batch_size): s = text_len[i] + img_len From 38a3ea05a62dbe0b5e5b030c89a24b959557edcc Mon Sep 17 00:00:00 2001 From: Brandon Cook Date: Fri, 18 Apr 2025 20:12:03 +0900 Subject: [PATCH 2/6] Add output directory argument to demo_gradio.py and update bucket_tools.py with new resolution options - Introduced `--output_dir` argument in demo_gradio.py for customizable output folder. - Adjusted image processing resolution to 480 in demo_gradio.py. - Expanded bucket_tools.py with additional resolution options for better image handling. --- demo_gradio.py | 10 +++---- diffusers_helper/bucket_tools.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/demo_gradio.py b/demo_gradio.py index 2e26e429..d87fe67b 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -32,6 +32,7 @@ parser.add_argument('--share', action='store_true') parser.add_argument("--server", type=str, default='0.0.0.0') parser.add_argument("--port", type=int, default=7860) +parser.add_argument("--output_dir", type=str, default='./outputs') args = parser.parse_args() print(args) @@ -41,10 +42,8 @@ high_vram = free_mem_gb > 40 print(f'Free VRAM {free_mem_gb} GB') else: - # For MPS, we'll use a fixed value since we can't get memory stats - free_mem_gb = 100.0 + # For MPS, we'll say high_vram is always True high_vram = True - print(f'Using MPS device with estimated {free_mem_gb} GB available') print(f'High-VRAM Mode: {high_vram}') @@ -105,7 +104,7 @@ stream = AsyncStream() -outputs_folder = './outputs/' +outputs_folder = args.output_dir os.makedirs(outputs_folder, exist_ok=True) @@ -148,7 +147,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))) H, W, C = input_image.shape - height, width = find_nearest_bucket(H, W, resolution=640) + height, width = find_nearest_bucket(H, W, resolution=480) input_image_np = 
resize_and_center_crop(input_image, target_width=width, target_height=height) Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png')) @@ -405,4 +404,5 @@ def end_process(): server_name=args.server, server_port=args.port, share=args.share, + allowed_paths=[outputs_folder], ) diff --git a/diffusers_helper/bucket_tools.py b/diffusers_helper/bucket_tools.py index dc13fdeb..4854a7e6 100644 --- a/diffusers_helper/bucket_tools.py +++ b/diffusers_helper/bucket_tools.py @@ -1,4 +1,52 @@ bucket_options = { + 320: [ + (208, 480), + (224, 432), + (240, 416), + (256, 384), + (272, 352), + (288, 336), + (304, 320), + (320, 304), + (336, 288), + (352, 272), + (384, 256), + (416, 240), + (432, 224), + (480, 208), + ], + 416: [ + (272, 640), + (288, 560), + (304, 528), + (320, 480), + (352, 464), + (368, 432), + (384, 416), + (416, 384), + (432, 368), + (464, 352), + (480, 320), + (528, 304), + (560, 288), + (640, 272), + ], + 480: [ + (304, 704), + (320, 624), + (352, 624), + (368, 560), + (400, 528), + (416, 496), + (448, 480), + (480, 448), + (496, 416), + (528, 400), + (560, 368), + (624, 352), + (624, 320), + (704, 304), + ], 640: [ (416, 960), (448, 864), From cfafa16d4bfa9013e98ccb677f5cdcb268727ffb Mon Sep 17 00:00:00 2001 From: Brandon Cook Date: Sat, 19 Apr 2025 06:48:41 +0900 Subject: [PATCH 3/6] Update frame rate calculations in demo_gradio.py to reflect FPS-24 adjustments - Modified total latent sections calculation to use a frame rate of 24 instead of 30. - Updated video length description and output file saving function to align with the new frame rate. 
--- demo_gradio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demo_gradio.py b/demo_gradio.py index d87fe67b..899105ab 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -110,7 +110,7 @@ @torch.no_grad() def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache): - total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) + total_latent_sections = (total_second_length * 24) / (latent_window_size * 4) total_latent_sections = int(max(round(total_latent_sections), 1)) job_id = generate_timestamp() @@ -239,7 +239,7 @@ def callback(d): current_step = d['i'] + 1 percentage = int(100.0 * current_step / steps) hint = f'Sampling {current_step}/{steps}' - desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...' + desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 24) :.2f} seconds (FPS-24). The video is being extended now ...' stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint)))) return @@ -299,7 +299,7 @@ def callback(d): output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4') - save_bcthw_as_mp4(history_pixels, output_filename, fps=30) + save_bcthw_as_mp4(history_pixels, output_filename, fps=24) print(f'Decoded. 
Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}') From d408b145f9dcc1992bee865b0c1493b8ee7b2ee9 Mon Sep 17 00:00:00 2001 From: Brandon Cook Date: Sat, 19 Apr 2025 11:22:04 +0900 Subject: [PATCH 4/6] Enhance demo_gradio.py with new arguments and improved memory handling - Added `--resolution_bucket` and `--fp32` arguments for customizable image processing and model precision. - Updated memory management for MPS devices to use recommended maximum memory. - Adjusted GPU memory preservation slider visibility based on high VRAM mode. --- demo_gradio.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/demo_gradio.py b/demo_gradio.py index 899105ab..a190dfa6 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -3,6 +3,7 @@ import os os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))) +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" import gradio as gr import torch @@ -33,18 +34,19 @@ parser.add_argument("--server", type=str, default='0.0.0.0') parser.add_argument("--port", type=int, default=7860) parser.add_argument("--output_dir", type=str, default='./outputs') +parser.add_argument("--resolution_bucket", type=int, default=416) +parser.add_argument("--fp32", action='store_true', default=False) args = parser.parse_args() print(args) if torch.cuda.is_available(): free_mem_gb = get_cuda_free_memory_gb(gpu) - high_vram = free_mem_gb > 40 - print(f'Free VRAM {free_mem_gb} GB') else: - # For MPS, we'll say high_vram is always True - high_vram = True + free_mem_gb = torch.mps.recommended_max_memory() / 1024 / 1024 / 1024 +high_vram = free_mem_gb > 40 +print(f'Free VRAM {free_mem_gb} GB') print(f'High-VRAM Mode: {high_vram}') text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() @@ -71,8 +73,9 @@ transformer.high_quality_fp32_output_for_inference = True 
print('transformer.high_quality_fp32_output_for_inference = True') -# For MPS, we need to use float32 instead of bfloat16 -if gpu.type == 'mps': +# For MPS, some processors like M1/M2 may need to use float32 +if args.fp32: + print('Using float32 for transformer and encoder models') transformer.to(dtype=torch.float32) vae.to(dtype=torch.float32) image_encoder.to(dtype=torch.float32) @@ -147,7 +150,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))) H, W, C = input_image.shape - height, width = find_nearest_bucket(H, W, resolution=480) + height, width = find_nearest_bucket(H, W, resolution=args.resolution_bucket) input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png')) @@ -387,7 +390,8 @@ def end_process(): gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.') rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change - gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.") + # This is only used when high_vram is False + gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. 
Larger value causes slower speed.", visible=not high_vram) with gr.Column(): preview_image = gr.Image(label="Next Latents", height=200, visible=False) From aca3cbc4d3ab0bf28366afba033506463601d59c Mon Sep 17 00:00:00 2001 From: Brandon Cook Date: Sat, 19 Apr 2025 11:38:28 +0900 Subject: [PATCH 5/6] Refactor resolution handling in demo_gradio.py and bucket_tools.py - Renamed `--resolution_bucket` argument to `--resolution` for clarity in demo_gradio.py. - Updated image processing logic to utilize the new resolution argument. - Simplified bucket_options in bucket_tools.py by removing resolution-specific keys, allowing for more flexible resolution handling. --- demo_gradio.py | 4 +- diffusers_helper/bucket_tools.py | 88 +++++++++----------------------- 2 files changed, 25 insertions(+), 67 deletions(-) diff --git a/demo_gradio.py b/demo_gradio.py index a190dfa6..a9a6c288 100644 --- a/demo_gradio.py +++ b/demo_gradio.py @@ -34,7 +34,7 @@ parser.add_argument("--server", type=str, default='0.0.0.0') parser.add_argument("--port", type=int, default=7860) parser.add_argument("--output_dir", type=str, default='./outputs') -parser.add_argument("--resolution_bucket", type=int, default=416) +parser.add_argument("--resolution", type=int, default=416) parser.add_argument("--fp32", action='store_true', default=False) args = parser.parse_args() @@ -150,7 +150,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))) H, W, C = input_image.shape - height, width = find_nearest_bucket(H, W, resolution=args.resolution_bucket) + height, width = find_nearest_bucket(H, W, resolution=args.resolution) input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png')) diff --git a/diffusers_helper/bucket_tools.py b/diffusers_helper/bucket_tools.py index 
4854a7e6..9c288228 100644 --- a/diffusers_helper/bucket_tools.py +++ b/diffusers_helper/bucket_tools.py @@ -1,78 +1,36 @@ bucket_options = { - 320: [ - (208, 480), - (224, 432), - (240, 416), - (256, 384), - (272, 352), - (288, 336), - (304, 320), - (320, 304), - (336, 288), - (352, 272), - (384, 256), - (416, 240), - (432, 224), - (480, 208), - ], - 416: [ - (272, 640), - (288, 560), - (304, 528), - (320, 480), - (352, 464), - (368, 432), - (384, 416), - (416, 384), - (432, 368), - (464, 352), - (480, 320), - (528, 304), - (560, 288), - (640, 272), - ], - 480: [ - (304, 704), - (320, 624), - (352, 624), - (368, 560), - (400, 528), - (416, 496), - (448, 480), - (480, 448), - (496, 416), - (528, 400), - (560, 368), - (624, 352), - (624, 320), - (704, 304), - ], - 640: [ - (416, 960), - (448, 864), - (480, 832), - (512, 768), - (544, 704), - (576, 672), - (608, 640), - (640, 608), - (672, 576), - (704, 544), - (768, 512), - (832, 480), - (864, 448), - (960, 416), - ], + (416, 960), + (448, 864), + (480, 832), + (512, 768), + (544, 704), + (576, 672), + (608, 640), + (640, 608), + (672, 576), + (704, 544), + (768, 512), + (832, 480), + (864, 448), + (960, 416), } def find_nearest_bucket(h, w, resolution=640): min_metric = float('inf') best_bucket = None - for (bucket_h, bucket_w) in bucket_options[resolution]: + for (bucket_h, bucket_w) in bucket_options: metric = abs(h * bucket_w - w * bucket_h) if metric <= min_metric: min_metric = metric best_bucket = (bucket_h, bucket_w) + + if resolution != 640: + scale_factor = resolution / 640.0 + scaled_height = round(best_bucket[0] * scale_factor / 16) * 16 + scaled_width = round(best_bucket[1] * scale_factor / 16) * 16 + best_bucket = (scaled_height, scaled_width) + print(f'Resolution: {best_bucket[1]} x {best_bucket[0]}') + return best_bucket From 305912c754760512006f73321c3df6ad2932b03b Mon Sep 17 00:00:00 2001 From: Brandon Cook Date: Sat, 19 Apr 2025 11:58:46 +0900 Subject: [PATCH 6/6] Update README.md to clarify 
installation instructions for NVIDIA and macOS - Renamed installation section for NVIDIA GPUs to improve clarity. - Added detailed installation steps for macOS, including Python 3.10 installation via Homebrew. - Included new arguments for starting FramePack on macOS, enhancing user guidance for configuration options. - Updated resolution handling information to reflect changes in default settings and recommendations for M1/M2 processors. --- README.md | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index fa1f423c..00b1db27 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ About speed, on my RTX 4090 desktop it generates at a speed of 2.5 seconds/frame In any case, you will directly see the generated frames since it is next-frame(-section) prediction. So you will get lots of visual feedback before the entire video is generated. -# Installation +# Installation (NVIDIA) For CUDA (NVIDIA GPUs): ```bash @@ -44,11 +44,13 @@ pip install torch torchvision torchaudio --index-url https://download.pytorch.or pip install -r requirements.txt ``` -For MPS (Apple Silicon Macs): -```bash -pip install torch torchvision torchaudio -pip install -r requirements.txt -``` +The software supports PyTorch attention, xformers, flash-attn, sage-attention. By default, it will just use PyTorch attention. You can install those attention kernels if you know how. + +For example, to install sage-attention (linux): + + pip install sageattention==1.0.6 + +However, you are highly recommended to first try without sage-attention since it will influence results, though the influence is minimal. To start the GUI, run: @@ -56,13 +58,38 @@ To start the GUI, run: Note that it supports `--share`, `--port`, `--server`, and so on. -The software supports PyTorch attention, xformers, flash-attn, sage-attention. By default, it will just use PyTorch attention. You can install those attention kernels if you know how. 
+# Installation (macOS) -For example, to install sage-attention (linux): +FramePack recommends using Python 3.10. If you have [Homebrew](https://brew.sh/) installed, you can install Python 3.10 using brew. +```bash +brew install python@3.10 +``` - pip install sageattention==1.0.6 +To install dependencies: +```bash +pip3.10 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu +pip3.10 install -r requirements.txt +``` -However, you are highly recommended to first try without sage-attention since it will influence results, though the influence is minimal. +# Starting FramePack on macOS +To start the GUI, run: + + python3.10 demo_gradio.py + +Some additional arguments you can specify are: +* `--share`: Enable sharing the Gradio interface via a public URL +* `--server`: Specify the server address (default: '0.0.0.0') +* `--port`: Specify the port number to run the server on (default: 7860) +* `--output_dir`: Set the directory for saving generated outputs (default: './outputs') +* `--resolution`: Set the target resolution for video generation (default: 416) +* `--fp32`: Use float32 precision instead of float16/bfloat16 (helpful for some M1/M2 processors) + +When running on M1 or M2 processors, you may need to pass `--fp32` as they may not fully support float16 and bfloat16. + +You can also adjust the resolution of the generated videos using `--resolution`. +In the original FramePack version, the resolution was set to 640. However, as of the time this documentation was written, MPS in PyTorch has issues with the large tensors needed to generate videos at that resolution. This version now defaults to 416, but slightly higher resolutions such as 480 may also work. Feel free to experiment with the setting. + +(NOTE: The 640 value means it will generate a video that is roughly 640x640 for a square image.
If the image has a different aspect ratio, the output dimensions are chosen to match that ratio while keeping roughly the same number of pixels.) # GUI