Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 39 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,60 @@ About speed, on my RTX 4090 desktop it generates at a speed of 2.5 seconds/frame

In any case, you will directly see the generated frames since it is next-frame(-section) prediction. So you will get lots of visual feedback before the entire video is generated.

# Installation
# Installation (NVIDIA)

**Windows**:
For CUDA (NVIDIA GPUs):
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip install -r requirements.txt
```

One-click-package will be released soon. Please come back tomorrow.
The software supports PyTorch attention, xformers, flash-attn, and sage-attention. By default, it uses PyTorch attention. You can install the other attention kernels if you know how.

**Linux**:
For example, to install sage-attention (linux):

We recommend having an independent Python 3.10.
pip install sageattention==1.0.6

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip install -r requirements.txt
However, it is highly recommended that you first try without sage-attention, since it will influence the results, though the influence is minimal.

To start the GUI, run:

python demo_gradio.py

Note that it supports `--share`, `--port`, `--server`, and so on.

The software supports PyTorch attention, xformers, flash-attn, sage-attention. By default, it will just use PyTorch attention. You can install those attention kernels if you know how.
# Installation (macOS)

For example, to install sage-attention (linux):
FramePack recommends using Python 3.10. If you have [Homebrew](https://brew.sh/) installed, you can use it to install Python 3.10:
```bash
brew install python@3.10
```

pip install sageattention==1.0.6
To install the dependencies, run:
```bash
pip3.10 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip3.10 install -r requirements.txt
```

However, you are highly recommended to first try without sage-attention since it will influence results, though the influence is minimal.
# Starting FramePack on macOS
To start the GUI, run:

python3.10 demo_gradio.py

Some additional arguments you can specify are:
* `--share`: Enable sharing the Gradio interface via a public URL
* `--server`: Specify the server address (default: '0.0.0.0')
* `--port`: Specify the port number to run the server on (default: 7860)
* `--output_dir`: Set the directory for saving generated outputs (default: './outputs')
* `--resolution`: Set the target resolution for video generation (default: 416)
* `--fp32`: Use float32 precision instead of float16/bfloat16 (helpful for some M1/M2 processors)

When running on M1 or M2 processors you may need to pass `--fp32` as they may not fully support float16 and bfloat16.

You can also adjust the resolution of the generated videos using `--resolution`.
In the original FramePack version, the resolution was fixed at 640. However, at the time this documentation was written, the MPS backend in PyTorch has issues with the large tensors needed to generate videos at that resolution. This version therefore defaults to 416, though slightly higher resolutions such as 480 may also work. Feel free to experiment with the setting.

(NOTE: The 640 value means it will generate a video that is roughly 640x640 for a square image. If the image has a different aspect ratio, the resolution will attempt to match the ratio and keep roughly the same number of pixels.)

# GUI

Expand Down
53 changes: 33 additions & 20 deletions demo_gradio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os

os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import gradio as gr
import torch
Expand Down Expand Up @@ -32,13 +33,19 @@
parser.add_argument('--share', action='store_true')
parser.add_argument("--server", type=str, default='0.0.0.0')
parser.add_argument("--port", type=int, default=7860)
parser.add_argument("--output_dir", type=str, default='./outputs')
parser.add_argument("--resolution", type=int, default=416)
parser.add_argument("--fp32", action='store_true', default=False)
args = parser.parse_args()

print(args)

free_mem_gb = get_cuda_free_memory_gb(gpu)
high_vram = free_mem_gb > 40
if torch.cuda.is_available():
free_mem_gb = get_cuda_free_memory_gb(gpu)
else:
free_mem_gb = torch.mps.recommended_max_memory() / 1024 / 1024 / 1024

high_vram = free_mem_gb > 40
print(f'Free VRAM {free_mem_gb} GB')
print(f'High-VRAM Mode: {high_vram}')

Expand Down Expand Up @@ -66,11 +73,20 @@
transformer.high_quality_fp32_output_for_inference = True
print('transformer.high_quality_fp32_output_for_inference = True')

transformer.to(dtype=torch.bfloat16)
vae.to(dtype=torch.float16)
image_encoder.to(dtype=torch.float16)
text_encoder.to(dtype=torch.float16)
text_encoder_2.to(dtype=torch.float16)
# For MPS, some processors like M1/M2 may need to use float32
if args.fp32:
print('Using float32 for transformer and encoder models')
transformer.to(dtype=torch.float32)
vae.to(dtype=torch.float32)
image_encoder.to(dtype=torch.float32)
text_encoder.to(dtype=torch.float32)
text_encoder_2.to(dtype=torch.float32)
else:
transformer.to(dtype=torch.bfloat16)
vae.to(dtype=torch.float16)
image_encoder.to(dtype=torch.float16)
text_encoder.to(dtype=torch.float16)
text_encoder_2.to(dtype=torch.float16)

vae.requires_grad_(False)
text_encoder.requires_grad_(False)
Expand All @@ -91,13 +107,13 @@

stream = AsyncStream()

outputs_folder = './outputs/'
outputs_folder = args.output_dir
os.makedirs(outputs_folder, exist_ok=True)


@torch.no_grad()
def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
total_latent_sections = (total_second_length * 24) / (latent_window_size * 4)
total_latent_sections = int(max(round(total_latent_sections), 1))

job_id = generate_timestamp()
Expand All @@ -116,7 +132,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

if not high_vram:
fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
fake_diffusers_current_device(text_encoder, gpu)
load_model_as_complete(text_encoder_2, target_device=gpu)

llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
Expand All @@ -134,7 +150,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

H, W, C = input_image.shape
height, width = find_nearest_bucket(H, W, resolution=640)
height, width = find_nearest_bucket(H, W, resolution=args.resolution)
input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
Expand Down Expand Up @@ -183,10 +199,6 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
latent_paddings = reversed(range(total_latent_sections))

if total_latent_sections > 4:
# In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
# items looks better than expanding it when total_latent_sections > 4
# One can try to remove below trick and just
# use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]

for latent_padding in latent_paddings:
Expand Down Expand Up @@ -230,7 +242,7 @@ def callback(d):
current_step = d['i'] + 1
percentage = int(100.0 * current_step / steps)
hint = f'Sampling {current_step}/{steps}'
desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 24) :.2f} seconds (FPS-24). The video is being extended now ...'
stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
return

Expand All @@ -243,7 +255,6 @@ def callback(d):
real_guidance_scale=cfg,
distilled_guidance_scale=gs,
guidance_rescale=rs,
# shift=3.0,
num_inference_steps=steps,
generator=rnd,
prompt_embeds=llama_vec,
Expand All @@ -253,7 +264,7 @@ def callback(d):
negative_prompt_embeds_mask=llama_attention_mask_n,
negative_prompt_poolers=clip_l_pooler_n,
device=gpu,
dtype=torch.bfloat16,
dtype=transformer.dtype,
image_embeddings=image_encoder_last_hidden_state,
latent_indices=latent_indices,
clean_latents=clean_latents,
Expand Down Expand Up @@ -291,7 +302,7 @@ def callback(d):

output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

save_bcthw_as_mp4(history_pixels, output_filename, fps=30)
save_bcthw_as_mp4(history_pixels, output_filename, fps=24)

print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

Expand Down Expand Up @@ -379,7 +390,8 @@ def end_process():
gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change

gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
# This is only used when high_vram is False
gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.", visible=not high_vram)

with gr.Column():
preview_image = gr.Image(label="Next Latents", height=200, visible=False)
Expand All @@ -396,4 +408,5 @@ def end_process():
server_name=args.server,
server_port=args.port,
share=args.share,
allowed_paths=[outputs_folder],
)
40 changes: 23 additions & 17 deletions diffusers_helper/bucket_tools.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,36 @@
# Supported (height, width) buckets, expressed at the 640 base resolution and
# listed from the widest landscape shape to the tallest portrait shape.
# This is an ordered tuple rather than a set: find_nearest_bucket resolves
# ties in favour of the later entry, so the iteration order must be the
# declaration order.
bucket_options = (
    (416, 960),
    (448, 864),
    (480, 832),
    (512, 768),
    (544, 704),
    (576, 672),
    (608, 640),
    (640, 608),
    (672, 576),
    (704, 544),
    (768, 512),
    (832, 480),
    (864, 448),
    (960, 416),
)


def find_nearest_bucket(h, w, resolution=640):
    """Return the (height, width) bucket closest in aspect ratio to ``h`` x ``w``.

    The best match is chosen from ``bucket_options`` (defined at the 640 base
    resolution). When ``resolution`` differs from 640, both sides of the chosen
    bucket are scaled by ``resolution / 640`` and rounded to the nearest
    multiple of 16, and the resulting size is printed.

    Args:
        h: Source image height.
        w: Source image width.
        resolution: Target resolution; 640 returns the table entry unchanged.

    Returns:
        A ``(height, width)`` tuple.

    Raises:
        ValueError: If ``resolution`` is not positive.
    """
    if resolution <= 0:
        raise ValueError(f'resolution must be positive, got {resolution}')

    min_metric = float('inf')
    best_bucket = None
    for bucket_h, bucket_w in bucket_options:
        # Cross-multiplied aspect-ratio difference: zero exactly when
        # h / w == bucket_h / bucket_w, and no division is needed.
        metric = abs(h * bucket_w - w * bucket_h)
        # `<=` (not `<`) means the later bucket wins a tie, e.g. a square
        # input resolves to (640, 608) rather than (608, 640).
        if metric <= min_metric:
            min_metric = metric
            best_bucket = (bucket_h, bucket_w)

    if resolution != 640:
        scale_factor = resolution / 640.0
        # Snap each side to a multiple of 16, never letting a very small
        # resolution collapse a side to zero.
        scaled_height = max(16, round(best_bucket[0] * scale_factor / 16) * 16)
        scaled_width = max(16, round(best_bucket[1] * scale_factor / 16) * 16)
        best_bucket = (scaled_height, scaled_width)
        print(f'Resolution: {best_bucket[1]} x {best_bucket[0]}')

    return best_bucket

69 changes: 46 additions & 23 deletions diffusers_helper/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@
import torch


# Detect available devices
cpu = torch.device('cpu')
gpu = torch.device(f'cuda:{torch.cuda.current_device()}')
if torch.cuda.is_available():
gpu = torch.device(f'cuda:{torch.cuda.current_device()}')
elif torch.backends.mps.is_available():
gpu = torch.device('mps')
else:
raise RuntimeError("No GPU device available. Please use a system with CUDA or MPS support.")
gpu_complete_modules = []


Expand Down Expand Up @@ -72,44 +78,60 @@ def get_cuda_free_memory_gb(device=None):
if device is None:
device = gpu

memory_stats = torch.cuda.memory_stats(device)
bytes_active = memory_stats['active_bytes.all.current']
bytes_reserved = memory_stats['reserved_bytes.all.current']
bytes_free_cuda, _ = torch.cuda.mem_get_info(device)
bytes_inactive_reserved = bytes_reserved - bytes_active
bytes_total_available = bytes_free_cuda + bytes_inactive_reserved
return bytes_total_available / (1024 ** 3)
if device.type == 'cuda':
memory_stats = torch.cuda.memory_stats(device)
bytes_active = memory_stats['active_bytes.all.current']
bytes_reserved = memory_stats['reserved_bytes.all.current']
bytes_free_cuda, _ = torch.cuda.mem_get_info(device)
bytes_inactive_reserved = bytes_reserved - bytes_active
bytes_total_available = bytes_free_cuda + bytes_inactive_reserved
return bytes_total_available / (1024 ** 3)
elif device.type == 'mps':
# MPS doesn't provide detailed memory stats, return a fixed value
return 16.0 # Assuming 16GB available for MPS
else:
return 0.0


def move_model_to_device_with_memory_preservation(model, target_device, preserved_memory_gb=0):
    """Move ``model`` onto ``target_device``, stopping early on CUDA when memory runs low.

    On a CUDA target, submodules are moved one at a time. Before each one, the
    free memory reported by ``get_cuda_free_memory_gb`` is compared against
    ``preserved_memory_gb``; once free memory is at or below that amount the
    function returns, leaving the remaining submodules where they were.
    On any other target (for example MPS) the whole model is moved in a single
    call and ``preserved_memory_gb`` is not consulted.

    Args:
        model: Module to move; must provide ``modules()`` and ``to()``.
        target_device: Destination device; its ``type`` attribute selects the path.
        preserved_memory_gb: Free memory, in GB, to keep available on a CUDA target.

    Returns:
        None.
    """
    print(f'Moving {model.__class__.__name__} to {target_device} with preserved memory: {preserved_memory_gb} GB')

    if target_device.type != 'cuda':
        # No per-module budget check outside CUDA: move everything at once.
        model.to(device=target_device)
        return

    for m in model.modules():
        if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb:
            torch.cuda.empty_cache()
            return

        if hasattr(m, 'weight'):
            m.to(device=target_device)

    # Pick up whatever the per-module pass did not move (modules without a
    # `weight` attribute).
    model.to(device=target_device)
    torch.cuda.empty_cache()
    return


def offload_model_from_device_for_memory_preservation(model, target_device, preserved_memory_gb=0):
    """Offload ``model`` to the CPU until ``target_device`` has enough free memory.

    On a CUDA device, submodules are moved to the CPU one at a time. Before
    each one, the free memory reported by ``get_cuda_free_memory_gb`` is
    compared against ``preserved_memory_gb``; once free memory is at or above
    that amount the function returns, leaving the remaining submodules on the
    device. On any other device (for example MPS) the whole model is moved to
    the CPU in a single call and ``preserved_memory_gb`` is not consulted.

    Args:
        model: Module to offload; must provide ``modules()`` and ``to()``.
        target_device: Device being freed; its ``type`` attribute selects the path.
        preserved_memory_gb: Free memory, in GB, that should be available on a
            CUDA device before offloading stops.

    Returns:
        None.
    """
    print(f'Offloading {model.__class__.__name__} from {target_device} to preserve memory: {preserved_memory_gb} GB')

    if target_device.type != 'cuda':
        # No per-module budget check outside CUDA: offload everything at once.
        model.to(device=cpu)
        return

    for m in model.modules():
        if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb:
            torch.cuda.empty_cache()
            return

        if hasattr(m, 'weight'):
            m.to(device=cpu)

    # Pick up whatever the per-module pass did not move (modules without a
    # `weight` attribute).
    model.to(device=cpu)
    torch.cuda.empty_cache()
    return


Expand All @@ -119,7 +141,8 @@ def unload_complete_models(*args):
print(f'Unloaded {m.__class__.__name__} as complete.')

gpu_complete_modules.clear()
torch.cuda.empty_cache()
if gpu.type == 'cuda':
torch.cuda.empty_cache()
return


Expand Down
4 changes: 3 additions & 1 deletion diffusers_helper/models/hunyuan_video_packed.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ def get_cu_seqlens(text_mask, img_len):
text_len = text_mask.sum(dim=1)
max_len = text_mask.shape[1] + img_len

cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
# Use the same device as the input tensor
device = text_mask.device
cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=device)

for i in range(batch_size):
s = text_len[i] + img_len
Expand Down