From 0037a21835fb67071d0b785480d620e014822c81 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 28 Jun 2026 19:29:09 -0400 Subject: [PATCH] feat(multi-gpu): offload text encoders to idle GPUs Adds `offload_text_encoders_to_idle_gpus` (default on): when more than one generation device is configured and a GPU is idle, a session's text/prompt encoder runs on the idle GPU instead of the one running its denoise pipeline. This avoids evicting the denoise model from VRAM to make room for the encoder, and lets a cached encoder be reused across generations. Under full load (no idle GPU) behavior is unchanged. Mechanism: - New GENERATION_DEVICE_POOL arbiter (backend/util/device_pool.py) with a per-device exclusive-use lock. A native session blocking-acquires its own device's lock for the whole run; an encoder node try-borrows an idle device's lock for the duration of the node. This makes a borrowed encoder and a native session mutually exclusive on a GPU -- preventing the shared-encoder corruption that produced garbled images -- and is deadlock-free (borrows are non-blocking; a session only ever blocks on its own device). - DefaultSessionRunner re-pins the worker thread to the borrowed device for the whole encoder node; conditioning is stored on the CPU and the denoiser picks it up on its own GPU afterward. - Nodes opt in via @invocation(idle_gpu_offloadable=True), mirroring the existing `bottleneck` ClassVar marker. Applied to the text/prompt encoder nodes (compel + sdxl/refiner, flux, sd3, qwen-image, anima, cogview4, flux2 klein, z-image, flux_redux). Inspired by #9310; supersedes it. Tests: device-pool lock semantics, two concurrency regression tests asserting a session and a borrow never use a GPU at the same time, the runner offload context-manager behavior, and a marker-wiring check. Docs: invokeai-yaml.mdx (config setting) and creating-nodes.mdx (how to support the feature in a node). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../docs/configuration/invokeai-yaml.mdx | 21 ++ .../development/Guides/creating-nodes.mdx | 33 +++ docs/src/generated/settings.json | 11 + .../app/invocations/anima_text_encoder.py | 1 + invokeai/app/invocations/baseinvocation.py | 10 + .../app/invocations/cogview4_text_encoder.py | 1 + invokeai/app/invocations/compel.py | 3 + .../invocations/flux2_klein_text_encoder.py | 1 + invokeai/app/invocations/flux_redux.py | 1 + invokeai/app/invocations/flux_text_encoder.py | 1 + .../invocations/qwen_image_text_encoder.py | 1 + invokeai/app/invocations/sd3_text_encoder.py | 1 + .../app/invocations/z_image_text_encoder.py | 1 + .../app/services/config/config_default.py | 1 + .../session_processor_default.py | 65 +++++- invokeai/backend/util/device_pool.py | 119 +++++++++++ invokeai/frontend/web/openapi.json | 6 + .../frontend/web/src/services/api/schema.ts | 6 + .../session_processor/test_encoder_offload.py | 201 ++++++++++++++++++ tests/backend/util/test_device_pool.py | 158 ++++++++++++++ 20 files changed, 636 insertions(+), 6 deletions(-) create mode 100644 invokeai/backend/util/device_pool.py create mode 100644 tests/app/services/session_processor/test_encoder_offload.py create mode 100644 tests/backend/util/test_device_pool.py diff --git a/docs/src/content/docs/configuration/invokeai-yaml.mdx b/docs/src/content/docs/configuration/invokeai-yaml.mdx index 6ac56053928..1c79fbf82aa 100644 --- a/docs/src/content/docs/configuration/invokeai-yaml.mdx +++ b/docs/src/content/docs/configuration/invokeai-yaml.mdx @@ -147,6 +147,27 @@ Notes: During parallel generation, the progress display shows one progress bar per active session, stacked vertically, each disappearing as its session completes. 
+#### Text Encoder Offload to Idle GPUs + +When more than one GPU is configured for generation but not all of them are busy, InvokeAI can run a session's text/prompt encoder on a currently-idle GPU instead of the GPU running its denoise pipeline. This avoids evicting the denoise model from VRAM just to make room for the encoder, and lets the cached encoder be reused across generations — making repeated generations noticeably smoother. + +This is controlled by the `offload_text_encoders_to_idle_gpus` setting: + +```yaml +offload_text_encoders_to_idle_gpus: true # default value +``` + +| Value | Behavior | +| ------- | ---------------------------------------------------------------------------------------------------------------- | +| `true` | Run text encoders on an idle GPU when one is available. This is the default. | +| `false` | Always run text encoders on the same GPU as the rest of the pipeline (the behavior before this feature existed). | + +Notes: + +- This has no effect unless at least two `generation_devices` are configured. On a single device — or when every GPU is already busy with its own session — encoders run on the session's own GPU, exactly as if the setting were `false`. +- It is purely a placement optimization and does not change generated images. +- A borrowed GPU is used exclusively for the encoder while it runs, so it never interferes with a generation session running on that same GPU. + #### Image Subfolder Strategy By default, generated images are stored in a single flat directory under `outputs/images/`. The `image_subfolder_strategy` setting lets you organize newly-created images into subfolders automatically. You can edit this setting in `invokeai.yaml` or, as an admin user, in the Settings panel. 
diff --git a/docs/src/content/docs/development/Guides/creating-nodes.mdx b/docs/src/content/docs/development/Guides/creating-nodes.mdx index f2dbee639bc..abc905f6e6a 100644 --- a/docs/src/content/docs/development/Guides/creating-nodes.mdx +++ b/docs/src/content/docs/development/Guides/creating-nodes.mdx @@ -21,6 +21,39 @@ import { Steps, LinkCard } from '@astrojs/starlight/components'; 4. A maintainer will review the pull request and node. If the node is aligned with the direction of the project, you may be asked for permission to include it in the core project. +### Supporting multi-GPU text-encoder offload + +On a machine with more than one GPU, InvokeAI can run several generation sessions at once — one per GPU. When fewer sessions are running than there are GPUs, the spare GPUs sit idle. To put that capacity to use, InvokeAI can run a session's **prompt/text encoder** on a currently-idle GPU instead of on the GPU running the denoise pipeline. This avoids evicting the denoise model from VRAM just to make room for the encoder, and lets the cached encoder be reused across generations. + +This is controlled globally by the `offload_text_encoders_to_idle_gpus` config setting (enabled by default) and opted into **per node** via the `@invocation` decorator: + +```python +from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation + + +@invocation( + "my_text_encoder", + title="Prompt - My Model", + category="conditioning", + version="1.0.0", + idle_gpu_offloadable=True, # opt in to idle-GPU offload +) +class MyTextEncoderInvocation(BaseInvocation): + ... +``` + +When the feature is enabled and an idle GPU is available, the **entire node** is temporarily re-pinned to a borrowed idle GPU: any model it loads goes onto that GPU and runs there. If no idle GPU is free (e.g. every GPU is busy with its own session), the node simply runs on its own GPU, unchanged. 
The borrow holds the idle GPU exclusively for the duration of the node, so the borrowed encoder can never run concurrently with a native session on that same GPU. + +Because the whole node is moved to another device, only mark a node `idle_gpu_offloadable=True` if **all** of the following hold: + +- **It is encoder-only.** Its sole GPU work is loading one or more encoder models and running their forward pass. It must not load or run the denoise/transformer or VAE, or do any other work tied to the session's own GPU. +- **It stores its result on the CPU before returning.** Move output tensors to the CPU (`tensor.detach().to("cpu")`) and save them as conditioning/tensors. The denoiser picks them up and moves them onto its own GPU later — this is what makes the cross-GPU handoff safe and device-agnostic. +- **It places inputs on the loaded model's device, not a fixed device.** Resolve the device at run time — from the model you just loaded (e.g. `get_effective_device(model)` from `invokeai.backend.model_manager.load.model_cache.utils`) or via `TorchDevice.choose_torch_device()` — rather than hard-coding `cuda:0`. The built-in `flux_text_encoder` and `compel` nodes are good references. + +:::caution[Only mark encoder-only nodes] +If a node that also runs the denoiser, VAE, or other session-GPU work is marked `idle_gpu_offloadable=True`, that work will be re-pinned to the wrong GPU and can misplace tensors or raise device-mismatch errors. When in doubt, leave it unset (the default is `False`) — the node will still work correctly, just without the offload optimization. 
+::: + ### Community Node Template Append the following template to your pull request and the [Community Nodes](../../../workflows/community-nodes) page when submitting a node to be added to the community nodes list: diff --git a/docs/src/generated/settings.json b/docs/src/generated/settings.json index 1987a90abce..d2d62d04f57 100644 --- a/docs/src/generated/settings.json +++ b/docs/src/generated/settings.json @@ -501,6 +501,17 @@ "type": "typing.Union[typing.Literal['auto'], list[str]]", "validation": {} }, + { + "category": "DEVICE", + "default": true, + "description": "When running on multiple GPUs, load text encoders onto a currently-idle GPU instead of the one running the denoise pipeline. This avoids churning the denoise model in and out of VRAM to make room for the encoder, and lets a cached encoder be reused across generations. Has no effect unless at least two `generation_devices` are configured and a GPU is idle; under full load encoders run on the session's own GPU as before.", + "env_var": "INVOKEAI_OFFLOAD_TEXT_ENCODERS_TO_IDLE_GPUS", + "literal_values": [], + "name": "offload_text_encoders_to_idle_gpus", + "required": false, + "type": "<class 'bool'>", + "validation": {} + }, { "category": "DEVICE", "default": "auto", diff --git a/invokeai/app/invocations/anima_text_encoder.py b/invokeai/app/invocations/anima_text_encoder.py index f1d4fbff8f1..c9bad65f3d0 100644 --- a/invokeai/app/invocations/anima_text_encoder.py +++ b/invokeai/app/invocations/anima_text_encoder.py @@ -59,6 +59,7 @@ category="conditioning", version="1.4.0", classification=Classification.Prototype, + idle_gpu_offloadable=True, ) class AnimaTextEncoderInvocation(BaseInvocation): """Encodes and preps a prompt for an Anima image. 
diff --git a/invokeai/app/invocations/baseinvocation.py b/invokeai/app/invocations/baseinvocation.py index 0546dabebb5..95cac4065a3 100644 --- a/invokeai/app/invocations/baseinvocation.py +++ b/invokeai/app/invocations/baseinvocation.py @@ -271,6 +271,12 @@ def invoke_internal(self, context: InvocationContext, services: "InvocationServi bottleneck: ClassVar[Bottleneck] + idle_gpu_offloadable: ClassVar[bool] = False + """Whether this node's entire execution may be temporarily re-pinned to an idle GPU when + `offload_text_encoders_to_idle_gpus` is enabled in multi-GPU mode. Only set this to True on nodes + that exclusively load encoder model(s), run a forward pass, and store their result on the CPU — + i.e. nodes that do no work tied to the session's own GPU. Set via the `@invocation` decorator.""" + UIConfig: ClassVar[UIConfigBase] model_config = ConfigDict( @@ -459,6 +465,7 @@ def get_output_for_type(cls, output_type: str) -> type[BaseInvocationOutput] | N "type", "workflow", "bottleneck", + "idle_gpu_offloadable", } RESERVED_INPUT_FIELD_NAMES = {"metadata", "board"} @@ -643,6 +650,7 @@ def invocation( use_cache: Optional[bool] = True, classification: Classification = Classification.Stable, bottleneck: Bottleneck = Bottleneck.GPU, + idle_gpu_offloadable: bool = False, ) -> Callable[[Type[TBaseInvocation]], Type[TBaseInvocation]]: """ Registers an invocation. @@ -655,6 +663,7 @@ def invocation( :param Optional[bool] use_cache: Whether or not to use the invocation cache. Defaults to True. The user may override this in the workflow editor. :param Classification classification: The classification of the invocation. Defaults to FeatureClassification.Stable. Use Beta or Prototype if the invocation is unstable. :param Bottleneck bottleneck: The bottleneck of the invocation. Defaults to Bottleneck.GPU. Use Network if the invocation is network-bound. 
+ :param bool idle_gpu_offloadable: Whether this node's whole execution may run on a borrowed idle GPU when `offload_text_encoders_to_idle_gpus` is enabled. Only set True for encoder-only nodes that store their result on the CPU and do no work on the session's own GPU. Defaults to False. """ def wrapper(cls: Type[TBaseInvocation]) -> Type[TBaseInvocation]: @@ -712,6 +721,7 @@ def wrapper(cls: Type[TBaseInvocation]) -> Type[TBaseInvocation]: cls.model_fields["use_cache"].default = use_cache cls.bottleneck = bottleneck + cls.idle_gpu_offloadable = idle_gpu_offloadable # Add the invocation type to the model. diff --git a/invokeai/app/invocations/cogview4_text_encoder.py b/invokeai/app/invocations/cogview4_text_encoder.py index 13234889fba..c303e55b828 100644 --- a/invokeai/app/invocations/cogview4_text_encoder.py +++ b/invokeai/app/invocations/cogview4_text_encoder.py @@ -23,6 +23,7 @@ category="prompt", version="1.0.0", classification=Classification.Prototype, + idle_gpu_offloadable=True, ) class CogView4TextEncoderInvocation(BaseInvocation): """Encodes and preps a prompt for a cogview4 image.""" diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py index 99373531d8e..428f72d3964 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -45,6 +45,7 @@ tags=["prompt", "compel"], category="prompt", version="1.2.1", + idle_gpu_offloadable=True, ) class CompelInvocation(BaseInvocation): """Parse prompt using compel package to conditioning.""" @@ -250,6 +251,7 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]: tags=["sdxl", "compel", "prompt"], category="prompt", version="1.2.1", + idle_gpu_offloadable=True, ) class SDXLCompelPromptInvocation(BaseInvocation, SDXLPromptInvocationBase): """Parse prompt using compel package to conditioning.""" @@ -344,6 +346,7 @@ def invoke(self, context: InvocationContext) -> ConditioningOutput: tags=["sdxl", "compel", "prompt"], category="prompt", 
version="1.1.2", + idle_gpu_offloadable=True, ) class SDXLRefinerCompelPromptInvocation(BaseInvocation, SDXLPromptInvocationBase): """Parse prompt using compel package to conditioning.""" diff --git a/invokeai/app/invocations/flux2_klein_text_encoder.py b/invokeai/app/invocations/flux2_klein_text_encoder.py index b2728d1d7cc..2b4b53faf72 100644 --- a/invokeai/app/invocations/flux2_klein_text_encoder.py +++ b/invokeai/app/invocations/flux2_klein_text_encoder.py @@ -48,6 +48,7 @@ category="prompt", version="1.1.1", classification=Classification.Prototype, + idle_gpu_offloadable=True, ) class Flux2KleinTextEncoderInvocation(BaseInvocation): """Encodes and preps a prompt for Flux2 Klein image generation. diff --git a/invokeai/app/invocations/flux_redux.py b/invokeai/app/invocations/flux_redux.py index b68e9911c56..ac1f5764d78 100644 --- a/invokeai/app/invocations/flux_redux.py +++ b/invokeai/app/invocations/flux_redux.py @@ -50,6 +50,7 @@ class FluxReduxOutput(BaseInvocationOutput): category="conditioning", version="2.1.0", classification=Classification.Beta, + idle_gpu_offloadable=True, ) class FluxReduxInvocation(BaseInvocation): """Runs a FLUX Redux model to generate a conditioning tensor.""" diff --git a/invokeai/app/invocations/flux_text_encoder.py b/invokeai/app/invocations/flux_text_encoder.py index 8b3b33fad1c..e3f28e57d72 100644 --- a/invokeai/app/invocations/flux_text_encoder.py +++ b/invokeai/app/invocations/flux_text_encoder.py @@ -30,6 +30,7 @@ tags=["prompt", "conditioning", "flux"], category="prompt", version="1.1.2", + idle_gpu_offloadable=True, ) class FluxTextEncoderInvocation(BaseInvocation): """Encodes and preps a prompt for a flux image.""" diff --git a/invokeai/app/invocations/qwen_image_text_encoder.py b/invokeai/app/invocations/qwen_image_text_encoder.py index d2aecd9f226..9d9347a8cf9 100644 --- a/invokeai/app/invocations/qwen_image_text_encoder.py +++ b/invokeai/app/invocations/qwen_image_text_encoder.py @@ -68,6 +68,7 @@ def 
_build_prompt(user_prompt: str, num_images: int) -> str: category="conditioning", version="1.2.0", classification=Classification.Prototype, + idle_gpu_offloadable=True, ) class QwenImageTextEncoderInvocation(BaseInvocation): """Encodes text and reference images for Qwen Image using Qwen2.5-VL.""" diff --git a/invokeai/app/invocations/sd3_text_encoder.py b/invokeai/app/invocations/sd3_text_encoder.py index 7af138fe45e..d9f5c3f1f15 100644 --- a/invokeai/app/invocations/sd3_text_encoder.py +++ b/invokeai/app/invocations/sd3_text_encoder.py @@ -33,6 +33,7 @@ tags=["prompt", "conditioning", "sd3"], category="prompt", version="1.0.1", + idle_gpu_offloadable=True, ) class Sd3TextEncoderInvocation(BaseInvocation): """Encodes and preps a prompt for a SD3 image.""" diff --git a/invokeai/app/invocations/z_image_text_encoder.py b/invokeai/app/invocations/z_image_text_encoder.py index 71af6085d0e..148cff5c269 100644 --- a/invokeai/app/invocations/z_image_text_encoder.py +++ b/invokeai/app/invocations/z_image_text_encoder.py @@ -37,6 +37,7 @@ category="prompt", version="1.1.0", classification=Classification.Prototype, + idle_gpu_offloadable=True, ) class ZImageTextEncoderInvocation(BaseInvocation): """Encodes and preps a prompt for a Z-Image image. diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 8c07c2139f4..d5c8a9634a5 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -206,6 +206,7 @@ class InvokeAIAppConfig(BaseSettings): # DEVICE device: str = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)", pattern=r"^(auto|cpu|mps|cuda(:\d+)?)$") generation_devices: Union[Literal["auto"], list[str]] = Field(default="auto", description="Devices to use for parallel generation. `auto` (the default) uses every available GPU, running one generation session per GPU concurrently and distributing jobs fairly across users. Provide an explicit list (e.g. `[cuda:0, cuda:1]`) to use specific devices, or a single-device list (e.g. `[cuda:0]`) to run serially. On systems without a GPU, `auto` resolves to the single `cpu`/`mps` device.
Valid values: `auto`, or a list whose entries are each `cpu`, `cuda`, `mps`, or `cuda:N` (where N is a device number)") + offload_text_encoders_to_idle_gpus: bool = Field(default=True, description="When running on multiple GPUs, load text encoders onto a currently-idle GPU instead of the one running the denoise pipeline. This avoids churning the denoise model in and out of VRAM to make room for the encoder, and lets a cached encoder be reused across generations. Has no effect unless at least two `generation_devices` are configured and a GPU is idle; under full load encoders run on the session's own GPU as before.") precision: PRECISION = Field(default="auto", description="Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.") # GENERATION diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index 93c4554b1fe..e776dc79614 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -1,9 +1,9 @@ import gc import traceback -from contextlib import suppress +from contextlib import contextmanager, suppress from threading import BoundedSemaphore, Thread from threading import Event as ThreadEvent -from typing import Optional +from typing import Iterator, Optional import torch @@ -33,6 +33,7 @@ from invokeai.app.services.shared.graph import NodeInputError from invokeai.app.services.shared.invocation_context import InvocationContextData, build_invocation_context from invokeai.app.util.profiler import Profiler +from invokeai.backend.util.device_pool import GENERATION_DEVICE_POOL from invokeai.backend.util.devices import TorchDevice @@ -129,8 +130,9 @@ def run_node(self, invocation: BaseInvocation, queue_item: 
SessionQueueItem): is_canceled=self._is_canceled, ) - # Invoke the node - output = invocation.invoke_internal(context=context, services=self._services) + # Invoke the node, optionally on a borrowed idle GPU (text encoders only). + with self._maybe_offload_to_idle_gpu(invocation): + output = invocation.invoke_internal(context=context, services=self._services) # Save output and history queue_item.session.complete(invocation.id, output) @@ -156,6 +158,45 @@ def run_node(self, invocation: BaseInvocation, queue_item: SessionQueueItem): error_traceback=error_traceback, ) + @contextmanager + def _maybe_offload_to_idle_gpu(self, invocation: BaseInvocation) -> Iterator[None]: + """Temporarily re-pin this worker thread to an idle GPU for a text-encoder node. + + When ``offload_text_encoders_to_idle_gpus`` is enabled and an idle generation GPU can be + borrowed, the encoder model loads into that GPU's cache and its forward runs there (all + device-selecting code resolves to the pinned device), keeping the busy GPU's denoise model + resident. The conditioning output is stored on the CPU, so the denoiser picks it up on the + worker's own GPU after the pin is restored. + + The borrow holds the idle device's exclusive-use lock for the whole node, so a native + session on that GPU can never run concurrently against the same cached encoder (which would + corrupt it). If no idle GPU is free, the node runs on the worker's own GPU unchanged. + """ + native_device = TorchDevice.get_session_device() + if ( + native_device is None + or native_device.type != "cuda" + or not invocation.idle_gpu_offloadable + or not self._services.configuration.offload_text_encoders_to_idle_gpus + ): + yield + return + + borrowed_device = GENERATION_DEVICE_POOL.try_borrow(exclude=native_device) + if borrowed_device is None: + yield + return + + self._services.logger.debug( + f"Running {invocation.get_type()} on idle device {borrowed_device} (session device {native_device})." 
+ ) + TorchDevice.set_session_device(borrowed_device) + try: + yield + finally: + TorchDevice.set_session_device(native_device) + GENERATION_DEVICE_POOL.release_borrow(borrowed_device) + def _on_before_run_session(self, queue_item: SessionQueueItem) -> None: """Called before a session is run. @@ -388,6 +429,10 @@ def start(self, invoker: Invoker) -> None: devices = self._resolve_devices() + # Register the generation devices so the model loader can discover idle GPUs to host text + # encoders on (see offload_text_encoders_to_idle_gpus). None means legacy single-device mode. + GENERATION_DEVICE_POOL.set_generation_devices([d for d in devices if d is not None]) + # If profiling is enabled, create a profiler. The same profiler will be used for all sessions. Internally, # the profiler will create a new profile for each session. Profiling uses a process-global cProfile, which # cannot cleanly attribute work when multiple sessions run concurrently, so it is disabled in multi-GPU mode. @@ -582,8 +627,16 @@ def _process( f"on {worker.label}" ) - # Run the graph - worker.runner.run(queue_item=worker.queue_item) + # Run the graph. Hold this GPU's exclusive-use lock for the whole session so no + # other worker can borrow it for text-encoder offload while we're running on it + # (a borrow + concurrent native session on one GPU would corrupt the shared + # cached encoder). Acquired here, after dequeue, so an idle worker doesn't hold + # the lock and block borrows while waiting for work. 
+ GENERATION_DEVICE_POOL.acquire_session(worker.device) + try: + worker.runner.run(queue_item=worker.queue_item) + finally: + GENERATION_DEVICE_POOL.release_session(worker.device) except Exception as e: error_type = e.__class__.__name__ diff --git a/invokeai/backend/util/device_pool.py b/invokeai/backend/util/device_pool.py new file mode 100644 index 00000000000..1e6675161a6 --- /dev/null +++ b/invokeai/backend/util/device_pool.py @@ -0,0 +1,119 @@ +"""Process-global arbiter that lends idle generation GPUs for text-encoder offload. + +In multi-GPU mode (see ``generation_devices``) the session processor runs one generation worker +per GPU. When fewer sessions are running than there are GPUs, some GPUs sit idle. This arbiter lets +a busy worker temporarily *borrow* an idle GPU to host a text encoder, instead of churning the busy +GPU's denoise model in and out of VRAM. + +Correctness hinges on one rule: **a borrowed GPU must never run an encoder at the same time as a +native generation session on that same GPU.** They share that device's single ``ModelCache``, and a +model's forward pass (including in-place LoRA patching) runs with no cache lock held — so two +threads touching the same cached encoder concurrently corrupts it (garbled output). + +To enforce the rule, each generation device has one lock used for *both* roles: + +- A native session holds its device's lock for the entire run (blocking acquire). +- A borrower *try*-acquires another device's lock for the duration of one encoder node; if the lock + is already held (that GPU is running, or just started, a session) the borrow simply fails and the + encoder runs on the worker's own GPU instead. + +Because borrows are non-blocking try-acquires and a session only ever blocking-acquires its *own* +device lock, there is no lock-ordering cycle — the design is deadlock-free. 
The only cost is that, +in the startup race where a borrow wins the lock a moment before the lent GPU's own session starts, +that session waits out the (short) encoder node before beginning. +""" + +import threading +from typing import Optional + +import torch + +from invokeai.backend.util.devices import TorchDevice + + +class _GenerationDevicePool: + """Arbitrates exclusive use of each generation device between native sessions and borrowers.""" + + def __init__(self) -> None: + self._registry_lock = threading.Lock() + # Registration order is preserved so borrow selection is deterministic (and therefore sticky + # across repeated single-session generations, letting a cached encoder be reused). Maps + # normalized device string -> that device's exclusive-use lock. + self._device_locks: dict[str, threading.Lock] = {} + self._order: list[str] = [] + + def set_generation_devices(self, devices: list[torch.device]) -> None: + """Register the full set of generation devices (called once at processor startup). + + Only CUDA devices participate in idle-offload; others are ignored. + """ + with self._registry_lock: + self._device_locks = {} + self._order = [] + for device in devices: + if device.type != "cuda": + continue + key = str(TorchDevice.normalize(device)) + if key not in self._device_locks: + self._device_locks[key] = threading.Lock() + self._order.append(key) + + def _get_lock(self, device: torch.device) -> Optional[threading.Lock]: + key = str(TorchDevice.normalize(device)) + with self._registry_lock: + return self._device_locks.get(key) + + def acquire_session(self, device: Optional[torch.device]) -> None: + """Take exclusive use of ``device`` for a native generation session (blocking). + + Waits out any in-flight borrow that won the lock first, guaranteeing the session never runs + concurrently with a borrowed encoder on the same GPU. No-op for non-CUDA / unregistered + devices (e.g. legacy single-device mode). 
+ """ + if device is None or device.type != "cuda": + return + lock = self._get_lock(device) + if lock is not None: + lock.acquire() + + def release_session(self, device: Optional[torch.device]) -> None: + """Release the exclusive use taken by :meth:`acquire_session`.""" + if device is None or device.type != "cuda": + return + lock = self._get_lock(device) + if lock is not None: + lock.release() + + def try_borrow(self, exclude: torch.device) -> Optional[torch.device]: + """Try to take exclusive use of an idle CUDA device other than ``exclude`` (non-blocking). + + Returns the borrowed device (whose lock the caller now holds and must release via + :meth:`release_borrow`), or ``None`` if no other registered device is currently free. + Selection is deterministic (lowest registration order) so repeated borrows reuse the same + GPU and the encoder cached there. + """ + if exclude.type != "cuda": + return None + exclude_key = str(TorchDevice.normalize(exclude)) + with self._registry_lock: + candidates = [(key, self._device_locks[key]) for key in self._order if key != exclude_key] + for key, lock in candidates: + if lock.acquire(blocking=False): + return torch.device(key) + return None + + def release_borrow(self, device: torch.device) -> None: + """Release a device taken by :meth:`try_borrow`.""" + lock = self._get_lock(device) + if lock is not None: + lock.release() + + def reset(self) -> None: + """Clear all registered devices (used by tests).""" + with self._registry_lock: + self._device_locks = {} + self._order = [] + + +# Process-global singleton. +GENERATION_DEVICE_POOL = _GenerationDevicePool() diff --git a/invokeai/frontend/web/openapi.json b/invokeai/frontend/web/openapi.json index 522cd1ce4aa..4ca744496d3 100644 --- a/invokeai/frontend/web/openapi.json +++ b/invokeai/frontend/web/openapi.json @@ -41226,6 +41226,12 @@ "description": "Devices to use for parallel generation. 
`auto` (the default) uses every available GPU, running one generation session per GPU concurrently and distributing jobs fairly across users. Provide an explicit list (e.g. `[cuda:0, cuda:1]`) to use specific devices, or a single-device list (e.g. `[cuda:0]`) to run serially. On systems without a GPU, `auto` resolves to the single `cpu`/`mps` device.
Valid values: `auto`, or a list whose entries are each `cpu`, `cuda`, `mps`, or `cuda:N` (where N is a device number)", "default": "auto" }, + "offload_text_encoders_to_idle_gpus": { + "type": "boolean", + "title": "Offload Text Encoders To Idle Gpus", + "description": "When running on multiple GPUs, load text encoders onto a currently-idle GPU instead of the one running the denoise pipeline. This avoids churning the denoise model in and out of VRAM to make room for the encoder, and lets a cached encoder be reused across generations. Has no effect unless at least two `generation_devices` are configured and a GPU is idle; under full load encoders run on the session's own GPU as before.", + "default": true + }, "precision": { "type": "string", "enum": ["auto", "float16", "bfloat16", "float32"], diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index 75dafa37f34..a18faeaed26 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -16556,6 +16556,12 @@ export type components = { * @default auto */ generation_devices?: "auto" | string[]; + /** + * Offload Text Encoders To Idle Gpus + * @description When running on multiple GPUs, load text encoders onto a currently-idle GPU instead of the one running the denoise pipeline. This avoids churning the denoise model in and out of VRAM to make room for the encoder, and lets a cached encoder be reused across generations. Has no effect unless at least two `generation_devices` are configured and a GPU is idle; under full load encoders run on the session's own GPU as before. + * @default true + */ + offload_text_encoders_to_idle_gpus?: boolean; /** * Precision * @description Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system. 
diff --git a/tests/app/services/session_processor/test_encoder_offload.py b/tests/app/services/session_processor/test_encoder_offload.py new file mode 100644 index 00000000000..972783a9ea9 --- /dev/null +++ b/tests/app/services/session_processor/test_encoder_offload.py @@ -0,0 +1,201 @@ +"""Tests for DefaultSessionRunner._maybe_offload_to_idle_gpu (idle-GPU text-encoder offload). + +These exercise the re-pinning + borrow-lock logic without needing real CUDA: the session device is +a thread-local set via TorchDevice, and the device pool only manipulates locks keyed by device +string. +""" + +import logging +import threading +import time +from collections.abc import Iterator + +import pytest +import torch + +from invokeai.app.services.session_processor.session_processor_default import DefaultSessionRunner +from invokeai.backend.util.device_pool import GENERATION_DEVICE_POOL +from invokeai.backend.util.devices import TorchDevice + + +@pytest.fixture(autouse=True) +def reset_state() -> Iterator[None]: + GENERATION_DEVICE_POOL.reset() + try: + yield + finally: + TorchDevice.clear_session_device() + GENERATION_DEVICE_POOL.reset() + + +class _FakeInvocation: + def __init__(self, idle_gpu_offloadable: bool, type_str: str = "fake_node"): + self.idle_gpu_offloadable = idle_gpu_offloadable + self._type_str = type_str + + def get_type(self) -> str: + return self._type_str + + +class _FakeConfig: + def __init__(self, enabled: bool = True): + self.offload_text_encoders_to_idle_gpus = enabled + + +class _FakeServices: + def __init__(self, enabled: bool = True): + self.configuration = _FakeConfig(enabled) + self.logger = logging.getLogger("test-encoder-offload") + + +def _runner(enabled: bool = True) -> DefaultSessionRunner: + runner = DefaultSessionRunner() + runner._services = _FakeServices(enabled) # type: ignore[assignment] + return runner + + +def test_encoder_node_repins_to_idle_gpu_and_restores(): + runner = _runner() + 
GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + TorchDevice.set_session_device("cuda:0") + + with runner._maybe_offload_to_idle_gpu(_FakeInvocation(True, "flux_text_encoder")): + # Re-pinned to the borrowed idle GPU for the duration of the node... + assert TorchDevice.get_session_device() == torch.device("cuda:1") + # ...and that GPU is locked, so nothing else can borrow it. + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) is None + + # Pin restored and the borrow released. + assert TorchDevice.get_session_device() == torch.device("cuda:0") + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) == torch.device("cuda:1") + + +def test_non_encoder_node_is_not_offloaded(): + runner = _runner() + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + TorchDevice.set_session_device("cuda:0") + + with runner._maybe_offload_to_idle_gpu(_FakeInvocation(False, "denoise_latents")): + assert TorchDevice.get_session_device() == torch.device("cuda:0") + # Idle device was never borrowed. 
+ assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) == torch.device("cuda:1") + + +def test_no_offload_when_target_running_a_session(): + """With both GPUs busy (the other holds a session lock), the encoder stays on its own GPU.""" + runner = _runner() + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + TorchDevice.set_session_device("cuda:0") + GENERATION_DEVICE_POOL.acquire_session(torch.device("cuda:1")) + try: + with runner._maybe_offload_to_idle_gpu(_FakeInvocation(True, "flux_text_encoder")): + assert TorchDevice.get_session_device() == torch.device("cuda:0") + finally: + GENERATION_DEVICE_POOL.release_session(torch.device("cuda:1")) + + +def test_flag_off_disables_offload(): + runner = _runner(enabled=False) + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + TorchDevice.set_session_device("cuda:0") + + with runner._maybe_offload_to_idle_gpu(_FakeInvocation(True, "flux_text_encoder")): + assert TorchDevice.get_session_device() == torch.device("cuda:0") + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) == torch.device("cuda:1") + + +def test_borrow_released_on_exception(): + runner = _runner() + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + TorchDevice.set_session_device("cuda:0") + + with pytest.raises(RuntimeError): + with runner._maybe_offload_to_idle_gpu(_FakeInvocation(True, "flux_text_encoder")): + raise RuntimeError("node failed") + + # The pin is restored and the borrow lock released even though the node raised. 
+ assert TorchDevice.get_session_device() == torch.device("cuda:0") + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) == torch.device("cuda:1") + + +def test_single_gpu_never_offloads(): + runner = _runner() + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0")]) + TorchDevice.set_session_device("cuda:0") + + with runner._maybe_offload_to_idle_gpu(_FakeInvocation(True, "flux_text_encoder")): + assert TorchDevice.get_session_device() == torch.device("cuda:0") + + +def test_concurrent_workers_never_share_a_gpu(): + """Regression for the garbled-image bug: two sessions running at once must never use the same + GPU for an encoder concurrently. Each worker holds its own GPU's session lock (as the processor + does) and runs encoder nodes that may borrow the other GPU through the real offload path; we + assert no GPU is ever occupied by two workers at the same time. + + Before the fix, a startup race let one worker offload its encoder onto the other's GPU while + that GPU also ran a native session — both touching the same cached encoder. This test exercises + that exact interleaving and would flag it as occupancy > 1. + """ + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + + occupancy = {"cuda:0": 0, "cuda:1": 0} + occ_lock = threading.Lock() + violations: list[str] = [] + + def occupy(device_str: str) -> None: + with occ_lock: + occupancy[device_str] += 1 + if occupancy[device_str] > 1: + violations.append(device_str) + + def vacate(device_str: str) -> None: + with occ_lock: + occupancy[device_str] -= 1 + + def worker(own: str) -> None: + runner = _runner() + own_device = torch.device(own) + encoder = _FakeInvocation(True, "flux_text_encoder") + for _ in range(150): + # The processor holds the device's session lock for the whole run. 
+ GENERATION_DEVICE_POOL.acquire_session(own_device) + TorchDevice.set_session_device(own_device) + occupy(own) + try: + with runner._maybe_offload_to_idle_gpu(encoder): + current = str(TorchDevice.get_session_device()) + if current != own: + # The node was re-pinned to a borrowed GPU; it must be exclusively ours. + occupy(current) + try: + time.sleep(0.0002) + finally: + vacate(current) + else: + time.sleep(0.0001) + finally: + vacate(own) + TorchDevice.clear_session_device() + GENERATION_DEVICE_POOL.release_session(own_device) + + threads = [threading.Thread(target=worker, args=(d,)) for d in ("cuda:0", "cuda:1")] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not violations, f"GPU(s) used by two workers at once: {set(violations)}" + + +def test_real_nodes_declare_the_marker_correctly(): + """The @invocation(idle_gpu_offloadable=...) marker is wired through to the class, and is set on + encoder nodes but not on ordinary nodes.""" + from invokeai.app.invocations.compel import CompelInvocation + from invokeai.app.invocations.flux_text_encoder import FluxTextEncoderInvocation + from invokeai.app.invocations.primitives import IntegerInvocation + + assert FluxTextEncoderInvocation.idle_gpu_offloadable is True + assert CompelInvocation.idle_gpu_offloadable is True + # A non-encoder node defaults to False (never re-pinned to a borrowed GPU). 
+ assert IntegerInvocation.idle_gpu_offloadable is False diff --git a/tests/backend/util/test_device_pool.py b/tests/backend/util/test_device_pool.py new file mode 100644 index 00000000000..402e496889b --- /dev/null +++ b/tests/backend/util/test_device_pool.py @@ -0,0 +1,158 @@ +"""Tests for the idle generation-device arbiter used by text-encoder offload.""" + +import threading +import time +from collections.abc import Iterator + +import pytest +import torch + +from invokeai.backend.util.device_pool import GENERATION_DEVICE_POOL + + +@pytest.fixture(autouse=True) +def reset_pool() -> Iterator[None]: + """The arbiter is a process-global singleton; reset it around each test.""" + GENERATION_DEVICE_POOL.reset() + try: + yield + finally: + GENERATION_DEVICE_POOL.reset() + + +def test_borrow_picks_lowest_other_device(): + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) == torch.device("cuda:1") + + +def test_borrow_excludes_requesting_device(): + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:1")) == torch.device("cuda:0") + + +def test_session_lock_blocks_borrow(): + """A device held by a native session cannot be borrowed.""" + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + GENERATION_DEVICE_POOL.acquire_session(torch.device("cuda:1")) + try: + # The only other device is busy with a session -> no borrow. + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) is None + finally: + GENERATION_DEVICE_POOL.release_session(torch.device("cuda:1")) + # Released -> borrowable again. 
+ assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) == torch.device("cuda:1") + + +def test_borrow_blocks_session_until_released(): + """A native session acquire waits for an in-flight borrow on the same device (startup race).""" + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + borrowed = GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) + assert borrowed == torch.device("cuda:1") + + acquired = threading.Event() + + def native_session(): + GENERATION_DEVICE_POOL.acquire_session(torch.device("cuda:1")) + acquired.set() + + t = threading.Thread(target=native_session) + t.start() + # The session must block while the borrow holds cuda:1. + assert not acquired.wait(timeout=0.2) + GENERATION_DEVICE_POOL.release_borrow(torch.device("cuda:1")) + # Now it can proceed. + assert acquired.wait(timeout=2.0) + t.join() + GENERATION_DEVICE_POOL.release_session(torch.device("cuda:1")) + + +def test_two_borrowers_do_not_share_a_device(): + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0"), torch.device("cuda:1")]) + first = GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) + assert first == torch.device("cuda:1") + # A second borrower (also from cuda:0) finds the only other device already taken -> None. 
+ assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) is None + GENERATION_DEVICE_POOL.release_borrow(first) + + +def test_single_device_has_no_borrow_target(): + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cuda:0")]) + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) is None + + +def test_deterministic_lowest_order_selection(): + GENERATION_DEVICE_POOL.set_generation_devices( + [torch.device("cuda:0"), torch.device("cuda:1"), torch.device("cuda:2")] + ) + # cuda:1 and cuda:2 are both free; the lowest-order one (cuda:1) is chosen, and the choice is + # stable across calls (release then re-borrow) so a cached encoder can be reused. + for _ in range(3): + device = GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) + assert device == torch.device("cuda:1") + GENERATION_DEVICE_POOL.release_borrow(device) + + +def test_non_cuda_devices_ignored(): + GENERATION_DEVICE_POOL.set_generation_devices([torch.device("cpu"), torch.device("cuda:0")]) + # Only cuda:0 registered; nothing else to borrow. + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) is None + # A non-cuda requester never borrows, and a non-cuda session acquire is a no-op. + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cpu")) is None + GENERATION_DEVICE_POOL.acquire_session(torch.device("cpu")) # must not raise + GENERATION_DEVICE_POOL.release_session(torch.device("cpu")) + + +def test_empty_pool_returns_none(): + assert GENERATION_DEVICE_POOL.try_borrow(exclude=torch.device("cuda:0")) is None + + +def test_concurrent_sessions_and_borrows_never_overlap_on_a_device(): + """Regression: a GPU must never be used by a native session and a borrowed encoder at the same + time. That overlap is exactly what corrupted a shared encoder and produced garbled images. Here + we stress the arbiter from several threads and assert exclusive use is always honored. 
+ + With only the busy-flag approach used before the fix, a borrow could win against a starting + session and both would "use" the device — which this test would catch as occupancy > 1. + """ + device_strs = ["cuda:0", "cuda:1", "cuda:2"] + GENERATION_DEVICE_POOL.set_generation_devices([torch.device(d) for d in device_strs]) + + occupancy = dict.fromkeys(device_strs, 0) + occ_lock = threading.Lock() + violations: list[str] = [] + + def occupy(device_str: str) -> None: + with occ_lock: + occupancy[device_str] += 1 + if occupancy[device_str] > 1: + violations.append(device_str) + + def vacate(device_str: str) -> None: + with occ_lock: + occupancy[device_str] -= 1 + + def worker(own: str) -> None: + own_device = torch.device(own) + for _ in range(200): + GENERATION_DEVICE_POOL.acquire_session(own_device) + occupy(own) # this thread now exclusively owns `own` (as a native session would) + try: + borrowed = GENERATION_DEVICE_POOL.try_borrow(exclude=own_device) + if borrowed is not None: + occupy(str(borrowed)) + try: + time.sleep(0.0002) # widen the window so any overlap is observed + finally: + vacate(str(borrowed)) + GENERATION_DEVICE_POOL.release_borrow(borrowed) + finally: + vacate(own) + GENERATION_DEVICE_POOL.release_session(own_device) + + threads = [threading.Thread(target=worker, args=(d,)) for d in device_strs] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not violations, f"device(s) used concurrently by a session and a borrow: {set(violations)}"