diff --git a/README.md b/README.md
index e01550476..3224f858b 100644
--- a/README.md
+++ b/README.md
@@ -174,8 +174,7 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-In React Native, we want to keep frame presentation as a manual operation as we plan to provide more advanced rendering options that are React Native specific.  
-This means that when you are ready to present a frame, you need to call `present` on the context.
+In React Native, frame presentation is a manual operation: when you are ready to present a frame, call `present()` on the context after submitting your commands to the queue. This works the same on every runtime: the main JS runtime, the Reanimated UI runtime, and dedicated worklet runtimes (`createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor). `present()` runs synchronously on the calling thread, so the frame is presented from whichever thread did the rendering.
 
 ```tsx
 // draw
@@ -293,10 +292,10 @@ const render = () => {
 
   // ... encode a pass that samples `externalTexture`, then:
   device.queue.submit([encoder.finish()]);
+  context.present();
 
   // Release the surface's access window right after the submit that sampled it.
   externalTexture.destroy();
-  context.present();
 };
 ```
 
diff --git a/apps/example/ios/Podfile.lock b/apps/example/ios/Podfile.lock
index fd5ba968c..8559c8c27 100644
--- a/apps/example/ios/Podfile.lock
+++ b/apps/example/ios/Podfile.lock
@@ -1924,7 +1924,7 @@ PODS:
     - ReactCommon/turbomodule/core
     - SocketRocket
     - Yoga
-  - react-native-wgpu (0.5.12):
+  - react-native-webgpu (0.5.14):
     - boost
     - DoubleConversion
     - fast_float
@@ -2812,7 +2812,7 @@ DEPENDENCIES:
   - React-microtasksnativemodule (from `../../../node_modules/react-native/ReactCommon/react/nativemodule/microtasks`)
   - react-native-safe-area-context (from `../../../node_modules/react-native-safe-area-context`)
   - "react-native-skia (from `../../../node_modules/@shopify/react-native-skia`)"
-  - react-native-wgpu (from `../../../node_modules/react-native-wgpu`)
+  - react-native-webgpu (from `../../../node_modules/react-native-webgpu`)
   - React-NativeModulesApple (from `../../../node_modules/react-native/ReactCommon/react/nativemodule/core/platform/ios`)
   - React-oscompat (from `../../../node_modules/react-native/ReactCommon/oscompat`)
   - React-perflogger (from `../../../node_modules/react-native/ReactCommon/reactperflogger`)
@@ -2948,8 +2948,8 @@ EXTERNAL SOURCES:
     :path: "../../../node_modules/react-native-safe-area-context"
   react-native-skia:
     :path: "../../../node_modules/@shopify/react-native-skia"
-  react-native-wgpu:
-    :path: "../../../node_modules/react-native-wgpu"
+  react-native-webgpu:
+    :path: "../../../node_modules/react-native-webgpu"
   React-NativeModulesApple:
     :path: "../../../node_modules/react-native/ReactCommon/react/nativemodule/core/platform/ios"
   React-oscompat:
@@ -3074,7 +3074,7 @@ SPEC CHECKSUMS:
   React-microtasksnativemodule: 75b6604b667d297292345302cc5bfb6b6aeccc1b
   react-native-safe-area-context: c00143b4823773bba23f2f19f85663ae89ceb460
   react-native-skia: fc73e9bdc46ebb420a98c9c2be29fee80f565e79
-  react-native-wgpu: 274ffec11ee3a082260d9f3d1fb54030a5ca0873
+  react-native-webgpu: ea7239ee381b4937d8e971f648cdcf6b9ff4de7e
   React-NativeModulesApple: 879fbdc5dcff7136abceb7880fe8a2022a1bd7c3
   React-oscompat: 93b5535ea7f7dff46aaee4f78309a70979bdde9d
   React-perflogger: 5536d2df3d18fe0920263466f7b46a56351c0510
diff --git a/apps/example/src/CanvasAPI/CanvasAPI.tsx b/apps/example/src/CanvasAPI/CanvasAPI.tsx
index a6fc2bd32..f5815169d 100644
--- a/apps/example/src/CanvasAPI/CanvasAPI.tsx
+++ b/apps/example/src/CanvasAPI/CanvasAPI.tsx
@@ -89,7 +89,6 @@ export const CanvasAPI = () => {
             passEncoder.end();
 
             device.queue.submit([commandEncoder.finish()]);
-
             context.present();
           })()
         }
diff --git a/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx b/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
index 4027baf63..9fd16463e 100644
--- a/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
+++ b/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
@@ -244,10 +244,10 @@ export const ImportExternalTexture = () => {
 
       pass.end();
       device.queue.submit([encoder.finish()]);
+      context.present();
       // Now that the work sampling it has been submitted, end the external
       // texture's access window so the frame's surface is released promptly.
       externalTex?.destroy();
-      context.present();
       rafRef.current = requestAnimationFrame(render);
     };
     rafRef.current = requestAnimationFrame(render);
diff --git a/apps/example/src/Reanimated/Reanimated.tsx b/apps/example/src/Reanimated/Reanimated.tsx
index 74392d03d..f0af0d59b 100644
--- a/apps/example/src/Reanimated/Reanimated.tsx
+++ b/apps/example/src/Reanimated/Reanimated.tsx
@@ -78,8 +78,10 @@ export const webGPUDemo = (
     passEncoder.end();
 
     device.queue.submit([commandEncoder.finish()]);
-
+    // present() is explicit on every runtime (main JS, Reanimated UI, and
+    // dedicated worklet); it runs synchronously on the calling thread.
     context.present();
+
     if (runAnimation.value) {
       requestAnimationFrame(frame);
     }
diff --git a/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx b/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
index 0e48aa2b7..371fad7c0 100644
--- a/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
+++ b/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
@@ -185,8 +185,6 @@ export function StorageBufferVertices() {
 
     const commandBuffer = encoder.finish();
     device.queue.submit([commandBuffer]);
-    // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    (context as any).present();
   });
 
   return (
diff --git a/apps/example/src/ThreeJS/Backdrop.tsx b/apps/example/src/ThreeJS/Backdrop.tsx
index 64bd63bbe..12189ce60 100644
--- a/apps/example/src/ThreeJS/Backdrop.tsx
+++ b/apps/example/src/ThreeJS/Backdrop.tsx
@@ -150,7 +150,7 @@ export const Backdrop = () => {
       }
 
       renderer.render(scene, camera);
-      context!.present();
+      context.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/Helmet.tsx b/apps/example/src/ThreeJS/Helmet.tsx
index cbf16011e..0dbb8dd91 100644
--- a/apps/example/src/ThreeJS/Helmet.tsx
+++ b/apps/example/src/ThreeJS/Helmet.tsx
@@ -49,7 +49,7 @@ export const Helmet = () => {
     function animate() {
       animateCamera();
       renderer.render(scene, camera);
-      context!.present();
+      context.present();
     }
 
     return () => {
diff --git a/apps/example/src/ThreeJS/InstancedMesh.tsx b/apps/example/src/ThreeJS/InstancedMesh.tsx
index 208c0afbc..42f489f0d 100644
--- a/apps/example/src/ThreeJS/InstancedMesh.tsx
+++ b/apps/example/src/ThreeJS/InstancedMesh.tsx
@@ -59,7 +59,6 @@ export const InstancedMesh = () => {
 
     function animate() {
       render();
-      context!.present();
     }
 
     function render() {
@@ -88,6 +87,7 @@ export const InstancedMesh = () => {
       }
 
       renderer.render(scene, camera);
+      context.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/PostProcessing.tsx b/apps/example/src/ThreeJS/PostProcessing.tsx
index 2186b26b1..1698563c0 100644
--- a/apps/example/src/ThreeJS/PostProcessing.tsx
+++ b/apps/example/src/ThreeJS/PostProcessing.tsx
@@ -72,7 +72,7 @@ export const PostProcessing = () => {
         mixer.update(delta);
       }
       postProcessing.render();
-      context!.present();
+      context.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/components/FiberCanvas.tsx b/apps/example/src/ThreeJS/components/FiberCanvas.tsx
index a72d18b13..8b6eaaf8f 100644
--- a/apps/example/src/ThreeJS/components/FiberCanvas.tsx
+++ b/apps/example/src/ThreeJS/components/FiberCanvas.tsx
@@ -66,7 +66,7 @@ export const FiberCanvas = ({
         const renderFrame = state.gl.render.bind(state.gl);
         state.gl.render = (s: THREE.Scene, c: THREE.Camera) => {
           renderFrame(s, c);
-          context?.present();
+          context.present();
         };
       },
     });
diff --git a/apps/example/src/Triangle/HelloTriangle.tsx b/apps/example/src/Triangle/HelloTriangle.tsx
index 56ec732bc..12bf56653 100644
--- a/apps/example/src/Triangle/HelloTriangle.tsx
+++ b/apps/example/src/Triangle/HelloTriangle.tsx
@@ -77,7 +77,6 @@ export function HelloTriangle() {
       passEncoder.end();
 
       device.queue.submit([commandEncoder.finish()]);
-
       context.present();
     })();
   }, [ref]);
diff --git a/apps/example/src/Triangle/HelloTriangleMSAA.tsx b/apps/example/src/Triangle/HelloTriangleMSAA.tsx
index 481063830..19a692a2f 100644
--- a/apps/example/src/Triangle/HelloTriangleMSAA.tsx
+++ b/apps/example/src/Triangle/HelloTriangleMSAA.tsx
@@ -84,10 +84,10 @@ export function HelloTriangleMSAA() {
         passEncoder.end();
 
         device.queue.submit([commandEncoder.finish()]);
+        context.present();
       }
 
       frame();
-      context.present();
     })();
   }, [ref]);
 
diff --git a/apps/example/src/VisionCamera/VisionCamera.tsx b/apps/example/src/VisionCamera/VisionCamera.tsx
index 8f196f937..25ad6ac39 100644
--- a/apps/example/src/VisionCamera/VisionCamera.tsx
+++ b/apps/example/src/VisionCamera/VisionCamera.tsx
@@ -613,11 +613,13 @@ const CameraView = () => {
           pass.draw(3);
           pass.end();
           device.queue.submit([encoder.finish()]);
+          // Vision Camera frame processors run on a dedicated worklet runtime;
+          // present() is explicit there too and runs on this (calling) thread.
+          context.present();
           // The work sampling it is submitted, so end the external texture's
           // access window now to release the camera frame's surface promptly
           // (don't wait for GC, which would starve the frame buffer pool).
           externalTex.destroy();
-          context.present();
         } finally {
           videoFrame.release();
         }
diff --git a/packages/webgpu/README.md b/packages/webgpu/README.md
index e01550476..3224f858b 100644
--- a/packages/webgpu/README.md
+++ b/packages/webgpu/README.md
@@ -174,8 +174,7 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-In React Native, we want to keep frame presentation as a manual operation as we plan to provide more advanced rendering options that are React Native specific.  
-This means that when you are ready to present a frame, you need to call `present` on the context.
+In React Native, frame presentation is a manual operation: when you are ready to present a frame, call `present()` on the context after submitting your commands to the queue. This works the same on every runtime: the main JS runtime, the Reanimated UI runtime, and dedicated worklet runtimes (`createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor). `present()` runs synchronously on the calling thread, so the frame is presented from whichever thread did the rendering.
 
 ```tsx
 // draw
@@ -293,10 +292,10 @@ const render = () => {
 
   // ... encode a pass that samples `externalTexture`, then:
   device.queue.submit([encoder.finish()]);
+  context.present();
 
   // Release the surface's access window right after the submit that sampled it.
   externalTexture.destroy();
-  context.present();
 };
 ```
 
diff --git a/packages/webgpu/android/CMakeLists.txt b/packages/webgpu/android/CMakeLists.txt
index c2e25f54d..35fc9b50f 100644
--- a/packages/webgpu/android/CMakeLists.txt
+++ b/packages/webgpu/android/CMakeLists.txt
@@ -51,9 +51,10 @@ add_library(${PACKAGE_NAME} SHARED
     ../cpp/jsi/Promise.cpp
     ../cpp/jsi/RuntimeLifecycleMonitor.cpp
     ../cpp/jsi/RuntimeAwareCache.cpp
-    ../cpp/rnwgpu/async/AsyncRunner.cpp
+    ../cpp/rnwgpu/async/RuntimeContext.cpp
     ../cpp/rnwgpu/async/AsyncTaskHandle.cpp
-    ../cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp
+    ../cpp/rnwgpu/async/CallInvokerScheduler.cpp
+    ../cpp/rnwgpu/async/GpuEventLoop.cpp
 )
 
 target_include_directories(
diff --git a/packages/webgpu/apple/WebGPUModule.mm b/packages/webgpu/apple/WebGPUModule.mm
index e637633b0..5d710dd91 100644
--- a/packages/webgpu/apple/WebGPUModule.mm
+++ b/packages/webgpu/apple/WebGPUModule.mm
@@ -78,6 +78,7 @@ - (void)invalidate {
       std::make_shared<rnwgpu::ApplePlatformContext>();
   webgpuManager = std::make_shared<rnwgpu::RNWebGPUManager>(runtime, jsInvoker,
                                                             platformContext);
+
   return @true;
 }
 
diff --git a/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp b/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp
index 56b0b5581..8868f3d8a 100644
--- a/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp
+++ b/packages/webgpu/cpp/rnwgpu/RNWebGPUManager.cpp
@@ -64,7 +64,7 @@ RNWebGPUManager::RNWebGPUManager(
   // Register main runtime for RuntimeAwareCache
   BaseRuntimeAwareCache::setMainJsRuntime(_jsRuntime);
 
-  auto gpu = std::make_shared<GPU>(*_jsRuntime);
+  auto gpu = std::make_shared<GPU>(*_jsRuntime, _jsCallInvoker);
   auto rnWebGPU =
       std::make_shared<RNWebGPU>(gpu, _platformContext, _jsCallInvoker);
   _gpu = gpu->get();
diff --git a/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h b/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
index 110a45d44..db18d7af1 100644
--- a/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
+++ b/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
@@ -7,6 +7,12 @@
 
 #include "webgpu/webgpu_cpp.h"
 
+#ifdef __APPLE__
+namespace dawn::native::metal {
+void WaitForCommandsToBeScheduled(WGPUDevice device);
+} // namespace dawn::native::metal
+#endif
+
 namespace rnwgpu {
 
 struct NativeInfo {
@@ -113,7 +119,23 @@ class SurfaceInfo {
     height = newHeight;
   }
 
-  void present() {
+  // Present the current surface texture. Called synchronously from the thread
+  // that did getCurrentTexture / submit (via GPUCanvasContext::present), so it
+  // preserves Dawn surface thread-affinity. No-op when offscreen / unconfigured
+  // (no surface).
+  void presentFrame() {
+#ifdef __APPLE__
+    // Ensure command buffers are scheduled before presenting. Read the device
+    // under a shared lock, then wait without holding it (the wait can block).
+    wgpu::Device device;
+    {
+      std::shared_lock<std::shared_mutex> lock(_mutex);
+      device = config.device;
+    }
+    if (device) {
+      dawn::native::metal::WaitForCommandsToBeScheduled(device.Get());
+    }
+#endif
     std::unique_lock<std::shared_mutex> lock(_mutex);
     if (surface) {
       surface.Present();
@@ -131,6 +153,12 @@ class SurfaceInfo {
     }
   }
 
+  // True when an on-screen wgpu::Surface is attached (vs offscreen texture).
+  bool hasSurface() {
+    std::shared_lock<std::shared_mutex> lock(_mutex);
+    return surface != nullptr;
+  }
+
   NativeInfo getNativeInfo() {
     std::shared_lock<std::shared_mutex> lock(_mutex);
     return {.nativeSurface = nativeSurface, .width = width, .height = height};
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPU.cpp b/packages/webgpu/cpp/rnwgpu/api/GPU.cpp
index 11530f4da..36434e0ee 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPU.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPU.cpp
@@ -9,11 +9,14 @@
 
 #include "Convertors.h"
 #include "JSIConverter.h"
-#include "rnwgpu/async/JSIMicrotaskDispatcher.h"
+#include "rnwgpu/async/CallInvokerScheduler.h"
+#include "rnwgpu/async/GpuEventLoop.h"
 
 namespace rnwgpu {
 
-GPU::GPU(jsi::Runtime &runtime) : NativeObject(CLASS_NAME) {
+GPU::GPU(jsi::Runtime &runtime,
+         std::shared_ptr<facebook::react::CallInvoker> callInvoker)
+    : NativeObject(CLASS_NAME) {
   static const auto kTimedWaitAny = wgpu::InstanceFeatureName::TimedWaitAny;
   wgpu::InstanceDescriptor instanceDesc{.requiredFeatureCount = 1,
                                         .requiredFeatures = &kTimedWaitAny};
@@ -49,8 +52,11 @@ GPU::GPU(jsi::Runtime &runtime) : NativeObject(CLASS_NAME) {
 
   _instance = wgpu::CreateInstance(&instanceDesc);
 
-  auto dispatcher = std::make_shared<async::JSIMicrotaskDispatcher>(runtime);
-  _async = async::AsyncRunner::getOrCreate(runtime, _instance, dispatcher);
+  auto scheduler =
+      std::make_shared<async::CallInvokerScheduler>(std::move(callInvoker));
+  auto eventLoop = std::make_shared<async::GpuEventLoop>(_instance);
+  _async = async::RuntimeContext::getOrCreate(runtime, std::move(scheduler),
+                                              std::move(eventLoop));
 }
 
 async::AsyncTaskHandle GPU::requestAdapter(
@@ -68,10 +74,11 @@ async::AsyncTaskHandle GPU::requestAdapter(
   aOptions.backendType = kDefaultBackendType;
   return _async->postTask(
       [this, aOptions](const async::AsyncTaskHandle::ResolveFunction &resolve,
-                       const async::AsyncTaskHandle::RejectFunction &reject) {
-        _instance.RequestAdapter(
-            &aOptions, wgpu::CallbackMode::AllowProcessEvents,
-            [asyncRunner = _async, resolve,
+                       const async::AsyncTaskHandle::RejectFunction &reject)
+          -> wgpu::Future {
+        return _instance.RequestAdapter(
+            &aOptions, wgpu::CallbackMode::WaitAnyOnly,
+            [context = _async, resolve,
              reject](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter,
                      wgpu::StringView message) {
               if (message.length) {
@@ -79,8 +86,8 @@ async::AsyncTaskHandle GPU::requestAdapter(
               }
 
               if (status == wgpu::RequestAdapterStatus::Success && adapter) {
-                auto adapterHost = std::make_shared<GPUAdapter>(
-                    std::move(adapter), asyncRunner);
+                auto adapterHost =
+                    std::make_shared<GPUAdapter>(std::move(adapter), context);
                 auto result =
                     std::variant<std::nullptr_t, std::shared_ptr<GPUAdapter>>(
                         adapterHost);
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPU.h b/packages/webgpu/cpp/rnwgpu/api/GPU.h
index f6bb4ede3..e7dc15caf 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPU.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPU.h
@@ -9,8 +9,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -19,6 +19,10 @@
 
 #include <webgpu/webgpu.h>
 
+namespace facebook::react {
+class CallInvoker;
+} // namespace facebook::react
+
 namespace rnwgpu {
 
 namespace jsi = facebook::jsi;
@@ -27,7 +31,8 @@ class GPU : public NativeObject<GPU> {
 public:
   static constexpr const char *CLASS_NAME = "GPU";
 
-  explicit GPU(jsi::Runtime &runtime);
+  GPU(jsi::Runtime &runtime,
+      std::shared_ptr<facebook::react::CallInvoker> callInvoker);
 
 public:
   std::string getBrand() { return CLASS_NAME; }
@@ -48,11 +53,10 @@ class GPU : public NativeObject<GPU> {
   }
 
   inline const wgpu::Instance get() { return _instance; }
-  inline std::shared_ptr<async::AsyncRunner> getAsyncRunner() { return _async; }
 
 private:
   wgpu::Instance _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
 };
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp
index 085b582dc..130b00622 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.cpp
@@ -134,7 +134,8 @@ async::AsyncTaskHandle GPUAdapter::requestDevice(
       [this, aDescriptor, descriptor, label = std::move(label),
        deviceLostBinding,
        creationRuntime](const async::AsyncTaskHandle::ResolveFunction &resolve,
-                        const async::AsyncTaskHandle::RejectFunction &reject) {
+                        const async::AsyncTaskHandle::RejectFunction &reject)
+          -> wgpu::Future {
         // Build a local mutable copy so we can chain Dawn's device toggles.
         // The toggle name strings are owned by `descriptor` (captured above),
         // and the const char* / DawnTogglesDescriptor locals live for the
@@ -162,12 +163,11 @@ async::AsyncTaskHandle GPUAdapter::requestDevice(
           }
           deviceDesc.nextInChain = &toggles;
         }
-        _instance.RequestDevice(
-            &deviceDesc, wgpu::CallbackMode::AllowProcessEvents,
-            [asyncRunner = _async, resolve, reject, label, creationRuntime,
+        return _instance.RequestDevice(
+            &deviceDesc, wgpu::CallbackMode::WaitAnyOnly,
+            [context = _async, resolve, reject, label, creationRuntime,
              deviceLostBinding](wgpu::RequestDeviceStatus status,
-                                wgpu::Device device,
-                                wgpu::StringView message) {
+                                wgpu::Device device, wgpu::StringView message) {
               if (message.length) {
                 fprintf(stderr, "%s", message.data);
               }
@@ -191,14 +191,12 @@ async::AsyncTaskHandle GPUAdapter::requestDevice(
                     case wgpu::LoggingType::Warning:
                       logLevel = "Warning";
                       Logger::warnToJavascriptConsole(
-                          *creationRuntime,
-                          std::string(msg.data, msg.length));
+                          *creationRuntime, std::string(msg.data, msg.length));
                       break;
                     case wgpu::LoggingType::Error:
                       logLevel = "Error";
                       Logger::errorToJavascriptConsole(
-                          *creationRuntime,
-                          std::string(msg.data, msg.length));
+                          *creationRuntime, std::string(msg.data, msg.length));
                       break;
                     case wgpu::LoggingType::Verbose:
                       logLevel = "Verbose";
@@ -216,7 +214,7 @@ async::AsyncTaskHandle GPUAdapter::requestDevice(
                   creationRuntime);
 
               auto deviceHost = std::make_shared<GPUDevice>(std::move(device),
-                                                            asyncRunner, label);
+                                                            context, label);
               *deviceLostBinding = deviceHost;
 
               // Register the device in the static registry so the uncaptured
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h
index 66acdc2f7..7f399f0a7 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUAdapter.h
@@ -8,8 +8,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -27,7 +27,7 @@ class GPUAdapter : public NativeObject<GPUAdapter> {
   static constexpr const char *CLASS_NAME = "GPUAdapter";
 
   explicit GPUAdapter(wgpu::Adapter instance,
-                      std::shared_ptr<async::AsyncRunner> async)
+                      std::shared_ptr<async::RuntimeContext> async)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async) {}
 
 public:
@@ -53,7 +53,7 @@ class GPUAdapter : public NativeObject<GPUAdapter> {
 
 private:
   wgpu::Adapter _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
 };
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp
index 4d6012621..a53d97940 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.cpp
@@ -54,31 +54,31 @@ async::AsyncTaskHandle GPUBuffer::mapAsync(uint64_t modeIn,
   return _async->postTask(
       [bufferHandle, mode, resolvedOffset,
        rangeSize](const async::AsyncTaskHandle::ResolveFunction &resolve,
-                  const async::AsyncTaskHandle::RejectFunction &reject) {
-        bufferHandle.MapAsync(mode, resolvedOffset, rangeSize,
-                              wgpu::CallbackMode::AllowProcessEvents,
-                              [resolve, reject](wgpu::MapAsyncStatus status,
-                                                wgpu::StringView message) {
-                                switch (status) {
-                                case wgpu::MapAsyncStatus::Success:
-                                  resolve(nullptr);
-                                  break;
-                                case wgpu::MapAsyncStatus::CallbackCancelled:
-                                  reject("MapAsyncStatus::CallbackCancelled");
-                                  break;
-                                case wgpu::MapAsyncStatus::Error:
-                                  reject("MapAsyncStatus::Error");
-                                  break;
-                                case wgpu::MapAsyncStatus::Aborted:
-                                  reject("MapAsyncStatus::Aborted");
-                                  break;
-                                default:
-                                  reject(
-                                      "MapAsyncStatus: " +
-                                      std::to_string(static_cast<int>(status)));
-                                  break;
-                                }
-                              });
+                  const async::AsyncTaskHandle::RejectFunction &reject)
+          -> wgpu::Future {
+        return bufferHandle.MapAsync(
+            mode, resolvedOffset, rangeSize, wgpu::CallbackMode::WaitAnyOnly,
+            [resolve, reject](wgpu::MapAsyncStatus status,
+                              wgpu::StringView message) {
+              switch (status) {
+              case wgpu::MapAsyncStatus::Success:
+                resolve(nullptr);
+                break;
+              case wgpu::MapAsyncStatus::CallbackCancelled:
+                reject("MapAsyncStatus::CallbackCancelled");
+                break;
+              case wgpu::MapAsyncStatus::Error:
+                reject("MapAsyncStatus::Error");
+                break;
+              case wgpu::MapAsyncStatus::Aborted:
+                reject("MapAsyncStatus::Aborted");
+                break;
+              default:
+                reject("MapAsyncStatus: " +
+                       std::to_string(static_cast<int>(status)));
+                break;
+              }
+            });
       });
 }
 
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h
index edfc8e41b..036b5af4b 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUBuffer.h
@@ -9,8 +9,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -25,7 +25,7 @@ class GPUBuffer : public NativeObject<GPUBuffer> {
   static constexpr const char *CLASS_NAME = "GPUBuffer";
 
   explicit GPUBuffer(wgpu::Buffer instance,
-                     std::shared_ptr<async::AsyncRunner> async,
+                     std::shared_ptr<async::RuntimeContext> async,
                      std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -71,7 +71,7 @@ class GPUBuffer : public NativeObject<GPUBuffer> {
 
 private:
   wgpu::Buffer _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
   struct Mapping {
     uint64_t start;
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
index d75eb7b0f..4da91d441 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
@@ -3,14 +3,6 @@
 #include "RNWebGPUManager.h"
 #include <memory>
 
-#ifdef __APPLE__
-namespace dawn::native::metal {
-
-void WaitForCommandsToBeScheduled(WGPUDevice device);
-
-}
-#endif
-
 namespace rnwgpu {
 
 void GPUCanvasContext::configure(
@@ -47,21 +39,26 @@ std::shared_ptr<GPUTexture> GPUCanvasContext::getCurrentTexture() {
   if (sizeHasChanged) {
     _surfaceInfo->reconfigure(width, height);
   }
+
   auto texture = _surfaceInfo->getCurrentTexture();
+
+  auto size = _surfaceInfo->getSize();
+  _canvas->setClientWidth(size.width);
+  _canvas->setClientHeight(size.height);
+
   // Pass reportsMemoryPressure=false to avoid triggering spurious Hermes GC
   // cycles every frame since the canvas texture doesn't own the buffer.
   return std::make_shared<GPUTexture>(texture, "", false);
 }
 
 void GPUCanvasContext::present() {
-#ifdef __APPLE__
-  dawn::native::metal::WaitForCommandsToBeScheduled(
-      _surfaceInfo->getDevice().Get());
-#endif
-  auto size = _surfaceInfo->getSize();
-  _canvas->setClientWidth(size.width);
-  _canvas->setClientHeight(size.height);
-  _surfaceInfo->present();
+  // Present runs synchronously on the calling thread (the one that did
+  // getCurrentTexture / submit), preserving Dawn surface thread-affinity.
+  // Required on every runtime (main JS, Reanimated UI, dedicated worklet);
+  // offscreen surfaces have no wgpu::Surface so they no-op.
+  if (_surfaceInfo->hasSurface()) {
+    _surfaceInfo->presentFrame();
+  }
 }
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
index 4b97a7887..a5efc3c6a 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
@@ -55,6 +55,9 @@ class GPUCanvasContext : public NativeObject<GPUCanvasContext> {
   void configure(std::shared_ptr<GPUCanvasConfiguration> configuration);
   void unconfigure();
   std::shared_ptr<GPUTexture> getCurrentTexture();
+  // Present is explicit on every runtime (main JS, Reanimated UI, and dedicated
+  // worklet runtimes). It runs synchronously on the calling thread, preserving
+  // Dawn surface thread-affinity; offscreen surfaces no-op.
   void present();
 
 private:
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp
index 58df56f89..ae01b0eab 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.cpp
@@ -19,23 +19,33 @@ namespace rnwgpu {
 
 void GPUDevice::notifyDeviceLost(wgpu::DeviceLostReason reason,
                                  std::string message) {
-  if (_lostSettled) {
-    return;
-  }
+  std::optional<async::AsyncTaskHandle::ResolveFunction> resolveToCall;
+  std::shared_ptr<GPUDeviceLostInfo> info;
+  {
+    std::lock_guard<std::mutex> lock(_lostMutex);
+    if (_lostSettled) {
+      return;
+    }
 
-  _lostSettled = true;
-  _lostInfo = std::make_shared<GPUDeviceLostInfo>(reason, std::move(message));
+    _lostSettled = true;
+    _lostInfo = std::make_shared<GPUDeviceLostInfo>(reason, std::move(message));
+    info = _lostInfo;
+
+    if (_lostResolve.has_value()) {
+      resolveToCall = std::move(*_lostResolve);
+      _lostResolve.reset();
+    }
 
-  if (_lostResolve.has_value()) {
-    auto resolve = std::move(*_lostResolve);
-    _lostResolve.reset();
-    resolve([info = _lostInfo](jsi::Runtime &runtime) mutable {
+    _lostHandle.reset();
+  }
+
+  // Settle outside the lock: resolve() only enqueues onto the JS thread.
+  if (resolveToCall.has_value()) {
+    (*resolveToCall)([info](jsi::Runtime &runtime) mutable {
       return JSIConverter<std::shared_ptr<GPUDeviceLostInfo>>::toJSI(runtime,
                                                                      info);
     });
   }
-
-  _lostHandle.reset();
 }
 
 void GPUDevice::forceLossForTesting() {
@@ -353,10 +363,10 @@ async::AsyncTaskHandle GPUDevice::createComputePipelineAsync(
                               const async::AsyncTaskHandle::ResolveFunction
                                   &resolve,
                               const async::AsyncTaskHandle::RejectFunction
-                                  &reject) {
+                                  &reject) -> wgpu::Future {
     (void)descriptor;
-    device.CreateComputePipelineAsync(
-        &desc, wgpu::CallbackMode::AllowProcessEvents,
+    return device.CreateComputePipelineAsync(
+        &desc, wgpu::CallbackMode::WaitAnyOnly,
         [pipelineHolder, resolve,
          reject](wgpu::CreatePipelineAsyncStatus status,
                  wgpu::ComputePipeline pipeline, wgpu::StringView msg) {
@@ -367,9 +377,9 @@ async::AsyncTaskHandle GPUDevice::createComputePipelineAsync(
                   runtime, pipelineHolder);
             });
           } else {
-            std::string error =
-                msg.length ? std::string(msg.data, msg.length)
-                           : "Failed to create compute pipeline";
+            std::string error = msg.length
+                                    ? std::string(msg.data, msg.length)
+                                    : "Failed to create compute pipeline";
             reject(std::move(error));
           }
         });
@@ -395,10 +405,10 @@ async::AsyncTaskHandle GPUDevice::createRenderPipelineAsync(
                               const async::AsyncTaskHandle::ResolveFunction
                                   &resolve,
                               const async::AsyncTaskHandle::RejectFunction
-                                  &reject) {
+                                  &reject) -> wgpu::Future {
     (void)descriptor;
-    device.CreateRenderPipelineAsync(
-        &desc, wgpu::CallbackMode::AllowProcessEvents,
+    return device.CreateRenderPipelineAsync(
+        &desc, wgpu::CallbackMode::WaitAnyOnly,
         [pipelineHolder, resolve,
          reject](wgpu::CreatePipelineAsyncStatus status,
                  wgpu::RenderPipeline pipeline, wgpu::StringView msg) {
@@ -409,9 +419,8 @@ async::AsyncTaskHandle GPUDevice::createRenderPipelineAsync(
                   runtime, pipelineHolder);
             });
           } else {
-            std::string error =
-                msg.length ? std::string(msg.data, msg.length)
-                           : "Failed to create render pipeline";
+            std::string error = msg.length ? std::string(msg.data, msg.length)
+                                           : "Failed to create render pipeline";
             reject(std::move(error));
           }
         });
@@ -428,9 +437,9 @@ async::AsyncTaskHandle GPUDevice::popErrorScope() {
   return _async->postTask([device](const async::AsyncTaskHandle::ResolveFunction
                                        &resolve,
                                    const async::AsyncTaskHandle::RejectFunction
-                                       &reject) {
-    device.PopErrorScope(
-        wgpu::CallbackMode::AllowProcessEvents,
+                                       &reject) -> wgpu::Future {
+    return device.PopErrorScope(
+        wgpu::CallbackMode::WaitAnyOnly,
         [resolve, reject](wgpu::PopErrorScopeStatus status,
                           wgpu::ErrorType type, wgpu::StringView message) {
           if (status == wgpu::PopErrorScopeStatus::Error ||
@@ -498,6 +507,11 @@ std::unordered_set<std::string> GPUDevice::getFeatures() {
 }
 
 async::AsyncTaskHandle GPUDevice::getLost() {
+  // Held across the whole body: the postTask callback below runs synchronously
+  // on this (JS) thread and touches the same _lost* fields, so it must not
+  // re-lock. notifyDeviceLost() takes the same lock from its (possibly worker)
+  // thread.
+  std::lock_guard<std::mutex> lock(_lostMutex);
   if (_lostHandle.has_value()) {
     return *_lostHandle;
   }
@@ -506,29 +520,33 @@ async::AsyncTaskHandle GPUDevice::getLost() {
     return _async->postTask(
         [info = _lostInfo](
             const async::AsyncTaskHandle::ResolveFunction &resolve,
-            const async::AsyncTaskHandle::RejectFunction & /*reject*/) {
+            const async::AsyncTaskHandle::RejectFunction & /*reject*/)
+            -> wgpu::Future {
           resolve([info](jsi::Runtime &runtime) mutable {
             return JSIConverter<std::shared_ptr<GPUDeviceLostInfo>>::toJSI(
                 runtime, info);
           });
-        },
-        false);
+          // No Dawn event to wait on: resolved synchronously.
+          return wgpu::Future{};
+        });
   }
 
   auto handle = _async->postTask(
       [this](const async::AsyncTaskHandle::ResolveFunction &resolve,
-             const async::AsyncTaskHandle::RejectFunction & /*reject*/) {
+             const async::AsyncTaskHandle::RejectFunction & /*reject*/)
+          -> wgpu::Future {
         if (_lostSettled && _lostInfo) {
           resolve([info = _lostInfo](jsi::Runtime &runtime) mutable {
             return JSIConverter<std::shared_ptr<GPUDeviceLostInfo>>::toJSI(
                 runtime, info);
           });
-          return;
+          return wgpu::Future{};
         }
 
+        // Resolved later from notifyDeviceLost(); no Dawn event to wait on.
         _lostResolve = resolve;
-      },
-      false);
+        return wgpu::Future{};
+      });
 
   _lostHandle = handle;
   return handle;
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h
index ed5ff98ef..80b02bfd8 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUDevice.h
@@ -15,8 +15,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -50,6 +50,8 @@
 #include "GPUSharedTextureMemoryDescriptor.h"
 #include "GPUShaderModule.h"
 #include "GPUShaderModuleDescriptor.h"
+#include "GPUSharedTextureMemory.h"
+#include "GPUSharedTextureMemoryDescriptor.h"
 #include "GPUSupportedLimits.h"
 #include "GPUTexture.h"
 #include "GPUTextureDescriptor.h"
@@ -64,7 +66,7 @@ class GPUDevice : public NativeObject<GPUDevice> {
   static constexpr const char *CLASS_NAME = "GPUDevice";
 
   explicit GPUDevice(wgpu::Device instance,
-                     std::shared_ptr<async::AsyncRunner> async,
+                     std::shared_ptr<async::RuntimeContext> async,
                      std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -254,8 +256,12 @@ class GPUDevice : public NativeObject<GPUDevice> {
   friend class GPUAdapter;
 
   wgpu::Device _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
+  // Guards the device-lost state below. notifyDeviceLost() may run on a
+  // GpuEventLoop worker thread (the device-lost callback is Spontaneous), while
+  // getLost() runs on the JS thread, so these fields need synchronization.
+  std::mutex _lostMutex;
   std::optional<async::AsyncTaskHandle> _lostHandle;
   std::shared_ptr<GPUDeviceLostInfo> _lostInfo;
   bool _lostSettled = false;
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUQueue.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUQueue.cpp
index d3c0d65af..9b3365d69 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUQueue.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUQueue.cpp
@@ -82,9 +82,10 @@ async::AsyncTaskHandle GPUQueue::onSubmittedWorkDone() {
   auto queue = _instance;
   return _async->postTask(
       [queue](const async::AsyncTaskHandle::ResolveFunction &resolve,
-              const async::AsyncTaskHandle::RejectFunction &reject) {
-        queue.OnSubmittedWorkDone(
-            wgpu::CallbackMode::AllowProcessEvents,
+              const async::AsyncTaskHandle::RejectFunction &reject)
+          -> wgpu::Future {
+        return queue.OnSubmittedWorkDone(
+            wgpu::CallbackMode::WaitAnyOnly,
             [resolve, reject](wgpu::QueueWorkDoneStatus status,
                               wgpu::StringView message) {
               if (status == wgpu::QueueWorkDoneStatus::Success) {
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h b/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h
index be824e781..f322392b7 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUQueue.h
@@ -8,8 +8,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -28,7 +28,7 @@ class GPUQueue : public NativeObject<GPUQueue> {
   static constexpr const char *CLASS_NAME = "GPUQueue";
 
   explicit GPUQueue(wgpu::Queue instance,
-                    std::shared_ptr<async::AsyncRunner> async,
+                    std::shared_ptr<async::RuntimeContext> async,
                     std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -74,7 +74,7 @@ class GPUQueue : public NativeObject<GPUQueue> {
 
 private:
   wgpu::Queue _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
 };
 
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.cpp
index 113dc407c..5ac6d3634 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.cpp
@@ -12,10 +12,11 @@ async::AsyncTaskHandle GPUShaderModule::getCompilationInfo() {
 
   return _async->postTask(
       [module](const async::AsyncTaskHandle::ResolveFunction &resolve,
-               const async::AsyncTaskHandle::RejectFunction &reject) {
+               const async::AsyncTaskHandle::RejectFunction &reject)
+          -> wgpu::Future {
         auto result = std::make_shared<GPUCompilationInfo>();
-        module.GetCompilationInfo(
-            wgpu::CallbackMode::AllowProcessEvents,
+        return module.GetCompilationInfo(
+            wgpu::CallbackMode::WaitAnyOnly,
             [result, resolve,
              reject](wgpu::CompilationInfoRequestStatus status,
                      const wgpu::CompilationInfo *compilationInfo) {
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h b/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h
index ab8561090..0e59edf01 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUShaderModule.h
@@ -7,8 +7,8 @@
 
 #include "NativeObject.h"
 
-#include "rnwgpu/async/AsyncRunner.h"
 #include "rnwgpu/async/AsyncTaskHandle.h"
+#include "rnwgpu/async/RuntimeContext.h"
 
 #include "webgpu/webgpu_cpp.h"
 
@@ -23,7 +23,7 @@ class GPUShaderModule : public NativeObject<GPUShaderModule> {
   static constexpr const char *CLASS_NAME = "GPUShaderModule";
 
   explicit GPUShaderModule(wgpu::ShaderModule instance,
-                           std::shared_ptr<async::AsyncRunner> async,
+                           std::shared_ptr<async::RuntimeContext> async,
                            std::string label)
       : NativeObject(CLASS_NAME), _instance(instance), _async(async),
         _label(label) {}
@@ -59,7 +59,7 @@ class GPUShaderModule : public NativeObject<GPUShaderModule> {
 
 private:
   wgpu::ShaderModule _instance;
-  std::shared_ptr<async::AsyncRunner> _async;
+  std::shared_ptr<async::RuntimeContext> _async;
   std::string _label;
 };
 
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncDispatcher.h b/packages/webgpu/cpp/rnwgpu/async/AsyncDispatcher.h
deleted file mode 100644
index 0ec176824..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncDispatcher.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-#include <functional>
-#include <memory>
-
-#include <jsi/jsi.h>
-
-namespace rnwgpu::async {
-
-namespace jsi = facebook::jsi;
-
-/**
- * Abstract dispatcher used by the AsyncRunner to enqueue work back onto the
- * JavaScript thread.
- */
-class AsyncDispatcher {
-public:
-  using Work = std::function<void(jsi::Runtime &)>;
-
-  virtual ~AsyncDispatcher() = default;
-
-  /**
-   * Enqueue a unit of work that will be executed on the JavaScript thread.
-   */
-  virtual void post(Work work) = 0;
-};
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.cpp b/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.cpp
deleted file mode 100644
index 94bbae230..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-#include "AsyncRunner.h"
-
-#include <chrono>
-#include <stdexcept>
-#include <utility>
-
-#include "AsyncTaskHandle.h"
-#include "WGPULogger.h"
-
-namespace rnwgpu::async {
-
-namespace {
-struct RuntimeData {
-  std::shared_ptr<AsyncRunner> runner;
-};
-constexpr const char *TAG = "AsyncRunner";
-} // namespace
-
-AsyncRunner::AsyncRunner(wgpu::Instance instance,
-                         std::shared_ptr<AsyncDispatcher> dispatcher)
-    : _instance(std::move(instance)), _dispatcher(std::move(dispatcher)),
-      _pendingTasks(0), _pumpTasks(0), _tickScheduled(false),
-      _lastTickTimeNs(0) {
-  if (!_dispatcher) {
-    throw std::runtime_error("AsyncRunner requires a valid dispatcher.");
-  }
-  Logger::logToConsole("[%s] Created runner (dispatcher=%p)", TAG,
-                       _dispatcher.get());
-}
-
-std::shared_ptr<AsyncRunner> AsyncRunner::get(jsi::Runtime &runtime) {
-  auto data = runtime.getRuntimeData(runtimeDataUUID());
-  if (!data) {
-    return nullptr;
-  }
-  auto stored = std::static_pointer_cast<RuntimeData>(data);
-  return stored->runner;
-}
-
-std::shared_ptr<AsyncRunner>
-AsyncRunner::getOrCreate(jsi::Runtime &runtime, wgpu::Instance instance,
-                         std::shared_ptr<AsyncDispatcher> dispatcher) {
-  auto existing = get(runtime);
-  if (existing) {
-    return existing;
-  }
-
-  auto runner =
-      std::make_shared<AsyncRunner>(std::move(instance), std::move(dispatcher));
-  auto data = std::make_shared<RuntimeData>();
-  data->runner = runner;
-  runtime.setRuntimeData(runtimeDataUUID(), data);
-  return runner;
-}
-
-AsyncTaskHandle AsyncRunner::postTask(const TaskCallback &callback,
-                                      bool keepPumping) {
-  auto handle = AsyncTaskHandle::create(shared_from_this(), keepPumping);
-  if (!handle.valid()) {
-    throw std::runtime_error("Failed to create AsyncTaskHandle.");
-  }
-
-  _pendingTasks.fetch_add(1, std::memory_order_acq_rel);
-  if (keepPumping) {
-    _pumpTasks.fetch_add(1, std::memory_order_acq_rel);
-  }
-  requestTick();
-
-  Logger::logToConsole(
-      "[%s] postTask (keepPumping=%s, pending=%zu, pumping=%zu)", TAG,
-      keepPumping ? "true" : "false",
-      _pendingTasks.load(std::memory_order_acquire),
-      _pumpTasks.load(std::memory_order_acquire));
-
-  auto resolve = handle.createResolveFunction();
-  auto reject = handle.createRejectFunction();
-
-  try {
-    callback(resolve, reject);
-  } catch (const std::exception &exception) {
-    reject(exception.what());
-  } catch (...) {
-    reject("Unknown native error in AsyncRunner::postTask.");
-  }
-
-  return handle;
-}
-
-void AsyncRunner::requestTick() {
-  bool expected = false;
-  if (!_tickScheduled.compare_exchange_strong(expected, true,
-                                              std::memory_order_acq_rel)) {
-    return;
-  }
-
-  auto self = shared_from_this();
-  _dispatcher->post([self](jsi::Runtime &runtime) {
-    auto tickCallback = jsi::Function::createFromHostFunction(
-        runtime, jsi::PropNameID::forAscii(runtime, "AsyncRunnerTick"), 0,
-        [self](jsi::Runtime &runtime, const jsi::Value & /*thisValue*/,
-               const jsi::Value * /*args*/, size_t /*count*/) -> jsi::Value {
-          self->tick(runtime);
-          return jsi::Value::undefined();
-        });
-
-#if defined(ANDROID) || defined(__ANDROID__)
-    auto global = runtime.global();
-    auto setImmediateValue = global.getProperty(runtime, "setImmediate");
-    constexpr auto kMinTickInterval = std::chrono::milliseconds(4);
-    const int64_t nowNs =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(
-            std::chrono::steady_clock::now().time_since_epoch())
-            .count();
-    const int64_t lastNs =
-        self->_lastTickTimeNs.load(std::memory_order_acquire);
-    int delayMs = 0;
-    if (lastNs > 0) {
-      const int64_t elapsedNs = nowNs - lastNs;
-      const int64_t minIntervalNs = kMinTickInterval.count() * 1000000LL;
-      if (elapsedNs < minIntervalNs) {
-        const int64_t remainingNs = minIntervalNs - elapsedNs;
-        delayMs = static_cast<int>((remainingNs + 999999) / 1000000);
-      }
-    }
-
-    auto tryScheduleTimeout = [&](int ms) {
-      auto setTimeoutValue = global.getProperty(runtime, "setTimeout");
-      if (!setTimeoutValue.isObject()) {
-        return false;
-      }
-      auto setTimeoutObj = setTimeoutValue.asObject(runtime);
-      if (!setTimeoutObj.isFunction(runtime)) {
-        return false;
-      }
-      Logger::logToConsole("[%s] requestTick scheduled via setTimeout(%d)", TAG,
-                           ms);
-      auto setTimeoutFn = setTimeoutObj.asFunction(runtime);
-      jsi::Value callbackArg(runtime, tickCallback);
-      jsi::Value delayArg(static_cast<double>(ms));
-      setTimeoutFn.call(runtime, callbackArg, delayArg);
-      return true;
-    };
-
-    if (delayMs > 0) {
-      if (tryScheduleTimeout(delayMs)) {
-        return;
-      }
-      // If setTimeout unavailable fall through to immediate scheduling.
-    }
-
-    if (setImmediateValue.isObject()) {
-      auto setImmediateObj = setImmediateValue.asObject(runtime);
-      if (setImmediateObj.isFunction(runtime)) {
-        Logger::logToConsole("[%s] requestTick scheduled via setImmediate",
-                             TAG);
-        auto setImmediateFn = setImmediateObj.asFunction(runtime);
-        jsi::Value callbackArg(runtime, tickCallback);
-        setImmediateFn.call(runtime, callbackArg);
-        return;
-      }
-    }
-
-    int timeoutDelayMs = delayMs > 0 ? delayMs : 0;
-    if (tryScheduleTimeout(timeoutDelayMs)) {
-      return;
-    }
-
-    Logger::logToConsole("[%s] requestTick scheduled via microtask fallback",
-                         TAG);
-    runtime.queueMicrotask(std::move(tickCallback));
-#else
-    Logger::logToConsole("[%s] requestTick scheduled microtask (non-Android)",
-                         TAG);
-    runtime.queueMicrotask(std::move(tickCallback));
-#endif
-  });
-}
-
-void AsyncRunner::tick(jsi::Runtime & /*runtime*/) {
-  _tickScheduled.store(false, std::memory_order_release);
-  _instance.ProcessEvents();
-  const auto nowNs = std::chrono::duration_cast<std::chrono::nanoseconds>(
-                         std::chrono::steady_clock::now().time_since_epoch())
-                         .count();
-  _lastTickTimeNs.store(nowNs, std::memory_order_release);
-  Logger::logToConsole("[%s] tick processed events (pending=%zu, pumping=%zu)",
-                       TAG, _pendingTasks.load(std::memory_order_acquire),
-                       _pumpTasks.load(std::memory_order_acquire));
-  if (_pumpTasks.load(std::memory_order_acquire) > 0) {
-    requestTick();
-  }
-}
-
-void AsyncRunner::onTaskSettled(bool keepPumping) {
-  _pendingTasks.fetch_sub(1, std::memory_order_acq_rel);
-  if (keepPumping) {
-    _pumpTasks.fetch_sub(1, std::memory_order_acq_rel);
-  }
-  Logger::logToConsole(
-      "[%s] onTaskSettled (keepPumping=%s, pending=%zu, pumping=%zu)", TAG,
-      keepPumping ? "true" : "false",
-      _pendingTasks.load(std::memory_order_acquire),
-      _pumpTasks.load(std::memory_order_acquire));
-}
-
-std::shared_ptr<AsyncDispatcher> AsyncRunner::dispatcher() const {
-  return _dispatcher;
-}
-
-jsi::UUID AsyncRunner::runtimeDataUUID() {
-  static const auto uuid = jsi::UUID();
-  return uuid;
-}
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.h b/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.h
deleted file mode 100644
index f81101d10..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncRunner.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include <atomic>
-#include <cstdint>
-#include <functional>
-#include <memory>
-
-#include <jsi/jsi.h>
-
-#include "AsyncDispatcher.h"
-#include "AsyncTaskHandle.h"
-
-#include "webgpu/webgpu_cpp.h"
-
-namespace jsi = facebook::jsi;
-
-namespace rnwgpu::async {
-
-class AsyncRunner : public std::enable_shared_from_this<AsyncRunner> {
-public:
-  using TaskCallback =
-      std::function<void(const AsyncTaskHandle::ResolveFunction &,
-                         const AsyncTaskHandle::RejectFunction &)>;
-
-  AsyncRunner(wgpu::Instance instance,
-              std::shared_ptr<AsyncDispatcher> dispatcher);
-
-  static std::shared_ptr<AsyncRunner> get(jsi::Runtime &runtime);
-  static std::shared_ptr<AsyncRunner>
-  getOrCreate(jsi::Runtime &runtime, wgpu::Instance instance,
-              std::shared_ptr<AsyncDispatcher> dispatcher);
-
-  AsyncTaskHandle postTask(const TaskCallback &callback,
-                           bool keepPumping = true);
-
-  void requestTick();
-  void tick(jsi::Runtime &runtime);
-  void onTaskSettled(bool keepPumping);
-
-  std::shared_ptr<AsyncDispatcher> dispatcher() const;
-
-private:
-  static jsi::UUID runtimeDataUUID();
-
-  wgpu::Instance _instance;
-  std::shared_ptr<AsyncDispatcher> _dispatcher;
-  std::atomic<size_t> _pendingTasks;
-  std::atomic<size_t> _pumpTasks;
-  std::atomic<bool> _tickScheduled;
-  std::atomic<int64_t> _lastTickTimeNs;
-};
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp
index 6b262005a..c0876c1e3 100644
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp
+++ b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.cpp
@@ -1,20 +1,19 @@
 #include "AsyncTaskHandle.h"
 
+#include <memory>
 #include <string>
 #include <utility>
 
 #include "Promise.h"
 
-#include "AsyncRunner.h"
-
 namespace rnwgpu::async {
 
 using Action = std::function<void(jsi::Runtime &, rnwgpu::Promise &)>;
 
 struct AsyncTaskHandle::State
     : public std::enable_shared_from_this<AsyncTaskHandle::State> {
-  State(std::shared_ptr<AsyncRunner> runner, bool keepPumping)
-      : runner(std::move(runner)), keepPumping(keepPumping) {}
+  explicit State(std::shared_ptr<RuntimeScheduler> scheduler)
+      : scheduler(std::move(scheduler)) {}
 
   void settle(Action action);
   void attachPromise(const std::shared_ptr<rnwgpu::Promise> &promise);
@@ -26,12 +25,11 @@ struct AsyncTaskHandle::State
   std::shared_ptr<rnwgpu::Promise> currentPromise();
 
   std::mutex mutex;
-  std::weak_ptr<AsyncRunner> runner;
+  std::shared_ptr<RuntimeScheduler> scheduler;
   std::shared_ptr<rnwgpu::Promise> promise;
   std::optional<Action> pendingAction;
   bool settled = false;
   std::shared_ptr<State> keepAlive;
-  bool keepPumping;
 };
 
 // MARK: - State helpers
@@ -77,26 +75,18 @@ void AsyncTaskHandle::State::attachPromise(
 }
 
 void AsyncTaskHandle::State::schedule(Action action) {
-  auto runnerRef = runner.lock();
-  if (!runnerRef) {
+  if (!scheduler) {
     return;
   }
 
   auto promiseRef = currentPromise();
   if (!promiseRef) {
-    runnerRef->onTaskSettled(keepPumping);
-    return;
-  }
-
-  auto dispatcherRef = runnerRef->dispatcher();
-  if (!dispatcherRef) {
-    runnerRef->onTaskSettled(keepPumping);
     return;
   }
 
-  dispatcherRef->post([self = shared_from_this(), action = std::move(action),
-                       runnerRef, promiseRef](jsi::Runtime &runtime) mutable {
-    runnerRef->onTaskSettled(self->keepPumping);
+  scheduler->scheduleOnJS([self = shared_from_this(),
+                           action = std::move(action),
+                           promiseRef](jsi::Runtime &runtime) mutable {
     action(runtime, *promiseRef);
     std::lock_guard<std::mutex> lock(self->mutex);
     self->keepAlive.reset();
@@ -149,9 +139,8 @@ AsyncTaskHandle::AsyncTaskHandle(std::shared_ptr<State> state)
 bool AsyncTaskHandle::valid() const { return _state != nullptr; }
 
 AsyncTaskHandle
-AsyncTaskHandle::create(const std::shared_ptr<AsyncRunner> &runner,
-                        bool keepPumping) {
-  auto state = std::make_shared<State>(runner, keepPumping);
+AsyncTaskHandle::create(const std::shared_ptr<RuntimeScheduler> &scheduler) {
+  auto state = std::make_shared<State>(scheduler);
   state->keepAlive = state;
   return AsyncTaskHandle(std::move(state));
 }
diff --git a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h
index cb6c7a2a4..e3a224563 100644
--- a/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h
+++ b/packages/webgpu/cpp/rnwgpu/async/AsyncTaskHandle.h
@@ -8,7 +8,7 @@
 
 #include <jsi/jsi.h>
 
-#include "AsyncDispatcher.h"
+#include "RuntimeScheduler.h"
 
 namespace rnwgpu {
 class Promise;
@@ -16,11 +16,13 @@ class Promise;
 
 namespace rnwgpu::async {
 
-class AsyncRunner;
-
 /**
  * Represents a pending asynchronous WebGPU operation that can be converted into
  * a JavaScript Promise.
+ *
+ * The native callback (resolve/reject) may be invoked from any thread (e.g. a
+ * GpuEventLoop worker); the actual Promise settlement is marshalled onto the
+ * owning runtime's JS thread via a RuntimeScheduler.
  */
 class AsyncTaskHandle {
 public:
@@ -34,7 +36,7 @@ class AsyncTaskHandle {
   AsyncTaskHandle();
 
   /**
-   * Internal constructor used by AsyncRunner.
+   * Internal constructor used by RuntimeContext.
    */
   explicit AsyncTaskHandle(std::shared_ptr<State> state);
 
@@ -45,8 +47,8 @@ class AsyncTaskHandle {
 
   void attachPromise(const std::shared_ptr<rnwgpu::Promise> &promise) const;
 
-  static AsyncTaskHandle create(const std::shared_ptr<AsyncRunner> &runner,
-                                bool keepPumping);
+  static AsyncTaskHandle
+  create(const std::shared_ptr<RuntimeScheduler> &scheduler);
 
 private:
   std::shared_ptr<State> _state;
diff --git a/packages/webgpu/cpp/rnwgpu/async/CallInvokerScheduler.cpp b/packages/webgpu/cpp/rnwgpu/async/CallInvokerScheduler.cpp
new file mode 100644
index 000000000..2ef72f407
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/CallInvokerScheduler.cpp
@@ -0,0 +1,21 @@
+#include "CallInvokerScheduler.h"
+
+#include <memory>
+#include <utility>
+
+namespace rnwgpu::async {
+
+CallInvokerScheduler::CallInvokerScheduler(
+    std::shared_ptr<react::CallInvoker> invoker)
+    : _invoker(std::move(invoker)) {}
+
+// Hands `job` to the CallInvoker; a null invoker or empty job is dropped.
+void CallInvokerScheduler::scheduleOnJS(
+    std::function<void(jsi::Runtime &)> job) {
+  if (_invoker && job) {
+    _invoker->invokeAsync(
+        [task = std::move(job)](jsi::Runtime &rt) { task(rt); });
+  }
+}
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/CallInvokerScheduler.h b/packages/webgpu/cpp/rnwgpu/async/CallInvokerScheduler.h
new file mode 100644
index 000000000..cbb6a9174
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/CallInvokerScheduler.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include <ReactCommon/CallInvoker.h>
+#include <jsi/jsi.h>
+
+#include "RuntimeScheduler.h"
+
+namespace rnwgpu::async {
+
+namespace jsi = facebook::jsi;
+namespace react = facebook::react;
+
+/**
+ * RuntimeScheduler for the main React Native JS runtime, backed by
+ * react::CallInvoker::invokeAsync. invokeAsync is safe to call from any thread
+ * and delivers the work on the JS thread with the runtime, which is exactly the
+ * contract RuntimeScheduler requires.
+ */
+class CallInvokerScheduler final : public RuntimeScheduler {
+public:
+  explicit CallInvokerScheduler(std::shared_ptr<react::CallInvoker> invoker);
+
+  void scheduleOnJS(std::function<void(jsi::Runtime &)> job) override;
+
+private:
+  std::shared_ptr<react::CallInvoker> _invoker;
+};
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/GpuEventLoop.cpp b/packages/webgpu/cpp/rnwgpu/async/GpuEventLoop.cpp
new file mode 100644
index 000000000..2bd643b39
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/GpuEventLoop.cpp
@@ -0,0 +1,113 @@
+#include "GpuEventLoop.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <thread>
+#include <utility>
+
+#include "WGPULogger.h"
+
+namespace rnwgpu::async {
+
+namespace {
+constexpr const char *TAG = "GpuEventLoop";
+
+// Worker cap: hardware_concurrency() (4 if unknown) clamped to [2, 8].
+std::size_t computeMaxWorkers() {
+  constexpr std::size_t kFloor = 2;
+  constexpr std::size_t kCeiling = 8;
+  constexpr std::size_t kFallback = 4; // hardware_concurrency() may report 0
+  const std::size_t reported = std::thread::hardware_concurrency();
+  // Small bounded pool: overlaps the few async GPU ops in flight at once.
+  return std::clamp(reported == 0 ? kFallback : reported, kFloor, kCeiling);
+}
+} // namespace
+
+GpuEventLoop::GpuEventLoop(wgpu::Instance instance)
+    : _state(std::make_shared<State>(std::move(instance))) {
+  const std::size_t workerCap = computeMaxWorkers();
+  _state->maxWorkers = workerCap; // Fixed cap; workers are spawned lazily.
+  Logger::logToConsole("[%s] Created (maxWorkers=%zu)", TAG, workerCap);
+}
+
+GpuEventLoop::~GpuEventLoop() {
+  // Clear `running` while holding the mutex, then release it before notifying.
+  std::unique_lock<std::mutex> guard(_state->mutex);
+  _state->running.store(false, std::memory_order_release);
+  guard.unlock();
+
+  // Idle workers wake, see !running and exit. Workers still blocked in WaitAny
+  // are not joined: they hold the shared State (and its wgpu::Instance ref)
+  // until their future completes, so teardown never waits on GPU work.
+  _state->cv.notify_all();
+}
+
+// Queue `future` for a background WaitAny. Safe to call from any thread.
+void GpuEventLoop::addFuture(wgpu::Future future) {
+  if (future.id == 0) {
+    // No event to wait on (deferred/immediate resolution): nothing to queue.
+    return;
+  }
+
+  std::lock_guard<std::mutex> lock(_state->mutex);
+  if (!_state->running.load(std::memory_order_acquire)) {
+    return; // Shutting down: stop accepting new waits.
+  }
+  _state->queue.push(future);
+
+  // Spawn when queued futures outnumber idle workers (not just when none is
+  // idle): a notified-but-not-yet-awake worker still counts as idle, so a
+  // back-to-back second future would otherwise wait behind its WaitAny.
+  if (_state->queue.size() > _state->idleWorkers &&
+      _state->totalWorkers < _state->maxWorkers) {
+    _state->totalWorkers++;
+    std::thread(&GpuEventLoop::worker, _state).detach();
+    Logger::logToConsole("[%s] grew pool to %zu worker(s)", TAG,
+                         _state->totalWorkers);
+  } else {
+    _state->cv.notify_one();
+  }
+}
+
+// Worker loop: pops one future at a time and blocks in WaitAny until it fires.
+void GpuEventLoop::worker(std::shared_ptr<State> state) {
+  for (;;) {
+    wgpu::Future future{};
+    {
+      std::unique_lock<std::mutex> lock(state->mutex);
+      state->idleWorkers++;
+      state->cv.wait(lock, [&state] {
+        return !state->running.load(std::memory_order_acquire) ||
+               !state->queue.empty();
+      });
+      state->idleWorkers--;
+
+      if (state->queue.empty()) {
+        // Only happens when shutting down.
+        state->totalWorkers--;
+        return;
+      }
+
+      future = state->queue.front();
+      state->queue.pop();
+    }
+
+    // Single-future wait: always a legal single-source WaitAny. Blocks (no
+    // CPU cost) until the GPU work completes; Dawn then invokes the future's
+    // callback on this thread, which marshals back via its RuntimeScheduler.
+    auto status = state->instance.WaitAny(future, UINT64_MAX);
+    if (status != wgpu::WaitStatus::Success) {
+      // With an infinite timeout on a single future this is not expected. If it
+      // happens, Dawn did not invoke the future's callback, so the associated
+      // JS Promise will never settle. Log it so the otherwise-silent hang is at
+      // least observable.
+      Logger::logToConsole(
+          "[%s] WaitAny returned non-success status %u for future %llu; its "
+          "Promise will not settle.",
+          TAG, static_cast<unsigned int>(status),
+          static_cast<unsigned long long>(future.id));
+    }
+  }
+}
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/GpuEventLoop.h b/packages/webgpu/cpp/rnwgpu/async/GpuEventLoop.h
new file mode 100644
index 000000000..07e90cd98
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/GpuEventLoop.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+#include "webgpu/webgpu_cpp.h"
+
+namespace rnwgpu::async {
+
+/**
+ * Background, event-driven driver for Dawn async operations. Replaces the old
+ * JS-thread ProcessEvents polling loop.
+ *
+ * Each pending wgpu::Future (registered with CallbackMode::WaitAnyOnly) is
+ * handed to addFuture() and waited on by a worker thread via
+ * `instance.WaitAny(future, UINT64_MAX)`. The wait is genuinely event-driven
+ * (zero idle CPU) and resolves the instant the GPU work completes, at which
+ * point Dawn fires the future's callback on the worker thread. That callback is
+ * responsible for marshalling back to the owning runtime's JS thread (via a
+ * RuntimeScheduler) to settle the JS Promise.
+ *
+ * Threading model (validated in Phase 0, spike 2): each WaitAny call waits on a
+ * *single* future, which is always a legal single-source wait. Multiple workers
+ * may block in WaitAny on the same instance concurrently; Dawn's EventManager
+ * is designed for this.
+ *
+ * The worker pool grows lazily up to a small cap as concurrent work demands,
+ * and threads are reused. Shared state is held behind a shared_ptr so detached
+ * workers (and the wgpu::Instance ref they need) outlive this object safely.
+ */
+class GpuEventLoop {
+public:
+  explicit GpuEventLoop(wgpu::Instance instance);
+  ~GpuEventLoop();
+
+  GpuEventLoop(const GpuEventLoop &) = delete;
+  GpuEventLoop &operator=(const GpuEventLoop &) = delete;
+
+  /**
+   * Wait for `future` to complete on a background thread. A future with id == 0
+   * (no event to wait on, e.g. a deferred/immediate resolution) is ignored.
+   * Thread-safe.
+   */
+  void addFuture(wgpu::Future future);
+
+private:
+  struct State {
+    explicit State(wgpu::Instance instance) : instance(std::move(instance)) {}
+
+    wgpu::Instance instance;        // instance the workers call WaitAny() on
+    std::mutex mutex;               // held while touching queue and counters
+    std::condition_variable cv;     // idle workers block on this
+    std::queue<wgpu::Future> queue; // futures not yet taken by a worker
+    std::atomic_bool running{true}; // when false, idle workers exit
+    std::size_t idleWorkers = 0;    // workers currently waiting on `cv`
+    std::size_t totalWorkers = 0;   // worker threads still alive
+    std::size_t maxWorkers = 1;     // presumably the pool cap; TODO confirm
+  };
+
+  static void worker(std::shared_ptr<State> state);
+
+  std::shared_ptr<State> _state;
+};
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp b/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp
deleted file mode 100644
index 6231a833c..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "JSIMicrotaskDispatcher.h"
-
-#include <utility>
-
-namespace rnwgpu::async {
-
-JSIMicrotaskDispatcher::JSIMicrotaskDispatcher(jsi::Runtime &runtime)
-    : _runtime(runtime) {}
-
-void JSIMicrotaskDispatcher::post(Work work) {
-  auto microtask = jsi::Function::createFromHostFunction(
-      _runtime, jsi::PropNameID::forAscii(_runtime, "AsyncMicrotask"), 0,
-      [work = std::move(work)](
-          jsi::Runtime &runtime, const jsi::Value & /*thisValue*/,
-          const jsi::Value * /*args*/, size_t /*count*/) -> jsi::Value {
-        work(runtime);
-        return jsi::Value::undefined();
-      });
-
-  _runtime.queueMicrotask(std::move(microtask));
-}
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.h b/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.h
deleted file mode 100644
index bae208c5d..000000000
--- a/packages/webgpu/cpp/rnwgpu/async/JSIMicrotaskDispatcher.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "AsyncDispatcher.h"
-
-namespace rnwgpu::async {
-
-/**
- * Dispatcher implementation backed by `jsi::Runtime::queueMicrotask`.
- */
-class JSIMicrotaskDispatcher final
-    : public AsyncDispatcher,
-      public std::enable_shared_from_this<JSIMicrotaskDispatcher> {
-public:
-  explicit JSIMicrotaskDispatcher(jsi::Runtime &runtime);
-
-  void post(Work work) override;
-
-private:
-  jsi::Runtime &_runtime;
-};
-
-} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.cpp b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.cpp
new file mode 100644
index 000000000..f297ae6b0
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.cpp
@@ -0,0 +1,92 @@
+#include "RuntimeContext.h"
+
+#include <memory>
+#include <stdexcept>
+#include <utility>
+
+#include "AsyncTaskHandle.h"
+#include "WGPULogger.h"
+
+namespace rnwgpu::async {
+
+namespace {
+struct RuntimeData { // payload kept in the runtime's data slot (see get())
+  std::shared_ptr<RuntimeContext> runner;
+};
+constexpr const char *TAG = "RuntimeContext"; // prefix used in log lines
+} // namespace
+
+RuntimeContext::RuntimeContext(std::shared_ptr<RuntimeScheduler> scheduler,
+                               std::shared_ptr<GpuEventLoop> eventLoop)
+    : _scheduler(std::move(scheduler)), _eventLoop(std::move(eventLoop)) {
+  if (!_scheduler) { // a null scheduler is rejected up front
+    throw std::runtime_error(
+        "RuntimeContext requires a valid RuntimeScheduler.");
+  }
+  if (!_eventLoop) { // likewise for the event loop
+    throw std::runtime_error("RuntimeContext requires a valid GpuEventLoop.");
+  }
+  Logger::logToConsole("[%s] Created runner (scheduler=%p, eventLoop=%p)", TAG,
+                       _scheduler.get(), _eventLoop.get());
+}
+
+std::shared_ptr<RuntimeContext> RuntimeContext::get(jsi::Runtime &runtime) {
+  // An empty data slot means no context was installed on this runtime.
+  const auto slot = runtime.getRuntimeData(runtimeDataUUID());
+  if (slot == nullptr) {
+    return nullptr;
+  }
+  return std::static_pointer_cast<RuntimeData>(slot)->runner;
+}
+
+std::shared_ptr<RuntimeContext>
+RuntimeContext::getOrCreate(jsi::Runtime &runtime,
+                            std::shared_ptr<RuntimeScheduler> scheduler,
+                            std::shared_ptr<GpuEventLoop> eventLoop) {
+  // Reuse the context already installed on this runtime, if any.
+  if (auto installed = get(runtime)) {
+    return installed;
+  }
+
+  // Otherwise build one and park it in the runtime's data slot for get().
+  auto holder = std::make_shared<RuntimeData>();
+  holder->runner = std::make_shared<RuntimeContext>(std::move(scheduler),
+                                                    std::move(eventLoop));
+  runtime.setRuntimeData(runtimeDataUUID(), holder);
+  return holder->runner;
+}
+
+AsyncTaskHandle RuntimeContext::postTask(const TaskCallback &callback) {
+  auto handle = AsyncTaskHandle::create(_scheduler);
+  if (!handle.valid()) {
+    throw std::runtime_error("Failed to create AsyncTaskHandle.");
+  }
+  // Settlement functions passed to the callback for the async operation.
+  auto resolve = handle.createResolveFunction();
+  auto reject = handle.createRejectFunction();
+  // Run the callback now; if it throws, reject instead of propagating.
+  wgpu::Future future{};
+  try {
+    future = callback(resolve, reject);
+  } catch (const std::exception &exception) {
+    reject(exception.what());
+    return handle; // the future is not handed to the event loop
+  } catch (...) {
+    reject("Unknown native error in RuntimeContext::postTask.");
+    return handle;
+  }
+  // Hand the returned future to the event loop so it gets waited on.
+  _eventLoop->addFuture(future);
+  return handle;
+}
+
+std::shared_ptr<RuntimeScheduler> RuntimeContext::scheduler() const {
+  return _scheduler; // the scheduler supplied at construction
+}
+
+jsi::UUID RuntimeContext::runtimeDataUUID() {
+  static const auto uuid = jsi::UUID(); // NOTE(review): confirm this is unique
+  return uuid; // same key on every call; used by get() and getOrCreate()
+}
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.h b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.h
new file mode 100644
index 000000000..a7a5d46f4
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/RuntimeContext.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include <jsi/jsi.h>
+
+#include "AsyncTaskHandle.h"
+#include "GpuEventLoop.h"
+#include "RuntimeScheduler.h"
+
+#include "webgpu/webgpu_cpp.h"
+
+namespace jsi = facebook::jsi;
+
+namespace rnwgpu::async {
+
+/**
+ * Per-runtime coordinator for asynchronous WebGPU operations.
+ *
+ * Bundles the runtime's RuntimeScheduler (how to settle Promises back on the
+ * owning JS thread) with the GpuEventLoop (how to wait on Dawn futures off the
+ * JS thread). This replaces the previous ProcessEvents polling design: there is
+ * no tick loop and no idle CPU usage.
+ *
+ * A task callback registers a Dawn async op with CallbackMode::WaitAnyOnly and
+ * returns the resulting wgpu::Future, which is handed to the GpuEventLoop. A
+ * returned future with id == 0 means "no event to wait on" (deferred/immediate
+ * resolution, e.g. GPUDevice::getLost).
+ */
+class RuntimeContext : public std::enable_shared_from_this<RuntimeContext> {
+public:
+  using TaskCallback =
+      std::function<wgpu::Future(const AsyncTaskHandle::ResolveFunction &,
+                                 const AsyncTaskHandle::RejectFunction &)>;
+  // Throws std::runtime_error when either pointer is null.
+  RuntimeContext(std::shared_ptr<RuntimeScheduler> scheduler,
+                 std::shared_ptr<GpuEventLoop> eventLoop);
+  // Per-runtime lookup: get() may return nullptr, getOrCreate() never does.
+  static std::shared_ptr<RuntimeContext> get(jsi::Runtime &runtime);
+  static std::shared_ptr<RuntimeContext>
+  getOrCreate(jsi::Runtime &runtime,
+              std::shared_ptr<RuntimeScheduler> scheduler,
+              std::shared_ptr<GpuEventLoop> eventLoop);
+  // Runs `callback` immediately and returns the handle for its result.
+  AsyncTaskHandle postTask(const TaskCallback &callback);
+
+  std::shared_ptr<RuntimeScheduler> scheduler() const;
+
+private:
+  static jsi::UUID runtimeDataUUID(); // key for the runtime-data slot
+
+  std::shared_ptr<RuntimeScheduler> _scheduler;
+  std::shared_ptr<GpuEventLoop> _eventLoop;
+};
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/cpp/rnwgpu/async/RuntimeScheduler.h b/packages/webgpu/cpp/rnwgpu/async/RuntimeScheduler.h
new file mode 100644
index 000000000..926b494c3
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/async/RuntimeScheduler.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <functional>
+
+#include <jsi/jsi.h>
+
+namespace rnwgpu::async {
+
+namespace jsi = facebook::jsi;
+
+/**
+ * Thread-safe "post this job onto a specific runtime's JS thread".
+ *
+ * Replaces the old AsyncDispatcher / JSIMicrotaskDispatcher, whose
+ * queueMicrotask-based dispatch was only safe to call from the runtime's own
+ * thread. A RuntimeScheduler can be called from any thread (e.g. the
+ * GpuEventLoop background threads) and guarantees the job runs on the owning
+ * runtime's JS thread.
+ */
+class RuntimeScheduler {
+public:
+  virtual ~RuntimeScheduler() = default; // virtual: used through base pointers
+
+  /**
+   * Schedule `job` to run on this runtime's JS thread; it is passed a
+   * jsi::Runtime reference. Callable from any thread, jobs run in FIFO order.
+   */
+  virtual void scheduleOnJS(std::function<void(jsi::Runtime &)> job) = 0;
+};
+
+} // namespace rnwgpu::async
diff --git a/packages/webgpu/src/Canvas.tsx b/packages/webgpu/src/Canvas.tsx
index 1030f3e38..43c9621e7 100644
--- a/packages/webgpu/src/Canvas.tsx
+++ b/packages/webgpu/src/Canvas.tsx
@@ -20,6 +20,15 @@ export interface NativeCanvas {
 }
 
 export type RNCanvasContext = GPUCanvasContext & {
+  /**
+   * Present the current frame.
+   *
+   * Presentation is manual on every runtime: the main JS runtime, the
+   * Reanimated UI runtime, and dedicated worklet runtimes (e.g.
+   * `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame
+   * processor). Call it after `queue.submit()`; it runs synchronously on the
+   * calling thread, so the frame is presented from the rendering thread.
+   */
   present: () => void;
 };
 
diff --git a/packages/webgpu/src/Offscreen.ts b/packages/webgpu/src/Offscreen.ts
index c4e460bb2..4deab8a1c 100644
--- a/packages/webgpu/src/Offscreen.ts
+++ b/packages/webgpu/src/Offscreen.ts
@@ -65,7 +65,7 @@ class GPUOffscreenCanvasContext implements GPUCanvasContext {
   }
 
   present() {
-    // Do nothing
+    // Offscreen contexts have nothing to present; readback is via getImageData.
   }
 
   getDevice() {
diff --git a/packages/webgpu/src/WebPolyfillGPUModule.ts b/packages/webgpu/src/WebPolyfillGPUModule.ts
index 9dcc1f1c5..8b629a0c9 100644
--- a/packages/webgpu/src/WebPolyfillGPUModule.ts
+++ b/packages/webgpu/src/WebPolyfillGPUModule.ts
@@ -40,9 +40,9 @@ function makeWebGPUCanvasContext(
   }
 
   const context = canvas.getContext("webgpu")!;
-  return Object.assign(context, {
-    present: () => {},
-  });
+  // On web there is no manual present; expose a no-op so code that calls
+  // RNCanvasContext's present() after submit (as on native) works here too.
+  return Object.assign(context, { present: () => {} });
 }
 
 // @ts-expect-error - polyfill for RNWebGPU native module
diff --git a/packages/webgpu/src/types.ts b/packages/webgpu/src/types.ts
index ef06c192c..df3443157 100644
--- a/packages/webgpu/src/types.ts
+++ b/packages/webgpu/src/types.ts
@@ -9,6 +9,15 @@ export interface NativeCanvas {
 }
 
 export type RNCanvasContext = GPUCanvasContext & {
+  /**
+   * Present the current frame.
+   *
+   * Presentation is manual on every runtime: the main JS runtime, the
+   * Reanimated UI runtime, and dedicated worklet runtimes (e.g.
+   * `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame
+   * processor). Call it after `queue.submit()`; it runs synchronously on the
+   * calling thread, so the frame is presented from the rendering thread.
+   */
   present: () => void;
 };