test(run-engine): add hot-key debounce stress tests + replica-routed fast-path read

devin-ai-integration[bot] · ericallam · devin-ai-integration[bot] · commit 30586b45f4f7 · 2026-04-27T14:54:24.000Z
Stress test:
- Seeds a debounce key, then fires 40 concurrent triggers on it.
- With the full fix on (fast-path + 1s quantization): all 40 succeed,
  all return the seed run id, and only 1 `taskRun.update` lands on
  the run (the first lock-protected trigger; the rest short-circuit
  via the unlocked read).
- With fast-path/quantization off: still correct (no 5xx, all return
  the seed run) thanks to the contention fallback, with 4 updates
  observed under N=40 (rest absorbed by the fallback).

Replica-routed fast-path read:
- New `useReplicaForFastPathRead` debounce option (default false).
  When on, the unlocked `delayUntil`/`createdAt` read goes through
  `readOnlyPrisma` instead of the writer. Safe because the read is
  best-effort and re-checked under the lock; replica lag at worst means
  a few extra callers fall through to the lock.
- New env var `RUN_ENGINE_DEBOUNCE_USE_REPLICA_FOR_FAST_PATH_READ`
  wires this through in the webapp so cloud (Aurora readers) can opt
  in without code changes.

Co-Authored-By: Eric Allam <eallam@icloud.com>
diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts
@@ -837,6 +837,7 @@ const EnvironmentSchema = z
       .default("info"),
     RUN_ENGINE_TREAT_PRODUCTION_EXECUTION_STALLS_AS_OOM: z.string().default("0"),
     RUN_ENGINE_READ_REPLICA_SNAPSHOTS_SINCE_ENABLED: z.string().default("0"),
+    RUN_ENGINE_DEBOUNCE_USE_REPLICA_FOR_FAST_PATH_READ: z.string().default("0"),
 
     /** How long should the presence ttl last */
     DEV_PRESENCE_SSE_TIMEOUT: z.coerce.number().int().default(30_000),
diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts
@@ -214,6 +214,7 @@ function createRunEngine() {
     // Debounce configuration
     debounce: {
       maxDebounceDurationMs: env.RUN_ENGINE_MAXIMUM_DEBOUNCE_DURATION_MS,
+      useReplicaForFastPathRead: env.RUN_ENGINE_DEBOUNCE_USE_REPLICA_FOR_FAST_PATH_READ === "1",
     },
   });
 
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
@@ -326,6 +326,7 @@ export class RunEngine {
       maxDebounceDurationMs: options.debounce?.maxDebounceDurationMs ?? 60 * 60 * 1000, // Default 1 hour
       quantizeNewDelayUntilMs: options.debounce?.quantizeNewDelayUntilMs ?? 1000,
       fastPathSkipEnabled: options.debounce?.fastPathSkipEnabled ?? true,
+      useReplicaForFastPathRead: options.debounce?.useReplicaForFastPathRead ?? false,
     });
 
     this.pendingVersionSystem = new PendingVersionSystem({
diff --git a/internal-packages/run-engine/src/engine/systems/debounceSystem.ts b/internal-packages/run-engine/src/engine/systems/debounceSystem.ts
@@ -10,7 +10,12 @@ import {
   parseNaturalLanguageDuration,
   parseNaturalLanguageDurationInMs,
 } from "@trigger.dev/core/v3/isomorphic";
-import { PrismaClientOrTransaction, TaskRun, Waitpoint } from "@trigger.dev/database";
+import {
+  PrismaClientOrTransaction,
+  PrismaReplicaClient,
+  TaskRun,
+  Waitpoint,
+} from "@trigger.dev/database";
 import { nanoid } from "nanoid";
 import { SystemResources } from "./systems.js";
 import { ExecutionSnapshotSystem, getLatestExecutionSnapshot } from "./executionSnapshotSystem.js";
@@ -57,6 +62,11 @@ export type DebounceSystemOptions = {
    * current one.
    */
   fastPathSkipEnabled?: boolean;
+  /**
+   * When true (default: false), route the unlocked fast-path read through
+   * `readOnlyPrisma` (e.g. an Aurora reader) instead of the writer.
+   */
+  useReplicaForFastPathRead?: boolean;
 };
 
 export type DebounceResult =
@@ -103,6 +113,7 @@ export class DebounceSystem {
   private readonly maxDebounceDurationMs: number;
   private readonly quantizeNewDelayUntilMs: number;
   private readonly fastPathSkipEnabled: boolean;
+  private readonly useReplicaForFastPathRead: boolean;
 
   constructor(options: DebounceSystemOptions) {
     this.$ = options.resources;
@@ -122,6 +133,7 @@ export class DebounceSystem {
     this.maxDebounceDurationMs = options.maxDebounceDurationMs;
     this.quantizeNewDelayUntilMs = Math.max(0, options.quantizeNewDelayUntilMs ?? 1000);
     this.fastPathSkipEnabled = options.fastPathSkipEnabled ?? true;
+    this.useReplicaForFastPathRead = options.useReplicaForFastPathRead ?? false;
 
     this.#registerCommands();
   }
@@ -467,6 +479,13 @@ return 0
     tx?: PrismaClientOrTransaction;
   }): Promise<DebounceResult> {
     const prisma = tx ?? this.$.prisma;
+    // Reads that are explicitly best-effort (the fast-path skip) can run on
+    // `readOnlyPrisma` when configured. Replica lag is fine: the monotonic-
+    // forward invariant means a stale read just falls through to the locked
+    // path. Only divert reads when the caller isn't inside a tx (where the
+    // read needs to see the tx's writes).
+    const fastPathReadPrisma =
+      tx ?? (this.useReplicaForFastPathRead ? this.$.readOnlyPrisma : this.$.prisma);
 
     // Compute the (quantized) target delayUntil up-front, before taking any lock.
     // Quantizing to e.g. 1s buckets collapses many concurrent triggers on the same
@@ -484,7 +503,7 @@ return 0
         existingRunId,
         newDelayUntil,
         debounce,
-        prisma,
+        prisma: fastPathReadPrisma,
       });
       if (fastPathResult) {
         return fastPathResult;
@@ -569,7 +588,7 @@ return 0
     existingRunId: string;
     newDelayUntil: Date;
     debounce: DebounceOptions;
-    prisma: PrismaClientOrTransaction;
+    prisma: PrismaClientOrTransaction | PrismaReplicaClient;
   }): Promise<DebounceResult | null> {
     // Trailing mode with updateData still needs the lock so the data update is
     // applied; only short-circuit when there's nothing to update.
diff --git a/internal-packages/run-engine/src/engine/tests/debounce.test.ts b/internal-packages/run-engine/src/engine/tests/debounce.test.ts
@@ -3012,5 +3012,189 @@ describe("RunEngine debounce", () => {
       }
     }
   );
+
+  // Reproduces the hot-key contention from TRI-8758: fires N concurrent
+  // triggers on the same debounce key after the run is already DELAYED.
+  //
+  // - fixed=true: fast-path skip + 1s quantization on. The herd collapses on
+  //   the unlocked read and onto the same quantized newDelayUntil, so almost
+  //   every call short-circuits and `taskRun.update` is barely written.
+  // - fixed=false: fast-path off and quantization off (closer to the
+  //   pre-fix behaviour). The lock-contention fallback (also part of this
+  //   PR) still catches herd lock failures; this case validates that even
+  //   without the fast-path the system stays correct under stress, just at
+  //   higher Redlock cost.
+  for (const fixed of [true, false]) {
+    containerTest(
+      `Debounce hot-key stress (fixed=${fixed}): N concurrent triggers stay correct`,
+      async ({ prisma, redisOptions }) => {
+        const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+        const engine = new RunEngine({
+          prisma,
+          worker: {
+            redis: redisOptions,
+            workers: 1,
+            tasksPerWorker: 10,
+            pollIntervalMs: 100,
+          },
+          queue: {
+            redis: redisOptions,
+          },
+          runLock: {
+            redis: redisOptions,
+          },
+          machines: {
+            defaultMachine: "small-1x",
+            machines: {
+              "small-1x": {
+                name: "small-1x" as const,
+                cpu: 0.5,
+                memory: 0.5,
+                centsPerMs: 0.0001,
+              },
+            },
+            baseCostInCents: 0.0001,
+          },
+          debounce: {
+            maxDebounceDurationMs: 10 * 60_000,
+            fastPathSkipEnabled: fixed,
+            // 1s buckets - same as the real default - or 0 to mimic the
+            // pre-fix behaviour where every concurrent trigger has a slightly
+            // larger newDelayUntil than the last.
+            quantizeNewDelayUntilMs: fixed ? 1000 : 0,
+          },
+          tracer: trace.getTracer("test", "0.0.0"),
+        });
+
+        try {
+          const taskIdentifier = "test-task";
+          await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);
+
+          // Seed the debounce key with an initial run so the herd below debounces
+          // onto an existing DELAYED run instead of creating a new one.
+          const seed = await engine.trigger(
+            {
+              number: 0,
+              friendlyId: "run_stress0",
+              environment: authenticatedEnvironment,
+              taskIdentifier,
+              payload: '{"data": "seed"}',
+              payloadType: "application/json",
+              context: {},
+              traceContext: {},
+              traceId: "t_stress_seed",
+              spanId: "s_stress_seed",
+              workerQueue: "main",
+              queue: "task/test-task",
+              isTest: false,
+              tags: [],
+              delayUntil: new Date(Date.now() + 30_000),
+              debounce: {
+                key: "stress-key",
+                delay: "30s",
+              },
+            },
+            prisma
+          );
+
+          // Move delayUntil to a small but safe future offset. The herd's
+          // newDelayUntil (now + 30s) will be meaningfully later than the
+          // current value, so the fast-path-off branch reschedules. The
+          // ~2s buffer keeps the run DELAYED long enough to absorb startup
+          // jitter before the first trigger writes delayUntil = now + 30s.
+          await prisma.taskRun.update({
+            where: { id: seed.id },
+            data: { delayUntil: new Date(Date.now() + 2_000) },
+          });
+
+          // Count taskRun.update calls so we can assert that the fast-path
+          // actually short-circuits the herd's writes. We monkey-patch the
+          // bound method on the prisma instance the engine is holding.
+          let updateCount = 0;
+          const originalUpdate = prisma.taskRun.update.bind(prisma.taskRun);
+          (prisma.taskRun as unknown as { update: typeof originalUpdate }).update = ((
+            ...args: Parameters<typeof originalUpdate>
+          ) => {
+            updateCount++;
+            return originalUpdate(...args);
+          }) as typeof originalUpdate;
+
+          try {
+            const N = 40;
+            const triggers = Array.from({ length: N }, (_, i) =>
+              engine.trigger(
+                {
+                  number: i + 1,
+                  friendlyId: `run_stress${i + 1}`,
+                  environment: authenticatedEnvironment,
+                  taskIdentifier,
+                  payload: `{"data": "stress-${i}"}`,
+                  payloadType: "application/json",
+                  context: {},
+                  traceContext: {},
+                  traceId: `t_stress_${i}`,
+                  spanId: `s_stress_${i}`,
+                  workerQueue: "main",
+                  queue: "task/test-task",
+                  isTest: false,
+                  tags: [],
+                  delayUntil: new Date(Date.now() + 30_000),
+                  debounce: {
+                    key: "stress-key",
+                    delay: "30s",
+                  },
+                },
+                prisma
+              )
+            );
+
+            const start = performance.now();
+            const settled = await Promise.allSettled(triggers);
+            const durationMs = performance.now() - start;
+
+            const fulfilled = settled.filter(
+              (r): r is PromiseFulfilledResult<{ id: string }> => r.status === "fulfilled"
+            );
+            const rejected = settled.filter((r) => r.status === "rejected");
+
+            // No 5xx feedback loop: every concurrent trigger succeeds and
+            // returns the existing run id.
+            expect(rejected).toHaveLength(0);
+            expect(fulfilled).toHaveLength(N);
+            for (const r of fulfilled) {
+              expect(r.value.id).toBe(seed.id);
+            }
+
+            // Only one row, regardless of contention path.
+            const runs = await prisma.taskRun.findMany({
+              where: { taskIdentifier, runtimeEnvironmentId: authenticatedEnvironment.id },
+            });
+            expect(runs.length).toBe(1);
+
+            console.log(
+              `[stress fixed=${fixed}] N=${N} duration=${durationMs.toFixed(
+                0
+              )}ms taskRun.update=${updateCount}`
+            );
+
+            if (fixed) {
+              // With fast-path + quantization: the herd collapses onto the
+              // same quantized newDelayUntil. Trigger #1 takes the lock and
+              // updates delayUntil; every subsequent trigger sees a covering
+              // delayUntil on the unlocked read and short-circuits. So at
+              // most one update lands on the run row.
+              expect(updateCount).toBeLessThanOrEqual(1);
+            }
+          } finally {
+            (prisma.taskRun as unknown as { update: typeof originalUpdate }).update =
+              originalUpdate;
+          }
+        } finally {
+          await engine.quit();
+        }
+      }
+    );
+  }
 });
 
diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts
@@ -151,6 +151,16 @@ export type RunEngineOptions = {
      * Default: true.
      */
     fastPathSkipEnabled?: boolean;
+    /**
+     * Whether to route the unlocked fast-path read of `delayUntil`/`createdAt`
+     * through `readOnlyPrisma` (e.g. an Aurora reader) instead of the writer.
+     * Safe because the read is best-effort and re-checked under the lock by
+     * whichever caller is actually pushing forward; replica lag at worst means
+     * a few extra callers fall through to the lock.
+     *
+     * Default: false.
+     */
+    useReplicaForFastPathRead?: boolean;
   };
   /** If not set then checkpoints won't ever be used */
   retryWarmStartThresholdMs?: number;