@@ -3012,5 +3012,189 @@ describe("RunEngine debounce", () => {
30123012 }
30133013 }
30143014 ) ;
3015+
// Reproduces the hot-key contention from TRI-8758: fires N concurrent
// triggers on the same debounce key while the seeded run is still DELAYED.
//
// - fixed=true: fast-path skip + 1s quantization on. The herd collapses on
//   the unlocked read and onto the same quantized newDelayUntil, so almost
//   every call short-circuits and `taskRun.update` is barely written.
// - fixed=false: fast-path off and quantization off (closer to the
//   pre-fix behaviour). The lock-contention fallback (also part of this
//   PR) still catches herd lock failures; this case validates that even
//   without the fast-path the system stays correct under stress, just at
//   higher Redlock cost.
for (const fixed of [true, false]) {
  containerTest(
    `Debounce hot-key stress (fixed=${fixed} ): N concurrent triggers stay correct`,
    async ({ prisma, redisOptions }) => {
      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");

      const engine = new RunEngine({
        prisma,
        worker: {
          redis: redisOptions,
          workers: 1,
          tasksPerWorker: 10,
          pollIntervalMs: 100,
        },
        queue: {
          redis: redisOptions,
        },
        runLock: {
          redis: redisOptions,
        },
        machines: {
          defaultMachine: "small-1x",
          machines: {
            "small-1x": {
              name: "small-1x" as const,
              cpu: 0.5,
              memory: 0.5,
              centsPerMs: 0.0001,
            },
          },
          baseCostInCents: 0.0001,
        },
        debounce: {
          maxDebounceDurationMs: 10 * 60_000,
          fastPathSkipEnabled: fixed,
          // 1s buckets - same as the real default - or 0 to mimic the
          // pre-fix behaviour where every concurrent trigger has a slightly
          // larger newDelayUntil than the last.
          quantizeNewDelayUntilMs: fixed ? 1000 : 0,
        },
        tracer: trace.getTracer("test", "0.0.0"),
      });

      try {
        const taskIdentifier = "test-task";
        await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);

        // Builds the trigger payload shared by the seed run and the herd.
        // Only the per-run identity fields vary; everything else (queue,
        // debounce key, 30s delay) must be identical so all calls debounce
        // onto the same run. delayUntil is evaluated at call time.
        const buildTriggerParams = (run: {
          number: number;
          friendlyId: string;
          payload: string;
          traceId: string;
          spanId: string;
        }): Parameters<typeof engine.trigger>[0] => ({
          ...run,
          environment: authenticatedEnvironment,
          taskIdentifier,
          payloadType: "application/json",
          context: {},
          traceContext: {},
          workerQueue: "main",
          queue: "task/test-task",
          isTest: false,
          tags: [],
          delayUntil: new Date(Date.now() + 30_000),
          debounce: {
            key: "stress-key",
            delay: "30s",
          },
        });

        // Seed the debounce key with an initial delayed run (now + 30s).
        const seed = await engine.trigger(
          buildTriggerParams({
            number: 0,
            friendlyId: "run_stress0",
            payload: '{"data": "seed"}',
            traceId: "t_stress_seed",
            spanId: "s_stress_seed",
          }),
          prisma
        );

        // Pull delayUntil in to a small but safe future offset. The herd's
        // newDelayUntil (now + 30s) will be meaningfully later than the
        // current value, so the fast-path-off branch reschedules. The
        // ~2s buffer keeps the run DELAYED long enough to absorb startup
        // jitter before the first trigger writes delayUntil = now + 30s.
        await prisma.taskRun.update({
          where: { id: seed.id },
          data: { delayUntil: new Date(Date.now() + 2_000) },
        });

        // Count taskRun.update calls so we can assert that the fast-path
        // actually short-circuits the herd's writes. We monkey-patch the
        // bound method on the prisma instance the engine is holding; the
        // counter starts after the setup update above, so only herd writes
        // are counted.
        let updateCount = 0;
        const originalUpdate = prisma.taskRun.update.bind(prisma.taskRun);
        const patchableTaskRun = prisma.taskRun as unknown as { update: typeof originalUpdate };
        patchableTaskRun.update = ((...args: Parameters<typeof originalUpdate>) => {
          updateCount++;
          return originalUpdate(...args);
        }) as typeof originalUpdate;

        try {
          const N = 40;

          // Start the clock before launching the herd: each engine.trigger
          // call begins executing as soon as it is invoked, so sampling the
          // time afterwards would leave the launch phase out of the
          // reported duration.
          const start = performance.now();
          const triggers = Array.from({ length: N }, (_, i) =>
            engine.trigger(
              buildTriggerParams({
                number: i + 1,
                friendlyId: `run_stress${i + 1} `,
                payload: `{"data": "stress-${i} "}`,
                traceId: `t_stress_${i} `,
                spanId: `s_stress_${i} `,
              }),
              prisma
            )
          );

          const settled = await Promise.allSettled(triggers);
          const durationMs = performance.now() - start;

          const fulfilled = settled.filter(
            (r): r is PromiseFulfilledResult<{ id: string }> => r.status === "fulfilled"
          );
          const rejected = settled.filter((r) => r.status === "rejected");

          // No 5xx feedback loop: every concurrent trigger succeeds and
          // returns the existing run id.
          expect(rejected).toHaveLength(0);
          expect(fulfilled).toHaveLength(N);
          for (const r of fulfilled) {
            expect(r.value.id).toBe(seed.id);
          }

          // Only one row, regardless of contention path.
          const runs = await prisma.taskRun.findMany({
            where: { taskIdentifier, runtimeEnvironmentId: authenticatedEnvironment.id },
          });
          expect(runs).toHaveLength(1);

          console.log(
            `[stress fixed=${fixed} ] N=${N} duration=${durationMs.toFixed(
              0
            )} ms taskRun.update=${updateCount} `
          );

          if (fixed) {
            // With fast-path + quantization: the herd collapses onto the
            // same quantized newDelayUntil. Trigger #1 takes the lock and
            // updates delayUntil; every subsequent trigger sees a covering
            // delayUntil on the unlocked read and short-circuits. So at
            // most one update lands on the run row.
            // NOTE(review): if the herd happens to straddle a 1s
            // quantization boundary, two distinct quantized values may be
            // possible - confirm against the engine's rounding before
            // treating a count of 2 as a regression.
            expect(updateCount).toBeLessThanOrEqual(1);
          }
        } finally {
          // Always undo the monkey-patch, even when an assertion fails.
          patchableTaskRun.update = originalUpdate;
        }
      } finally {
        await engine.quit();
      }
    }
  );
}
30153199} ) ;
30163200
0 commit comments