From edded41cc30de62ca1a22ce2de4b0bf52f1adae0 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 20 Apr 2026 12:59:26 +0100 Subject: [PATCH 01/23] feat(webapp,clickhouse,database,core): Session primitive (server side) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Durable, typed, bidirectional I/O primitive that outlives a single run. Ship target is agent/chat use cases; run-scoped streams.pipe/streams.input are untouched and do not create Session rows. Postgres - New Session table: id, friendlyId, externalId, type (plain string), denormalised project/environment/organization scalar columns (no FKs), taskIdentifier, tags String[], metadata Json, closedAt, closedReason, expiresAt, timestamps - Point-lookup indexes only (friendlyId unique, (env, externalId) unique, expiresAt). List queries are served from ClickHouse so Postgres stays minimal and insert-heavy. Control-plane API - POST /api/v1/sessions create (idempotent via externalId) - GET /api/v1/sessions list with filters (type, tag, taskIdentifier, externalId, status ACTIVE|CLOSED|EXPIRED, period/from/to) and cursor pagination, ClickHouse-backed - GET /api/v1/sessions/:session retrieve — polymorphic: `session_` prefix hits friendlyId, otherwise externalId - PATCH /api/v1/sessions/:session update tags/metadata/externalId - POST /api/v1/sessions/:session/close terminal close (idempotent) Realtime (S2-backed) - PUT /realtime/v1/sessions/:session/:io returns S2 creds - GET /realtime/v1/sessions/:session/:io SSE subscribe - POST /realtime/v1/sessions/:session/:io/append server-side append - S2 key format: sessions/{friendlyId}/{out|in} Auth - sessions added to ResourceTypes. read:sessions:{id}, write:sessions:{id}, admin:sessions:{id} scopes work via existing JWT validation. 
ClickHouse - sessions_v1 ReplacingMergeTree table - SessionsReplicationService mirrors RunsReplicationService exactly: logical replication with leader-locked consumer, ConcurrentFlushScheduler, retry with exponential backoff + jitter, identical metric shape. Dedicated slot + publication (sessions_to_clickhouse_v1[_publication]). - SessionsRepository + ClickHouseSessionsRepository expose list, count, tags with cursor pagination keyed by (created_at DESC, session_id DESC). - Derived status (ACTIVE/CLOSED/EXPIRED) computed from closed_at + expires_at; in-memory fallback on list results to catch pre-replication writes. Verification - Webapp typecheck 10/10 - Core + SDK build 3/3 - sessionsReplicationService.test.ts integration tests 2/2 (insert + update round-trip via testcontainers) - Live round-trip against local dev: create -> retrieve (friendlyId and externalId) -> out.initialize -> out.append x2 -> in.send -> out.subscribe (receives records) -> close -> ClickHouse sessions_v1 shows the replicated row with closed_reason - Live list smoke: tag, type, status CLOSED, externalId, and cursor pagination --- .changeset/session-primitive.md | 5 + .server-changes/session-primitive.md | 6 + apps/webapp/app/entry.server.tsx | 28 + apps/webapp/app/env.server.ts | 32 + .../routes/api.v1.sessions.$session.close.ts | 61 ++ .../app/routes/api.v1.sessions.$session.ts | 86 ++ apps/webapp/app/routes/api.v1.sessions.ts | 159 ++++ ...ealtime.v1.sessions.$session.$io.append.ts | 80 ++ .../realtime.v1.sessions.$session.$io.ts | 132 +++ .../app/services/authorization.server.ts | 2 +- .../realtime/s2realtimeStreams.server.ts | 76 +- .../app/services/realtime/sessions.server.ts | 59 ++ .../sessionsReplicationInstance.server.ts | 72 ++ .../sessionsReplicationService.server.ts | 763 ++++++++++++++++++ .../clickhouseSessionsRepository.server.ts | 252 ++++++ .../sessionsRepository.server.ts | 198 +++++ .../app/v3/services/adminWorker.server.ts | 6 + .../test/sessionsReplicationService.test.ts | 
205 +++++ .../schema/030_create_sessions_v1.sql | 42 + internal-packages/clickhouse/src/index.ts | 18 + internal-packages/clickhouse/src/sessions.ts | 184 +++++ .../migration.sql | 33 + .../database/prisma/schema.prisma | 41 + packages/core/src/v3/isomorphic/friendlyId.ts | 1 + packages/core/src/v3/schemas/api.ts | 112 +++ 25 files changed, 2647 insertions(+), 6 deletions(-) create mode 100644 .changeset/session-primitive.md create mode 100644 .server-changes/session-primitive.md create mode 100644 apps/webapp/app/routes/api.v1.sessions.$session.close.ts create mode 100644 apps/webapp/app/routes/api.v1.sessions.$session.ts create mode 100644 apps/webapp/app/routes/api.v1.sessions.ts create mode 100644 apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts create mode 100644 apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts create mode 100644 apps/webapp/app/services/realtime/sessions.server.ts create mode 100644 apps/webapp/app/services/sessionsReplicationInstance.server.ts create mode 100644 apps/webapp/app/services/sessionsReplicationService.server.ts create mode 100644 apps/webapp/app/services/sessionsRepository/clickhouseSessionsRepository.server.ts create mode 100644 apps/webapp/app/services/sessionsRepository/sessionsRepository.server.ts create mode 100644 apps/webapp/test/sessionsReplicationService.test.ts create mode 100644 internal-packages/clickhouse/schema/030_create_sessions_v1.sql create mode 100644 internal-packages/clickhouse/src/sessions.ts create mode 100644 internal-packages/database/prisma/migrations/20260419000000_add_sessions_table/migration.sql diff --git a/.changeset/session-primitive.md b/.changeset/session-primitive.md new file mode 100644 index 00000000000..ccfd3b51807 --- /dev/null +++ b/.changeset/session-primitive.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Add `SessionId` friendly ID generator and schemas for the new durable Session primitive. 
Exported from `@trigger.dev/core/v3/isomorphic` alongside `RunId`, `BatchId`, etc. diff --git a/.server-changes/session-primitive.md b/.server-changes/session-primitive.md new file mode 100644 index 00000000000..80516a5c6a6 --- /dev/null +++ b/.server-changes/session-primitive.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add `Session` primitive — a durable, typed, bidirectional I/O primitive that outlives a single run, intended for agent/chat use cases. Ships the Postgres schema (`Session` table), control-plane CRUD routes (`POST/GET/PATCH /api/v1/sessions`, `POST /api/v1/sessions/:session/close` — polymorphic on friendlyId or externalId), `sessions` JWT scope, ClickHouse `sessions_v1` table, and `SessionsReplicationService` (logical replication from Postgres `Session` → ClickHouse `sessions_v1`). Run-scoped realtime streams (`streams.pipe`/`streams.input`) are unchanged and do **not** create Session rows. diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 87171011e03..06b02537da7 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -23,6 +23,34 @@ import { registerRunEngineEventBusHandlers, setupBatchQueueCallbacks, } from "./v3/runEngineHandlers.server"; +import { sessionsReplicationInstance } from "./services/sessionsReplicationInstance.server"; +import { signalsEmitter } from "./services/signals.server"; + +// Start the sessions replication service (subscribes to the logical replication +// slot, runs leader election, flushes to ClickHouse). Done at entry level so it +// runs deterministically on webapp boot rather than lazily via a singleton +// reference elsewhere in the module graph. 
+if (sessionsReplicationInstance && env.SESSION_REPLICATION_ENABLED === "1") { + sessionsReplicationInstance + .start() + .then(() => { + console.log("🗃️ Sessions replication service started"); + }) + .catch((error) => { + console.error("🗃️ Sessions replication service failed to start", { + error, + }); + }); + + signalsEmitter.on( + "SIGTERM", + sessionsReplicationInstance.shutdown.bind(sessionsReplicationInstance) + ); + signalsEmitter.on( + "SIGINT", + sessionsReplicationInstance.shutdown.bind(sessionsReplicationInstance) + ); +} const ABORT_DELAY = 30000; diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c10446d08ab..c62d9d9ab39 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1221,6 +1221,38 @@ const EnvironmentSchema = z RUN_REPLICATION_DISABLE_PAYLOAD_INSERT: z.string().default("0"), RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING: z.string().default("0"), + // Session replication (Postgres → ClickHouse sessions_v1). Shares Redis + // with the runs replicator for leader locking but has its own slot and + // publication so the two consume independently. 
+ SESSION_REPLICATION_CLICKHOUSE_URL: z.string().optional(), + SESSION_REPLICATION_ENABLED: z.string().default("0"), + SESSION_REPLICATION_SLOT_NAME: z.string().default("sessions_to_clickhouse_v1"), + SESSION_REPLICATION_PUBLICATION_NAME: z + .string() + .default("sessions_to_clickhouse_v1_publication"), + SESSION_REPLICATION_MAX_FLUSH_CONCURRENCY: z.coerce.number().int().default(1), + SESSION_REPLICATION_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000), + SESSION_REPLICATION_FLUSH_BATCH_SIZE: z.coerce.number().int().default(100), + SESSION_REPLICATION_LEADER_LOCK_TIMEOUT_MS: z.coerce.number().int().default(30_000), + SESSION_REPLICATION_LEADER_LOCK_EXTEND_INTERVAL_MS: z.coerce.number().int().default(10_000), + SESSION_REPLICATION_LEADER_LOCK_ADDITIONAL_TIME_MS: z.coerce.number().int().default(10_000), + SESSION_REPLICATION_LEADER_LOCK_RETRY_INTERVAL_MS: z.coerce.number().int().default(500), + SESSION_REPLICATION_ACK_INTERVAL_SECONDS: z.coerce.number().int().default(10), + SESSION_REPLICATION_LOG_LEVEL: z + .enum(["log", "error", "warn", "info", "debug"]) + .default("info"), + SESSION_REPLICATION_CLICKHOUSE_LOG_LEVEL: z + .enum(["log", "error", "warn", "info", "debug"]) + .default("info"), + SESSION_REPLICATION_WAIT_FOR_ASYNC_INSERT: z.string().default("0"), + SESSION_REPLICATION_KEEP_ALIVE_ENABLED: z.string().default("0"), + SESSION_REPLICATION_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(), + SESSION_REPLICATION_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10), + SESSION_REPLICATION_INSERT_STRATEGY: z.enum(["insert", "insert_async"]).default("insert"), + SESSION_REPLICATION_INSERT_MAX_RETRIES: z.coerce.number().int().default(3), + SESSION_REPLICATION_INSERT_BASE_DELAY_MS: z.coerce.number().int().default(100), + SESSION_REPLICATION_INSERT_MAX_DELAY_MS: z.coerce.number().int().default(2000), + // Clickhouse CLICKHOUSE_URL: z.string(), CLICKHOUSE_KEEP_ALIVE_ENABLED: z.string().default("1"), diff --git 
a/apps/webapp/app/routes/api.v1.sessions.$session.close.ts b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts new file mode 100644 index 00000000000..047477e47ae --- /dev/null +++ b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts @@ -0,0 +1,61 @@ +import { json } from "@remix-run/server-runtime"; +import { + CloseSessionRequestBody, + type RetrieveSessionResponseBody, +} from "@trigger.dev/core/v3"; +import { z } from "zod"; +import { prisma } from "~/db.server"; +import { + resolveSessionByIdOrExternalId, + serializeSession, +} from "~/services/realtime/sessions.server"; +import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; + +const ParamsSchema = z.object({ + session: z.string(), +}); + +const { action } = createActionApiRoute( + { + params: ParamsSchema, + body: CloseSessionRequestBody, + maxContentLength: 1024, + method: "POST", + allowJWT: true, + corsStrategy: "all", + authorization: { + action: "admin", + resource: (params) => ({ sessions: params.session }), + superScopes: ["admin:sessions", "admin:all", "admin"], + }, + }, + async ({ authentication, params, body }) => { + const existing = await resolveSessionByIdOrExternalId( + prisma, + authentication.environment.id, + params.session + ); + + if (!existing) { + return json({ error: "Session not found" }, { status: 404 }); + } + + // Idempotent: if already closed, return the current row without clobbering + // the original closedAt / closedReason. + if (existing.closedAt) { + return json(serializeSession(existing)); + } + + const updated = await prisma.session.update({ + where: { id: existing.id }, + data: { + closedAt: new Date(), + closedReason: body.reason ?? 
null, + }, + }); + + return json(serializeSession(updated)); + } +); + +export { action }; diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.ts b/apps/webapp/app/routes/api.v1.sessions.$session.ts new file mode 100644 index 00000000000..202058a596e --- /dev/null +++ b/apps/webapp/app/routes/api.v1.sessions.$session.ts @@ -0,0 +1,86 @@ +import { json } from "@remix-run/server-runtime"; +import { + type RetrieveSessionResponseBody, + UpdateSessionRequestBody, +} from "@trigger.dev/core/v3"; +import { Prisma } from "@trigger.dev/database"; +import { z } from "zod"; +import { $replica, prisma } from "~/db.server"; +import { + resolveSessionByIdOrExternalId, + serializeSession, +} from "~/services/realtime/sessions.server"; +import { + createActionApiRoute, + createLoaderApiRoute, +} from "~/services/routeBuilders/apiBuilder.server"; + +const ParamsSchema = z.object({ + session: z.string(), +}); + +export const loader = createLoaderApiRoute( + { + params: ParamsSchema, + allowJWT: true, + corsStrategy: "all", + findResource: async (params, auth) => { + return resolveSessionByIdOrExternalId($replica, auth.environment.id, params.session); + }, + authorization: { + action: "read", + resource: (session) => ({ sessions: [session.friendlyId, session.externalId ?? 
""] }), + superScopes: ["read:sessions", "read:all", "admin"], + }, + }, + async ({ resource: session }) => { + return json(serializeSession(session)); + } +); + +const { action } = createActionApiRoute( + { + params: ParamsSchema, + body: UpdateSessionRequestBody, + maxContentLength: 1024 * 32, + method: "PATCH", + allowJWT: true, + corsStrategy: "all", + authorization: { + action: "admin", + resource: (params) => ({ sessions: params.session }), + superScopes: ["admin:sessions", "admin:all", "admin"], + }, + }, + async ({ authentication, params, body }) => { + const existing = await resolveSessionByIdOrExternalId( + prisma, + authentication.environment.id, + params.session + ); + + if (!existing) { + return json({ error: "Session not found" }, { status: 404 }); + } + + const updated = await prisma.session.update({ + where: { id: existing.id }, + data: { + ...(body.tags !== undefined ? { tags: body.tags } : {}), + ...(body.metadata !== undefined + ? { + metadata: + body.metadata === null + ? Prisma.JsonNull + : (body.metadata as Prisma.InputJsonValue), + } + : {}), + ...(body.externalId !== undefined ? 
{ externalId: body.externalId } : {}), + }, + }); + + return json(serializeSession(updated)); + } +); + +export { action }; diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts new file mode 100644 index 00000000000..e9d1a106127 --- /dev/null +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -0,0 +1,159 @@ +import { json } from "@remix-run/server-runtime"; +import { + CreateSessionRequestBody, + type CreatedSessionResponseBody, + ListSessionsQueryParams, + type ListSessionsResponseBody, + type SessionStatus, +} from "@trigger.dev/core/v3"; +import { SessionId } from "@trigger.dev/core/v3/isomorphic"; +import type { Prisma, Session } from "@trigger.dev/database"; +import { $replica, prisma, type PrismaClient } from "~/db.server"; +import { clickhouseClient } from "~/services/clickhouseInstance.server"; +import { serializeSession } from "~/services/realtime/sessions.server"; +import { SessionsRepository } from "~/services/sessionsRepository/sessionsRepository.server"; +import { + createActionApiRoute, + createLoaderApiRoute, +} from "~/services/routeBuilders/apiBuilder.server"; +import { ServiceValidationError } from "~/v3/services/common.server"; + +function asArray(value: T | T[] | undefined): T[] | undefined { + if (value === undefined) return undefined; + return Array.isArray(value) ? value : [value]; +} + +export const loader = createLoaderApiRoute( + { + searchParams: ListSessionsQueryParams, + findResource: async () => 1, + }, + async ({ searchParams, authentication }) => { + const repository = new SessionsRepository({ + clickhouse: clickhouseClient, + prisma: $replica as PrismaClient, + }); + + // `page[after]` is the forward cursor, `page[before]` is the backward + // cursor. The repository internally keys off `{cursor, direction}`. + const cursor = searchParams["page[after]"] ?? searchParams["page[before]"]; + const direction = searchParams["page[before]"] ? 
"backward" : "forward"; + + const { sessions: rows, pagination } = await repository.listSessions({ + organizationId: authentication.environment.organizationId, + projectId: authentication.environment.projectId, + environmentId: authentication.environment.id, + types: asArray(searchParams["filter[type]"]), + tags: asArray(searchParams["filter[tags]"]), + taskIdentifiers: asArray(searchParams["filter[taskIdentifier]"]), + externalId: searchParams["filter[externalId]"], + statuses: asArray(searchParams["filter[status]"]) as SessionStatus[] | undefined, + period: searchParams["filter[createdAt][period]"], + from: searchParams["filter[createdAt][from]"], + to: searchParams["filter[createdAt][to]"], + page: { + size: searchParams["page[size]"], + cursor, + direction, + }, + }); + + return json({ + data: rows.map((session) => + serializeSession({ + ...session, + // Columns the list query doesn't select — filled so `serializeSession` + // can operate on a narrowed payload without type errors. + projectId: authentication.environment.projectId, + environmentType: authentication.environment.type, + organizationId: authentication.environment.organizationId, + } as Session) + ), + pagination: { + ...(pagination.nextCursor ? { next: pagination.nextCursor } : {}), + ...(pagination.previousCursor ? { previous: pagination.previousCursor } : {}), + }, + }); + } +); + +const { action } = createActionApiRoute( + { + body: CreateSessionRequestBody, + method: "POST", + maxContentLength: 1024 * 32, // 32KB — metadata is the only thing that grows + }, + async ({ authentication, body }) => { + try { + let session: Session; + let isCached = false; + + if (body.externalId) { + // Idempotent: (env, externalId) uniquely identifies the Session. 
+ const existing = await prisma.session.findUnique({ + where: { + runtimeEnvironmentId_externalId: { + runtimeEnvironmentId: authentication.environment.id, + externalId: body.externalId, + }, + }, + }); + + if (existing) { + session = existing; + isCached = true; + } else { + const { id, friendlyId } = SessionId.generate(); + session = await prisma.session.create({ + data: { + id, + friendlyId, + externalId: body.externalId, + type: body.type, + taskIdentifier: body.taskIdentifier ?? null, + tags: body.tags ?? [], + metadata: body.metadata as Prisma.InputJsonValue | undefined, + expiresAt: body.expiresAt ?? null, + projectId: authentication.environment.projectId, + runtimeEnvironmentId: authentication.environment.id, + environmentType: authentication.environment.type, + organizationId: authentication.environment.organizationId, + }, + }); + } + } else { + const { id, friendlyId } = SessionId.generate(); + session = await prisma.session.create({ + data: { + id, + friendlyId, + type: body.type, + taskIdentifier: body.taskIdentifier ?? null, + tags: body.tags ?? [], + metadata: body.metadata as Prisma.InputJsonValue | undefined, + expiresAt: body.expiresAt ?? null, + projectId: authentication.environment.projectId, + runtimeEnvironmentId: authentication.environment.id, + environmentType: authentication.environment.type, + organizationId: authentication.environment.organizationId, + }, + }); + } + + return json( + { ...serializeSession(session), isCached }, + { status: isCached ? 
200 : 201 } + ); + } catch (error) { + if (error instanceof ServiceValidationError) { + return json({ error: error.message }, { status: 422 }); + } + if (error instanceof Error) { + return json({ error: error.message }, { status: 500 }); + } + return json({ error: "Something went wrong" }, { status: 500 }); + } + } +); + +export { action }; diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts new file mode 100644 index 00000000000..d97197a262b --- /dev/null +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts @@ -0,0 +1,80 @@ +import { json } from "@remix-run/server-runtime"; +import { tryCatch } from "@trigger.dev/core/utils"; +import { nanoid } from "nanoid"; +import { z } from "zod"; +import { $replica } from "~/db.server"; +import { S2RealtimeStreams } from "~/services/realtime/s2realtimeStreams.server"; +import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; +import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; +import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { ServiceValidationError } from "~/v3/services/common.server"; + +const ParamsSchema = z.object({ + session: z.string(), + io: z.enum(["out", "in"]), +}); + +// POST: server-side append of a single record to a session channel. Mirrors +// the existing /realtime/v1/streams/:runId/:target/:streamId/append route, +// scoped to a Session primitive. 
+const { action } = createActionApiRoute( + { + params: ParamsSchema, + method: "POST", + allowJWT: true, + corsStrategy: "all", + authorization: { + action: "write", + resource: (params) => ({ sessions: params.session }), + superScopes: ["write:sessions", "write:all", "admin"], + }, + }, + async ({ request, params, authentication }) => { + const session = await resolveSessionByIdOrExternalId( + $replica, + authentication.environment.id, + params.session + ); + + if (!session) { + return new Response("Session not found", { status: 404 }); + } + + if (session.closedAt) { + return json( + { ok: false, error: "Cannot append to a closed session" }, + { status: 400 } + ); + } + + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); + + if (!(realtimeStream instanceof S2RealtimeStreams)) { + return json( + { ok: false, error: "Session channels require the S2 realtime backend" }, + { status: 501 } + ); + } + + const part = await request.text(); + const partId = request.headers.get("X-Part-Id") ?? nanoid(7); + + const [appendError] = await tryCatch( + realtimeStream.appendPartToSessionStream(part, partId, session.friendlyId, params.io) + ); + + if (appendError) { + if (appendError instanceof ServiceValidationError) { + return json( + { ok: false, error: appendError.message }, + { status: appendError.status ?? 
422 } + ); + } + return json({ ok: false, error: appendError.message }, { status: 500 }); + } + + return json({ ok: true }, { status: 200 }); + } +); + +export { action }; diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts new file mode 100644 index 00000000000..99649ffa0a4 --- /dev/null +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -0,0 +1,132 @@ +import { json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { $replica } from "~/db.server"; +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; +import { S2RealtimeStreams } from "~/services/realtime/s2realtimeStreams.server"; +import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; +import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; +import { + createActionApiRoute, + createLoaderApiRoute, +} from "~/services/routeBuilders/apiBuilder.server"; + +const ParamsSchema = z.object({ + session: z.string(), + io: z.enum(["out", "in"]), +}); + +// PUT: initialize the S2 channel for this (session, io) pair — returns S2 +// credentials in response headers so the caller can write/read directly +// against S2. GET is handled by the loader below. 
+const { action } = createActionApiRoute( + { + params: ParamsSchema, + allowJWT: true, + corsStrategy: "all", + authorization: { + action: "write", + resource: (params) => ({ sessions: params.session }), + superScopes: ["write:sessions", "write:all", "admin"], + }, + }, + async ({ request, params, authentication }) => { + if (request.method !== "PUT") { + return new Response("Method not allowed", { status: 405 }); + } + + const session = await resolveSessionByIdOrExternalId( + $replica, + authentication.environment.id, + params.session + ); + + if (!session) { + return new Response("Session not found", { status: 404 }); + } + + if (session.closedAt) { + return new Response("Cannot initialize a channel on a closed session", { + status: 400, + }); + } + + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); + + if (!(realtimeStream instanceof S2RealtimeStreams)) { + return new Response("Session channels require the S2 realtime backend", { + status: 501, + }); + } + + const { responseHeaders } = await realtimeStream.initializeSessionStream( + session.friendlyId, + params.io + ); + + return json({ version: "v2" }, { status: 202, headers: responseHeaders }); + } +); + +// GET: SSE subscribe to a session channel. HEAD always answers with a +// placeholder chunk index of 0, kept for run-stream route compatibility. +const loader = createLoaderApiRoute( + { + params: ParamsSchema, + allowJWT: true, + corsStrategy: "all", + findResource: async (params, auth) => { + return resolveSessionByIdOrExternalId($replica, auth.environment.id, params.session); + }, + authorization: { + action: "read", + resource: (session) => ({ sessions: [session.friendlyId, session.externalId ?? 
""] }), + superScopes: ["read:sessions", "read:all", "admin"], + }, + }, + async ({ params, request, resource: session, authentication }) => { + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); + + if (!(realtimeStream instanceof S2RealtimeStreams)) { + return new Response("Session channels require the S2 realtime backend", { + status: 501, + }); + } + + if (request.method === "HEAD") { + // No last-chunk-index on the S2 backend (clients resume via Last-Event-ID + // on the SSE stream directly). Return 200 with a zero index for + // compatibility with the run-stream shape. + return new Response(null, { + status: 200, + headers: { "X-Last-Chunk-Index": "0" }, + }); + } + + const lastEventId = request.headers.get("Last-Event-ID") ?? undefined; + + const timeoutInSecondsRaw = request.headers.get("Timeout-Seconds") ?? undefined; + const timeoutInSeconds = timeoutInSecondsRaw ? parseInt(timeoutInSecondsRaw, 10) : undefined; + + if (timeoutInSeconds !== undefined && isNaN(timeoutInSeconds)) { + return new Response("Invalid timeout seconds", { status: 400 }); + } + + if (timeoutInSeconds !== undefined && timeoutInSeconds < 1) { + return new Response("Timeout seconds must be greater than 0", { status: 400 }); + } + + if (timeoutInSeconds !== undefined && timeoutInSeconds > 600) { + return new Response("Timeout seconds must be less than 600", { status: 400 }); + } + + return realtimeStream.streamResponseFromSessionStream( + request, + session.friendlyId, + params.io, + getRequestAbortSignal(), + { lastEventId, timeoutInSeconds } + ); + } +); + +export { action, loader }; diff --git a/apps/webapp/app/services/authorization.server.ts b/apps/webapp/app/services/authorization.server.ts index 0406c02438e..786cc161ed9 100644 --- a/apps/webapp/app/services/authorization.server.ts +++ b/apps/webapp/app/services/authorization.server.ts @@ -1,6 +1,6 @@ export type AuthorizationAction = "read" | "write" | string; // Add more actions as needed -const 
ResourceTypes = ["tasks", "tags", "runs", "batch", "waitpoints", "deployments", "inputStreams", "query", "prompts"] as const; +const ResourceTypes = ["tasks", "tags", "runs", "batch", "waitpoints", "deployments", "inputStreams", "query", "prompts", "sessions"] as const; export type AuthorizationResources = { [key in (typeof ResourceTypes)[number]]?: string | string[]; diff --git a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts index 4a7acb60606..8e74661fe70 100644 --- a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts +++ b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts @@ -88,9 +88,42 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { return `${this.streamPrefix}/runs/${runId}/${streamId}`; } + /** + * Build an S2 stream name for a `Session`-primitive channel, addressed by + * the session's `friendlyId` and the I/O direction. Used by the session + * realtime routes to route traffic to `sessions/{friendlyId}/{out|in}`. + */ + public toSessionStreamName(friendlyId: string, io: "out" | "in"): string { + return `${this.streamPrefix}/sessions/${friendlyId}/${io}`; + } + async initializeStream( runId: string, streamId: string + ): Promise<{ responseHeaders?: Record }> { + return this.#initializeStreamByName( + this.toStreamName(runId, streamId), + `/runs/${runId}/${streamId}` + ); + } + + /** + * Initialize an S2 stream by `(sessionFriendlyId, io)` — mirrors + * {@link initializeStream} but addresses the new `sessions/*` key format. 
+ */ + async initializeSessionStream( + friendlyId: string, + io: "out" | "in" + ): Promise<{ responseHeaders?: Record }> { + return this.#initializeStreamByName( + this.toSessionStreamName(friendlyId, io), + `/sessions/${friendlyId}/${io}` + ); + } + + async #initializeStreamByName( + prefixedName: string, + relativeName: string ): Promise<{ responseHeaders?: Record }> { const accessToken = this.skipAccessTokens ? this.token @@ -99,9 +132,7 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { return { responseHeaders: { "X-S2-Access-Token": accessToken, - "X-S2-Stream-Name": this.skipAccessTokens - ? this.toStreamName(runId, streamId) - : `/runs/${runId}/${streamId}`, + "X-S2-Stream-Name": this.skipAccessTokens ? prefixedName : relativeName, "X-S2-Basin": this.basin, "X-S2-Flush-Interval-Ms": this.flushIntervalMs.toString(), "X-S2-Max-Retries": this.maxRetries.toString(), @@ -121,8 +152,22 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { } async appendPart(part: string, partId: string, runId: string, streamId: string): Promise { - const s2Stream = this.toStreamName(runId, streamId); + return this.#appendPartByName(part, partId, this.toStreamName(runId, streamId)); + } + + /** + * Append a single record to a `Session`-primitive channel. 
+ */ + async appendPartToSessionStream( + part: string, + partId: string, + friendlyId: string, + io: "out" | "in" + ): Promise { + return this.#appendPartByName(part, partId, this.toSessionStreamName(friendlyId, io)); + } + async #appendPartByName(part: string, partId: string, s2Stream: string): Promise { this.logger.debug(`S2 appending to stream`, { part, stream: s2Stream }); const result = await this.s2Append(s2Stream, { @@ -227,7 +272,28 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { signal: AbortSignal, options?: StreamResponseOptions ): Promise { - const s2Stream = this.toStreamName(runId, streamId); + return this.#streamResponseByName(this.toStreamName(runId, streamId), signal, options); + } + + /** + * Serve SSE from a `Session`-primitive channel addressed by + * `(friendlyId, io)`. + */ + async streamResponseFromSessionStream( + request: Request, + friendlyId: string, + io: "out" | "in", + signal: AbortSignal, + options?: StreamResponseOptions + ): Promise { + return this.#streamResponseByName(this.toSessionStreamName(friendlyId, io), signal, options); + } + + async #streamResponseByName( + s2Stream: string, + signal: AbortSignal, + options?: StreamResponseOptions + ): Promise { const startSeq = this.parseLastEventId(options?.lastEventId); this.logger.info(`S2 streaming records from stream`, { stream: s2Stream, startSeq }); diff --git a/apps/webapp/app/services/realtime/sessions.server.ts b/apps/webapp/app/services/realtime/sessions.server.ts new file mode 100644 index 00000000000..b330cc7ee47 --- /dev/null +++ b/apps/webapp/app/services/realtime/sessions.server.ts @@ -0,0 +1,59 @@ +import type { PrismaClient, Session } from "@trigger.dev/database"; +import type { SessionItem } from "@trigger.dev/core/v3"; + +/** + * Prefix that {@link SessionId.generate} attaches to every Session friendlyId. 
/**
 * Prefix that {@link SessionId.generate} attaches to every Session friendlyId.
 * Used to distinguish friendlyId lookups (`session_abc...`) from externalId
 * lookups on the public `GET /api/v1/sessions/:session` route.
 */
const SESSION_FRIENDLY_ID_PREFIX = "session_";

/**
 * Resolve a session from a URL path parameter that may contain either a
 * friendlyId (`session_abc...`) or a user-supplied externalId.
 *
 * Values starting with `session_` are tried as a friendlyId first. If no
 * session in the environment has that friendlyId, the value is then tried as
 * an externalId, so that a caller-chosen externalId which happens to start
 * with `session_` is still reachable. Any other value is looked up against
 * `externalId` only. Both lookups are scoped to `runtimeEnvironmentId`.
 *
 * @param prisma - Client (or transaction) exposing the `session` delegate.
 * @param runtimeEnvironmentId - Environment the caller is authenticated for.
 * @param idOrExternalId - Raw path parameter: friendlyId or externalId.
 * @returns The matching session row, or `null` when neither lookup matches.
 */
export async function resolveSessionByIdOrExternalId(
  prisma: Pick<PrismaClient, "session">,
  runtimeEnvironmentId: string,
  idOrExternalId: string
): Promise<Session | null> {
  if (idOrExternalId.startsWith(SESSION_FRIENDLY_ID_PREFIX)) {
    const byFriendlyId = await prisma.session.findFirst({
      where: { friendlyId: idOrExternalId, runtimeEnvironmentId },
    });

    if (byFriendlyId) {
      return byFriendlyId;
    }
    // Not a known friendlyId: fall through, because externalIds are
    // user-supplied and may legitimately carry the same prefix.
  }

  return prisma.session.findUnique({
    where: {
      runtimeEnvironmentId_externalId: {
        runtimeEnvironmentId,
        externalId: idOrExternalId,
      },
    },
  });
}
/**
 * Build the process-wide SessionsReplicationService from environment config.
 *
 * Requires `DATABASE_URL` to be set (asserted before anything else). Returns
 * `undefined` when `SESSION_REPLICATION_CLICKHOUSE_URL` is not configured,
 * which is how the feature is switched off. The Redis connection settings are
 * read from the `RUN_REPLICATION_REDIS_*` variables, with a
 * `sessions-replication:` key prefix.
 */
function initializeSessionsReplicationInstance() {
  const databaseUrl = process.env.DATABASE_URL;
  invariant(typeof databaseUrl === "string", "DATABASE_URL env var not set");

  const clickhouseUrl = env.SESSION_REPLICATION_CLICKHOUSE_URL;
  if (!clickhouseUrl) {
    console.log("🗃️ Sessions replication service not enabled");
    return;
  }

  console.log("🗃️ Sessions replication service enabled");

  const clickhouse = new ClickHouse({
    url: clickhouseUrl,
    name: "sessions-replication",
    keepAlive: {
      enabled: env.SESSION_REPLICATION_KEEP_ALIVE_ENABLED === "1",
      idleSocketTtl: env.SESSION_REPLICATION_KEEP_ALIVE_IDLE_SOCKET_TTL_MS,
    },
    logLevel: env.SESSION_REPLICATION_CLICKHOUSE_LOG_LEVEL,
    compression: {
      request: true,
    },
    maxOpenConnections: env.SESSION_REPLICATION_MAX_OPEN_CONNECTIONS,
  });

  // TLS is on unless explicitly disabled.
  const redisTls = env.RUN_REPLICATION_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} };

  return new SessionsReplicationService({
    clickhouse,
    pgConnectionUrl: databaseUrl,
    serviceName: "sessions-replication",
    slotName: env.SESSION_REPLICATION_SLOT_NAME,
    publicationName: env.SESSION_REPLICATION_PUBLICATION_NAME,
    redisOptions: {
      keyPrefix: "sessions-replication:",
      port: env.RUN_REPLICATION_REDIS_PORT ?? undefined,
      host: env.RUN_REPLICATION_REDIS_HOST ?? undefined,
      username: env.RUN_REPLICATION_REDIS_USERNAME ?? undefined,
      password: env.RUN_REPLICATION_REDIS_PASSWORD ?? undefined,
      enableAutoPipelining: true,
      ...redisTls,
    },
    maxFlushConcurrency: env.SESSION_REPLICATION_MAX_FLUSH_CONCURRENCY,
    flushIntervalMs: env.SESSION_REPLICATION_FLUSH_INTERVAL_MS,
    flushBatchSize: env.SESSION_REPLICATION_FLUSH_BATCH_SIZE,
    leaderLockTimeoutMs: env.SESSION_REPLICATION_LEADER_LOCK_TIMEOUT_MS,
    leaderLockExtendIntervalMs: env.SESSION_REPLICATION_LEADER_LOCK_EXTEND_INTERVAL_MS,
    leaderLockAcquireAdditionalTimeMs: env.SESSION_REPLICATION_LEADER_LOCK_ADDITIONAL_TIME_MS,
    leaderLockRetryIntervalMs: env.SESSION_REPLICATION_LEADER_LOCK_RETRY_INTERVAL_MS,
    ackIntervalSeconds: env.SESSION_REPLICATION_ACK_INTERVAL_SECONDS,
    logLevel: env.SESSION_REPLICATION_LOG_LEVEL,
    waitForAsyncInsert: env.SESSION_REPLICATION_WAIT_FOR_ASYNC_INSERT === "1",
    tracer: provider.getTracer("sessions-replication-service"),
    meter,
    insertMaxRetries: env.SESSION_REPLICATION_INSERT_MAX_RETRIES,
    insertBaseDelayMs: env.SESSION_REPLICATION_INSERT_BASE_DELAY_MS,
    insertMaxDelayMs: env.SESSION_REPLICATION_INSERT_MAX_DELAY_MS,
    insertStrategy: env.SESSION_REPLICATION_INSERT_STRATEGY,
  });
}
+ LogicalReplicationClient, + type MessageDelete, + type MessageInsert, + type MessageUpdate, + type PgoutputMessage, +} from "@internal/replication"; +import { + getMeter, + recordSpanError, + startSpan, + trace, + type Counter, + type Histogram, + type Meter, + type Tracer, +} from "@internal/tracing"; +import { Logger, type LogLevel } from "@trigger.dev/core/logger"; +import { tryCatch } from "@trigger.dev/core/utils"; +import { type Session } from "@trigger.dev/database"; +import EventEmitter from "node:events"; +import { ConcurrentFlushScheduler } from "./runsReplicationService.server"; + +interface TransactionEvent { + tag: "insert" | "update" | "delete"; + data: T; + raw: MessageInsert | MessageUpdate | MessageDelete; +} + +interface Transaction { + beginStartTimestamp: number; + commitLsn: string | null; + commitEndLsn: string | null; + xid: number; + events: TransactionEvent[]; + replicationLagMs: number; +} + +export type SessionsReplicationServiceOptions = { + clickhouse: ClickHouse; + pgConnectionUrl: string; + serviceName: string; + slotName: string; + publicationName: string; + redisOptions: RedisOptions; + maxFlushConcurrency?: number; + flushIntervalMs?: number; + flushBatchSize?: number; + leaderLockTimeoutMs?: number; + leaderLockExtendIntervalMs?: number; + leaderLockAcquireAdditionalTimeMs?: number; + leaderLockRetryIntervalMs?: number; + ackIntervalSeconds?: number; + acknowledgeTimeoutMs?: number; + logger?: Logger; + logLevel?: LogLevel; + tracer?: Tracer; + meter?: Meter; + waitForAsyncInsert?: boolean; + insertStrategy?: "insert" | "insert_async"; + // Retry configuration for insert operations + insertMaxRetries?: number; + insertBaseDelayMs?: number; + insertMaxDelayMs?: number; +}; + +type SessionInsert = { + _version: bigint; + session: Session; + event: "insert" | "update" | "delete"; +}; + +export type SessionsReplicationServiceEvents = { + message: [{ lsn: string; message: PgoutputMessage; service: SessionsReplicationService }]; + 
batchFlushed: [{ flushId: string; sessionInserts: SessionInsertArray[] }]; +}; + +export class SessionsReplicationService { + private _isSubscribed = false; + private _currentTransaction: + | (Omit, "commitEndLsn" | "replicationLagMs"> & { + commitEndLsn?: string | null; + replicationLagMs?: number; + }) + | null = null; + + private _replicationClient: LogicalReplicationClient; + private _concurrentFlushScheduler: ConcurrentFlushScheduler; + private logger: Logger; + private _isShuttingDown = false; + private _isShutDownComplete = false; + private _tracer: Tracer; + private _meter: Meter; + private _currentParseDurationMs: number | null = null; + private _lastAcknowledgedAt: number | null = null; + private _acknowledgeTimeoutMs: number; + private _latestCommitEndLsn: string | null = null; + private _lastAcknowledgedLsn: string | null = null; + private _acknowledgeInterval: NodeJS.Timeout | null = null; + // Retry configuration + private _insertMaxRetries: number; + private _insertBaseDelayMs: number; + private _insertMaxDelayMs: number; + private _insertStrategy: "insert" | "insert_async"; + + // Metrics + private _replicationLagHistogram: Histogram; + private _batchesFlushedCounter: Counter; + private _batchSizeHistogram: Histogram; + private _sessionsInsertedCounter: Counter; + private _insertRetriesCounter: Counter; + private _eventsProcessedCounter: Counter; + private _flushDurationHistogram: Histogram; + + public readonly events: EventEmitter; + + constructor(private readonly options: SessionsReplicationServiceOptions) { + this.logger = + options.logger ?? new Logger("SessionsReplicationService", options.logLevel ?? "info"); + this.events = new EventEmitter(); + this._tracer = options.tracer ?? trace.getTracer("sessions-replication-service"); + this._meter = options.meter ?? 
getMeter("sessions-replication"); + + // Initialize metrics + this._replicationLagHistogram = this._meter.createHistogram( + "sessions_replication.replication_lag_ms", + { + description: "Replication lag from Postgres commit to processing", + unit: "ms", + } + ); + + this._batchesFlushedCounter = this._meter.createCounter( + "sessions_replication.batches_flushed", + { + description: "Total batches flushed to ClickHouse", + } + ); + + this._batchSizeHistogram = this._meter.createHistogram("sessions_replication.batch_size", { + description: "Number of items per batch flush", + unit: "items", + }); + + this._sessionsInsertedCounter = this._meter.createCounter( + "sessions_replication.sessions_inserted", + { + description: "Session inserts to ClickHouse", + unit: "inserts", + } + ); + + this._insertRetriesCounter = this._meter.createCounter("sessions_replication.insert_retries", { + description: "Insert retry attempts", + }); + + this._eventsProcessedCounter = this._meter.createCounter( + "sessions_replication.events_processed", + { + description: "Replication events processed (inserts, updates, deletes)", + } + ); + + this._flushDurationHistogram = this._meter.createHistogram( + "sessions_replication.flush_duration_ms", + { + description: "Duration of batch flush operations", + unit: "ms", + } + ); + + this._acknowledgeTimeoutMs = options.acknowledgeTimeoutMs ?? 1_000; + + this._insertStrategy = options.insertStrategy ?? "insert"; + + this._replicationClient = new LogicalReplicationClient({ + pgConfig: { + connectionString: options.pgConnectionUrl, + }, + name: options.serviceName, + slotName: options.slotName, + publicationName: options.publicationName, + table: "Session", + redisOptions: options.redisOptions, + autoAcknowledge: false, + publicationActions: ["insert", "update", "delete"], + logger: options.logger ?? new Logger("LogicalReplicationClient", options.logLevel ?? "info"), + leaderLockTimeoutMs: options.leaderLockTimeoutMs ?? 
30_000, + leaderLockExtendIntervalMs: options.leaderLockExtendIntervalMs ?? 10_000, + ackIntervalSeconds: options.ackIntervalSeconds ?? 10, + leaderLockAcquireAdditionalTimeMs: options.leaderLockAcquireAdditionalTimeMs ?? 10_000, + leaderLockRetryIntervalMs: options.leaderLockRetryIntervalMs ?? 500, + tracer: options.tracer, + }); + + this._concurrentFlushScheduler = new ConcurrentFlushScheduler({ + batchSize: options.flushBatchSize ?? 50, + flushInterval: options.flushIntervalMs ?? 100, + maxConcurrency: options.maxFlushConcurrency ?? 100, + callback: this.#flushBatch.bind(this), + // Key-based deduplication to reduce duplicates sent to ClickHouse + getKey: (item) => { + if (!item?.session?.id) { + this.logger.warn("Skipping replication event with null session", { event: item }); + return null; + } + return `${item.event}_${item.session.id}`; + }, + // Keep the session with the higher version (latest) + // and take the last occurrence for that version. + // Items originating from the same DB transaction have the same version. + shouldReplace: (existing, incoming) => incoming._version >= existing._version, + logger: new Logger("ConcurrentFlushScheduler", options.logLevel ?? 
"info"), + tracer: options.tracer, + }); + + this._replicationClient.events.on("data", async ({ lsn, log, parseDuration }) => { + this.#handleData(lsn, log, parseDuration); + }); + + this._replicationClient.events.on("heartbeat", async ({ lsn, shouldRespond }) => { + if (this._isShuttingDown) return; + if (this._isShutDownComplete) return; + + if (shouldRespond) { + this._lastAcknowledgedLsn = lsn; + await this._replicationClient.acknowledge(lsn); + } + }); + + this._replicationClient.events.on("error", (error) => { + this.logger.error("Replication client error", { + error, + }); + }); + + this._replicationClient.events.on("start", () => { + this.logger.info("Replication client started"); + }); + + this._replicationClient.events.on("acknowledge", ({ lsn }) => { + this.logger.debug("Acknowledged", { lsn }); + }); + + this._replicationClient.events.on("leaderElection", (isLeader) => { + this.logger.info("Leader election", { isLeader }); + }); + + // Initialize retry configuration + this._insertMaxRetries = options.insertMaxRetries ?? 3; + this._insertBaseDelayMs = options.insertBaseDelayMs ?? 100; + this._insertMaxDelayMs = options.insertMaxDelayMs ?? 2000; + } + + public async shutdown() { + if (this._isShuttingDown) return; + + this._isShuttingDown = true; + + this.logger.info("Initiating shutdown of sessions replication service"); + + if (!this._currentTransaction) { + this.logger.info("No transaction to commit, shutting down immediately"); + await this._replicationClient.stop(); + this._isSubscribed = false; + this._isShutDownComplete = true; + return; + } + + this._concurrentFlushScheduler.shutdown(); + } + + async start() { + if (this._isSubscribed) { + this.logger.debug("Replication client already started, skipping start"); + return; + } + + this.logger.info("Starting replication client", { + lastLsn: this._latestCommitEndLsn, + }); + + await this._replicationClient.subscribe(this._latestCommitEndLsn ?? 
undefined); + + this._acknowledgeInterval = setInterval(this.#acknowledgeLatestTransaction.bind(this), 1000); + this._concurrentFlushScheduler.start(); + this._isSubscribed = true; + } + + async stop() { + this.logger.info("Stopping replication client"); + + await this._replicationClient.stop(); + + if (this._acknowledgeInterval) { + clearInterval(this._acknowledgeInterval); + this._acknowledgeInterval = null; + } + + this._isSubscribed = false; + } + + async teardown() { + this.logger.info("Teardown replication client"); + + await this._replicationClient.teardown(); + + if (this._acknowledgeInterval) { + clearInterval(this._acknowledgeInterval); + this._acknowledgeInterval = null; + } + + this._isSubscribed = false; + } + + #handleData(lsn: string, message: PgoutputMessage, parseDuration: bigint) { + this.logger.debug("Handling data", { + lsn, + tag: message.tag, + parseDuration, + }); + + this.events.emit("message", { lsn, message, service: this }); + + switch (message.tag) { + case "begin": { + if (this._isShuttingDown || this._isShutDownComplete) { + return; + } + + this._currentTransaction = { + beginStartTimestamp: Date.now(), + commitLsn: message.commitLsn, + xid: message.xid, + events: [], + }; + + this._currentParseDurationMs = Number(parseDuration) / 1_000_000; + + break; + } + case "insert": { + if (!this._currentTransaction) { + return; + } + + if (this._currentParseDurationMs) { + this._currentParseDurationMs = + this._currentParseDurationMs + Number(parseDuration) / 1_000_000; + } + + this._currentTransaction.events.push({ + tag: message.tag, + data: message.new as Session, + raw: message, + }); + break; + } + case "update": { + if (!this._currentTransaction) { + return; + } + + if (this._currentParseDurationMs) { + this._currentParseDurationMs = + this._currentParseDurationMs + Number(parseDuration) / 1_000_000; + } + + this._currentTransaction.events.push({ + tag: message.tag, + data: message.new as Session, + raw: message, + }); + break; + } + case 
"delete": { + if (!this._currentTransaction) { + return; + } + + if (this._currentParseDurationMs) { + this._currentParseDurationMs = + this._currentParseDurationMs + Number(parseDuration) / 1_000_000; + } + + this._currentTransaction.events.push({ + tag: message.tag, + data: message.old as Session, + raw: message, + }); + + break; + } + case "commit": { + if (!this._currentTransaction) { + return; + } + + if (this._currentParseDurationMs) { + this._currentParseDurationMs = + this._currentParseDurationMs + Number(parseDuration) / 1_000_000; + } + + const replicationLagMs = Date.now() - Number(message.commitTime / 1000n); + this._currentTransaction.commitEndLsn = message.commitEndLsn; + this._currentTransaction.replicationLagMs = replicationLagMs; + const transaction = this._currentTransaction as Transaction; + this._currentTransaction = null; + + if (transaction.commitEndLsn) { + this._latestCommitEndLsn = transaction.commitEndLsn; + } + + this.#handleTransaction(transaction); + break; + } + default: { + this.logger.debug("Unknown message tag", { + pgMessage: message, + }); + } + } + } + + #handleTransaction(transaction: Transaction) { + if (this._isShutDownComplete) return; + + if (this._isShuttingDown) { + this._replicationClient.stop().finally(() => { + this._isSubscribed = false; + this._isShutDownComplete = true; + }); + } + + // If there are no events, do nothing + if (transaction.events.length === 0) { + return; + } + + if (!transaction.commitEndLsn) { + this.logger.error("Transaction has no commit end lsn", { + transaction, + }); + + return; + } + + const lsnToUInt64Start = process.hrtime.bigint(); + + // If there are events, we need to handle them + const _version = lsnToUInt64(transaction.commitEndLsn); + + const lsnToUInt64DurationMs = Number(process.hrtime.bigint() - lsnToUInt64Start) / 1_000_000; + + this._concurrentFlushScheduler.addToBatch( + transaction.events.map((event) => ({ + _version, + session: event.data, + event: event.tag, + })) + ); + + // 
Record metrics + this._replicationLagHistogram.record(transaction.replicationLagMs); + + // Count events by type + for (const event of transaction.events) { + this._eventsProcessedCounter.add(1, { event_type: event.tag }); + } + + this.logger.debug("handle_transaction", { + transaction: { + xid: transaction.xid, + commitLsn: transaction.commitLsn, + commitEndLsn: transaction.commitEndLsn, + events: transaction.events.length, + parseDurationMs: this._currentParseDurationMs, + lsnToUInt64DurationMs, + version: _version.toString(), + }, + }); + } + + async #acknowledgeLatestTransaction() { + if (!this._latestCommitEndLsn) { + return; + } + + if (this._lastAcknowledgedLsn === this._latestCommitEndLsn) { + return; + } + + const now = Date.now(); + + if (this._lastAcknowledgedAt) { + const timeSinceLastAcknowledged = now - this._lastAcknowledgedAt; + // If we've already acknowledged within the last second, don't acknowledge again + if (timeSinceLastAcknowledged < this._acknowledgeTimeoutMs) { + return; + } + } + + this._lastAcknowledgedAt = now; + this._lastAcknowledgedLsn = this._latestCommitEndLsn; + + this.logger.debug("acknowledge_latest_transaction", { + commitEndLsn: this._latestCommitEndLsn, + lastAcknowledgedAt: this._lastAcknowledgedAt, + }); + + const [ackError] = await tryCatch( + this._replicationClient.acknowledge(this._latestCommitEndLsn) + ); + + if (ackError) { + this.logger.error("Error acknowledging transaction", { ackError }); + } + + if (this._isShutDownComplete && this._acknowledgeInterval) { + clearInterval(this._acknowledgeInterval); + } + } + + async #flushBatch(flushId: string, batch: Array) { + if (batch.length === 0) { + return; + } + + this.logger.debug("Flushing batch", { + flushId, + batchSize: batch.length, + }); + + const flushStartTime = performance.now(); + + await startSpan(this._tracer, "flushBatch", async (span) => { + const sessionInserts = batch + .map((item) => toSessionInsertArray(item.session, item._version, item.event === 
"delete")) + // batch inserts in clickhouse are more performant if the items + // are pre-sorted by the primary key + .sort((a, b) => { + const aOrgId = getSessionField(a, "organization_id"); + const bOrgId = getSessionField(b, "organization_id"); + if (aOrgId !== bOrgId) { + return aOrgId < bOrgId ? -1 : 1; + } + const aProjId = getSessionField(a, "project_id"); + const bProjId = getSessionField(b, "project_id"); + if (aProjId !== bProjId) { + return aProjId < bProjId ? -1 : 1; + } + const aEnvId = getSessionField(a, "environment_id"); + const bEnvId = getSessionField(b, "environment_id"); + if (aEnvId !== bEnvId) { + return aEnvId < bEnvId ? -1 : 1; + } + const aCreatedAt = getSessionField(a, "created_at"); + const bCreatedAt = getSessionField(b, "created_at"); + if (aCreatedAt !== bCreatedAt) { + return aCreatedAt - bCreatedAt; + } + const aSessionId = getSessionField(a, "session_id"); + const bSessionId = getSessionField(b, "session_id"); + if (aSessionId === bSessionId) return 0; + return aSessionId < bSessionId ? 
-1 : 1; + }); + + span.setAttribute("session_inserts", sessionInserts.length); + + this.logger.debug("Flushing inserts", { + flushId, + sessionInserts: sessionInserts.length, + }); + + const [sessionError, sessionResult] = await this.#insertWithRetry( + (attempt) => this.#insertSessionInserts(sessionInserts, attempt), + "session inserts", + flushId + ); + + if (sessionError) { + this.logger.error("Error inserting session inserts", { + error: sessionError, + flushId, + }); + recordSpanError(span, sessionError); + } + + this.logger.debug("Flushed inserts", { + flushId, + sessionInserts: sessionInserts.length, + }); + + this.events.emit("batchFlushed", { flushId, sessionInserts }); + + // Record metrics + const flushDurationMs = performance.now() - flushStartTime; + const hasErrors = sessionError !== null; + + this._batchSizeHistogram.record(batch.length); + this._flushDurationHistogram.record(flushDurationMs); + this._batchesFlushedCounter.add(1, { success: !hasErrors }); + + if (!sessionError) { + this._sessionsInsertedCounter.add(sessionInserts.length); + } + }); + } + + // New method to handle inserts with retry logic for connection errors + async #insertWithRetry( + insertFn: (attempt: number) => Promise, + operationName: string, + flushId: string + ): Promise<[Error | null, T | null]> { + let lastError: Error | null = null; + + for (let attempt = 1; attempt <= this._insertMaxRetries; attempt++) { + try { + const result = await insertFn(attempt); + return [null, result]; + } catch (error) { + lastError = error instanceof Error ? 
error : new Error(String(error)); + + // Check if this is a retryable error + if (this.#isRetryableError(lastError)) { + const delay = this.#calculateRetryDelay(attempt); + + this.logger.warn(`Retrying SessionsReplication insert due to error`, { + operationName, + flushId, + attempt, + maxRetries: this._insertMaxRetries, + error: lastError.message, + delay, + }); + + // Record retry metric + this._insertRetriesCounter.add(1, { operation: "sessions" }); + + await new Promise((resolve) => setTimeout(resolve, delay)); + continue; + } + break; + } + } + + return [lastError, null]; + } + + // Retry all errors except known permanent ones + #isRetryableError(error: Error): boolean { + const errorMessage = error.message.toLowerCase(); + + // Permanent errors that should NOT be retried + const permanentErrorPatterns = [ + "authentication failed", + "permission denied", + "invalid credentials", + "table not found", + "database not found", + "column not found", + "schema mismatch", + "invalid query", + "syntax error", + "type error", + "constraint violation", + "duplicate key", + "foreign key violation", + ]; + + // If it's a known permanent error, don't retry + if (permanentErrorPatterns.some((pattern) => errorMessage.includes(pattern))) { + return false; + } + + // Retry everything else + return true; + } + + #calculateRetryDelay(attempt: number): number { + // Exponential backoff: baseDelay, baseDelay*2, baseDelay*4, etc. + const delay = Math.min( + this._insertBaseDelayMs * Math.pow(2, attempt - 1), + this._insertMaxDelayMs + ); + + // Add some jitter to prevent thundering herd + const jitter = Math.random() * 100; + return delay + jitter; + } + + #getClickhouseInsertSettings() { + if (this._insertStrategy === "insert") { + return {}; + } + + return { + async_insert: 1 as const, + async_insert_max_data_size: "1000000", + async_insert_busy_timeout_ms: 1000, + wait_for_async_insert: this.options.waitForAsyncInsert ? 
/**
 * Flatten a Postgres `Session` row into the positional array expected by the
 * ClickHouse sessions insert.
 *
 * Nullable string columns become `""`, nullable timestamps become `null`,
 * timestamps are epoch milliseconds, metadata is wrapped as `{ data }`, the
 * version is stringified, and the deleted flag is encoded as 1 / 0.
 *
 * @param session - Row image taken from the replication stream.
 * @param version - Row version (derived by the caller from the commit LSN).
 * @param isDeleted - Whether the row came from a delete event.
 */
function toSessionInsertArray(
  session: Session,
  version: bigint,
  isDeleted: boolean
): SessionInsertArray {
  const epochMsOrNull = (value: Date | null | undefined) => (value ? value.getTime() : null);

  const externalId = session.externalId ?? "";
  const taskIdentifier = session.taskIdentifier ?? "";
  const tags = session.tags ?? [];
  const metadata = { data: session.metadata ?? null };
  const closedAt = epochMsOrNull(session.closedAt);
  const closedReason = session.closedReason ?? "";
  const expiresAt = epochMsOrNull(session.expiresAt);
  const createdAt = session.createdAt.getTime();
  const updatedAt = session.updatedAt.getTime();
  const deletedFlag = isDeleted ? 1 : 0;

  return [
    session.runtimeEnvironmentId,
    session.organizationId,
    session.projectId,
    session.id,
    session.environmentType,
    session.friendlyId,
    externalId,
    session.type,
    taskIdentifier,
    tags,
    metadata,
    closedAt,
    closedReason,
    expiresAt,
    createdAt,
    updatedAt,
    version.toString(),
    deletedFlag,
  ];
}
/**
 * Convert a Postgres LSN in its textual `segment/offset` form (two
 * hexadecimal numbers separated by `/`) into a single bigint: the segment
 * occupies the bits above the low 32, the offset the low 32 bits.
 *
 * A string that is not two hex numbers separated by `/` makes `BigInt` throw
 * a `SyntaxError`.
 */
function lsnToUInt64(lsn: string): bigint {
  const parts = lsn.split("/");
  const segment = BigInt(`0x${parts[0]}`);
  const offset = BigInt(`0x${parts[1]}`);
  return (segment << 32n) | offset;
}
(queryError) throw queryError; + + return result.map((row) => row.session_id); + } + + async listSessions(options: ListSessionsOptions) { + const sessionIds = await this.listSessionIds(options); + const hasMore = sessionIds.length > options.page.size; + + let nextCursor: string | null = null; + let previousCursor: string | null = null; + + const direction = options.page.direction ?? "forward"; + switch (direction) { + case "forward": { + previousCursor = options.page.cursor ? sessionIds.at(0) ?? null : null; + if (hasMore) { + nextCursor = sessionIds[options.page.size - 1]; + } + break; + } + case "backward": { + const reversed = [...sessionIds].reverse(); + if (hasMore) { + previousCursor = reversed.at(1) ?? null; + nextCursor = reversed.at(options.page.size) ?? null; + } else { + nextCursor = reversed.at(options.page.size - 1) ?? null; + } + break; + } + } + + const idsToReturn = + options.page.direction === "backward" && hasMore + ? sessionIds.slice(1, options.page.size + 1) + : sessionIds.slice(0, options.page.size); + + let sessions = await this.options.prisma.session.findMany({ + where: { + id: { in: idsToReturn }, + runtimeEnvironmentId: options.environmentId, + }, + orderBy: { createdAt: "desc" }, + select: { + id: true, + friendlyId: true, + externalId: true, + type: true, + taskIdentifier: true, + tags: true, + metadata: true, + closedAt: true, + closedReason: true, + expiresAt: true, + createdAt: true, + updatedAt: true, + runtimeEnvironmentId: true, + }, + }); + + // ClickHouse is slightly delayed; narrow by derived status in-memory to + // catch recent Postgres writes that haven't replicated yet. + if (options.statuses && options.statuses.length > 0) { + const wanted = new Set(options.statuses); + const now = Date.now(); + sessions = sessions.filter((s) => { + const status = + s.closedAt != null + ? "CLOSED" + : s.expiresAt != null && s.expiresAt.getTime() < now + ? 
"EXPIRED" + : "ACTIVE"; + return wanted.has(status); + }); + } + + return { + sessions, + pagination: { nextCursor, previousCursor }, + }; + } + + async countSessions(options: SessionListInputOptions): Promise { + const queryBuilder = this.options.clickhouse.sessions.countQueryBuilder(); + applySessionFiltersToQueryBuilder( + queryBuilder, + convertSessionListInputOptionsToFilterOptions(options) + ); + + const [queryError, result] = await queryBuilder.execute(); + if (queryError) throw queryError; + + if (result.length === 0) { + throw new Error("No count rows returned"); + } + return result[0].count; + } + + async listTags(options: SessionTagListOptions) { + const queryBuilder = this.options.clickhouse.sessions + .tagQueryBuilder() + .where("organization_id = {organizationId: String}", { + organizationId: options.organizationId, + }) + .where("project_id = {projectId: String}", { projectId: options.projectId }) + .where("environment_id = {environmentId: String}", { + environmentId: options.environmentId, + }); + + const periodMs = options.period ? parseDuration(options.period) ?? 
/**
 * Append the WHERE clauses for a session list/count query to `queryBuilder`.
 *
 * The organization / project / environment scope is always applied. Every
 * other filter is applied only when it is present (and, for arrays,
 * non-empty). Status is derived rather than stored: the selected statuses are
 * translated to predicates over `closed_at` / `expires_at` and OR-ed together
 * in the fixed order ACTIVE, CLOSED, EXPIRED.
 *
 * @param queryBuilder - Builder to add conditions to.
 * @param options - Filters; `period` is a duration in milliseconds.
 */
function applySessionFiltersToQueryBuilder<T>(
  queryBuilder: ClickhouseQueryBuilder<T>,
  options: FilterSessionsOptions
) {
  const { types, tags, taskIdentifiers, externalId, statuses, period, from, to } = options;

  // Tenant scope: always present.
  queryBuilder
    .where("organization_id = {organizationId: String}", {
      organizationId: options.organizationId,
    })
    .where("project_id = {projectId: String}", { projectId: options.projectId })
    .where("environment_id = {environmentId: String}", { environmentId: options.environmentId });

  if (types && types.length > 0) {
    queryBuilder.where("type IN {types: Array(String)}", { types });
  }

  if (tags && tags.length > 0) {
    queryBuilder.where("hasAny(tags, {tags: Array(String)})", { tags });
  }

  if (taskIdentifiers && taskIdentifiers.length > 0) {
    queryBuilder.where("task_identifier IN {taskIdentifiers: Array(String)}", {
      taskIdentifiers,
    });
  }

  if (externalId) {
    queryBuilder.where("external_id = {externalId: String}", { externalId });
  }

  if (statuses && statuses.length > 0) {
    // Predicate per derived status, listed in the order they are emitted.
    const statusPredicates = [
      ["ACTIVE", "(closed_at IS NULL AND (expires_at IS NULL OR expires_at > now64(3)))"],
      ["CLOSED", "closed_at IS NOT NULL"],
      ["EXPIRED", "(closed_at IS NULL AND expires_at IS NOT NULL AND expires_at <= now64(3))"],
    ] as const;

    const selected = statusPredicates
      .filter(([status]) => statuses.includes(status))
      .map(([, predicate]) => predicate);

    if (selected.length > 0) {
      queryBuilder.where(`(${selected.join(" OR ")})`);
    }
  }

  if (period) {
    queryBuilder.where("created_at >= fromUnixTimestamp64Milli({period: Int64})", {
      period: new Date(Date.now() - period).getTime(),
    });
  }

  if (from) {
    queryBuilder.where("created_at >= fromUnixTimestamp64Milli({from: Int64})", { from });
  }

  if (to) {
    queryBuilder.where("created_at <= fromUnixTimestamp64Milli({to: Int64})", { to });
  }
}
status column. + * `ACTIVE` is the base state; `CLOSED` means `closedAt` is set; `EXPIRED` + * means `expiresAt` has passed while `closedAt` is still unset (`CLOSED` takes precedence). + */ +export const SessionStatus = z.enum(["ACTIVE", "CLOSED", "EXPIRED"]); +export type SessionStatus = z.infer; + +const SessionListInputOptionsSchema = z.object({ + organizationId: z.string(), + projectId: z.string(), + environmentId: z.string(), + // filters + types: z.array(z.string()).optional(), + tags: z.array(z.string()).optional(), + taskIdentifiers: z.array(z.string()).optional(), + externalId: z.string().optional(), + statuses: z.array(SessionStatus).optional(), + period: z.string().optional(), + from: z.number().optional(), + to: z.number().optional(), +}); + +export type SessionListInputOptions = z.infer; +export type SessionListInputFilters = Omit< + SessionListInputOptions, + "organizationId" | "projectId" | "environmentId" +>; + +export type FilterSessionsOptions = Omit & { + /** period converted to milliseconds duration */ + period: number | undefined; +}; + +type Pagination = { + page: { + size: number; + cursor?: string; + direction?: "forward" | "backward"; + }; +}; + +export type ListSessionsOptions = SessionListInputOptions & Pagination; + +type OffsetPagination = { + offset: number; + limit: number; +}; + +export type SessionTagListOptions = { + organizationId: string; + projectId: string; + environmentId: string; + period?: string; + from?: number; + to?: number; + /** Case-insensitive substring match on the tag name */ + query?: string; +} & OffsetPagination; + +export type SessionTagList = { + tags: string[]; +}; + +export type ListedSession = Prisma.SessionGetPayload<{ + select: { + id: true; + friendlyId: true; + externalId: true; + type: true; + taskIdentifier: true; + tags: true; + metadata: true; + closedAt: true; + closedReason: true; + expiresAt: true; + createdAt: true; + updatedAt: true; + runtimeEnvironmentId: true; + }; +}>; + +export interface ISessionsRepository { + name: string; + 
listSessionIds(options: ListSessionsOptions): Promise; + listSessions(options: ListSessionsOptions): Promise<{ + sessions: ListedSession[]; + pagination: { + nextCursor: string | null; + previousCursor: string | null; + }; + }>; + countSessions(options: SessionListInputOptions): Promise; + listTags(options: SessionTagListOptions): Promise; +} + +export class SessionsRepository implements ISessionsRepository { + private readonly clickHouseSessionsRepository: ClickHouseSessionsRepository; + + constructor(private readonly options: SessionsRepositoryOptions) { + this.clickHouseSessionsRepository = new ClickHouseSessionsRepository(options); + } + + get name() { + return "sessionsRepository"; + } + + async listSessionIds(options: ListSessionsOptions): Promise { + return startActiveSpan( + "sessionsRepository.listSessionIds", + async () => this.clickHouseSessionsRepository.listSessionIds(options), + { + attributes: { + "repository.name": "clickhouse", + organizationId: options.organizationId, + projectId: options.projectId, + environmentId: options.environmentId, + }, + } + ); + } + + async listSessions(options: ListSessionsOptions) { + return startActiveSpan( + "sessionsRepository.listSessions", + async () => this.clickHouseSessionsRepository.listSessions(options), + { + attributes: { + "repository.name": "clickhouse", + organizationId: options.organizationId, + projectId: options.projectId, + environmentId: options.environmentId, + }, + } + ); + } + + async countSessions(options: SessionListInputOptions) { + return startActiveSpan( + "sessionsRepository.countSessions", + async () => this.clickHouseSessionsRepository.countSessions(options), + { + attributes: { + "repository.name": "clickhouse", + organizationId: options.organizationId, + projectId: options.projectId, + environmentId: options.environmentId, + }, + } + ); + } + + async listTags(options: SessionTagListOptions) { + return startActiveSpan( + "sessionsRepository.listTags", + async () => 
this.clickHouseSessionsRepository.listTags(options), + { + attributes: { + "repository.name": "clickhouse", + organizationId: options.organizationId, + projectId: options.projectId, + environmentId: options.environmentId, + }, + } + ); + } +} + +export function parseSessionListInputOptions(data: unknown): SessionListInputOptions { + return SessionListInputOptionsSchema.parse(data); +} + +export function convertSessionListInputOptionsToFilterOptions( + options: SessionListInputOptions +): FilterSessionsOptions { + return { + ...options, + period: options.period ? parseDuration(options.period) ?? undefined : undefined, + }; +} diff --git a/apps/webapp/app/v3/services/adminWorker.server.ts b/apps/webapp/app/v3/services/adminWorker.server.ts index 97c94b954f0..2e4d1b066cb 100644 --- a/apps/webapp/app/v3/services/adminWorker.server.ts +++ b/apps/webapp/app/v3/services/adminWorker.server.ts @@ -4,6 +4,12 @@ import { z } from "zod"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import { runsReplicationInstance } from "~/services/runsReplicationInstance.server"; +// Reference-hold the sessions-replication singleton so module evaluation runs +// its initializer (creates the ClickHouse client, subscribes to the logical +// replication slot, wires signal handlers) when the webapp boots. A bare +// side-effect import gets tree-shaken by the bundler. 
+import { sessionsReplicationInstance } from "~/services/sessionsReplicationInstance.server"; +void sessionsReplicationInstance; import { singleton } from "~/utils/singleton"; import { tracer } from "../tracer.server"; import { $replica } from "~/db.server"; diff --git a/apps/webapp/test/sessionsReplicationService.test.ts b/apps/webapp/test/sessionsReplicationService.test.ts new file mode 100644 index 00000000000..f6d8d4ba8b1 --- /dev/null +++ b/apps/webapp/test/sessionsReplicationService.test.ts @@ -0,0 +1,205 @@ +import { ClickHouse } from "@internal/clickhouse"; +import { containerTest } from "@internal/testcontainers"; +import { setTimeout } from "node:timers/promises"; +import { z } from "zod"; +import { SessionsReplicationService } from "~/services/sessionsReplicationService.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("SessionsReplicationService", () => { + containerTest( + "replicates an insert from Postgres Session → ClickHouse sessions_v1", + async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => { + // Logical replication needs full-row images for DELETE events. 
+ await prisma.$executeRawUnsafe(`ALTER TABLE public."Session" REPLICA IDENTITY FULL;`); + + const clickhouse = new ClickHouse({ + url: clickhouseContainer.getConnectionUrl(), + name: "sessions-replication", + compression: { request: true }, + logLevel: "warn", + }); + + const service = new SessionsReplicationService({ + clickhouse, + pgConnectionUrl: postgresContainer.getConnectionUri(), + serviceName: "sessions-replication", + slotName: "sessions_to_clickhouse_v1", + publicationName: "sessions_to_clickhouse_v1_publication", + redisOptions, + maxFlushConcurrency: 1, + flushIntervalMs: 100, + flushBatchSize: 1, + leaderLockTimeoutMs: 5000, + leaderLockExtendIntervalMs: 1000, + ackIntervalSeconds: 5, + logLevel: "warn", + }); + + await service.start(); + + const organization = await prisma.organization.create({ + data: { title: "test", slug: "test" }, + }); + + const project = await prisma.project.create({ + data: { + name: "test", + slug: "test", + organizationId: organization.id, + externalRef: "test", + }, + }); + + const environment = await prisma.runtimeEnvironment.create({ + data: { + slug: "test", + type: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + apiKey: "test", + pkApiKey: "test", + shortcode: "test", + }, + }); + + const session = await prisma.session.create({ + data: { + id: "session_test_insert_1", + friendlyId: "session_abc123", + externalId: "my-test-session", + type: "chat.agent", + projectId: project.id, + runtimeEnvironmentId: environment.id, + environmentType: "DEVELOPMENT", + organizationId: organization.id, + taskIdentifier: "my-agent", + tags: ["user:42", "plan:pro"], + metadata: { plan: "pro", seats: 3 }, + }, + }); + + // Allow the replication pipeline to flush + await setTimeout(2000); + + const querySessions = clickhouse.reader.query({ + name: "read-sessions", + query: "SELECT * FROM trigger_dev.sessions_v1 FINAL", + schema: z.any(), + }); + + const [queryError, result] = await querySessions({}); + + 
expect(queryError).toBeNull(); + expect(result?.length).toBe(1); + expect(result?.[0]).toEqual( + expect.objectContaining({ + session_id: session.id, + friendly_id: session.friendlyId, + external_id: "my-test-session", + type: "chat.agent", + project_id: project.id, + environment_id: environment.id, + organization_id: organization.id, + environment_type: "DEVELOPMENT", + task_identifier: "my-agent", + tags: ["user:42", "plan:pro"], + _is_deleted: 0, + }) + ); + + await service.stop(); + } + ); + + containerTest( + "replicates an update (close) from Postgres → ClickHouse", + async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => { + await prisma.$executeRawUnsafe(`ALTER TABLE public."Session" REPLICA IDENTITY FULL;`); + + const clickhouse = new ClickHouse({ + url: clickhouseContainer.getConnectionUrl(), + name: "sessions-replication", + compression: { request: true }, + logLevel: "warn", + }); + + const service = new SessionsReplicationService({ + clickhouse, + pgConnectionUrl: postgresContainer.getConnectionUri(), + serviceName: "sessions-replication", + slotName: "sessions_to_clickhouse_v1", + publicationName: "sessions_to_clickhouse_v1_publication", + redisOptions, + maxFlushConcurrency: 1, + flushIntervalMs: 100, + flushBatchSize: 1, + leaderLockTimeoutMs: 5000, + leaderLockExtendIntervalMs: 1000, + ackIntervalSeconds: 5, + logLevel: "warn", + }); + + await service.start(); + + const organization = await prisma.organization.create({ + data: { title: "test", slug: "test" }, + }); + const project = await prisma.project.create({ + data: { + name: "test", + slug: "test", + organizationId: organization.id, + externalRef: "test", + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + slug: "test", + type: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + apiKey: "test", + pkApiKey: "test", + shortcode: "test", + }, + }); + + const created = await prisma.session.create({ + data: { + id: 
"session_test_update_1", + friendlyId: "session_update1", + type: "chat.agent", + projectId: project.id, + runtimeEnvironmentId: environment.id, + environmentType: "DEVELOPMENT", + organizationId: organization.id, + }, + }); + + await setTimeout(1000); + + await prisma.session.update({ + where: { id: created.id }, + data: { closedAt: new Date(), closedReason: "test-close" }, + }); + + await setTimeout(2000); + + const querySessions = clickhouse.reader.query({ + name: "read-sessions-closed", + query: "SELECT closed_reason, closed_at FROM trigger_dev.sessions_v1 FINAL", + schema: z.any(), + }); + + const [queryError, result] = await querySessions({}); + + expect(queryError).toBeNull(); + expect(result?.length).toBe(1); + expect(result?.[0].closed_reason).toBe("test-close"); + expect(result?.[0].closed_at).toBeDefined(); + + await service.stop(); + } + ); +}); diff --git a/internal-packages/clickhouse/schema/030_create_sessions_v1.sql b/internal-packages/clickhouse/schema/030_create_sessions_v1.sql new file mode 100644 index 00000000000..f575953ea80 --- /dev/null +++ b/internal-packages/clickhouse/schema/030_create_sessions_v1.sql @@ -0,0 +1,42 @@ +-- +goose Up + +CREATE TABLE trigger_dev.sessions_v1 +( + /* ─── identity ─────────────────────────────────────────────── */ + environment_id String, + organization_id String, + project_id String, + session_id String, + + environment_type LowCardinality(String), + friendly_id String, + external_id String DEFAULT '', + + /* ─── type discriminator ──────────────────────────────────── */ + type LowCardinality(String), + task_identifier String DEFAULT '', + + /* ─── filtering / free-form ──────────────────────────────── */ + tags Array(String) CODEC(ZSTD(1)), + metadata JSON(max_dynamic_paths = 256), + + /* ─── terminal markers ────────────────────────────────────── */ + closed_at Nullable(DateTime64(3)), + closed_reason String DEFAULT '', + expires_at Nullable(DateTime64(3)), + + /* ─── timing 
─────────────────────────────────────────────── */ + created_at DateTime64(3), + updated_at DateTime64(3), + + /* ─── commit lsn ────────────────────────────────────────── */ + _version UInt64, + _is_deleted UInt8 DEFAULT 0 +) +ENGINE = ReplacingMergeTree(_version, _is_deleted) +PARTITION BY toYYYYMM(created_at) +ORDER BY (organization_id, project_id, environment_id, created_at, session_id) +SETTINGS enable_json_type = 1; + +-- +goose Down +DROP TABLE IF EXISTS trigger_dev.sessions_v1; diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index c6b8858fa9c..45f0fa485a7 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -28,6 +28,12 @@ import { } from "./taskEvents.js"; import { insertMetrics } from "./metrics.js"; import { insertLlmMetrics } from "./llmMetrics.js"; +import { + getSessionTagsQueryBuilder, + getSessionsCountQueryBuilder, + getSessionsQueryBuilder, + insertSessionsCompactArrays, +} from "./sessions.js"; import { getGlobalModelMetrics, getGlobalModelComparison, @@ -57,6 +63,7 @@ export type * from "./metrics.js"; export type * from "./llmMetrics.js"; export type * from "./llmModelAggregates.js"; export type * from "./errors.js"; +export type * from "./sessions.js"; export type * from "./client/queryBuilder.js"; // Re-export column constants, indices, and type-safe accessors @@ -69,6 +76,8 @@ export { getPayloadField, } from "./taskRuns.js"; +export { SESSION_COLUMNS, SESSION_INDEX, getSessionField } from "./sessions.js"; + // TSQL query execution export { executeTSQL, @@ -251,6 +260,15 @@ export class ClickHouse { }; } + get sessions() { + return { + insertCompactArrays: insertSessionsCompactArrays(this.writer), + queryBuilder: getSessionsQueryBuilder(this.reader), + countQueryBuilder: getSessionsCountQueryBuilder(this.reader), + tagQueryBuilder: getSessionTagsQueryBuilder(this.reader), + }; + } + get taskEventsV2() { return { insert: 
insertTaskEventsV2(this.writer), diff --git a/internal-packages/clickhouse/src/sessions.ts b/internal-packages/clickhouse/src/sessions.ts new file mode 100644 index 00000000000..567fe65511e --- /dev/null +++ b/internal-packages/clickhouse/src/sessions.ts @@ -0,0 +1,184 @@ +import { ClickHouseSettings } from "@clickhouse/client"; +import { z } from "zod"; +import { ClickhouseReader, ClickhouseWriter } from "./client/types.js"; + +export const SessionV1 = z.object({ + environment_id: z.string(), + organization_id: z.string(), + project_id: z.string(), + session_id: z.string(), + environment_type: z.string(), + friendly_id: z.string(), + external_id: z.string().default(""), + type: z.string(), + task_identifier: z.string().default(""), + tags: z.array(z.string()).default([]), + metadata: z.unknown(), + closed_at: z.number().int().nullish(), + closed_reason: z.string().default(""), + expires_at: z.number().int().nullish(), + created_at: z.number().int(), + updated_at: z.number().int(), + _version: z.string(), + _is_deleted: z.number().int().default(0), +}); + +export type SessionV1 = z.input; + +// Column order for compact format - must match ClickHouse table schema +export const SESSION_COLUMNS = [ + "environment_id", + "organization_id", + "project_id", + "session_id", + "environment_type", + "friendly_id", + "external_id", + "type", + "task_identifier", + "tags", + "metadata", + "closed_at", + "closed_reason", + "expires_at", + "created_at", + "updated_at", + "_version", + "_is_deleted", +] as const; + +export type SessionColumnName = (typeof SESSION_COLUMNS)[number]; + +export const SESSION_INDEX = Object.fromEntries(SESSION_COLUMNS.map((col, idx) => [col, idx])) as { + readonly [K in SessionColumnName]: number; +}; + +export type SessionFieldTypes = { + environment_id: string; + organization_id: string; + project_id: string; + session_id: string; + environment_type: string; + friendly_id: string; + external_id: string; + type: string; + task_identifier: string; + 
tags: string[]; + metadata: { data: unknown }; + closed_at: number | null; + closed_reason: string; + expires_at: number | null; + created_at: number; + updated_at: number; + _version: string; + _is_deleted: number; +}; + +/** + * Type-safe tuple representing a Session insert array. + * Order matches {@link SESSION_COLUMNS} exactly. + */ +export type SessionInsertArray = [ + environment_id: string, + organization_id: string, + project_id: string, + session_id: string, + environment_type: string, + friendly_id: string, + external_id: string, + type: string, + task_identifier: string, + tags: string[], + metadata: { data: unknown }, + closed_at: number | null, + closed_reason: string, + expires_at: number | null, + created_at: number, + updated_at: number, + _version: string, + _is_deleted: number, +]; + +export function getSessionField( + session: SessionInsertArray, + field: K +): SessionFieldTypes[K] { + return session[SESSION_INDEX[field]] as SessionFieldTypes[K]; +} + +export function insertSessionsCompactArrays(ch: ClickhouseWriter, settings?: ClickHouseSettings) { + return ch.insertCompactRaw({ + name: "insertSessionsCompactArrays", + table: "trigger_dev.sessions_v1", + columns: SESSION_COLUMNS, + settings: { + enable_json_type: 1, + type_json_skip_duplicated_paths: 1, + ...settings, + }, + }); +} + +export function insertSessions(ch: ClickhouseWriter, settings?: ClickHouseSettings) { + return ch.insert({ + name: "insertSessions", + table: "trigger_dev.sessions_v1", + schema: SessionV1, + settings: { + enable_json_type: 1, + type_json_skip_duplicated_paths: 1, + ...settings, + }, + }); +} + +// ─── read path ─────────────────────────────────────────────────── + +export const SessionV1QueryResult = z.object({ + session_id: z.string(), +}); + +export type SessionV1QueryResult = z.infer; + +/** + * Base query builder for listing Sessions. Filters + pagination are composed + * on top of this; callers can chain `.where(...).orderBy(...).limit(...)`. 
+ */ +export function getSessionsQueryBuilder(ch: ClickhouseReader, settings?: ClickHouseSettings) { + return ch.queryBuilder({ + name: "getSessions", + baseQuery: "SELECT session_id FROM trigger_dev.sessions_v1 FINAL", + schema: SessionV1QueryResult, + settings, + }); +} + +export function getSessionsCountQueryBuilder( + ch: ClickhouseReader, + settings?: ClickHouseSettings +) { + return ch.queryBuilder({ + name: "getSessionsCount", + baseQuery: "SELECT count() as count FROM trigger_dev.sessions_v1 FINAL", + schema: z.object({ count: z.number().int() }), + settings, + }); +} + +export const SessionTagsQueryResult = z.object({ + tag: z.string(), +}); + +export type SessionTagsQueryResult = z.infer; + +export function getSessionTagsQueryBuilder( + ch: ClickhouseReader, + settings?: ClickHouseSettings +) { + return ch.queryBuilder({ + name: "getSessionTags", + baseQuery: "SELECT DISTINCT arrayJoin(tags) as tag FROM trigger_dev.sessions_v1", + schema: SessionTagsQueryResult, + settings, + }); +} diff --git a/internal-packages/database/prisma/migrations/20260419000000_add_sessions_table/migration.sql b/internal-packages/database/prisma/migrations/20260419000000_add_sessions_table/migration.sql new file mode 100644 index 00000000000..4cd7e543223 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260419000000_add_sessions_table/migration.sql @@ -0,0 +1,33 @@ +-- CreateTable +CREATE TABLE "Session" ( + "id" TEXT NOT NULL, + "friendlyId" TEXT NOT NULL, + "externalId" TEXT, + "type" TEXT NOT NULL, + "projectId" TEXT NOT NULL, + "runtimeEnvironmentId" TEXT NOT NULL, + "environmentType" "RuntimeEnvironmentType" NOT NULL, + "organizationId" TEXT NOT NULL, + "taskIdentifier" TEXT, + "tags" TEXT[] NOT NULL DEFAULT ARRAY[]::TEXT[], + "metadata" JSONB, + "closedAt" TIMESTAMP(3), + "closedReason" TEXT, + "expiresAt" TIMESTAMP(3), + "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP(3) NOT NULL, + + CONSTRAINT "Session_pkey" PRIMARY 
KEY ("id") +); + +-- CreateIndex +CREATE UNIQUE INDEX "Session_friendlyId_key" + ON "Session"("friendlyId"); + +-- CreateIndex +CREATE UNIQUE INDEX "Session_runtimeEnvironmentId_externalId_key" + ON "Session"("runtimeEnvironmentId", "externalId"); + +-- CreateIndex +CREATE INDEX "Session_expiresAt_idx" + ON "Session"("expiresAt"); diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma index 9ccf2495d3a..cfb4c7f8057 100644 --- a/internal-packages/database/prisma/schema.prisma +++ b/internal-packages/database/prisma/schema.prisma @@ -686,6 +686,47 @@ enum TaskTriggerSource { SCHEDULED } +/// Durable, typed, bidirectional I/O primitive. Owns two S2 streams (.out / .in). +/// The row is essentially static — no status, no counters, no pointers. No +/// foreign keys: project/runtimeEnvironment/organization ids are plain +/// scalar columns (matches TaskRun pattern). List-style queries are served +/// from ClickHouse, not Postgres, so only point-lookup indexes live here. +model Session { + id String @id @default(cuid()) + friendlyId String @unique + /// User-supplied identifier scoped to the environment. Used for + /// idempotent upsert and for resolving sessions via the public API. + externalId String? + + /// Plain string — intentionally not an enum. + type String + + /// Denormalized scoping columns — no FK relations. + projectId String + runtimeEnvironmentId String + environmentType RuntimeEnvironmentType + organizationId String + + /// Informational pointer for task-owned types. Never changes after create. + taskIdentifier String? + + tags String[] @default([]) + metadata Json? + + /// Terminal markers — written once, never flipped back. + closedAt DateTime? + closedReason String? + expiresAt DateTime? + + createdAt DateTime @default(now()) + updatedAt DateTime @updatedAt + + /// Idempotency: `(env, externalId)` uniquely identifies a session. 
+ /// PostgreSQL treats NULLs as distinct, so `externalId=NULL` rows never collide. + @@unique([runtimeEnvironmentId, externalId]) + @@index([expiresAt]) +} + model TaskRun { id String @id @default(cuid()) diff --git a/packages/core/src/v3/isomorphic/friendlyId.ts b/packages/core/src/v3/isomorphic/friendlyId.ts index a230f8c7450..66575c7c178 100644 --- a/packages/core/src/v3/isomorphic/friendlyId.ts +++ b/packages/core/src/v3/isomorphic/friendlyId.ts @@ -97,6 +97,7 @@ export const BatchId = new IdUtil("batch"); export const BulkActionId = new IdUtil("bulk"); export const AttemptId = new IdUtil("attempt"); export const ErrorId = new IdUtil("error"); +export const SessionId = new IdUtil("session"); export class IdGenerator { private alphabet: string; diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 6d324a10d11..14adf823ad3 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -1449,6 +1449,118 @@ export const CompleteWaitpointTokenRequestBody = z.object({ }); export type CompleteWaitpointTokenRequestBody = z.infer; +/** + * Request body for `POST /api/v1/sessions`. Creates a Session — the durable, + * typed, bidirectional I/O primitive that outlives a single run. + */ +export const CreateSessionRequestBody = z.object({ + /** Plain string discriminator — e.g. `"chat.agent"`. Not validated against an enum on the server. */ + type: z.string().min(1).max(64), + /** User-supplied idempotency key. Unique per environment. */ + externalId: z.string().max(256).optional(), + /** Optional pointer for task-owned session types. */ + taskIdentifier: z.string().max(128).optional(), + /** Up to 10 tags for dashboard filtering. */ + tags: z.array(z.string().max(128)).max(10).optional(), + /** Arbitrary JSON metadata. */ + metadata: z.record(z.unknown()).optional(), + /** Absolute expiry timestamp for retention. 
*/ + expiresAt: z.coerce.date().optional(), +}); +export type CreateSessionRequestBody = z.infer; + +export const SessionItem = z.object({ + id: z.string(), + externalId: z.string().nullable(), + type: z.string(), + taskIdentifier: z.string().nullable(), + tags: z.array(z.string()), + metadata: z.record(z.unknown()).nullable(), + closedAt: z.coerce.date().nullable(), + closedReason: z.string().nullable(), + expiresAt: z.coerce.date().nullable(), + createdAt: z.coerce.date(), + updatedAt: z.coerce.date(), +}); +export type SessionItem = z.infer; + +export const CreatedSessionResponseBody = SessionItem.extend({ + isCached: z.boolean(), +}); +export type CreatedSessionResponseBody = z.infer; + +export const RetrieveSessionResponseBody = SessionItem; +export type RetrieveSessionResponseBody = z.infer; + +export const UpdateSessionRequestBody = z.object({ + tags: z.array(z.string().max(128)).max(10).optional(), + metadata: z.record(z.unknown()).nullable().optional(), + externalId: z.string().max(256).nullable().optional(), +}); +export type UpdateSessionRequestBody = z.infer; + +export const CloseSessionRequestBody = z.object({ + reason: z.string().max(256).optional(), +}); +export type CloseSessionRequestBody = z.infer; + +export const SessionStatus = z.enum(["ACTIVE", "CLOSED", "EXPIRED"]); +export type SessionStatus = z.infer; + +/** + * Server-side validation schema for `GET /api/v1/sessions`. Follows the same + * cursor-pagination convention as runs/waitpoints (`page[size]`, + * `page[after]`, `page[before]`) and uses the `filter[*]` prefix for + * narrowing fields — both produced automatically by `zodfetchCursorPage` + * and the matching client-side search-query helper. 
+ */ +export const ListSessionsQueryParams = z.object({ + "page[size]": z.coerce.number().int().min(1).max(100).default(20), + "page[after]": z.string().optional(), + "page[before]": z.string().optional(), + "filter[type]": z.union([z.string(), z.array(z.string())]).optional(), + "filter[tags]": z.union([z.string(), z.array(z.string())]).optional(), + "filter[taskIdentifier]": z.union([z.string(), z.array(z.string())]).optional(), + "filter[externalId]": z.string().optional(), + "filter[status]": z.union([SessionStatus, z.array(SessionStatus)]).optional(), + "filter[createdAt][period]": z.string().optional(), + "filter[createdAt][from]": z.coerce.number().int().optional(), + "filter[createdAt][to]": z.coerce.number().int().optional(), +}); +export type ListSessionsQueryParams = z.infer; + +/** + * Client-facing list options — flattened shape that + * {@link ApiClient.listSessions} converts into the `filter[*]` / `page[*]` + * query string before sending. + */ +export const ListSessionsOptions = z.object({ + limit: z.number().int().min(1).max(100).optional(), + after: z.string().optional(), + before: z.string().optional(), + type: z.union([z.string(), z.array(z.string())]).optional(), + tag: z.union([z.string(), z.array(z.string())]).optional(), + taskIdentifier: z.union([z.string(), z.array(z.string())]).optional(), + externalId: z.string().optional(), + status: z.union([SessionStatus, z.array(SessionStatus)]).optional(), + period: z.string().optional(), + from: z.union([z.number(), z.date()]).optional(), + to: z.union([z.number(), z.date()]).optional(), +}); +export type ListSessionsOptions = z.infer; + +export const ListedSessionItem = SessionItem; +export type ListedSessionItem = z.infer; + +export const ListSessionsResponseBody = z.object({ + data: z.array(ListedSessionItem), + pagination: z.object({ + next: z.string().optional(), + previous: z.string().optional(), + }), +}); +export type ListSessionsResponseBody = z.infer; + export const 
CompleteWaitpointTokenResponseBody = z.object({ success: z.literal(true), }); From f6149c9f77602972e0a832d67bba82dce04f9304 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 20 Apr 2026 15:43:48 +0100 Subject: [PATCH 02/23] code review fixes --- .../app/routes/api.v1.sessions.$session.ts | 50 ++++++++++------ apps/webapp/app/routes/api.v1.sessions.ts | 57 ++++++++++--------- ...ealtime.v1.sessions.$session.$io.append.ts | 8 +++ .../realtime.v1.sessions.$session.$io.ts | 36 +++++++----- ...streams.$runId.$target.$streamId.append.ts | 8 +++ .../app/services/realtime/sessions.server.ts | 12 ++-- packages/core/src/v3/schemas/api.ts | 43 ++++++++------ 7 files changed, 135 insertions(+), 79 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.ts b/apps/webapp/app/routes/api.v1.sessions.$session.ts index 202058a596e..fa061116993 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.ts @@ -63,23 +63,41 @@ const { action } = createActionApiRoute( return json({ error: "Session not found" }, { status: 404 }); } - const updated = await prisma.session.update({ - where: { id: existing.id }, - data: { - ...(body.tags !== undefined ? { tags: body.tags } : {}), - ...(body.metadata !== undefined - ? { - metadata: - body.metadata === null - ? Prisma.JsonNull - : (body.metadata as Prisma.InputJsonValue), - } - : {}), - ...(body.externalId !== undefined ? { externalId: body.externalId } : {}), - }, - }); + try { + const updated = await prisma.session.update({ + where: { id: existing.id }, + data: { + ...(body.tags !== undefined ? { tags: body.tags } : {}), + ...(body.metadata !== undefined + ? { + metadata: + body.metadata === null + ? Prisma.JsonNull + : (body.metadata as Prisma.InputJsonValue), + } + : {}), + ...(body.externalId !== undefined ? 
{ externalId: body.externalId } : {}), + }, + }); - return json(serializeSession(updated)); + return json(serializeSession(updated)); + } catch (error) { + // A duplicate externalId in the same environment violates the + // `(runtimeEnvironmentId, externalId)` unique constraint. Surface that + // as a 409 rather than a generic 500. + if ( + error instanceof Prisma.PrismaClientKnownRequestError && + error.code === "P2002" && + Array.isArray((error.meta as { target?: string[] })?.target) && + ((error.meta as { target?: string[] }).target ?? []).includes("externalId") + ) { + return json( + { error: "A session with this externalId already exists in this environment" }, + { status: 409 } + ); + } + throw error; + } } ); diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index e9d1a106127..e96a62302c4 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -89,38 +89,43 @@ const { action } = createActionApiRoute( let isCached = false; if (body.externalId) { - // Idempotent: (env, externalId) uniquely identifies the Session. - const existing = await prisma.session.findUnique({ + // Atomic upsert — two concurrent POSTs with the same externalId both + // converge to the same row without either hitting a 500 from the + // unique constraint. 
+ const { id, friendlyId } = SessionId.generate(); + const externalId = body.externalId; + const pre = await prisma.session.findFirst({ where: { - runtimeEnvironmentId_externalId: { - runtimeEnvironmentId: authentication.environment.id, - externalId: body.externalId, - }, + runtimeEnvironmentId: authentication.environment.id, + externalId, }, + select: { id: true }, }); + isCached = pre !== null; - if (existing) { - session = existing; - isCached = true; - } else { - const { id, friendlyId } = SessionId.generate(); - session = await prisma.session.create({ - data: { - id, - friendlyId, - externalId: body.externalId, - type: body.type, - taskIdentifier: body.taskIdentifier ?? null, - tags: body.tags ?? [], - metadata: body.metadata as Prisma.InputJsonValue | undefined, - expiresAt: body.expiresAt ?? null, - projectId: authentication.environment.projectId, + session = await prisma.session.upsert({ + where: { + runtimeEnvironmentId_externalId: { runtimeEnvironmentId: authentication.environment.id, - environmentType: authentication.environment.type, - organizationId: authentication.environment.organizationId, + externalId, }, - }); - } + }, + create: { + id, + friendlyId, + externalId, + type: body.type, + taskIdentifier: body.taskIdentifier ?? null, + tags: body.tags ?? [], + metadata: body.metadata as Prisma.InputJsonValue | undefined, + expiresAt: body.expiresAt ?? 
null, + projectId: authentication.environment.projectId, + runtimeEnvironmentId: authentication.environment.id, + environmentType: authentication.environment.type, + organizationId: authentication.environment.organizationId, + }, + update: {}, + }); } else { const { id, friendlyId } = SessionId.generate(); session = await prisma.session.create({ diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts index d97197a262b..a1e158d8821 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts @@ -17,10 +17,18 @@ const ParamsSchema = z.object({ // POST: server-side append of a single record to a session channel. Mirrors // the existing /realtime/v1/streams/:runId/:target/:streamId/append route, // scoped to a Session primitive. +// S2 enforces a 1 MiB per-record limit (metered as +// `8 + 2*H + Σ(header name+value) + body`). We cap the raw HTTP body at +// 512 KiB so the JSON wrapper (`{"data":"...","id":"..."}`), string +// escaping, and any future per-record header additions all stay comfortably +// below S2's ceiling. See https://s2.dev/docs/limits. +const MAX_APPEND_BODY_BYTES = 1024 * 512; + const { action } = createActionApiRoute( { params: ParamsSchema, method: "POST", + maxContentLength: MAX_APPEND_BODY_BYTES, allowJWT: true, corsStrategy: "all", authorization: { diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts index 99649ffa0a4..ebc13511818 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -79,7 +79,12 @@ const loader = createLoaderApiRoute( }, authorization: { action: "read", - resource: (session) => ({ sessions: [session.friendlyId, session.externalId ?? 
""] }), + resource: (session) => { + const ids = session.externalId + ? [session.friendlyId, session.externalId] + : [session.friendlyId]; + return { sessions: ids }; + }, superScopes: ["read:sessions", "read:all", "admin"], }, }, @@ -104,19 +109,22 @@ const loader = createLoaderApiRoute( const lastEventId = request.headers.get("Last-Event-ID") ?? undefined; - const timeoutInSecondsRaw = request.headers.get("Timeout-Seconds") ?? undefined; - const timeoutInSeconds = timeoutInSecondsRaw ? parseInt(timeoutInSecondsRaw, 10) : undefined; - - if (timeoutInSeconds !== undefined && isNaN(timeoutInSeconds)) { - return new Response("Invalid timeout seconds", { status: 400 }); - } - - if (timeoutInSeconds !== undefined && timeoutInSeconds < 1) { - return new Response("Timeout seconds must be greater than 0", { status: 400 }); - } - - if (timeoutInSeconds !== undefined && timeoutInSeconds > 600) { - return new Response("Timeout seconds must be less than 600", { status: 400 }); + const timeoutInSecondsRaw = request.headers.get("Timeout-Seconds"); + let timeoutInSeconds: number | undefined; + if (timeoutInSecondsRaw) { + // `Number()` rejects `"10abc"` as NaN; `parseInt` would silently accept + // the trailing garbage and bypass the bounds checks below. 
+ const parsed = Number(timeoutInSecondsRaw); + if (!Number.isFinite(parsed) || !Number.isInteger(parsed)) { + return new Response("Invalid timeout seconds", { status: 400 }); + } + if (parsed < 1) { + return new Response("Timeout seconds must be greater than 0", { status: 400 }); + } + if (parsed > 600) { + return new Response("Timeout seconds must be less than 600", { status: 400 }); + } + timeoutInSeconds = parsed; } return realtimeStream.streamResponseFromSessionStream( diff --git a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts index facb6dd664f..deefbc20773 100644 --- a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts +++ b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts @@ -13,9 +13,17 @@ const ParamsSchema = z.object({ streamId: z.string(), }); +// S2 enforces a 1 MiB per-record limit (metered as +// `8 + 2*H + Σ(header name+value) + body`). Cap the raw HTTP body at +// 512 KiB so the JSON wrapper, string escaping, and any future per-record +// header additions all stay well under S2's ceiling. +// See https://s2.dev/docs/limits. 
+const MAX_APPEND_BODY_BYTES = 1024 * 512; + const { action } = createActionApiRoute( { params: ParamsSchema, + maxContentLength: MAX_APPEND_BODY_BYTES, }, async ({ request, params, authentication }) => { const run = await $replica.taskRun.findFirst({ diff --git a/apps/webapp/app/services/realtime/sessions.server.ts b/apps/webapp/app/services/realtime/sessions.server.ts index b330cc7ee47..5ed67d5a691 100644 --- a/apps/webapp/app/services/realtime/sessions.server.ts +++ b/apps/webapp/app/services/realtime/sessions.server.ts @@ -27,13 +27,11 @@ export async function resolveSessionByIdOrExternalId( }); } - return prisma.session.findUnique({ - where: { - runtimeEnvironmentId_externalId: { - runtimeEnvironmentId, - externalId: idOrExternalId, - }, - }, + // `findFirst` rather than `findUnique` per the repo rule — `findUnique`'s + // implicit DataLoader has open correctness bugs in Prisma 6.x that bite + // hot-path lookups exactly like this one. + return prisma.session.findFirst({ + where: { runtimeEnvironmentId, externalId: idOrExternalId }, }); } diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 14adf823ad3..23a65f00022 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -1456,8 +1456,8 @@ export type CompleteWaitpointTokenRequestBody = z.infer; @@ -1514,19 +1517,27 @@ export type SessionStatus = z.infer; * narrowing fields — both produced automatically by `zodfetchCursorPage` * and the matching client-side search-query helper. 
*/ -export const ListSessionsQueryParams = z.object({ - "page[size]": z.coerce.number().int().min(1).max(100).default(20), - "page[after]": z.string().optional(), - "page[before]": z.string().optional(), - "filter[type]": z.union([z.string(), z.array(z.string())]).optional(), - "filter[tags]": z.union([z.string(), z.array(z.string())]).optional(), - "filter[taskIdentifier]": z.union([z.string(), z.array(z.string())]).optional(), - "filter[externalId]": z.string().optional(), - "filter[status]": z.union([SessionStatus, z.array(SessionStatus)]).optional(), - "filter[createdAt][period]": z.string().optional(), - "filter[createdAt][from]": z.coerce.number().int().optional(), - "filter[createdAt][to]": z.coerce.number().int().optional(), -}); +export const ListSessionsQueryParams = z + .object({ + "page[size]": z.coerce.number().int().min(1).max(100).default(20), + "page[after]": z.string().optional(), + "page[before]": z.string().optional(), + "filter[type]": z.union([z.string(), z.array(z.string())]).optional(), + "filter[tags]": z.union([z.string(), z.array(z.string())]).optional(), + "filter[taskIdentifier]": z.union([z.string(), z.array(z.string())]).optional(), + "filter[externalId]": z.string().optional(), + "filter[status]": z.union([SessionStatus, z.array(SessionStatus)]).optional(), + "filter[createdAt][period]": z.string().optional(), + "filter[createdAt][from]": z.coerce.number().int().optional(), + "filter[createdAt][to]": z.coerce.number().int().optional(), + }) + .refine( + (value) => !(value["page[after]"] && value["page[before]"]), + { + message: "Cannot pass both page[after] and page[before] on the same request", + path: ["page[before]"], + } + ); export type ListSessionsQueryParams = z.infer; /** From a724f9c8dd01e75c8877cc0af65d99f277269239 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 20 Apr 2026 15:58:57 +0100 Subject: [PATCH 03/23] fix(core): reject externalId starting with 'session_' on Session create/update The session_ prefix identifies 
internal friendlyIds. Allowing it in a user-supplied externalId would misroute subsequent GET/PATCH/close requests through resolveSessionByIdOrExternalId to a friendlyId lookup, returning null or the wrong session. Reject at the schema boundary so both routes surface a clean 422. --- packages/core/src/v3/schemas/api.ts | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 23a65f00022..99ba76b3b33 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -1457,7 +1457,15 @@ export const CreateSessionRequestBody = z.object({ /** Plain string discriminator — e.g. `"chat.agent"`. Not validated against an enum on the server. */ type: z.string().min(1).max(64), /** User-supplied idempotency key. Unique per environment. Empty strings are rejected. */ - externalId: z.string().trim().min(1).max(256).optional(), + externalId: z + .string() + .trim() + .min(1) + .max(256) + .refine((v) => !v.startsWith("session_"), { + message: "externalId cannot start with 'session_' (reserved prefix for internal friendlyIds)", + }) + .optional(), /** Optional pointer for task-owned session types. */ taskIdentifier: z.string().max(128).optional(), /** Up to 10 tags for dashboard filtering. */ @@ -1497,7 +1505,18 @@ export const UpdateSessionRequestBody = z.object({ metadata: z.record(z.unknown()).nullable().optional(), // Null explicitly clears the externalId; non-null values must be non-empty. 
externalId: z - .union([z.literal(null), z.string().trim().min(1).max(256)]) + .union([ + z.literal(null), + z + .string() + .trim() + .min(1) + .max(256) + .refine((v) => !v.startsWith("session_"), { + message: + "externalId cannot start with 'session_' (reserved prefix for internal friendlyIds)", + }), + ]) .optional(), }); export type UpdateSessionRequestBody = z.infer; From fceabc1ef863801ddceced8bfe5e6b54c01c4cf1 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 20 Apr 2026 16:04:17 +0100 Subject: [PATCH 04/23] fix(webapp): allow JWT + CORS on sessions list endpoint Without allowJWT/corsStrategy, frontend clients holding public access tokens hit 401 on GET /api/v1/sessions and browser preflights fail. Matches the single-session GET/PATCH/close routes and the runs list endpoint. --- apps/webapp/app/routes/api.v1.sessions.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index e96a62302c4..fbdcb381a75 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -26,6 +26,8 @@ function asArray(value: T | T[] | undefined): T[] | undefined { export const loader = createLoaderApiRoute( { searchParams: ListSessionsQueryParams, + allowJWT: true, + corsStrategy: "all", findResource: async () => 1, }, async ({ searchParams, authentication }) => { From 2f8903a8401d7c7b7ced967e4cc3a8e142cfc3a1 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 20 Apr 2026 16:23:03 +0100 Subject: [PATCH 05/23] fix(webapp): tighten sessions create + list auth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Derive isCached from the upsert result (id mismatch = pre-existing row) instead of doing a separate findFirst first. The pre-check was racy — two concurrent first-time POSTs could both return 201 with isCached: false. Using the returned row's id is atomic and saves a round-trip. 
- Scope the list endpoint's authorization to the standard action/resource pattern (matches api.v1.runs.ts): task-scoped JWTs can list sessions filtered by their task, and broader super-scopes (read:sessions, read:all, admin) authorize unfiltered listing. - Log and swallow unexpected errors on POST rather than returning the raw error.message. Prisma/internal messages can leak column names and query fragments. --- apps/webapp/app/routes/api.v1.sessions.ts | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index fbdcb381a75..34bfc497afe 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -10,6 +10,7 @@ import { SessionId } from "@trigger.dev/core/v3/isomorphic"; import type { Prisma, Session } from "@trigger.dev/database"; import { $replica, prisma, type PrismaClient } from "~/db.server"; import { clickhouseClient } from "~/services/clickhouseInstance.server"; +import { logger } from "~/services/logger.server"; import { serializeSession } from "~/services/realtime/sessions.server"; import { SessionsRepository } from "~/services/sessionsRepository/sessionsRepository.server"; import { @@ -28,6 +29,11 @@ export const loader = createLoaderApiRoute( searchParams: ListSessionsQueryParams, allowJWT: true, corsStrategy: "all", + authorization: { + action: "read", + resource: (_, __, searchParams) => ({ tasks: searchParams["filter[taskIdentifier]"] }), + superScopes: ["read:sessions", "read:all", "admin"], + }, findResource: async () => 1, }, async ({ searchParams, authentication }) => { @@ -93,17 +99,11 @@ const { action } = createActionApiRoute( if (body.externalId) { // Atomic upsert — two concurrent POSTs with the same externalId both // converge to the same row without either hitting a 500 from the - // unique constraint. + // unique constraint. 
Derive isCached from the upsert result: if the + // row pre-existed, the returned id won't match the one we just + // generated. Saves a round-trip and is race-free. const { id, friendlyId } = SessionId.generate(); const externalId = body.externalId; - const pre = await prisma.session.findFirst({ - where: { - runtimeEnvironmentId: authentication.environment.id, - externalId, - }, - select: { id: true }, - }); - isCached = pre !== null; session = await prisma.session.upsert({ where: { @@ -128,6 +128,7 @@ const { action } = createActionApiRoute( }, update: {}, }); + isCached = session.id !== id; } else { const { id, friendlyId } = SessionId.generate(); session = await prisma.session.create({ @@ -155,9 +156,7 @@ const { action } = createActionApiRoute( if (error instanceof ServiceValidationError) { return json({ error: error.message }, { status: 422 }); } - if (error instanceof Error) { - return json({ error: error.message }, { status: 500 }); - } + logger.error("Failed to create session", { error }); return json({ error: "Something went wrong" }, { status: 500 }); } } From aaab95816d7a35327085b833fef98f88d1b55cd2 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 23 Apr 2026 10:00:43 +0100 Subject: [PATCH 06/23] =?UTF-8?q?feat(webapp,core):=20Session=20channel=20?= =?UTF-8?q?waitpoints=20=E2=80=94=20server=20side?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Give Session channels run-engine waitpoint semantics so a task can suspend while idle on a session channel and resume when an external client sends a record — parallel to what streams.input offers run-scoped streams. Webapp - POST /api/v1/runs/:runFriendlyId/session-streams/wait — creates a manual waitpoint attached to {sessionId, io} and race-checks the S2 stream starting at lastSeqNum so pre-arrived data fires it immediately. Mirrors the existing input-stream waitpoint route. 
- sessionStreamWaitpointCache.server.ts — Redis set keyed on {sessionFriendlyId, io}, drained atomically on each append so concurrent multi-tab waiters all wake together. - realtime.v1.sessions.$session.$io.append now drains pending waitpoints after every record lands and completes each with the appended body. - S2RealtimeStreams.readSessionStreamRecords — session-channel parallel of readRecords, feeds the race-check path. Core - CreateSessionStreamWaitpoint request/response schemas alongside the existing Session CRUD schemas. Server API contract only — the client ApiClient + SDK wrapper ship on the AI-chat branch. --- .changeset/session-primitive.md | 2 +- .server-changes/session-primitive.md | 2 + ...uns.$runFriendlyId.session-streams.wait.ts | 165 ++++++++++++++++++ ...ealtime.v1.sessions.$session.$io.append.ts | 40 +++++ .../realtime/s2realtimeStreams.server.ts | 17 +- .../sessionStreamWaitpointCache.server.ts | 117 +++++++++++++ packages/core/src/v3/schemas/api.ts | 32 ++++ 7 files changed, 373 insertions(+), 2 deletions(-) create mode 100644 apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts create mode 100644 apps/webapp/app/services/sessionStreamWaitpointCache.server.ts diff --git a/.changeset/session-primitive.md b/.changeset/session-primitive.md index ccfd3b51807..0f56fc65ad1 100644 --- a/.changeset/session-primitive.md +++ b/.changeset/session-primitive.md @@ -2,4 +2,4 @@ "@trigger.dev/core": patch --- -Add `SessionId` friendly ID generator and schemas for the new durable Session primitive. Exported from `@trigger.dev/core/v3/isomorphic` alongside `RunId`, `BatchId`, etc. +Add `SessionId` friendly ID generator and schemas for the new durable Session primitive. Exported from `@trigger.dev/core/v3/isomorphic` alongside `RunId`, `BatchId`, etc. Ships the `CreateSessionStreamWaitpoint` request/response schemas alongside the main Session CRUD schemas. 
diff --git a/.server-changes/session-primitive.md b/.server-changes/session-primitive.md index 80516a5c6a6..3bb9481a0ee 100644 --- a/.server-changes/session-primitive.md +++ b/.server-changes/session-primitive.md @@ -4,3 +4,5 @@ type: feature --- Add `Session` primitive — a durable, typed, bidirectional I/O primitive that outlives a single run, intended for agent/chat use cases. Ships the Postgres schema (`Session` table), control-plane CRUD routes (`POST/GET/PATCH /api/v1/sessions`, `POST /api/v1/sessions/:session/close` — polymorphic on friendlyId or externalId), `sessions` JWT scope, ClickHouse `sessions_v1` table, and `SessionsReplicationService` (logical replication from Postgres `Session` → ClickHouse `sessions_v1`). Run-scoped realtime streams (`streams.pipe`/`streams.input`) are unchanged and do **not** create Session rows. + +Adds `POST /api/v1/runs/:runFriendlyId/session-streams/wait` (session-stream waitpoint creation) and wires `POST /realtime/v1/sessions/:session/:io/append` to fire any pending waitpoints on the channel. Gives `session.in` run-engine waitpoint semantics matching run-scoped input streams: a task can suspend while idle on a session channel and resume when an external client sends a record. Redis-backed pending-waitpoint set (`ssw:{sessionFriendlyId}:{io}`) is drained atomically on each append so multiple concurrent waiters (e.g. multi-tab chat) all resume together. 
diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts new file mode 100644 index 00000000000..c62d687751a --- /dev/null +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -0,0 +1,165 @@ +import { json } from "@remix-run/server-runtime"; +import { + CreateSessionStreamWaitpointRequestBody, + type CreateSessionStreamWaitpointResponseBody, +} from "@trigger.dev/core/v3"; +import { WaitpointId } from "@trigger.dev/core/v3/isomorphic"; +import { z } from "zod"; +import { $replica } from "~/db.server"; +import { createWaitpointTag, MAX_TAGS_PER_WAITPOINT } from "~/models/waitpointTag.server"; +import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; +import { S2RealtimeStreams } from "~/services/realtime/s2realtimeStreams.server"; +import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; +import { + addSessionStreamWaitpoint, + removeSessionStreamWaitpoint, +} from "~/services/sessionStreamWaitpointCache.server"; +import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { parseDelay } from "~/utils/delays"; +import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server"; +import { engine } from "~/v3/runEngine.server"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; + +const ParamsSchema = z.object({ + runFriendlyId: z.string(), +}); + +const { action, loader } = createActionApiRoute( + { + params: ParamsSchema, + body: CreateSessionStreamWaitpointRequestBody, + maxContentLength: 1024 * 10, // 10KB + method: "POST", + }, + async ({ authentication, body, params }) => { + try { + const run = await $replica.taskRun.findFirst({ + where: { + friendlyId: params.runFriendlyId, + runtimeEnvironmentId: authentication.environment.id, + }, + select: { + id: true, + friendlyId: true, + realtimeStreamsVersion: 
true, + }, + }); + + if (!run) { + return json({ error: "Run not found" }, { status: 404 }); + } + + const session = await resolveSessionByIdOrExternalId( + $replica, + authentication.environment.id, + body.session + ); + + if (!session) { + return json({ error: "Session not found" }, { status: 404 }); + } + + const idempotencyKeyExpiresAt = body.idempotencyKeyTTL + ? resolveIdempotencyKeyTTL(body.idempotencyKeyTTL) + : undefined; + + const timeout = await parseDelay(body.timeout); + + const bodyTags = typeof body.tags === "string" ? [body.tags] : body.tags; + + if (bodyTags && bodyTags.length > MAX_TAGS_PER_WAITPOINT) { + throw new ServiceValidationError( + `Waitpoints can only have ${MAX_TAGS_PER_WAITPOINT} tags, you're trying to set ${bodyTags.length}.` + ); + } + + if (bodyTags && bodyTags.length > 0) { + for (const tag of bodyTags) { + await createWaitpointTag({ + tag, + environmentId: authentication.environment.id, + projectId: authentication.environment.projectId, + }); + } + } + + // Step 1: Create the waitpoint. + const result = await engine.createManualWaitpoint({ + environmentId: authentication.environment.id, + projectId: authentication.environment.projectId, + idempotencyKey: body.idempotencyKey, + idempotencyKeyExpiresAt, + timeout, + tags: bodyTags, + }); + + // Step 2: Register the waitpoint on the session channel so the next + // append fires it. Keyed by (sessionFriendlyId, io) — both runs on a + // multi-tab session wake on the same record. + const ttlMs = timeout ? timeout.getTime() - Date.now() : undefined; + await addSessionStreamWaitpoint( + session.friendlyId, + body.io, + result.waitpoint.id, + ttlMs && ttlMs > 0 ? ttlMs : undefined + ); + + // Step 3: Race-check. If a record landed on the channel before this + // .wait() call, complete the waitpoint synchronously with that data + // and remove the pending registration. 
+ if (!result.isCached) { + try { + const realtimeStream = getRealtimeStreamInstance( + authentication.environment, + run.realtimeStreamsVersion + ); + + if (realtimeStream instanceof S2RealtimeStreams) { + const records = await realtimeStream.readSessionStreamRecords( + session.friendlyId, + body.io, + body.lastSeqNum + ); + + if (records.length > 0) { + const record = records[0]!; + + await engine.completeWaitpoint({ + id: result.waitpoint.id, + output: { + value: record.data, + type: "application/json", + isError: false, + }, + }); + + await removeSessionStreamWaitpoint( + session.friendlyId, + body.io, + result.waitpoint.id + ); + } + } + } catch { + // Non-fatal: pending registration stays in Redis; the next append + // will complete the waitpoint via the append handler path. + } + } + + return json({ + waitpointId: WaitpointId.toFriendlyId(result.waitpoint.id), + isCached: result.isCached, + }); + } catch (error) { + if (error instanceof ServiceValidationError) { + return json({ error: error.message }, { status: 422 }); + } else if (error instanceof Error) { + return json({ error: error.message }, { status: 500 }); + } + + return json({ error: "Something went wrong" }, { status: 500 }); + } + } +); + +export { action, loader }; diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts index a1e158d8821..36dcf1c4170 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts @@ -3,10 +3,13 @@ import { tryCatch } from "@trigger.dev/core/utils"; import { nanoid } from "nanoid"; import { z } from "zod"; import { $replica } from "~/db.server"; +import { logger } from "~/services/logger.server"; import { S2RealtimeStreams } from "~/services/realtime/s2realtimeStreams.server"; import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; import { 
getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; +import { drainSessionStreamWaitpoints } from "~/services/sessionStreamWaitpointCache.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { engine } from "~/v3/runEngine.server"; import { ServiceValidationError } from "~/v3/services/common.server"; const ParamsSchema = z.object({ @@ -81,6 +84,43 @@ const { action } = createActionApiRoute( return json({ ok: false, error: appendError.message }, { status: 500 }); } + // Fire any run-scoped waitpoints registered against this channel. Best + // effort — a failure here must not fail the append (the record is + // durable in S2; the SSE tail will still deliver it). + const [drainError, waitpointIds] = await tryCatch( + drainSessionStreamWaitpoints(session.friendlyId, params.io) + ); + if (drainError) { + logger.error("Failed to drain session stream waitpoints", { + sessionFriendlyId: session.friendlyId, + io: params.io, + error: drainError, + }); + } else if (waitpointIds && waitpointIds.length > 0) { + await Promise.all( + waitpointIds.map(async (waitpointId) => { + const [completeError] = await tryCatch( + engine.completeWaitpoint({ + id: waitpointId, + output: { + value: part, + type: "application/json", + isError: false, + }, + }) + ); + if (completeError) { + logger.error("Failed to complete session stream waitpoint", { + sessionFriendlyId: session.friendlyId, + io: params.io, + waitpointId, + error: completeError, + }); + } + }) + ); + } + return json({ ok: true }, { status: 200 }); } ); diff --git a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts index 8e74661fe70..74277cf7677 100644 --- a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts +++ b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts @@ -186,7 +186,22 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { 
streamId: string, afterSeqNum?: number ): Promise { - const s2Stream = this.toStreamName(runId, streamId); + return this.#readRecordsByName(this.toStreamName(runId, streamId), afterSeqNum); + } + + /** + * Read records from a `Session`-primitive channel starting after the + * given sequence number. Used by the `.wait()` race-check path. + */ + async readSessionStreamRecords( + friendlyId: string, + io: "out" | "in", + afterSeqNum?: number + ): Promise { + return this.#readRecordsByName(this.toSessionStreamName(friendlyId, io), afterSeqNum); + } + + async #readRecordsByName(s2Stream: string, afterSeqNum?: number): Promise { const startSeq = afterSeqNum != null ? afterSeqNum + 1 : 0; const qs = new URLSearchParams(); diff --git a/apps/webapp/app/services/sessionStreamWaitpointCache.server.ts b/apps/webapp/app/services/sessionStreamWaitpointCache.server.ts new file mode 100644 index 00000000000..aa46f7cb59f --- /dev/null +++ b/apps/webapp/app/services/sessionStreamWaitpointCache.server.ts @@ -0,0 +1,117 @@ +import { Redis } from "ioredis"; +import { env } from "~/env.server"; +import { singleton } from "~/utils/singleton"; +import { logger } from "./logger.server"; + +// "ssw" — session-stream-waitpoint. Parallel to the input-stream variant +// (`isw:{runFriendlyId}:{streamId}`). Keyed purely on `{sessionId, io}` so +// a send() lands on the channel regardless of which run is waiting, and +// multiple concurrent waiters (e.g. two agents on one chat) all wake. 
+const KEY_PREFIX = "ssw:"; +const DEFAULT_TTL_MS = 7 * 24 * 60 * 60 * 1000; // 7 days + +function buildKey(sessionFriendlyId: string, io: "out" | "in"): string { + return `${KEY_PREFIX}${sessionFriendlyId}:${io}`; +} + +function initializeRedis(): Redis | undefined { + const host = env.CACHE_REDIS_HOST; + if (!host) { + return undefined; + } + + return new Redis({ + connectionName: "sessionStreamWaitpointCache", + host, + port: env.CACHE_REDIS_PORT, + username: env.CACHE_REDIS_USERNAME, + password: env.CACHE_REDIS_PASSWORD, + keyPrefix: "tr:", + enableAutoPipelining: true, + ...(env.CACHE_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), + }); +} + +const redis = singleton("sessionStreamWaitpointCache", initializeRedis); + +/** + * Register a waitpoint as pending on the given session channel. Called + * from the `.wait()` create-waitpoint route. Multiple waiters on the same + * channel are allowed (stored as a Redis set). + */ +export async function addSessionStreamWaitpoint( + sessionFriendlyId: string, + io: "out" | "in", + waitpointId: string, + ttlMs?: number +): Promise { + if (!redis) return; + + try { + const key = buildKey(sessionFriendlyId, io); + await redis.sadd(key, waitpointId); + await redis.pexpire(key, ttlMs ?? DEFAULT_TTL_MS); + } catch (error) { + logger.error("Failed to set session stream waitpoint cache", { + sessionFriendlyId, + io, + error, + }); + } +} + +/** + * Atomically read + clear all waitpoints registered on the given session + * channel. Called from the append handler so the next append sees an + * empty set even if two appends race. 
+ */ +export async function drainSessionStreamWaitpoints( + sessionFriendlyId: string, + io: "out" | "in" +): Promise { + if (!redis) return []; + + try { + const key = buildKey(sessionFriendlyId, io); + const pipeline = redis.multi(); + pipeline.smembers(key); + pipeline.del(key); + const results = await pipeline.exec(); + if (!results) return []; + const [smembersResult] = results; + if (!smembersResult) return []; + const [err, members] = smembersResult; + if (err) return []; + return Array.isArray(members) ? (members as string[]) : []; + } catch (error) { + logger.error("Failed to drain session stream waitpoint cache", { + sessionFriendlyId, + io, + error, + }); + return []; + } +} + +/** + * Remove a single waitpoint from the pending set. Called after a race + * where `.wait()` completed the waitpoint from pre-arrived data. + */ +export async function removeSessionStreamWaitpoint( + sessionFriendlyId: string, + io: "out" | "in", + waitpointId: string +): Promise { + if (!redis) return; + + try { + const key = buildKey(sessionFriendlyId, io); + await redis.srem(key, waitpointId); + } catch (error) { + logger.error("Failed to remove session stream waitpoint cache entry", { + sessionFriendlyId, + io, + error, + }); + } +} diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 99ba76b3b33..2a24228e193 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -1411,6 +1411,38 @@ export type CreateInputStreamWaitpointResponseBody = z.infer< typeof CreateInputStreamWaitpointResponseBody >; +/** + * Create a run-scoped waitpoint that completes when the next record lands on + * a Session channel (`.in` or `.out`). Mirrors `CreateInputStreamWaitpointRequestBody` + * but keyed by `{sessionId, io}` instead of `{runId, streamId}`. The run is + * still the thing being suspended — Session only supplies the trigger source. 
+ */ +export const CreateSessionStreamWaitpointRequestBody = z.object({ + /** Session friendlyId (`session_*`) or user-supplied externalId. */ + session: z.string(), + io: z.enum(["out", "in"]), + timeout: z.string().optional(), + idempotencyKey: z.string().optional(), + idempotencyKeyTTL: z.string().optional(), + tags: z.union([z.string(), z.array(z.string())]).optional(), + /** + * Last S2 sequence number the client has seen on this session channel. + * Used to catch data that arrived before `.wait()` was called. + */ + lastSeqNum: z.number().optional(), +}); +export type CreateSessionStreamWaitpointRequestBody = z.infer< + typeof CreateSessionStreamWaitpointRequestBody +>; + +export const CreateSessionStreamWaitpointResponseBody = z.object({ + waitpointId: z.string(), + isCached: z.boolean(), +}); +export type CreateSessionStreamWaitpointResponseBody = z.infer< + typeof CreateSessionStreamWaitpointResponseBody +>; + export const waitpointTokenStatuses = ["WAITING", "COMPLETED", "TIMED_OUT"] as const; export const WaitpointTokenStatus = z.enum(waitpointTokenStatuses); export type WaitpointTokenStatus = z.infer<typeof WaitpointTokenStatus>; From 4f2c0e7a11a45719c59e20abb8c6db1f2afc8c70 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 23 Apr 2026 17:57:56 +0100 Subject: [PATCH 07/23] fix(webapp): CORS + allowJWT on public session create + append preflight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes needed by browser clients hitting the public session API (TriggerChatTransport's direct accessToken path, WebSocket-less session drivers, anything origin'd off the dashboard): - POST /api/v1/sessions: allowJWT: true + corsStrategy: "all" on the action. Pre-fix, the create endpoint only accepted secret-key auth, so any browser-originated sessions.create(...) 401'd. The loader (list) already had these; matches that shape. 
- POST /realtime/v1/sessions/:session/:io/append: export both { action, loader } so Remix routes the OPTIONS preflight to the route builder's CORS handler. With only { action } exported, the preflight returns 400 'No loader for route' and Chrome surfaces the follow-up POST as net::ERR_FAILED. Same pattern as /api/v1/tasks/:id/trigger (which already exports both). Validated by an end-to-end UI smoke on references/ai-chat: new chat → send → streamed assistant reply in ~4s → second turn reuses the same session + run, lastEventId advances 10 → 21. --- .server-changes/sessions-public-api-cors.md | 11 +++++++++++ apps/webapp/app/routes/api.v1.sessions.ts | 2 ++ .../realtime.v1.sessions.$session.$io.append.ts | 4 ++-- 3 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 .server-changes/sessions-public-api-cors.md diff --git a/.server-changes/sessions-public-api-cors.md b/.server-changes/sessions-public-api-cors.md new file mode 100644 index 00000000000..f2047e634eb --- /dev/null +++ b/.server-changes/sessions-public-api-cors.md @@ -0,0 +1,11 @@ +--- +area: webapp +type: fix +--- + +CORS + preflight parity on the public session API so browser-side chat transports can hit the session endpoints without being blocked: + +- `POST /api/v1/sessions` (session upsert) gains `allowJWT: true` + `corsStrategy: "all"` so PATs minted by `chat.createTriggerAction` (and other browser-side session flows) pass the route's auth + respond to CORS preflight. Previously this route only accepted secret-key auth, which broke any browser-originated `sessions.create(...)` call — including the transport's direct `accessToken` fallback path. +- `POST /realtime/v1/sessions/:session/:io/append` now exports both `{ action, loader }`. The route builder installs the OPTIONS preflight handler on the `loader` even for write-only routes; without the loader export, the CORS preflight was returning 400 ("No loader for route") and Chrome treated the follow-up `POST` as `net::ERR_FAILED`. 
+ +Validated by an end-to-end UI smoke against the `references/ai-chat` app: brand-new chat → send → streamed assistant reply in ~4s → follow-up turn on the same session → `lastEventId` advances from 10 → 21. diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index 34bfc497afe..e385dc1d654 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -90,6 +90,8 @@ const { action } = createActionApiRoute( body: CreateSessionRequestBody, method: "POST", maxContentLength: 1024 * 32, // 32KB — metadata is the only thing that grows + allowJWT: true, + corsStrategy: "all", }, async ({ authentication, body }) => { try { diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts index 36dcf1c4170..9a44f23ecff 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts @@ -27,7 +27,7 @@ const ParamsSchema = z.object({ // below S2's ceiling. See https://s2.dev/docs/limits. const MAX_APPEND_BODY_BYTES = 1024 * 512; -const { action } = createActionApiRoute( +const { action, loader } = createActionApiRoute( { params: ParamsSchema, method: "POST", @@ -125,4 +125,4 @@ const { action } = createActionApiRoute( } ); -export { action }; +export { action, loader }; From 84d3db1c05a5d53874431ed15e4fd4b0cf2ceb4f Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 23 Apr 2026 18:54:15 +0100 Subject: [PATCH 08/23] fix(webapp): address #3417 PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nine fixes from CodeRabbit + Devin review: - api.v1.sessions.$session.close.ts: - Export { action, loader } so CORS preflight reaches the route builder's OPTIONS handler. Same fix already applied to the append route — Devin caught that I'd missed this one. 
Without the loader, browser clients hitting POST /close fail preflight. - Switch to `prisma.session.updateMany({ where: { id, closedAt: null }, ... })` so concurrent closes can't overwrite the original `closedAt` / `closedReason`. Loser hits count === 0 and re-reads the winning row — closedness is write-once at the DB level. (CodeRabbit: TOCTOU.) - entry.server.tsx: Wrap the async `sessionsReplicationInstance.shutdown` in a sync handler with `.catch(...)`. SIGTERM/SIGINT fire during process teardown and a rejection from `_replicationClient.stop()` would become an unhandled promise rejection. Matches the pattern in `dynamicFlushScheduler.server.ts`. (CodeRabbit: unhandled rejection risk.) - api.v1.runs.$runFriendlyId.session-streams.wait.ts: - Swallowed race-check catch now logs `warn` with sessionFriendlyId / io / waitpointId / error. Silent failures in the S2-read / engine-complete / cache-remove path were indistinguishable from the expected cache-drain-on-append fast path. - Outer 500 path no longer forwards `error.message` (Prisma / engine / S2 internals could leak). Logs server-side and returns a generic "Something went wrong"; 422 ServiceValidationError path unchanged. (CodeRabbit: info-leak + logging gap.) - realtime.v1.sessions.$session.$io.ts: Add `method: "PUT"` to the route config so the route builder enforces method validation before the handler runs. Removed the now-redundant `request.method !== "PUT"` check inside the handler. (CodeRabbit: defense-in-depth.) - services/sessionsRepository/sessionsRepository.server.ts: `ISessionsRepository` is now a `type` alias, per repo coding guideline ("use types over interfaces"). Structural-typing means implementing classes don't need source changes. (CodeRabbit.) - services/sessionStreamWaitpointCache.server.ts: Replace separate SADD + PEXPIRE with a single atomic Lua script. Solves two distinct concerns at once: 1. 
Partial-failure window (CodeRabbit): if SADD succeeded and PEXPIRE failed, the key would persist with no TTL. The Lua script fails both or succeeds both. 2. TTL-race (Devin, twice): each waitpoint registers with its own `ttlMs` derived from the caller's timeout. The old code called PEXPIRE unconditionally, so a short-TTL registration would shrink the shared key's TTL below a longer-TTL sibling — evicting the sibling from Redis and degrading the append-path fast drain to engine-timeout-only. The script only PEXPIREs if the new TTL is greater than the current PTTL (or the key has no TTL yet), so the key lives as long as the longest-TTL member. Outstanding: one unresolved thread asking to rename `CloseSessionRequestBody.reason` → `closedReason` for symmetry with the DB column. Holding that for an API-taste call — will follow up. Validated: `pnpm run typecheck --filter webapp` clean. --- apps/webapp/app/entry.server.tsx | 28 ++++++++++----- ...uns.$runFriendlyId.session-streams.wait.ts | 18 +++++++--- .../routes/api.v1.sessions.$session.close.ts | 20 ++++++++--- .../realtime.v1.sessions.$session.$io.ts | 7 ++-- .../sessionStreamWaitpointCache.server.ts | 34 +++++++++++++++++-- .../sessionsRepository.server.ts | 4 +-- 6 files changed, 84 insertions(+), 27 deletions(-) diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 06b02537da7..436ec288211 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -31,7 +31,11 @@ import { signalsEmitter } from "./services/signals.server"; // runs deterministically on webapp boot rather than lazily via a singleton // reference elsewhere in the module graph. if (sessionsReplicationInstance && env.SESSION_REPLICATION_ENABLED === "1") { - sessionsReplicationInstance + // Capture a non-nullable reference so the shutdown closure below + // doesn't need to re-null-check (TS narrowing doesn't follow through + // an inner function scope). 
+ const replicator = sessionsReplicationInstance; + replicator .start() .then(() => { console.log("🗃️ Sessions replication service started"); @@ -42,14 +46,20 @@ if (sessionsReplicationInstance && env.SESSION_REPLICATION_ENABLED === "1") { }); }); - signalsEmitter.on( - "SIGTERM", - sessionsReplicationInstance.shutdown.bind(sessionsReplicationInstance) - ); - signalsEmitter.on( - "SIGINT", - sessionsReplicationInstance.shutdown.bind(sessionsReplicationInstance) - ); + // Wrap the async shutdown in a sync handler that catches rejections — + // SIGTERM/SIGINT fire during process teardown, and an unhandled + // promise rejection from `_replicationClient.stop()` there would + // bubble up past the process exit. Matches the pattern in + // dynamicFlushScheduler.server.ts. + const shutdownSessionsReplication = () => { + replicator.shutdown().catch((error) => { + console.error("🗃️ Sessions replication service shutdown error", { + error, + }); + }); + }; + signalsEmitter.on("SIGTERM", shutdownSessionsReplication); + signalsEmitter.on("SIGINT", shutdownSessionsReplication); } const ABORT_DELAY = 30000; diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index c62d687751a..ad168110b45 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -15,6 +15,7 @@ import { removeSessionStreamWaitpoint, } from "~/services/sessionStreamWaitpointCache.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { logger } from "~/services/logger.server"; import { parseDelay } from "~/utils/delays"; import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server"; import { engine } from "~/v3/runEngine.server"; @@ -140,9 +141,16 @@ const { action, loader } = createActionApiRoute( ); } } - } catch { + } catch (error) { // 
Non-fatal: pending registration stays in Redis; the next append - // will complete the waitpoint via the append handler path. + // will complete the waitpoint via the append handler path. Log so + // a broken race-check doesn't silently degrade to timeout-only. + logger.warn("session-stream wait race-check failed", { + sessionFriendlyId: session.friendlyId, + io: body.io, + waitpointId: WaitpointId.toFriendlyId(result.waitpoint.id), + error, + }); } } @@ -153,10 +161,10 @@ const { action, loader } = createActionApiRoute( } catch (error) { if (error instanceof ServiceValidationError) { return json({ error: error.message }, { status: 422 }); - } else if (error instanceof Error) { - return json({ error: error.message }, { status: 500 }); } - + // Don't forward raw internal error messages (could leak Prisma/engine + // details). Log server-side and return a generic 500. + logger.error("Failed to create session-stream waitpoint", { error }); return json({ error: "Something went wrong" }, { status: 500 }); } } diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.close.ts b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts index 047477e47ae..810e3350725 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.close.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts @@ -15,7 +15,7 @@ const ParamsSchema = z.object({ session: z.string(), }); -const { action } = createActionApiRoute( +const { action, loader } = createActionApiRoute( { params: ParamsSchema, body: CloseSessionRequestBody, @@ -46,16 +46,28 @@ const { action } = createActionApiRoute( return json(serializeSession(existing)); } - const updated = await prisma.session.update({ - where: { id: existing.id }, + // `closedAt: null` on the where clause makes the update conditional at + // the DB level. Two concurrent closes race through the earlier read, + // but only one can win this update — the loser hits `count === 0` and + // falls back to reading the winning row. 
Closedness is write-once. + const { count } = await prisma.session.updateMany({ + where: { id: existing.id, closedAt: null }, data: { closedAt: new Date(), closedReason: body.reason ?? null, }, }); + if (count === 0) { + const final = await prisma.session.findFirst({ where: { id: existing.id } }); + if (!final) return json({ error: "Session not found" }, { status: 404 }); + return json(serializeSession(final)); + } + + const updated = await prisma.session.findFirst({ where: { id: existing.id } }); + if (!updated) return json({ error: "Session not found" }, { status: 404 }); return json(serializeSession(updated)); } ); -export { action }; +export { action, loader }; diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts index ebc13511818..214a2efeb3e 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -21,6 +21,7 @@ const ParamsSchema = z.object({ const { action } = createActionApiRoute( { params: ParamsSchema, + method: "PUT", allowJWT: true, corsStrategy: "all", authorization: { @@ -29,11 +30,7 @@ const { action } = createActionApiRoute( superScopes: ["write:sessions", "write:all", "admin"], }, }, - async ({ request, params, authentication }) => { - if (request.method !== "PUT") { - return new Response("Method not allowed", { status: 405 }); - } - + async ({ params, authentication }) => { const session = await resolveSessionByIdOrExternalId( $replica, authentication.environment.id, diff --git a/apps/webapp/app/services/sessionStreamWaitpointCache.server.ts b/apps/webapp/app/services/sessionStreamWaitpointCache.server.ts index aa46f7cb59f..050ebddeac3 100644 --- a/apps/webapp/app/services/sessionStreamWaitpointCache.server.ts +++ b/apps/webapp/app/services/sessionStreamWaitpointCache.server.ts @@ -34,6 +34,31 @@ function initializeRedis(): Redis | undefined { const redis = 
singleton("sessionStreamWaitpointCache", initializeRedis); +// Atomic SADD + PEXPIRE that only ever extends the key's TTL. +// +// Two concerns rolled into one script: +// 1. SADD + PEXPIRE as separate commands can leave the key with no TTL +// if the second call fails (or the process crashes in between). +// 2. Each waitpoint registers with its own `ttlMs` (derived from the +// waitpoint's timeout). Calling PEXPIRE unconditionally would let a +// short-TTL registration shrink the key's TTL below a longer-TTL +// sibling — evicting the sibling early and degrading the append-path +// fast drain to engine-timeout-only. +// +// The script: SADD the member, then set PEXPIRE only if the new TTL is +// greater than the current PTTL (or the key has no TTL yet). Engine- +// level timeouts still fire per-waitpoint; this keeps the Redis key +// alive for the longest-lived member. +const ADD_WAITPOINT_SCRIPT = ` + redis.call("SADD", KEYS[1], ARGV[1]) + local newTtl = tonumber(ARGV[2]) + local currentTtl = redis.call("PTTL", KEYS[1]) + if currentTtl < 0 or newTtl > currentTtl then + redis.call("PEXPIRE", KEYS[1], newTtl) + end + return 1 +`; + /** * Register a waitpoint as pending on the given session channel. Called * from the `.wait()` create-waitpoint route. Multiple waiters on the same @@ -49,8 +74,13 @@ export async function addSessionStreamWaitpoint( try { const key = buildKey(sessionFriendlyId, io); - await redis.sadd(key, waitpointId); - await redis.pexpire(key, ttlMs ?? DEFAULT_TTL_MS); + await redis.eval( + ADD_WAITPOINT_SCRIPT, + 1, + key, + waitpointId, + String(ttlMs ?? 
DEFAULT_TTL_MS) + ); } catch (error) { logger.error("Failed to set session stream waitpoint cache", { sessionFriendlyId, diff --git a/apps/webapp/app/services/sessionsRepository/sessionsRepository.server.ts b/apps/webapp/app/services/sessionsRepository/sessionsRepository.server.ts index cb4ebb48b6f..15566295e33 100644 --- a/apps/webapp/app/services/sessionsRepository/sessionsRepository.server.ts +++ b/apps/webapp/app/services/sessionsRepository/sessionsRepository.server.ts @@ -98,7 +98,7 @@ export type ListedSession = Prisma.SessionGetPayload<{ }; }>; -export interface ISessionsRepository { +export type ISessionsRepository = { name: string; listSessionIds(options: ListSessionsOptions): Promise; listSessions(options: ListSessionsOptions): Promise<{ @@ -110,7 +110,7 @@ export interface ISessionsRepository { }>; countSessions(options: SessionListInputOptions): Promise; listTags(options: SessionTagListOptions): Promise; -} +}; export class SessionsRepository implements ISessionsRepository { private readonly clickHouseSessionsRepository: ClickHouseSessionsRepository; From b6e642f057f219a5b5de0272e76178d9052f61a7 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 23 Apr 2026 20:26:46 +0100 Subject: [PATCH 09/23] fix(webapp): correct backward pagination slice on session list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devin catch on #3417 — the ClickHouse sessions list was slicing `sessionIds.slice(1, size + 1)` on the backward path, which skipped the item closest to the cursor and surfaced the sentinel (the `size+1`th item that proves hasMore=true) to the user. Trace, with items c01…c11 and cursor=c07 (page size 3): - Backward query: `session_id > c07 ORDER BY ASC LIMIT 4` → `[c08, c09, c10, c11]`. Legitimate content is the first three (`[c08, c09, c10]`); `c11` is the sentinel. - Previous slice: `[c09, c10, c11]` → displayed DESC `[c11, c10, c09]` — user never sees c08, sees sentinel c11 instead. 
Fix: collapse both directions to `sessionIds.slice(0, size)`. The sentinel is always the last item regardless of direction, so the two branches had no reason to diverge. Cursor computations (`previousCursor = reversed.at(1)`, `nextCursor = reversed.at(size)`) already line up with the corrected slice — no change needed there. Verified: webapp typecheck clean. --- .../clickhouseSessionsRepository.server.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/webapp/app/services/sessionsRepository/clickhouseSessionsRepository.server.ts b/apps/webapp/app/services/sessionsRepository/clickhouseSessionsRepository.server.ts index 79fb06ac933..c810a0dfa1e 100644 --- a/apps/webapp/app/services/sessionsRepository/clickhouseSessionsRepository.server.ts +++ b/apps/webapp/app/services/sessionsRepository/clickhouseSessionsRepository.server.ts @@ -74,10 +74,12 @@ export class ClickHouseSessionsRepository implements ISessionsRepository { } } - const idsToReturn = - options.page.direction === "backward" && hasMore - ? sessionIds.slice(1, options.page.size + 1) - : sessionIds.slice(0, options.page.size); + // Both directions slice the first `size` IDs: the `size+1`th item is + // the sentinel proving another page exists (hasMore), not part of the + // page content. Backward queries sort ASC (items closest to the cursor + // first), so `[0..size)` is still the legitimate window and the last + // element is the sentinel — identical to the forward case. 
+ const idsToReturn = sessionIds.slice(0, options.page.size); let sessions = await this.options.prisma.session.findMany({ where: { From 4453a45b5f86abbc4951ea1574b94622f62b06e6 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 24 Apr 2026 10:44:26 +0100 Subject: [PATCH 10/23] feat(webapp): session.out wait=0 + X-Session-Settled on settled tail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /realtime/v1/sessions/:session/:io=out now peeks the tail record in S2 at connection time. When the tail chunk is trigger:turn-complete, the agent has finished a turn and is either idle-waiting on .in or has exited — either way no more chunks will arrive without further user action. In that case the downstream S2 read switches to wait=0 so the SSE drains and closes in ~1s instead of long-polling for 60s, and the response carries X-Session-Settled: true so the client can tell the close is terminal rather than a normal 60s cycle. Mid-turn tails (streaming UIMessageChunks in flight) fall through to the existing wait=60 long-poll. Crashed-mid-turn is indistinguishable from live-streaming at this point and gets the same 60s retry loop as today — that's a separate hardening, not in scope here. The peek uses GET /records?tail_offset=1&count=1&wait=0 (single-digit ms on S2), then unwraps the agent-side envelope written by StreamsWriterV2: record.body parses to {data: <chunk>, id}, where <chunk> is the raw UIMessageChunk object. No double-parse on data. 404 / 416 from the peek (stream never written / empty stream) short- circuit to settled=false so first-connect on a freshly-created session keeps the long-poll semantics the agent's first chunks depend on. Verified end-to-end against an idle chat-agent-smoke session: caught- up reconnect (Last-Event-ID = tail) closes in 1.08s with the header; behind reconnect (Last-Event-ID < tail) drains remaining records then closes in 0.94s with the header; empty-stream reconnect keeps the 60s long-poll behavior unchanged. 
--- .server-changes/session-out-settled-signal.md | 8 ++ .../realtime/s2realtimeStreams.server.ts | 105 +++++++++++++++++- 2 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 .server-changes/session-out-settled-signal.md diff --git a/.server-changes/session-out-settled-signal.md b/.server-changes/session-out-settled-signal.md new file mode 100644 index 00000000000..aeb0a537d38 --- /dev/null +++ b/.server-changes/session-out-settled-signal.md @@ -0,0 +1,8 @@ +--- +area: webapp +type: improvement +--- + +`/realtime/v1/sessions/:session/out` now peeks the tail record in S2 at connection time. If the last chunk is `trigger:turn-complete` (agent finished a turn and is either idle-waiting on `.in` or has exited), the downstream S2 read uses `wait=0` so the SSE drains and closes immediately instead of holding the connection open for 60s. The response also carries `X-Session-Settled: true` so the client can tell the close is terminal rather than a normal long-poll cycle. + +Lets `TriggerChatTransport.reconnectToStream` return quickly on page reloads of settled chats without requiring callers to persist an `isStreaming` flag — the server decides from the stream's own tail. Mid-turn tails still take the 60s long-poll path unchanged. diff --git a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts index 74277cf7677..8d2c599ffea 100644 --- a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts +++ b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts @@ -293,6 +293,18 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { /** * Serve SSE from a `Session`-primitive channel addressed by * `(friendlyId, io)`. + * + * For `io=out`, peek the tail record first. 
If it's + * `trigger:turn-complete`, the agent has finished a turn and is + * either idle-waiting on `.in` or has exited — either way, no more + * chunks will arrive without further user action. We switch the + * downstream S2 read to `wait=0` (drain whatever's left, close fast) + * and set `X-Session-Settled: true` so the client knows this SSE + * close is terminal instead of the normal 60s long-poll cycle. + * + * Mid-turn tail (streaming UIMessageChunk) falls through to the + * long-poll path; a crashed-mid-turn stream is indistinguishable + * here and behaves like today (client sees wait=60 close, retries). */ async streamResponseFromSessionStream( request: Request, @@ -301,7 +313,98 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { signal: AbortSignal, options?: StreamResponseOptions ): Promise { - return this.#streamResponseByName(this.toSessionStreamName(friendlyId, io), signal, options); + const s2Stream = this.toSessionStreamName(friendlyId, io); + + let waitSeconds = options?.timeoutInSeconds ?? this.s2WaitSeconds; + let settled = false; + + if (io === "out") { + const lastChunk = await this.#peekLastChunkBody(s2Stream); + if ( + lastChunk != null && + typeof lastChunk === "object" && + (lastChunk as { type?: unknown }).type === "trigger:turn-complete" + ) { + settled = true; + waitSeconds = 0; + } + } + + const s2Response = await this.#streamResponseByName(s2Stream, signal, { + ...options, + timeoutInSeconds: waitSeconds, + }); + + if (!settled) return s2Response; + + const headers = new Headers(s2Response.headers); + headers.set("X-Session-Settled", "true"); + return new Response(s2Response.body, { + status: s2Response.status, + statusText: s2Response.statusText, + headers, + }); + } + + async #peekLastChunkBody(s2Stream: string): Promise { + const qs = new URLSearchParams(); + // `tail_offset=1` reads one record before the next seq — i.e. the + // most recently appended record. `count=1` caps it to just that + // record. 
`wait=0` returns immediately with no long-poll. + qs.set("tail_offset", "1"); + qs.set("count", "1"); + qs.set("wait", "0"); + + let res: Response; + try { + res = await fetch( + `${this.baseUrl}/streams/${encodeURIComponent(s2Stream)}/records?${qs}`, + { + method: "GET", + headers: { + Authorization: `Bearer ${this.token}`, + Accept: "application/json", + "S2-Format": "raw", + "S2-Basin": this.basin, + }, + } + ); + } catch (err) { + this.logger.warn("S2 peek last record: fetch failed", { err, stream: s2Stream }); + return null; + } + + if (!res.ok) { + // 404: stream has never been written to. 416: range not + // satisfiable (empty stream). Both mean "nothing to peek." + if (res.status === 404 || res.status === 416) return null; + const text = await res.text().catch(() => ""); + this.logger.warn("S2 peek last record failed", { + status: res.status, + statusText: res.statusText, + text, + stream: s2Stream, + }); + return null; + } + + try { + const json = (await res.json()) as { + records?: Array<{ body: string; seq_num: number; timestamp: number }>; + }; + const record = json.records?.[0]; + if (!record) return null; + // The record body is a JSON string `{data: <chunk>, id: partId}` + // where `<chunk>` is the raw UIMessageChunk object (see + // `StreamsWriterV2` — the agent-side writer serializes the chunk + // object directly, not double-encoded). Unwrap the envelope and + // return `data` as-is. 
+ const envelope = JSON.parse(record.body) as { data: unknown; id: string }; + return envelope.data; + } catch (err) { + this.logger.warn("S2 peek last record: parse failed", { err, stream: s2Stream }); + return null; + } } async #streamResponseByName( From 71f02ce783df959ed370e210abcf3141f2f31097 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 25 Apr 2026 21:26:54 +0100 Subject: [PATCH 11/23] x-peek-settled header --- .server-changes/session-out-settled-signal.md | 6 ++++-- .../app/routes/realtime.v1.sessions.$session.$io.ts | 10 +++++++++- .../app/services/realtime/s2realtimeStreams.server.ts | 11 ++++++++++- apps/webapp/app/services/realtime/types.ts | 11 +++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/.server-changes/session-out-settled-signal.md b/.server-changes/session-out-settled-signal.md index aeb0a537d38..519871af695 100644 --- a/.server-changes/session-out-settled-signal.md +++ b/.server-changes/session-out-settled-signal.md @@ -3,6 +3,8 @@ area: webapp type: improvement --- -`/realtime/v1/sessions/:session/out` now peeks the tail record in S2 at connection time. If the last chunk is `trigger:turn-complete` (agent finished a turn and is either idle-waiting on `.in` or has exited), the downstream S2 read uses `wait=0` so the SSE drains and closes immediately instead of holding the connection open for 60s. The response also carries `X-Session-Settled: true` so the client can tell the close is terminal rather than a normal long-poll cycle. +`/realtime/v1/sessions/:session/out` accepts an opt-in `X-Peek-Settled: 1` request header. When set, the route peeks the tail record in S2 before proxying; if the last chunk is `trigger:turn-complete`, it switches the downstream read to `wait=0` and returns `X-Session-Settled: true` so the SSE drains-and-closes in ~1s instead of long-polling for 60s. 
-Lets `TriggerChatTransport.reconnectToStream` return quickly on page reloads of settled chats without requiring callers to persist an `isStreaming` flag — the server decides from the stream's own tail. Mid-turn tails still take the 60s long-poll path unchanged. +Without the header, the route behaves exactly as before the settled work — unconditional `wait=60`. This matters because the peek races a newly-triggered turn's first chunk: the active `sendMessages → subscribeToSessionStream` path would otherwise see the previous turn's `trigger:turn-complete` at the tail and close the SSE before the new turn's chunks land on S2. The smoke test confirmed this race was failing every turn-2 response. + +`TriggerChatTransport.reconnectToStream` opts in via the header (that's the reload-on-a-settled-chat case where the fast close is a real UX win). Active send paths don't set the header and keep long-poll semantics. diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts index 214a2efeb3e..a118e6d151a 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -124,12 +124,20 @@ const loader = createLoaderApiRoute( timeoutInSeconds = parsed; } + // Opt-in: only consider the settled-peek shortcut when the client + // asks for it via `X-Peek-Settled: 1`. Reconnect-on-reload paths + // (`TriggerChatTransport.reconnectToStream`) set this; the active + // send-a-message path (`sendMessages → subscribeToSessionStream`) + // does not — otherwise the peek races with the newly-triggered + // turn's first chunk and the SSE closes before records land. 
+ const peekSettled = request.headers.get("X-Peek-Settled") === "1"; + return realtimeStream.streamResponseFromSessionStream( request, session.friendlyId, params.io, getRequestAbortSignal(), - { lastEventId, timeoutInSeconds } + { lastEventId, timeoutInSeconds, peekSettled } ); } ); diff --git a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts index 8d2c599ffea..ecc7a63c66a 100644 --- a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts +++ b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts @@ -318,8 +318,17 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { let waitSeconds = options?.timeoutInSeconds ?? this.s2WaitSeconds; let settled = false; - if (io === "out") { + // Only peek + settle when the client opts in via `options.peekSettled`. + // Reconnect-on-reload paths (`TriggerChatTransport.reconnectToStream`) + // set it; active send-a-message paths don't — otherwise the peek + // races the newly-triggered turn's first chunk and the SSE closes + // before records land. + if (io === "out" && options?.peekSettled) { const lastChunk = await this.#peekLastChunkBody(s2Stream); + const lastChunkType = + lastChunk != null && typeof lastChunk === "object" + ? (lastChunk as { type?: unknown }).type + : null; if ( lastChunk != null && typeof lastChunk === "object" && diff --git a/apps/webapp/app/services/realtime/types.ts b/apps/webapp/app/services/realtime/types.ts index 64433a716f4..7161f158a48 100644 --- a/apps/webapp/app/services/realtime/types.ts +++ b/apps/webapp/app/services/realtime/types.ts @@ -33,6 +33,17 @@ export interface StreamIngestor { export type StreamResponseOptions = { timeoutInSeconds?: number; lastEventId?: string; + /** + * Session-stream-only. When `true`, the responder MAY peek the tail + * of `.out` and short-circuit to `wait=0` + `X-Session-Settled: true` + * if the last chunk is a terminal marker (e.g. 
`trigger:turn-complete`). + * Used by `TriggerChatTransport.reconnectToStream` on page reload. + * + * When absent/false, the responder keeps the unconditional long-poll + * behavior — required on the active send-a-message path where the + * peek would race the newly-triggered turn's first chunk. + */ + peekSettled?: boolean; }; // Interface for stream response From 427541c26cc59cb4ec5d925c87b475604015e0ed Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 27 Apr 2026 16:26:27 +0100 Subject: [PATCH 12/23] feat(webapp,db,core): Sessions become run manager MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Session is now the run manager for chat.agent and any future task-bound session. Atomically creates the row + triggers the first run + tracks the current run via optimistic claim, with a SessionRun audit log for provenance. Schema: - Session gains `taskIdentifier`, `triggerConfig` (JSON), `currentRunId` (non-FK), `currentRunVersion` (monotonic int for optimistic claim). - New SessionRun audit table — one row per run a session triggers, with `reason: "initial" | "continuation" | "upgrade" | "manual"`. Lifecycle: - `POST /api/v1/sessions`: idempotent on `(env, externalId)`, refreshes triggerConfig on cache hit, runs `ensureRunForSession` (probe + optimistic claim), returns a session-scoped PAT. JWT auth path dropped — secret-key only. The customer's server is the only entry point for session creation. - `POST /api/v1/sessions/:s/end-and-continue`: server-orchestrated handoff (cancels current run, triggers a fresh one, swaps currentRunId via `updateMany where currentRunVersion`). Powers `chat.requestUpgrade()` from inside the agent runtime. - `POST /realtime/v1/sessions/:s/:io/append`: probe + ensureRunForSession before append so messages arriving while no run is alive boot one transparently. 
Cross-form addressing on write paths: - `createActionApiRoute` now runs `findResource` before `authorization`, matching `createLoaderApiRoute`. Action routes get an optional `resource` argument on `authorization.resource()` — backwards-compatible (existing 4-arg callbacks unchanged). - Append + end-and-continue use the new ordering to authorize against `{paramSession, friendlyId, externalId}` so a JWT minted for either form authorizes either URL form. Helpers: - `mintSessionToken.server.ts`: server-side session-PAT factory (`read:sessions:{key} + write:sessions:{key}`, 1h TTL). - `sessionRunManager.server.ts`: `ensureRunForSession` (probe + claim) and `swapSessionRun` (force handoff with optimistic claim + cancel-on-loss). Pre-mutation existence reads switched to `$replica` (close, end-and-continue, PATCH). --- ...uns.$runFriendlyId.session-streams.wait.ts | 31 +- .../routes/api.v1.sessions.$session.close.ts | 4 +- ...i.v1.sessions.$session.end-and-continue.ts | 132 +++++++ .../app/routes/api.v1.sessions.$session.ts | 2 +- apps/webapp/app/routes/api.v1.sessions.ts | 97 +++-- ...ealtime.v1.sessions.$session.$io.append.ts | 83 ++++- .../realtime.v1.sessions.$session.$io.ts | 62 +++- .../realtime/mintSessionToken.server.ts | 40 +++ .../realtime/sessionRunManager.server.ts | 339 ++++++++++++++++++ .../app/services/realtime/sessions.server.ts | 43 ++- .../routeBuilders/apiBuilder.server.ts | 47 ++- .../migration.sql | 31 ++ .../migration.sql | 3 + .../database/prisma/schema.prisma | 49 ++- packages/core/src/v3/schemas/api.ts | 81 ++++- 15 files changed, 962 insertions(+), 82 deletions(-) create mode 100644 apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts create mode 100644 apps/webapp/app/services/realtime/mintSessionToken.server.ts create mode 100644 apps/webapp/app/services/realtime/sessionRunManager.server.ts create mode 100644 internal-packages/database/prisma/migrations/20260426190818_sessions_as_run_manager/migration.sql create mode 100644 
internal-packages/database/prisma/migrations/20260426190819_session_current_run_id_index/migration.sql diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index ad168110b45..8684cf3984e 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -7,7 +7,11 @@ import { WaitpointId } from "@trigger.dev/core/v3/isomorphic"; import { z } from "zod"; import { $replica } from "~/db.server"; import { createWaitpointTag, MAX_TAGS_PER_WAITPOINT } from "~/models/waitpointTag.server"; -import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; +import { + canonicalSessionAddressingKey, + isSessionFriendlyIdForm, + resolveSessionByIdOrExternalId, +} from "~/services/realtime/sessions.server"; import { S2RealtimeStreams } from "~/services/realtime/s2realtimeStreams.server"; import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; import { @@ -50,16 +54,23 @@ const { action, loader } = createActionApiRoute( return json({ error: "Run not found" }, { status: 404 }); } - const session = await resolveSessionByIdOrExternalId( + // Row-optional addressing — see the .out / .in.append handlers. + // The waitpoint cache + S2 stream key derive from the row's + // canonical identity (externalId if set, else friendlyId), so + // the agent's wait registration and the append-side drain + // converge regardless of which URL form each side used. 
+ const maybeSession = await resolveSessionByIdOrExternalId( $replica, authentication.environment.id, body.session ); - if (!session) { + if (!maybeSession && isSessionFriendlyIdForm(body.session)) { return json({ error: "Session not found" }, { status: 404 }); } + const addressingKey = canonicalSessionAddressingKey(maybeSession, body.session); + const idempotencyKeyExpiresAt = body.idempotencyKeyTTL ? resolveIdempotencyKeyTTL(body.idempotencyKeyTTL) : undefined; @@ -95,11 +106,13 @@ const { action, loader } = createActionApiRoute( }); // Step 2: Register the waitpoint on the session channel so the next - // append fires it. Keyed by (sessionFriendlyId, io) — both runs on a - // multi-tab session wake on the same record. + // append fires it. Keyed by (addressingKey, io) — the canonical + // string for the row. The append handler drains by the same + // canonical key, so writers and readers converge regardless of + // which URL form the agent vs. the appending caller used. const ttlMs = timeout ? timeout.getTime() - Date.now() : undefined; await addSessionStreamWaitpoint( - session.friendlyId, + addressingKey, body.io, result.waitpoint.id, ttlMs && ttlMs > 0 ? ttlMs : undefined @@ -117,7 +130,7 @@ const { action, loader } = createActionApiRoute( if (realtimeStream instanceof S2RealtimeStreams) { const records = await realtimeStream.readSessionStreamRecords( - session.friendlyId, + addressingKey, body.io, body.lastSeqNum ); @@ -135,7 +148,7 @@ const { action, loader } = createActionApiRoute( }); await removeSessionStreamWaitpoint( - session.friendlyId, + addressingKey, body.io, result.waitpoint.id ); @@ -146,7 +159,7 @@ const { action, loader } = createActionApiRoute( // will complete the waitpoint via the append handler path. Log so // a broken race-check doesn't silently degrade to timeout-only. 
logger.warn("session-stream wait race-check failed", { - sessionFriendlyId: session.friendlyId, + addressingKey, io: body.io, waitpointId: WaitpointId.toFriendlyId(result.waitpoint.id), error, diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.close.ts b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts index 810e3350725..9ea86827d42 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.close.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts @@ -4,7 +4,7 @@ import { type RetrieveSessionResponseBody, } from "@trigger.dev/core/v3"; import { z } from "zod"; -import { prisma } from "~/db.server"; +import { $replica, prisma } from "~/db.server"; import { resolveSessionByIdOrExternalId, serializeSession, @@ -31,7 +31,7 @@ const { action, loader } = createActionApiRoute( }, async ({ authentication, params, body }) => { const existing = await resolveSessionByIdOrExternalId( - prisma, + $replica, authentication.environment.id, params.session ); diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts b/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts new file mode 100644 index 00000000000..b878716f732 --- /dev/null +++ b/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts @@ -0,0 +1,132 @@ +import { json } from "@remix-run/server-runtime"; +import { + EndAndContinueSessionRequestBody, + type EndAndContinueSessionResponseBody, +} from "@trigger.dev/core/v3"; +import { z } from "zod"; +import { $replica } from "~/db.server"; +import { logger } from "~/services/logger.server"; +import { swapSessionRun } from "~/services/realtime/sessionRunManager.server"; +import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; +import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; + +const ParamsSchema = z.object({ + session: z.string(), +}); + +// POST /api/v1/sessions/:session/end-and-continue +// +// Generic "the running run is 
exiting; please trigger a fresh one for +// this session and swap `currentRunId` to it" endpoint. The agent calls +// this from `chat.requestUpgrade` and other planned-handoff paths. The +// transport's `.out` SSE keeps streaming across the swap because S2 is +// keyed on the session, not the run — v1's last chunks land, v2's new +// chunks land on the same stream. +// +// Auth: `write:sessions:{ext}` — the running agent's internal API key +// (PRIVATE) bypasses authorization; a browser holding the session PAT +// can also reach this endpoint, which is fine: if you have the session +// PAT, you own the chat. +const { action } = createActionApiRoute( + { + params: ParamsSchema, + body: EndAndContinueSessionRequestBody, + method: "POST", + maxContentLength: 1024, + allowJWT: true, + corsStrategy: "all", + // Resolved before authorization so the auth scope can expand to both + // addressing forms (friendlyId + externalId). Handler reads the row + // from `resource` instead of re-fetching. + findResource: async (params, auth) => + resolveSessionByIdOrExternalId($replica, auth.environment.id, params.session), + authorization: { + action: "write", + resource: (params, _, __, ___, session) => { + const ids = new Set([params.session]); + if (session) { + ids.add(session.friendlyId); + if (session.externalId) ids.add(session.externalId); + } + return { sessions: [...ids] }; + }, + superScopes: ["write:sessions", "write:all", "admin"], + }, + }, + async ({ authentication, params, body, resource: session }) => { + if (!session) { + // Unreachable — `findResource` 404s before this runs. Type narrow. 
+ return json({ error: "Session not found" }, { status: 404 }); + } + + if (session.closedAt) { + return json( + { error: "Cannot end-and-continue a closed session" }, + { status: 400 } + ); + } + + if (session.expiresAt && session.expiresAt.getTime() < Date.now()) { + return json( + { error: "Cannot end-and-continue an expired session" }, + { status: 400 } + ); + } + + // The wire `callingRunId` is a friendlyId (that's what the agent + // SDK exposes via `ctx.run.id`). Internally `Session.currentRunId` + // stores the TaskRun.id cuid, so resolve before handing to the + // optimistic-claim service. + const callingRun = await $replica.taskRun.findFirst({ + where: { + friendlyId: body.callingRunId, + runtimeEnvironmentId: authentication.environment.id, + }, + select: { id: true }, + }); + if (!callingRun) { + return json({ error: "callingRunId not found in this environment" }, { status: 404 }); + } + + try { + // Body's `reason` is free-form for forward-compat (audit metadata + // only); narrow into the closed `EnsureRunReason` set, defaulting + // to `"manual"` for unknown labels. + const reason: "initial" | "continuation" | "upgrade" | "manual" = + body.reason === "upgrade" || + body.reason === "continuation" || + body.reason === "initial" || + body.reason === "manual" + ? body.reason + : "manual"; + + const result = await swapSessionRun({ + session, + callingRunId: callingRun.id, + environment: authentication.environment, + reason, + }); + + // The swap stored a TaskRun.id (cuid) in `currentRunId`; surface + // the friendlyId for parity with the rest of the public API. + const run = await $replica.taskRun.findFirst({ + where: { id: result.runId }, + select: { friendlyId: true }, + }); + + const responseBody: EndAndContinueSessionResponseBody = { + runId: run?.friendlyId ?? 
result.runId, + swapped: result.swapped, + }; + return json(responseBody); + } catch (error) { + logger.error("Failed end-and-continue", { + sessionId: session.id, + error, + }); + return json({ error: "Failed to swap session run" }, { status: 500 }); + } + } +); + +export { action }; diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.ts b/apps/webapp/app/routes/api.v1.sessions.$session.ts index fa061116993..02b74dc3b73 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.ts @@ -54,7 +54,7 @@ const { action } = createActionApiRoute( }, async ({ authentication, params, body }) => { const existing = await resolveSessionByIdOrExternalId( - prisma, + $replica, authentication.environment.id, params.session ); diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index e385dc1d654..f970a127238 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -4,6 +4,7 @@ import { type CreatedSessionResponseBody, ListSessionsQueryParams, type ListSessionsResponseBody, + type SessionItem, type SessionStatus, } from "@trigger.dev/core/v3"; import { SessionId } from "@trigger.dev/core/v3/isomorphic"; @@ -11,6 +12,11 @@ import type { Prisma, Session } from "@trigger.dev/database"; import { $replica, prisma, type PrismaClient } from "~/db.server"; import { clickhouseClient } from "~/services/clickhouseInstance.server"; import { logger } from "~/services/logger.server"; +import { mintSessionToken } from "~/services/realtime/mintSessionToken.server"; +import { + ensureRunForSession, + type SessionTriggerConfig, +} from "~/services/realtime/sessionRunManager.server"; import { serializeSession } from "~/services/realtime/sessions.server"; import { SessionsRepository } from "~/services/sessionsRepository/sessionsRepository.server"; import { @@ -67,9 +73,9 @@ export const loader = createLoaderApiRoute( }); return json({ - 
data: rows.map((session) => + data: rows.map((row) => serializeSession({ - ...session, + ...row, // Columns the list query doesn't select — filled so `serializeSession` // can operate on a narrowed payload without type errors. projectId: authentication.environment.projectId, @@ -90,36 +96,44 @@ const { action } = createActionApiRoute( body: CreateSessionRequestBody, method: "POST", maxContentLength: 1024 * 32, // 32KB — metadata is the only thing that grows - allowJWT: true, + // Secret-key only. Customer's server (typically wrapping + // `chat.createStartSessionAction`) owns session creation so any + // authorization decision (per-user/plan/quota) sits server-side + // alongside whatever DB write the customer pairs with the create. + // The session-scoped PAT returned in the response body is what the + // browser uses thereafter against `.in/append`, `.out` SSE, + // `end-and-continue`, etc. corsStrategy: "all", }, async ({ authentication, body }) => { try { + const { id, friendlyId } = SessionId.generate(); + + // Idempotent on (env, externalId): two concurrent POSTs converge + // to the same row. We refresh `triggerConfig` on the cached path + // so newly-deployed schema changes (e.g. an updated + // `clientDataSchema` on the agent) propagate to subsequent runs + // — the next `ensureRunForSession` reads back the latest config. let session: Session; let isCached = false; - if (body.externalId) { - // Atomic upsert — two concurrent POSTs with the same externalId both - // converge to the same row without either hitting a 500 from the - // unique constraint. Derive isCached from the upsert result: if the - // row pre-existed, the returned id won't match the one we just - // generated. Saves a round-trip and is race-free. 
- const { id, friendlyId } = SessionId.generate(); - const externalId = body.externalId; + const triggerConfigJson = body.triggerConfig as unknown as Prisma.InputJsonValue; + if (body.externalId) { session = await prisma.session.upsert({ where: { runtimeEnvironmentId_externalId: { runtimeEnvironmentId: authentication.environment.id, - externalId, + externalId: body.externalId, }, }, create: { id, friendlyId, - externalId, + externalId: body.externalId, type: body.type, - taskIdentifier: body.taskIdentifier ?? null, + taskIdentifier: body.taskIdentifier, + triggerConfig: triggerConfigJson, tags: body.tags ?? [], metadata: body.metadata as Prisma.InputJsonValue | undefined, expiresAt: body.expiresAt ?? null, @@ -128,17 +142,17 @@ const { action } = createActionApiRoute( environmentType: authentication.environment.type, organizationId: authentication.environment.organizationId, }, - update: {}, + update: { triggerConfig: triggerConfigJson }, }); isCached = session.id !== id; } else { - const { id, friendlyId } = SessionId.generate(); session = await prisma.session.create({ data: { id, friendlyId, type: body.type, - taskIdentifier: body.taskIdentifier ?? null, + taskIdentifier: body.taskIdentifier, + triggerConfig: triggerConfigJson, tags: body.tags ?? [], metadata: body.metadata as Prisma.InputJsonValue | undefined, expiresAt: body.expiresAt ?? null, @@ -150,10 +164,53 @@ const { action } = createActionApiRoute( }); } - return json( - { ...serializeSession(session), isCached }, - { status: isCached ? 200 : 201 } + // Session is task-bound — every session has a live run by + // construction. `ensureRunForSession` is idempotent: on the + // cached path it sees `currentRunId` is alive and returns it + // without re-triggering. + const ensureResult = await ensureRunForSession({ + session, + environment: authentication.environment, + reason: isCached ? 
"continuation" : "initial", + }); + + // The newly triggered run's friendlyId, looked up via Prisma — we + // need the friendly form for the wire response. + const run = await $replica.taskRun.findFirst({ + where: { id: ensureResult.runId }, + select: { friendlyId: true }, + }); + if (!run) { + throw new Error(`Triggered run ${ensureResult.runId} not found`); + } + + // Mint a session-scoped PAT keyed on the addressing string the + // transport will use everywhere (`.in/append`, `.out` SSE, + // `end-and-continue`). For sessions with an externalId, that's + // the externalId; otherwise the friendlyId. Mirrors the + // canonical addressing key used server-side. + const addressingKey = session.externalId ?? session.friendlyId; + const publicAccessToken = await mintSessionToken( + authentication.environment, + addressingKey ); + + const sessionItem: SessionItem = { + ...serializeSession(session), + triggerConfig: session.triggerConfig as unknown as SessionTriggerConfig, + currentRunId: run.friendlyId, + }; + + const responseBody: CreatedSessionResponseBody = { + ...sessionItem, + runId: run.friendlyId, + publicAccessToken, + isCached, + }; + + return json(responseBody, { + status: isCached ? 
200 : 201, + }); } catch (error) { if (error instanceof ServiceValidationError) { return json({ error: error.message }, { status: 422 }); diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts index 9a44f23ecff..4251baae91e 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts @@ -5,7 +5,11 @@ import { z } from "zod"; import { $replica } from "~/db.server"; import { logger } from "~/services/logger.server"; import { S2RealtimeStreams } from "~/services/realtime/s2realtimeStreams.server"; -import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; +import { ensureRunForSession } from "~/services/realtime/sessionRunManager.server"; +import { + canonicalSessionAddressingKey, + resolveSessionByIdOrExternalId, +} from "~/services/realtime/sessions.server"; import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; import { drainSessionStreamWaitpoints } from "~/services/sessionStreamWaitpointCache.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; @@ -34,20 +38,32 @@ const { action, loader } = createActionApiRoute( maxContentLength: MAX_APPEND_BODY_BYTES, allowJWT: true, corsStrategy: "all", + // Sessions are task-bound (created by `POST /api/v1/sessions` which + // also triggers the first run). The row exists before any caller + // can reach `.in/append` — no row, no append. Resolved here so the + // authorization scope can expand to both addressing forms (friendlyId + // + externalId) and the handler can skip its own lookup. 
+ findResource: async (params, auth) => + resolveSessionByIdOrExternalId($replica, auth.environment.id, params.session), authorization: { action: "write", - resource: (params) => ({ sessions: params.session }), + // Authorize against the union of the URL form, friendlyId, and + // externalId so a JWT scoped to any form authorizes any URL. + resource: (params, _, __, ___, session) => { + const ids = new Set([params.session]); + if (session) { + ids.add(session.friendlyId); + if (session.externalId) ids.add(session.externalId); + } + return { sessions: [...ids] }; + }, superScopes: ["write:sessions", "write:all", "admin"], }, }, - async ({ request, params, authentication }) => { - const session = await resolveSessionByIdOrExternalId( - $replica, - authentication.environment.id, - params.session - ); - + async ({ request, params, authentication, resource: session }) => { if (!session) { + // Unreachable — `findResource` short-circuits to 404 before this + // handler runs. Type-narrow the rest of the body. return new Response("Session not found", { status: 404 }); } @@ -58,6 +74,13 @@ const { action, loader } = createActionApiRoute( ); } + if (session.expiresAt && session.expiresAt.getTime() < Date.now()) { + return json( + { ok: false, error: "Cannot append to an expired session" }, + { status: 400 } + ); + } + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); if (!(realtimeStream instanceof S2RealtimeStreams)) { @@ -67,11 +90,40 @@ const { action, loader } = createActionApiRoute( ); } + // Probe + ensure a live run before appending. The append itself is + // run-independent (S2 stream is durable, keyed on the session) but + // the message is useless if no run is alive to consume it. The + // probe is a single Prisma read; ensureRunForSession is no-op when + // currentRunId is alive, so the steady-state cost is one extra + // read in the hot path. + // + // Best-effort: if ensureRunForSession throws (e.g. 
the trigger + // call fails transiently), still append to S2 — the record is + // durable and the next append will retry the ensure step. Don't + // surface the error to the caller; the SSE tail just won't deliver + // it until a run boots. + const [ensureError] = await tryCatch( + ensureRunForSession({ + session, + environment: authentication.environment, + reason: "continuation", + }) + ); + if (ensureError) { + logger.error("Failed to ensureRunForSession on .in/append", { + sessionId: session.id, + externalId: session.externalId, + error: ensureError, + }); + } + + const addressingKey = canonicalSessionAddressingKey(session, params.session); + const part = await request.text(); const partId = request.headers.get("X-Part-Id") ?? nanoid(7); const [appendError] = await tryCatch( - realtimeStream.appendPartToSessionStream(part, partId, session.friendlyId, params.io) + realtimeStream.appendPartToSessionStream(part, partId, addressingKey, params.io) ); if (appendError) { @@ -86,13 +138,16 @@ const { action, loader } = createActionApiRoute( // Fire any run-scoped waitpoints registered against this channel. Best // effort — a failure here must not fail the append (the record is - // durable in S2; the SSE tail will still deliver it). + // durable in S2; the SSE tail will still deliver it). Waitpoints are + // keyed on the canonical addressing key the agent registered with via + // `sessions.open(...).in.wait()`, so writers and readers converge + // regardless of which URL form they used. 
const [drainError, waitpointIds] = await tryCatch( - drainSessionStreamWaitpoints(session.friendlyId, params.io) + drainSessionStreamWaitpoints(addressingKey, params.io) ); if (drainError) { logger.error("Failed to drain session stream waitpoints", { - sessionFriendlyId: session.friendlyId, + addressingKey, io: params.io, error: drainError, }); @@ -111,7 +166,7 @@ const { action, loader } = createActionApiRoute( ); if (completeError) { logger.error("Failed to complete session stream waitpoint", { - sessionFriendlyId: session.friendlyId, + addressingKey, io: params.io, waitpointId, error: completeError, diff --git a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts index a118e6d151a..c04992f7f14 100644 --- a/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts +++ b/apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts @@ -3,7 +3,11 @@ import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { S2RealtimeStreams } from "~/services/realtime/s2realtimeStreams.server"; -import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; +import { + canonicalSessionAddressingKey, + isSessionFriendlyIdForm, + resolveSessionByIdOrExternalId, +} from "~/services/realtime/sessions.server"; import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; import { createActionApiRoute, @@ -31,17 +35,25 @@ const { action } = createActionApiRoute( }, }, async ({ params, authentication }) => { - const session = await resolveSessionByIdOrExternalId( + // Row-optional addressing. The agent calls PUT initialize as part + // of `session.out.writer()`, by which time it has already created + // the row at bind, so a missing row here is an unusual case + // (manual init from outside chat.agent). 
Require a real row only + // for opaque friendlyIds, and treat closedAt as a soft reject only + // when a row exists. The S2 stream key is built from the row's + // canonical key (externalId if set, else friendlyId) so writers + // and readers converge regardless of URL form. + const maybeSession = await resolveSessionByIdOrExternalId( $replica, authentication.environment.id, params.session ); - if (!session) { + if (!maybeSession && isSessionFriendlyIdForm(params.session)) { return new Response("Session not found", { status: 404 }); } - if (session.closedAt) { + if (maybeSession?.closedAt) { return new Response("Cannot initialize a channel on a closed session", { status: 400, }); @@ -55,8 +67,10 @@ const { action } = createActionApiRoute( }); } + const addressingKey = canonicalSessionAddressingKey(maybeSession, params.session); + const { responseHeaders } = await realtimeStream.initializeSessionStream( - session.friendlyId, + addressingKey, params.io ); @@ -66,26 +80,48 @@ const { action } = createActionApiRoute( // GET: SSE subscribe to a session channel. HEAD returns the last chunk index // for resume semantics, mirroring the existing run-stream route. +// +// Subscribes are row-optional: the chat.agent transport opens the SSE on +// `chatId` (externalId) before the agent has booted and upserted the +// Session row. The S2 stream is keyed on the row's *canonical* identity +// (externalId if set, else friendlyId) so two callers addressing the +// same row via different URL forms converge on the same stream. We +// short-circuit to 404 only for opaque `session_*` friendlyIds (those +// must come from a real mint). 
const loader = createLoaderApiRoute( { params: ParamsSchema, allowJWT: true, corsStrategy: "all", findResource: async (params, auth) => { - return resolveSessionByIdOrExternalId($replica, auth.environment.id, params.session); + const row = await resolveSessionByIdOrExternalId( + $replica, + auth.environment.id, + params.session + ); + if (!row && isSessionFriendlyIdForm(params.session)) { + return undefined; // 404 — opaque friendlyId must reference a real row + } + // Non-null wrapper so missing row doesn't 404 for externalId form. + return { + row, + addressingKey: canonicalSessionAddressingKey(row, params.session), + }; }, authorization: { action: "read", - resource: (session) => { - const ids = session.externalId - ? [session.friendlyId, session.externalId] - : [session.friendlyId]; - return { sessions: ids }; + resource: ({ row, addressingKey }) => { + const ids = new Set([addressingKey]); + if (row) { + ids.add(row.friendlyId); + if (row.externalId) ids.add(row.externalId); + } + return { sessions: [...ids] }; }, superScopes: ["read:sessions", "read:all", "admin"], }, }, - async ({ params, request, resource: session, authentication }) => { + async ({ params, request, authentication, resource }) => { const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); if (!(realtimeStream instanceof S2RealtimeStreams)) { @@ -134,7 +170,7 @@ const loader = createLoaderApiRoute( return realtimeStream.streamResponseFromSessionStream( request, - session.friendlyId, + resource.addressingKey, params.io, getRequestAbortSignal(), { lastEventId, timeoutInSeconds, peekSettled } diff --git a/apps/webapp/app/services/realtime/mintSessionToken.server.ts b/apps/webapp/app/services/realtime/mintSessionToken.server.ts new file mode 100644 index 00000000000..d69b36b7710 --- /dev/null +++ b/apps/webapp/app/services/realtime/mintSessionToken.server.ts @@ -0,0 +1,40 @@ +import { generateJWT as internal_generateJWT } from "@trigger.dev/core/v3"; +import { 
extractJwtSigningSecretKey } from "./jwtAuth.server"; + +type Environment = Parameters[0]; + +export type MintSessionTokenOptions = { + /** Token expiration. Defaults to "1h". */ + expirationTime?: string; +}; + +/** + * Mint a session-scoped public access token (JWT) covering both `.in` + * append and `.out` subscribe for a session's realtime channels. + * + * Returned by `POST /api/v1/sessions` so the browser holds a single + * long-lived token that survives across runs (sessions outlive any + * single run). Includes both read and write scopes since the transport + * needs both: read for SSE subscribe on `.out`, write for `.in` appends + * (`stop`, follow-up messages, action chunks). + */ +export async function mintSessionToken( + environment: Environment, + sessionAddressingKey: string, + options: MintSessionTokenOptions = {} +): Promise { + const scopes = [ + `read:sessions:${sessionAddressingKey}`, + `write:sessions:${sessionAddressingKey}`, + ]; + + return internal_generateJWT({ + secretKey: extractJwtSigningSecretKey(environment), + payload: { + sub: environment.id, + pub: true, + scopes, + }, + expirationTime: options.expirationTime ?? 
"1h", + }); +} diff --git a/apps/webapp/app/services/realtime/sessionRunManager.server.ts b/apps/webapp/app/services/realtime/sessionRunManager.server.ts new file mode 100644 index 00000000000..3a681f84e94 --- /dev/null +++ b/apps/webapp/app/services/realtime/sessionRunManager.server.ts @@ -0,0 +1,339 @@ +import type { Session, TaskRunStatus } from "@trigger.dev/database"; +import { SessionTriggerConfig as SessionTriggerConfigZod } from "@trigger.dev/core/v3"; +import { z } from "zod"; +import { prisma, $replica } from "~/db.server"; +import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { logger } from "~/services/logger.server"; +import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server"; +import { TriggerTaskService } from "~/v3/services/triggerTask.server"; +import { isFinalRunStatus } from "~/v3/taskStatus"; + +/** + * Schema for `Session.triggerConfig` (stored as JSONB). The wire-format + * source of truth lives in `@trigger.dev/core/v3` as `SessionTriggerConfig`; + * we re-export it here for the trigger machinery to validate on read. + * + * `basePayload` carries the customer's wire payload (for chat.agent: + * `{ chatId, ...clientData, idleTimeoutInSeconds? }`). Runtime fields + * specific to a particular trigger (e.g. `trigger: "trigger" | "preload"`, + * an `isContinuation` flag) come in via the `payloadOverrides` argument + * to `ensureRunForSession` and shallow-merge on top of `basePayload`. + */ +export const SessionTriggerConfigSchema = SessionTriggerConfigZod; + +export type SessionTriggerConfig = z.infer; + +export type EnsureRunReason = "initial" | "continuation" | "upgrade" | "manual"; + +type EnsureRunForSessionParams = { + /** + * Session row to operate on. Caller is responsible for the env match — + * we don't re-check `runtimeEnvironmentId` against `environment.id`. 
+ */ + session: Pick< + Session, + "id" | "taskIdentifier" | "triggerConfig" | "currentRunId" | "currentRunVersion" + >; + environment: AuthenticatedEnvironment; + reason: EnsureRunReason; + /** + * Shallow-merged on top of `triggerConfig.basePayload`. Runtime fields + * only — caller-controlled data that varies per trigger (`trigger: + * "preload"` vs `"trigger"`, etc). + */ + payloadOverrides?: Record; +}; + +export type EnsureRunResult = { + runId: string; + /** True if this call triggered a fresh run; false if it reused an alive existing one. */ + triggered: boolean; +}; + +/** + * Idempotently make sure the session has a live run. + * + * Algorithm: + * 1. If `currentRunId` is set, probe its status. Alive → return as-is. + * 2. Trigger a new run upfront (cheap to cancel if we lose the race). + * 3. Atomic claim via `updateMany` keyed on `currentRunVersion`. + * - Won: return new runId, record SessionRun audit row. + * - Lost: cancel our triggered run, re-read session, reuse winner's + * run if alive. If pathological (winner's run already terminal), + * recurse. + * + * No DB lock is held across the trigger call. Wasted-trigger window is + * the rare multi-tab race on a dead run; cancel cost is negligible and + * the run-engine handles it gracefully. + */ +export async function ensureRunForSession( + params: EnsureRunForSessionParams +): Promise { + const { session, environment, reason, payloadOverrides } = params; + + // 1. Probe currentRunId. + if (session.currentRunId) { + const status = await getRunStatus(session.currentRunId); + if (status && !isFinalRunStatus(status)) { + return { runId: session.currentRunId, triggered: false }; + } + } + + // 2. Validate config + trigger upfront. + const config = SessionTriggerConfigSchema.parse(session.triggerConfig); + const triggered = await triggerSessionRun({ + session, + config, + environment, + payloadOverrides, + }); + + // 3. Try to claim the slot atomically. 
+ const claim = await prisma.session.updateMany({ + where: { + id: session.id, + currentRunVersion: session.currentRunVersion, + }, + data: { + currentRunId: triggered.id, + currentRunVersion: { increment: 1 }, + }, + }); + + if (claim.count === 1) { + // Won. Audit the SessionRun. Best-effort — failure here doesn't + // invalidate the live run, just leaves a missing audit row. + prisma.sessionRun + .create({ + data: { sessionId: session.id, runId: triggered.id, reason }, + }) + .catch((error) => { + logger.warn("Failed to record SessionRun audit row", { + sessionId: session.id, + runId: triggered.id, + reason, + error, + }); + }); + + return { runId: triggered.id, triggered: true }; + } + + // 4. Lost the race. Cancel our triggered run; reuse the winner's. + cancelLostRaceRun(triggered.id, environment).catch((error) => { + logger.warn("Failed to cancel lost-race session run", { + sessionId: session.id, + runId: triggered.id, + error, + }); + }); + + const fresh = await $replica.session.findFirst({ + where: { id: session.id }, + select: { + id: true, + taskIdentifier: true, + triggerConfig: true, + currentRunId: true, + currentRunVersion: true, + }, + }); + + if (!fresh) { + // Session vanished mid-flight. Surface as an error — caller decides + // whether to 404 or retry. + throw new SessionRunManagerError(`Session ${session.id} not found after lost claim race`); + } + + if (fresh.currentRunId) { + const status = await getRunStatus(fresh.currentRunId); + if (status && !isFinalRunStatus(status)) { + return { runId: fresh.currentRunId, triggered: false }; + } + } + + // Pathological: winner's run already terminal. Recurse with the fresh + // version. Bounded by run-engine progress — if every triggered run + // dies instantly we'll loop, but that's a deeper bug worth surfacing. + return ensureRunForSession({ + session: fresh, + environment, + reason, + payloadOverrides, + }); +} + +/** + * Trigger a single run for a session. 
Builds `TriggerTaskRequestBody` + * by shallow-merging `payloadOverrides` over `config.basePayload` and + * threading `config`'s machine/queue/tags through the trigger options. + */ +async function triggerSessionRun(params: { + session: Pick; + config: SessionTriggerConfig; + environment: AuthenticatedEnvironment; + payloadOverrides?: Record; +}): Promise<{ id: string; friendlyId: string }> { + const { session, config, environment, payloadOverrides } = params; + + const payload = { + ...config.basePayload, + ...(config.idleTimeoutInSeconds !== undefined + ? { idleTimeoutInSeconds: config.idleTimeoutInSeconds } + : {}), + ...(payloadOverrides ?? {}), + }; + + const body = { + payload, + context: {}, + options: { + ...(config.machine ? { machine: config.machine as never } : {}), + ...(config.queue ? { queue: { name: config.queue } } : {}), + ...(config.tags ? { tags: config.tags } : {}), + ...(config.maxAttempts !== undefined ? { maxAttempts: config.maxAttempts } : {}), + }, + }; + + const service = new TriggerTaskService(); + const result = await service.call(session.taskIdentifier, environment, body, { + triggerSource: "session", + triggerAction: "trigger", + }); + + if (!result) { + throw new SessionRunManagerError( + `TriggerTaskService returned no result for taskIdentifier=${session.taskIdentifier}` + ); + } + + return { id: result.run.id, friendlyId: result.run.friendlyId }; +} + +type SwapSessionRunParams = { + session: Pick< + Session, + "id" | "taskIdentifier" | "triggerConfig" | "currentRunId" | "currentRunVersion" + >; + /** + * The run requesting the swap. Optimistic claim requires + * `Session.currentRunId === callingRunId` so the swap can't clobber + * a run triggered out-of-band (e.g. a parallel `.in/append` probe + * that already replaced the dead run). 
+ */ + callingRunId: string; + environment: AuthenticatedEnvironment; + reason: EnsureRunReason; + payloadOverrides?: Record; +}; + +export type SwapSessionRunResult = { + /** runId of the newly-triggered run that has taken over the session. */ + runId: string; + /** + * False when the swap was preempted (currentRunId is no longer the + * calling run). The caller should treat this as "someone else + * already moved on" — exit cleanly without expecting to drive the + * next run. + */ + swapped: boolean; +}; + +/** + * Force-swap the session to a freshly-triggered run, regardless of + * whether the current run is alive. Called by `end-and-continue` when + * the running agent wants a clean handoff (typically version upgrade). + * + * Differs from `ensureRunForSession`: never reuses the current run. + * The optimistic claim is keyed on `currentRunId === callingRunId`, so + * a parallel append-time probe that already swapped to a different + * run wins the race and `swapped: false` is surfaced. 
+ */ +export async function swapSessionRun( + params: SwapSessionRunParams +): Promise { + const { session, callingRunId, environment, reason, payloadOverrides } = params; + + const config = SessionTriggerConfigSchema.parse(session.triggerConfig); + const triggered = await triggerSessionRun({ + session, + config, + environment, + payloadOverrides, + }); + + const claim = await prisma.session.updateMany({ + where: { + id: session.id, + currentRunId: callingRunId, + currentRunVersion: session.currentRunVersion, + }, + data: { + currentRunId: triggered.id, + currentRunVersion: { increment: 1 }, + }, + }); + + if (claim.count === 1) { + prisma.sessionRun + .create({ + data: { sessionId: session.id, runId: triggered.id, reason }, + }) + .catch((error) => { + logger.warn("Failed to record SessionRun audit row", { + sessionId: session.id, + runId: triggered.id, + reason, + error, + }); + }); + return { runId: triggered.id, swapped: true }; + } + + // Lost the race — someone else already swapped to a new run. Cancel + // ours, surface the existing winner. + cancelLostRaceRun(triggered.id, environment).catch((error) => { + logger.warn("Failed to cancel preempted swap run", { + sessionId: session.id, + runId: triggered.id, + error, + }); + }); + + const fresh = await $replica.session.findFirst({ + where: { id: session.id }, + select: { currentRunId: true }, + }); + + return { + runId: fresh?.currentRunId ?? callingRunId, + swapped: false, + }; +} + +async function getRunStatus(runId: string): Promise { + // Use the read replica — this is a hot-path probe and stale-by-ms is + // fine. The append handler re-checks if it ends up reusing the runId. + const row = await $replica.taskRun.findFirst({ + where: { id: runId }, + select: { status: true }, + }); + return row?.status ?? 
null; +} + +async function cancelLostRaceRun( + runId: string, + environment: AuthenticatedEnvironment +): Promise { + const service = new CancelTaskRunService(); + // Resolve to a TaskRun reference — CancelTaskRunService takes the run + // object, not the id. Read from the replica; the actual cancellation + // write happens inside the service. + const run = await $replica.taskRun.findFirst({ where: { id: runId } }); + if (!run) return; + await service.call(run, { reason: "Lost session-run claim race" }); +} + +export class SessionRunManagerError extends Error { + readonly name = "SessionRunManagerError"; +} diff --git a/apps/webapp/app/services/realtime/sessions.server.ts b/apps/webapp/app/services/realtime/sessions.server.ts index 5ed67d5a691..82e2fda53f1 100644 --- a/apps/webapp/app/services/realtime/sessions.server.ts +++ b/apps/webapp/app/services/realtime/sessions.server.ts @@ -21,7 +21,7 @@ export async function resolveSessionByIdOrExternalId( runtimeEnvironmentId: string, idOrExternalId: string ): Promise { - if (idOrExternalId.startsWith(SESSION_FRIENDLY_ID_PREFIX)) { + if (isSessionFriendlyIdForm(idOrExternalId)) { return prisma.session.findFirst({ where: { friendlyId: idOrExternalId, runtimeEnvironmentId }, }); @@ -35,10 +35,49 @@ export async function resolveSessionByIdOrExternalId( }); } +/** True for `session_*` friendlyId form, false for everything else. */ +export function isSessionFriendlyIdForm(value: string): boolean { + return value.startsWith(SESSION_FRIENDLY_ID_PREFIX); +} + +/** + * Canonicalise the addressing key used for everything stream-level: the + * S2 stream path and the run-engine waitpoint cache key. `chat.agent` + * and the rest of the operational surface always pass `externalId`, but + * a public-API caller may legitimately address by `friendlyId` — and a + * session created without an `externalId` only has a friendlyId at all. 
+ * + * Rule: + * - If we have a Session row, the canonical key is `externalId` if + * set, else `friendlyId`. This way two callers addressing the same + * row via different forms always converge to the same S2 stream. + * - If we have no row (yet — chat.agent's transport may subscribe + * before the agent's bind-time upsert lands), the canonical key is + * whatever the URL had. Operationally that's always an externalId. + * FriendlyId-form callers without a matching row are rejected by + * the route handler before this is reached. + */ +export function canonicalSessionAddressingKey( + row: Session | null, + paramSession: string +): string { + if (row) { + return row.externalId ?? row.friendlyId; + } + return paramSession; +} + /** * Convert a Prisma `Session` row to the public {@link SessionItem} wire format. * Strips internal columns (project/environment/organization ids) and narrows * the `metadata` JSON to a record. + * + * Note: `currentRunId` is left as-is — Prisma stores the internal run id + * (cuid), but `SessionItem.currentRunId` is the *friendly* form. Routes + * that emit `SessionItem` are responsible for resolving the friendlyId + * (typically via a separate TaskRun lookup) and overriding the field. + * `serializeSession` returns it raw so list endpoints don't pay an N+1 + * lookup just to surface the friendly form. */ export function serializeSession(session: Session): SessionItem { return { @@ -46,6 +85,8 @@ export function serializeSession(session: Session): SessionItem { externalId: session.externalId, type: session.type, taskIdentifier: session.taskIdentifier, + triggerConfig: session.triggerConfig as SessionItem["triggerConfig"], + currentRunId: session.currentRunId, tags: session.tags, metadata: (session.metadata ??
null) as SessionItem["metadata"], closedAt: session.closedAt, diff --git a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts index e1f248927ae..10ed2c3bff0 100644 --- a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts +++ b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts @@ -469,7 +469,13 @@ type ApiKeyActionRouteBuilderOptions< : undefined, body: TBodySchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion ? z.infer - : undefined + : undefined, + // The resolved resource from `findResource`. `undefined` when the route + // doesn't declare `findResource`. Routes that need to expand the auth + // scope to alternate identifiers of the same row (e.g. friendlyId + + // externalId for sessions) read it here so a JWT minted for either form + // authorizes both URL forms. + resource: TResource | undefined ) => AuthorizationResources; superScopes?: string[]; }; @@ -667,9 +673,32 @@ export function createActionApiRoute< parsedBody = body.data; } + // Resolve the resource before authorization so the auth scope check + // can expand to alternate identifiers of the same row (e.g. a Session + // is addressable by both `friendlyId` and `externalId` and a JWT minted + // for either form should authorize both URL forms). Mirrors the + // ordering in `createLoaderApiRoute`. + const resource = options.findResource + ? 
await options.findResource(parsedParams, authenticationResult, parsedSearchParams) + : undefined; + + if (options.findResource && !resource) { + return await wrapResponse( + request, + json({ error: "Resource not found" }, { status: 404 }), + corsStrategy !== "none" + ); + } + if (authorization) { - const { action, resource, superScopes } = authorization; - const $resource = resource(parsedParams, parsedSearchParams, parsedHeaders, parsedBody); + const { action, resource: authResource, superScopes } = authorization; + const $resource = authResource( + parsedParams, + parsedSearchParams, + parsedHeaders, + parsedBody, + resource + ); logger.debug("Checking authorization", { action, @@ -702,18 +731,6 @@ export function createActionApiRoute< } } - const resource = options.findResource - ? await options.findResource(parsedParams, authenticationResult, parsedSearchParams) - : undefined; - - if (options.findResource && !resource) { - return await wrapResponse( - request, - json({ error: "Resource not found" }, { status: 404 }), - corsStrategy !== "none" - ); - } - const result = await handler({ params: parsedParams, searchParams: parsedSearchParams, diff --git a/internal-packages/database/prisma/migrations/20260426190818_sessions_as_run_manager/migration.sql b/internal-packages/database/prisma/migrations/20260426190818_sessions_as_run_manager/migration.sql new file mode 100644 index 00000000000..a0f12496781 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260426190818_sessions_as_run_manager/migration.sql @@ -0,0 +1,31 @@ +-- AlterTable +ALTER TABLE "Session" + ADD COLUMN "currentRunId" TEXT, + ADD COLUMN "currentRunVersion" INTEGER NOT NULL DEFAULT 0, + ADD COLUMN "triggerConfig" JSONB NOT NULL, + ALTER COLUMN "taskIdentifier" SET NOT NULL; + +-- CreateTable +CREATE TABLE "SessionRun" ( + "id" TEXT NOT NULL, + "sessionId" TEXT NOT NULL, + "runId" TEXT NOT NULL, + "reason" TEXT NOT NULL, + "triggeredAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + 
+ CONSTRAINT "SessionRun_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE UNIQUE INDEX "SessionRun_runId_key" + ON "SessionRun"("runId"); + +-- CreateIndex +CREATE INDEX "SessionRun_sessionId_idx" + ON "SessionRun"("sessionId"); + +-- AddForeignKey +ALTER TABLE "SessionRun" + ADD CONSTRAINT "SessionRun_sessionId_fkey" + FOREIGN KEY ("sessionId") REFERENCES "Session"("id") + ON DELETE CASCADE ON UPDATE CASCADE; diff --git a/internal-packages/database/prisma/migrations/20260426190819_session_current_run_id_index/migration.sql b/internal-packages/database/prisma/migrations/20260426190819_session_current_run_id_index/migration.sql new file mode 100644 index 00000000000..479353a3e04 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260426190819_session_current_run_id_index/migration.sql @@ -0,0 +1,3 @@ +-- CreateIndex +CREATE INDEX CONCURRENTLY IF NOT EXISTS "Session_currentRunId_idx" + ON "Session"("currentRunId"); diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma index cfb4c7f8057..ee75ce82b5f 100644 --- a/internal-packages/database/prisma/schema.prisma +++ b/internal-packages/database/prisma/schema.prisma @@ -707,12 +707,32 @@ model Session { environmentType RuntimeEnvironmentType organizationId String - /// Informational pointer for task-owned types. Never changes after create. - taskIdentifier String? + /// Task this session triggers runs against. Required — Sessions are + /// task-bound: creating a session also triggers its first run, and + /// every subsequent re-trigger uses this same identifier. + taskIdentifier String + + /// Trigger config used for every run this session schedules. 
Shape + /// (validated at the route layer, opaque to the DB): + /// { basePayload: object, machine?: string, queue?: string, + /// tags?: string[], maxAttempts?: number, + /// idleTimeoutInSeconds?: number } + /// `basePayload` carries the customer's wire payload (chatId + client-data); + /// runtime fields (e.g. messages, trigger) are merged at trigger time. + triggerConfig Json tags String[] @default([]) metadata Json? + /// Live run pointer — non-FK so run deletion never cascades. Can lag + /// reality; the `.in/append` handler re-checks the snapshot status + /// before reusing it. + currentRunId String? + /// Monotonic counter used for optimistic locking on `currentRunId` + /// swaps. Bumped atomically alongside any update that changes + /// `currentRunId`. + currentRunVersion Int @default(0) + /// Terminal markers — written once, never flipped back. closedAt DateTime? closedReason String? @@ -721,10 +741,35 @@ model Session { createdAt DateTime @default(now()) updatedAt DateTime @updatedAt + runs SessionRun[] + /// Idempotency: `(env, externalId)` uniquely identifies a session. /// PostgreSQL treats NULLs as distinct, so `externalId=NULL` rows never collide. @@unique([runtimeEnvironmentId, externalId]) @@index([expiresAt]) + @@index([currentRunId]) +} + +/// Historical record of every run a Session has owned. Append-only — +/// rows are inserted on each `ensureRunForSession` / `swapSessionRun` claim, never updated. +/// Lets us reconstruct the run timeline of a chat for debugging / +/// dashboard surfaces. The relation cascades on Session delete (tied to +/// the session lifecycle) but `runId` is a plain string column with no +/// FK to TaskRun so run pruning is independent. +model SessionRun { + id String @id @default(cuid()) + + sessionId String + /// TaskRun.id (no FK — runs may be archived independently of session history) + runId String @unique + /// One of: "initial" | "continuation" | "upgrade" | "manual". + /// Plain string for forward-compat with future trigger reasons.
+ reason String + triggeredAt DateTime @default(now()) + + session Session @relation(fields: [sessionId], references: [id], onDelete: Cascade) + + @@index([sessionId]) } model TaskRun { diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 2a24228e193..0db92a67c64 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -1482,8 +1482,29 @@ export const CompleteWaitpointTokenRequestBody = z.object({ export type CompleteWaitpointTokenRequestBody = z.infer; /** - * Request body for `POST /api/v1/sessions`. Creates a Session — the durable, - * typed, bidirectional I/O primitive that outlives a single run. + * Trigger config persisted on a Session. Drives every run the session + * schedules — `basePayload` is the customer's wire payload (for + * chat.agent: `{ chatId, ...clientData }`), runtime fields like + * `trigger: "preload" | "trigger"` are merged on top per-call by the + * server's trigger machinery. + */ +export const SessionTriggerConfig = z.object({ + basePayload: z.record(z.unknown()), + machine: MachinePresetName.optional(), + queue: z.string().max(128).optional(), + tags: z.array(z.string().max(128)).max(5).optional(), + maxAttempts: z.number().int().positive().max(10).optional(), + /** Convenience field surfaced to chat.agent via the wire payload. */ + idleTimeoutInSeconds: z.number().int().positive().max(3600).optional(), +}); +export type SessionTriggerConfig = z.infer; + +/** + * Request body for `POST /api/v1/sessions`. Creates a Session and + * triggers its first run. Sessions are task-bound: `taskIdentifier` and + * `triggerConfig` are required, and re-runs scheduled by the server + * (after run termination, after `end-and-continue`) reuse the same + * config. */ export const CreateSessionRequestBody = z.object({ /** Plain string discriminator — e.g. `"chat.agent"`. Not validated against an enum on the server. 
*/ @@ -1498,8 +1519,10 @@ export const CreateSessionRequestBody = z.object({ message: "externalId cannot start with 'session_' (reserved prefix for internal friendlyIds)", }) .optional(), - /** Optional pointer for task-owned session types. */ - taskIdentifier: z.string().max(128).optional(), + /** Task this session triggers runs against. Required. */ + taskIdentifier: z.string().min(1).max(128), + /** Trigger config used for every run scheduled by this session. */ + triggerConfig: SessionTriggerConfig, /** Up to 10 tags for dashboard filtering. */ tags: z.array(z.string().max(128)).max(10).optional(), /** Arbitrary JSON metadata. */ @@ -1513,7 +1536,20 @@ export const SessionItem = z.object({ id: z.string(), externalId: z.string().nullable(), type: z.string(), - taskIdentifier: z.string().nullable(), + taskIdentifier: z.string(), + /** + * Optional on the wire because some surfaces (the list endpoint backed + * by ClickHouse, list-page rendering) don't carry triggerConfig. + * Always populated on `POST /sessions` and `GET /sessions/:id`. + */ + triggerConfig: SessionTriggerConfig.optional(), + /** + * Friendly id of the live run for this session, if any. Optional on + * the wire — list surfaces may not include it. Routes that emit + * `SessionItem` are responsible for resolving the friendly form + * from the underlying cuid before returning. + */ + currentRunId: z.string().nullable().optional(), tags: z.array(z.string()), metadata: z.record(z.unknown()).nullable(), closedAt: z.coerce.date().nullable(), @@ -1525,6 +1561,11 @@ export const SessionItem = z.object({ export type SessionItem = z.infer; export const CreatedSessionResponseBody = SessionItem.extend({ + /** Friendly id of the first run triggered alongside session create. */ + runId: z.string(), + /** Session-scoped public access token: `read:sessions:{ext} + write:sessions:{ext}`. */ + publicAccessToken: z.string(), + /** True if the session existed already (idempotent upsert), false if newly created. 
*/ isCached: z.boolean(), }); export type CreatedSessionResponseBody = z.infer; @@ -1532,6 +1573,36 @@ export type CreatedSessionResponseBody = z.infer; +/** + * Body for `POST /api/v1/sessions/:session/end-and-continue`. Used by the + * running agent to request a clean handoff to a fresh run on the latest + * deployed version (typical use case: `chat.requestUpgrade`). The + * server triggers a new run, atomically swaps `currentRunId`, and the + * caller exits. + */ +export const EndAndContinueSessionRequestBody = z.object({ + /** The friendlyId of the run requesting the handoff. */ + callingRunId: z.string(), + /** Free-form label for the SessionRun audit row. e.g. `"upgrade"`. */ + reason: z.string().max(64), +}); +export type EndAndContinueSessionRequestBody = z.infer; + +export const EndAndContinueSessionResponseBody = z.object({ + /** friendlyId of the run that has taken over the session. */ + runId: z.string(), + /** + * False when the swap was preempted (a different run was already + * running by the time we tried to claim). The caller should treat + * this as "someone else moved on" — exit cleanly without expecting + * to drive the next run. + */ + swapped: z.boolean(), +}); +export type EndAndContinueSessionResponseBody = z.infer< + typeof EndAndContinueSessionResponseBody +>; + export const UpdateSessionRequestBody = z.object({ tags: z.array(z.string().max(128)).max(10).optional(), metadata: z.record(z.unknown()).nullable().optional(), From a349d020e63fbf60daef5c26de46d9dc837dfb66 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 27 Apr 2026 17:05:26 +0100 Subject: [PATCH 13/23] fix(webapp): address #3417 PR review feedback Three fixes after pushing the Sessions-as-run-manager commit: - `api.v1.sessions.$session.end-and-continue.ts` was destructuring only `{ action }` from `createActionApiRoute`, which means Remix had no handler for OPTIONS preflight on this route. Browser CORS would 405. Sibling routes (`close.ts`) already export `{ action, loader }`. 
Fix: destructure and export both. - `ensureRunForSession`'s pathological "lost the claim race AND the winner's run was already terminal" branch recursed without bound. In practice progress through the run engine bounds it, but a misconfigured task that crashes before being dequeued could blow the stack. Add a hidden `_attempt` counter, throw `SessionRunManagerError` once it exceeds 3. - `sessionsReplicationService.test.ts` was failing in CI because the sessions-as-run-manager schema migration made `taskIdentifier` and `triggerConfig` required on `Session`. The two `prisma.session.create` calls in the test predate the migration. Add the now-required fields to both fixtures. --- ...i.v1.sessions.$session.end-and-continue.ts | 4 +-- .../realtime/sessionRunManager.server.ts | 33 +++++++++++++++++-- .../test/sessionsReplicationService.test.ts | 7 ++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts b/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts index b878716f732..c33ebb755bb 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts @@ -27,7 +27,7 @@ const ParamsSchema = z.object({ // (PRIVATE) bypasses authorization; a browser holding the session PAT // can also reach this endpoint, which is fine: if you have the session // PAT, you own the chat. 
-const { action } = createActionApiRoute( +const { action, loader } = createActionApiRoute( { params: ParamsSchema, body: EndAndContinueSessionRequestBody, @@ -129,4 +129,4 @@ const { action } = createActionApiRoute( } ); -export { action }; +export { action, loader }; diff --git a/apps/webapp/app/services/realtime/sessionRunManager.server.ts b/apps/webapp/app/services/realtime/sessionRunManager.server.ts index 3a681f84e94..71850589514 100644 --- a/apps/webapp/app/services/realtime/sessionRunManager.server.ts +++ b/apps/webapp/app/services/realtime/sessionRunManager.server.ts @@ -25,6 +25,17 @@ export type SessionTriggerConfig = z.infer; export type EnsureRunReason = "initial" | "continuation" | "upgrade" | "manual"; +/** + * Hard cap on how many times `ensureRunForSession` will recurse on the + * pathological "we lost the claim race AND the winner's run was already + * terminal" path. In practice progress through the run engine bounds + * this, but a misconfigured task that crashes before it can be dequeued + * could otherwise loop without limit. After this many attempts we + * surface `SessionRunManagerError` so the caller can 5xx instead of + * blowing the stack. + */ +const ENSURE_RUN_FOR_SESSION_MAX_ATTEMPTS = 3; + type EnsureRunForSessionParams = { /** * Session row to operate on. Caller is responsible for the env match — @@ -42,6 +53,14 @@ type EnsureRunForSessionParams = { * "preload"` vs `"trigger"`, etc). */ payloadOverrides?: Record; + /** + * @internal Recursion-guard counter for the lost-claim-race retry path. + * Public callers should leave this unset; the function recurses with + * an incremented value on the pathological "winner's run was already + * terminal" branch and throws once it exceeds + * {@link ENSURE_RUN_FOR_SESSION_MAX_ATTEMPTS}. 
+ */ + _attempt?: number; }; export type EnsureRunResult = { @@ -69,7 +88,13 @@ export type EnsureRunResult = { export async function ensureRunForSession( params: EnsureRunForSessionParams ): Promise { - const { session, environment, reason, payloadOverrides } = params; + const { session, environment, reason, payloadOverrides, _attempt = 1 } = params; + + if (_attempt > ENSURE_RUN_FOR_SESSION_MAX_ATTEMPTS) { + throw new SessionRunManagerError( + `ensureRunForSession exceeded ${ENSURE_RUN_FOR_SESSION_MAX_ATTEMPTS} attempts for session ${session.id} — every triggered run reached a terminal state before claim could resolve` + ); + } // 1. Probe currentRunId. if (session.currentRunId) { @@ -153,13 +178,15 @@ export async function ensureRunForSession( } // Pathological: winner's run already terminal. Recurse with the fresh - // version. Bounded by run-engine progress — if every triggered run - // dies instantly we'll loop, but that's a deeper bug worth surfacing. + // version. Bounded by `ENSURE_RUN_FOR_SESSION_MAX_ATTEMPTS` so a task + // that always crashes before being dequeued surfaces as an error + // instead of a stack overflow. 
return ensureRunForSession({ session: fresh, environment, reason, payloadOverrides, + _attempt: _attempt + 1, }); } diff --git a/apps/webapp/test/sessionsReplicationService.test.ts b/apps/webapp/test/sessionsReplicationService.test.ts index f6d8d4ba8b1..3a16ce4471a 100644 --- a/apps/webapp/test/sessionsReplicationService.test.ts +++ b/apps/webapp/test/sessionsReplicationService.test.ts @@ -74,6 +74,9 @@ describe("SessionsReplicationService", () => { environmentType: "DEVELOPMENT", organizationId: organization.id, taskIdentifier: "my-agent", + triggerConfig: { + basePayload: { messages: [], trigger: "preload" }, + }, tags: ["user:42", "plan:pro"], metadata: { plan: "pro", seats: 3 }, }, @@ -174,6 +177,10 @@ describe("SessionsReplicationService", () => { runtimeEnvironmentId: environment.id, environmentType: "DEVELOPMENT", organizationId: organization.id, + taskIdentifier: "my-agent", + triggerConfig: { + basePayload: { messages: [], trigger: "preload" }, + }, }, }); From c3971602c174e96c4a976dd027d043d6d40dfa72 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 27 Apr 2026 18:05:52 +0100 Subject: [PATCH 14/23] fix(webapp): address #3417 PR review feedback (round 2) Two fixes from Devin review on the sessions-as-run-manager commit: - `SessionItem.currentRunId`'s contract is the `run_*` friendlyId, but `serializeSession` returns the raw Prisma cuid. The `POST /sessions` create path overrides correctly via a TaskRun lookup, but GET, PATCH, and the three return paths in close.ts were passing the cuid through. A consumer using `currentRunId` from those endpoints in a downstream `GET /api/v1/runs/:runId` call would 404. Add a `serializeSessionWithFriendlyRunId` helper next to `serializeSession` that resolves via `$replica.taskRun.findFirst` (TaskRun friendlyIds are immutable, so replica lag is harmless), and switch the five affected return sites to use it. List endpoints stay on `serializeSession` to avoid N+1 lookups when paginating. 
The create endpoint keeps its existing manual lookup because it also needs the friendlyId for the response's `runId` field, and `session.currentRunId` is stale relative to the post-`ensureRunForSession` claim outcome. - Drop dead `lastChunkType` recomputation in `streamResponseFromSessionStream`. The variable was bound but never used; the conditional below it re-evaluated the same expression. Use the bound value in the condition. --- .../routes/api.v1.sessions.$session.close.ts | 14 +++++-- .../app/routes/api.v1.sessions.$session.ts | 10 +++-- .../realtime/s2realtimeStreams.server.ts | 6 +-- .../app/services/realtime/sessions.server.ts | 37 +++++++++++++++++-- 4 files changed, 51 insertions(+), 16 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.close.ts b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts index 9ea86827d42..16d8a6d93d1 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.close.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.close.ts @@ -7,7 +7,7 @@ import { z } from "zod"; import { $replica, prisma } from "~/db.server"; import { resolveSessionByIdOrExternalId, - serializeSession, + serializeSessionWithFriendlyRunId, } from "~/services/realtime/sessions.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; @@ -43,7 +43,9 @@ const { action, loader } = createActionApiRoute( // Idempotent: if already closed, return the current row without clobbering // the original closedAt / closedReason. 
if (existing.closedAt) { - return json(serializeSession(existing)); + return json( + await serializeSessionWithFriendlyRunId(existing) + ); } // `closedAt: null` on the where clause makes the update conditional at @@ -61,12 +63,16 @@ const { action, loader } = createActionApiRoute( if (count === 0) { const final = await prisma.session.findFirst({ where: { id: existing.id } }); if (!final) return json({ error: "Session not found" }, { status: 404 }); - return json(serializeSession(final)); + return json( + await serializeSessionWithFriendlyRunId(final) + ); } const updated = await prisma.session.findFirst({ where: { id: existing.id } }); if (!updated) return json({ error: "Session not found" }, { status: 404 }); - return json(serializeSession(updated)); + return json( + await serializeSessionWithFriendlyRunId(updated) + ); } ); diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.ts b/apps/webapp/app/routes/api.v1.sessions.$session.ts index 02b74dc3b73..800ee32b99b 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.ts @@ -8,7 +8,7 @@ import { z } from "zod"; import { $replica, prisma } from "~/db.server"; import { resolveSessionByIdOrExternalId, - serializeSession, + serializeSessionWithFriendlyRunId, } from "~/services/realtime/sessions.server"; import { createActionApiRoute, @@ -34,7 +34,9 @@ export const loader = createLoaderApiRoute( }, }, async ({ resource: session }) => { - return json(serializeSession(session)); + return json( + await serializeSessionWithFriendlyRunId(session) + ); } ); @@ -80,7 +82,9 @@ const { action } = createActionApiRoute( }, }); - return json(serializeSession(updated)); + return json( + await serializeSessionWithFriendlyRunId(updated) + ); } catch (error) { // A duplicate externalId in the same environment violates the // `(runtimeEnvironmentId, externalId)` unique constraint. 
Surface that diff --git a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts index ecc7a63c66a..13edae93244 100644 --- a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts +++ b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts @@ -329,11 +329,7 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { lastChunk != null && typeof lastChunk === "object" ? (lastChunk as { type?: unknown }).type : null; - if ( - lastChunk != null && - typeof lastChunk === "object" && - (lastChunk as { type?: unknown }).type === "trigger:turn-complete" - ) { + if (lastChunkType === "trigger:turn-complete") { settled = true; waitSeconds = 0; } diff --git a/apps/webapp/app/services/realtime/sessions.server.ts b/apps/webapp/app/services/realtime/sessions.server.ts index 82e2fda53f1..594d417292c 100644 --- a/apps/webapp/app/services/realtime/sessions.server.ts +++ b/apps/webapp/app/services/realtime/sessions.server.ts @@ -1,5 +1,6 @@ import type { PrismaClient, Session } from "@trigger.dev/database"; import type { SessionItem } from "@trigger.dev/core/v3"; +import { $replica } from "~/db.server"; /** * Prefix that {@link SessionId.generate} attaches to every Session friendlyId. @@ -74,10 +75,10 @@ export function canonicalSessionAddressingKey( * * Note: `currentRunId` is left as-is — Prisma stores the internal run id * (cuid), but `SessionItem.currentRunId` is the *friendly* form. Routes - * that emit `SessionItem` are responsible for resolving the friendlyId - * (typically via a separate TaskRun lookup) and overriding the field. - * `serializeSession` returns it raw so list endpoints don't pay an N+1 - * lookup just to surface the friendly form. + * that emit a single `SessionItem` should use + * {@link serializeSessionWithFriendlyRunId} instead, which resolves the + * friendlyId via a TaskRun lookup. 
List endpoints stay on this raw form + * to avoid N+1 lookups when paginating. */ export function serializeSession(session: Session): SessionItem { return { @@ -96,3 +97,31 @@ export function serializeSession(session: Session): SessionItem { updatedAt: session.updatedAt, }; } + +/** + * Same as {@link serializeSession} but resolves `currentRunId` from the + * internal cuid to the public `run_*` friendlyId via a TaskRun lookup. + * Single-row endpoints (`POST/GET/PATCH/close /api/v1/sessions/:s`) use + * this so the wire-side `currentRunId` is consistent with the rest of + * the public API (which only accepts friendlyIds for run lookups). + * + * Skips the lookup when `currentRunId` is null. The read goes through + * `$replica` — a TaskRun's `friendlyId` is immutable so replica lag is + * harmless, and serializing on the writer would just add hot-path load. + */ +export async function serializeSessionWithFriendlyRunId( + session: Session +): Promise { + const base = serializeSession(session); + if (!session.currentRunId) return base; + + const run = await $replica.taskRun.findFirst({ + where: { id: session.currentRunId }, + select: { friendlyId: true }, + }); + + return { + ...base, + currentRunId: run?.friendlyId ?? null, + }; +} From 4b15c7d6b1738803a9e1f0e89f536f8217318e73 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 27 Apr 2026 18:14:58 +0100 Subject: [PATCH 15/23] chore(server-changes): consolidate sessions PR into one entry Collapse `session-out-settled-signal.md` and `sessions-public-api-cors.md` into the single `session-primitive.md`, and rewrite that one to a high-level two-sentence summary that covers everything actually shipping in this PR (sessions-as-run-manager, end-and-continue, waitpoints, etc.). The CORS/JWT-on-create story is also out of date now that POST /api/v1/sessions is secret-key only.
--- .server-changes/session-out-settled-signal.md | 10 ---------- .server-changes/session-primitive.md | 4 +--- .server-changes/sessions-public-api-cors.md | 11 ----------- 3 files changed, 1 insertion(+), 24 deletions(-) delete mode 100644 .server-changes/session-out-settled-signal.md delete mode 100644 .server-changes/sessions-public-api-cors.md diff --git a/.server-changes/session-out-settled-signal.md b/.server-changes/session-out-settled-signal.md deleted file mode 100644 index 519871af695..00000000000 --- a/.server-changes/session-out-settled-signal.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -area: webapp -type: improvement ---- - -`/realtime/v1/sessions/:session/out` accepts an opt-in `X-Peek-Settled: 1` request header. When set, the route peeks the tail record in S2 before proxying; if the last chunk is `trigger:turn-complete`, it switches the downstream read to `wait=0` and returns `X-Session-Settled: true` so the SSE drains-and-closes in ~1s instead of long-polling for 60s. - -Without the header, the route behaves exactly as before the settled work — unconditional `wait=60`. This matters because the peek races a newly-triggered turn's first chunk: the active `sendMessages → subscribeToSessionStream` path would otherwise see the previous turn's `trigger:turn-complete` at the tail and close the SSE before the new turn's chunks land on S2. The smoke test confirmed this race was failing every turn-2 response. - -`TriggerChatTransport.reconnectToStream` opts in via the header (that's the reload-on-a-settled-chat case where the fast close is a real UX win). Active send paths don't set the header and keep long-poll semantics. 
diff --git a/.server-changes/session-primitive.md b/.server-changes/session-primitive.md index 3bb9481a0ee..a4d8b606ee2 100644 --- a/.server-changes/session-primitive.md +++ b/.server-changes/session-primitive.md @@ -3,6 +3,4 @@ area: webapp type: feature --- -Add `Session` primitive — a durable, typed, bidirectional I/O primitive that outlives a single run, intended for agent/chat use cases. Ships the Postgres schema (`Session` table), control-plane CRUD routes (`POST/GET/PATCH /api/v1/sessions`, `POST /api/v1/sessions/:session/close` — polymorphic on friendlyId or externalId), `sessions` JWT scope, ClickHouse `sessions_v1` table, and `SessionsReplicationService` (logical replication from Postgres `Session` → ClickHouse `sessions_v1`). Run-scoped realtime streams (`streams.pipe`/`streams.input`) are unchanged and do **not** create Session rows. - -Adds `POST /api/v1/runs/:runFriendlyId/session-streams/wait` (session-stream waitpoint creation) and wires `POST /realtime/v1/sessions/:session/:io/append` to fire any pending waitpoints on the channel. Gives `session.in` run-engine waitpoint semantics matching run-scoped input streams: a task can suspend while idle on a session channel and resume when an external client sends a record. Redis-backed pending-waitpoint set (`ssw:{sessionFriendlyId}:{io}`) is drained atomically on each append so multiple concurrent waiters (e.g. multi-tab chat) all resume together. +Add the `Session` primitive — a durable, task-bound, bidirectional I/O channel that outlives a single run and acts as the run manager for `chat.agent`. Ships the Postgres `Session` + `SessionRun` tables, ClickHouse `sessions_v1` + replication service, the `sessions` JWT scope, and the public CRUD + realtime routes (`/api/v1/sessions`, `/realtime/v1/sessions/:session/:io`) including `end-and-continue` for server-orchestrated run handoffs and session-stream waitpoints. 
diff --git a/.server-changes/sessions-public-api-cors.md b/.server-changes/sessions-public-api-cors.md deleted file mode 100644 index f2047e634eb..00000000000 --- a/.server-changes/sessions-public-api-cors.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -area: webapp -type: fix ---- - -CORS + preflight parity on the public session API so browser-side chat transports can hit the session endpoints without being blocked: - -- `POST /api/v1/sessions` (session upsert) gains `allowJWT: true` + `corsStrategy: "all"` so PATs minted by `chat.createTriggerAction` (and other browser-side session flows) pass the route's auth + respond to CORS preflight. Previously this route only accepted secret-key auth, which broke any browser-originated `sessions.create(...)` call — including the transport's direct `accessToken` fallback path. -- `POST /realtime/v1/sessions/:session/:io/append` now exports both `{ action, loader }`. The route builder installs the OPTIONS preflight handler on the `loader` even for write-only routes; without the loader export, the CORS preflight was returning 400 ("No loader for route") and Chrome treated the follow-up `POST` as `net::ERR_FAILED`. - -Validated by an end-to-end UI smoke against the `references/ai-chat` app: brand-new chat → send → streamed assistant reply in ~4s → follow-up turn on the same session → `lastEventId` advances from 10 → 21. From c0e87bf12ce3fce47e00c6a1171e3aeb014897d5 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 09:27:18 +0100 Subject: [PATCH 16/23] fix(webapp): use prisma writer for read-after-write of triggered run friendlyId Switch the two read-after-write taskRun lookups (POST /api/v1/sessions and POST /api/v1/sessions/:s/end-and-continue) from $replica back to prisma. Both reads happen immediately after triggering a run on the writer; replica lag would null the result and turn a successful create into a 500, or fall back to leaking the internal cuid in the end-and-continue response. 
--- .../api.v1.sessions.$session.end-and-continue.ts | 11 +++++++---- apps/webapp/app/routes/api.v1.sessions.ts | 7 ++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts b/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts index c33ebb755bb..cdc9c9e8dc7 100644 --- a/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts +++ b/apps/webapp/app/routes/api.v1.sessions.$session.end-and-continue.ts @@ -4,7 +4,7 @@ import { type EndAndContinueSessionResponseBody, } from "@trigger.dev/core/v3"; import { z } from "zod"; -import { $replica } from "~/db.server"; +import { $replica, prisma } from "~/db.server"; import { logger } from "~/services/logger.server"; import { swapSessionRun } from "~/services/realtime/sessionRunManager.server"; import { resolveSessionByIdOrExternalId } from "~/services/realtime/sessions.server"; @@ -107,9 +107,12 @@ const { action, loader } = createActionApiRoute( reason, }); - // The swap stored a TaskRun.id (cuid) in `currentRunId`; surface - // the friendlyId for parity with the rest of the public API. - const run = await $replica.taskRun.findFirst({ + // Read-after-write: the swap just triggered (or claimed) the + // run on the writer, so read it from `prisma` rather than + // `$replica`. A replica miss here would silently fall back to + // returning the internal cuid, which the public API contract + // says is a friendlyId. + const run = await prisma.taskRun.findFirst({ where: { id: result.runId }, select: { friendlyId: true }, }); diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index f970a127238..bf6f2892243 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -174,9 +174,10 @@ const { action } = createActionApiRoute( reason: isCached ? 
"continuation" : "initial", }); - // The newly triggered run's friendlyId, looked up via Prisma — we - // need the friendly form for the wire response. - const run = await $replica.taskRun.findFirst({ + // Read-after-write: the run was just triggered in this request, + // so go to the writer rather than $replica. Replica lag here + // would null this out and turn a successful create into a 500. + const run = await prisma.taskRun.findFirst({ where: { id: ensureResult.runId }, select: { friendlyId: true }, }); From e5f9dd175b993171cfde2e237c3aa73e7902e9b1 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 09:54:13 +0100 Subject: [PATCH 17/23] fix(webapp): use prisma writer for post-race re-read of session row in sessionRunManager The lost-race re-read in ensureRunForSession and swapSessionRun reads the Session row that the winner just wrote on the writer. Reading from $replica could return pre-race state and either (1) cause ensureRunForSession to recurse with a stale currentRunVersion, fail the next claim, and waste runs until max-attempts; or (2) cause swapSessionRun to return swapped: false with the calling run's own id, misleading the caller into thinking it is still authoritative. --- .../services/realtime/sessionRunManager.server.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/services/realtime/sessionRunManager.server.ts b/apps/webapp/app/services/realtime/sessionRunManager.server.ts index 71850589514..fb1b071817e 100644 --- a/apps/webapp/app/services/realtime/sessionRunManager.server.ts +++ b/apps/webapp/app/services/realtime/sessionRunManager.server.ts @@ -153,7 +153,11 @@ export async function ensureRunForSession( }); }); - const fresh = await $replica.session.findFirst({ + // Read-after-write: the winner just wrote `currentRunId` / + // `currentRunVersion` on the writer. 
Reading from `$replica` could + // return pre-race state and cause us to recurse with the same stale + // version, losing the next claim, until we exhaust max attempts. + const fresh = await prisma.session.findFirst({ where: { id: session.id }, select: { id: true, @@ -327,7 +331,12 @@ export async function swapSessionRun( }); }); - const fresh = await $replica.session.findFirst({ + // Read-after-write: the winner's swap was just committed on the + // writer. A replica read could return the pre-swap `currentRunId` + // (often `callingRunId` itself), which would tell the caller it is + // still the canonical run when in fact a different run has taken + // over. + const fresh = await prisma.session.findFirst({ where: { id: session.id }, select: { currentRunId: true }, }); From 554940d9dbbaf13257e3c234287a67c421d4cb68 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 10:19:15 +0100 Subject: [PATCH 18/23] fix(webapp): parse stringified chunk envelope in peek-settled fast path The S2 record envelope wraps the agent-written chunk as {data: , id: partId} because StreamsWriterV2 hands appendPart an already-stringified chunk. The peek-settled check treated envelope.data as an object, so typeof === 'object' always returned false and the trigger:turn-complete sentinel was never matched. Reconnect-on-reload silently degraded to the full long-poll path. Parse envelope.data once more so the type discriminator is surfaced. 
--- .../realtime/s2realtimeStreams.server.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts index 13edae93244..46c7f3854a1 100644 --- a/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts +++ b/apps/webapp/app/services/realtime/s2realtimeStreams.server.ts @@ -399,12 +399,18 @@ export class S2RealtimeStreams implements StreamResponder, StreamIngestor { }; const record = json.records?.[0]; if (!record) return null; - // The record body is a JSON string `{data: , id: partId}` - // where `` is the raw UIMessageChunk object (see - // `StreamsWriterV2` — the agent-side writer serializes the chunk - // object directly, not double-encoded). Unwrap the envelope and - // return `data` as-is. + // The record body is a JSON string `{data: , id: partId}`. + // The agent-side writer (`StreamsWriterV2`) hands `appendPart` an + // already-JSON-stringified chunk, so `data` round-trips as a string, + // not an object. Parse it once more to surface the chunk shape. const envelope = JSON.parse(record.body) as { data: unknown; id: string }; + if (typeof envelope.data === "string") { + try { + return JSON.parse(envelope.data); + } catch { + return envelope.data; + } + } return envelope.data; } catch (err) { this.logger.warn("S2 peek last record: parse failed", { err, stream: s2Stream }); From cf67175f033a6b5e8f67d902f8cbdd743ea22fec Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 10:55:22 +0100 Subject: [PATCH 19/23] fix(webapp): use prisma writer for cancelLostRaceRun's just-triggered run lookup Same read-after-write pattern as the other lost-race re-reads: the run was just triggered on the writer milliseconds before, so a $replica.findFirst can return null due to replication lag. The null silently no-ops the cancellation and leaks an orphan run that no session will ever claim. 
--- .../app/services/realtime/sessionRunManager.server.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/webapp/app/services/realtime/sessionRunManager.server.ts b/apps/webapp/app/services/realtime/sessionRunManager.server.ts index fb1b071817e..58513460b14 100644 --- a/apps/webapp/app/services/realtime/sessionRunManager.server.ts +++ b/apps/webapp/app/services/realtime/sessionRunManager.server.ts @@ -362,10 +362,10 @@ async function cancelLostRaceRun( environment: AuthenticatedEnvironment ): Promise { const service = new CancelTaskRunService(); - // Resolve to a TaskRun reference — CancelTaskRunService takes the run - // object, not the id. Read from the replica; the actual cancellation - // write happens inside the service. - const run = await $replica.taskRun.findFirst({ where: { id: runId } }); + // Read-after-write: the run was just triggered on the writer, so go + // through `prisma`. A `$replica` miss here would silently no-op the + // cancel and leak an orphan run that no session is going to claim. + const run = await prisma.taskRun.findFirst({ where: { id: runId } }); if (!run) return; await service.call(run, { reason: "Lost session-run claim race" }); } From faf7888135aa94cb4d78f5a21b4a92bb1ffbbad2 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 11:07:46 +0100 Subject: [PATCH 20/23] fix(webapp): reject create on closed sessions with 409 When the upsert path returns a previously-closed row, return 409 before ensureRunForSession fires. Otherwise we'd trigger a fresh run on a closed session that can't receive .in input (append handler rejects writes to closed sessions), wasting compute on a run that exits the moment it tries to read. close is one-way; callers must use a different externalId to start a new session. 
--- apps/webapp/app/routes/api.v1.sessions.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/apps/webapp/app/routes/api.v1.sessions.ts b/apps/webapp/app/routes/api.v1.sessions.ts index bf6f2892243..6251ff3ac38 100644 --- a/apps/webapp/app/routes/api.v1.sessions.ts +++ b/apps/webapp/app/routes/api.v1.sessions.ts @@ -164,6 +164,19 @@ const { action } = createActionApiRoute( }); } + // Reject create on a closed session. The upsert path will return + // an already-closed row when the caller reuses an externalId, and + // without this guard `ensureRunForSession` would trigger a fresh + // run that can't receive `.in` input (the append handler 409s on + // closed sessions). Force the caller to use a different externalId + // — `close` is one-way. + if (session.closedAt) { + return json( + { error: "Session is closed; use a different externalId to create a new session" }, + { status: 409 } + ); + } + // Session is task-bound — every session has a live run by // construction. `ensureRunForSession` is idempotent: on the // cached path it sees `currentRunId` is alive and returns it From 1a880fd045e9b7d6441dfb8b545c4a7ec4f14260 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 11:23:42 +0100 Subject: [PATCH 21/23] fix(webapp): hardcode v2 for session-streams wait race-check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The race-check in api.v1.runs.$runFriendlyId.session-streams.wait was selecting the realtime stream instance via run.realtimeStreamsVersion, but session streams are always v2 (S2) — the writer (appendPartToSessionStream) and the SSE subscribe both hardcode v2. For a v1 run the race-check silently fell back to a non-S2 instance, the instanceof check missed, and the optimization was skipped. Hardcode v2 for parity with the rest of the session surface. 
--- .../api.v1.runs.$runFriendlyId.session-streams.wait.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts index 8684cf3984e..18034caab47 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts @@ -123,10 +123,12 @@ const { action, loader } = createActionApiRoute( // and remove the pending registration. if (!result.isCached) { try { - const realtimeStream = getRealtimeStreamInstance( - authentication.environment, - run.realtimeStreamsVersion - ); + // Session streams are always v2 (S2) — the writer in + // `appendPartToSessionStream` and the SSE subscribe both + // hardcode "v2", so the race-check reader has to match. + // Don't fall through to the run's own `realtimeStreamsVersion`, + // which only describes the run's run-scoped streams. + const realtimeStream = getRealtimeStreamInstance(authentication.environment, "v2"); if (realtimeStream instanceof S2RealtimeStreams) { const records = await realtimeStream.readSessionStreamRecords( From 0caa55afa7ab9858eae7f30cfc1f09ff131eba57 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 11:26:11 +0100 Subject: [PATCH 22/23] fix(webapp): mask 404 as 403 when findResource returns null on authorized routes createActionApiRoute now runs findResource before authorization so the auth scope check can expand to alternate identifiers of the resolved resource (Sessions are addressable by both friendlyId and externalId). Side-effect: an authenticated-but-underscoped caller could probe resource existence by observing 404 vs 403. Mask the 404 as 403 with the same response shape as the auth-failed branch when the route declares authorization, so the two cases are indistinguishable to callers without scopes. 
Routes without authorization keep returning 404. --- .../routeBuilders/apiBuilder.server.ts | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts index 10ed2c3bff0..4589507e9b7 100644 --- a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts +++ b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts @@ -683,6 +683,26 @@ export function createActionApiRoute< : undefined; if (options.findResource && !resource) { + // When the route also declares `authorization`, mask "resource + // doesn't exist" as 403 — same shape as the auth-failed branch + // below — so an authenticated-but-underscoped caller can't + // probe resource existence by observing 404 vs 403. Routes + // without an `authorization` block keep returning 404. + if (authorization) { + return await wrapResponse( + request, + json( + { + error: `Unauthorized: missing required scopes`, + code: "unauthorized", + param: "access_token", + type: "authorization", + }, + { status: 403 } + ), + corsStrategy !== "none" + ); + } return await wrapResponse( request, json({ error: "Resource not found" }, { status: 404 }), From 188fa43694201a8a4068b7461cacb173b2f9aeb9 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 28 Apr 2026 12:15:50 +0100 Subject: [PATCH 23/23] fix(webapp): only mask 404 as 403 when authorization fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous fix unconditionally returned 403 when findResource was null on a route with authorization, breaking PRIVATE-key callers (e.g. server SDK) hitting the existing api.v2.runs.cancel route — they always pass authorization but the new code returned 403 with a factually wrong message ('Unauthorized: missing required scopes') even though they had full permissions. 
New ordering: run authorization first (with the resolved resource as the 5th arg, so cross-form session auth still works), then check resource-null → 404. This gives: - PRIVATE key + missing resource: auth passes → 404 (correct) - Underscoped JWT + missing resource: auth fails (resource not in scope) → 403 (no info leak vs existing resource) - Underscoped JWT + existing resource: auth fails → 403 (unchanged) Only auth callbacks that destructure the resource (loader for realtime.v1.sessions.$session.$io) need to handle null — they all already do, since findResource was already nullable in pre-PR loaders. --- .../routeBuilders/apiBuilder.server.ts | 45 +++++++------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts index 4589507e9b7..aae3c7ff54e 100644 --- a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts +++ b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts @@ -682,34 +682,15 @@ export function createActionApiRoute< ? await options.findResource(parsedParams, authenticationResult, parsedSearchParams) : undefined; - if (options.findResource && !resource) { - // When the route also declares `authorization`, mask "resource - // doesn't exist" as 403 — same shape as the auth-failed branch - // below — so an authenticated-but-underscoped caller can't - // probe resource existence by observing 404 vs 403. Routes - // without an `authorization` block keep returning 404. 
- if (authorization) { - return await wrapResponse( - request, - json( - { - error: `Unauthorized: missing required scopes`, - code: "unauthorized", - param: "access_token", - type: "authorization", - }, - { status: 403 } - ), - corsStrategy !== "none" - ); - } - return await wrapResponse( - request, - json({ error: "Resource not found" }, { status: 404 }), - corsStrategy !== "none" - ); - } - + // Run authorization first — but with the resolved resource available + // as the 5th arg so the auth scope check can expand to alternate + // identifiers of the same row (e.g. a Session is addressable by both + // `friendlyId` and `externalId`). Resource-null is checked AFTER auth + // so: + // - underscoped JWT + missing resource → 403 (no info leak) + // - underscoped JWT + existing resource → 403 (existing behavior) + // - PRIVATE key + missing resource → auth passes → 404 (correct) + // - PRIVATE key + existing resource → auth passes → handler runs if (authorization) { const { action, resource: authResource, superScopes } = authorization; const $resource = authResource( @@ -751,6 +732,14 @@ export function createActionApiRoute< } } + if (options.findResource && !resource) { + return await wrapResponse( + request, + json({ error: "Resource not found" }, { status: 404 }), + corsStrategy !== "none" + ); + } + const result = await handler({ params: parsedParams, searchParams: parsedSearchParams,