From 5d8fa5d2d09ad06652be7607c04a494a27579b4a Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 13:52:16 -0700 Subject: [PATCH 1/9] fix: harden truth and coverage gates --- README.md | 9 + apps/dashboard/vitest.config.mts | 4 +- apps/desktop/scripts/playwright-tempdir.mjs | 2 +- .../src/components/chain/ChainPanel.test.tsx | 8 +- .../copilot/DesktopCopilotPanel.test.tsx | 98 ++++++ .../desktop/src/hooks/useDesktopData.test.tsx | 22 +- apps/desktop/src/lib/desktopUi.test.ts | 17 +- apps/desktop/src/lib/uiError.test.ts | 10 +- .../src/pages/AgentsRoleConfigPanel.test.tsx | 174 +++++++++ apps/desktop/src/pages/EventsPage.test.tsx | 4 +- apps/desktop/src/pages/ReviewsPage.test.tsx | 6 +- apps/desktop/src/pages/TestsPage.test.tsx | 7 +- .../pages/coverage_sprint_f_pages.test.tsx | 8 +- .../pages/workflow_queue_controls.test.tsx | 2 +- .../src/cortexpilot_orch/gates/tests_gate.py | 53 ++- .../services/control_plane_read_service.py | 17 +- .../tests/test_bench_e2e_speed_gate.py | 100 ++++++ .../tests/test_control_plane_read_service.py | 330 ++++++++++++++++++ .../test_mcp_queue_pilot_server_branches.py | 132 +++++++ .../tests/test_repo_coverage_gate.py | 40 +++ .../tests/test_tests_gate_extended.py | 57 +++ configs/env.registry.json | 36 ++ .../storefront/benchmark-methodology.md | 18 + package.json | 7 +- scripts/check_bench_e2e_speed_gate.py | 135 +++++++ scripts/repo_coverage_gate.py | 44 ++- 26 files changed, 1272 insertions(+), 68 deletions(-) create mode 100644 apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx create mode 100644 apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx create mode 100644 apps/orchestrator/tests/test_bench_e2e_speed_gate.py create mode 100644 apps/orchestrator/tests/test_control_plane_read_service.py create mode 100644 apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py create mode 100644 apps/orchestrator/tests/test_repo_coverage_gate.py create mode 100644 scripts/check_bench_e2e_speed_gate.py diff --git a/README.md b/README.md index 8ba5279..9126d8c 100644 --- a/README.md +++ b/README.md @@ -526,6 +526,8 @@ Default local verification path: npm run ci npm run test:quick npm run test +npm run mutation:gate +npm run bench:e2e:speed:gate ``` `npm run ci` is now the hosted-aligned local fast gate. Use @@ -533,6 +535,13 @@ npm run test `npm run scan:workflow-security`, `npm run scan:trivy`, and `npm run security:scan:closeout` only when you intentionally want the stricter closeout/manual layers. +`npm run mutation:gate` is the root mutation entrypoint for the existing +Orchestrator mutation profiles, `npm run bench:e2e:speed:gate` is the +fail-closed benchmark gate that evaluates a real benchmark summary once a run +has produced one, and `npm run coverage:repo` now points to the active +coverage runner that prepares subproject dependencies before generating fresh +repo-level coverage receipts. Use `npm run coverage:repo:aggregate` only when +you intentionally want to re-aggregate already-existing coverage artifacts. Current CI contract has five layers only: diff --git a/apps/dashboard/vitest.config.mts b/apps/dashboard/vitest.config.mts index bc44f9e..adaab2e 100644 --- a/apps/dashboard/vitest.config.mts +++ b/apps/dashboard/vitest.config.mts @@ -27,7 +27,9 @@ if (pool !== requestedPool) { } const shouldEmitHtmlCoverage = !process.env.CI || process.env.CORTEXPILOT_COVERAGE_HTML === "1"; const coverageReporter = shouldEmitHtmlCoverage ? 
["text", "html", "json-summary"] : ["text", "json-summary"]; -const coverageReportsDirectory = path.resolve(process.cwd(), "coverage"); +const coverageReportsDirectory = process.env.CORTEXPILOT_DASHBOARD_COVERAGE_DIR + ? path.resolve(process.env.CORTEXPILOT_DASHBOARD_COVERAGE_DIR) + : path.resolve(process.cwd(), "coverage"); const coverageClean = !serialCoverageMode; const coverageProcessingConcurrency = serialCoverageMode ? 1 : undefined; const testTimeout = process.env.CI ? 45000 : 15000; diff --git a/apps/desktop/scripts/playwright-tempdir.mjs b/apps/desktop/scripts/playwright-tempdir.mjs index 3d5be06..7fa4c67 100644 --- a/apps/desktop/scripts/playwright-tempdir.mjs +++ b/apps/desktop/scripts/playwright-tempdir.mjs @@ -14,7 +14,7 @@ function sanitizeScope(scope) { function resolveTempRoot(scriptDir) { const runnerTemp = normalizeValue(process.env.RUNNER_TEMP); if (runnerTemp) return resolve(runnerTemp); - return resolve(scriptDir, "..", "..", "..", ".runtime-cache", "temp"); + return resolve(scriptDir, "..", "..", "..", ".runtime-cache", "cache", "tmp"); } export function configurePlaywrightTempDir(scope) { diff --git a/apps/desktop/src/components/chain/ChainPanel.test.tsx b/apps/desktop/src/components/chain/ChainPanel.test.tsx index 73e5dd4..25191f0 100644 --- a/apps/desktop/src/components/chain/ChainPanel.test.tsx +++ b/apps/desktop/src/components/chain/ChainPanel.test.tsx @@ -51,9 +51,9 @@ describe("ChainPanel", () => { /> ); - fireEvent.click(screen.getByRole("button", { name: "简洁视图" })); - fireEvent.click(screen.getByRole("button", { name: "详细视图" })); - fireEvent.click(screen.getByRole("button", { name: "Chain 优先" })); + fireEvent.click(screen.getByRole("button", { name: "Compact view" })); + fireEvent.click(screen.getByRole("button", { name: "Detailed view" })); + fireEvent.click(screen.getByRole("button", { name: "Chain first" })); expect(setChainDisplayMode).toHaveBeenCalledWith("compact"); expect(setChainDisplayMode).toHaveBeenCalledWith("detail"); @@ -79,7 +79,7 @@ describe("ChainPanel", () => { /> ); - const legend = screen.getByLabelText("节点状态说明"); + const legend = screen.getByLabelText("Node status legend"); const items = legend.querySelectorAll("li"); expect(items).toHaveLength(2); expect(items[0]).toHaveClass("is-active"); diff --git a/apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx b/apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx new file mode 100644 index 0000000..50e9059 --- /dev/null +++ b/apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx @@ -0,0 +1,98 @@ +import { fireEvent, render, screen, waitFor } from "@testing-library/react"; +import { describe, expect, it, vi } from "vitest"; + +import { DesktopCopilotPanel } from "./DesktopCopilotPanel"; + +describe("DesktopCopilotPanel", () => { + it("renders operator-brief truth surfaces and grounded takeaways after generation", async () => { + const loadBrief = vi.fn().mockResolvedValue({ + report_type: "operator_copilot_brief", + status: "AVAILABLE", + scope: "run_detail", + subject_id: "run-123", + summary: "The operator should compare the staged diff before accepting the run.", + likely_cause: "The last proof pack is stale.", + compare_takeaway: "Compare the staged diff against the last approved run.", + proof_takeaway: "Refresh the proof pack before asking for review.", + incident_takeaway: "Treat stale proof as an incident until it is re-generated.", + queue_takeaway: "Keep the queue paused until proof is current.", + approval_takeaway: "Approval should wait for a 
fresh proof receipt.", + used_truth_surfaces: ["run_detail", "", "proof_pack"], + limitations: ["review not started", " "], + recommended_actions: ["Refresh proof", "Request review", " "], + top_risks: ["stale-proof", "", "queue drift"], + }); + + render( + , + ); + + expect(screen.getByText("Only grounded control-plane truth belongs here.")).toBeInTheDocument(); + expect(screen.getByText("What is blocked?")).toBeInTheDocument(); + expect(screen.getByText("What should the operator do next?")).toBeInTheDocument(); + expect(screen.getByText("On demand")).toBeInTheDocument(); + + fireEvent.click(screen.getByRole("button", { name: "Generate operator brief" })); + + expect(await screen.findByText("Grounded brief")).toBeInTheDocument(); + expect(await screen.findByText("The operator should compare the staged diff before accepting the run.")).toBeInTheDocument(); + expect(screen.getByText("The last proof pack is stale.")).toBeInTheDocument(); + expect(screen.getByText("Scope: run_detail")).toBeInTheDocument(); + expect(screen.getByText("Subject: run-123")).toBeInTheDocument(); + expect(screen.getByText("Truth surfaces: run_detail | proof_pack")).toBeInTheDocument(); + expect(screen.getByText("Limitations: review not started")).toBeInTheDocument(); + expect(screen.getByText("Compare the staged diff against the last approved run.")).toBeInTheDocument(); + expect(screen.getByText("Keep the queue paused until proof is current.")).toBeInTheDocument(); + expect(screen.getByText("Refresh proof")).toBeInTheDocument(); + expect(screen.getByText("queue drift")).toBeInTheDocument(); + expect(screen.getByRole("button", { name: "Regenerate brief" })).toBeInTheDocument(); + + expect(loadBrief).toHaveBeenCalledTimes(1); + }); + + it("covers flight-plan fallback labels and empty action/risk lists", async () => { + const loadBrief = vi.fn().mockResolvedValue({ + report_type: "flight_plan_copilot_brief", + status: "UNAVAILABLE", + summary: "The plan is still advisory because execution has not started yet.", + risk_takeaway: "Approval is still blocked on a missing operator confirmation.", + capability_takeaway: "Runtime capability is unresolved until the runner binds.", + approval_takeaway: "An operator must confirm the start gate before execution.", + used_truth_surfaces: ["execution_plan_preview"], + recommended_actions: ["", " "], + top_risks: [], + limitations: undefined, + }); + + render(); + + fireEvent.click(screen.getByRole("button", { name: "Generate operator brief" })); + + expect(await screen.findByText("Unavailable")).toBeInTheDocument(); + expect(screen.getByText("Scope: flight_plan")).toBeInTheDocument(); + expect(screen.getByText("Subject: execution_plan_report")).toBeInTheDocument(); + expect(screen.getByText("Truth surfaces: execution_plan_preview")).toBeInTheDocument(); + expect(screen.getByText("Limitations: -")).toBeInTheDocument(); + expect(screen.getAllByText("Approval is still blocked on a missing operator confirmation.").length).toBeGreaterThan(0); + expect(screen.getByText("This brief stays advisory until a run actually starts.")).toBeInTheDocument(); + expect(screen.getByText("No recommended actions were returned.")).toBeInTheDocument(); + expect(screen.getByText("No explicit risks were returned.")).toBeInTheDocument(); + }); + + it("surfaces load failures without leaving the panel in generating state", async () => { + const loadBrief = vi.fn().mockRejectedValue("brief backend unavailable"); + + render(); + + fireEvent.click(screen.getByRole("button", { name: "Generate operator brief" })); 
+ + expect(await screen.findByText("brief backend unavailable")).toBeInTheDocument(); + await waitFor(() => { + expect(screen.getByRole("button", { name: "Generate operator brief" })).toBeEnabled(); + }); + }); +}); diff --git a/apps/desktop/src/hooks/useDesktopData.test.tsx b/apps/desktop/src/hooks/useDesktopData.test.tsx index 6d5ea46..df8e113 100644 --- a/apps/desktop/src/hooks/useDesktopData.test.tsx +++ b/apps/desktop/src/hooks/useDesktopData.test.tsx @@ -104,7 +104,9 @@ describe("useDesktopData", () => { const user = userEvent.setup(); render(); await waitFor(() => { - expect(screen.getByTestId("live-error")).toHaveTextContent("总览数据拉取失败"); + expect(screen.getByTestId("live-error")).toHaveTextContent( + "Failed to refresh overview data: the service is temporarily unavailable. Try again in a moment.", + ); }); overviewFail = false; @@ -131,7 +133,9 @@ describe("useDesktopData", () => { ); render(); await waitFor(() => { - expect(screen.getByTestId("live-error")).toHaveTextContent("会话列表拉取失败"); + expect(screen.getByTestId("live-error")).toHaveTextContent( + "Failed to refresh the session list: the service is temporarily unavailable. Try again in a moment.", + ); }); }); @@ -154,7 +158,9 @@ describe("useDesktopData", () => { ); render(); await waitFor(() => { - expect(screen.getByTestId("live-error")).toHaveTextContent("后端暂不可达,已进入退避重试"); + expect(screen.getByTestId("live-error")).toHaveTextContent( + "The backend is currently unreachable. Backoff retry is active and local actions can continue.", + ); }); }); @@ -233,7 +239,9 @@ describe("useDesktopData", () => { try { render(); await waitFor(() => { - expect(screen.getByTestId("live-error")).toHaveTextContent("当前网络离线,已暂停实时拉取。恢复联网后将自动重试。"); + expect(screen.getByTestId("live-error")).toHaveTextContent( + "The network is offline. Live polling is paused and will retry automatically when connectivity returns.", + ); }); } finally { Object.defineProperty(window.navigator, "onLine", { configurable: true, value: originalOnLine }); @@ -260,7 +268,9 @@ describe("useDesktopData", () => { render(); await waitFor(() => { - expect(screen.getByTestId("live-error")).toHaveTextContent("会话列表拉取失败:权限或认证异常,请确认登录状态。"); + expect(screen.getByTestId("live-error")).toHaveTextContent( + "Failed to refresh the session list: authentication or permission check failed. 
Confirm your sign-in state.", + ); }); }); @@ -380,7 +390,7 @@ describe("useDesktopData", () => { try { render(); await waitFor(() => { - expect(screen.getByTestId("live-error")).toHaveTextContent("策略告警拉取失败"); + expect(screen.getByTestId("live-error")).toHaveTextContent("Failed to refresh policy alerts"); }); expect(consoleSpy).toHaveBeenCalled(); } finally { diff --git a/apps/desktop/src/lib/desktopUi.test.ts b/apps/desktop/src/lib/desktopUi.test.ts index c3700f9..375fea2 100644 --- a/apps/desktop/src/lib/desktopUi.test.ts +++ b/apps/desktop/src/lib/desktopUi.test.ts @@ -49,7 +49,7 @@ describe("desktopUi seed timeline", () => { ), ); - fireEvent.click(screen.getByRole("button", { name: "查看完整 Diff" })); + fireEvent.click(screen.getByRole("button", { name: "View full diff" })); expect(onViewDiff).toHaveBeenCalledWith("report-1"); }); @@ -108,8 +108,8 @@ describe("desktopUi seed timeline", () => { render(createElement("div", null, renderChatEmbed(message as any, embed as any, chooseDecision))); - expect(screen.getByText("推荐")).toBeInTheDocument(); - fireEvent.click(screen.getByRole("button", { name: "选择" })); + expect(screen.getByText("Recommended")).toBeInTheDocument(); + fireEvent.click(screen.getByRole("button", { name: "Choose" })); expect(chooseDecision).toHaveBeenCalledWith("msg-decision", "decision-1", "fast"); }); @@ -157,10 +157,11 @@ describe("desktopUi seed timeline", () => { ) ); - expect(screen.getByText("任务:")).toBeInTheDocument(); - expect(screen.getAllByText("进行中")).toHaveLength(2); - expect(screen.getByText("等待")).toBeInTheDocument(); - expect(screen.getByText("完成")).toBeInTheDocument(); - expect(screen.getByLabelText("警报卡片")).toHaveClass("is-critical"); + expect(screen.getByText("Task:")).toBeInTheDocument(); + expect(screen.getByText("进行中")).toBeInTheDocument(); + expect(screen.getAllByText("In progress")).toHaveLength(1); + expect(screen.getByText("Waiting")).toBeInTheDocument(); + expect(screen.getByText("Done")).toBeInTheDocument(); + expect(screen.getByLabelText("Alert card")).toHaveClass("is-critical"); }); }); diff --git a/apps/desktop/src/lib/uiError.test.ts b/apps/desktop/src/lib/uiError.test.ts index 1232157..9e7bd56 100644 --- a/apps/desktop/src/lib/uiError.test.ts +++ b/apps/desktop/src/lib/uiError.test.ts @@ -7,13 +7,13 @@ describe("uiError", () => { }); it("maps network-style messages", () => { - expect(sanitizeUiError(new Error("Network timeout"), "加载失败")).toContain("未连接到本地服务"); - expect(sanitizeUiError(new Error("fetch failed"), "加载失败")).toContain("未连接到本地服务"); + expect(sanitizeUiError(new Error("Network timeout"), "Load failed")).toContain("unable to reach the local service"); + expect(sanitizeUiError(new Error("fetch failed"), "Load failed")).toContain("unable to reach the local service"); }); it("maps auth-style messages", () => { - expect(sanitizeUiError(new Error("401 unauthorized"), "加载失败")).toContain("权限或认证异常"); - expect(sanitizeUiError(new Error("token invalid"), "加载失败")).toContain("权限或认证异常"); + expect(sanitizeUiError(new Error("401 unauthorized"), "Load failed")).toContain("authentication or permission check failed"); + expect(sanitizeUiError(new Error("token invalid"), "Load failed")).toContain("authentication or permission check failed"); }); it("keeps generic fallback for unknown errors", () => { @@ -21,7 +21,7 @@ describe("uiError", () => { }); it("maps backend 5xx-style messages", () => { - expect(sanitizeUiError(new Error("API /path failed: 503"), "加载失败")).toContain("服务暂时不可用"); + expect(sanitizeUiError(new Error("API /path failed: 503"), "Load 
failed")).toContain("service is temporarily unavailable"); }); it("extracts detail from unknown payload", () => { diff --git a/apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx b/apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx new file mode 100644 index 0000000..e41e723 --- /dev/null +++ b/apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx @@ -0,0 +1,174 @@ +import { fireEvent, render, screen, waitFor } from "@testing-library/react"; +import { beforeEach, describe, expect, it, vi } from "vitest"; + +import { AgentsRoleConfigPanel } from "./AgentsRoleConfigPanel"; + +vi.mock("../lib/api", () => ({ + applyRoleConfig: vi.fn(), + fetchRoleConfig: vi.fn(), + mutationExecutionCapability: vi.fn(() => ({ executable: false, operatorRole: null })), + previewRoleConfig: vi.fn(), +})); + +import { applyRoleConfig, fetchRoleConfig, mutationExecutionCapability, previewRoleConfig } from "../lib/api"; + +function makeSurface(overrides: Record = {}) { + return { + persisted_source: "policies/role_config_registry.json", + execution_authority: "task_contract", + editable_now: { + system_prompt_ref: "policies/agents/codex/roles/20_planner_core.md", + skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner", + mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools", + runtime_binding: { + runner: "agents", + provider: "cliproxyapi", + model: "gpt-5.4", + }, + }, + ...overrides, + } as any; +} + +describe("AgentsRoleConfigPanel", () => { + beforeEach(() => { + vi.clearAllMocks(); + vi.mocked(mutationExecutionCapability).mockReturnValue({ executable: false, operatorRole: null } as any); + }); + + it("shows the empty-state desk when no roles are available", () => { + render(); + + expect(screen.getByRole("heading", { name: "Role configuration desk" })).toBeInTheDocument(); + expect(screen.getByText("No registered roles are available for configuration yet.")).toBeInTheDocument(); + }); + + it("supports preview mode and reports role-load failures when switching roles", async () => { + let resolveFirstFetch: (value: any) => void = () => {}; + vi.mocked(fetchRoleConfig) + .mockImplementationOnce(() => new Promise((resolve) => { + resolveFirstFetch = resolve; + }) as any) + .mockRejectedValueOnce("role config fetch failed"); + vi.mocked(previewRoleConfig).mockResolvedValue({ + changes: [ + { field: "runtime_binding.runner", current: "agents", next: "codex" }, + ], + preview_surface: { + runtime_capability: { + lane: "tool-capable-provider", + tool_execution: "available", + }, + }, + } as any); + + render( + , + ); + + expect(screen.getByText("Loading role configuration…")).toBeInTheDocument(); + + resolveFirstFetch(makeSurface()); + expect(await screen.findByText("Drive wave planning")).toBeInTheDocument(); + expect(screen.getByText("Preview only")).toBeInTheDocument(); + expect(screen.getByText("Preview is available, but saving defaults requires an operator role.")).toBeInTheDocument(); + + fireEvent.change(screen.getByLabelText("Runtime runner"), { target: { value: "codex" } }); + fireEvent.click(screen.getByRole("button", { name: "Preview defaults" })); + + await waitFor(() => { + expect(previewRoleConfig).toHaveBeenCalledWith("PLANNER", { + system_prompt_ref: "policies/agents/codex/roles/20_planner_core.md", + skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner", + mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools", + runtime_binding: { + runner: "codex", + provider: "cliproxyapi", + model: 
"gpt-5.4", + }, + }); + }); + await waitFor(() => { + expect(screen.getAllByText("Runtime runner").length).toBeGreaterThan(0); + }); + expect(screen.getByText("agents → codex")).toBeInTheDocument(); + expect(screen.getByText("tool-capable-provider")).toBeInTheDocument(); + + fireEvent.change(screen.getByLabelText("Select role for role configuration"), { target: { value: "REVIEWER" } }); + + expect(await screen.findByText("role config fetch failed")).toBeInTheDocument(); + expect(screen.getByText("No role purpose published yet.")).toBeInTheDocument(); + }); + + it("applies repo defaults when mutation execution is enabled", async () => { + const onApplied = vi.fn().mockResolvedValue(undefined); + + vi.mocked(fetchRoleConfig).mockResolvedValue(makeSurface()); + vi.mocked(mutationExecutionCapability).mockReturnValue({ executable: true, operatorRole: "OPS" } as any); + vi.mocked(previewRoleConfig).mockResolvedValue({ + changes: [], + preview_surface: { + runtime_capability: { + lane: "standard-provider-path", + tool_execution: "provider-path-required", + }, + }, + } as any); + vi.mocked(applyRoleConfig).mockResolvedValue({ + role: "PLANNER", + surface: makeSurface({ + editable_now: { + system_prompt_ref: "policies/agents/codex/roles/30_ops.md", + skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner", + mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools", + runtime_binding: { + runner: "codex", + provider: null, + model: null, + }, + }, + }), + } as any); + + render( + , + ); + + expect(await screen.findByText("Apply enabled for OPS")).toBeInTheDocument(); + + fireEvent.change(screen.getByLabelText("System prompt ref"), { + target: { value: " policies/agents/codex/roles/30_ops.md " }, + }); + fireEvent.change(screen.getByLabelText("Runtime runner"), { target: { value: "codex" } }); + fireEvent.change(screen.getByLabelText("Runtime provider"), { target: { value: " " } }); + fireEvent.change(screen.getByLabelText("Runtime model"), { target: { value: "" } }); + + fireEvent.click(screen.getByRole("button", { name: "Save repo defaults" })); + + await waitFor(() => { + expect(applyRoleConfig).toHaveBeenCalledWith("PLANNER", { + system_prompt_ref: "policies/agents/codex/roles/30_ops.md", + skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner", + mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools", + runtime_binding: { + runner: "codex", + provider: null, + model: null, + }, + }); + }); + + expect(await screen.findByText("Saved repo-owned defaults for PLANNER.")).toBeInTheDocument(); + expect(onApplied).toHaveBeenCalledTimes(1); + expect(screen.getByText("codex / Not set / Not set")).toBeInTheDocument(); + }); +}); diff --git a/apps/desktop/src/pages/EventsPage.test.tsx b/apps/desktop/src/pages/EventsPage.test.tsx index b4b21bb..f640fdc 100644 --- a/apps/desktop/src/pages/EventsPage.test.tsx +++ b/apps/desktop/src/pages/EventsPage.test.tsx @@ -30,7 +30,7 @@ describe("EventsPage", () => { const user = userEvent.setup(); render(); - const rowToggle = await screen.findByRole("button", { name: "查看事件详情 TEST_EVENT" }); + const rowToggle = await screen.findByRole("button", { name: "View event details TEST_EVENT" }); expect(rowToggle).toHaveAttribute("aria-expanded", "false"); await user.click(rowToggle); @@ -57,7 +57,7 @@ describe("EventsPage", () => { const user = userEvent.setup(); render(); - const rowToggle = await screen.findByRole("button", { name: "查看事件详情 KEYBOARD_EVENT" }); + const 
rowToggle = await screen.findByRole("button", { name: "View event details KEYBOARD_EVENT" }); rowToggle.focus(); await user.keyboard("{Enter}"); expect(rowToggle).toHaveAttribute("aria-expanded", "true"); diff --git a/apps/desktop/src/pages/ReviewsPage.test.tsx b/apps/desktop/src/pages/ReviewsPage.test.tsx index ec4eb65..31ea985 100644 --- a/apps/desktop/src/pages/ReviewsPage.test.tsx +++ b/apps/desktop/src/pages/ReviewsPage.test.tsx @@ -33,10 +33,10 @@ describe("ReviewsPage", () => { ] as any); const user = userEvent.setup(); render(); - expect(screen.getByRole("button", { name: "刷新中..." })).toBeDisabled(); + expect(screen.getByRole("button", { name: "Refreshing..." })).toBeDisabled(); resolveFirstFetch([]); - expect(await screen.findByText("暂无评审记录")).toBeInTheDocument(); - await user.click(screen.getByRole("button", { name: "刷新" })); + expect(await screen.findByText("No review records yet")).toBeInTheDocument(); + await user.click(screen.getByRole("button", { name: "Refresh" })); expect(await screen.findByText("run-1")).toBeInTheDocument(); expect(screen.getByText("looks good")).toBeInTheDocument(); expect(screen.getByText("Scope: ok")).toBeInTheDocument(); diff --git a/apps/desktop/src/pages/TestsPage.test.tsx b/apps/desktop/src/pages/TestsPage.test.tsx index dcb0c48..e440dc1 100644 --- a/apps/desktop/src/pages/TestsPage.test.tsx +++ b/apps/desktop/src/pages/TestsPage.test.tsx @@ -13,6 +13,7 @@ import { fetchTests } from "../lib/api"; describe("TestsPage", () => { beforeEach(() => { vi.clearAllMocks(); + vi.mocked(fetchTests).mockReset(); }); it("renders empty state and then status cards after refresh", async () => { @@ -25,11 +26,11 @@ describe("TestsPage", () => { command: "pnpm test", failure_info: "snapshot mismatch", }, - ] as any); + ] as any); const user = userEvent.setup(); render(); - expect(await screen.findByText("暂无测试记录")).toBeInTheDocument(); - await user.click(screen.getByRole("button", { name: "刷新" })); + expect(await screen.findByText("No test records yet")).toBeInTheDocument(); + await user.click(screen.getByRole("button", { name: "Refresh" })); expect(await screen.findByText("回归检查")).toBeInTheDocument(); expect(screen.getByText("pnpm test")).toBeInTheDocument(); expect(screen.getByText("snapshot mismatch")).toBeInTheDocument(); diff --git a/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx b/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx index ed15b13..54de14c 100644 --- a/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx +++ b/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx @@ -125,13 +125,13 @@ describe("coverage sprint F: low-branch pages", () => { locks: [], role_catalog: [], } as FirstAgentsPayload); - expect(await screen.findByText(/活跃状态机|Active State Machines/)).toBeInTheDocument(); - expect(screen.getByText(/注册代理 \(1\)|Registered Agents \(1\)/)).toBeInTheDocument(); + expect(await screen.findByText("Execution lane triage")).toBeInTheDocument(); + expect(screen.getByText("Registered execution seats (expandable, 1 items)")).toBeInTheDocument(); expect(screen.getByText("run-12345678")).toBeInTheDocument(); fireEvent.click(screen.getByRole("button", { name: /刷新|Refresh/ })); - expect(await screen.findByText(/暂无注册代理|No agents are registered yet/)).toBeInTheDocument(); - expect(screen.queryByText(/活跃状态机|Active state machines/)).not.toBeInTheDocument(); + expect(await screen.findByText("No registered agents")).toBeInTheDocument(); + expect(screen.queryByText("Execution lane triage")).not.toBeInTheDocument(); 
fireEvent.click(screen.getByRole("button", { name: /刷新|Refresh/ })); const errorBanner = await screen.findByRole("alert"); diff --git a/apps/desktop/src/pages/workflow_queue_controls.test.tsx b/apps/desktop/src/pages/workflow_queue_controls.test.tsx index f124b83..0453813 100644 --- a/apps/desktop/src/pages/workflow_queue_controls.test.tsx +++ b/apps/desktop/src/pages/workflow_queue_controls.test.tsx @@ -141,7 +141,7 @@ describe("workflow queue controls", () => { }), ); }); - expect(await screen.findByText("Queued task-queue.")).toBeInTheDocument(); + expect(await screen.findByText("Queued task-queue. Refreshing the workflow view...")).toBeInTheDocument(); }); it("renders locale-aware workflow detail labels when zh-CN is requested", async () => { diff --git a/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py b/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py index 7753920..0fccf98 100644 --- a/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py +++ b/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py @@ -25,6 +25,7 @@ "true", ":", } +_TRIVIAL_ECHO_PAYLOADS = {"", "ok", "hello", "pass", "success", "done", "1"} def _coerce_timeout_sec(raw: object) -> float: @@ -38,6 +39,16 @@ def _coerce_timeout_sec(raw: object) -> float: return timeout_sec +def _coerce_gate_result(gate: object) -> dict[str, object]: + if isinstance(gate, dict): + return gate + return { + "ok": False, + "reason": "invalid validate_command result", + "raw": repr(gate), + } + + def _now_ts() -> str: return datetime.now(timezone.utc).isoformat() @@ -145,9 +156,13 @@ def _is_trivial_acceptance_command(command: str) -> bool: return True if normalized in _TRIVIAL_ACCEPTANCE_COMMANDS: return True - if normalized.startswith("echo "): - payload = normalized[5:].strip().strip('"').strip("'") - if payload in {"", "ok", "hello", "pass", "success", "done", "1"}: + try: + tokens = shlex.split(command) + except ValueError: + return False + if tokens and tokens[0].lower() == "echo": + payload = " ".join(tokens[1:]).strip().lower() + if payload in _TRIVIAL_ECHO_PAYLOADS: return True return False @@ -167,7 +182,7 @@ def _normalize_tests(test_items: Iterable[object]) -> list[dict[str, object]]: continue if isinstance(item, dict): cmd = item.get("cmd") or item.get("command") - if isinstance(cmd, str) and cmd.strip(): + if isinstance(cmd, str): timeout_sec = _coerce_timeout_sec(item.get("timeout_sec", _DEFAULT_TIMEOUT_SEC)) normalized.append( { @@ -233,7 +248,7 @@ def run_acceptance_tests( strict_nontrivial_enabled = ( bool(strict_nontrivial) if strict_nontrivial is not None else _is_strict_nontrivial_enabled() ) - has_must_pass = any(bool(test.get("must_pass", True)) for test in normalized) + has_must_pass = any(_coerce_must_pass(test.get("must_pass", True)) for test in normalized) if not has_must_pass: finished_at = _now_ts() report = _build_report( @@ -277,12 +292,14 @@ def run_acceptance_tests( "reason": "trivial acceptance command blocked", } - gate = validate_command( - command, - forbidden, - network_policy=network_policy, - policy_pack=policy_pack, - repo_root=worktree_root, + gate = _coerce_gate_result( + validate_command( + command, + forbidden, + network_policy=network_policy, + policy_pack=policy_pack, + repo_root=worktree_root, + ) ) if not gate.get("ok", False): finished_at = _now_ts() @@ -416,12 +433,14 @@ def run_evals_gate( relative = script_path.relative_to(repo_root) command = f"bash {relative}" - gate = validate_command( - command, - forbidden_actions or [], - network_policy=network_policy, - 
policy_pack=policy_pack, - repo_root=repo_root, + gate = _coerce_gate_result( + validate_command( + command, + forbidden_actions or [], + network_policy=network_policy, + policy_pack=policy_pack, + repo_root=repo_root, + ) ) if not gate.get("ok", False): return {"ok": False, "reason": "tool gate violation", "gate": gate, "command": command} diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py index 7d20138..2a30a79 100644 --- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py +++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from datetime import datetime, timezone +import importlib import json from pathlib import Path from typing import Any, Callable @@ -40,9 +41,9 @@ class ControlPlaneReadService: @classmethod def from_api_main(cls) -> "ControlPlaneReadService": - from cortexpilot_orch.api import main as api_main - from cortexpilot_orch.api import main_state_store_helpers - from cortexpilot_orch.queue import QueueStore + api_main = importlib.import_module("cortexpilot_orch.api.main") + main_state_store_helpers = importlib.import_module("cortexpilot_orch.api.main_state_store_helpers") + QueueStore = importlib.import_module("cortexpilot_orch.queue").QueueStore def _list_workflows_readonly() -> list[dict[str, Any]]: workflows = list( @@ -135,11 +136,11 @@ def _list_queue_readonly(*, workflow_id: str | None = None, status: str | None = @classmethod def from_runtime(cls) -> "ControlPlaneReadService": - from cortexpilot_orch.api import main_run_views_helpers - from cortexpilot_orch.api import main_state_store_helpers - from cortexpilot_orch.config import load_config - from cortexpilot_orch.contract.compiler import build_role_binding_summary - from cortexpilot_orch.queue import QueueStore + main_run_views_helpers = importlib.import_module("cortexpilot_orch.api.main_run_views_helpers") + main_state_store_helpers = importlib.import_module("cortexpilot_orch.api.main_state_store_helpers") + load_config = importlib.import_module("cortexpilot_orch.config").load_config + build_role_binding_summary = importlib.import_module("cortexpilot_orch.contract.compiler").build_role_binding_summary + QueueStore = importlib.import_module("cortexpilot_orch.queue").QueueStore cfg = load_config() runs_root = cfg.runs_root diff --git a/apps/orchestrator/tests/test_bench_e2e_speed_gate.py b/apps/orchestrator/tests/test_bench_e2e_speed_gate.py new file mode 100644 index 0000000..ddc246a --- /dev/null +++ b/apps/orchestrator/tests/test_bench_e2e_speed_gate.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[3] + + +def _gate_script() -> Path: + return _repo_root() / "scripts" / "check_bench_e2e_speed_gate.py" + + +def _write_summary(path: Path, *, overall_fail_rate: float, ui_p95: float, dash_p95: float) -> None: + path.write_text( + json.dumps( + { + "run_id": "bench_test", + "overall": {"fail_rate": overall_fail_rate}, + "suites": { + "ui_full_gemini_strict": {"duration_sec": {"p95": ui_p95}}, + "dashboard_high_risk_e2e": {"duration_sec": {"p95": dash_p95}}, + }, + } + ) + + "\n", + encoding="utf-8", + ) + + +def test_bench_gate_passes_for_summary_within_thresholds(tmp_path: Path) -> None: + summary = tmp_path / "summary.json" + 
_write_summary(summary, overall_fail_rate=0.0, ui_p95=90.0, dash_p95=45.0) + + result = subprocess.run( + [sys.executable, str(_gate_script()), "--summary", str(summary), "--ui-max-p95-sec", "120", "--dash-max-p95-sec", "60"], + check=False, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "benchmark gate passed" in result.stdout + + +def test_bench_gate_fails_when_overall_fail_rate_exceeds_threshold(tmp_path: Path) -> None: + summary = tmp_path / "summary.json" + _write_summary(summary, overall_fail_rate=0.2, ui_p95=90.0, dash_p95=45.0) + + result = subprocess.run( + [sys.executable, str(_gate_script()), "--summary", str(summary), "--max-overall-fail-rate", "0.1"], + check=False, + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert "overall.fail_rate=0.2000 > max_overall_fail_rate=0.1000" in result.stderr + + +def test_bench_gate_fails_when_suite_p95_exceeds_threshold(tmp_path: Path) -> None: + summary = tmp_path / "summary.json" + _write_summary(summary, overall_fail_rate=0.0, ui_p95=181.0, dash_p95=91.0) + + result = subprocess.run( + [ + sys.executable, + str(_gate_script()), + "--summary", + str(summary), + "--ui-max-p95-sec", + "180", + "--dash-max-p95-sec", + "90", + ], + check=False, + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert "ui_full_gemini_strict.p95=181.000s > max_p95=180.000s" in result.stderr + assert "dashboard_high_risk_e2e.p95=91.000s > max_p95=90.000s" in result.stderr + + +def test_bench_gate_fails_closed_when_summary_is_missing(tmp_path: Path) -> None: + summary = tmp_path / "missing-summary.json" + + result = subprocess.run( + [sys.executable, str(_gate_script()), "--summary", str(summary)], + check=False, + capture_output=True, + text=True, + ) + + assert result.returncode == 2 + assert "benchmark summary not found" in result.stderr diff --git a/apps/orchestrator/tests/test_control_plane_read_service.py b/apps/orchestrator/tests/test_control_plane_read_service.py new file mode 100644 index 0000000..c075ebf --- /dev/null +++ b/apps/orchestrator/tests/test_control_plane_read_service.py @@ -0,0 +1,330 @@ +from __future__ import annotations + +import json +import sys +from datetime import datetime, timezone +from pathlib import Path +from types import SimpleNamespace +from types import ModuleType + +import pytest + +from cortexpilot_orch.services.control_plane_read_service import ( + ControlPlaneReadService, + _as_array, + _as_record, + _as_text, + _find_report, +) + + +def _write_json(path: Path, payload: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload), encoding="utf-8") + + +def test_control_plane_read_service_wrapper_filters_and_summary_helpers() -> None: + service = ControlPlaneReadService( + list_runs_fn=lambda: [{"run_id": "run-1"}], + get_run_fn=lambda run_id: {"run_id": run_id}, + get_events_fn=lambda run_id: [{"run_id": run_id, "event": "RUN_UPDATED"}], + get_reports_fn=lambda run_id: [ + {"name": "run_compare_report.json", "data": {"compare_summary": {"mismatched_count": 2}}}, + {"name": "proof_pack.json", "data": {"summary": "proof-ready"}}, + {"name": "incident_pack.json", "data": "not-a-record"}, + ] + if run_id == "run-1" + else "not-a-list", + list_workflows_fn=lambda: [{"workflow_id": "wf-1"}], + get_workflow_fn=lambda workflow_id: {"workflow": {"workflow_id": workflow_id}, "runs": [], "events": []}, + list_queue_fn=lambda **_: [{"queue_id": "queue-1"}], + list_pending_approvals_fn=lambda: [ + 
{"run_id": "run-1", "status": "pending"}, + {"run_id": "run-2", "status": "pending"}, + ], + list_diff_gate_fn=lambda: [ + {"run_id": "run-1", "status": "FAILED"}, + {"run_id": "run-2", "status": "PASS"}, + ], + ) + + assert _as_record({"ok": True}) == {"ok": True} + assert _as_record("bad") == {} + assert _as_array([1, 2]) == [1, 2] + assert _as_array("bad") == [] + assert _as_text(" run-1 ") == "run-1" + assert _find_report([{"name": "proof_pack.json", "data": {"summary": "ready"}}], "proof_pack.json") == { + "summary": "ready" + } + assert _find_report([{"name": "proof_pack.json", "data": "bad"}], "proof_pack.json") == {} + + assert service.list_runs() == [{"run_id": "run-1"}] + assert service.get_run("run-9") == {"run_id": "run-9"} + assert service.get_run_events("run-9") == [{"run_id": "run-9", "event": "RUN_UPDATED"}] + assert service.get_run_reports("run-2") == [] + assert service.list_workflows() == [{"workflow_id": "wf-1"}] + assert service.get_workflow("wf-1") == {"workflow": {"workflow_id": "wf-1"}, "runs": [], "events": []} + assert service.list_queue(workflow_id="wf-1", status="pending") == [{"queue_id": "queue-1"}] + assert service.get_pending_approvals() == [ + {"run_id": "run-1", "status": "pending"}, + {"run_id": "run-2", "status": "pending"}, + ] + assert service.get_pending_approvals(run_id="run-1") == [{"run_id": "run-1", "status": "pending"}] + assert service.get_diff_gate_state() == [ + {"run_id": "run-1", "status": "FAILED"}, + {"run_id": "run-2", "status": "PASS"}, + ] + assert service.get_diff_gate_state(run_id="run-2") == [{"run_id": "run-2", "status": "PASS"}] + assert service.get_compare_summary("run-1") == {"mismatched_count": 2} + assert service.get_proof_summary("run-1") == {"summary": "proof-ready"} + assert service.get_incident_summary("run-1") == {} + + +def test_control_plane_read_service_from_api_main_builds_workflows_and_queue_filters(monkeypatch) -> None: + event_map = { + "run-b": [ + {"event": "WORKFLOW_STATUS", "ts": "2026-04-12T10:00:00Z", "context": {"workflow_id": "wf-1"}}, + {"event": "IGNORED", "context": {"workflow_id": "wf-2"}}, + ], + "run-a": [ + {"event": "WORKFLOW_BOUND", "ts": "2026-04-11T10:00:00Z"}, + {"event": "CUSTOM", "_ts": "2026-04-11T09:00:00Z", "context": {"workflow_id": "wf-1"}}, + ], + } + + workflows = { + "wf-1": { + "workflow_id": "wf-1", + "runs": [ + {"run_id": "run-a", "created_at": "2026-04-11T08:00:00Z"}, + {"run_id": "run-b", "created_at": "2026-04-12T08:00:00Z"}, + ], + }, + "wf-2": {"workflow_id": "wf-2", "runs": [{"run_id": "run-z", "created_at": "broken-ts"}]}, + } + + class _FakeQueueStore: + def __init__(self, *, ensure_storage: bool = False) -> None: + self.ensure_storage = ensure_storage + + def list_items(self) -> list[dict[str, str]]: + return [ + {"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"}, + {"queue_id": "queue-2", "workflow_id": "wf-2", "status": "DONE"}, + ] + + api_main = ModuleType("cortexpilot_orch.api.main") + api_main.load_config = lambda: SimpleNamespace(runs_root=Path("/tmp/runs"), runtime_root=Path("/tmp/runtime")) + api_main._read_events = lambda run_id: event_map.get(run_id, []) + api_main._parse_iso_ts = lambda value: datetime.fromisoformat(value.replace("Z", "+00:00")) + api_main.list_runs = lambda: [{"run_id": "api-run"}] + api_main.get_run = lambda run_id: {"run_id": run_id, "source": "api"} + api_main.get_events = lambda run_id: event_map.get(run_id, []) + api_main.get_reports = lambda run_id: [{"name": "proof_pack.json", "data": {"run_id": run_id}}] + 
api_main.list_pending_approvals = lambda: [{"run_id": "run-a"}] + api_main.list_diff_gate = lambda: [{"run_id": "run-b", "status": "FAILED"}] + + main_state_store_helpers = ModuleType("cortexpilot_orch.api.main_state_store_helpers") + main_state_store_helpers.collect_workflows = lambda **_: workflows + + queue_module = ModuleType("cortexpilot_orch.queue") + queue_module.QueueStore = _FakeQueueStore + + monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main", api_main) + monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main_state_store_helpers", main_state_store_helpers) + monkeypatch.setitem(sys.modules, "cortexpilot_orch.queue", queue_module) + + service = ControlPlaneReadService.from_api_main() + + assert service.list_runs() == [{"run_id": "api-run"}] + assert service.get_run("run-a") == {"run_id": "run-a", "source": "api"} + assert [item["workflow_id"] for item in service.list_workflows()] == ["wf-1", "wf-2"] + + workflow_payload = service.get_workflow("wf-1") + assert [event["_run_id"] for event in workflow_payload["events"]] == ["run-b", "run-a", "run-a"] + assert workflow_payload["runs"][0]["run_id"] == "run-a" + with pytest.raises(KeyError, match="workflow `missing` not found"): + service.get_workflow("missing") + + assert service.list_queue(workflow_id="wf-1") == [{"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"}] + assert service.list_queue(status="done") == [{"queue_id": "queue-2", "workflow_id": "wf-2", "status": "DONE"}] + assert service.get_pending_approvals() == [{"run_id": "run-a"}] + assert service.get_diff_gate_state(run_id="run-b") == [{"run_id": "run-b", "status": "FAILED"}] + + +def test_control_plane_read_service_from_runtime_builds_runtime_views_and_pending_approvals( + monkeypatch, tmp_path: Path +) -> None: + runtime_root = tmp_path / "runtime" + runs_root = runtime_root / "runs" + run_a = runs_root / "run-a" + run_b = runs_root / "run-b" + run_skip = runs_root / "run-skip" + + _write_json( + run_a / "manifest.json", + { + "run_id": "run-a", + "task_id": "task-a", + "status": "", + "role_binding_summary": {"source": "persisted"}, + }, + ) + _write_json( + run_a / "contract.json", + { + "task_id": "contract-task-a", + "allowed_paths": ["apps/orchestrator"], + }, + ) + _write_json(run_a / "reports" / "proof_pack.json", {"summary": "proof-a"}) + _write_json(run_a / "reports" / "run_compare_report.json", {"compare_summary": {"mismatched_count": 3}}) + + _write_json( + run_b / "manifest.json", + { + "status": "SUCCESS", + }, + ) + _write_json( + run_b / "contract.json", + { + "task_id": "contract-task-b", + "allowed_paths": "not-a-list", + }, + ) + _write_json(run_b / "reports" / "incident_pack.json", {"summary": "incident-b"}) + + run_skip.mkdir(parents=True, exist_ok=True) + (run_skip / "manifest.json").write_text("{bad json", encoding="utf-8") + + run_a.touch() + run_b.touch() + run_skip.touch() + (run_a / "manifest.json").touch() + (run_b / "manifest.json").touch() + (run_skip / "manifest.json").touch() + + event_map = { + "run-a": [ + {"event": "WORKFLOW_BOUND", "ts": "2026-04-12T10:00:00Z"}, + { + "event": "HUMAN_APPROVAL_REQUIRED", + "ts": "2026-04-12T10:01:00Z", + "context": { + "reason": ["owner review"], + "actions": ["approve"], + "verify_steps": ["pytest"], + "resume_step": "resume-from-review", + "workflow_id": "wf-1", + }, + }, + {"event": "CUSTOM", "_ts": "2026-04-12T10:02:00Z", "context": {"workflow_id": "wf-1"}}, + ], + "run-b": [ + {"event": "HUMAN_APPROVAL_REQUIRED", "ts": "2026-04-11T09:00:00Z", "meta": {"workflow_id": 
"wf-2"}}, + {"event": "HUMAN_APPROVAL_COMPLETED", "ts": "2026-04-11T09:05:00Z"}, + {"event": "TEMPORAL_NOTIFY_DONE", "ts": "2026-04-11T09:10:00Z"}, + ], + "run-skip": [], + } + + workflows = { + "wf-1": { + "workflow_id": "wf-1", + "runs": [ + {"run_id": "run-b", "created_at": "2026-04-11T08:00:00Z"}, + {"run_id": "run-a", "created_at": "2026-04-12T08:00:00Z"}, + ], + }, + "wf-2": {"workflow_id": "wf-2", "runs": [{"run_id": "run-b", "created_at": "invalid"}]}, + } + + class _FakeQueueStore: + def __init__(self, *, ensure_storage: bool = False) -> None: + self.ensure_storage = ensure_storage + + def list_items(self) -> list[dict[str, str]]: + return [ + {"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"}, + {"queue_id": "queue-2", "workflow_id": "wf-2", "status": "DONE"}, + ] + + config_module = ModuleType("cortexpilot_orch.config") + config_module.load_config = lambda: SimpleNamespace(runs_root=runs_root, runtime_root=runtime_root) + + main_state_store_helpers = ModuleType("cortexpilot_orch.api.main_state_store_helpers") + main_state_store_helpers.read_events = lambda *, run_id, runs_root: event_map.get(run_id, []) + main_state_store_helpers.collect_workflows = lambda **_: workflows + + main_run_views_helpers = ModuleType("cortexpilot_orch.api.main_run_views_helpers") + main_run_views_helpers.list_diff_gate = lambda **_: [ + {"run_id": "run-a", "status": "FAILED"}, + {"run_id": "run-b", "status": "PASS"}, + ] + + compiler_module = ModuleType("cortexpilot_orch.contract.compiler") + compiler_module.build_role_binding_summary = lambda contract: { + "source": "generated", + "task_id": contract.get("task_id"), + } + + queue_module = ModuleType("cortexpilot_orch.queue") + queue_module.QueueStore = _FakeQueueStore + + monkeypatch.setitem(sys.modules, "cortexpilot_orch.config", config_module) + monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main_state_store_helpers", main_state_store_helpers) + monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main_run_views_helpers", main_run_views_helpers) + monkeypatch.setitem(sys.modules, "cortexpilot_orch.contract.compiler", compiler_module) + monkeypatch.setitem(sys.modules, "cortexpilot_orch.queue", queue_module) + + service = ControlPlaneReadService.from_runtime() + + listed_runs = service.list_runs() + assert [item["run_id"] for item in listed_runs] == ["run-b", "run-a"] + assert listed_runs[1]["status"] == "UNKNOWN" + assert listed_runs[1]["last_event_ts"] == "2026-04-12T10:02:00Z" + + runtime_run = service.get_run("run-a") + assert runtime_run["task_id"] == "task-a" + assert runtime_run["allowed_paths"] == ["apps/orchestrator"] + assert runtime_run["role_binding_read_model"] == {"source": "persisted"} + + generated_run = service.get_run("run-b") + assert generated_run["task_id"] == "contract-task-b" + assert generated_run["allowed_paths"] == [] + assert generated_run["role_binding_read_model"] == {"source": "generated", "task_id": "contract-task-b"} + with pytest.raises(KeyError, match="run `missing` not found"): + service.get_run("missing") + + assert service.get_run_reports("run-a") == [ + {"name": "proof_pack.json", "data": {"summary": "proof-a"}}, + {"name": "run_compare_report.json", "data": {"compare_summary": {"mismatched_count": 3}}}, + ] + assert [item["workflow_id"] for item in service.list_workflows()] == ["wf-1", "wf-2"] + + workflow_payload = service.get_workflow("wf-1") + assert [event["_run_id"] for event in workflow_payload["events"]] == ["run-a", "run-a", "run-a", "run-b"] + with pytest.raises(KeyError, 
match="workflow `missing` not found"): + service.get_workflow("missing") + + assert service.list_queue(workflow_id="wf-1", status="pending") == [ + {"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"} + ] + assert service.get_pending_approvals() == [ + { + "run_id": "run-a", + "status": "pending", + "task_id": "task-a", + "failure_reason": "", + "reason": ["owner review"], + "actions": ["approve"], + "verify_steps": ["pytest"], + "resume_step": "resume-from-review", + } + ] + assert service.get_pending_approvals(run_id="run-a")[0]["run_id"] == "run-a" + assert service.get_diff_gate_state(run_id="run-a") == [{"run_id": "run-a", "status": "FAILED"}] + assert service.get_compare_summary("run-a") == {"mismatched_count": 3} + assert service.get_proof_summary("run-a") == {"summary": "proof-a"} + assert service.get_incident_summary("run-b") == {"summary": "incident-b"} diff --git a/apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py b/apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py new file mode 100644 index 0000000..d210814 --- /dev/null +++ b/apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +from dataclasses import replace +import io +import sys +from types import ModuleType + +from cortexpilot_orch import mcp_queue_pilot_server as queue_pilot_module + + +def test_mcp_queue_pilot_helpers_and_protocol_edges(monkeypatch) -> None: + monkeypatch.setenv("CORTEXPILOT_APPROVAL_ALLOWED_ROLES", " owner , ops ") + + assert queue_pilot_module._mutation_roles() == {"OWNER", "OPS"} + assert queue_pilot_module._required_role_arg({"actor_role": "owner"}) == "OWNER" + assert queue_pilot_module._queue_payload( + {"priority": 3, "scheduled_at": " 2026-04-12T09:00:00Z ", "deadline_at": " "} + ) == { + "priority": 3, + "scheduled_at": "2026-04-12T09:00:00Z", + } + assert queue_pilot_module._error_response(7, -32601, "boom") == { + "jsonrpc": "2.0", + "id": 7, + "error": {"code": -32601, "message": "boom"}, + } + + +def test_mcp_queue_pilot_server_covers_default_constructor_unknown_methods_and_stream(monkeypatch) -> None: + captured: list[tuple[str, dict[str, object]]] = [] + + def _preview(run_id: str, payload: dict[str, object]) -> dict[str, object]: + captured.append((run_id, payload)) + return { + "run_id": run_id, + "validation": "ok", + "can_apply": True, + "preview_item": {"queue_id": "preview-1"}, + } + + def _apply(run_id: str, payload: dict[str, object]) -> dict[str, object]: + return {"queue_id": f"{run_id}-queue", "task_id": "task-1", "status": "PENDING"} + + api_main = ModuleType("cortexpilot_orch.api.main") + api_main.preview_enqueue_run_queue = _preview + api_main.enqueue_run_queue = _apply + monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main", api_main) + + server = queue_pilot_module.CortexPilotQueuePilotMcpServer() + + assert server.handle_message({"jsonrpc": "2.0", "method": "initialized"}) is None + assert server.handle_message({"jsonrpc": "2.0", "id": 1, "method": "ping"}) == { + "jsonrpc": "2.0", + "id": 1, + "result": {}, + } + init_response = server.handle_message({"jsonrpc": "2.0", "id": 2, "method": "initialize"}) + assert init_response is not None + assert init_response["result"]["serverInfo"]["name"] == "cortexpilot-queue-pilot" + + alias_list = server.handle_message({"jsonrpc": "2.0", "id": 3, "method": "tooling/list"}) + assert alias_list is not None + assert {tool["name"] for tool in alias_list["result"]["tools"]} == { + "preview_enqueue_from_run", + 
"enqueue_from_run", + } + + unknown_tool = server.handle_message( + { + "jsonrpc": "2.0", + "id": 4, + "method": "tools/call", + "params": {"name": "missing_tool", "arguments": {}}, + } + ) + assert unknown_tool == { + "jsonrpc": "2.0", + "id": 4, + "error": {"code": -32601, "message": "unknown tool `missing_tool`"}, + } + + missing_run_id = server.handle_message( + { + "jsonrpc": "2.0", + "id": 5, + "method": "tools/call", + "params": {"name": "preview_enqueue_from_run", "arguments": {}}, + } + ) + assert missing_run_id is not None + assert missing_run_id["result"]["isError"] is True + assert "`run_id` is required" in missing_run_id["result"]["structuredContent"]["error"] + + broken_tool = replace( + server._tool_map["preview_enqueue_from_run"], + handler=lambda arguments: (_ for _ in ()).throw(RuntimeError("preview exploded")), + ) + server._tool_map["preview_enqueue_from_run"] = broken_tool + runtime_error = server.handle_message( + { + "jsonrpc": "2.0", + "id": 6, + "method": "tools/call", + "params": {"name": "preview_enqueue_from_run", "arguments": {"run_id": "run-9"}}, + } + ) + assert runtime_error is not None + assert runtime_error["result"]["isError"] is True + assert runtime_error["result"]["structuredContent"]["error"] == "preview exploded" + + assert server.handle_message({"jsonrpc": "2.0", "method": "unsupported"}) is None + unsupported = server.handle_message({"jsonrpc": "2.0", "id": 7, "method": "unsupported"}) + assert unsupported == { + "jsonrpc": "2.0", + "id": 7, + "error": {"code": -32601, "message": "method `unsupported` is not supported"}, + } + + source = io.StringIO('\nnot-json\n[]\n{"jsonrpc":"2.0","id":8,"method":"ping"}\n') + target = io.StringIO() + server.serve_forever(instream=source, outstream=target) + assert target.getvalue().strip() == '{"jsonrpc": "2.0", "id": 8, "result": {}}' + + called = {"serve_forever": False} + + class _FakeServer: + def serve_forever(self) -> None: + called["serve_forever"] = True + + monkeypatch.setattr(queue_pilot_module, "CortexPilotQueuePilotMcpServer", _FakeServer) + queue_pilot_module.serve_queue_pilot_mcp() + assert called["serve_forever"] is True diff --git a/apps/orchestrator/tests/test_repo_coverage_gate.py b/apps/orchestrator/tests/test_repo_coverage_gate.py new file mode 100644 index 0000000..a08dd6c --- /dev/null +++ b/apps/orchestrator/tests/test_repo_coverage_gate.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from pathlib import Path + + +def _read_script() -> str: + script_path = Path(__file__).resolve().parents[3] / "scripts" / "repo_coverage_gate.py" + return script_path.read_text(encoding="utf-8") + + +def test_dashboard_coverage_installs_deps_before_vitest() -> None: + text = _read_script() + install_idx = text.index('run_command(["bash", "scripts/install_dashboard_deps.sh"])') + vitest_idx = text.index('"pnpm",\n "--dir",\n "apps/dashboard",\n "exec",\n "vitest"') + assert install_idx < vitest_idx + + +def test_desktop_coverage_installs_deps_before_vitest() -> None: + text = _read_script() + install_idx = text.index('run_command(["bash", "scripts/install_desktop_deps.sh"])') + vitest_idx = text.index('"pnpm",\n "--dir",\n "apps/desktop",\n "exec",\n "vitest"') + assert install_idx < vitest_idx + + +def test_orchestrator_coverage_uses_managed_coverage_file() -> None: + text = _read_script() + assert 'DEFAULT_COVERAGE_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "test" / "coverage" / "repo_coverage_gate"' in text + assert '"COVERAGE_FILE": str(coverage_file)' in text + + +def 
test_dashboard_and_desktop_coverage_use_managed_report_dirs() -> None: + text = _read_script() + assert '"CORTEXPILOT_DASHBOARD_COVERAGE_DIR": str(report_path.parent)' in text + assert '"CORTEXPILOT_DESKTOP_COVERAGE_DIR": str(report_path.parent)' in text + + +def test_orchestrator_coverage_uses_managed_hypothesis_storage() -> None: + text = _read_script() + assert 'DEFAULT_HYPOTHESIS_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "hypothesis" / "repo_coverage_gate"' in text + assert '"HYPOTHESIS_STORAGE_DIRECTORY": str(DEFAULT_HYPOTHESIS_DATA_DIR)' in text diff --git a/apps/orchestrator/tests/test_tests_gate_extended.py b/apps/orchestrator/tests/test_tests_gate_extended.py index e916320..535335a 100644 --- a/apps/orchestrator/tests/test_tests_gate_extended.py +++ b/apps/orchestrator/tests/test_tests_gate_extended.py @@ -42,6 +42,19 @@ def test_tests_gate_tool_gate_violation(tmp_path: Path, monkeypatch) -> None: assert result["reason"] == "tool gate violation" +def test_tests_gate_tool_gate_non_dict_result_fails_closed(tmp_path: Path, monkeypatch) -> None: + monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: False) + + result = tests_gate.run_acceptance_tests( + tmp_path, + [{"name": "hygiene", "cmd": "bash scripts/check_repo_hygiene.sh", "must_pass": True}], + ) + + assert result["ok"] is False + assert result["reason"] == "tool gate violation" + assert result["gate"]["reason"] == "invalid validate_command result" + + def test_tests_gate_invalid_shlex(tmp_path: Path, monkeypatch) -> None: monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: {"ok": True}) result = tests_gate.run_acceptance_tests(tmp_path, ['echo "unterminated']) @@ -131,6 +144,16 @@ def test_tests_gate_strict_nontrivial_blocks_echo_numeric_payload(tmp_path: Path assert result["reason"] == "trivial acceptance command blocked" +def test_tests_gate_strict_nontrivial_blocks_echo_whitespace_payload(tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("CORTEXPILOT_ACCEPTANCE_STRICT_NONTRIVIAL", "1") + monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: {"ok": True}) + + result = tests_gate.run_acceptance_tests(tmp_path, ['echo " "']) + + assert result["ok"] is False + assert result["reason"] == "trivial acceptance command blocked" + + def test_is_trivial_acceptance_command_treats_whitespace_only_as_trivial() -> None: assert tests_gate._is_trivial_acceptance_command(" \t \n ") is True @@ -284,6 +307,16 @@ def test_tests_gate_rejects_when_all_acceptance_tests_are_not_must_pass(tmp_path assert result["reason"] == "missing must_pass acceptance test" +def test_tests_gate_blank_dict_command_reports_empty_command(tmp_path: Path) -> None: + result = tests_gate.run_acceptance_tests( + tmp_path, + [{"name": "blank", "cmd": " ", "must_pass": True}], + ) + + assert result["ok"] is False + assert result["reason"] == "empty command" + + def test_run_evals_gate_blocks_tool_gate_violation(tmp_path: Path, monkeypatch) -> None: repo_root = tmp_path / "repo" worktree = repo_root / "worktree" @@ -307,6 +340,30 @@ def _fake_run(*args, **kwargs): assert called["run"] is False +def test_run_evals_gate_non_dict_tool_gate_result_fails_closed(tmp_path: Path, monkeypatch) -> None: + repo_root = tmp_path / "repo" + worktree = repo_root / "worktree" + (repo_root / "scripts").mkdir(parents=True) + worktree.mkdir(parents=True) + (repo_root / "scripts" / "run_evals.sh").write_text("#!/usr/bin/env bash\necho evals\n", encoding="utf-8") + + called: dict[str, bool] = {"run": False} + 
+ def _fake_run(*args, **kwargs): + called["run"] = True + return subprocess.CompletedProcess(args, 0, "ok", "") + + monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: False) + _patch_tests_gate_subprocess(monkeypatch, _fake_run) + + result = tests_gate.run_evals_gate(repo_root, worktree) + + assert result["ok"] is False + assert result["reason"] == "tool gate violation" + assert result["gate"]["reason"] == "invalid validate_command result" + assert called["run"] is False + + def test_tests_gate_coerces_string_must_pass_false(tmp_path: Path, monkeypatch) -> None: monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: {"ok": True}) diff --git a/configs/env.registry.json b/configs/env.registry.json index 492bd40..293cf2e 100644 --- a/configs/env.registry.json +++ b/configs/env.registry.json @@ -596,6 +596,42 @@ "scripts/lib/toolchain_env.sh" ] }, + { + "name": "CORTEXPILOT_BENCH_DASHBOARD_HIGH_RISK_E2E_MAX_P95_SEC", + "scope": "platform", + "secret": false, + "required": false, + "default": "90", + "owner": "platform", + "description": "Maximum allowed p95 latency, in seconds, for the dashboard_high_risk_e2e suite when benchmark summaries are evaluated by the fail-closed benchmark gate.", + "consumers": [ + "scripts/check_bench_e2e_speed_gate.py" + ] + }, + { + "name": "CORTEXPILOT_BENCH_MAX_FAIL_RATE", + "scope": "platform", + "secret": false, + "required": false, + "default": "0.05", + "owner": "platform", + "description": "Maximum allowed overall benchmark fail rate enforced by the fail-closed benchmark gate.", + "consumers": [ + "scripts/check_bench_e2e_speed_gate.py" + ] + }, + { + "name": "CORTEXPILOT_BENCH_UI_FULL_GEMINI_STRICT_MAX_P95_SEC", + "scope": "platform", + "secret": false, + "required": false, + "default": "180", + "owner": "platform", + "description": "Maximum allowed p95 latency, in seconds, for the ui_full_gemini_strict suite when benchmark summaries are evaluated by the fail-closed benchmark gate.", + "consumers": [ + "scripts/check_bench_e2e_speed_gate.py" + ] + }, { "name": "CORTEXPILOT_BROWSER_ALLOWLIST", "scope": "platform", diff --git a/docs/assets/storefront/benchmark-methodology.md b/docs/assets/storefront/benchmark-methodology.md index 3b13f2d..b7f1f8f 100644 --- a/docs/assets/storefront/benchmark-methodology.md +++ b/docs/assets/storefront/benchmark-methodology.md @@ -8,6 +8,7 @@ inventing numbers. - Benchmark execution tooling exists: - `scripts/bench_e2e_speed.py` - `scripts/bench_e2e_speed.sh` + - `scripts/check_bench_e2e_speed_gate.py` - A first tracked public single-run baseline now exists at `docs/releases/assets/news-digest-benchmark-summary-2026-03-27.md`. - Broader multi-round public benchmark figures do **not** exist yet. @@ -62,6 +63,23 @@ A tracked public benchmark artifact should include: `.runtime-cache/` - enough metadata to show which happy path was exercised +## Gate Contract + +Once a real benchmark summary exists, the repo-owned fail-closed gate is: + +```bash +npm run bench:e2e:speed:gate +``` + +Default thresholds are driven by: + +- `CORTEXPILOT_BENCH_MAX_FAIL_RATE` +- `CORTEXPILOT_BENCH_UI_FULL_GEMINI_STRICT_MAX_P95_SEC` +- `CORTEXPILOT_BENCH_DASHBOARD_HIGH_RISK_E2E_MAX_P95_SEC` + +The gate is intentionally strict about artifact presence: if no benchmark +summary exists yet, it fails instead of inventing a baseline. 
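+
+For a one-off local check against an explicit summary, the gate also accepts
+direct threshold flags (a sketch; the run-id path segment is illustrative):
+
+```bash
+python3 scripts/check_bench_e2e_speed_gate.py \
+  --summary .runtime-cache/test_output/benchmarks/<run-id>/summary.json \
+  --max-overall-fail-rate 0.05 \
+  --ui-max-p95-sec 180 \
+  --dash-max-p95-sec 90
+```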
+ ## Anti-Fraud Rule Do not copy raw numbers into README, release notes, or social posts unless they diff --git a/package.json b/package.json index 8770f1e..718d8b2 100644 --- a/package.json +++ b/package.json @@ -41,8 +41,10 @@ "test:smell": "bash scripts/test_smell_gate.sh", "quality:full": "npm run lint && npm run test:smell && npm run test", "quality:full:host": "npm run lint && npm run test:smell && npm run test:host", - "coverage:repo": "bash scripts/run_governance_py.sh scripts/repo_coverage_aggregate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95}", - "coverage:repo:gate": "bash scripts/run_governance_py.sh scripts/repo_coverage_aggregate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95} --enforce-gate", + "coverage:repo": "bash scripts/run_governance_py.sh scripts/repo_coverage_gate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95}", + "coverage:repo:gate": "bash scripts/run_governance_py.sh scripts/repo_coverage_gate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95} --enforce-gate", + "coverage:repo:aggregate": "bash scripts/run_governance_py.sh scripts/repo_coverage_aggregate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95}", + "mutation:gate": "bash scripts/mutation_gate.sh", "test:quick": "bash scripts/docker_ci.sh test-quick", "test:quick:host": "bash scripts/test_quick.sh", "test:live:preflight": "${CORTEXPILOT_PYTHON:-python3} scripts/e2e_external_web_probe.py --url ${CORTEXPILOT_EXTERNAL_WEB_PROBE_URL:-https://example.com} --provider-api-mode ${CORTEXPILOT_EXTERNAL_WEB_PROBE_PROVIDER_API_MODE:-require} --hard-timeout-sec ${CORTEXPILOT_EXTERNAL_WEB_PROBE_HARD_TIMEOUT_SEC:-120}", @@ -99,6 +101,7 @@ "bench:e2e:speed": "bash scripts/bench_e2e_speed.sh", "bench:e2e:speed:dry-run": "bash scripts/bench_e2e_speed.sh --rounds 3 --ui-full-gemini-strict --dashboard-high-risk --dry-run", "bench:e2e:speed:report-only": "bash scripts/bench_e2e_speed.sh --report-only", + "bench:e2e:speed:gate": "python3 scripts/check_bench_e2e_speed_gate.py", "e2e:pm-chat": "bash scripts/e2e_pm_chat_command_tower_success.sh", "e2e:pm-chat:real": "CORTEXPILOT_E2E_RUN_MODE=real CORTEXPILOT_E2E_RUNNER=agents CORTEXPILOT_E2E_REEXEC_STRICT=true bash scripts/e2e_pm_chat_command_tower_success.sh", "ci": "bash scripts/ci_local_fast.sh", diff --git a/scripts/check_bench_e2e_speed_gate.py b/scripts/check_bench_e2e_speed_gate.py new file mode 100644 index 0000000..e82ad0b --- /dev/null +++ b/scripts/check_bench_e2e_speed_gate.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +"""Fail-closed gate for benchmark summaries produced by scripts/bench_e2e_speed.py.""" + +from __future__ import annotations + +import argparse +import json +import math +import os +import sys +from pathlib import Path +from typing import Any + + +ROOT = Path(__file__).resolve().parents[1] +BENCH_ROOT = ROOT / ".runtime-cache" / "test_output" / "benchmarks" +DEFAULT_MAX_FAIL_RATE = float(os.environ.get("CORTEXPILOT_BENCH_MAX_FAIL_RATE", "0.05")) +DEFAULT_UI_MAX_P95_SEC = float(os.environ.get("CORTEXPILOT_BENCH_UI_FULL_GEMINI_STRICT_MAX_P95_SEC", "180")) +DEFAULT_DASH_MAX_P95_SEC = float(os.environ.get("CORTEXPILOT_BENCH_DASHBOARD_HIGH_RISK_E2E_MAX_P95_SEC", "90")) + + +def _find_latest_summary() -> Path | None: + candidates = sorted(BENCH_ROOT.glob("*/summary.json"), key=lambda path: path.stat().st_mtime, reverse=True) + return candidates[0] if candidates else None + + +def _load_json(path: Path) -> dict[str, Any]: + try: + return json.loads(path.read_text(encoding="utf-8")) + 
except FileNotFoundError as exc: + raise FileNotFoundError(f"benchmark summary not found: {path}") from exc + except json.JSONDecodeError as exc: + raise ValueError(f"invalid JSON in benchmark summary {path}: {exc}") from exc + + +def _to_float(value: Any, *, field: str) -> float: + try: + parsed = float(value) + except (TypeError, ValueError) as exc: + raise ValueError(f"invalid numeric field {field!r}: {value!r}") from exc + if not math.isfinite(parsed): + raise ValueError(f"non-finite numeric field {field!r}: {value!r}") + return parsed + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Fail-closed gate for benchmark summaries emitted by scripts/bench_e2e_speed.py." + ) + parser.add_argument("--summary", default="", help="Explicit benchmark summary path. Defaults to the latest summary.json.") + parser.add_argument( + "--max-overall-fail-rate", + type=float, + default=DEFAULT_MAX_FAIL_RATE, + help="Maximum allowed overall fail_rate (default from CORTEXPILOT_BENCH_MAX_FAIL_RATE or 0.05).", + ) + parser.add_argument( + "--ui-max-p95-sec", + type=float, + default=DEFAULT_UI_MAX_P95_SEC, + help="Maximum allowed p95 for ui_full_gemini_strict (default env or 180).", + ) + parser.add_argument( + "--dash-max-p95-sec", + type=float, + default=DEFAULT_DASH_MAX_P95_SEC, + help="Maximum allowed p95 for dashboard_high_risk_e2e (default env or 90).", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + summary_path = Path(args.summary).expanduser().resolve() if args.summary else _find_latest_summary() + if summary_path is None: + print("❌ benchmark gate requires a benchmark summary; run `npm run bench:e2e:speed` first", file=sys.stderr) + return 2 + + try: + payload = _load_json(summary_path) + except (FileNotFoundError, ValueError) as exc: + print(f"❌ [bench-gate] {exc}", file=sys.stderr) + return 2 + overall = payload.get("overall") + suites = payload.get("suites") + if not isinstance(overall, dict) or not isinstance(suites, dict): + print(f"❌ benchmark summary missing overall/suites maps: {summary_path}", file=sys.stderr) + return 2 + + failures: list[str] = [] + overall_fail_rate = _to_float(overall.get("fail_rate"), field="overall.fail_rate") + if overall_fail_rate > args.max_overall_fail_rate: + failures.append( + f"overall.fail_rate={overall_fail_rate:.4f} > max_overall_fail_rate={args.max_overall_fail_rate:.4f}" + ) + + suite_thresholds = { + "ui_full_gemini_strict": args.ui_max_p95_sec, + "dashboard_high_risk_e2e": args.dash_max_p95_sec, + } + for suite_name, max_p95 in suite_thresholds.items(): + if suite_name not in suites: + failures.append(f"missing suite in benchmark summary: {suite_name}") + continue + suite = suites[suite_name] + if not isinstance(suite, dict): + failures.append(f"invalid suite payload: {suite_name}") + continue + duration = suite.get("duration_sec") + if not isinstance(duration, dict): + failures.append(f"missing duration metrics for suite: {suite_name}") + continue + p95 = _to_float(duration.get("p95"), field=f"{suite_name}.duration_sec.p95") + if p95 > max_p95: + failures.append(f"{suite_name}.p95={p95:.3f}s > max_p95={max_p95:.3f}s") + + print(f"📄 [bench-gate] summary={summary_path}") + print( + "ℹ️ [bench-gate] thresholds: " + f"overall_fail_rate<={args.max_overall_fail_rate:.4f} " + f"ui_p95<={args.ui_max_p95_sec:.3f}s " + f"dashboard_p95<={args.dash_max_p95_sec:.3f}s" + ) + if failures: + print("❌ [bench-gate] benchmark gate failed:", file=sys.stderr) + for failure in failures: + print(f" - 
{failure}", file=sys.stderr) + return 1 + + print("✅ [bench-gate] benchmark gate passed") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/repo_coverage_gate.py b/scripts/repo_coverage_gate.py index e2bd0ed..2c058c4 100644 --- a/scripts/repo_coverage_gate.py +++ b/scripts/repo_coverage_gate.py @@ -25,6 +25,8 @@ DEFAULT_DESKTOP_REPORT = ( ROOT_DIR / ".runtime-cache" / "test_output" / "repo_coverage" / "desktop" / "coverage-summary.json" ) +DEFAULT_COVERAGE_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "test" / "coverage" / "repo_coverage_gate" +DEFAULT_HYPOTHESIS_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "hypothesis" / "repo_coverage_gate" DEFAULT_THRESHOLD = float(os.environ.get("CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD", "95")) @@ -178,8 +180,17 @@ def run_command(command: list[str], env_overrides: dict[str, str] | None = None) raise RuntimeError(f"command failed (exit={result.returncode}): {' '.join(command)}") +def _prepare_coverage_file(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.unlink(missing_ok=True) + for stale_path in path.parent.glob(f"{path.name}.*"): + stale_path.unlink(missing_ok=True) + + def run_orchestrator_coverage(report_path: Path, pytest_target: str, pytest_mark: str) -> None: report_path.parent.mkdir(parents=True, exist_ok=True) + coverage_file = DEFAULT_COVERAGE_DATA_DIR / ".coverage" + _prepare_coverage_file(coverage_file) override = os.getenv("CORTEXPILOT_PYTHON", "").strip() toolchain_python = ROOT_DIR / ".runtime-cache" / "cache" / "toolchains" / "python" / "current" / "bin" / "python" python_bin = Path(override) if override else toolchain_python @@ -201,11 +212,22 @@ def run_orchestrator_coverage(report_path: Path, pytest_target: str, pytest_mark f"--cov-report=json:{report_path}", "--cov-fail-under=0", ] - run_command(command, env_overrides={"PYTHONPATH": "apps/orchestrator/src"}) + try: + run_command( + command, + env_overrides={ + "PYTHONPATH": "apps/orchestrator/src", + "COVERAGE_FILE": str(coverage_file), + "HYPOTHESIS_STORAGE_DIRECTORY": str(DEFAULT_HYPOTHESIS_DATA_DIR), + }, + ) + finally: + _prepare_coverage_file(coverage_file) def run_dashboard_coverage(report_path: Path, test_targets: list[str]) -> None: report_path.parent.mkdir(parents=True, exist_ok=True) + run_command(["bash", "scripts/install_dashboard_deps.sh"]) command = [ "pnpm", "--dir", @@ -225,11 +247,19 @@ def run_dashboard_coverage(report_path: Path, test_targets: list[str]) -> None: f"--coverage.reportsDirectory={report_path.parent}", ] command.extend(test_targets) - run_command(command, env_overrides={"CI": "1", "CORTEXPILOT_COVERAGE_HTML": "0"}) + run_command( + command, + env_overrides={ + "CI": "1", + "CORTEXPILOT_COVERAGE_HTML": "0", + "CORTEXPILOT_DASHBOARD_COVERAGE_DIR": str(report_path.parent), + }, + ) def run_desktop_coverage(report_path: Path, test_targets: list[str]) -> None: report_path.parent.mkdir(parents=True, exist_ok=True) + run_command(["bash", "scripts/install_desktop_deps.sh"]) command = [ "pnpm", "--dir", @@ -247,7 +277,15 @@ def run_desktop_coverage(report_path: Path, test_targets: list[str]) -> None: f"--coverage.reportsDirectory={report_path.parent}", ] command.extend(test_targets) - run_command(command) + run_command( + command, + env_overrides={ + "CI": "1", + "CORTEXPILOT_COVERAGE_HTML": "0", + "CORTEXPILOT_DESKTOP_COVERAGE_DIR": str(report_path.parent), + "CORTEXPILOT_DESKTOP_COVERAGE_RUN_ID": "repo-coverage-gate", + }, + ) def aggregate_repo_totals(project_totals: dict[str, 
CoverageTotals]) -> CoverageTotals: From a668fbbbbaa9c17c5044dfc47e55693601384792 Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:14:14 -0700 Subject: [PATCH 2/9] fix: stabilize runtime run ordering --- .../services/control_plane_read_service.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py index 2a30a79..74c00db 100644 --- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py +++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py @@ -178,13 +178,31 @@ def _last_event_ts(run_id: str) -> str: return value return "" + def _run_sort_ts(run_dir: Path, manifest_record: dict[str, Any]) -> float: + manifest_path = run_dir / "manifest.json" + if manifest_path.exists(): + return manifest_path.stat().st_mtime + created_at = _as_text(manifest_record.get("created_at")) + if created_at: + try: + return _parse_iso_ts(created_at).timestamp() + except Exception: + pass + return run_dir.stat().st_mtime + def _list_runs_runtime() -> list[dict[str, Any]]: results: list[dict[str, Any]] = [] - for run_dir in sorted(runs_root.glob("*"), key=lambda item: item.stat().st_mtime, reverse=True): + run_dirs = [] + for run_dir in runs_root.glob("*"): + if not run_dir.is_dir(): + continue manifest = _read_json(run_dir / "manifest.json", {}) manifest_record = _as_record(manifest) if not manifest_record: continue + run_dirs.append((run_dir, manifest_record, _run_sort_ts(run_dir, manifest_record))) + + for run_dir, manifest_record, _sort_ts in sorted(run_dirs, key=lambda item: item[2], reverse=True): run_id = _as_text(manifest_record.get("run_id")) or run_dir.name payload = dict(manifest_record) payload["run_id"] = run_id From 75d1cf1671aeed2e3d727695ac5bd9ce6de51afa Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:25:35 -0700 Subject: [PATCH 3/9] feat: persist prompt artifacts per run --- .../src/cortexpilot_orch/contract/compiler.py | 47 +++++++++++++++++++ .../scheduler/scheduler_bridge_contract.py | 19 +++++++- .../tests/test_scheduler_bridge_runtime.py | 8 ++++ docs/architecture/runtime-topology.md | 3 ++ 4 files changed, 76 insertions(+), 1 deletion(-) diff --git a/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py b/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py index b992dbf..3c410de 100644 --- a/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py +++ b/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py @@ -421,6 +421,53 @@ def build_role_binding_summary(contract: dict[str, Any]) -> dict[str, Any]: } +def build_prompt_artifact( + contract: dict[str, Any], + *, + run_id: str = "", + task_id: str = "", +) -> dict[str, Any]: + role_contract = contract.get("role_contract") if isinstance(contract.get("role_contract"), dict) else {} + if not role_contract: + role_contract = _build_role_contract(contract, _load_agent_registry()) + assigned_agent = contract.get("assigned_agent") if isinstance(contract.get("assigned_agent"), dict) else {} + role = str( + assigned_agent.get("role") + or (role_contract.get("identity", {}) if isinstance(role_contract.get("identity"), dict) else {}).get("role") + or "WORKER" + ).strip().upper() or "WORKER" + role_contract = _merge_role_config_defaults( + 
role_contract, + _find_role_config_defaults(_load_role_config_registry(), role), + ) + identity = role_contract.get("identity") if isinstance(role_contract.get("identity"), dict) else {} + runtime_binding_raw = role_contract.get("runtime_binding") if isinstance(role_contract.get("runtime_binding"), dict) else {} + runtime_binding = { + "runner": _normalize_optional_ref(runtime_binding_raw.get("runner")), + "provider": _normalize_optional_ref(runtime_binding_raw.get("provider")), + "model": _normalize_optional_ref(runtime_binding_raw.get("model")), + } + resolved_task_id = str(task_id or contract.get("task_id") or "").strip() + return { + "artifact_type": "prompt_artifact", + "version": "v1", + "source": "contract-derived", + "execution_authority": "task_contract", + "run_id": str(run_id or "").strip(), + "task_id": resolved_task_id, + "assigned_agent": { + "role": role, + "agent_id": str(identity.get("agent_id") or assigned_agent.get("agent_id") or "").strip(), + }, + "purpose": str(role_contract.get("purpose") or "").strip(), + "system_prompt_ref": _normalize_optional_ref(role_contract.get("system_prompt_ref")), + "skills_bundle_ref": _normalize_optional_ref(role_contract.get("skills_bundle_ref")), + "mcp_bundle_ref": _normalize_optional_ref(role_contract.get("mcp_bundle_ref")), + "runtime_binding": runtime_binding, + "role_binding_summary": build_role_binding_summary(contract), + } + + def _build_role_contract(contract: dict[str, Any], registry: dict[str, Any] | None) -> dict[str, Any]: assigned_agent = contract.get("assigned_agent") if isinstance(contract.get("assigned_agent"), dict) else {} role = str(assigned_agent.get("role") or "WORKER").strip().upper() or "WORKER" diff --git a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py index 9888504..dbf16ea 100644 --- a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py +++ b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py @@ -1,10 +1,11 @@ from __future__ import annotations from collections.abc import Callable +import json from pathlib import Path from typing import Any -from cortexpilot_orch.contract.compiler import build_role_binding_summary +from cortexpilot_orch.contract.compiler import build_prompt_artifact, build_role_binding_summary from cortexpilot_orch.store.run_store import RunStore @@ -208,5 +209,21 @@ def persist_contract_state( ) store.write_task_contract(run_id, task_id, contract) store.write_active_contract(run_id, contract) + prompt_artifact = build_prompt_artifact(contract, run_id=run_id, task_id=task_id) + prompt_artifact_path = store.write_artifact( + run_id, + "prompt_artifact.json", + json.dumps(prompt_artifact, ensure_ascii=False, indent=2), + ) + store.append_event( + run_id, + { + "level": "INFO", + "event": "PROMPT_ARTIFACT_WRITTEN", + "run_id": run_id, + "task_id": task_id, + "meta": {"path": str(prompt_artifact_path.relative_to(store.run_dir(run_id)))}, + }, + ) if ensure_evidence_bundle_fn is not None and failure_reason: ensure_evidence_bundle_fn(store, run_id, contract, failure_reason) diff --git a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py index 27a92d8..faac59d 100644 --- a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py +++ b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py @@ -155,3 +155,11 @@ def 
test_persist_contract_state_writes_role_binding_summary_to_manifest(tmp_path written = json.loads((store._runs_root / run_id / "manifest.json").read_text(encoding="utf-8")) assert written["role_binding_summary"] == build_role_binding_summary(contract) + prompt_artifact = json.loads( + (store._runs_root / run_id / "artifacts" / "prompt_artifact.json").read_text(encoding="utf-8") + ) + assert prompt_artifact["artifact_type"] == "prompt_artifact" + assert prompt_artifact["execution_authority"] == "task_contract" + assert prompt_artifact["run_id"] == run_id + assert prompt_artifact["task_id"] == "task-role-binding-summary" + assert prompt_artifact["role_binding_summary"] == build_role_binding_summary(contract) diff --git a/docs/architecture/runtime-topology.md b/docs/architecture/runtime-topology.md index 8dba6fc..c1c719e 100644 --- a/docs/architecture/runtime-topology.md +++ b/docs/architecture/runtime-topology.md @@ -72,6 +72,9 @@ flowchart LR `workflow_case_read_model` directly for operator inspection, but those UI cards remain read-only mirrors below `task_contract`. - Runtime artifacts (`manifest`, `events.jsonl`, reports) are generated per run. +- Runs may now also persist `artifacts/prompt_artifact.json`, a contract-derived + snapshot of prompt/bundle/runtime-binding refs for that run. It is a + read-only audit artifact, not a second execution authority source. - Run detail views may now include derived decision packs such as `incident_pack.json`, while approval queues synthesize `approval_pack` summaries from run events plus manifest metadata. These are derived operator From 721f9081de16b9c880c35d7d01e018b4a1b69513 Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:35:23 -0700 Subject: [PATCH 4/9] fix: use nanos for runtime run ordering --- .../services/control_plane_read_service.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py index 74c00db..f89e4bd 100644 --- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py +++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py @@ -178,17 +178,17 @@ def _last_event_ts(run_id: str) -> str: return value return "" - def _run_sort_ts(run_dir: Path, manifest_record: dict[str, Any]) -> float: + def _run_sort_ts(run_dir: Path, manifest_record: dict[str, Any]) -> int: manifest_path = run_dir / "manifest.json" if manifest_path.exists(): - return manifest_path.stat().st_mtime + return manifest_path.stat().st_mtime_ns created_at = _as_text(manifest_record.get("created_at")) if created_at: try: - return _parse_iso_ts(created_at).timestamp() + return int(_parse_iso_ts(created_at).timestamp() * 1_000_000_000) except Exception: pass - return run_dir.stat().st_mtime + return run_dir.stat().st_mtime_ns def _list_runs_runtime() -> list[dict[str, Any]]: results: list[dict[str, Any]] = [] From 05d0f3cbc561a9258ef61aea817ff1351635f459 Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:41:01 -0700 Subject: [PATCH 5/9] feat: persist planning artifacts for runs --- .../api/main_pm_intake_helpers.py | 58 ++++++++- .../test_main_pm_intake_helpers_branches.py | 110 ++++++++++++++++++ 2 files changed, 167 insertions(+), 1 deletion(-) diff --git 
a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py index 2d8c4ad..a670d4b 100644 --- a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py +++ b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py @@ -17,8 +17,9 @@ from cortexpilot_orch.config import load_config from cortexpilot_orch.contract.compiler import build_role_binding_summary, sync_role_contract from cortexpilot_orch.observability.logger import log_event -from cortexpilot_orch.planning.intake import IntakeService +from cortexpilot_orch.planning.intake import IntakeService, _build_wave_plan, _build_worker_prompt_contracts from cortexpilot_orch.store.intake_store import IntakeStore +from cortexpilot_orch.store.run_store import RunStore _TRUTHY_VALUES = {"1", "true", "yes", "y", "on"} @@ -118,6 +119,55 @@ def _strip_intake_only_contract_fields(contract: dict[str, Any]) -> dict[str, An return sanitized +def _safe_read_intake_store_payload(store: object, method_name: str, intake_id: str) -> dict[str, Any]: + reader = getattr(store, method_name, None) + if not callable(reader): + return {} + try: + payload = reader(intake_id) + except Exception: + return {} + return payload if isinstance(payload, dict) else {} + + +def _persist_planning_artifacts_for_run( + *, + intake_id: str, + run_id: str, + runs_root: Path, +) -> list[str]: + intake_store = IntakeStore() + intake_payload = _safe_read_intake_store_payload(intake_store, "read_intake", intake_id) + response_payload = _safe_read_intake_store_payload(intake_store, "read_response", intake_id) + plan_bundle = response_payload.get("plan_bundle") if isinstance(response_payload.get("plan_bundle"), dict) else None + if not intake_payload or not isinstance(plan_bundle, dict): + return [] + + run_store = RunStore(runs_root=runs_root) + artifacts_to_write: list[tuple[str, Any]] = [ + ("planning_wave_plan.json", _build_wave_plan(plan_bundle)), + ("planning_worker_prompt_contracts.json", _build_worker_prompt_contracts(plan_bundle, intake_payload)), + ] + written: list[str] = [] + for filename, payload in artifacts_to_write: + if payload in ({}, [], None): + continue + run_store.write_artifact(run_id, filename, json.dumps(payload, ensure_ascii=False, indent=2)) + written.append(filename) + + if written: + run_store.append_event( + run_id, + { + "level": "INFO", + "event": "PLANNING_ARTIFACTS_WRITTEN", + "run_id": run_id, + "meta": {"intake_id": intake_id, "artifacts": written}, + }, + ) + return written + + def configure_pm_session_aggregation( *, runs_root_fn: Callable[[], Path], @@ -597,10 +647,16 @@ def _execute_in_background() -> None: ) IntakeStore().append_event(intake_id, {"event": "INTAKE_RUN", "run_id": run_id}) + planning_artifacts = _persist_planning_artifacts_for_run( + intake_id=intake_id, + run_id=run_id, + runs_root=runs_root, + ) return { "ok": True, "run_id": run_id, "contract_path": str(contract_path), "strict_acceptance": bool(runtime_options.get("strict_acceptance", False)), "role_binding_summary": build_role_binding_summary(contract), + "planning_artifacts": planning_artifacts, } diff --git a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py index 3f9ee3f..2d3999a 100644 --- a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py +++ b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py @@ -463,6 +463,116 @@ def execute_task(contract_path: Path, mock_mode: 
bool = False) -> str: assert observed_contract["runtime_options"]["strict_acceptance"] is True +def test_run_intake_persists_planning_artifacts_into_run_bundle(monkeypatch, tmp_path: Path) -> None: + runs_root = tmp_path / "runs" + runtime_contract_root = tmp_path / ".runtime-cache" / "cortexpilot" / "contracts" + intake_payload = { + "objective": "Ship one planning artifact bridge", + "constraints": ["truthful-public-surface"], + "search_queries": ["command tower planning artifact"], + } + response_payload = { + "plan_bundle": { + "bundle_id": "bundle-1", + "objective": "Ship one planning artifact bridge", + "owner_agent": {"role": "PM", "agent_id": "pm-1"}, + "plans": [ + { + "plan_id": "worker-1", + "assigned_agent": {"role": "WORKER", "agent_id": "worker-1"}, + "spec": "Persist the planning artifact into the run bundle.", + "allowed_paths": ["apps/orchestrator"], + "acceptance_tests": [{"name": "pytest", "cmd": "python3 -m pytest -q", "must_pass": True}], + "mcp_tool_set": ["codex"], + "required_outputs": [{"name": "task_result.json", "type": "report"}], + } + ], + } + } + intake_events: list[tuple[str, dict[str, object]]] = [] + + class _Store: + def append_event(self, intake_id: str, payload: dict[str, object]) -> None: + intake_events.append((intake_id, payload)) + + def read_intake(self, intake_id: str) -> dict[str, object]: + assert intake_id == "persist" + return intake_payload + + def read_response(self, intake_id: str) -> dict[str, object]: + assert intake_id == "persist" + return response_payload + + monkeypatch.setattr(helpers, "IntakeStore", lambda: _Store()) + monkeypatch.setattr( + helpers, + "load_config", + lambda: types.SimpleNamespace( + repo_root=tmp_path, + runs_root=runs_root, + contract_root=tmp_path / "contracts", + runtime_contract_root=runtime_contract_root, + ), + ) + + class _BuildOK: + def build_contract(self, intake_id: str) -> dict[str, object]: + assert intake_id == "persist" + return { + "task_id": "task-persist", + "owner_agent": {"role": "PM", "agent_id": "pm-1"}, + "assigned_agent": {"role": "WORKER", "agent_id": "worker-1"}, + "inputs": {"spec": "repro", "artifacts": []}, + "required_outputs": [{"name": "task_result.json", "type": "json", "acceptance": "ok"}], + "allowed_paths": ["apps/orchestrator"], + "forbidden_actions": [], + "acceptance_tests": [{"name": "pytest", "cmd": "python3 -m pytest -q", "must_pass": True}], + "tool_permissions": { + "filesystem": "workspace-write", + "shell": "on-request", + "network": "deny", + "mcp_tools": ["codex"], + }, + "mcp_tool_set": ["codex"], + "timeout_retry": {"timeout_sec": 60, "max_retries": 0, "retry_backoff_sec": 0}, + "rollback": {"strategy": "git_reset_hard", "baseline_ref": "HEAD"}, + "evidence_links": [], + "log_refs": {"run_id": "", "paths": {}}, + } + + class _Orchestrator: + @staticmethod + def execute_task(contract_path: Path, mock_mode: bool = False) -> str: + del mock_mode + payload = json.loads(contract_path.read_text(encoding="utf-8")) + store = RunStore(runs_root=runs_root) + run_id = store.create_run(str(payload.get("task_id") or "task")) + store.write_manifest(run_id, {"run_id": run_id, "task_id": payload.get("task_id"), "status": "RUNNING", "repo": {}}) + return run_id + + result = helpers.run_intake( + "persist", + {"mock": True}, + intake_service_cls=_BuildOK, + orchestration_service=_Orchestrator(), + error_detail_fn=lambda code: {"code": code}, + current_request_id_fn=lambda: "req-persist", + ) + + run_id = result["run_id"] + wave_plan = json.loads((runs_root / run_id / "artifacts" / 
"planning_wave_plan.json").read_text(encoding="utf-8")) + worker_contracts = json.loads( + (runs_root / run_id / "artifacts" / "planning_worker_prompt_contracts.json").read_text(encoding="utf-8") + ) + + assert result["planning_artifacts"] == ["planning_wave_plan.json", "planning_worker_prompt_contracts.json"] + assert wave_plan["wave_id"] == "bundle-1" + assert wave_plan["objective"] == "Ship one planning artifact bridge" + assert worker_contracts[0]["prompt_contract_id"] == "worker-1" + assert worker_contracts[0]["continuation_policy"]["on_blocked"] == "spawn_independent_temporary_unblock_task" + assert intake_events[-1] == ("persist", {"event": "INTAKE_RUN", "run_id": run_id}) + + def test_build_role_binding_summary_marks_skills_and_mcp_registry_refs_as_registry_backed() -> None: summary = build_role_binding_summary( { From cdfc07f880cdc1c9618bb068da127e6fc0ee596b Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:43:08 -0700 Subject: [PATCH 6/9] feat: persist planning artifacts for runs From b2cb869d65ad989d8c55de9128f2077823c62f5b Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:50:47 -0700 Subject: [PATCH 7/9] feat: register planning artifacts in run manifests --- .../api/main_pm_intake_helpers.py | 41 ++++++++++++++++++- .../scheduler/scheduler_bridge_contract.py | 34 +++++++++++++++ .../test_main_pm_intake_helpers_branches.py | 4 ++ .../tests/test_scheduler_bridge_runtime.py | 2 + 4 files changed, 80 insertions(+), 1 deletion(-) diff --git a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py index a670d4b..53a01c1 100644 --- a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py +++ b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py @@ -1,5 +1,6 @@ from __future__ import annotations +import hashlib import json import threading from datetime import datetime, timezone @@ -119,6 +120,29 @@ def _strip_intake_only_contract_fields(contract: dict[str, Any]) -> dict[str, An return sanitized +def _artifact_ref_for_path(path: Path, *, rel_path: str, name: str, media_type: str = "application/json") -> dict[str, Any]: + payload = path.read_bytes() + return { + "name": name, + "path": rel_path, + "sha256": hashlib.sha256(payload).hexdigest(), + "media_type": media_type, + "size_bytes": len(payload), + } + + +def _append_manifest_artifact(manifest: dict[str, Any], ref: dict[str, Any]) -> None: + artifacts = manifest.get("artifacts") if isinstance(manifest.get("artifacts"), list) else [] + key = (str(ref.get("name") or ""), str(ref.get("path") or "")) + for item in artifacts: + if not isinstance(item, dict): + continue + if (str(item.get("name") or ""), str(item.get("path") or "")) == key: + return + artifacts.append(ref) + manifest["artifacts"] = artifacts + + def _safe_read_intake_store_payload(store: object, method_name: str, intake_id: str) -> dict[str, Any]: reader = getattr(store, method_name, None) if not callable(reader): @@ -144,18 +168,33 @@ def _persist_planning_artifacts_for_run( return [] run_store = RunStore(runs_root=runs_root) + run_dir = run_store.run_dir(run_id) artifacts_to_write: list[tuple[str, Any]] = [ ("planning_wave_plan.json", _build_wave_plan(plan_bundle)), ("planning_worker_prompt_contracts.json", _build_worker_prompt_contracts(plan_bundle, intake_payload)), ] written: list[str] = [] + 
artifact_refs: list[dict[str, Any]] = [] for filename, payload in artifacts_to_write: if payload in ({}, [], None): continue - run_store.write_artifact(run_id, filename, json.dumps(payload, ensure_ascii=False, indent=2)) + artifact_path = run_store.write_artifact(run_id, filename, json.dumps(payload, ensure_ascii=False, indent=2)) written.append(filename) + artifact_refs.append( + _artifact_ref_for_path( + artifact_path, + rel_path=f"artifacts/{filename}", + name=filename.removesuffix(".json"), + ) + ) if written: + manifest_path = run_dir / "manifest.json" + manifest = _read_json_file(manifest_path) + if manifest: + for ref in artifact_refs: + _append_manifest_artifact(manifest, ref) + manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8") run_store.append_event( run_id, { diff --git a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py index dbf16ea..6d16354 100644 --- a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py +++ b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Callable +import hashlib import json from pathlib import Path from typing import Any @@ -9,6 +10,29 @@ from cortexpilot_orch.store.run_store import RunStore +def _artifact_ref_for_path(path: Path, *, rel_path: str, name: str, media_type: str = "application/json") -> dict[str, Any]: + payload = path.read_bytes() + return { + "name": name, + "path": rel_path, + "sha256": hashlib.sha256(payload).hexdigest(), + "media_type": media_type, + "size_bytes": len(payload), + } + + +def _append_manifest_artifact(manifest: dict[str, Any], ref: dict[str, Any]) -> None: + artifacts = manifest.get("artifacts") if isinstance(manifest.get("artifacts"), list) else [] + key = (str(ref.get("name") or ""), str(ref.get("path") or "")) + for item in artifacts: + if not isinstance(item, dict): + continue + if (str(item.get("name") or ""), str(item.get("path") or "")) == key: + return + artifacts.append(ref) + manifest["artifacts"] = artifacts + + class ContractStateWriter: def __init__( self, @@ -215,6 +239,16 @@ def persist_contract_state( "prompt_artifact.json", json.dumps(prompt_artifact, ensure_ascii=False, indent=2), ) + if manifest is not None: + _append_manifest_artifact( + manifest, + _artifact_ref_for_path( + prompt_artifact_path, + rel_path="artifacts/prompt_artifact.json", + name="prompt_artifact", + ), + ) + write_manifest_fn(store, run_id, manifest) store.append_event( run_id, { diff --git a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py index 2d3999a..6499250 100644 --- a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py +++ b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py @@ -564,12 +564,16 @@ def execute_task(contract_path: Path, mock_mode: bool = False) -> str: worker_contracts = json.loads( (runs_root / run_id / "artifacts" / "planning_worker_prompt_contracts.json").read_text(encoding="utf-8") ) + manifest = json.loads((runs_root / run_id / "manifest.json").read_text(encoding="utf-8")) assert result["planning_artifacts"] == ["planning_wave_plan.json", "planning_worker_prompt_contracts.json"] assert wave_plan["wave_id"] == "bundle-1" assert wave_plan["objective"] == "Ship one planning artifact bridge" assert 
worker_contracts[0]["prompt_contract_id"] == "worker-1" assert worker_contracts[0]["continuation_policy"]["on_blocked"] == "spawn_independent_temporary_unblock_task" + artifact_names = [item["name"] for item in manifest["artifacts"]] + assert "planning_wave_plan" in artifact_names + assert "planning_worker_prompt_contracts" in artifact_names assert intake_events[-1] == ("persist", {"event": "INTAKE_RUN", "run_id": run_id}) diff --git a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py index faac59d..0fe1c0f 100644 --- a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py +++ b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py @@ -163,3 +163,5 @@ def test_persist_contract_state_writes_role_binding_summary_to_manifest(tmp_path assert prompt_artifact["run_id"] == run_id assert prompt_artifact["task_id"] == "task-role-binding-summary" assert prompt_artifact["role_binding_summary"] == build_role_binding_summary(contract) + artifact_names = [item["name"] for item in written["artifacts"]] + assert "prompt_artifact" in artifact_names From 700b6bf83c96c2463c8b6497d30ba1d3048316d4 Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:59:42 -0700 Subject: [PATCH 8/9] fix: stabilize hosted run ordering --- .../cortexpilot_orch/services/control_plane_read_service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py index f89e4bd..16ef84b 100644 --- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py +++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py @@ -202,7 +202,11 @@ def _list_runs_runtime() -> list[dict[str, Any]]: continue run_dirs.append((run_dir, manifest_record, _run_sort_ts(run_dir, manifest_record))) - for run_dir, manifest_record, _sort_ts in sorted(run_dirs, key=lambda item: item[2], reverse=True): + for run_dir, manifest_record, _sort_ts in sorted( + run_dirs, + key=lambda item: (item[2], item[0].name), + reverse=True, + ): run_id = _as_text(manifest_record.get("run_id")) or run_dir.name payload = dict(manifest_record) payload["run_id"] = run_id From 5de7f4d6a03d3796b8a7119127b4e20caa7b22ae Mon Sep 17 00:00:00 2001 From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com> Date: Sun, 12 Apr 2026 15:18:16 -0700 Subject: [PATCH 9/9] fix: drop unused control-plane test import --- apps/orchestrator/tests/test_control_plane_read_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/orchestrator/tests/test_control_plane_read_service.py b/apps/orchestrator/tests/test_control_plane_read_service.py index c075ebf..0ccc00f 100644 --- a/apps/orchestrator/tests/test_control_plane_read_service.py +++ b/apps/orchestrator/tests/test_control_plane_read_service.py @@ -2,7 +2,7 @@ import json import sys -from datetime import datetime, timezone +from datetime import datetime from pathlib import Path from types import SimpleNamespace from types import ModuleType