From 5d8fa5d2d09ad06652be7607c04a494a27579b4a Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 13:52:16 -0700
Subject: [PATCH 1/9] fix: harden truth and coverage gates
---
README.md | 9 +
apps/dashboard/vitest.config.mts | 4 +-
apps/desktop/scripts/playwright-tempdir.mjs | 2 +-
.../src/components/chain/ChainPanel.test.tsx | 8 +-
.../copilot/DesktopCopilotPanel.test.tsx | 98 ++++++
.../desktop/src/hooks/useDesktopData.test.tsx | 22 +-
apps/desktop/src/lib/desktopUi.test.ts | 17 +-
apps/desktop/src/lib/uiError.test.ts | 10 +-
.../src/pages/AgentsRoleConfigPanel.test.tsx | 174 +++++++++
apps/desktop/src/pages/EventsPage.test.tsx | 4 +-
apps/desktop/src/pages/ReviewsPage.test.tsx | 6 +-
apps/desktop/src/pages/TestsPage.test.tsx | 7 +-
.../pages/coverage_sprint_f_pages.test.tsx | 8 +-
.../pages/workflow_queue_controls.test.tsx | 2 +-
.../src/cortexpilot_orch/gates/tests_gate.py | 53 ++-
.../services/control_plane_read_service.py | 17 +-
.../tests/test_bench_e2e_speed_gate.py | 100 ++++++
.../tests/test_control_plane_read_service.py | 330 ++++++++++++++++++
.../test_mcp_queue_pilot_server_branches.py | 132 +++++++
.../tests/test_repo_coverage_gate.py | 40 +++
.../tests/test_tests_gate_extended.py | 57 +++
configs/env.registry.json | 36 ++
.../storefront/benchmark-methodology.md | 18 +
package.json | 7 +-
scripts/check_bench_e2e_speed_gate.py | 135 +++++++
scripts/repo_coverage_gate.py | 44 ++-
26 files changed, 1272 insertions(+), 68 deletions(-)
create mode 100644 apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx
create mode 100644 apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx
create mode 100644 apps/orchestrator/tests/test_bench_e2e_speed_gate.py
create mode 100644 apps/orchestrator/tests/test_control_plane_read_service.py
create mode 100644 apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py
create mode 100644 apps/orchestrator/tests/test_repo_coverage_gate.py
create mode 100644 scripts/check_bench_e2e_speed_gate.py
diff --git a/README.md b/README.md
index 8ba5279..9126d8c 100644
--- a/README.md
+++ b/README.md
@@ -526,6 +526,8 @@ Default local verification path:
npm run ci
npm run test:quick
npm run test
+npm run mutation:gate
+npm run bench:e2e:speed:gate
```
`npm run ci` is now the hosted-aligned local fast gate. Use
@@ -533,6 +535,13 @@ npm run test
`npm run scan:workflow-security`, `npm run scan:trivy`, and
`npm run security:scan:closeout` only when you intentionally want the stricter
closeout/manual layers.
+`npm run mutation:gate` is the root mutation entrypoint for the existing
+Orchestrator mutation profiles. `npm run bench:e2e:speed:gate` is the
+fail-closed benchmark gate that evaluates a real benchmark summary once a
+run has produced one. `npm run coverage:repo` now points to the active
+coverage runner, which prepares subproject dependencies before generating
+fresh repo-level coverage receipts. Use `npm run coverage:repo:aggregate`
+only when you intentionally want to re-aggregate already-existing coverage
+artifacts.
Current CI contract has five layers only:
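For orientation, here is a minimal sketch of the fail-closed contract that `npm run bench:e2e:speed:gate` wraps, inferred from the tests added later in this patch (`test_bench_e2e_speed_gate.py`); the function name, threshold plumbing, and summary layout are illustrative assumptions, not the shipped `scripts/check_bench_e2e_speed_gate.py` verbatim:

```python
# Hedged sketch of the fail-closed benchmark gate, inferred from the tests
# in this patch; names and plumbing are illustrative, not the real script.
import json
import sys
from pathlib import Path


def run_gate(summary_path: Path, max_fail_rate: float, max_p95_sec: dict[str, float]) -> int:
    if not summary_path.is_file():
        # Fail closed: a missing summary is its own failure mode, not a pass.
        print("benchmark summary not found", file=sys.stderr)
        return 2
    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    failures: list[str] = []
    fail_rate = float(summary.get("overall", {}).get("fail_rate", 1.0))
    if fail_rate > max_fail_rate:
        failures.append(
            f"overall.fail_rate={fail_rate:.4f} > max_overall_fail_rate={max_fail_rate:.4f}"
        )
    for suite, limit in max_p95_sec.items():
        p95 = float(
            summary.get("suites", {}).get(suite, {}).get("duration_sec", {}).get("p95", float("inf"))
        )
        if p95 > limit:
            failures.append(f"{suite}.p95={p95:.3f}s > max_p95={limit:.3f}s")
    if failures:
        print("\n".join(failures), file=sys.stderr)
        return 1
    print("benchmark gate passed")
    return 0
```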
diff --git a/apps/dashboard/vitest.config.mts b/apps/dashboard/vitest.config.mts
index bc44f9e..adaab2e 100644
--- a/apps/dashboard/vitest.config.mts
+++ b/apps/dashboard/vitest.config.mts
@@ -27,7 +27,9 @@ if (pool !== requestedPool) {
}
const shouldEmitHtmlCoverage = !process.env.CI || process.env.CORTEXPILOT_COVERAGE_HTML === "1";
const coverageReporter = shouldEmitHtmlCoverage ? ["text", "html", "json-summary"] : ["text", "json-summary"];
-const coverageReportsDirectory = path.resolve(process.cwd(), "coverage");
+const coverageReportsDirectory = process.env.CORTEXPILOT_DASHBOARD_COVERAGE_DIR
+ ? path.resolve(process.env.CORTEXPILOT_DASHBOARD_COVERAGE_DIR)
+ : path.resolve(process.cwd(), "coverage");
const coverageClean = !serialCoverageMode;
const coverageProcessingConcurrency = serialCoverageMode ? 1 : undefined;
const testTimeout = process.env.CI ? 45000 : 15000;
diff --git a/apps/desktop/scripts/playwright-tempdir.mjs b/apps/desktop/scripts/playwright-tempdir.mjs
index 3d5be06..7fa4c67 100644
--- a/apps/desktop/scripts/playwright-tempdir.mjs
+++ b/apps/desktop/scripts/playwright-tempdir.mjs
@@ -14,7 +14,7 @@ function sanitizeScope(scope) {
function resolveTempRoot(scriptDir) {
const runnerTemp = normalizeValue(process.env.RUNNER_TEMP);
if (runnerTemp) return resolve(runnerTemp);
- return resolve(scriptDir, "..", "..", "..", ".runtime-cache", "temp");
+ return resolve(scriptDir, "..", "..", "..", ".runtime-cache", "cache", "tmp");
}
export function configurePlaywrightTempDir(scope) {
diff --git a/apps/desktop/src/components/chain/ChainPanel.test.tsx b/apps/desktop/src/components/chain/ChainPanel.test.tsx
index 73e5dd4..25191f0 100644
--- a/apps/desktop/src/components/chain/ChainPanel.test.tsx
+++ b/apps/desktop/src/components/chain/ChainPanel.test.tsx
@@ -51,9 +51,9 @@ describe("ChainPanel", () => {
/>
);
- fireEvent.click(screen.getByRole("button", { name: "简洁视图" }));
- fireEvent.click(screen.getByRole("button", { name: "详细视图" }));
- fireEvent.click(screen.getByRole("button", { name: "Chain 优先" }));
+ fireEvent.click(screen.getByRole("button", { name: "Compact view" }));
+ fireEvent.click(screen.getByRole("button", { name: "Detailed view" }));
+ fireEvent.click(screen.getByRole("button", { name: "Chain first" }));
expect(setChainDisplayMode).toHaveBeenCalledWith("compact");
expect(setChainDisplayMode).toHaveBeenCalledWith("detail");
@@ -79,7 +79,7 @@ describe("ChainPanel", () => {
/>
);
- const legend = screen.getByLabelText("节点状态说明");
+ const legend = screen.getByLabelText("Node status legend");
const items = legend.querySelectorAll("li");
expect(items).toHaveLength(2);
expect(items[0]).toHaveClass("is-active");
diff --git a/apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx b/apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx
new file mode 100644
index 0000000..50e9059
--- /dev/null
+++ b/apps/desktop/src/components/copilot/DesktopCopilotPanel.test.tsx
@@ -0,0 +1,98 @@
+import { fireEvent, render, screen, waitFor } from "@testing-library/react";
+import { describe, expect, it, vi } from "vitest";
+
+import { DesktopCopilotPanel } from "./DesktopCopilotPanel";
+
+describe("DesktopCopilotPanel", () => {
+ it("renders operator-brief truth surfaces and grounded takeaways after generation", async () => {
+ const loadBrief = vi.fn().mockResolvedValue({
+ report_type: "operator_copilot_brief",
+ status: "AVAILABLE",
+ scope: "run_detail",
+ subject_id: "run-123",
+ summary: "The operator should compare the staged diff before accepting the run.",
+ likely_cause: "The last proof pack is stale.",
+ compare_takeaway: "Compare the staged diff against the last approved run.",
+ proof_takeaway: "Refresh the proof pack before asking for review.",
+ incident_takeaway: "Treat stale proof as an incident until it is re-generated.",
+ queue_takeaway: "Keep the queue paused until proof is current.",
+ approval_takeaway: "Approval should wait for a fresh proof receipt.",
+ used_truth_surfaces: ["run_detail", "", "proof_pack"],
+ limitations: ["review not started", " "],
+ recommended_actions: ["Refresh proof", "Request review", " "],
+ top_risks: ["stale-proof", "", "queue drift"],
+ });
+
+ render(
+      <DesktopCopilotPanel loadBrief={loadBrief} />,
+ );
+
+ expect(screen.getByText("Only grounded control-plane truth belongs here.")).toBeInTheDocument();
+ expect(screen.getByText("What is blocked?")).toBeInTheDocument();
+ expect(screen.getByText("What should the operator do next?")).toBeInTheDocument();
+ expect(screen.getByText("On demand")).toBeInTheDocument();
+
+ fireEvent.click(screen.getByRole("button", { name: "Generate operator brief" }));
+
+ expect(await screen.findByText("Grounded brief")).toBeInTheDocument();
+ expect(await screen.findByText("The operator should compare the staged diff before accepting the run.")).toBeInTheDocument();
+ expect(screen.getByText("The last proof pack is stale.")).toBeInTheDocument();
+ expect(screen.getByText("Scope: run_detail")).toBeInTheDocument();
+ expect(screen.getByText("Subject: run-123")).toBeInTheDocument();
+ expect(screen.getByText("Truth surfaces: run_detail | proof_pack")).toBeInTheDocument();
+ expect(screen.getByText("Limitations: review not started")).toBeInTheDocument();
+ expect(screen.getByText("Compare the staged diff against the last approved run.")).toBeInTheDocument();
+ expect(screen.getByText("Keep the queue paused until proof is current.")).toBeInTheDocument();
+ expect(screen.getByText("Refresh proof")).toBeInTheDocument();
+ expect(screen.getByText("queue drift")).toBeInTheDocument();
+ expect(screen.getByRole("button", { name: "Regenerate brief" })).toBeInTheDocument();
+
+ expect(loadBrief).toHaveBeenCalledTimes(1);
+ });
+
+ it("covers flight-plan fallback labels and empty action/risk lists", async () => {
+ const loadBrief = vi.fn().mockResolvedValue({
+ report_type: "flight_plan_copilot_brief",
+ status: "UNAVAILABLE",
+ summary: "The plan is still advisory because execution has not started yet.",
+ risk_takeaway: "Approval is still blocked on a missing operator confirmation.",
+ capability_takeaway: "Runtime capability is unresolved until the runner binds.",
+ approval_takeaway: "An operator must confirm the start gate before execution.",
+ used_truth_surfaces: ["execution_plan_preview"],
+ recommended_actions: ["", " "],
+ top_risks: [],
+ limitations: undefined,
+ });
+
+    render(<DesktopCopilotPanel loadBrief={loadBrief} />);
+
+ fireEvent.click(screen.getByRole("button", { name: "Generate operator brief" }));
+
+ expect(await screen.findByText("Unavailable")).toBeInTheDocument();
+ expect(screen.getByText("Scope: flight_plan")).toBeInTheDocument();
+ expect(screen.getByText("Subject: execution_plan_report")).toBeInTheDocument();
+ expect(screen.getByText("Truth surfaces: execution_plan_preview")).toBeInTheDocument();
+ expect(screen.getByText("Limitations: -")).toBeInTheDocument();
+ expect(screen.getAllByText("Approval is still blocked on a missing operator confirmation.").length).toBeGreaterThan(0);
+ expect(screen.getByText("This brief stays advisory until a run actually starts.")).toBeInTheDocument();
+ expect(screen.getByText("No recommended actions were returned.")).toBeInTheDocument();
+ expect(screen.getByText("No explicit risks were returned.")).toBeInTheDocument();
+ });
+
+ it("surfaces load failures without leaving the panel in generating state", async () => {
+ const loadBrief = vi.fn().mockRejectedValue("brief backend unavailable");
+
+    render(<DesktopCopilotPanel loadBrief={loadBrief} />);
+
+ fireEvent.click(screen.getByRole("button", { name: "Generate operator brief" }));
+
+ expect(await screen.findByText("brief backend unavailable")).toBeInTheDocument();
+ await waitFor(() => {
+ expect(screen.getByRole("button", { name: "Generate operator brief" })).toBeEnabled();
+ });
+ });
+});
diff --git a/apps/desktop/src/hooks/useDesktopData.test.tsx b/apps/desktop/src/hooks/useDesktopData.test.tsx
index 6d5ea46..df8e113 100644
--- a/apps/desktop/src/hooks/useDesktopData.test.tsx
+++ b/apps/desktop/src/hooks/useDesktopData.test.tsx
@@ -104,7 +104,9 @@ describe("useDesktopData", () => {
const user = userEvent.setup();
render();
await waitFor(() => {
- expect(screen.getByTestId("live-error")).toHaveTextContent("总览数据拉取失败");
+ expect(screen.getByTestId("live-error")).toHaveTextContent(
+ "Failed to refresh overview data: the service is temporarily unavailable. Try again in a moment.",
+ );
});
overviewFail = false;
@@ -131,7 +133,9 @@ describe("useDesktopData", () => {
);
render();
await waitFor(() => {
- expect(screen.getByTestId("live-error")).toHaveTextContent("会话列表拉取失败");
+ expect(screen.getByTestId("live-error")).toHaveTextContent(
+ "Failed to refresh the session list: the service is temporarily unavailable. Try again in a moment.",
+ );
});
});
@@ -154,7 +158,9 @@ describe("useDesktopData", () => {
);
render();
await waitFor(() => {
- expect(screen.getByTestId("live-error")).toHaveTextContent("后端暂不可达,已进入退避重试");
+ expect(screen.getByTestId("live-error")).toHaveTextContent(
+ "The backend is currently unreachable. Backoff retry is active and local actions can continue.",
+ );
});
});
@@ -233,7 +239,9 @@ describe("useDesktopData", () => {
try {
render();
await waitFor(() => {
- expect(screen.getByTestId("live-error")).toHaveTextContent("当前网络离线,已暂停实时拉取。恢复联网后将自动重试。");
+ expect(screen.getByTestId("live-error")).toHaveTextContent(
+ "The network is offline. Live polling is paused and will retry automatically when connectivity returns.",
+ );
});
} finally {
Object.defineProperty(window.navigator, "onLine", { configurable: true, value: originalOnLine });
@@ -260,7 +268,9 @@ describe("useDesktopData", () => {
render();
await waitFor(() => {
- expect(screen.getByTestId("live-error")).toHaveTextContent("会话列表拉取失败:权限或认证异常,请确认登录状态。");
+ expect(screen.getByTestId("live-error")).toHaveTextContent(
+ "Failed to refresh the session list: authentication or permission check failed. Confirm your sign-in state.",
+ );
});
});
@@ -380,7 +390,7 @@ describe("useDesktopData", () => {
try {
render();
await waitFor(() => {
- expect(screen.getByTestId("live-error")).toHaveTextContent("策略告警拉取失败");
+ expect(screen.getByTestId("live-error")).toHaveTextContent("Failed to refresh policy alerts");
});
expect(consoleSpy).toHaveBeenCalled();
} finally {
diff --git a/apps/desktop/src/lib/desktopUi.test.ts b/apps/desktop/src/lib/desktopUi.test.ts
index c3700f9..375fea2 100644
--- a/apps/desktop/src/lib/desktopUi.test.ts
+++ b/apps/desktop/src/lib/desktopUi.test.ts
@@ -49,7 +49,7 @@ describe("desktopUi seed timeline", () => {
),
);
- fireEvent.click(screen.getByRole("button", { name: "查看完整 Diff" }));
+ fireEvent.click(screen.getByRole("button", { name: "View full diff" }));
expect(onViewDiff).toHaveBeenCalledWith("report-1");
});
@@ -108,8 +108,8 @@ describe("desktopUi seed timeline", () => {
render(createElement("div", null, renderChatEmbed(message as any, embed as any, chooseDecision)));
- expect(screen.getByText("推荐")).toBeInTheDocument();
- fireEvent.click(screen.getByRole("button", { name: "选择" }));
+ expect(screen.getByText("Recommended")).toBeInTheDocument();
+ fireEvent.click(screen.getByRole("button", { name: "Choose" }));
expect(chooseDecision).toHaveBeenCalledWith("msg-decision", "decision-1", "fast");
});
@@ -157,10 +157,11 @@ describe("desktopUi seed timeline", () => {
)
);
- expect(screen.getByText("任务:")).toBeInTheDocument();
- expect(screen.getAllByText("进行中")).toHaveLength(2);
- expect(screen.getByText("等待")).toBeInTheDocument();
- expect(screen.getByText("完成")).toBeInTheDocument();
- expect(screen.getByLabelText("警报卡片")).toHaveClass("is-critical");
+ expect(screen.getByText("Task:")).toBeInTheDocument();
+ expect(screen.getByText("进行中")).toBeInTheDocument();
+ expect(screen.getAllByText("In progress")).toHaveLength(1);
+ expect(screen.getByText("Waiting")).toBeInTheDocument();
+ expect(screen.getByText("Done")).toBeInTheDocument();
+ expect(screen.getByLabelText("Alert card")).toHaveClass("is-critical");
});
});
diff --git a/apps/desktop/src/lib/uiError.test.ts b/apps/desktop/src/lib/uiError.test.ts
index 1232157..9e7bd56 100644
--- a/apps/desktop/src/lib/uiError.test.ts
+++ b/apps/desktop/src/lib/uiError.test.ts
@@ -7,13 +7,13 @@ describe("uiError", () => {
});
it("maps network-style messages", () => {
- expect(sanitizeUiError(new Error("Network timeout"), "加载失败")).toContain("未连接到本地服务");
- expect(sanitizeUiError(new Error("fetch failed"), "加载失败")).toContain("未连接到本地服务");
+ expect(sanitizeUiError(new Error("Network timeout"), "Load failed")).toContain("unable to reach the local service");
+ expect(sanitizeUiError(new Error("fetch failed"), "Load failed")).toContain("unable to reach the local service");
});
it("maps auth-style messages", () => {
- expect(sanitizeUiError(new Error("401 unauthorized"), "加载失败")).toContain("权限或认证异常");
- expect(sanitizeUiError(new Error("token invalid"), "加载失败")).toContain("权限或认证异常");
+ expect(sanitizeUiError(new Error("401 unauthorized"), "Load failed")).toContain("authentication or permission check failed");
+ expect(sanitizeUiError(new Error("token invalid"), "Load failed")).toContain("authentication or permission check failed");
});
it("keeps generic fallback for unknown errors", () => {
@@ -21,7 +21,7 @@ describe("uiError", () => {
});
it("maps backend 5xx-style messages", () => {
- expect(sanitizeUiError(new Error("API /path failed: 503"), "加载失败")).toContain("服务暂时不可用");
+ expect(sanitizeUiError(new Error("API /path failed: 503"), "Load failed")).toContain("service is temporarily unavailable");
});
it("extracts detail from unknown payload", () => {
diff --git a/apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx b/apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx
new file mode 100644
index 0000000..e41e723
--- /dev/null
+++ b/apps/desktop/src/pages/AgentsRoleConfigPanel.test.tsx
@@ -0,0 +1,174 @@
+import { fireEvent, render, screen, waitFor } from "@testing-library/react";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+import { AgentsRoleConfigPanel } from "./AgentsRoleConfigPanel";
+
+vi.mock("../lib/api", () => ({
+ applyRoleConfig: vi.fn(),
+ fetchRoleConfig: vi.fn(),
+ mutationExecutionCapability: vi.fn(() => ({ executable: false, operatorRole: null })),
+ previewRoleConfig: vi.fn(),
+}));
+
+import { applyRoleConfig, fetchRoleConfig, mutationExecutionCapability, previewRoleConfig } from "../lib/api";
+
+function makeSurface(overrides: Record<string, unknown> = {}) {
+ return {
+ persisted_source: "policies/role_config_registry.json",
+ execution_authority: "task_contract",
+ editable_now: {
+ system_prompt_ref: "policies/agents/codex/roles/20_planner_core.md",
+ skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner",
+ mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools",
+ runtime_binding: {
+ runner: "agents",
+ provider: "cliproxyapi",
+ model: "gpt-5.4",
+ },
+ },
+ ...overrides,
+ } as any;
+}
+
+describe("AgentsRoleConfigPanel", () => {
+ beforeEach(() => {
+ vi.clearAllMocks();
+ vi.mocked(mutationExecutionCapability).mockReturnValue({ executable: false, operatorRole: null } as any);
+ });
+
+ it("shows the empty-state desk when no roles are available", () => {
+    render(<AgentsRoleConfigPanel roles={[]} />);
+
+ expect(screen.getByRole("heading", { name: "Role configuration desk" })).toBeInTheDocument();
+ expect(screen.getByText("No registered roles are available for configuration yet.")).toBeInTheDocument();
+ });
+
+ it("supports preview mode and reports role-load failures when switching roles", async () => {
+ let resolveFirstFetch: (value: any) => void = () => {};
+ vi.mocked(fetchRoleConfig)
+ .mockImplementationOnce(() => new Promise((resolve) => {
+ resolveFirstFetch = resolve;
+ }) as any)
+ .mockRejectedValueOnce("role config fetch failed");
+ vi.mocked(previewRoleConfig).mockResolvedValue({
+ changes: [
+ { field: "runtime_binding.runner", current: "agents", next: "codex" },
+ ],
+ preview_surface: {
+ runtime_capability: {
+ lane: "tool-capable-provider",
+ tool_execution: "available",
+ },
+ },
+ } as any);
+
+ render(
+      <AgentsRoleConfigPanel roles={["PLANNER", "REVIEWER"]} />,
+ );
+
+ expect(screen.getByText("Loading role configuration…")).toBeInTheDocument();
+
+ resolveFirstFetch(makeSurface());
+ expect(await screen.findByText("Drive wave planning")).toBeInTheDocument();
+ expect(screen.getByText("Preview only")).toBeInTheDocument();
+ expect(screen.getByText("Preview is available, but saving defaults requires an operator role.")).toBeInTheDocument();
+
+ fireEvent.change(screen.getByLabelText("Runtime runner"), { target: { value: "codex" } });
+ fireEvent.click(screen.getByRole("button", { name: "Preview defaults" }));
+
+ await waitFor(() => {
+ expect(previewRoleConfig).toHaveBeenCalledWith("PLANNER", {
+ system_prompt_ref: "policies/agents/codex/roles/20_planner_core.md",
+ skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner",
+ mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools",
+ runtime_binding: {
+ runner: "codex",
+ provider: "cliproxyapi",
+ model: "gpt-5.4",
+ },
+ });
+ });
+ await waitFor(() => {
+ expect(screen.getAllByText("Runtime runner").length).toBeGreaterThan(0);
+ });
+ expect(screen.getByText("agents → codex")).toBeInTheDocument();
+ expect(screen.getByText("tool-capable-provider")).toBeInTheDocument();
+
+ fireEvent.change(screen.getByLabelText("Select role for role configuration"), { target: { value: "REVIEWER" } });
+
+ expect(await screen.findByText("role config fetch failed")).toBeInTheDocument();
+ expect(screen.getByText("No role purpose published yet.")).toBeInTheDocument();
+ });
+
+ it("applies repo defaults when mutation execution is enabled", async () => {
+ const onApplied = vi.fn().mockResolvedValue(undefined);
+
+ vi.mocked(fetchRoleConfig).mockResolvedValue(makeSurface());
+ vi.mocked(mutationExecutionCapability).mockReturnValue({ executable: true, operatorRole: "OPS" } as any);
+ vi.mocked(previewRoleConfig).mockResolvedValue({
+ changes: [],
+ preview_surface: {
+ runtime_capability: {
+ lane: "standard-provider-path",
+ tool_execution: "provider-path-required",
+ },
+ },
+ } as any);
+ vi.mocked(applyRoleConfig).mockResolvedValue({
+ role: "PLANNER",
+ surface: makeSurface({
+ editable_now: {
+ system_prompt_ref: "policies/agents/codex/roles/30_ops.md",
+ skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner",
+ mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools",
+ runtime_binding: {
+ runner: "codex",
+ provider: null,
+ model: null,
+ },
+ },
+ }),
+ } as any);
+
+ render(
+      <AgentsRoleConfigPanel roles={["PLANNER"]} onApplied={onApplied} />,
+ );
+
+ expect(await screen.findByText("Apply enabled for OPS")).toBeInTheDocument();
+
+ fireEvent.change(screen.getByLabelText("System prompt ref"), {
+ target: { value: " policies/agents/codex/roles/30_ops.md " },
+ });
+ fireEvent.change(screen.getByLabelText("Runtime runner"), { target: { value: "codex" } });
+ fireEvent.change(screen.getByLabelText("Runtime provider"), { target: { value: " " } });
+ fireEvent.change(screen.getByLabelText("Runtime model"), { target: { value: "" } });
+
+ fireEvent.click(screen.getByRole("button", { name: "Save repo defaults" }));
+
+ await waitFor(() => {
+ expect(applyRoleConfig).toHaveBeenCalledWith("PLANNER", {
+ system_prompt_ref: "policies/agents/codex/roles/30_ops.md",
+ skills_bundle_ref: "policies/skills_bundle_registry.json#bundles.planner",
+ mcp_bundle_ref: "policies/agent_registry.json#agents(role=PLANNER).capabilities.mcp_tools",
+ runtime_binding: {
+ runner: "codex",
+ provider: null,
+ model: null,
+ },
+ });
+ });
+
+ expect(await screen.findByText("Saved repo-owned defaults for PLANNER.")).toBeInTheDocument();
+ expect(onApplied).toHaveBeenCalledTimes(1);
+ expect(screen.getByText("codex / Not set / Not set")).toBeInTheDocument();
+ });
+});
diff --git a/apps/desktop/src/pages/EventsPage.test.tsx b/apps/desktop/src/pages/EventsPage.test.tsx
index b4b21bb..f640fdc 100644
--- a/apps/desktop/src/pages/EventsPage.test.tsx
+++ b/apps/desktop/src/pages/EventsPage.test.tsx
@@ -30,7 +30,7 @@ describe("EventsPage", () => {
const user = userEvent.setup();
render(<EventsPage />);
- const rowToggle = await screen.findByRole("button", { name: "查看事件详情 TEST_EVENT" });
+ const rowToggle = await screen.findByRole("button", { name: "View event details TEST_EVENT" });
expect(rowToggle).toHaveAttribute("aria-expanded", "false");
await user.click(rowToggle);
@@ -57,7 +57,7 @@ describe("EventsPage", () => {
const user = userEvent.setup();
render(<EventsPage />);
- const rowToggle = await screen.findByRole("button", { name: "查看事件详情 KEYBOARD_EVENT" });
+ const rowToggle = await screen.findByRole("button", { name: "View event details KEYBOARD_EVENT" });
rowToggle.focus();
await user.keyboard("{Enter}");
expect(rowToggle).toHaveAttribute("aria-expanded", "true");
diff --git a/apps/desktop/src/pages/ReviewsPage.test.tsx b/apps/desktop/src/pages/ReviewsPage.test.tsx
index ec4eb65..31ea985 100644
--- a/apps/desktop/src/pages/ReviewsPage.test.tsx
+++ b/apps/desktop/src/pages/ReviewsPage.test.tsx
@@ -33,10 +33,10 @@ describe("ReviewsPage", () => {
] as any);
const user = userEvent.setup();
render(<ReviewsPage />);
- expect(screen.getByRole("button", { name: "刷新中..." })).toBeDisabled();
+ expect(screen.getByRole("button", { name: "Refreshing..." })).toBeDisabled();
resolveFirstFetch([]);
- expect(await screen.findByText("暂无评审记录")).toBeInTheDocument();
- await user.click(screen.getByRole("button", { name: "刷新" }));
+ expect(await screen.findByText("No review records yet")).toBeInTheDocument();
+ await user.click(screen.getByRole("button", { name: "Refresh" }));
expect(await screen.findByText("run-1")).toBeInTheDocument();
expect(screen.getByText("looks good")).toBeInTheDocument();
expect(screen.getByText("Scope: ok")).toBeInTheDocument();
diff --git a/apps/desktop/src/pages/TestsPage.test.tsx b/apps/desktop/src/pages/TestsPage.test.tsx
index dcb0c48..e440dc1 100644
--- a/apps/desktop/src/pages/TestsPage.test.tsx
+++ b/apps/desktop/src/pages/TestsPage.test.tsx
@@ -13,6 +13,7 @@ import { fetchTests } from "../lib/api";
describe("TestsPage", () => {
beforeEach(() => {
vi.clearAllMocks();
+ vi.mocked(fetchTests).mockReset();
});
it("renders empty state and then status cards after refresh", async () => {
@@ -25,11 +26,11 @@ describe("TestsPage", () => {
command: "pnpm test",
failure_info: "snapshot mismatch",
},
- ] as any);
+ ] as any);
const user = userEvent.setup();
render(<TestsPage />);
- expect(await screen.findByText("暂无测试记录")).toBeInTheDocument();
- await user.click(screen.getByRole("button", { name: "刷新" }));
+ expect(await screen.findByText("No test records yet")).toBeInTheDocument();
+ await user.click(screen.getByRole("button", { name: "Refresh" }));
expect(await screen.findByText("回归检查")).toBeInTheDocument();
expect(screen.getByText("pnpm test")).toBeInTheDocument();
expect(screen.getByText("snapshot mismatch")).toBeInTheDocument();
diff --git a/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx b/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx
index ed15b13..54de14c 100644
--- a/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx
+++ b/apps/desktop/src/pages/coverage_sprint_f_pages.test.tsx
@@ -125,13 +125,13 @@ describe("coverage sprint F: low-branch pages", () => {
locks: [],
role_catalog: [],
} as FirstAgentsPayload);
- expect(await screen.findByText(/活跃状态机|Active State Machines/)).toBeInTheDocument();
- expect(screen.getByText(/注册代理 \(1\)|Registered Agents \(1\)/)).toBeInTheDocument();
+ expect(await screen.findByText("Execution lane triage")).toBeInTheDocument();
+ expect(screen.getByText("Registered execution seats (expandable, 1 items)")).toBeInTheDocument();
expect(screen.getByText("run-12345678")).toBeInTheDocument();
fireEvent.click(screen.getByRole("button", { name: /刷新|Refresh/ }));
- expect(await screen.findByText(/暂无注册代理|No agents are registered yet/)).toBeInTheDocument();
- expect(screen.queryByText(/活跃状态机|Active state machines/)).not.toBeInTheDocument();
+ expect(await screen.findByText("No registered agents")).toBeInTheDocument();
+ expect(screen.queryByText("Execution lane triage")).not.toBeInTheDocument();
fireEvent.click(screen.getByRole("button", { name: /刷新|Refresh/ }));
const errorBanner = await screen.findByRole("alert");
diff --git a/apps/desktop/src/pages/workflow_queue_controls.test.tsx b/apps/desktop/src/pages/workflow_queue_controls.test.tsx
index f124b83..0453813 100644
--- a/apps/desktop/src/pages/workflow_queue_controls.test.tsx
+++ b/apps/desktop/src/pages/workflow_queue_controls.test.tsx
@@ -141,7 +141,7 @@ describe("workflow queue controls", () => {
}),
);
});
- expect(await screen.findByText("Queued task-queue.")).toBeInTheDocument();
+ expect(await screen.findByText("Queued task-queue. Refreshing the workflow view...")).toBeInTheDocument();
});
it("renders locale-aware workflow detail labels when zh-CN is requested", async () => {
diff --git a/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py b/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py
index 7753920..0fccf98 100644
--- a/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py
+++ b/apps/orchestrator/src/cortexpilot_orch/gates/tests_gate.py
@@ -25,6 +25,7 @@
"true",
":",
}
+_TRIVIAL_ECHO_PAYLOADS = {"", "ok", "hello", "pass", "success", "done", "1"}
def _coerce_timeout_sec(raw: object) -> float:
@@ -38,6 +39,16 @@ def _coerce_timeout_sec(raw: object) -> float:
return timeout_sec
+def _coerce_gate_result(gate: object) -> dict[str, object]:
+ if isinstance(gate, dict):
+ return gate
+ return {
+ "ok": False,
+ "reason": "invalid validate_command result",
+ "raw": repr(gate),
+ }
+
+
def _now_ts() -> str:
return datetime.now(timezone.utc).isoformat()
@@ -145,9 +156,13 @@ def _is_trivial_acceptance_command(command: str) -> bool:
return True
if normalized in _TRIVIAL_ACCEPTANCE_COMMANDS:
return True
- if normalized.startswith("echo "):
- payload = normalized[5:].strip().strip('"').strip("'")
- if payload in {"", "ok", "hello", "pass", "success", "done", "1"}:
+ try:
+ tokens = shlex.split(command)
+ except ValueError:
+ return False
+ if tokens and tokens[0].lower() == "echo":
+ payload = " ".join(tokens[1:]).strip().lower()
+ if payload in _TRIVIAL_ECHO_PAYLOADS:
return True
return False
@@ -167,7 +182,7 @@ def _normalize_tests(test_items: Iterable[object]) -> list[dict[str, object]]:
continue
if isinstance(item, dict):
cmd = item.get("cmd") or item.get("command")
- if isinstance(cmd, str) and cmd.strip():
+ if isinstance(cmd, str):
timeout_sec = _coerce_timeout_sec(item.get("timeout_sec", _DEFAULT_TIMEOUT_SEC))
normalized.append(
{
@@ -233,7 +248,7 @@ def run_acceptance_tests(
strict_nontrivial_enabled = (
bool(strict_nontrivial) if strict_nontrivial is not None else _is_strict_nontrivial_enabled()
)
- has_must_pass = any(bool(test.get("must_pass", True)) for test in normalized)
+ has_must_pass = any(_coerce_must_pass(test.get("must_pass", True)) for test in normalized)
if not has_must_pass:
finished_at = _now_ts()
report = _build_report(
@@ -277,12 +292,14 @@ def run_acceptance_tests(
"reason": "trivial acceptance command blocked",
}
- gate = validate_command(
- command,
- forbidden,
- network_policy=network_policy,
- policy_pack=policy_pack,
- repo_root=worktree_root,
+ gate = _coerce_gate_result(
+ validate_command(
+ command,
+ forbidden,
+ network_policy=network_policy,
+ policy_pack=policy_pack,
+ repo_root=worktree_root,
+ )
)
if not gate.get("ok", False):
finished_at = _now_ts()
@@ -416,12 +433,14 @@ def run_evals_gate(
relative = script_path.relative_to(repo_root)
command = f"bash {relative}"
- gate = validate_command(
- command,
- forbidden_actions or [],
- network_policy=network_policy,
- policy_pack=policy_pack,
- repo_root=repo_root,
+ gate = _coerce_gate_result(
+ validate_command(
+ command,
+ forbidden_actions or [],
+ network_policy=network_policy,
+ policy_pack=policy_pack,
+ repo_root=repo_root,
+ )
)
if not gate.get("ok", False):
return {"ok": False, "reason": "tool gate violation", "gate": gate, "command": command}
diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
index 7d20138..2a30a79 100644
--- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
+++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
@@ -2,6 +2,7 @@
from dataclasses import dataclass
from datetime import datetime, timezone
+import importlib
import json
from pathlib import Path
from typing import Any, Callable
@@ -40,9 +41,9 @@ class ControlPlaneReadService:
@classmethod
def from_api_main(cls) -> "ControlPlaneReadService":
- from cortexpilot_orch.api import main as api_main
- from cortexpilot_orch.api import main_state_store_helpers
- from cortexpilot_orch.queue import QueueStore
+ api_main = importlib.import_module("cortexpilot_orch.api.main")
+ main_state_store_helpers = importlib.import_module("cortexpilot_orch.api.main_state_store_helpers")
+ QueueStore = importlib.import_module("cortexpilot_orch.queue").QueueStore
def _list_workflows_readonly() -> list[dict[str, Any]]:
workflows = list(
@@ -135,11 +136,11 @@ def _list_queue_readonly(*, workflow_id: str | None = None, status: str | None =
@classmethod
def from_runtime(cls) -> "ControlPlaneReadService":
- from cortexpilot_orch.api import main_run_views_helpers
- from cortexpilot_orch.api import main_state_store_helpers
- from cortexpilot_orch.config import load_config
- from cortexpilot_orch.contract.compiler import build_role_binding_summary
- from cortexpilot_orch.queue import QueueStore
+ main_run_views_helpers = importlib.import_module("cortexpilot_orch.api.main_run_views_helpers")
+ main_state_store_helpers = importlib.import_module("cortexpilot_orch.api.main_state_store_helpers")
+ load_config = importlib.import_module("cortexpilot_orch.config").load_config
+ build_role_binding_summary = importlib.import_module("cortexpilot_orch.contract.compiler").build_role_binding_summary
+ QueueStore = importlib.import_module("cortexpilot_orch.queue").QueueStore
cfg = load_config()
runs_root = cfg.runs_root
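The switch from `from … import …` to `importlib.import_module` is what lets the new tests substitute stub modules: `import_module` resolves the dotted name through `sys.modules` at call time, whereas `from pkg import mod` can hand back the attribute already bound on the real parent package and bypass a stub. A small sketch of the pattern; the module path mirrors the tests, and `QueueStore = object` is a stand-in:

```python
# A stub registered under the dotted key in sys.modules wins when the
# service later calls importlib.import_module with the same name.
import importlib
import sys
from types import ModuleType

stub = ModuleType("cortexpilot_orch.queue")
stub.QueueStore = object  # stand-in for the real class
sys.modules["cortexpilot_orch.queue"] = stub

QueueStore = importlib.import_module("cortexpilot_orch.queue").QueueStore
assert QueueStore is stub.QueueStore
```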
diff --git a/apps/orchestrator/tests/test_bench_e2e_speed_gate.py b/apps/orchestrator/tests/test_bench_e2e_speed_gate.py
new file mode 100644
index 0000000..ddc246a
--- /dev/null
+++ b/apps/orchestrator/tests/test_bench_e2e_speed_gate.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def _repo_root() -> Path:
+ return Path(__file__).resolve().parents[3]
+
+
+def _gate_script() -> Path:
+ return _repo_root() / "scripts" / "check_bench_e2e_speed_gate.py"
+
+
+def _write_summary(path: Path, *, overall_fail_rate: float, ui_p95: float, dash_p95: float) -> None:
+ path.write_text(
+ json.dumps(
+ {
+ "run_id": "bench_test",
+ "overall": {"fail_rate": overall_fail_rate},
+ "suites": {
+ "ui_full_gemini_strict": {"duration_sec": {"p95": ui_p95}},
+ "dashboard_high_risk_e2e": {"duration_sec": {"p95": dash_p95}},
+ },
+ }
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+
+
+def test_bench_gate_passes_for_summary_within_thresholds(tmp_path: Path) -> None:
+ summary = tmp_path / "summary.json"
+ _write_summary(summary, overall_fail_rate=0.0, ui_p95=90.0, dash_p95=45.0)
+
+ result = subprocess.run(
+ [sys.executable, str(_gate_script()), "--summary", str(summary), "--ui-max-p95-sec", "120", "--dash-max-p95-sec", "60"],
+ check=False,
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 0
+ assert "benchmark gate passed" in result.stdout
+
+
+def test_bench_gate_fails_when_overall_fail_rate_exceeds_threshold(tmp_path: Path) -> None:
+ summary = tmp_path / "summary.json"
+ _write_summary(summary, overall_fail_rate=0.2, ui_p95=90.0, dash_p95=45.0)
+
+ result = subprocess.run(
+ [sys.executable, str(_gate_script()), "--summary", str(summary), "--max-overall-fail-rate", "0.1"],
+ check=False,
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 1
+ assert "overall.fail_rate=0.2000 > max_overall_fail_rate=0.1000" in result.stderr
+
+
+def test_bench_gate_fails_when_suite_p95_exceeds_threshold(tmp_path: Path) -> None:
+ summary = tmp_path / "summary.json"
+ _write_summary(summary, overall_fail_rate=0.0, ui_p95=181.0, dash_p95=91.0)
+
+ result = subprocess.run(
+ [
+ sys.executable,
+ str(_gate_script()),
+ "--summary",
+ str(summary),
+ "--ui-max-p95-sec",
+ "180",
+ "--dash-max-p95-sec",
+ "90",
+ ],
+ check=False,
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 1
+ assert "ui_full_gemini_strict.p95=181.000s > max_p95=180.000s" in result.stderr
+ assert "dashboard_high_risk_e2e.p95=91.000s > max_p95=90.000s" in result.stderr
+
+
+def test_bench_gate_fails_closed_when_summary_is_missing(tmp_path: Path) -> None:
+ summary = tmp_path / "missing-summary.json"
+
+ result = subprocess.run(
+ [sys.executable, str(_gate_script()), "--summary", str(summary)],
+ check=False,
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 2
+ assert "benchmark summary not found" in result.stderr
diff --git a/apps/orchestrator/tests/test_control_plane_read_service.py b/apps/orchestrator/tests/test_control_plane_read_service.py
new file mode 100644
index 0000000..c075ebf
--- /dev/null
+++ b/apps/orchestrator/tests/test_control_plane_read_service.py
@@ -0,0 +1,330 @@
+from __future__ import annotations
+
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from types import ModuleType, SimpleNamespace
+
+import pytest
+
+from cortexpilot_orch.services.control_plane_read_service import (
+ ControlPlaneReadService,
+ _as_array,
+ _as_record,
+ _as_text,
+ _find_report,
+)
+
+
+def _write_json(path: Path, payload: object) -> None:
+ path.parent.mkdir(parents=True, exist_ok=True)
+ path.write_text(json.dumps(payload), encoding="utf-8")
+
+
+def test_control_plane_read_service_wrapper_filters_and_summary_helpers() -> None:
+ service = ControlPlaneReadService(
+ list_runs_fn=lambda: [{"run_id": "run-1"}],
+ get_run_fn=lambda run_id: {"run_id": run_id},
+ get_events_fn=lambda run_id: [{"run_id": run_id, "event": "RUN_UPDATED"}],
+ get_reports_fn=lambda run_id: [
+ {"name": "run_compare_report.json", "data": {"compare_summary": {"mismatched_count": 2}}},
+ {"name": "proof_pack.json", "data": {"summary": "proof-ready"}},
+ {"name": "incident_pack.json", "data": "not-a-record"},
+ ]
+ if run_id == "run-1"
+ else "not-a-list",
+ list_workflows_fn=lambda: [{"workflow_id": "wf-1"}],
+ get_workflow_fn=lambda workflow_id: {"workflow": {"workflow_id": workflow_id}, "runs": [], "events": []},
+ list_queue_fn=lambda **_: [{"queue_id": "queue-1"}],
+ list_pending_approvals_fn=lambda: [
+ {"run_id": "run-1", "status": "pending"},
+ {"run_id": "run-2", "status": "pending"},
+ ],
+ list_diff_gate_fn=lambda: [
+ {"run_id": "run-1", "status": "FAILED"},
+ {"run_id": "run-2", "status": "PASS"},
+ ],
+ )
+
+ assert _as_record({"ok": True}) == {"ok": True}
+ assert _as_record("bad") == {}
+ assert _as_array([1, 2]) == [1, 2]
+ assert _as_array("bad") == []
+ assert _as_text(" run-1 ") == "run-1"
+ assert _find_report([{"name": "proof_pack.json", "data": {"summary": "ready"}}], "proof_pack.json") == {
+ "summary": "ready"
+ }
+ assert _find_report([{"name": "proof_pack.json", "data": "bad"}], "proof_pack.json") == {}
+
+ assert service.list_runs() == [{"run_id": "run-1"}]
+ assert service.get_run("run-9") == {"run_id": "run-9"}
+ assert service.get_run_events("run-9") == [{"run_id": "run-9", "event": "RUN_UPDATED"}]
+ assert service.get_run_reports("run-2") == []
+ assert service.list_workflows() == [{"workflow_id": "wf-1"}]
+ assert service.get_workflow("wf-1") == {"workflow": {"workflow_id": "wf-1"}, "runs": [], "events": []}
+ assert service.list_queue(workflow_id="wf-1", status="pending") == [{"queue_id": "queue-1"}]
+ assert service.get_pending_approvals() == [
+ {"run_id": "run-1", "status": "pending"},
+ {"run_id": "run-2", "status": "pending"},
+ ]
+ assert service.get_pending_approvals(run_id="run-1") == [{"run_id": "run-1", "status": "pending"}]
+ assert service.get_diff_gate_state() == [
+ {"run_id": "run-1", "status": "FAILED"},
+ {"run_id": "run-2", "status": "PASS"},
+ ]
+ assert service.get_diff_gate_state(run_id="run-2") == [{"run_id": "run-2", "status": "PASS"}]
+ assert service.get_compare_summary("run-1") == {"mismatched_count": 2}
+ assert service.get_proof_summary("run-1") == {"summary": "proof-ready"}
+ assert service.get_incident_summary("run-1") == {}
+
+
+def test_control_plane_read_service_from_api_main_builds_workflows_and_queue_filters(monkeypatch) -> None:
+ event_map = {
+ "run-b": [
+ {"event": "WORKFLOW_STATUS", "ts": "2026-04-12T10:00:00Z", "context": {"workflow_id": "wf-1"}},
+ {"event": "IGNORED", "context": {"workflow_id": "wf-2"}},
+ ],
+ "run-a": [
+ {"event": "WORKFLOW_BOUND", "ts": "2026-04-11T10:00:00Z"},
+ {"event": "CUSTOM", "_ts": "2026-04-11T09:00:00Z", "context": {"workflow_id": "wf-1"}},
+ ],
+ }
+
+ workflows = {
+ "wf-1": {
+ "workflow_id": "wf-1",
+ "runs": [
+ {"run_id": "run-a", "created_at": "2026-04-11T08:00:00Z"},
+ {"run_id": "run-b", "created_at": "2026-04-12T08:00:00Z"},
+ ],
+ },
+ "wf-2": {"workflow_id": "wf-2", "runs": [{"run_id": "run-z", "created_at": "broken-ts"}]},
+ }
+
+ class _FakeQueueStore:
+ def __init__(self, *, ensure_storage: bool = False) -> None:
+ self.ensure_storage = ensure_storage
+
+ def list_items(self) -> list[dict[str, str]]:
+ return [
+ {"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"},
+ {"queue_id": "queue-2", "workflow_id": "wf-2", "status": "DONE"},
+ ]
+
+ api_main = ModuleType("cortexpilot_orch.api.main")
+ api_main.load_config = lambda: SimpleNamespace(runs_root=Path("/tmp/runs"), runtime_root=Path("/tmp/runtime"))
+ api_main._read_events = lambda run_id: event_map.get(run_id, [])
+ api_main._parse_iso_ts = lambda value: datetime.fromisoformat(value.replace("Z", "+00:00"))
+ api_main.list_runs = lambda: [{"run_id": "api-run"}]
+ api_main.get_run = lambda run_id: {"run_id": run_id, "source": "api"}
+ api_main.get_events = lambda run_id: event_map.get(run_id, [])
+ api_main.get_reports = lambda run_id: [{"name": "proof_pack.json", "data": {"run_id": run_id}}]
+ api_main.list_pending_approvals = lambda: [{"run_id": "run-a"}]
+ api_main.list_diff_gate = lambda: [{"run_id": "run-b", "status": "FAILED"}]
+
+ main_state_store_helpers = ModuleType("cortexpilot_orch.api.main_state_store_helpers")
+ main_state_store_helpers.collect_workflows = lambda **_: workflows
+
+ queue_module = ModuleType("cortexpilot_orch.queue")
+ queue_module.QueueStore = _FakeQueueStore
+
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main", api_main)
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main_state_store_helpers", main_state_store_helpers)
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.queue", queue_module)
+
+ service = ControlPlaneReadService.from_api_main()
+
+ assert service.list_runs() == [{"run_id": "api-run"}]
+ assert service.get_run("run-a") == {"run_id": "run-a", "source": "api"}
+ assert [item["workflow_id"] for item in service.list_workflows()] == ["wf-1", "wf-2"]
+
+ workflow_payload = service.get_workflow("wf-1")
+ assert [event["_run_id"] for event in workflow_payload["events"]] == ["run-b", "run-a", "run-a"]
+ assert workflow_payload["runs"][0]["run_id"] == "run-a"
+ with pytest.raises(KeyError, match="workflow `missing` not found"):
+ service.get_workflow("missing")
+
+ assert service.list_queue(workflow_id="wf-1") == [{"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"}]
+ assert service.list_queue(status="done") == [{"queue_id": "queue-2", "workflow_id": "wf-2", "status": "DONE"}]
+ assert service.get_pending_approvals() == [{"run_id": "run-a"}]
+ assert service.get_diff_gate_state(run_id="run-b") == [{"run_id": "run-b", "status": "FAILED"}]
+
+
+def test_control_plane_read_service_from_runtime_builds_runtime_views_and_pending_approvals(
+ monkeypatch, tmp_path: Path
+) -> None:
+ runtime_root = tmp_path / "runtime"
+ runs_root = runtime_root / "runs"
+ run_a = runs_root / "run-a"
+ run_b = runs_root / "run-b"
+ run_skip = runs_root / "run-skip"
+
+ _write_json(
+ run_a / "manifest.json",
+ {
+ "run_id": "run-a",
+ "task_id": "task-a",
+ "status": "",
+ "role_binding_summary": {"source": "persisted"},
+ },
+ )
+ _write_json(
+ run_a / "contract.json",
+ {
+ "task_id": "contract-task-a",
+ "allowed_paths": ["apps/orchestrator"],
+ },
+ )
+ _write_json(run_a / "reports" / "proof_pack.json", {"summary": "proof-a"})
+ _write_json(run_a / "reports" / "run_compare_report.json", {"compare_summary": {"mismatched_count": 3}})
+
+ _write_json(
+ run_b / "manifest.json",
+ {
+ "status": "SUCCESS",
+ },
+ )
+ _write_json(
+ run_b / "contract.json",
+ {
+ "task_id": "contract-task-b",
+ "allowed_paths": "not-a-list",
+ },
+ )
+ _write_json(run_b / "reports" / "incident_pack.json", {"summary": "incident-b"})
+
+ run_skip.mkdir(parents=True, exist_ok=True)
+ (run_skip / "manifest.json").write_text("{bad json", encoding="utf-8")
+
+ run_a.touch()
+ run_b.touch()
+ run_skip.touch()
+ (run_a / "manifest.json").touch()
+ (run_b / "manifest.json").touch()
+ (run_skip / "manifest.json").touch()
+
+ event_map = {
+ "run-a": [
+ {"event": "WORKFLOW_BOUND", "ts": "2026-04-12T10:00:00Z"},
+ {
+ "event": "HUMAN_APPROVAL_REQUIRED",
+ "ts": "2026-04-12T10:01:00Z",
+ "context": {
+ "reason": ["owner review"],
+ "actions": ["approve"],
+ "verify_steps": ["pytest"],
+ "resume_step": "resume-from-review",
+ "workflow_id": "wf-1",
+ },
+ },
+ {"event": "CUSTOM", "_ts": "2026-04-12T10:02:00Z", "context": {"workflow_id": "wf-1"}},
+ ],
+ "run-b": [
+ {"event": "HUMAN_APPROVAL_REQUIRED", "ts": "2026-04-11T09:00:00Z", "meta": {"workflow_id": "wf-2"}},
+ {"event": "HUMAN_APPROVAL_COMPLETED", "ts": "2026-04-11T09:05:00Z"},
+ {"event": "TEMPORAL_NOTIFY_DONE", "ts": "2026-04-11T09:10:00Z"},
+ ],
+ "run-skip": [],
+ }
+
+ workflows = {
+ "wf-1": {
+ "workflow_id": "wf-1",
+ "runs": [
+ {"run_id": "run-b", "created_at": "2026-04-11T08:00:00Z"},
+ {"run_id": "run-a", "created_at": "2026-04-12T08:00:00Z"},
+ ],
+ },
+ "wf-2": {"workflow_id": "wf-2", "runs": [{"run_id": "run-b", "created_at": "invalid"}]},
+ }
+
+ class _FakeQueueStore:
+ def __init__(self, *, ensure_storage: bool = False) -> None:
+ self.ensure_storage = ensure_storage
+
+ def list_items(self) -> list[dict[str, str]]:
+ return [
+ {"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"},
+ {"queue_id": "queue-2", "workflow_id": "wf-2", "status": "DONE"},
+ ]
+
+ config_module = ModuleType("cortexpilot_orch.config")
+ config_module.load_config = lambda: SimpleNamespace(runs_root=runs_root, runtime_root=runtime_root)
+
+ main_state_store_helpers = ModuleType("cortexpilot_orch.api.main_state_store_helpers")
+ main_state_store_helpers.read_events = lambda *, run_id, runs_root: event_map.get(run_id, [])
+ main_state_store_helpers.collect_workflows = lambda **_: workflows
+
+ main_run_views_helpers = ModuleType("cortexpilot_orch.api.main_run_views_helpers")
+ main_run_views_helpers.list_diff_gate = lambda **_: [
+ {"run_id": "run-a", "status": "FAILED"},
+ {"run_id": "run-b", "status": "PASS"},
+ ]
+
+ compiler_module = ModuleType("cortexpilot_orch.contract.compiler")
+ compiler_module.build_role_binding_summary = lambda contract: {
+ "source": "generated",
+ "task_id": contract.get("task_id"),
+ }
+
+ queue_module = ModuleType("cortexpilot_orch.queue")
+ queue_module.QueueStore = _FakeQueueStore
+
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.config", config_module)
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main_state_store_helpers", main_state_store_helpers)
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main_run_views_helpers", main_run_views_helpers)
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.contract.compiler", compiler_module)
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.queue", queue_module)
+
+ service = ControlPlaneReadService.from_runtime()
+
+ listed_runs = service.list_runs()
+ assert [item["run_id"] for item in listed_runs] == ["run-b", "run-a"]
+ assert listed_runs[1]["status"] == "UNKNOWN"
+ assert listed_runs[1]["last_event_ts"] == "2026-04-12T10:02:00Z"
+
+ runtime_run = service.get_run("run-a")
+ assert runtime_run["task_id"] == "task-a"
+ assert runtime_run["allowed_paths"] == ["apps/orchestrator"]
+ assert runtime_run["role_binding_read_model"] == {"source": "persisted"}
+
+ generated_run = service.get_run("run-b")
+ assert generated_run["task_id"] == "contract-task-b"
+ assert generated_run["allowed_paths"] == []
+ assert generated_run["role_binding_read_model"] == {"source": "generated", "task_id": "contract-task-b"}
+ with pytest.raises(KeyError, match="run `missing` not found"):
+ service.get_run("missing")
+
+ assert service.get_run_reports("run-a") == [
+ {"name": "proof_pack.json", "data": {"summary": "proof-a"}},
+ {"name": "run_compare_report.json", "data": {"compare_summary": {"mismatched_count": 3}}},
+ ]
+ assert [item["workflow_id"] for item in service.list_workflows()] == ["wf-1", "wf-2"]
+
+ workflow_payload = service.get_workflow("wf-1")
+ assert [event["_run_id"] for event in workflow_payload["events"]] == ["run-a", "run-a", "run-a", "run-b"]
+ with pytest.raises(KeyError, match="workflow `missing` not found"):
+ service.get_workflow("missing")
+
+ assert service.list_queue(workflow_id="wf-1", status="pending") == [
+ {"queue_id": "queue-1", "workflow_id": "wf-1", "status": "PENDING"}
+ ]
+ assert service.get_pending_approvals() == [
+ {
+ "run_id": "run-a",
+ "status": "pending",
+ "task_id": "task-a",
+ "failure_reason": "",
+ "reason": ["owner review"],
+ "actions": ["approve"],
+ "verify_steps": ["pytest"],
+ "resume_step": "resume-from-review",
+ }
+ ]
+ assert service.get_pending_approvals(run_id="run-a")[0]["run_id"] == "run-a"
+ assert service.get_diff_gate_state(run_id="run-a") == [{"run_id": "run-a", "status": "FAILED"}]
+ assert service.get_compare_summary("run-a") == {"mismatched_count": 3}
+ assert service.get_proof_summary("run-a") == {"summary": "proof-a"}
+ assert service.get_incident_summary("run-b") == {"summary": "incident-b"}
diff --git a/apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py b/apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py
new file mode 100644
index 0000000..d210814
--- /dev/null
+++ b/apps/orchestrator/tests/test_mcp_queue_pilot_server_branches.py
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+from dataclasses import replace
+import io
+import sys
+from types import ModuleType
+
+from cortexpilot_orch import mcp_queue_pilot_server as queue_pilot_module
+
+
+def test_mcp_queue_pilot_helpers_and_protocol_edges(monkeypatch) -> None:
+ monkeypatch.setenv("CORTEXPILOT_APPROVAL_ALLOWED_ROLES", " owner , ops ")
+
+ assert queue_pilot_module._mutation_roles() == {"OWNER", "OPS"}
+ assert queue_pilot_module._required_role_arg({"actor_role": "owner"}) == "OWNER"
+ assert queue_pilot_module._queue_payload(
+ {"priority": 3, "scheduled_at": " 2026-04-12T09:00:00Z ", "deadline_at": " "}
+ ) == {
+ "priority": 3,
+ "scheduled_at": "2026-04-12T09:00:00Z",
+ }
+ assert queue_pilot_module._error_response(7, -32601, "boom") == {
+ "jsonrpc": "2.0",
+ "id": 7,
+ "error": {"code": -32601, "message": "boom"},
+ }
+
+
+def test_mcp_queue_pilot_server_covers_default_constructor_unknown_methods_and_stream(monkeypatch) -> None:
+ captured: list[tuple[str, dict[str, object]]] = []
+
+ def _preview(run_id: str, payload: dict[str, object]) -> dict[str, object]:
+ captured.append((run_id, payload))
+ return {
+ "run_id": run_id,
+ "validation": "ok",
+ "can_apply": True,
+ "preview_item": {"queue_id": "preview-1"},
+ }
+
+ def _apply(run_id: str, payload: dict[str, object]) -> dict[str, object]:
+ return {"queue_id": f"{run_id}-queue", "task_id": "task-1", "status": "PENDING"}
+
+ api_main = ModuleType("cortexpilot_orch.api.main")
+ api_main.preview_enqueue_run_queue = _preview
+ api_main.enqueue_run_queue = _apply
+ monkeypatch.setitem(sys.modules, "cortexpilot_orch.api.main", api_main)
+
+ server = queue_pilot_module.CortexPilotQueuePilotMcpServer()
+
+ assert server.handle_message({"jsonrpc": "2.0", "method": "initialized"}) is None
+ assert server.handle_message({"jsonrpc": "2.0", "id": 1, "method": "ping"}) == {
+ "jsonrpc": "2.0",
+ "id": 1,
+ "result": {},
+ }
+ init_response = server.handle_message({"jsonrpc": "2.0", "id": 2, "method": "initialize"})
+ assert init_response is not None
+ assert init_response["result"]["serverInfo"]["name"] == "cortexpilot-queue-pilot"
+
+ alias_list = server.handle_message({"jsonrpc": "2.0", "id": 3, "method": "tooling/list"})
+ assert alias_list is not None
+ assert {tool["name"] for tool in alias_list["result"]["tools"]} == {
+ "preview_enqueue_from_run",
+ "enqueue_from_run",
+ }
+
+ unknown_tool = server.handle_message(
+ {
+ "jsonrpc": "2.0",
+ "id": 4,
+ "method": "tools/call",
+ "params": {"name": "missing_tool", "arguments": {}},
+ }
+ )
+ assert unknown_tool == {
+ "jsonrpc": "2.0",
+ "id": 4,
+ "error": {"code": -32601, "message": "unknown tool `missing_tool`"},
+ }
+
+ missing_run_id = server.handle_message(
+ {
+ "jsonrpc": "2.0",
+ "id": 5,
+ "method": "tools/call",
+ "params": {"name": "preview_enqueue_from_run", "arguments": {}},
+ }
+ )
+ assert missing_run_id is not None
+ assert missing_run_id["result"]["isError"] is True
+ assert "`run_id` is required" in missing_run_id["result"]["structuredContent"]["error"]
+
+ broken_tool = replace(
+ server._tool_map["preview_enqueue_from_run"],
+ handler=lambda arguments: (_ for _ in ()).throw(RuntimeError("preview exploded")),
+ )
+ server._tool_map["preview_enqueue_from_run"] = broken_tool
+ runtime_error = server.handle_message(
+ {
+ "jsonrpc": "2.0",
+ "id": 6,
+ "method": "tools/call",
+ "params": {"name": "preview_enqueue_from_run", "arguments": {"run_id": "run-9"}},
+ }
+ )
+ assert runtime_error is not None
+ assert runtime_error["result"]["isError"] is True
+ assert runtime_error["result"]["structuredContent"]["error"] == "preview exploded"
+
+ assert server.handle_message({"jsonrpc": "2.0", "method": "unsupported"}) is None
+ unsupported = server.handle_message({"jsonrpc": "2.0", "id": 7, "method": "unsupported"})
+ assert unsupported == {
+ "jsonrpc": "2.0",
+ "id": 7,
+ "error": {"code": -32601, "message": "method `unsupported` is not supported"},
+ }
+
+ source = io.StringIO('\nnot-json\n[]\n{"jsonrpc":"2.0","id":8,"method":"ping"}\n')
+ target = io.StringIO()
+ server.serve_forever(instream=source, outstream=target)
+ assert target.getvalue().strip() == '{"jsonrpc": "2.0", "id": 8, "result": {}}'
+
+ called = {"serve_forever": False}
+
+ class _FakeServer:
+ def serve_forever(self) -> None:
+ called["serve_forever"] = True
+
+ monkeypatch.setattr(queue_pilot_module, "CortexPilotQueuePilotMcpServer", _FakeServer)
+ queue_pilot_module.serve_queue_pilot_mcp()
+ assert called["serve_forever"] is True
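For reference, the JSON-RPC 2.0 error envelope these assertions pin down; a minimal sketch of the helper's shape as exercised by the test above (`-32601` is the spec's "method not found" code):

```python
# Minimal sketch of the error-envelope shape asserted by the test above.
def error_response(msg_id: object, code: int, message: str) -> dict:
    return {"jsonrpc": "2.0", "id": msg_id, "error": {"code": code, "message": message}}


assert error_response(7, -32601, "boom") == {
    "jsonrpc": "2.0",
    "id": 7,
    "error": {"code": -32601, "message": "boom"},
}
```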
diff --git a/apps/orchestrator/tests/test_repo_coverage_gate.py b/apps/orchestrator/tests/test_repo_coverage_gate.py
new file mode 100644
index 0000000..a08dd6c
--- /dev/null
+++ b/apps/orchestrator/tests/test_repo_coverage_gate.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def _read_script() -> str:
+ script_path = Path(__file__).resolve().parents[3] / "scripts" / "repo_coverage_gate.py"
+ return script_path.read_text(encoding="utf-8")
+
+
+def test_dashboard_coverage_installs_deps_before_vitest() -> None:
+ text = _read_script()
+ install_idx = text.index('run_command(["bash", "scripts/install_dashboard_deps.sh"])')
+ vitest_idx = text.index('"pnpm",\n "--dir",\n "apps/dashboard",\n "exec",\n "vitest"')
+ assert install_idx < vitest_idx
+
+
+def test_desktop_coverage_installs_deps_before_vitest() -> None:
+ text = _read_script()
+ install_idx = text.index('run_command(["bash", "scripts/install_desktop_deps.sh"])')
+ vitest_idx = text.index('"pnpm",\n "--dir",\n "apps/desktop",\n "exec",\n "vitest"')
+ assert install_idx < vitest_idx
+
+
+def test_orchestrator_coverage_uses_managed_coverage_file() -> None:
+ text = _read_script()
+ assert 'DEFAULT_COVERAGE_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "test" / "coverage" / "repo_coverage_gate"' in text
+ assert '"COVERAGE_FILE": str(coverage_file)' in text
+
+
+def test_dashboard_and_desktop_coverage_use_managed_report_dirs() -> None:
+ text = _read_script()
+ assert '"CORTEXPILOT_DASHBOARD_COVERAGE_DIR": str(report_path.parent)' in text
+ assert '"CORTEXPILOT_DESKTOP_COVERAGE_DIR": str(report_path.parent)' in text
+
+
+def test_orchestrator_coverage_uses_managed_hypothesis_storage() -> None:
+ text = _read_script()
+ assert 'DEFAULT_HYPOTHESIS_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "hypothesis" / "repo_coverage_gate"' in text
+ assert '"HYPOTHESIS_STORAGE_DIRECTORY": str(DEFAULT_HYPOTHESIS_DATA_DIR)' in text
diff --git a/apps/orchestrator/tests/test_tests_gate_extended.py b/apps/orchestrator/tests/test_tests_gate_extended.py
index e916320..535335a 100644
--- a/apps/orchestrator/tests/test_tests_gate_extended.py
+++ b/apps/orchestrator/tests/test_tests_gate_extended.py
@@ -42,6 +42,19 @@ def test_tests_gate_tool_gate_violation(tmp_path: Path, monkeypatch) -> None:
assert result["reason"] == "tool gate violation"
+def test_tests_gate_tool_gate_non_dict_result_fails_closed(tmp_path: Path, monkeypatch) -> None:
+ monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: False)
+
+ result = tests_gate.run_acceptance_tests(
+ tmp_path,
+ [{"name": "hygiene", "cmd": "bash scripts/check_repo_hygiene.sh", "must_pass": True}],
+ )
+
+ assert result["ok"] is False
+ assert result["reason"] == "tool gate violation"
+ assert result["gate"]["reason"] == "invalid validate_command result"
+
+
def test_tests_gate_invalid_shlex(tmp_path: Path, monkeypatch) -> None:
monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: {"ok": True})
result = tests_gate.run_acceptance_tests(tmp_path, ['echo "unterminated'])
@@ -131,6 +144,16 @@ def test_tests_gate_strict_nontrivial_blocks_echo_numeric_payload(tmp_path: Path
assert result["reason"] == "trivial acceptance command blocked"
+def test_tests_gate_strict_nontrivial_blocks_echo_whitespace_payload(tmp_path: Path, monkeypatch) -> None:
+ monkeypatch.setenv("CORTEXPILOT_ACCEPTANCE_STRICT_NONTRIVIAL", "1")
+ monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: {"ok": True})
+
+ result = tests_gate.run_acceptance_tests(tmp_path, ['echo " "'])
+
+ assert result["ok"] is False
+ assert result["reason"] == "trivial acceptance command blocked"
+
+
def test_is_trivial_acceptance_command_treats_whitespace_only_as_trivial() -> None:
assert tests_gate._is_trivial_acceptance_command(" \t \n ") is True
@@ -284,6 +307,16 @@ def test_tests_gate_rejects_when_all_acceptance_tests_are_not_must_pass(tmp_path
assert result["reason"] == "missing must_pass acceptance test"
+def test_tests_gate_blank_dict_command_reports_empty_command(tmp_path: Path) -> None:
+ result = tests_gate.run_acceptance_tests(
+ tmp_path,
+ [{"name": "blank", "cmd": " ", "must_pass": True}],
+ )
+
+ assert result["ok"] is False
+ assert result["reason"] == "empty command"
+
+
def test_run_evals_gate_blocks_tool_gate_violation(tmp_path: Path, monkeypatch) -> None:
repo_root = tmp_path / "repo"
worktree = repo_root / "worktree"
@@ -307,6 +340,30 @@ def _fake_run(*args, **kwargs):
assert called["run"] is False
+def test_run_evals_gate_non_dict_tool_gate_result_fails_closed(tmp_path: Path, monkeypatch) -> None:
+ repo_root = tmp_path / "repo"
+ worktree = repo_root / "worktree"
+ (repo_root / "scripts").mkdir(parents=True)
+ worktree.mkdir(parents=True)
+ (repo_root / "scripts" / "run_evals.sh").write_text("#!/usr/bin/env bash\necho evals\n", encoding="utf-8")
+
+ called: dict[str, bool] = {"run": False}
+
+ def _fake_run(*args, **kwargs):
+ called["run"] = True
+ return subprocess.CompletedProcess(args, 0, "ok", "")
+
+ monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: False)
+ _patch_tests_gate_subprocess(monkeypatch, _fake_run)
+
+ result = tests_gate.run_evals_gate(repo_root, worktree)
+
+ assert result["ok"] is False
+ assert result["reason"] == "tool gate violation"
+ assert result["gate"]["reason"] == "invalid validate_command result"
+ assert called["run"] is False
+
+
def test_tests_gate_coerces_string_must_pass_false(tmp_path: Path, monkeypatch) -> None:
monkeypatch.setattr(tests_gate, "validate_command", lambda *args, **kwargs: {"ok": True})
diff --git a/configs/env.registry.json b/configs/env.registry.json
index 492bd40..293cf2e 100644
--- a/configs/env.registry.json
+++ b/configs/env.registry.json
@@ -596,6 +596,42 @@
"scripts/lib/toolchain_env.sh"
]
},
+ {
+ "name": "CORTEXPILOT_BENCH_DASHBOARD_HIGH_RISK_E2E_MAX_P95_SEC",
+ "scope": "platform",
+ "secret": false,
+ "required": false,
+ "default": "90",
+ "owner": "platform",
+ "description": "Maximum allowed p95 latency, in seconds, for the dashboard_high_risk_e2e suite when benchmark summaries are evaluated by the fail-closed benchmark gate.",
+ "consumers": [
+ "scripts/check_bench_e2e_speed_gate.py"
+ ]
+ },
+ {
+ "name": "CORTEXPILOT_BENCH_MAX_FAIL_RATE",
+ "scope": "platform",
+ "secret": false,
+ "required": false,
+ "default": "0.05",
+ "owner": "platform",
+ "description": "Maximum allowed overall benchmark fail rate enforced by the fail-closed benchmark gate.",
+ "consumers": [
+ "scripts/check_bench_e2e_speed_gate.py"
+ ]
+ },
+ {
+ "name": "CORTEXPILOT_BENCH_UI_FULL_GEMINI_STRICT_MAX_P95_SEC",
+ "scope": "platform",
+ "secret": false,
+ "required": false,
+ "default": "180",
+ "owner": "platform",
+ "description": "Maximum allowed p95 latency, in seconds, for the ui_full_gemini_strict suite when benchmark summaries are evaluated by the fail-closed benchmark gate.",
+ "consumers": [
+ "scripts/check_bench_e2e_speed_gate.py"
+ ]
+ },
{
"name": "CORTEXPILOT_BROWSER_ALLOWLIST",
"scope": "platform",
diff --git a/docs/assets/storefront/benchmark-methodology.md b/docs/assets/storefront/benchmark-methodology.md
index 3b13f2d..b7f1f8f 100644
--- a/docs/assets/storefront/benchmark-methodology.md
+++ b/docs/assets/storefront/benchmark-methodology.md
@@ -8,6 +8,7 @@ inventing numbers.
- Benchmark execution tooling exists:
- `scripts/bench_e2e_speed.py`
- `scripts/bench_e2e_speed.sh`
+ - `scripts/check_bench_e2e_speed_gate.py`
- A first tracked public single-run baseline now exists at
`docs/releases/assets/news-digest-benchmark-summary-2026-03-27.md`.
- Broader multi-round public benchmark figures do **not** exist yet.
@@ -62,6 +63,23 @@ A tracked public benchmark artifact should include:
`.runtime-cache/`
- enough metadata to show which happy path was exercised
+## Gate Contract
+
+Once a real benchmark summary exists, the repo-owned fail-closed gate is:
+
+```bash
+npm run bench:e2e:speed:gate
+```
+
+Default thresholds are driven by:
+
+- `CORTEXPILOT_BENCH_MAX_FAIL_RATE`
+- `CORTEXPILOT_BENCH_UI_FULL_GEMINI_STRICT_MAX_P95_SEC`
+- `CORTEXPILOT_BENCH_DASHBOARD_HIGH_RISK_E2E_MAX_P95_SEC`
+
+The gate is intentionally strict about artifact presence: if no benchmark
+summary exists yet, it fails instead of inventing a baseline.
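+
+For reference, a minimal summary shape that the gate evaluates (field names
+match what `scripts/check_bench_e2e_speed_gate.py` reads; the numbers below
+are placeholders, not published results):
+
+```json
+{
+  "overall": {"fail_rate": 0.0},
+  "suites": {
+    "ui_full_gemini_strict": {"duration_sec": {"p95": 142.5}},
+    "dashboard_high_risk_e2e": {"duration_sec": {"p95": 61.2}}
+  }
+}
+```
+
+Thresholds can also be overridden per invocation via the script flags
+`--max-overall-fail-rate`, `--ui-max-p95-sec`, and `--dash-max-p95-sec`.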
+
## Anti-Fraud Rule
Do not copy raw numbers into README, release notes, or social posts unless they
diff --git a/package.json b/package.json
index 8770f1e..718d8b2 100644
--- a/package.json
+++ b/package.json
@@ -41,8 +41,10 @@
"test:smell": "bash scripts/test_smell_gate.sh",
"quality:full": "npm run lint && npm run test:smell && npm run test",
"quality:full:host": "npm run lint && npm run test:smell && npm run test:host",
- "coverage:repo": "bash scripts/run_governance_py.sh scripts/repo_coverage_aggregate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95}",
- "coverage:repo:gate": "bash scripts/run_governance_py.sh scripts/repo_coverage_aggregate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95} --enforce-gate",
+ "coverage:repo": "bash scripts/run_governance_py.sh scripts/repo_coverage_gate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95}",
+ "coverage:repo:gate": "bash scripts/run_governance_py.sh scripts/repo_coverage_gate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95} --enforce-gate",
+ "coverage:repo:aggregate": "bash scripts/run_governance_py.sh scripts/repo_coverage_aggregate.py --threshold ${CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD:-95}",
+ "mutation:gate": "bash scripts/mutation_gate.sh",
"test:quick": "bash scripts/docker_ci.sh test-quick",
"test:quick:host": "bash scripts/test_quick.sh",
"test:live:preflight": "${CORTEXPILOT_PYTHON:-python3} scripts/e2e_external_web_probe.py --url ${CORTEXPILOT_EXTERNAL_WEB_PROBE_URL:-https://example.com} --provider-api-mode ${CORTEXPILOT_EXTERNAL_WEB_PROBE_PROVIDER_API_MODE:-require} --hard-timeout-sec ${CORTEXPILOT_EXTERNAL_WEB_PROBE_HARD_TIMEOUT_SEC:-120}",
@@ -99,6 +101,7 @@
"bench:e2e:speed": "bash scripts/bench_e2e_speed.sh",
"bench:e2e:speed:dry-run": "bash scripts/bench_e2e_speed.sh --rounds 3 --ui-full-gemini-strict --dashboard-high-risk --dry-run",
"bench:e2e:speed:report-only": "bash scripts/bench_e2e_speed.sh --report-only",
+ "bench:e2e:speed:gate": "python3 scripts/check_bench_e2e_speed_gate.py",
"e2e:pm-chat": "bash scripts/e2e_pm_chat_command_tower_success.sh",
"e2e:pm-chat:real": "CORTEXPILOT_E2E_RUN_MODE=real CORTEXPILOT_E2E_RUNNER=agents CORTEXPILOT_E2E_REEXEC_STRICT=true bash scripts/e2e_pm_chat_command_tower_success.sh",
"ci": "bash scripts/ci_local_fast.sh",
diff --git a/scripts/check_bench_e2e_speed_gate.py b/scripts/check_bench_e2e_speed_gate.py
new file mode 100644
index 0000000..e82ad0b
--- /dev/null
+++ b/scripts/check_bench_e2e_speed_gate.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Fail-closed gate for benchmark summaries produced by scripts/bench_e2e_speed.py."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import os
+import sys
+from pathlib import Path
+from typing import Any
+
+
+ROOT = Path(__file__).resolve().parents[1]
+BENCH_ROOT = ROOT / ".runtime-cache" / "test_output" / "benchmarks"
+DEFAULT_MAX_FAIL_RATE = float(os.environ.get("CORTEXPILOT_BENCH_MAX_FAIL_RATE", "0.05"))
+DEFAULT_UI_MAX_P95_SEC = float(os.environ.get("CORTEXPILOT_BENCH_UI_FULL_GEMINI_STRICT_MAX_P95_SEC", "180"))
+DEFAULT_DASH_MAX_P95_SEC = float(os.environ.get("CORTEXPILOT_BENCH_DASHBOARD_HIGH_RISK_E2E_MAX_P95_SEC", "90"))
+
+
+def _find_latest_summary() -> Path | None:
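+    # Newest summary first: sort all */summary.json candidates by file mtime.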
+ candidates = sorted(BENCH_ROOT.glob("*/summary.json"), key=lambda path: path.stat().st_mtime, reverse=True)
+ return candidates[0] if candidates else None
+
+
+def _load_json(path: Path) -> dict[str, Any]:
+ try:
+ return json.loads(path.read_text(encoding="utf-8"))
+ except FileNotFoundError as exc:
+ raise FileNotFoundError(f"benchmark summary not found: {path}") from exc
+ except json.JSONDecodeError as exc:
+ raise ValueError(f"invalid JSON in benchmark summary {path}: {exc}") from exc
+
+
+def _to_float(value: Any, *, field: str) -> float:
+ try:
+ parsed = float(value)
+ except (TypeError, ValueError) as exc:
+ raise ValueError(f"invalid numeric field {field!r}: {value!r}") from exc
+ if not math.isfinite(parsed):
+ raise ValueError(f"non-finite numeric field {field!r}: {value!r}")
+ return parsed
+
+
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description="Fail-closed gate for benchmark summaries emitted by scripts/bench_e2e_speed.py."
+ )
+ parser.add_argument("--summary", default="", help="Explicit benchmark summary path. Defaults to the latest summary.json.")
+ parser.add_argument(
+ "--max-overall-fail-rate",
+ type=float,
+ default=DEFAULT_MAX_FAIL_RATE,
+ help="Maximum allowed overall fail_rate (default from CORTEXPILOT_BENCH_MAX_FAIL_RATE or 0.05).",
+ )
+ parser.add_argument(
+ "--ui-max-p95-sec",
+ type=float,
+ default=DEFAULT_UI_MAX_P95_SEC,
+ help="Maximum allowed p95 for ui_full_gemini_strict (default env or 180).",
+ )
+ parser.add_argument(
+ "--dash-max-p95-sec",
+ type=float,
+ default=DEFAULT_DASH_MAX_P95_SEC,
+ help="Maximum allowed p95 for dashboard_high_risk_e2e (default env or 90).",
+ )
+ return parser.parse_args()
+
+
+def main() -> int:
+ args = parse_args()
+ summary_path = Path(args.summary).expanduser().resolve() if args.summary else _find_latest_summary()
+ if summary_path is None:
+ print("❌ benchmark gate requires a benchmark summary; run `npm run bench:e2e:speed` first", file=sys.stderr)
+ return 2
+
+ try:
+ payload = _load_json(summary_path)
+ except (FileNotFoundError, ValueError) as exc:
+ print(f"❌ [bench-gate] {exc}", file=sys.stderr)
+ return 2
+ overall = payload.get("overall")
+ suites = payload.get("suites")
+ if not isinstance(overall, dict) or not isinstance(suites, dict):
+ print(f"❌ benchmark summary missing overall/suites maps: {summary_path}", file=sys.stderr)
+ return 2
+
+    failures: list[str] = []
+    # Fail closed with a clean message rather than an uncaught traceback when
+    # the summary carries a malformed fail_rate value.
+    try:
+        overall_fail_rate = _to_float(overall.get("fail_rate"), field="overall.fail_rate")
+    except ValueError as exc:
+        print(f"❌ [bench-gate] {exc}", file=sys.stderr)
+        return 2
+    if overall_fail_rate > args.max_overall_fail_rate:
+        failures.append(
+            f"overall.fail_rate={overall_fail_rate:.4f} > max_overall_fail_rate={args.max_overall_fail_rate:.4f}"
+        )
+
+ suite_thresholds = {
+ "ui_full_gemini_strict": args.ui_max_p95_sec,
+ "dashboard_high_risk_e2e": args.dash_max_p95_sec,
+ }
+ for suite_name, max_p95 in suite_thresholds.items():
+ if suite_name not in suites:
+ failures.append(f"missing suite in benchmark summary: {suite_name}")
+ continue
+ suite = suites[suite_name]
+ if not isinstance(suite, dict):
+ failures.append(f"invalid suite payload: {suite_name}")
+ continue
+ duration = suite.get("duration_sec")
+ if not isinstance(duration, dict):
+ failures.append(f"missing duration metrics for suite: {suite_name}")
+ continue
+        # Treat a malformed p95 as a gate failure for that suite instead of
+        # crashing the whole gate.
+        try:
+            p95 = _to_float(duration.get("p95"), field=f"{suite_name}.duration_sec.p95")
+        except ValueError as exc:
+            failures.append(str(exc))
+            continue
+        if p95 > max_p95:
+            failures.append(f"{suite_name}.p95={p95:.3f}s > max_p95={max_p95:.3f}s")
+
+ print(f"📄 [bench-gate] summary={summary_path}")
+ print(
+ "ℹ️ [bench-gate] thresholds: "
+ f"overall_fail_rate<={args.max_overall_fail_rate:.4f} "
+ f"ui_p95<={args.ui_max_p95_sec:.3f}s "
+ f"dashboard_p95<={args.dash_max_p95_sec:.3f}s"
+ )
+ if failures:
+ print("❌ [bench-gate] benchmark gate failed:", file=sys.stderr)
+ for failure in failures:
+ print(f" - {failure}", file=sys.stderr)
+ return 1
+
+ print("✅ [bench-gate] benchmark gate passed")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/repo_coverage_gate.py b/scripts/repo_coverage_gate.py
index e2bd0ed..2c058c4 100644
--- a/scripts/repo_coverage_gate.py
+++ b/scripts/repo_coverage_gate.py
@@ -25,6 +25,8 @@
DEFAULT_DESKTOP_REPORT = (
ROOT_DIR / ".runtime-cache" / "test_output" / "repo_coverage" / "desktop" / "coverage-summary.json"
)
+DEFAULT_COVERAGE_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "test" / "coverage" / "repo_coverage_gate"
+DEFAULT_HYPOTHESIS_DATA_DIR = ROOT_DIR / ".runtime-cache" / "cache" / "hypothesis" / "repo_coverage_gate"
DEFAULT_THRESHOLD = float(os.environ.get("CORTEXPILOT_REPO_COVERAGE_GATE_THRESHOLD", "95"))
@@ -178,8 +180,17 @@ def run_command(command: list[str], env_overrides: dict[str, str] | None = None)
raise RuntimeError(f"command failed (exit={result.returncode}): {' '.join(command)}")
+def _prepare_coverage_file(path: Path) -> None:
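+    # Clear the managed coverage data file and any parallel-mode shards
+    # (.coverage.*) so each gate run starts from a clean slate.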
+ path.parent.mkdir(parents=True, exist_ok=True)
+ path.unlink(missing_ok=True)
+ for stale_path in path.parent.glob(f"{path.name}.*"):
+ stale_path.unlink(missing_ok=True)
+
+
def run_orchestrator_coverage(report_path: Path, pytest_target: str, pytest_mark: str) -> None:
report_path.parent.mkdir(parents=True, exist_ok=True)
+ coverage_file = DEFAULT_COVERAGE_DATA_DIR / ".coverage"
+ _prepare_coverage_file(coverage_file)
override = os.getenv("CORTEXPILOT_PYTHON", "").strip()
toolchain_python = ROOT_DIR / ".runtime-cache" / "cache" / "toolchains" / "python" / "current" / "bin" / "python"
python_bin = Path(override) if override else toolchain_python
@@ -201,11 +212,22 @@ def run_orchestrator_coverage(report_path: Path, pytest_target: str, pytest_mark
f"--cov-report=json:{report_path}",
"--cov-fail-under=0",
]
- run_command(command, env_overrides={"PYTHONPATH": "apps/orchestrator/src"})
+ try:
+ run_command(
+ command,
+ env_overrides={
+ "PYTHONPATH": "apps/orchestrator/src",
+ "COVERAGE_FILE": str(coverage_file),
+ "HYPOTHESIS_STORAGE_DIRECTORY": str(DEFAULT_HYPOTHESIS_DATA_DIR),
+ },
+ )
+ finally:
+ _prepare_coverage_file(coverage_file)
def run_dashboard_coverage(report_path: Path, test_targets: list[str]) -> None:
report_path.parent.mkdir(parents=True, exist_ok=True)
+ run_command(["bash", "scripts/install_dashboard_deps.sh"])
command = [
"pnpm",
"--dir",
@@ -225,11 +247,19 @@ def run_dashboard_coverage(report_path: Path, test_targets: list[str]) -> None:
f"--coverage.reportsDirectory={report_path.parent}",
]
command.extend(test_targets)
- run_command(command, env_overrides={"CI": "1", "CORTEXPILOT_COVERAGE_HTML": "0"})
+ run_command(
+ command,
+ env_overrides={
+ "CI": "1",
+ "CORTEXPILOT_COVERAGE_HTML": "0",
+ "CORTEXPILOT_DASHBOARD_COVERAGE_DIR": str(report_path.parent),
+ },
+ )
def run_desktop_coverage(report_path: Path, test_targets: list[str]) -> None:
report_path.parent.mkdir(parents=True, exist_ok=True)
+ run_command(["bash", "scripts/install_desktop_deps.sh"])
command = [
"pnpm",
"--dir",
@@ -247,7 +277,15 @@ def run_desktop_coverage(report_path: Path, test_targets: list[str]) -> None:
f"--coverage.reportsDirectory={report_path.parent}",
]
command.extend(test_targets)
- run_command(command)
+ run_command(
+ command,
+ env_overrides={
+ "CI": "1",
+ "CORTEXPILOT_COVERAGE_HTML": "0",
+ "CORTEXPILOT_DESKTOP_COVERAGE_DIR": str(report_path.parent),
+ "CORTEXPILOT_DESKTOP_COVERAGE_RUN_ID": "repo-coverage-gate",
+ },
+ )
def aggregate_repo_totals(project_totals: dict[str, CoverageTotals]) -> CoverageTotals:
From a668fbbbbaa9c17c5044dfc47e55693601384792 Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:14:14 -0700
Subject: [PATCH 2/9] fix: stabilize runtime run ordering
---
.../services/control_plane_read_service.py | 20 ++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
index 2a30a79..74c00db 100644
--- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
+++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
@@ -178,13 +178,31 @@ def _last_event_ts(run_id: str) -> str:
return value
return ""
+ def _run_sort_ts(run_dir: Path, manifest_record: dict[str, Any]) -> float:
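+        # Prefer the manifest file's mtime; fall back to the manifest's
+        # created_at timestamp, then to the run directory's mtime.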
+ manifest_path = run_dir / "manifest.json"
+ if manifest_path.exists():
+ return manifest_path.stat().st_mtime
+ created_at = _as_text(manifest_record.get("created_at"))
+ if created_at:
+ try:
+ return _parse_iso_ts(created_at).timestamp()
+ except Exception:
+ pass
+ return run_dir.stat().st_mtime
+
def _list_runs_runtime() -> list[dict[str, Any]]:
results: list[dict[str, Any]] = []
- for run_dir in sorted(runs_root.glob("*"), key=lambda item: item.stat().st_mtime, reverse=True):
+ run_dirs = []
+ for run_dir in runs_root.glob("*"):
+ if not run_dir.is_dir():
+ continue
manifest = _read_json(run_dir / "manifest.json", {})
manifest_record = _as_record(manifest)
if not manifest_record:
continue
+ run_dirs.append((run_dir, manifest_record, _run_sort_ts(run_dir, manifest_record)))
+
+ for run_dir, manifest_record, _sort_ts in sorted(run_dirs, key=lambda item: item[2], reverse=True):
run_id = _as_text(manifest_record.get("run_id")) or run_dir.name
payload = dict(manifest_record)
payload["run_id"] = run_id
From 75d1cf1671aeed2e3d727695ac5bd9ce6de51afa Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:25:35 -0700
Subject: [PATCH 3/9] feat: persist prompt artifacts per run
---
.../src/cortexpilot_orch/contract/compiler.py | 47 +++++++++++++++++++
.../scheduler/scheduler_bridge_contract.py | 19 +++++++-
.../tests/test_scheduler_bridge_runtime.py | 8 ++++
docs/architecture/runtime-topology.md | 3 ++
4 files changed, 76 insertions(+), 1 deletion(-)
diff --git a/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py b/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py
index b992dbf..3c410de 100644
--- a/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py
+++ b/apps/orchestrator/src/cortexpilot_orch/contract/compiler.py
@@ -421,6 +421,53 @@ def build_role_binding_summary(contract: dict[str, Any]) -> dict[str, Any]:
}
+def build_prompt_artifact(
+ contract: dict[str, Any],
+ *,
+ run_id: str = "",
+ task_id: str = "",
+) -> dict[str, Any]:
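+    """Build a contract-derived, read-only prompt-artifact snapshot for a run."""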
+ role_contract = contract.get("role_contract") if isinstance(contract.get("role_contract"), dict) else {}
+ if not role_contract:
+ role_contract = _build_role_contract(contract, _load_agent_registry())
+ assigned_agent = contract.get("assigned_agent") if isinstance(contract.get("assigned_agent"), dict) else {}
+ role = str(
+ assigned_agent.get("role")
+ or (role_contract.get("identity", {}) if isinstance(role_contract.get("identity"), dict) else {}).get("role")
+ or "WORKER"
+ ).strip().upper() or "WORKER"
+ role_contract = _merge_role_config_defaults(
+ role_contract,
+ _find_role_config_defaults(_load_role_config_registry(), role),
+ )
+ identity = role_contract.get("identity") if isinstance(role_contract.get("identity"), dict) else {}
+ runtime_binding_raw = role_contract.get("runtime_binding") if isinstance(role_contract.get("runtime_binding"), dict) else {}
+ runtime_binding = {
+ "runner": _normalize_optional_ref(runtime_binding_raw.get("runner")),
+ "provider": _normalize_optional_ref(runtime_binding_raw.get("provider")),
+ "model": _normalize_optional_ref(runtime_binding_raw.get("model")),
+ }
+ resolved_task_id = str(task_id or contract.get("task_id") or "").strip()
+ return {
+ "artifact_type": "prompt_artifact",
+ "version": "v1",
+ "source": "contract-derived",
+ "execution_authority": "task_contract",
+ "run_id": str(run_id or "").strip(),
+ "task_id": resolved_task_id,
+ "assigned_agent": {
+ "role": role,
+ "agent_id": str(identity.get("agent_id") or assigned_agent.get("agent_id") or "").strip(),
+ },
+ "purpose": str(role_contract.get("purpose") or "").strip(),
+ "system_prompt_ref": _normalize_optional_ref(role_contract.get("system_prompt_ref")),
+ "skills_bundle_ref": _normalize_optional_ref(role_contract.get("skills_bundle_ref")),
+ "mcp_bundle_ref": _normalize_optional_ref(role_contract.get("mcp_bundle_ref")),
+ "runtime_binding": runtime_binding,
+ "role_binding_summary": build_role_binding_summary(contract),
+ }
+
+
def _build_role_contract(contract: dict[str, Any], registry: dict[str, Any] | None) -> dict[str, Any]:
assigned_agent = contract.get("assigned_agent") if isinstance(contract.get("assigned_agent"), dict) else {}
role = str(assigned_agent.get("role") or "WORKER").strip().upper() or "WORKER"
diff --git a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py
index 9888504..dbf16ea 100644
--- a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py
+++ b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py
@@ -1,10 +1,11 @@
from __future__ import annotations
from collections.abc import Callable
+import json
from pathlib import Path
from typing import Any
-from cortexpilot_orch.contract.compiler import build_role_binding_summary
+from cortexpilot_orch.contract.compiler import build_prompt_artifact, build_role_binding_summary
from cortexpilot_orch.store.run_store import RunStore
@@ -208,5 +209,21 @@ def persist_contract_state(
)
store.write_task_contract(run_id, task_id, contract)
store.write_active_contract(run_id, contract)
+ prompt_artifact = build_prompt_artifact(contract, run_id=run_id, task_id=task_id)
+ prompt_artifact_path = store.write_artifact(
+ run_id,
+ "prompt_artifact.json",
+ json.dumps(prompt_artifact, ensure_ascii=False, indent=2),
+ )
+ store.append_event(
+ run_id,
+ {
+ "level": "INFO",
+ "event": "PROMPT_ARTIFACT_WRITTEN",
+ "run_id": run_id,
+ "task_id": task_id,
+ "meta": {"path": str(prompt_artifact_path.relative_to(store.run_dir(run_id)))},
+ },
+ )
if ensure_evidence_bundle_fn is not None and failure_reason:
ensure_evidence_bundle_fn(store, run_id, contract, failure_reason)
diff --git a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py
index 27a92d8..faac59d 100644
--- a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py
+++ b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py
@@ -155,3 +155,11 @@ def test_persist_contract_state_writes_role_binding_summary_to_manifest(tmp_path
written = json.loads((store._runs_root / run_id / "manifest.json").read_text(encoding="utf-8"))
assert written["role_binding_summary"] == build_role_binding_summary(contract)
+ prompt_artifact = json.loads(
+ (store._runs_root / run_id / "artifacts" / "prompt_artifact.json").read_text(encoding="utf-8")
+ )
+ assert prompt_artifact["artifact_type"] == "prompt_artifact"
+ assert prompt_artifact["execution_authority"] == "task_contract"
+ assert prompt_artifact["run_id"] == run_id
+ assert prompt_artifact["task_id"] == "task-role-binding-summary"
+ assert prompt_artifact["role_binding_summary"] == build_role_binding_summary(contract)
diff --git a/docs/architecture/runtime-topology.md b/docs/architecture/runtime-topology.md
index 8dba6fc..c1c719e 100644
--- a/docs/architecture/runtime-topology.md
+++ b/docs/architecture/runtime-topology.md
@@ -72,6 +72,9 @@ flowchart LR
`workflow_case_read_model` directly for operator inspection, but those UI
cards remain read-only mirrors below `task_contract`.
- Runtime artifacts (`manifest`, `events.jsonl`, reports) are generated per run.
+- Runs may now also persist `artifacts/prompt_artifact.json`, a contract-derived
+  snapshot of the prompt, bundle, and runtime-binding refs for that run. It is a
+  read-only audit artifact, not a second source of execution authority; an
+  illustrative shape follows this list.
- Run detail views may now include derived decision packs such as
`incident_pack.json`, while approval queues synthesize `approval_pack`
summaries from run events plus manifest metadata. These are derived operator
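+
+For operators inspecting a run bundle, a minimal illustrative
+`prompt_artifact.json` shape (keys taken from `build_prompt_artifact` in
+`cortexpilot_orch/contract/compiler.py`; all values are placeholders, and the
+exact ref shapes depend on `_normalize_optional_ref`):
+
+```json
+{
+  "artifact_type": "prompt_artifact",
+  "version": "v1",
+  "source": "contract-derived",
+  "execution_authority": "task_contract",
+  "run_id": "run-0001",
+  "task_id": "task-0001",
+  "assigned_agent": {"role": "WORKER", "agent_id": "worker-1"},
+  "purpose": "",
+  "system_prompt_ref": null,
+  "skills_bundle_ref": null,
+  "mcp_bundle_ref": null,
+  "runtime_binding": {"runner": null, "provider": null, "model": null},
+  "role_binding_summary": {}
+}
+```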
From 721f9081de16b9c880c35d7d01e018b4a1b69513 Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:35:23 -0700
Subject: [PATCH 4/9] fix: use nanos for runtime run ordering
---
.../services/control_plane_read_service.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
index 74c00db..f89e4bd 100644
--- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
+++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
@@ -178,17 +178,17 @@ def _last_event_ts(run_id: str) -> str:
return value
return ""
- def _run_sort_ts(run_dir: Path, manifest_record: dict[str, Any]) -> float:
+ def _run_sort_ts(run_dir: Path, manifest_record: dict[str, Any]) -> int:
manifest_path = run_dir / "manifest.json"
if manifest_path.exists():
- return manifest_path.stat().st_mtime
+ return manifest_path.stat().st_mtime_ns
created_at = _as_text(manifest_record.get("created_at"))
if created_at:
try:
- return _parse_iso_ts(created_at).timestamp()
+ return int(_parse_iso_ts(created_at).timestamp() * 1_000_000_000)
except Exception:
pass
- return run_dir.stat().st_mtime
+ return run_dir.stat().st_mtime_ns
def _list_runs_runtime() -> list[dict[str, Any]]:
results: list[dict[str, Any]] = []
From 05d0f3cbc561a9258ef61aea817ff1351635f459 Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:41:01 -0700
Subject: [PATCH 5/9] feat: persist planning artifacts for runs
---
.../api/main_pm_intake_helpers.py | 58 ++++++++-
.../test_main_pm_intake_helpers_branches.py | 110 ++++++++++++++++++
2 files changed, 167 insertions(+), 1 deletion(-)
diff --git a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py
index 2d8c4ad..a670d4b 100644
--- a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py
+++ b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py
@@ -17,8 +17,9 @@
from cortexpilot_orch.config import load_config
from cortexpilot_orch.contract.compiler import build_role_binding_summary, sync_role_contract
from cortexpilot_orch.observability.logger import log_event
-from cortexpilot_orch.planning.intake import IntakeService
+from cortexpilot_orch.planning.intake import IntakeService, _build_wave_plan, _build_worker_prompt_contracts
from cortexpilot_orch.store.intake_store import IntakeStore
+from cortexpilot_orch.store.run_store import RunStore
_TRUTHY_VALUES = {"1", "true", "yes", "y", "on"}
@@ -118,6 +119,55 @@ def _strip_intake_only_contract_fields(contract: dict[str, Any]) -> dict[str, An
return sanitized
+def _safe_read_intake_store_payload(store: object, method_name: str, intake_id: str) -> dict[str, Any]:
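+    """Best-effort intake-store read; returns {} on missing reader, error, or non-dict."""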
+ reader = getattr(store, method_name, None)
+ if not callable(reader):
+ return {}
+ try:
+ payload = reader(intake_id)
+ except Exception:
+ return {}
+ return payload if isinstance(payload, dict) else {}
+
+
+def _persist_planning_artifacts_for_run(
+ *,
+ intake_id: str,
+ run_id: str,
+ runs_root: Path,
+) -> list[str]:
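+    """Persist intake planning artifacts into the run bundle; return filenames written."""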
+ intake_store = IntakeStore()
+ intake_payload = _safe_read_intake_store_payload(intake_store, "read_intake", intake_id)
+ response_payload = _safe_read_intake_store_payload(intake_store, "read_response", intake_id)
+ plan_bundle = response_payload.get("plan_bundle") if isinstance(response_payload.get("plan_bundle"), dict) else None
+ if not intake_payload or not isinstance(plan_bundle, dict):
+ return []
+
+ run_store = RunStore(runs_root=runs_root)
+ artifacts_to_write: list[tuple[str, Any]] = [
+ ("planning_wave_plan.json", _build_wave_plan(plan_bundle)),
+ ("planning_worker_prompt_contracts.json", _build_worker_prompt_contracts(plan_bundle, intake_payload)),
+ ]
+ written: list[str] = []
+ for filename, payload in artifacts_to_write:
+ if payload in ({}, [], None):
+ continue
+ run_store.write_artifact(run_id, filename, json.dumps(payload, ensure_ascii=False, indent=2))
+ written.append(filename)
+
+ if written:
+ run_store.append_event(
+ run_id,
+ {
+ "level": "INFO",
+ "event": "PLANNING_ARTIFACTS_WRITTEN",
+ "run_id": run_id,
+ "meta": {"intake_id": intake_id, "artifacts": written},
+ },
+ )
+ return written
+
+
def configure_pm_session_aggregation(
*,
runs_root_fn: Callable[[], Path],
@@ -597,10 +647,16 @@ def _execute_in_background() -> None:
)
IntakeStore().append_event(intake_id, {"event": "INTAKE_RUN", "run_id": run_id})
+ planning_artifacts = _persist_planning_artifacts_for_run(
+ intake_id=intake_id,
+ run_id=run_id,
+ runs_root=runs_root,
+ )
return {
"ok": True,
"run_id": run_id,
"contract_path": str(contract_path),
"strict_acceptance": bool(runtime_options.get("strict_acceptance", False)),
"role_binding_summary": build_role_binding_summary(contract),
+ "planning_artifacts": planning_artifacts,
}
diff --git a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py
index 3f9ee3f..2d3999a 100644
--- a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py
+++ b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py
@@ -463,6 +463,116 @@ def execute_task(contract_path: Path, mock_mode: bool = False) -> str:
assert observed_contract["runtime_options"]["strict_acceptance"] is True
+def test_run_intake_persists_planning_artifacts_into_run_bundle(monkeypatch, tmp_path: Path) -> None:
+ runs_root = tmp_path / "runs"
+ runtime_contract_root = tmp_path / ".runtime-cache" / "cortexpilot" / "contracts"
+ intake_payload = {
+ "objective": "Ship one planning artifact bridge",
+ "constraints": ["truthful-public-surface"],
+ "search_queries": ["command tower planning artifact"],
+ }
+ response_payload = {
+ "plan_bundle": {
+ "bundle_id": "bundle-1",
+ "objective": "Ship one planning artifact bridge",
+ "owner_agent": {"role": "PM", "agent_id": "pm-1"},
+ "plans": [
+ {
+ "plan_id": "worker-1",
+ "assigned_agent": {"role": "WORKER", "agent_id": "worker-1"},
+ "spec": "Persist the planning artifact into the run bundle.",
+ "allowed_paths": ["apps/orchestrator"],
+ "acceptance_tests": [{"name": "pytest", "cmd": "python3 -m pytest -q", "must_pass": True}],
+ "mcp_tool_set": ["codex"],
+ "required_outputs": [{"name": "task_result.json", "type": "report"}],
+ }
+ ],
+ }
+ }
+ intake_events: list[tuple[str, dict[str, object]]] = []
+
+ class _Store:
+ def append_event(self, intake_id: str, payload: dict[str, object]) -> None:
+ intake_events.append((intake_id, payload))
+
+ def read_intake(self, intake_id: str) -> dict[str, object]:
+ assert intake_id == "persist"
+ return intake_payload
+
+ def read_response(self, intake_id: str) -> dict[str, object]:
+ assert intake_id == "persist"
+ return response_payload
+
+ monkeypatch.setattr(helpers, "IntakeStore", lambda: _Store())
+ monkeypatch.setattr(
+ helpers,
+ "load_config",
+ lambda: types.SimpleNamespace(
+ repo_root=tmp_path,
+ runs_root=runs_root,
+ contract_root=tmp_path / "contracts",
+ runtime_contract_root=runtime_contract_root,
+ ),
+ )
+
+ class _BuildOK:
+ def build_contract(self, intake_id: str) -> dict[str, object]:
+ assert intake_id == "persist"
+ return {
+ "task_id": "task-persist",
+ "owner_agent": {"role": "PM", "agent_id": "pm-1"},
+ "assigned_agent": {"role": "WORKER", "agent_id": "worker-1"},
+ "inputs": {"spec": "repro", "artifacts": []},
+ "required_outputs": [{"name": "task_result.json", "type": "json", "acceptance": "ok"}],
+ "allowed_paths": ["apps/orchestrator"],
+ "forbidden_actions": [],
+ "acceptance_tests": [{"name": "pytest", "cmd": "python3 -m pytest -q", "must_pass": True}],
+ "tool_permissions": {
+ "filesystem": "workspace-write",
+ "shell": "on-request",
+ "network": "deny",
+ "mcp_tools": ["codex"],
+ },
+ "mcp_tool_set": ["codex"],
+ "timeout_retry": {"timeout_sec": 60, "max_retries": 0, "retry_backoff_sec": 0},
+ "rollback": {"strategy": "git_reset_hard", "baseline_ref": "HEAD"},
+ "evidence_links": [],
+ "log_refs": {"run_id": "", "paths": {}},
+ }
+
+ class _Orchestrator:
+ @staticmethod
+ def execute_task(contract_path: Path, mock_mode: bool = False) -> str:
+ del mock_mode
+ payload = json.loads(contract_path.read_text(encoding="utf-8"))
+ store = RunStore(runs_root=runs_root)
+ run_id = store.create_run(str(payload.get("task_id") or "task"))
+ store.write_manifest(run_id, {"run_id": run_id, "task_id": payload.get("task_id"), "status": "RUNNING", "repo": {}})
+ return run_id
+
+ result = helpers.run_intake(
+ "persist",
+ {"mock": True},
+ intake_service_cls=_BuildOK,
+ orchestration_service=_Orchestrator(),
+ error_detail_fn=lambda code: {"code": code},
+ current_request_id_fn=lambda: "req-persist",
+ )
+
+ run_id = result["run_id"]
+ wave_plan = json.loads((runs_root / run_id / "artifacts" / "planning_wave_plan.json").read_text(encoding="utf-8"))
+ worker_contracts = json.loads(
+ (runs_root / run_id / "artifacts" / "planning_worker_prompt_contracts.json").read_text(encoding="utf-8")
+ )
+
+ assert result["planning_artifacts"] == ["planning_wave_plan.json", "planning_worker_prompt_contracts.json"]
+ assert wave_plan["wave_id"] == "bundle-1"
+ assert wave_plan["objective"] == "Ship one planning artifact bridge"
+ assert worker_contracts[0]["prompt_contract_id"] == "worker-1"
+ assert worker_contracts[0]["continuation_policy"]["on_blocked"] == "spawn_independent_temporary_unblock_task"
+ assert intake_events[-1] == ("persist", {"event": "INTAKE_RUN", "run_id": run_id})
+
+
def test_build_role_binding_summary_marks_skills_and_mcp_registry_refs_as_registry_backed() -> None:
summary = build_role_binding_summary(
{
From cdfc07f880cdc1c9618bb068da127e6fc0ee596b Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:43:08 -0700
Subject: [PATCH 6/9] feat: persist planning artifacts for runs
From b2cb869d65ad989d8c55de9128f2077823c62f5b Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:50:47 -0700
Subject: [PATCH 7/9] feat: register planning artifacts in run manifests
---
.../api/main_pm_intake_helpers.py | 41 ++++++++++++++++++-
.../scheduler/scheduler_bridge_contract.py | 34 +++++++++++++++
.../test_main_pm_intake_helpers_branches.py | 4 ++
.../tests/test_scheduler_bridge_runtime.py | 2 +
4 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py
index a670d4b..53a01c1 100644
--- a/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py
+++ b/apps/orchestrator/src/cortexpilot_orch/api/main_pm_intake_helpers.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import hashlib
import json
import threading
from datetime import datetime, timezone
@@ -119,6 +120,29 @@ def _strip_intake_only_contract_fields(contract: dict[str, Any]) -> dict[str, An
return sanitized
+def _artifact_ref_for_path(path: Path, *, rel_path: str, name: str, media_type: str = "application/json") -> dict[str, Any]:
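+    """Build a manifest artifact ref: name, relative path, sha256 digest, media type, size."""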
+ payload = path.read_bytes()
+ return {
+ "name": name,
+ "path": rel_path,
+ "sha256": hashlib.sha256(payload).hexdigest(),
+ "media_type": media_type,
+ "size_bytes": len(payload),
+ }
+
+
+def _append_manifest_artifact(manifest: dict[str, Any], ref: dict[str, Any]) -> None:
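+    """Append ref to manifest["artifacts"], deduplicating on the (name, path) pair."""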
+ artifacts = manifest.get("artifacts") if isinstance(manifest.get("artifacts"), list) else []
+ key = (str(ref.get("name") or ""), str(ref.get("path") or ""))
+ for item in artifacts:
+ if not isinstance(item, dict):
+ continue
+ if (str(item.get("name") or ""), str(item.get("path") or "")) == key:
+ return
+ artifacts.append(ref)
+ manifest["artifacts"] = artifacts
+
+
def _safe_read_intake_store_payload(store: object, method_name: str, intake_id: str) -> dict[str, Any]:
reader = getattr(store, method_name, None)
if not callable(reader):
@@ -144,18 +168,33 @@ def _persist_planning_artifacts_for_run(
return []
run_store = RunStore(runs_root=runs_root)
+ run_dir = run_store.run_dir(run_id)
artifacts_to_write: list[tuple[str, Any]] = [
("planning_wave_plan.json", _build_wave_plan(plan_bundle)),
("planning_worker_prompt_contracts.json", _build_worker_prompt_contracts(plan_bundle, intake_payload)),
]
written: list[str] = []
+ artifact_refs: list[dict[str, Any]] = []
for filename, payload in artifacts_to_write:
if payload in ({}, [], None):
continue
- run_store.write_artifact(run_id, filename, json.dumps(payload, ensure_ascii=False, indent=2))
+ artifact_path = run_store.write_artifact(run_id, filename, json.dumps(payload, ensure_ascii=False, indent=2))
written.append(filename)
+ artifact_refs.append(
+ _artifact_ref_for_path(
+ artifact_path,
+ rel_path=f"artifacts/{filename}",
+ name=filename.removesuffix(".json"),
+ )
+ )
if written:
+ manifest_path = run_dir / "manifest.json"
+ manifest = _read_json_file(manifest_path)
+ if manifest:
+ for ref in artifact_refs:
+ _append_manifest_artifact(manifest, ref)
+ manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
run_store.append_event(
run_id,
{
diff --git a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py
index dbf16ea..6d16354 100644
--- a/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py
+++ b/apps/orchestrator/src/cortexpilot_orch/scheduler/scheduler_bridge_contract.py
@@ -1,6 +1,7 @@
from __future__ import annotations
from collections.abc import Callable
+import hashlib
import json
from pathlib import Path
from typing import Any
@@ -9,6 +10,29 @@
from cortexpilot_orch.store.run_store import RunStore
+def _artifact_ref_for_path(path: Path, *, rel_path: str, name: str, media_type: str = "application/json") -> dict[str, Any]:
+ payload = path.read_bytes()
+ return {
+ "name": name,
+ "path": rel_path,
+ "sha256": hashlib.sha256(payload).hexdigest(),
+ "media_type": media_type,
+ "size_bytes": len(payload),
+ }
+
+
+def _append_manifest_artifact(manifest: dict[str, Any], ref: dict[str, Any]) -> None:
+ artifacts = manifest.get("artifacts") if isinstance(manifest.get("artifacts"), list) else []
+ key = (str(ref.get("name") or ""), str(ref.get("path") or ""))
+ for item in artifacts:
+ if not isinstance(item, dict):
+ continue
+ if (str(item.get("name") or ""), str(item.get("path") or "")) == key:
+ return
+ artifacts.append(ref)
+ manifest["artifacts"] = artifacts
+
+
class ContractStateWriter:
def __init__(
self,
@@ -215,6 +239,16 @@ def persist_contract_state(
"prompt_artifact.json",
json.dumps(prompt_artifact, ensure_ascii=False, indent=2),
)
+ if manifest is not None:
+ _append_manifest_artifact(
+ manifest,
+ _artifact_ref_for_path(
+ prompt_artifact_path,
+ rel_path="artifacts/prompt_artifact.json",
+ name="prompt_artifact",
+ ),
+ )
+ write_manifest_fn(store, run_id, manifest)
store.append_event(
run_id,
{
diff --git a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py
index 2d3999a..6499250 100644
--- a/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py
+++ b/apps/orchestrator/tests/test_main_pm_intake_helpers_branches.py
@@ -564,12 +564,16 @@ def execute_task(contract_path: Path, mock_mode: bool = False) -> str:
worker_contracts = json.loads(
(runs_root / run_id / "artifacts" / "planning_worker_prompt_contracts.json").read_text(encoding="utf-8")
)
+ manifest = json.loads((runs_root / run_id / "manifest.json").read_text(encoding="utf-8"))
assert result["planning_artifacts"] == ["planning_wave_plan.json", "planning_worker_prompt_contracts.json"]
assert wave_plan["wave_id"] == "bundle-1"
assert wave_plan["objective"] == "Ship one planning artifact bridge"
assert worker_contracts[0]["prompt_contract_id"] == "worker-1"
assert worker_contracts[0]["continuation_policy"]["on_blocked"] == "spawn_independent_temporary_unblock_task"
+ artifact_names = [item["name"] for item in manifest["artifacts"]]
+ assert "planning_wave_plan" in artifact_names
+ assert "planning_worker_prompt_contracts" in artifact_names
assert intake_events[-1] == ("persist", {"event": "INTAKE_RUN", "run_id": run_id})
diff --git a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py
index faac59d..0fe1c0f 100644
--- a/apps/orchestrator/tests/test_scheduler_bridge_runtime.py
+++ b/apps/orchestrator/tests/test_scheduler_bridge_runtime.py
@@ -163,3 +163,5 @@ def test_persist_contract_state_writes_role_binding_summary_to_manifest(tmp_path
assert prompt_artifact["run_id"] == run_id
assert prompt_artifact["task_id"] == "task-role-binding-summary"
assert prompt_artifact["role_binding_summary"] == build_role_binding_summary(contract)
+ artifact_names = [item["name"] for item in written["artifacts"]]
+ assert "prompt_artifact" in artifact_names
From 700b6bf83c96c2463c8b6497d30ba1d3048316d4 Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:59:42 -0700
Subject: [PATCH 8/9] fix: stabilize hosted run ordering
---
.../cortexpilot_orch/services/control_plane_read_service.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
index f89e4bd..16ef84b 100644
--- a/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
+++ b/apps/orchestrator/src/cortexpilot_orch/services/control_plane_read_service.py
@@ -202,7 +202,11 @@ def _list_runs_runtime() -> list[dict[str, Any]]:
continue
run_dirs.append((run_dir, manifest_record, _run_sort_ts(run_dir, manifest_record)))
- for run_dir, manifest_record, _sort_ts in sorted(run_dirs, key=lambda item: item[2], reverse=True):
+ for run_dir, manifest_record, _sort_ts in sorted(
+ run_dirs,
+ key=lambda item: (item[2], item[0].name),
+ reverse=True,
+ ):
run_id = _as_text(manifest_record.get("run_id")) or run_dir.name
payload = dict(manifest_record)
payload["run_id"] = run_id
From 5de7f4d6a03d3796b8a7119127b4e20caa7b22ae Mon Sep 17 00:00:00 2001
From: "Yifeng[Terry] Yu" <125581657+xiaojiou176@users.noreply.github.com>
Date: Sun, 12 Apr 2026 15:18:16 -0700
Subject: [PATCH 9/9] fix: drop unused control-plane test import
---
apps/orchestrator/tests/test_control_plane_read_service.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/apps/orchestrator/tests/test_control_plane_read_service.py b/apps/orchestrator/tests/test_control_plane_read_service.py
index c075ebf..0ccc00f 100644
--- a/apps/orchestrator/tests/test_control_plane_read_service.py
+++ b/apps/orchestrator/tests/test_control_plane_read_service.py
@@ -2,7 +2,7 @@
import json
import sys
-from datetime import datetime, timezone
+from datetime import datetime
from pathlib import Path
from types import SimpleNamespace
from types import ModuleType