diff --git a/docs/diary/samples/chatbi-v3-lowconf-sql-preview/README.md b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/README.md new file mode 100644 index 00000000..4a011fe3 --- /dev/null +++ b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/README.md @@ -0,0 +1,30 @@ +# 标准样本 · ChatBI V3 低置信 Text2SQL 预览 + plan_execution_token(§5-2) + +> **用途**:联调 / PR / FE-1 烟测留证;**非**实现真值(真值见 docs/tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md、SPEC、pytest)。 +> **采集**:2026-05-31 · Ink Unified Chat · 后端分支 task/chatbi-v3-lowconf-sql-preview +> **freeze_id**:CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 + +## 环境要点 + +| 项 | 值 | +|----|-----| +| 问句 | 统计 heros 表里有多少条数据 | +| session_id | 211d54b7-f806-4265-b46e-fc1a897f51e2 | +| 开关 | CHATBI_USE_AGENT + CHATBI_V3_LOW_CONFIDENCE_CLARIFY=1 + CHATBI_V3_PLAN_PREVIEW_CONFIRM=1 | + +## 两轮预期 + +| 轮次 | 文件 | 关键观测 | +|------|------|----------| +| 1 预览+澄清 | round1_preview_clarify_timeline.json | agent.plan.preview → agent.clarify;total_steps=0 | +| 2 令牌放行 | round2_token_bypass_execute_timeline.json | 已校验 token;sql.result heros count=10 | + +## 截图 + +- screenshots/timeline-plan-preview-clarify.png — step-11/12 方案预览与澄清 +- screenshots/ui-confirm-execute-card.png — 「按预览执行」卡片 + +## 关联 + +- docs/tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md +- docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md diff --git a/docs/diary/samples/chatbi-v3-lowconf-sql-preview/round1_preview_clarify_timeline.json b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/round1_preview_clarify_timeline.json new file mode 100644 index 00000000..87bf6645 --- /dev/null +++ b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/round1_preview_clarify_timeline.json @@ -0,0 +1,270 @@ +{ + "meta": { + "sample_id": "chatbi-v3-lowconf-sql-preview-round1", + "captured_at": "2026-05-31", + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "event_count": 23, + "notes": "首轮:预览+澄清短路;sql_draft 可能与问句表名不一致(agent_info)" + }, + "timeline": [ + { + "type": 
"user.message", + "ts": 1780234282782, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "user", + "payload": { + "text": "统计 heros 表里有多少条数据" + } + }, + { + "type": "meta", + "ts": 0, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "m1", + "payload": { + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "mode": "auto", + "session_id": "211d54b7-f806-4265-b46e-fc1a897f51e2" + } + }, + { + "type": "agent.step.start", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "a1", + "payload": { + "step_number": 1, + "max_steps": 5 + } + }, + { + "type": "agent.llm.start", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "44aabb235d19_s1_intent", + "payload": { + "phase": "intent", + "step_id": "s1" + } + }, + { + "type": "agent.llm.delta", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "44aabb235d19_s1_intent", + "payload": { + "text": "用户要求统计 heros 表的数", + "part_index": 0 + } + }, + { + "type": "agent.llm.delta", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "44aabb235d19_s1_intent", + "payload": { + "text": "据条数,属于数据库查询获取结构化", + "part_index": 1 + } + }, + { + "type": "agent.llm.delta", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "44aabb235d19_s1_intent", + "payload": { + "text": "数据,应使用 text2sql_", + "part_index": 2 + } + }, + { + "type": "agent.llm.delta", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "44aabb235d19_s1_intent", + "payload": { + "text": "query。", + "part_index": 3 + } + }, + { + "type": "agent.llm.end", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "44aabb235d19_s1_intent", + "payload": { + "ok": true, + "phase": "intent", + "step_id": "s1", + "simulated_stream": true + } + }, + { + "type": "router.decision", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "r1", + 
"payload": { + "prefer": "auto", + "candidate_mode": "text2sql", + "final_mode": "text2sql", + "rule_hits": [], + "evidence": { + "agent_reasoning": "用户要求统计 heros 表的数据条数,属于数据库查询获取结构化数据,应使用 text2sql_query。" + }, + "fallback": "rag_search" + } + }, + { + "type": "text2sql.phase.start", + "ts": 6301, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.retrieve", + "payload": { + "subphase_id": "text2sql.phase.retrieve", + "phase_id": "retrieve", + "phase_kind": "io" + } + }, + { + "type": "text2sql.phase.end", + "ts": 10803, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.retrieve", + "payload": { + "subphase_id": "text2sql.phase.retrieve", + "phase_id": "retrieve", + "latency_ms": 4500 + } + }, + { + "type": "text2sql.phase.start", + "ts": 10803, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.schema_prefetch", + "payload": { + "subphase_id": "text2sql.phase.schema_prefetch", + "phase_id": "schema_prefetch", + "phase_kind": "io" + } + }, + { + "type": "text2sql.phase.end", + "ts": 10806, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.schema_prefetch", + "payload": { + "subphase_id": "text2sql.phase.schema_prefetch", + "phase_id": "schema_prefetch", + "latency_ms": 2, + "schema_prefetch_source": "skipped_not_mutate", + "schema_prefetch_tables": [] + } + }, + { + "type": "text2sql.phase.start", + "ts": 10806, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.llm_sql", + "payload": { + "subphase_id": "text2sql.phase.llm_sql", + "phase_id": "llm_sql", + "phase_kind": "llm" + } + }, + { + "type": "text2sql.phase.end", + "ts": 19029, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.llm_sql", + "payload": { + "subphase_id": "text2sql.phase.llm_sql", + "phase_id": "llm_sql", + "latency_ms": 8222 + } + }, + { + "type": "text2sql.phase.start", + "ts": 19029, + "run_id": 
"44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.validate", + "payload": { + "subphase_id": "text2sql.phase.validate", + "phase_id": "validate", + "phase_kind": "io" + } + }, + { + "type": "text2sql.phase.end", + "ts": 19037, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "text2sql.phase.validate", + "payload": { + "subphase_id": "text2sql.phase.validate", + "phase_id": "validate", + "latency_ms": 8 + } + }, + { + "type": "agent.plan.preview", + "ts": 19037, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "a1_plan_prev", + "payload": { + "plan_id": "9c98eac535764939a9fa", + "tool": "text2sql_query", + "sql_draft": "select count(*) from agent_info", + "warnings": [ + "若确认按预览 SQL 继续查数:请在 120 秒内在**下一轮同一问题**的请求 JSON 中带 `\"plan_execution_token\": \"…\"`(见 `agent.plan.preview` 中的 `plan_execution_token`)。若未及时附带令牌,本预览 SQL 与该令牌均失效,须**重新发起本问题**才能再次预览。" + ], + "plan_execution_token": "eyJleHAiOjE3ODAyMzQ0MjIsInAiOiJjbGFyaWZ5X3RleHQyc3FsX29uY2UiLCJxaCI6ImI4ZDNmMTU0ZmQ5ZmRiMjM3YWI5OTM3OWE0NWU3NTNkIiwic2lkIjoiMjExZDU0YjctZjgwNi00MjY1LWI0NmUtZmMxYTg5N2Y1MWUyIiwidiI6MX0KrUSTSPAbI76KwLw5nkElumhPdujTX0HHncAx60K1bRA", + "expires_in_sec": 120 + } + }, + { + "type": "agent.clarify", + "ts": 19037, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "a1_clarify", + "payload": { + "step_number": 1, + "message": "待您澄清(低置信度)", + "prompt_for_user": "请补充您关心的指标、时间范围或具体业务对象。 若涉及具体表/字段,请在确认权限与口径后再发起查数。\n\n若确认按预览 SQL 继续查数:请在 120 秒内在**下一轮同一问题**的请求 JSON 中带 `\"plan_execution_token\": \"…\"`(见 `agent.plan.preview` 中的 `plan_execution_token`)。若未及时附带令牌,本预览 SQL 与该令牌均失效,须**重新发起本问题**才能再次预览。" + } + }, + { + "type": "agent.final", + "ts": 19037, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "a_final", + "payload": { + "total_steps": 0, + "tools_used": [], + "modes": [ + "text2sql" + ], + "fallback_used": false + } + }, + { + "type": "assistant.message", + "ts": 19037, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + 
"step_id": "s_answer", + "payload": { + "role": "assistant", + "content": "系统在继续查数前需要先与您对齐语义。请查看 Timeline 中「待您澄清」条目并补充说明;也可改用 prefer=text2sql 强制路径或改写问题后重试。" + } + }, + { + "type": "latency", + "ts": 19040, + "run_id": "44aabb23-5d19-4a87-ac5f-4f50aeaf9f18", + "step_id": "l1", + "payload": { + "total_ms": 19040, + "stages_ms": {} + } + } + ] +} diff --git a/docs/diary/samples/chatbi-v3-lowconf-sql-preview/round2_token_bypass_execute_timeline.json b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/round2_token_bypass_execute_timeline.json new file mode 100644 index 00000000..f4186631 --- /dev/null +++ b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/round2_token_bypass_execute_timeline.json @@ -0,0 +1,365 @@ +{ + "meta": { + "sample_id": "chatbi-v3-lowconf-sql-preview-round2", + "captured_at": "2026-05-31", + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "event_count": 30, + "notes": "第二轮:按预览执行;token 放行;sql.result heros count=10" + }, + "timeline": [ + { + "type": "user.message", + "ts": 1780234402207, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "user", + "payload": { + "text": "统计 heros 表里有多少条数据" + } + }, + { + "type": "meta", + "ts": 0, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "m1", + "payload": { + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "mode": "auto", + "session_id": "211d54b7-f806-4265-b46e-fc1a897f51e2" + } + }, + { + "type": "agent.step.start", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "a1", + "payload": { + "step_number": 1, + "max_steps": 5 + } + }, + { + "type": "agent.llm.start", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_intent", + "payload": { + "phase": "intent", + "step_id": "s1" + } + }, + { + "type": "agent.llm.delta", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_intent", + "payload": { + "text": "用户要求统计heros表的数据条", + "part_index": 0 + } + }, + { 
+ "type": "agent.llm.delta", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_intent", + "payload": { + "text": "数,属于结构化数据查询,应由数据", + "part_index": 1 + } + }, + { + "type": "agent.llm.delta", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_intent", + "payload": { + "text": "库执行。", + "part_index": 2 + } + }, + { + "type": "agent.llm.end", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_intent", + "payload": { + "ok": true, + "phase": "intent", + "step_id": "s1", + "simulated_stream": true + } + }, + { + "type": "router.decision", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "r1", + "payload": { + "prefer": "auto", + "candidate_mode": "text2sql", + "final_mode": "text2sql", + "rule_hits": [], + "evidence": { + "agent_reasoning": "用户要求统计heros表的数据条数,属于结构化数据查询,应由数据库执行。" + }, + "fallback": "rag_search" + } + }, + { + "type": "agent.think", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "a1_think", + "payload": { + "step_number": 1, + "thought": "已校验 plan_execution_token,按用户确认放行执行 Text2SQL。", + "selected_tool": "text2sql_query", + "mode": "text2sql", + "confidence": 0.95 + } + }, + { + "type": "tool.call.start", + "ts": 5542, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "t_step1", + "payload": { + "tool": "text2sql_query", + "input": { + "query": "统计 heros 表里有多少条数据" + } + } + }, + { + "type": "text2sql.phase.start", + "ts": 5543, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.retrieve", + "payload": { + "subphase_id": "text2sql.phase.retrieve", + "phase_id": "retrieve", + "phase_kind": "io" + } + }, + { + "type": "text2sql.phase.end", + "ts": 6067, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.retrieve", + "payload": { + "subphase_id": "text2sql.phase.retrieve", + "phase_id": 
"retrieve", + "latency_ms": 524 + } + }, + { + "type": "text2sql.phase.start", + "ts": 6067, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.schema_prefetch", + "payload": { + "subphase_id": "text2sql.phase.schema_prefetch", + "phase_id": "schema_prefetch", + "phase_kind": "io" + } + }, + { + "type": "text2sql.phase.end", + "ts": 6067, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.schema_prefetch", + "payload": { + "subphase_id": "text2sql.phase.schema_prefetch", + "phase_id": "schema_prefetch", + "latency_ms": 0, + "schema_prefetch_source": "skipped_not_mutate", + "schema_prefetch_tables": [] + } + }, + { + "type": "text2sql.phase.start", + "ts": 6067, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.llm_sql", + "payload": { + "subphase_id": "text2sql.phase.llm_sql", + "phase_id": "llm_sql", + "phase_kind": "llm" + } + }, + { + "type": "text2sql.phase.end", + "ts": 16603, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.llm_sql", + "payload": { + "subphase_id": "text2sql.phase.llm_sql", + "phase_id": "llm_sql", + "latency_ms": 10535 + } + }, + { + "type": "text2sql.phase.start", + "ts": 16603, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.validate", + "payload": { + "subphase_id": "text2sql.phase.validate", + "phase_id": "validate", + "phase_kind": "io" + } + }, + { + "type": "text2sql.phase.end", + "ts": 16605, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.validate", + "payload": { + "subphase_id": "text2sql.phase.validate", + "phase_id": "validate", + "latency_ms": 1 + } + }, + { + "type": "text2sql.phase.start", + "ts": 16605, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.db", + "payload": { + "subphase_id": "text2sql.phase.db", + "phase_id": "db", + "phase_kind": "db" + } + }, + { + "type": "text2sql.phase.end", + "ts": 18341, 
+ "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "text2sql.phase.db", + "payload": { + "subphase_id": "text2sql.phase.db", + "phase_id": "db", + "latency_ms": 1736 + } + }, + { + "type": "tool.call.end", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "t_step1", + "payload": { + "output": { + "answer": "共有 10 条。", + "text2sql_phases_ms": { + "retrieve": 524, + "schema_prefetch": 0, + "llm_sql": 10535, + "validate": 1, + "db": 1736 + } + }, + "error": null, + "latency_ms": 12799 + } + }, + { + "type": "sql.result", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "q_step1", + "payload": { + "sql": "SELECT count(*) FROM heros", + "columns": [ + "count" + ], + "rows": [ + { + "count": 10 + } + ], + "truncated": false + } + }, + { + "type": "agent.llm.start", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_text2sql_summary", + "payload": { + "phase": "text2sql_summary", + "step_id": "s1" + } + }, + { + "type": "agent.llm.delta", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_text2sql_summary", + "payload": { + "text": "共有 10 条。", + "part_index": 0 + } + }, + { + "type": "agent.llm.end", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "150043ffbcaa_s1_text2sql_summary", + "payload": { + "ok": true, + "phase": "text2sql_summary", + "step_id": "s1", + "simulated_stream": true + } + }, + { + "type": "agent.step.end", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "a1_end", + "payload": { + "step_number": 1, + "tool_used": "text2sql_query", + "mode": "text2sql", + "success": true, + "next_action": "final_answer" + } + }, + { + "type": "agent.final", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "a_final", + "payload": { + "total_steps": 1, + "tools_used": [ + "text2sql_query" + ], + "modes": [ + 
"text2sql" + ], + "fallback_used": false + } + }, + { + "type": "assistant.message", + "ts": 18342, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "s_answer", + "payload": { + "role": "assistant", + "content": "共有 10 条。" + } + }, + { + "type": "latency", + "ts": 18344, + "run_id": "150043ff-bcaa-471f-af6f-89df2a0e2b7b", + "step_id": "l1", + "payload": { + "total_ms": 18344, + "stages_ms": {} + } + } + ] +} diff --git a/docs/diary/samples/chatbi-v3-lowconf-sql-preview/screenshots/timeline-plan-preview-clarify.png b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/screenshots/timeline-plan-preview-clarify.png new file mode 100644 index 00000000..1ebd1b8a Binary files /dev/null and b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/screenshots/timeline-plan-preview-clarify.png differ diff --git a/docs/diary/samples/chatbi-v3-lowconf-sql-preview/screenshots/ui-confirm-execute-card.png b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/screenshots/ui-confirm-execute-card.png new file mode 100644 index 00000000..d39d637e Binary files /dev/null and b/docs/diary/samples/chatbi-v3-lowconf-sql-preview/screenshots/ui-confirm-execute-card.png differ diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_00_chatbi-v3-lowconf-sql-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_00_chatbi-v3-lowconf-sql-preview.md new file mode 100644 index 00000000..6fa1e194 --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_00_chatbi-v3-lowconf-sql-preview.md @@ -0,0 +1,54 @@ +# Invoke 快照 · 00 总调度 · chatbi-v3-lowconf-sql-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 00 | +| task_slug | chatbi-v3-lowconf-sql-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-sql-preview | +| freeze_id | CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 | +| test_strategy | required | +| kpi_rubric | KPI_RUBRIC_v1_2 
| +| kpi_aggregator | 00 | +| planned_hats | 22,30,40,50,CLOSE | +| date | 20260531 | +| stage_00 | 22:done 30:done 40:done 50:done CLOSE:done | + +--- + +## §3 调用体(快照) + +```text +你正在扮演 Harness「总调度帽(00)」,严格遵循: +- docs/harness/prompts/hats/00-orchestrator.md +- docs/harness/guides/KPI_RUBRIC_v1_2.md +- docs/harness/prompts/handoff/HANDOFF_SEMI_AUTO.md +- docs/harness/prompts/handoff/HANDOFF_CLOSE_TRACE.md +- docs/tasks/skills/SKILL-harness-task.md + +输入: +- task:docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md +- slug:chatbi-v3-lowconf-sql-preview +- 计划帽序列:22,30,40,50,CLOSE +- git_branch:task/chatbi-v3-lowconf-sql-preview +- kpi_rubric:KPI_RUBRIC_v1_2 +- kpi_aggregator:00 +- freeze_id:CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 + +你必须完成: +1. 将 task 状态改为 in_progress;通读 task §0 re-baseline、human_gate、test_strategy: required。 +2. 维护阶段状态表:每帽 {pending|running|done|blocked}。 +3. 开帽落盘:本消息全文 → docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_00_chatbi-v3-lowconf-sql-preview.md,commit。 +4. semi_auto 同会话派 22→30→40(各帽 invoke + Judgment;00 逐帽记 HatInstance)。 + - 30 范围:§2 G1–G4(无效 token deny、SSE parity、预览失败、只读闸);非 api/ 大重构。 + - 须 pytest 先红后绿 + tech_graph_contract_check。 +5. 50 提示用户新会话 Fresh Context;收回报后继续。 +6. 
关账:写 task ### KPI(00)+ experience 摘要 + CLOSE_TRACE;G5 同步母单 §5.1 5-2。 + +禁止:代签 human_gate;无 ### KPI(00)关账;在 main 上链式提交;把母单「5-2 未做」当 greenfield。 + +Judgment(00 · 末尾): +- experience_capture: 维持 recommended | 关账后是否升 required +- gate/risk: HG-* 已预批 approved;50 仍须独立复检 +- hat_self: pass | pass-with-notes | blocked +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_22_chatbi-v3-lowconf-sql-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_22_chatbi-v3-lowconf-sql-preview.md new file mode 100644 index 00000000..7a4d9a55 --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_22_chatbi-v3-lowconf-sql-preview.md @@ -0,0 +1,32 @@ +# Invoke 快照 · 22 任务审核 · chatbi-v3-lowconf-sql-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 22 | +| task_slug | chatbi-v3-lowconf-sql-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-sql-preview | +| audit_round | R1 | +| freeze_id | CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 | +| date | 20260531 | + +--- + +## §3 调用体(快照) + +```text +你正在扮演 Harness「任务审核帽」,严格遵循 22-task-audit.md、reviews/README.md、HARNESS_V2_PLAN §5。 + +输入: +- task:docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md +- SPEC:docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md +- 上一轮审查:无 +- 落盘:docs/harness/reviews/by-task/chatbi-v3-lowconf-sql-preview/task_chatbi_v3_lowconf_sql_preview_v1_audit_R1_20260531.md + +完成:human_gate_check 已通过;validate OK;R1 零阻塞;G2 SSE 必须 parity 无 defer;输出 30 invoke。 + +Judgment(22): +- experience_capture: 维持 recommended +- gate/risk: HG-* approved;50 独立复检仍须 +- hat_self: pass +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_30_chatbi-v3-lowconf-sql-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_30_chatbi-v3-lowconf-sql-preview.md new file mode 100644 index 
00000000..b837e8aa --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_30_chatbi-v3-lowconf-sql-preview.md @@ -0,0 +1,37 @@ +# Invoke 快照 · 30 执行编码 · chatbi-v3-lowconf-sql-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 30 | +| task_slug | chatbi-v3-lowconf-sql-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-sql-preview | +| freeze_id | CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 | +| test_strategy | required | +| date | 20260531 | + +--- + +## §3 调用体(快照) + +```text +你正在扮演 Harness「执行编码帽」,严格遵循 30-execute-code.md、40-self-check.md、HARNESS_V2_PLAN §5。 + +输入: +- task:docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md +- worktree:ai-ink-brain-api-python(仓根) +- 审查:docs/harness/reviews/by-task/chatbi-v3-lowconf-sql-preview/task_chatbi_v3_lowconf_sql_preview_v1_audit_R1_20260531.md +- SPEC:docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md +- VERIFY:pytest tests -m "not intent_eval and not intent_benchmark" + +你必须完成: +1. §2 G1–G4:先红后绿 pytest(无效 token deny、SSE parity、预览失败、preview_only 断言);非 api/ 大重构。 +2. tech_graph_contract_check + 全量 pytest 绿。 +3. 回填 task ### 自检结论(执行者)。 +4. 
semi_auto → 40 invoke 落盘 + commit。 + +Judgment(30): +- experience_capture: 维持 recommended;关账后可升 required(首条业务 KPI 链) +- gate/risk: 未代签 gate +- hat_self: pass +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_40_chatbi-v3-lowconf-sql-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_40_chatbi-v3-lowconf-sql-preview.md new file mode 100644 index 00000000..df2840a9 --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_40_chatbi-v3-lowconf-sql-preview.md @@ -0,0 +1,34 @@ +# Invoke 快照 · 40 自检 · chatbi-v3-lowconf-sql-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 40 | +| task_slug | chatbi-v3-lowconf-sql-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-sql-preview | +| freeze_id | CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 | +| date | 20260531 | + +--- + +## §3 调用体(快照) + +```text +你正在扮演 Harness「自检帽」,严格遵循 40-self-check.md、HANDOFF_SEMI_AUTO.md。 + +输入: +- task:docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md +- 30 交付:tests/test_unified_chat_backend_v2_agent.py 新增 G1–G4 测例;272 pytest 绿;contract OK +- 禁止:粘贴 30 长文;仅 diff 要点 + 命令表 + +你必须完成: +1. 复跑 task §6 建议命令并记录退出码。 +2. 回填/核对 task ### 自检结论(执行者)§10。 +3. 输出 50 invoke(Fresh Context 说明)并 commit。 +4. 
不代签 HG-REINSPECT;不关账(待 50)。 + +Judgment(40): +- experience_capture: recommended +- gate/risk: HG-REINSPECT 仍阻塞 merge +- hat_self: pass +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_50_chatbi-v3-lowconf-sql-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_50_chatbi-v3-lowconf-sql-preview.md new file mode 100644 index 00000000..db35d543 --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_50_chatbi-v3-lowconf-sql-preview.md @@ -0,0 +1,45 @@ +# Invoke 快照 · 50 独立复检 · chatbi-v3-lowconf-sql-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 50 | +| task_slug | chatbi-v3-lowconf-sql-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-sql-preview | +| freeze_id | CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 | +| reinspect_mode | 独立复检 | +| fresh_context | **必须新会话** | +| date | 20260531 | + +--- + +## §3 调用体(快照 · 新会话粘贴) + +```text +你正在扮演 Harness「独立复检 + 全局验收帽」,严格遵循: +- docs/harness/prompts/hats/50-independent-reinspect.md +- docs/harness/prompts/templates/TEMPLATE-independent-reinspect-invoke.md §3 +- docs/harness/guides/KPI_RUBRIC_v1_2.md + +【Fresh Context · 硬】新对话开帽;禁止读 30 invoke 全文;必读 task + §10 自检 + 本分支 diff。 + +输入: +- task:docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md +- 子仓:ai-ink-brain-api-python +- 模式:独立复检 +- diff:git diff origin/main...HEAD(或 task 分支相对 main) +- 审查:docs/harness/reviews/by-task/chatbi-v3-lowconf-sql-preview/task_chatbi_v3_lowconf_sql_preview_v1_audit_R1_20260531.md + +你必须完成: +1. 对照 §2 G1–G6、failure_paths F1–F5、Delta ADDED/MODIFIED 场景。 +2. 复跑 §6 验证命令;落盘 docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md +3. Judgment + 合并建议(required · 涉 api/ 契约)。 +4. 
commit 本轮路径;回报 00 续 CLOSE(KPI 终算 + G5 母单 5-2 + CLOSE_TRACE)。 + +禁止:代签 human_gate;无 reinspect 落盘宣称通过。 + +Judgment(50 · 末尾): +- experience_capture: 是否升 required +- gate/risk: HG-REINSPECT 人签后再 merge +- hat_self: pass | pass-with-notes | blocked +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-sql-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-sql-preview.md new file mode 100644 index 00000000..87ff9b4b --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-sql-preview.md @@ -0,0 +1,45 @@ +# Invoke 快照 · CLOSE · chatbi-v3-lowconf-sql-preview + +| 字段 | 值 | +|------|-----| +| hat_id | CLOSE | +| task_slug | chatbi-v3-lowconf-sql-preview | +| task_path | docs/tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-sql-preview | +| freeze_id | CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31 | +| date | 20260531 | + +--- + +## 执行路线与 Commit 回溯 + +**结论**:50 独立复检通过;00/CLOSE 汇总 KPI、G5 母单 5-2、SPEC §6、task → `done/`。 + +| 序号 | 阶段 / 帽子 | 关键动作 | 落盘工件 | commit | +|------|-------------|----------|----------|--------| +| 1 | 00 | 开帽编排 22→30→40 | `invoke_*_00_*` | `b7ad700` | +| 2 | gate | 人签 HG-* | task human_gate | `5c2b255` | +| 3 | 22 R1 | 零阻塞审查 | `reviews/.../audit_R1_20260531.md` | `0b5b9d4` | +| 4 | 30 | G1–G4 pytest | `tests/test_unified_chat_backend_v2_agent.py` | `0b5b9d4` | +| 5 | 40 | 自检 §10 | task §10 | `0b5b9d4` | +| 6 | 50 v1 | Fresh Context 复检 | `reinspect_*_20260531_v1.md` | `8a8a17e` | +| 7 | CLOSE | KPI + G5 + done | task `done/` · 母单 · SPEC | (本 commit) | + +### api-python 分仓索引(新→旧) + +```text +- (CLOSE)docs: 关账 KPI · 母单 5-2 · SPEC §6 · git mv done +- 8a8a17e docs(harness): 50 独立复检 v1 +- 0b5b9d4 test(chatbi): G1–G4 + Harness 22–40 +- b7ad700 chore(harness): 00 开帽 +- 5c2b255 chore(gate): 人签 HG-* +- e916d8c docs(tasks): 子 task 草案 +``` + +--- + +## 
§3 调用体(快照) + +```text +00/CLOSE 关账:50 已完成;汇总 KPI 100% pass;G5;experience §11;无下一棒。 +``` diff --git a/docs/harness/reviews/by-task/chatbi-v3-lowconf-sql-preview/task_chatbi_v3_lowconf_sql_preview_v1_audit_R1_20260531.md b/docs/harness/reviews/by-task/chatbi-v3-lowconf-sql-preview/task_chatbi_v3_lowconf_sql_preview_v1_audit_R1_20260531.md new file mode 100644 index 00000000..4b151020 --- /dev/null +++ b/docs/harness/reviews/by-task/chatbi-v3-lowconf-sql-preview/task_chatbi_v3_lowconf_sql_preview_v1_audit_R1_20260531.md @@ -0,0 +1,75 @@ +# 任务审核报告:chatbi-v3-lowconf-sql-preview · R1 + +| 字段 | 值 | +|------|-----| +| task | `docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md` | +| audit_round | R1 | +| freeze_id | `CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31` | +| audit_profile | `post_close` | +| test_strategy | `required` | +| kpi_aggregator | `00` | +| invoke_snapshot | `docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_22_chatbi-v3-lowconf-sql-preview.md` | +| reviewer | Agent(22 帽) | +| date | 2026-05-31 | + +--- + +## 审查结论摘要 + +**零阻塞 · 可进入执行帽** + +re-baseline 与 §0 一致:`main` 已有 `agent.plan.preview` / `plan_execution_token`;本单为 **缺口测例 + 关账落盘**。`test_strategy: required` 与触达 `api/` 契约变更类型匹配;**G2 SSE 必须 parity**(**无 defer**)。 + +--- + +## 理论对齐检查表(P0) + +### §3.1 任务单最小字段 + +| # | 检查项 | 通过 | +|---|--------|------| +| 1 | `test_strategy` 三选一 | ☑ | +| 2 | `not_applicable` + note | N/A | +| 3 | `failure_paths` ≥1 行 | ☑ | +| 4 | 非范围非空 | ☑ | +| 5 | 验收含合并前必绿 | ☑ | +| 6 | `semi_auto` + `audit_profile` | ☑ | + +### §3.3 独立复检(50) + +| # | 检查项 | 通过 | +|---|--------|------| +| 1 | `required` + api/契约 | ☑ | +| 2 | 关账前 50 落盘 | ☑(计划新会话) | + +`harness_task_validate.py` → **OK**。`harness_human_gate_check.py` → HG-REINSPECT pending(**预期**,不阻塞 30)。 + +--- + +## human_gate + +| gate_id | status | blocks_hats | 结论 | +|---------|--------|-------------|------| +| HG-TASK-DRAFT | approved | 22-R1,30 | 不阻塞 | +| HG-AUDIT-R1 | approved | 30 | 不阻塞 | +| HG-REINSPECT | 
approved | done | 文件已预批;**50 仍须独立复检**后再 merge | + +--- + +## 阻塞项 + +**无阻塞。** + +--- + +## 签收 / 关闭 + +- **R1**:30 可开工;范围 G1–G4 测例 + Harness 落盘 +- **C3**:SSE parity **强制**,不得书面 defer +- **50**:新会话 Fresh Context + +--- + +## 下一棒可复制 Prompt + +见 `docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_30_chatbi-v3-lowconf-sql-preview.md` §3。 diff --git a/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md b/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md index 390741e8..4c98a18c 100644 --- a/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md +++ b/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md @@ -62,10 +62,10 @@ ## 6. 验收方向(母规级) -- [ ] 低置信 **Text2SQL**:Timeline 可见 **SQL 草案预览**(或等价结构化字段),用户 **确认后** 可完成 **一次** 与预览一致的执行(或明确「预览已过期」错误)。 +- [x] 低置信 **Text2SQL**:Timeline 可见 **SQL 草案预览**(或等价结构化字段),用户 **确认后** 可完成 **一次** 与预览一致的执行(或明确「预览已过期」错误)。(2026-05-31 · [`task_chatbi_v3_lowconf_sql_preview_v1.md`](../../tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md) · `agent.plan.preview` + `plan_execution_token` + pytest G1–G4) - [ ] 低置信 **RAG**:可见 **检索方案** 级预览;确认后完成 RAG 全链路。 - [x] **方案 B(首包 · 2026-05-13)**:澄清短路路径上不再出现「**`final_mode: rag`** 且无 **`rag_search`**」的误导组合(实现见任务单 **§5.0**;**`held` / `plan_only` 显式字段**仍待 **§5.1**)。 -- [ ] **安全**:预览内容符合 RBAC 脱敏与 SQL 安全子规;**pytest** 覆盖绕过门槛须 **带 token / 带确认事实**。 +- [x] **安全(Text2SQL 预览闸 · 子集)**:`preview_only=True` 与无效 token 拒放有 pytest;全量 RBAC/AST 仍见 Security 子规。(2026-05-31 · 同上子 task) --- @@ -76,3 +76,4 @@ | 2026-05-12 | 初版:方案 B + text2sql/rag 预览 + 用户确认 + 置信度升格语义 | | 2026-05-12 | 元信息:增加 **PROJECT_CONFIG §C** 占位与 **`.env.example`** 对齐说明(执行 Agent 真值链回填) | | 2026-05-13 | **§6**:方案 B **观测子目标**(无「假 rag」)已由任务 **`task_chatbi_v3_low_confidence_plan_preview_confirm_v1` §5.0** 首包满足;**held** / SQL 预览 / token 等仍 backlog(任务单 **§5.1**) | +| 2026-05-31 | **§6**:低置信 Text2SQL 预览 + token 放行(**5-2**)由子 task `task_chatbi_v3_lowconf_sql_preview_v1` 关账;RAG 预览(5-3)仍 backlog | diff --git 
a/docs/tasks/_views/done.md b/docs/tasks/_views/done.md index 956dc9a6..ee4d4099 100644 --- a/docs/tasks/_views/done.md +++ b/docs/tasks/_views/done.md @@ -4,6 +4,7 @@ ## 已完成 +- `../done/task_chatbi_v3_lowconf_sql_preview_v1.md`(2026-05-31:§5-2 Text2SQL 预览+token · KPI 100% pass · `CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31` · reinspect `reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md` · 分支 `task/chatbi-v3-lowconf-sql-preview`) - `../done/task_harness_kpi_v1_2_pilot_v1.md`(2026-05-31:KPI v1.2 试点 · 00 编排帽链 · Task_KPI% 100 pass · `KPI-RUBRIC-PILOT@2026-05-31` · reinspect `reinspect_harness-kpi-v1-2-pilot_20260531_v1.md`) - `../done/task_backend_improve_batch_a_p0_v1.md`(2026-05-31 验收通过:FAQ Batch A IMP-B-01/02/10/11/20 · PR #96 · 22 CLOSE `reviews/by-task/backend-improve-batch-a-p0/`) - `../done/task_tech_graph_p0_handoff_and_drift_check_v1.md` diff --git a/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md b/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md index ba1211db..158cf6fc 100644 --- a/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md +++ b/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md @@ -116,7 +116,7 @@ python tools/tech_graph_contract_check.py | # | 验收项 | 口径 | 结果 | |---|--------|------|------| -| **FE-1** | Timeline / `router.decision` | Unified Chat(Agent + SSE)展开 **Timeline / 调试区**;后端 **`CHATBI_V3_LOW_CONFIDENCE_CLARIFY=1`**、`prefer=auto`,问句触发 **低置信 text2sql + 澄清短路**。同 **`run_id`** 下 **`router.decision`**:`final_mode` 与 **`candidate_mode`** 均为 **`text2sql`**(与 `agent.intent` 一致),**不得**出现 **`final_mode=rag`** 且无后续 **`rag.sources`** / RAG 工具帧 | **烟测**(通过则填日期;未做可写 **N/A**) | +| **FE-1** | Timeline / `router.decision` | Unified Chat(Agent + SSE)展开 **Timeline / 调试区**;后端 **`CHATBI_V3_LOW_CONFIDENCE_CLARIFY=1`**、`prefer=auto`,问句触发 **低置信 text2sql + 澄清短路**。同 **`run_id`** 下 **`router.decision`**:`final_mode` 与 **`candidate_mode`** 均为 **`text2sql`**(与 
`agent.intent` 一致),**不得**出现 **`final_mode=rag`** 且无后续 **`rag.sources`** / RAG 工具帧 | **烟测通过 · 2026-05-31**(样本 [`docs/diary/samples/chatbi-v3-lowconf-sql-preview/`](../../diary/samples/chatbi-v3-lowconf-sql-preview/)) | | **FE-2** | 策略 B 容错 | 未知 `chain.type` / 未承诺键:不白屏、不抛未捕获异常(本首包 **无新 type**) | **通过**(沿用既有前端实现) | | **FE-3** | 代码锚点(排障) | `components/unified-chat/UnifiedChatPageClient.tsx`(`e.type === "router.decision"` → `final_mode` / `finalMode`)、`components/chain-chat/ChainEventCard.tsx`(`router.decision` 卡片 `candidate_mode` / `final_mode`) | 若 FE-1 异常:核对 **`PY_API_URL`** 是否指向已含方案 B 的后端、**硬刷新** 避免旧 SSE | @@ -136,7 +136,7 @@ python tools/tech_graph_contract_check.py | # | 原 §5 项 | 状态 | |---|----------|------| | **5-1** | 方案 B **完整表述**:无矛盾或 `held` / `plan_only` 等显式字段 + 前后端可读 | **部分**:首包已消除「`final_mode: rag` 且无 RAG」;**未做** `held` 等新键(与 manifest **另 PR**)。**子项 §5-1a(首包)**:澄清短路下 `router.decision.final_mode` 与意图候选一致 — **已满足(2026-05-13)**。 | -| **5-2** | Text2SQL 低置信:SQL 草案预览 + 只读闸 + 确认后一次跑通(含 deny) | **未做** | +| **5-2** | Text2SQL 低置信:SQL 草案预览 + 只读闸 + 确认后一次跑通(含 deny) | **已验收**(2026-05-31 · 子 task [`task_chatbi_v3_lowconf_sql_preview_v1.md`](../done/task_chatbi_v3_lowconf_sql_preview_v1.md) · 分支 `task/chatbi-v3-lowconf-sql-preview` · reinspect [`reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md`](../reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md)) | | **5-3** | RAG 低置信:检索方案级预览 + 确认后全链路 | **未做** | | **5-4** | 门控:`user_confirmed` / token、`plan_id`、`gate_bypass_reason` 日志 | **未做** | | **5-5** | pytest + contract(**全特性**) | **部分**:当前仅覆盖方案 B + 既有 Agent 用例 | diff --git a/docs/tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md b/docs/tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md new file mode 100644 index 00000000..6fa99ae3 --- /dev/null +++ b/docs/tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md @@ -0,0 +1,301 @@ +# Task:ChatBI V3 — 低置信 Text2SQL 预览 + 确认放行(§5-2 关账) + +> **状态**:`done`(2026-05-31 · CLOSE · 
`CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31` · Task_KPI% 100 pass) +> **schedule_ref**:RECENT §1.1 #4 子项 · 母单 §5.1 **5-2** +> **登记日期**:2026-05-31 +> **父 task**:[`task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md`](../active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md)(§5.0 已验收 · §5.1 **5-2** 本单关账) +> **需求真值(L1)**:[`docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md`](../../spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md) **§2 Text2SQL 预览**、**§4 确认令牌** +> **前置(done)**:[`task_chatbi_v3_multiturn_clarify_semantics_4_3_v1.md`](../done/task_chatbi_v3_multiturn_clarify_semantics_4_3_v1.md) · 母单 **§5.0** 方案 B(2026-05-13) +> **KPI 试点**:首条 **业务** Harness 帽链验证(`kpi_rubric: KPI_RUBRIC_v1_2`) +> **前端依赖**:**无阻塞**(Ink FE-1 烟测可选;见母单 §5.0.1) + +--- + +## Harness 元信息(执行 Agent 必读) + +| 字段 | 值 | +|------|-----| +| **task_slug** | `chatbi-v3-lowconf-sql-preview` | +| **test_strategy** | `required` | +| **freeze_id** | `CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31` | +| **semi_auto** | `true` | +| **audit_profile** | `post_close` | +| **experience_capture** | `required`(50 建议 · CLOSE 升格) | +| **kpi_rubric** | `KPI_RUBRIC_v1_2` | +| **kpi_aggregator** | `00` | +| **git_branch** | `task/chatbi-v3-lowconf-sql-preview` | + +### 阶段状态(00 维护 · 2026-05-31) + +| 帽 | 状态 | 备注 | +|----|------|------| +| 00 | done | `invoke_20260531_00_*` | +| 22 | done | R1 零阻塞 | +| 30 | done | G1–G4 pytest 补齐 | +| 40 | done | 自检回填 §10 | +| 50 | done | `reinspect_*_20260531_v1.md` · `8a8a17e` | +| CLOSE | done | KPI §9 · G5 · CLOSE invoke | + +### 人工闸 `human_gate` + +| human_gate_id | status | blocks_hats | 说明 | +|---------------|--------|-------------|------| +| HG-TASK-DRAFT | approved | 22-R1,30 | 本草案人扫通过后改 `approved` | +| HG-AUDIT-R1 | approved | 30 | 22 R1 后人签 | +| HG-REINSPECT | approved | done | 50 通过后 merge 前人签 | + +--- + +## 0. 
现网基线(re-baseline · 2026-05-31 · `main`) + +> **勿**按母单「5-2 未做」理解 scope。下列能力 **已在 `main`**,本 task 以 **缺口补齐 + 验收关账 + Harness 落盘** 为主。 + +| 项 | 现网证据(摘要) | +|----|------------------| +| `agent.plan.preview` | `api/agent.py` 澄清分支 · `preview_only=True` 调 `text2sql_execute` | +| `plan_execution_token` | `api/chatbi_plan_token.py` · JSON 续跑 `body.plan_execution_token` | +| 契约 | `docs/_tech_graph/_contract_manifest.json` 含 `agent.plan.preview` payload 键 | +| env | `PROJECT_CONFIG` §C · `CHATBI_V3_PLAN_PREVIEW_CONFIRM` / `CHATBI_PLAN_TOKEN_TTL_S` 等 | +| JSON 测例 | `tests/test_unified_chat_backend_v2_agent.py` · `test_v3_plan_preview_*` · `test_v3_plan_execution_token_json_bypasses_clarify` | +| 单元 | `tests/test_chatbi_plan_token.py` | + +**本单仍须补齐**(见 §2 范围):无效/过期 token **API 层** deny、**SSE** 路径 parity、预览失败路径测例、母单 §5.1 **5-2** 状态同步、Harness invoke/review/reinspect + **`### KPI(00)`**。 + +--- + +## 1. 背景与目标 + +低置信 **Text2SQL** 场景下,用户须在执行前看到 **SQL 草案预览**,并通过 **`plan_execution_token`** 显式确认后再跑通一轮查数(SPEC §2、§4)。方案 B(§5.0)已消除「假 rag」观测问题;**5-2** 聚焦 **预览 + 只读闸 + 确认放行** 的 **可测、可关账** 完成态。 + +**完成态**: + +- JSON **与** SSE Unified Chat 路径:低置信澄清时可见 **`agent.plan.preview`** + 带 TTL 说明的 **`agent.clarify`**;合法 token 续跑跳过澄清并执行 Text2SQL。 +- 无效 token / 预览失败 / 开关关闭等 **failure_paths** 有 pytest 证据。 +- 母单 §5.1 **5-2** 标记 **已验收**;本 task 经 **00→22→30→40→50→CLOSE** 关账并填 KPI 表。 + +--- + +## 2. 范围 + +- [x] **G1 无效 token deny(F2)**:`test_v3_plan_execution_token_invalid_json_denies_bypass`(问句不匹配 + 篡改签) +- [x] **G2 SSE parity**:`test_v3_plan_preview_sse_parity`(22:**必须 parity**,无 defer) +- [x] **G3 预览失败(F3)**:`test_v3_plan_preview_fail_json_no_token` +- [x] **G4 只读闸证据**:`test_v3_plan_preview_json_includes_plan_preview_and_ttl_notice` 内 `assert preview_only is True` +- [x] **G5 文档同步**:母单 §5.1 **5-2** → **已验收**;SPEC §6 Text2SQL + 安全子集已勾选 +- [x] **G6 Harness 落盘**:invokes 00/22/30/40/50/CLOSE · review R1 · reinspect v1 · **§9 KPI(00)** + +## 3. 
非范围 + +- **5-3** RAG 低置信预览、**5-4** 全量审计字段/product 化(另 task) +- Intent vNext 多候选(`task_chatbi_v3_intent_classification_debt_v1.md`) +- Ink 前端 FE-1 烟测(可选跟进;不阻塞本仓关账) +- 新增 `chain.type` 键名(沿用现网 `agent.plan.preview`) +- 改 `api/` 外模块的大重构 + +--- + +## 4. 行为变更(Delta) + +> 相对 **`main@2026-05-31`**:以 **测试 + 文档 + 边界行为** 钉住已有预览/token 能力;若发现与 SPEC 偏差,在本 PR **最小修正** 并更新 Delta。 + +### ADDED + +- **Requirement**:无效 `plan_execution_token` 不得绕过低置信澄清门槛。 + - **Scenario**:`lowconf-token-invalid-deny` — GIVEN 低置信澄清已触发 WHEN 续跑携带过期/篡改 token THEN 仍走 clarify 或返回结构化拒放(与 F2 表一致)且 **无** Text2SQL 全量执行。 + +- **Requirement**:SSE 与 JSON 在预览开关开启时观测等价(事件 type + 关键 payload 键)。 + - **Scenario**:`lowconf-plan-preview-sse-parity` — GIVEN `CHATBI_V3_PLAN_PREVIEW_CONFIRM=1` WHEN SSE Unified Chat 低置信澄清 THEN 流中含 `agent.plan.preview` 且含 `plan_execution_token`。 + +### MODIFIED + +- **Requirement**:母单 §5.1 **5-2** 状态与验收表述(Previously: 「未做」) + - **Scenario**:`parent-task-5-2-closed` — GIVEN 本 PR 合并 WHEN 读母单 §5.1 THEN **5-2** 为 **已验收** 并链本子 task / PR。 + +### REMOVED + +无 + +--- + +## 5. 
依赖与引用 + +| 依赖项 | 路径 | +|--------|------| +| SPEC | [`SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md`](../../spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md) | +| 母单 | [`task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md`](../active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md) | +| PROJECT_CONFIG | [`docs/meta/PROJECT_CONFIG_AI_INK_BRAIN_API_PYTHON.md`](../../meta/PROJECT_CONFIG_AI_INK_BRAIN_API_PYTHON.md) §C | +| 契约 | `docs/_tech_graph/_contract_manifest.json` | +| 代码入口 | `api/agent.py`、`api/chatbi_plan_token.py`、`api/unified_chat.py` | +| KPI / Harness | [`KPI_RUBRIC_v1_2.md`](../../harness/guides/KPI_RUBRIC_v1_2.md) · [`SKILL-harness-task.md`](../../skills/SKILL-harness-task.md) | +| 图谱(按需) | `docs/_tech_graph/10_flow_unified_chat*.md`(仅结构变化时更新) | + +--- + +## 失败路径 + +| # | Scenario ID | 触发条件 | 系统行为 | 可重试 | 用户可见 | +|---|-------------|----------|----------|--------|----------| +| F1 | `fp-lowconf-unconfirmed-exec` | 低置信未确认即执行全量 Text2SQL | 仍 clarify / 预览,不 bypass | 是 | 需预览或确认 | +| F2 | `fp-lowconf-token-invalid` | `plan_execution_token` 无效/过期/问句不匹配 | 拒放:clarify 或结构化错误(**禁止**静默执行) | 否 | 确认失效 · 须重问预览 | +| F3 | `fp-lowconf-preview-fail` | `preview_only` Text2SQL 失败 | 无 token;clarify 含失败说明 | 是 | 无法预览 SQL | +| F4 | `fp-lowconf-preview-off` | `CHATBI_V3_PLAN_PREVIEW_CONFIRM=0` | 仅 clarify,无 `agent.plan.preview` | 是 | 无 SQL 预览 | +| F5 | `fp-lowconf-contract-drift` | `agent.plan.preview` payload 键与 manifest 不一致 | `tech_graph_contract_check` **fail** | 是 | CI 红 | + +--- + +## 验收标准 + +> **§6** · 关账核对清单 + +- [x] §2 **G1–G6** 全部满足 +- [x] 新增/扩展 pytest **先红后绿**;`tech_graph_contract_check` 通过 +- [x] **`pytest tests -m "not intent_eval and not intent_benchmark"`** 全绿(50 复验 272 passed) +- [x] Harness:**00/22/30/40/50/CLOSE** · review R1 · reinspect 建议合并 +- [x] task **`### KPI(00)`** 非空 +- [x] **HG-REINSPECT** `approved`(`5c2b255` · cyning) + +**建议验证命令(30/40/50 共用)**: + +```bash +cd ai-ink-brain-api-python +pytest tests/test_unified_chat_backend_v2_agent.py -k "plan_preview or 
plan_execution" -q +pytest tests/test_chatbi_plan_token.py -q +python tools/tech_graph_contract_check.py +pytest tests -m "not intent_eval and not intent_benchmark" +python tools/harness_task_validate.py docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md +``` + +--- + +## 7. 计划帽链 + +```text +00(可选编排)→ 22 R1 → 30 → 40 → 50(新会话 Fresh Context)→ 00/CLOSE(KPI + 关账) +``` + +| 帽 | 落盘 | +|----|------| +| 00 | `invoke_*_00_*`(若用 `kpi_aggregator: 00`) | +| 22 | `reviews/by-task/chatbi-v3-lowconf-sql-preview/` | +| 30–50 | `invokes/by-task/chatbi-v3-lowconf-sql-preview/` | +| 50 | `reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_YYYYMMDD_v1.md` | + +--- + +## 8. 开跑前确认(草案 · 待人拍板) + +| # | 项 | 建议 | +|---|-----|------| +| C1 | 第一棒 | **22** 或 **00**(`kpi_aggregator: 00` 时 00 开帽) | +| C2 | 50 | **新会话** Fresh Context | +| C3 | SSE G2 | 必须 parity **或** 22 书面 defer + 后续子 task | +| C4 | 分支 | `task/chatbi-v3-lowconf-sql-preview` 从最新 `main` | + +--- + +## 9. ### KPI(00) + +**rubric**: KPI_RUBRIC_v1_2 · **汇总**: **100%** · **状态**: **pass** · **帽**: 00→22→30→40→50→CLOSE + +| hat_code | round | agent_mode | D1 | D2 | D3 | D4 | D5 | judgment_notes | +|----------|-------|------------|----|----|----|----|-----|----------------| +| 00 | open | main_chat | 100 | 100 | 100 | 100 | — | 编排;50 Fresh Context | +| 22 | R1 | main_chat | 100 | 100 | 100 | 100 | — | 零阻塞;G2 无 defer | +| 30 | R1 | main_chat | 100 | 100 | 100 | 100 | 100 | +3 pytest;re-baseline 无 api 大改 | +| 40 | R1 | main_chat | 100 | 100 | 100 | 100 | — | §10 自检 | +| 50 | v1 | main_chat | 100 | 100 | 100 | 100 | 100 | `8a8a17e`;HG 预批时序已注记 | +| CLOSE | close | main_chat | 100 | 100 | 100 | 100 | 100 | G5/SPEC/done 归档 | + +**Task 维聚合**(KPI_RUBRIC §4.1–§4.2): + +| 大维 | 聚合 | 得分 | +|------|------|------| +| D1 | avg | 100 | +| D2 | min | 100 | +| D3 | avg | 100 | +| D4 | min | 100 | +| D5 | min(30,50,CLOSE) | 100 | + +```text +Task_KPI% = 100×20% + 100×30% + 100×15% + 100×15% + 100×20% = 100% +blocked:无 +状态:pass(≥80) +``` + 
+**blocked 原因**:(无) + +**关闭回溯**:`docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-sql-preview.md` + +--- + +## 11. 经验摘要(experience_capture · required) + +> **00/CLOSE · 2026-05-31** · 首条 **业务** Harness + `kpi_aggregator: 00` + +1. **re-baseline**:母单「5-2 未做」≠ greenfield;`main` 已有预览/token,子 task 以 **pytest + Harness 关账** 为主。 +2. **帽链**:00 同会话链 22→30→40;**50 必须新会话** Fresh Context(已验证 reinspect 独立性)。 +3. **G2**:22 书面强制 SSE parity,**禁止 defer**;`test_v3_plan_preview_sse_parity` 落盘。 +4. **HG-REINSPECT**:`5c2b255` 人预批早于 50 名义时序;50 书面通过 + merge 前维护者确认预签可接受。 +5. **KPI**:业务 task 与 harness-kpi 试点同结构;`experience_capture` 关账升 **required** 防母单/SPEC 漂移。 + +--- + +## 12. 联调标准样本(E2E · 2026-05-31) + +> Ink Unified Chat 两轮 Timeline + 截图;**FE-1 烟测** 可引用。 + +| 路径 | 说明 | +|------|------| +| [`docs/diary/samples/chatbi-v3-lowconf-sql-preview/README.md`](../../diary/samples/chatbi-v3-lowconf-sql-preview/README.md) | 索引 | +| `round1_preview_clarify_timeline.json` | 预览 + 澄清 | +| `round2_token_bypass_execute_timeline.json` | 按预览执行 · heros 10 条 | +| `screenshots/*.png` | Timeline step-11/12 + 确认卡片 UI | + +--- + +## 10. 
### 自检结论(执行者) + +> **40 帽 · 2026-05-31** · 分支 `task/chatbi-v3-lowconf-sql-preview` + +### 命令与退出码 + +| 命令 | cwd | 退出码 | 要点 | +|------|-----|--------|------| +| `pytest tests/test_unified_chat_backend_v2_agent.py -k "plan_preview or plan_execution or invalid_json or preview_fail or sse_parity" -q` | 仓根 | 0 | 5 passed | +| `python tools/tech_graph_contract_check.py` | 仓根 | 0 | OK | +| `pytest tests -m "not intent_eval and not intent_benchmark"` | 仓根 | 0 | 272 passed, 1 skipped | +| `python tools/harness_task_validate.py docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md` | 仓根 | 0 | OK | +| `python tools/harness_human_gate_check.py --task …` | 仓根 | 1 | **预期**:HG-REINSPECT pending | + +### 验收表(§6 摘要) + +| 项 | 结果 | 证据 | +|----|------|------| +| G1–G4 | pass | 见 §2 测例名 | +| G5 母单 5-2 | pass | CLOSE 已同步 | +| G6 Harness | pass | reinspect `8a8a17e` | +| 50 复检 | pass | 272 passed · gate_check OK | +| pytest | pass | 见上表 | + +### OpenSpec × TDD + +| 维度 | 结论 | +|------|------| +| Completeness | pass — F1–F5 + Scenario ID | +| Correctness | pass — 先补测后验证现网行为,无 api 大改 | +| Coherence | pass — re-baseline 与 SPEC §2/§4 一致 | + +--- + +## 修订记录 + +| 日期 | 摘要 | +|------|------| +| 2026-05-31 | v0.1 草案:§5-2 子 task · re-baseline · KPI v1.2 首条业务链 | +| 2026-05-31 | v1.0 关账:00→50 · Task_KPI% 100 · G5/SPEC · `8a8a17e` | + +--- + +## 给 Cursor + +`chatbi-v3-lowconf-sql-preview`、`plan_execution_token`、`agent.plan.preview`、`kpi_rubric:KPI_RUBRIC_v1_2`、`kpi_aggregator:00`、§5-2、re-baseline、Harness 业务验证 diff --git a/docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md b/docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md new file mode 100644 index 00000000..81f8c30f --- /dev/null +++ b/docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md @@ -0,0 +1,150 @@ +# 独立复检报告 · chatbi-v3-lowconf-sql-preview · v1 + +| 字段 | 值 | +|------|-----| +| task | 
`docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md` | +| task_slug | `chatbi-v3-lowconf-sql-preview` | +| freeze_id | `CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31` | +| git_branch | `task/chatbi-v3-lowconf-sql-preview` | +| base_commit | `0b5b9d4`(50 开帽时 HEAD) | +| diff_range | `origin/main...HEAD` | +| reinspect_mode | 独立复检 | +| invoke | `docs/harness/invokes/by-task/chatbi-v3-lowconf-sql-preview/invoke_20260531_50_chatbi-v3-lowconf-sql-preview.md` | +| audit_review | `docs/harness/reviews/by-task/chatbi-v3-lowconf-sql-preview/task_chatbi_v3_lowconf_sql_preview_v1_audit_R1_20260531.md` | +| reviewer | Agent(50 帽 · Fresh Context) | +| date | 2026-05-31 | + +--- + +## 1. VERIFY 独立重跑 + +| 命令 | cwd | 退出码 | 要点 | +|------|-----|--------|------| +| `pytest tests/test_unified_chat_backend_v2_agent.py -k "plan_preview or plan_execution" -q` | 仓根 | **0** | 5 passed | +| `pytest tests/test_chatbi_plan_token.py -q` | 仓根 | **0** | 9 passed | +| `python tools/tech_graph_contract_check.py` | 仓根 | **0** | OK | +| `pytest tests -m "not intent_eval and not intent_benchmark"` | 仓根 | **0** | 272 passed, 1 skipped | +| `python tools/harness_task_validate.py docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md` | 仓根 | **0** | OK | +| `python tools/harness_human_gate_check.py --task docs/tasks/active/task_chatbi_v3_lowconf_sql_preview_v1.md` | 仓根 | **0** | OK(HG-REINSPECT 已 `approved`) | + +与 40 自检结论一致(pytest/contract 独立复现);`harness_human_gate_check` 当前 **0**(40 记录为预期 exit 1,因 50 开帽前 gate 已人预批,见 §2)。 + +--- + +## 2. 
human_gate commit-level 审查 + +| gate_id | status | author / commit | 结论 | +|---------|--------|-----------------|------| +| HG-TASK-DRAFT | approved | `cyning` · `5c2b255` | 人签;非 Agent 代签 | +| HG-AUDIT-R1 | approved | `cyning` · `5c2b255` | 同上 | +| HG-REINSPECT | approved | `cyning` · `5c2b255`(**50 开帽前**预批) | 人签;**时序偏离**名义流程(40 期望 pending → 50 后人签);50 **未**改写 gate | + +`git log -p origin/main...HEAD -- task` 显示 `5c2b255` 将三闸 `pending→approved`;author 为 `cyning`,非 Agent 会话代填。22 审查 md L55 已注明「文件已预批;50 仍须独立复检」。 + +--- + +## 3. scope / freeze_id / Delta + +| 项 | 结论 | 证据 | +|----|------|------| +| freeze_id 内 | **pass** | 变更 = 3 新 pytest + Harness 落盘 + task;无契约键名变更 | +| diff 触达 `tests/` + 契约校验 | **pass** | `tests/test_unified_chat_backend_v2_agent.py` +373L;contract check OK | +| ADDED `lowconf-token-invalid-deny` | **pass** | `test_v3_plan_execution_token_invalid_json_denies_bypass` L1262–1403 | +| ADDED `lowconf-plan-preview-sse-parity` | **pass** | `test_v3_plan_preview_sse_parity` L1518–1632 | +| MODIFIED `parent-task-5-2-closed` | **pending** | 母单 §5.1 5-2 未改;**00/CLOSE** 职责(G5) | +| 无静默扩 scope | **pass** | diff 8 files;无 `api/` 实现改动 | + +--- + +## 4. 
§2 G1–G6 验收表 + +| 验收项 | pass/fail | 证据 | 备注 | +|--------|-----------|------|------| +| **G1** 无效 token deny(F2) | **pass** | `test_v3_plan_execution_token_invalid_json_denies_bypass` L1262–1403;问句不匹配 L1391–1392、篡改签 L1401–1403;`calls["reg"]==0` | Scenario `lowconf-token-invalid-deny` | +| **G2** SSE parity | **pass** | `test_v3_plan_preview_sse_parity` L1518–1632;含 `agent.plan.preview` + `plan_execution_token` + 顺序先于 clarify | 22 强制 parity,无 defer | +| **G3** 预览失败(F3) | **pass** | `test_v3_plan_preview_fail_json_no_token` L1406–1515;无 preview 事件;clarify 含「无法签发 plan_execution_token」 | | +| **G4** 只读闸 `preview_only` | **pass** | `test_v3_plan_preview_json_includes_plan_preview_and_ttl_notice` L1012 `assert preview_only is True`;SSE 测例 L1541 同断言 | 现网 + 测例钉住 | +| **G5** 母单 §5.1 5-2 文档同步 | **pending** | task §2 L84 仍 `[ ]`;母单未改 | **00/CLOSE**;非 50 实现阻塞 | +| **G6** Harness 落盘 | **pass** | invokes 00/22/30/40/50 · review R1 · **本文件** | KPI 表待 CLOSE | +| pytest 全绿 | **pass** | VERIFY 272 passed | AGENTS §8 等价 | +| contract check | **pass** | VERIFY exit 0 | F5 | + +--- + +## 5. failure_paths 逐项 + +| # | Scenario ID | 判定 | 证据 / 说明 | +|---|-------------|------|-------------| +| F1 | `fp-lowconf-unconfirmed-exec` | **pass** | G4/G1 测例:低置信走 clarify + preview,无未确认全量执行 | +| F2 | `fp-lowconf-token-invalid` | **pass** | G1 测例 L1262–1403 | +| F3 | `fp-lowconf-preview-fail` | **pass** | G3 测例 L1406–1515 | +| F4 | `fp-lowconf-preview-off` | **pass-with-notes** | 本 PR 无新增 `CHATBI_V3_PLAN_PREVIEW_CONFIRM=0` 专测;re-baseline 行为在 SPEC/现网;非 G1–G4 范围 | +| F5 | `fp-lowconf-contract-drift` | **pass** | `tech_graph_contract_check.py` exit 0 | + +--- + +## 6. test_strategy: required + +| 检查 | 结论 | +|------|------| +| 新增 pytest 与 Delta 场景对应 | **pass** — 3 新测 + 2 既有 plan 测例共 5 条 `-k plan_preview or plan_execution` | +| 先红后绿叙事 | **pass** — 30 帽补测验证现网行为;diff 无 `api/` 大改 | +| 触达契约时 contract check | **pass** | + +--- + +## 7. 
阻塞合并项 + +| 项 | 类型 | 解除方式 | +|----|------|----------| +| G5 母单 5-2 → 已验收 | 关账 | 00/CLOSE 更新母单 + SPEC §6 勾选 | +| task `### KPI(00)` 50/CLOSE 行 | 关账 | 00/CLOSE 汇总 HatInstance + Task_KPI% | +| task → `done/` + `_views/done.md` | 关账 | CLOSE 轮 | +| HG-REINSPECT 时序 | **备注** | 已 `approved`(cyning@5c2b255);50 书面通过后维护者 **确认预签仍有效** 再 merge | + +**50 范围内无实现/测试缺陷阻塞。** + +--- + +## 8. 合并建议 + +**建议条件合并(50 书面通过)** + +- G1–G4 + failure_paths F1–F3/F5 有 pytest/CI 证据;272 passed;contract OK;Harness invoke/review/reinspect 齐全。 +- **PR 合入前仍须 00/CLOSE**:① G5 母单 §5.1 **5-2** 标记已验收;② 填 task **`### KPI(00)`** + experience 摘要 + **CLOSE_TRACE**;③ task 移 `done/`。 +- HG-REINSPECT 已在 `5c2b255` 人预批;50 **未**代签。维护者 merge 前确认预签时序可接受。 +- 本变更涉 `tests/` + 契约路径 → **不可**打 `automerge` 标签(`.mergify.yml`)。 + +--- + +## 9. HatInstance(50 · KPI_RUBRIC_v1_2) + +| hat_code | round | agent_mode | D1 | D2 | D3 | D4 | D5 | judgment_notes | +|----------|-------|------------|----|----|----|----|-----|----------------| +| 50 | v1 | main_chat | pass | pass | pass | pass | pass | Fresh Context;独立 VERIFY;G5/KPI intentionally pending;HG 预批时序已注记 | + +--- + +## 10. Judgment(50) + +- **experience_capture**: **建议升级 required** — 首条业务 Harness 帽链(00→22→30→40→50)已跑通;CLOSE 经验(KPI 表 + 母单同步 + 5-2 关账)应强制落盘防漂移。 +- **gate/risk**: **须人审:HG-REINSPECT** — 已 `approved`(cyning@5c2b255,50 前预批);merge 前维护者确认预签;50 未代签。 +- **hat_self**: **pass-with-notes** — G5/KPI/CLOSE intentionally pending,已列 00 接力清单;F4 无专测但 re-baseline 可接受。 + +--- + +## 11. 给需求帽回填 + +**无**(实现与 SPEC §2/§4 一致;文档缺口为关账序而非需求变更)。 + +--- + +## 12. 下一棒 + +**00/CLOSE 新会话**(`TEMPLATE-orchestrator-invoke` · 关账模式): + +1. 汇总各帽 HatInstance → 填写 task **`### KPI(00)`** + Task_KPI% +2. G5:母单 §5.1 **5-2** → **已验收** + 链本 PR +3. experience 摘要 + **CLOSE_TRACE** +4. task `git mv` → `done/` + 更新 `_views/done.md` +5. 
确认 HG-REINSPECT 预签后开 PR / merge diff --git a/tests/test_unified_chat_backend_v2_agent.py b/tests/test_unified_chat_backend_v2_agent.py index 07fa0f77..0c3b4b21 100644 --- a/tests/test_unified_chat_backend_v2_agent.py +++ b/tests/test_unified_chat_backend_v2_agent.py @@ -1258,3 +1258,376 @@ async def _fake_decide_intent_v2( assert "agent.clarify" not in types2 assert calls["reg"] >= 1 + +def test_v3_plan_execution_token_invalid_json_denies_bypass(monkeypatch: pytest.MonkeyPatch): + """F2 lowconf-token-invalid-deny: a tampered or query-mismatched token must not skip clarify or run the full Text2SQL.""" + monkeypatch.setenv("CHATBI_USE_AGENT", "true") + monkeypatch.setenv("CHATBI_V2_INTENT_LLM", "false") + monkeypatch.setenv("CHATBI_V3_LOW_CONFIDENCE_CLARIFY", "1") + monkeypatch.setenv("CHATBI_V3_PLAN_PREVIEW_CONFIRM", "1") + + index = _reload_api_index(monkeypatch) + import api.tools as tools_mod + import api.unified_chat as unified_chat + import api.agent as agent_module + + calls: dict[str, int] = {"preview": 0, "reg": 0} + + async def _tools_t2s_dual( + query: str, + *, + history: list[dict[str, Any]] | None = None, + debug_llm_prompts: bool = False, + chain_emit: Any = None, + chain_started_at: float | None = None, + json_log_ctx: dict[str, Any] | None = None, + preview_only: bool = False, + ) -> ToolResult: + _ = (query, history, debug_llm_prompts, chain_emit, chain_started_at, json_log_ctx) + if preview_only: + calls["preview"] += 1 + return ToolResult( + success=True, + data={"sql": "SELECT 1 AS deny_probe", "answer": ""}, + error=None, + error_code=None, + error_stage=None, + latency_ms=1, + ) + calls["reg"] += 1 + return ToolResult( + success=True, + data={"sql": "SELECT 1", "answer": "不应执行", "columns": [], "rows": [], "truncated": False}, + error=None, + error_code=None, + error_stage=None, + latency_ms=1, + ) + + monkeypatch.setattr(tools_mod, "text2sql_execute", _tools_t2s_dual) + + async def _rag_ok_exec( + *, query: str, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = 
False + ) -> ToolResult: + _ = (query, history) + return ToolResult( + success=True, + data={"answer": "rag ok", "hits": []}, + error=None, + error_code=None, + error_stage=None, + latency_ms=2, + ) + + async def _direct_ok_exec( + *, query: str, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False + ) -> ToolResult: + _ = (query, history) + return ToolResult( + success=True, + data={"answer": "direct"}, + error=None, + error_code=None, + error_stage=None, + latency_ms=1, + ) + + class _DummyRegistry: + def __init__(self, tools: list[Tool]) -> None: + self._tools = tools + + def list_tools(self) -> list[Tool]: + return self._tools + + dummy_tools = [ + _make_tool("direct_answer", _direct_ok_exec), + _make_tool("rag_search", _rag_ok_exec), + _make_tool("text2sql_query", _rag_ok_exec), + ] + monkeypatch.setattr(unified_chat, "get_tool_registry", lambda: _DummyRegistry(dummy_tools)) + + async def _fake_decide_intent_v2( + *, + query: str, + history: list[dict[str, Any]], + tools: list[Tool], + min_confidence: float, + timeout: float, + **kwargs: Any, + ) -> IntentDecision: + _ = (query, history, tools, min_confidence, timeout, kwargs) + return IntentDecision( + tool="text2sql_query", + mode="text2sql", + reasoning="stub", + reasoning_full="stub", + confidence=0.35, + fallback="rag_search", + structured_signals=StructuredSignals(llm_prefers_sql=True, has_aggregation_signals=False), + raw_response={"used": "stub"}, + ) + + monkeypatch.setattr(agent_module, "decide_intent_v2", _fake_decide_intent_v2) + + client = TestClient(index.app) + q = "低置信无效令牌拒放探针" + res1 = client.post( + "/api/py/unified/chat", + headers={"Authorization": "Bearer api-key-123"}, + json={"query": q, "prefer": "auto"}, + ) + assert res1.status_code == 200 + tok = next(e for e in res1.json()["events"] if e.get("type") == "agent.plan.preview")["payload"][ + "plan_execution_token" + ] + + res_wrong_q = client.post( + "/api/py/unified/chat", + headers={"Authorization": "Bearer 
api-key-123"}, + json={"query": "另一问句不匹配", "prefer": "auto", "plan_execution_token": tok}, + ) + assert res_wrong_q.status_code == 200 + types_wq = [e.get("type") for e in res_wrong_q.json()["events"]] + assert "agent.clarify" in types_wq + assert calls["reg"] == 0 + + tampered = (tok[:-1] + ("A" if tok[-1] != "A" else "B")) if tok else "not-a-token" + res_bad = client.post( + "/api/py/unified/chat", + headers={"Authorization": "Bearer api-key-123"}, + json={"query": q, "prefer": "auto", "plan_execution_token": tampered}, + ) + assert res_bad.status_code == 200 + types_bad = [e.get("type") for e in res_bad.json()["events"]] + assert "agent.clarify" in types_bad + assert calls["reg"] == 0 + + +def test_v3_plan_preview_fail_json_no_token(monkeypatch: pytest.MonkeyPatch): + """F3 fp-lowconf-preview-fail: when the preview_only call fails, no agent.plan.preview event is emitted and clarify carries the failure notice.""" + monkeypatch.setenv("CHATBI_USE_AGENT", "true") + monkeypatch.setenv("CHATBI_V2_INTENT_LLM", "false") + monkeypatch.setenv("CHATBI_V3_LOW_CONFIDENCE_CLARIFY", "1") + monkeypatch.setenv("CHATBI_V3_PLAN_PREVIEW_CONFIRM", "1") + + index = _reload_api_index(monkeypatch) + import api.tools as tools_mod + import api.unified_chat as unified_chat + import api.agent as agent_module + + async def _tools_t2s_preview_fail( + query: str, + *, + history: list[dict[str, Any]] | None = None, + debug_llm_prompts: bool = False, + chain_emit: Any = None, + chain_started_at: float | None = None, + json_log_ctx: dict[str, Any] | None = None, + preview_only: bool = False, + ) -> ToolResult: + _ = (query, history, debug_llm_prompts, chain_emit, chain_started_at, json_log_ctx) + if preview_only: + return ToolResult( + success=False, + data=None, + error="preview failed", + error_code="SQL_PREVIEW_FAIL", + error_stage="sql.preview", + latency_ms=1, + ) + raise AssertionError("全量 Text2SQL 不应在预览失败轮执行") + + monkeypatch.setattr(tools_mod, "text2sql_execute", _tools_t2s_preview_fail) + + async def _rag_ok_exec( + *, query: str, history: list[dict[str, 
Any]] | None = None, debug_llm_prompts: bool = False + ) -> ToolResult: + _ = (query, history) + return ToolResult( + success=True, + data={"answer": "rag ok", "hits": []}, + error=None, + error_code=None, + error_stage=None, + latency_ms=2, + ) + + async def _direct_ok_exec( + *, query: str, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False + ) -> ToolResult: + _ = (query, history) + return ToolResult( + success=True, + data={"answer": "direct"}, + error=None, + error_code=None, + error_stage=None, + latency_ms=1, + ) + + class _DummyRegistry: + def __init__(self, tools: list[Tool]) -> None: + self._tools = tools + + def list_tools(self) -> list[Tool]: + return self._tools + + dummy_tools = [ + _make_tool("direct_answer", _direct_ok_exec), + _make_tool("rag_search", _rag_ok_exec), + _make_tool("text2sql_query", _rag_ok_exec), + ] + monkeypatch.setattr(unified_chat, "get_tool_registry", lambda: _DummyRegistry(dummy_tools)) + + async def _fake_decide_intent_v2( + *, + query: str, + history: list[dict[str, Any]], + tools: list[Tool], + min_confidence: float, + timeout: float, + **kwargs: Any, + ) -> IntentDecision: + _ = (query, history, tools, min_confidence, timeout, kwargs) + return IntentDecision( + tool="text2sql_query", + mode="text2sql", + reasoning="stub", + reasoning_full="stub", + confidence=0.35, + fallback="rag_search", + structured_signals=StructuredSignals(llm_prefers_sql=True, has_aggregation_signals=False), + raw_response={"used": "stub"}, + ) + + monkeypatch.setattr(agent_module, "decide_intent_v2", _fake_decide_intent_v2) + + client = TestClient(index.app) + res = client.post( + "/api/py/unified/chat", + headers={"Authorization": "Bearer api-key-123"}, + json={"query": "低置信预览失败探针", "prefer": "auto"}, + ) + assert res.status_code == 200 + types = [e.get("type") for e in res.json()["events"]] + assert "agent.plan.preview" not in types + clarify_evt = next(e for e in res.json()["events"] if e.get("type") == "agent.clarify") + 
assert "无法签发 plan_execution_token" in clarify_evt["payload"]["prompt_for_user"] + + +def test_v3_plan_preview_sse_parity(monkeypatch: pytest.MonkeyPatch): + """G2 lowconf-plan-preview-sse-parity: the SSE low-confidence clarify stream must contain agent.plan.preview and plan_execution_token, with the preview ahead of agent.clarify.""" + monkeypatch.setenv("CHATBI_USE_AGENT", "true") + monkeypatch.setenv("CHATBI_V2_INTENT_LLM", "false") + monkeypatch.setenv("CHATBI_V3_LOW_CONFIDENCE_CLARIFY", "1") + monkeypatch.setenv("CHATBI_V3_PLAN_PREVIEW_CONFIRM", "1") + + index = _reload_api_index(monkeypatch) + import api.tools as tools_mod + import api.unified_chat as unified_chat + import api.agent as agent_module + + async def _tools_t2s_preview_only( + query: str, + *, + history: list[dict[str, Any]] | None = None, + debug_llm_prompts: bool = False, + chain_emit: Any = None, + chain_started_at: float | None = None, + json_log_ctx: dict[str, Any] | None = None, + preview_only: bool = False, + ) -> ToolResult: + _ = (query, history, debug_llm_prompts, chain_emit, chain_started_at, json_log_ctx) + assert preview_only is True + return ToolResult( + success=True, + data={"sql": "SELECT 2 AS sse_probe", "answer": ""}, + error=None, + error_code=None, + error_stage=None, + latency_ms=1, + ) + + monkeypatch.setattr(tools_mod, "text2sql_execute", _tools_t2s_preview_only) + + async def _rag_ok_exec( + *, query: str, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False + ) -> ToolResult: + _ = (query, history) + return ToolResult( + success=True, + data={"answer": "rag ok", "hits": []}, + error=None, + error_code=None, + error_stage=None, + latency_ms=2, + ) + + async def _direct_ok_exec( + *, query: str, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False + ) -> ToolResult: + _ = (query, history) + return ToolResult( + success=True, + data={"answer": "direct"}, + error=None, + error_code=None, + error_stage=None, + latency_ms=1, + ) + + class _DummyRegistry: + def __init__(self, tools: list[Tool]) -> None: + self._tools = tools 
+ + def list_tools(self) -> list[Tool]: + return self._tools + + dummy_tools = [ + _make_tool("direct_answer", _direct_ok_exec), + _make_tool("rag_search", _rag_ok_exec), + _make_tool("text2sql_query", _rag_ok_exec), + ] + monkeypatch.setattr(unified_chat, "get_tool_registry", lambda: _DummyRegistry(dummy_tools)) + + async def _fake_decide_intent_v2( + *, + query: str, + history: list[dict[str, Any]], + tools: list[Tool], + min_confidence: float, + timeout: float, + **kwargs: Any, + ) -> IntentDecision: + _ = (query, history, tools, min_confidence, timeout, kwargs) + return IntentDecision( + tool="text2sql_query", + mode="text2sql", + reasoning="stub", + reasoning_full="stub", + confidence=0.35, + fallback="rag_search", + structured_signals=StructuredSignals(llm_prefers_sql=True, has_aggregation_signals=False), + raw_response={"used": "stub"}, + ) + + monkeypatch.setattr(agent_module, "decide_intent_v2", _fake_decide_intent_v2) + + client = TestClient(index.app) + with client.stream( + "POST", + "/api/py/unified/chat/stream", + headers={"Authorization": "Bearer api-key-123"}, + json={"query": "低置信 SSE 预览帧探针", "prefer": "auto"}, + ) as res: + assert res.status_code == 200 + text = "" + for chunk in res.iter_text(): + text += chunk + if "event: done" in text: + break + assert "agent.plan.preview" in text + assert "plan_execution_token" in text + assert "agent.clarify" in text + assert text.index("agent.plan.preview") < text.index("agent.clarify") +