diff --git a/api/agent.py b/api/agent.py index 99dae0d3..d5bb4482 100644 --- a/api/agent.py +++ b/api/agent.py @@ -64,6 +64,9 @@ def _agent_chain(typ: str, started_at: float, step_id: str, payload: dict[str, A "plan_id": "", "tool": "text2sql_query", "sql_draft": "", + "rewrite_query": "", + "planned_top_k": 10, + "preview_headlines": [], "warnings": [], "plan_execution_token": "", "expires_in_sec": 120, @@ -446,28 +449,51 @@ async def run( # 不得沿用「已切到 fallback 工具」的 step1_mode(常见 rag),否则 Timeline 像已转 RAG 却无任何工具执行。 clarify_gate = os.getenv("CHATBI_V3_LOW_CONFIDENCE_CLARIFY", "").strip().lower() in ("1", "true", "yes") from .chatbi_plan_token import ( - mint_clarify_text2sql_bypass_token, + mint_clarify_plan_bypass_token, plan_preview_confirm_enabled, plan_token_ttl_s, - verify_clarify_text2sql_bypass_token, + verify_clarify_plan_bypass_token, ) - _plan_bypass = plan_preview_confirm_enabled() and verify_clarify_text2sql_bypass_token( - plan_execution_token, session_id=session_id, query=query - ) - _clarify_eligible = ( + _plan_bypass_tool: ToolName | None = None + if plan_preview_confirm_enabled(): + if verify_clarify_plan_bypass_token( + plan_execution_token, + session_id=session_id, + query=query, + expected_tool="text2sql_query", + ): + _plan_bypass_tool = "text2sql_query" + elif verify_clarify_plan_bypass_token( + plan_execution_token, + session_id=session_id, + query=query, + expected_tool="rag_search", + ): + _plan_bypass_tool = "rag_search" + + _clarify_tool: ToolName | None = None + if ( clarify_gate and prefer == "auto" and intent is not None - and intent.tool == "text2sql_query" and intent.confidence < self._min_confidence - and not _plan_bypass - ) - # 用户已持有效 plan_execution_token:本轮回放首步须回到意图候选 text2sql,而非低置信 fallback 的 rag。 - if _plan_bypass: - step1_tool = "text2sql_query" + and _plan_bypass_tool is None + ): + if intent.tool == "text2sql_query": + _clarify_tool = "text2sql_query" + elif intent.tool == "rag_search": + _clarify_tool = "rag_search" + 
_clarify_eligible = _clarify_tool is not None + + # 用户已持有效 plan_execution_token:本轮回放首步须回到确认时的工具,而非低置信 fallback。 + if _plan_bypass_tool: + step1_tool = _plan_bypass_tool step1_mode = self._tool_to_mode(step1_tool) - step1_reasoning = "已校验 plan_execution_token,按用户确认放行执行 Text2SQL。" + if _plan_bypass_tool == "rag_search": + step1_reasoning = "已校验 plan_execution_token,按用户确认放行执行 RAG 检索。" + else: + step1_reasoning = "已校验 plan_execution_token,按用户确认放行执行 Text2SQL。" # Step 循环:必须多步(允许成功在 2 步内结束,但失败应触发继续) current_tool: ToolName = step1_tool @@ -644,16 +670,30 @@ async def _emit_final_chains(fin: AgentFinalView, answer: str) -> None: ) ) - # P1-4 §4.3:低置信 + SQL 候选时可选「澄清短路」(默认关,避免改变现网行为) - if _clarify_eligible: + # P1-4 §4.3:低置信 + text2sql/rag 候选时可选「澄清短路」(默认关,避免改变现网行为) + if _clarify_eligible and _clarify_tool is not None: _cl_msg = "待您澄清(低置信度)" plan_preview_payload: dict[str, Any] | None = None plan_ttl_s = plan_token_ttl_s() - ttl_notice = ( - f"若确认按预览 SQL 继续查数:请在 {plan_ttl_s} 秒内在**下一轮同一问题**的请求 JSON 中带 " - f"`\"plan_execution_token\": \"…\"`(见 `agent.plan.preview` 中的 `plan_execution_token`)。" - "若未及时附带令牌,本预览 SQL 与该令牌均失效,须**重新发起本问题**才能再次预览。" - ) + _clarify_mode = self._tool_to_mode(_clarify_tool) + if _clarify_tool == "rag_search": + ttl_notice = ( + f"若确认按预览检索方案继续:请在 {plan_ttl_s} 秒内在**下一轮同一问题**的请求 JSON 中带 " + f"`\"plan_execution_token\": \"…\"`(见 `agent.plan.preview` 中的 `plan_execution_token`)。" + "若未及时附带令牌,本预览方案与该令牌均失效,须**重新发起本问题**才能再次预览。" + ) + _preview_fail_hint = ( + "(本轮未能生成可放行的 RAG 方案预览,无法签发 plan_execution_token;请改问或补充检索范围。)" + ) + else: + ttl_notice = ( + f"若确认按预览 SQL 继续查数:请在 {plan_ttl_s} 秒内在**下一轮同一问题**的请求 JSON 中带 " + f"`\"plan_execution_token\": \"…\"`(见 `agent.plan.preview` 中的 `plan_execution_token`)。" + "若未及时附带令牌,本预览 SQL 与该令牌均失效,须**重新发起本问题**才能再次预览。" + ) + _preview_fail_hint = ( + "(本轮未能生成可放行的 SQL 预览,无法签发 plan_execution_token;请改问或使用 prefer=text2sql。)" + ) use_reasoning = (os.getenv("CHATBI_V3_CLARIFY_PROMPT_USE_REASONING", "") or "").strip().lower() in ( "1", "true", @@ 
-673,35 +713,76 @@ async def _emit_final_chains(fin: AgentFinalView, answer: str) -> None: _cl_prompt = _generic if plan_preview_confirm_enabled(): _cl_prompt = (_cl_prompt.rstrip() + "\n\n" + ttl_notice).strip() - from .tools import text2sql_execute as _t2s_preview # noqa: PLC0415 - _prev_hist: list[dict[str, Any]] = turn_history[-6:] - _t2s_json_ctx: dict[str, Any] | None = None - if run_id: - _t2s_json_ctx = {"request_id": run_id, "run_id": run_id, "session_id": session_id} - _pr = await _t2s_preview( - query, - history=_prev_hist, - debug_llm_prompts=debug_llm_prompts, - chain_emit=emit, - chain_started_at=ts_ref, - json_log_ctx=_t2s_json_ctx, - preview_only=True, - ) - sql_pv = "" - if _pr.success and isinstance(_pr.data, dict) and isinstance(_pr.data.get("sql"), str): - sql_pv = (_pr.data.get("sql") or "").strip() - if sql_pv: - exec_tok = mint_clarify_text2sql_bypass_token(session_id=session_id, query=query) - plan_prev_id = str(uuid.uuid4()).replace("-", "")[:20] - plan_preview_payload = { - "plan_id": plan_prev_id, - "tool": "text2sql_query", - "sql_draft": sql_pv, - "warnings": [ttl_notice], - "plan_execution_token": exec_tok, - "expires_in_sec": plan_ttl_s, - } + plan_prev_id = str(uuid.uuid4()).replace("-", "")[:20] + if _clarify_tool == "text2sql_query": + from .tools import text2sql_execute as _t2s_preview # noqa: PLC0415 + + _t2s_json_ctx: dict[str, Any] | None = None + if run_id: + _t2s_json_ctx = {"request_id": run_id, "run_id": run_id, "session_id": session_id} + _pr = await _t2s_preview( + query, + history=_prev_hist, + debug_llm_prompts=debug_llm_prompts, + chain_emit=emit, + chain_started_at=ts_ref, + json_log_ctx=_t2s_json_ctx, + preview_only=True, + ) + sql_pv = "" + if _pr.success and isinstance(_pr.data, dict) and isinstance(_pr.data.get("sql"), str): + sql_pv = (_pr.data.get("sql") or "").strip() + if sql_pv: + exec_tok = mint_clarify_plan_bypass_token( + session_id=session_id, query=query, tool="text2sql_query" + ) + plan_preview_payload 
= { + "plan_id": plan_prev_id, + "tool": "text2sql_query", + "sql_draft": sql_pv, + "warnings": [ttl_notice], + "plan_execution_token": exec_tok, + "expires_in_sec": plan_ttl_s, + } + else: + from .tools import rag_search_execute as _rag_preview # noqa: PLC0415 + + _pr_rag = await _rag_preview( + query, + history=_prev_hist, + debug_llm_prompts=debug_llm_prompts, + preview_only=True, + ) + rewrite_pv = "" + planned_k = 10 + headlines: list[str] = [] + if _pr_rag.success and isinstance(_pr_rag.data, dict): + rw = _pr_rag.data.get("rewritten") + if isinstance(rw, str): + rewrite_pv = rw.strip() + try: + planned_k = int(_pr_rag.data.get("planned_top_k") or 10) + except Exception: # noqa: BLE001 + planned_k = 10 + ph = _pr_rag.data.get("preview_headlines") + if isinstance(ph, list): + headlines = [str(x) for x in ph if x][:6] + if rewrite_pv: + exec_tok = mint_clarify_plan_bypass_token( + session_id=session_id, query=query, tool="rag_search" + ) + plan_preview_payload = { + "plan_id": plan_prev_id, + "tool": "rag_search", + "rewrite_query": rewrite_pv, + "planned_top_k": planned_k, + "preview_headlines": headlines, + "warnings": [ttl_notice], + "plan_execution_token": exec_tok, + "expires_in_sec": plan_ttl_s, + } + if plan_preview_payload: if emit is not None: await emit( _agent_chain( @@ -718,15 +799,12 @@ async def _emit_final_chains(fin: AgentFinalView, answer: str) -> None: run_id=run_id, session_id=session_id, route="agent", - mode="text2sql", + mode=_clarify_mode, plan_id=plan_prev_id, gate_bypass_reason="plan_preview_token_minted", ) else: - _cl_prompt = ( - _cl_prompt.rstrip() - + "\n\n(本轮未能生成可放行的 SQL 预览,无法签发 plan_execution_token;请改问或使用 prefer=text2sql。)" - ).strip() + _cl_prompt = (_cl_prompt.rstrip() + "\n\n" + _preview_fail_hint).strip() clarify_pl: dict[str, Any] = {"step_number": 1, "message": _cl_msg, "prompt_for_user": _cl_prompt} if chatbi_json_log_enabled() and run_id: @@ -736,21 +814,21 @@ async def _emit_final_chains(fin: AgentFinalView, answer: str) 
-> None: run_id=run_id, session_id=session_id, route="agent", - mode="text2sql", - intent_tool="text2sql_query", + mode=_clarify_mode, + intent_tool=_clarify_tool, intent_confidence=float(intent.confidence), clarify_gate=True, ) _final_answer = ( "系统在继续查数前需要先与您对齐语义。请查看 Timeline 中「待您澄清」条目并补充说明;" - "也可改用 prefer=text2sql 强制路径或改写问题后重试。" + "也可改用 prefer=text2sql / prefer=rag 强制路径或改写问题后重试。" ) final_cl = AgentFinalView( answer=_final_answer, - mode="text2sql", + mode=_clarify_mode, total_steps=0, tools_used=[], - modes=["text2sql"], + modes=[_clarify_mode], fallback_used=False, ) if emit is not None: diff --git a/api/chatbi_plan_token.py b/api/chatbi_plan_token.py index b887d7c9..7aba33e0 100644 --- a/api/chatbi_plan_token.py +++ b/api/chatbi_plan_token.py @@ -1,4 +1,4 @@ -"""低置信 Text2SQL 澄清:一次性放行令牌(HMAC,无服务端会话表)。""" +"""低置信澄清:一次性放行令牌(HMAC,无服务端会话表)。""" from __future__ import annotations @@ -10,7 +10,8 @@ import time from typing import Any -_PURPOSE = "clarify_text2sql_once" +_PURPOSE_LEGACY = "clarify_text2sql_once" +_PURPOSE = "clarify_plan_once" _SIG_LEN = 32 # HMAC-SHA256 digest size;签名字节中可出现 ``0x0a``,不得用 ``rsplit(b"\\n")`` 定界 @@ -40,12 +41,13 @@ def plan_token_ttl_s() -> int: return max(30, min(600, v)) -def mint_clarify_text2sql_bypass_token(*, session_id: str | None, query: str) -> str: - """签发「跳过一轮 clarify」令牌:绑定 session + 问句指纹 + TTL。""" +def mint_clarify_plan_bypass_token(*, session_id: str | None, query: str, tool: str) -> str: + """签发「跳过一轮 clarify」令牌:绑定 session + 问句指纹 + 工具 + TTL。""" ttl = plan_token_ttl_s() payload: dict[str, Any] = { "v": 1, "p": _PURPOSE, + "t": (tool or "").strip()[:64], "sid": (session_id or "")[:256], "qh": _query_fingerprint(query), "exp": int(time.time()) + ttl, @@ -56,6 +58,11 @@ def mint_clarify_text2sql_bypass_token(*, session_id: str | None, query: str) -> return base64.urlsafe_b64encode(raw).decode("ascii").rstrip("=") +def mint_clarify_text2sql_bypass_token(*, session_id: str | None, query: str) -> str: + """兼容 5-2:等价于 
``mint_clarify_plan_bypass_token(..., tool=text2sql_query)``。""" + return mint_clarify_plan_bypass_token(session_id=session_id, query=query, tool="text2sql_query") + + def _b64url_decode_padded(token: str) -> bytes: """urlsafe base64 无填充串的可靠解码(禁止固定追加 ``==``,否则部分长度会误解码/验签偶发失败)。""" t = token.strip() @@ -63,7 +70,16 @@ def _b64url_decode_padded(token: str) -> bytes: return base64.urlsafe_b64decode(t + ("=" * pad)) -def verify_clarify_text2sql_bypass_token(token: str | None, *, session_id: str | None, query: str) -> bool: +def _payload_tool(payload: dict[str, Any]) -> str: + if payload.get("p") == _PURPOSE_LEGACY: + return "text2sql_query" + t = payload.get("t") + return t if isinstance(t, str) else "" + + +def verify_clarify_plan_bypass_token( + token: str | None, *, session_id: str | None, query: str, expected_tool: str +) -> bool: if not isinstance(token, str) or not token.strip(): return False try: @@ -84,17 +100,27 @@ def verify_clarify_text2sql_bypass_token(token: str | None, *, session_id: str | payload = json.loads(body_b.decode("utf-8")) except Exception: # noqa: BLE001 return False - if payload.get("p") != _PURPOSE or int(payload.get("exp") or 0) < int(time.time()): + purpose = payload.get("p") + if purpose not in (_PURPOSE, _PURPOSE_LEGACY): + return False + if int(payload.get("exp") or 0) < int(time.time()): return False if (payload.get("sid") or "") != (session_id or ""): return False if (payload.get("qh") or "") != _query_fingerprint(query): return False - return True + tool = _payload_tool(payload) + return tool == (expected_tool or "").strip() + + +def verify_clarify_text2sql_bypass_token(token: str | None, *, session_id: str | None, query: str) -> bool: + return verify_clarify_plan_bypass_token( + token, session_id=session_id, query=query, expected_tool="text2sql_query" + ) def plan_preview_confirm_enabled() -> bool: - """低置信澄清时是否走 SQL 预览 + plan_execution_token。 + """低置信澄清时是否走方案预览 + plan_execution_token。 默认 **开启**(未设置或空字符串视为开);显式 
``0``/``false``/``no``/``off`` 关闭。 仍须 ``CHATBI_V3_LOW_CONFIDENCE_CLARIFY`` 开启且命中澄清分支才会实际预览。 diff --git a/api/tools.py b/api/tools.py index 0cc166f8..b7c4d6f6 100644 --- a/api/tools.py +++ b/api/tools.py @@ -197,6 +197,7 @@ async def rag_search_execute( *, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False, + preview_only: bool = False, ) -> ToolResult: started_at = time.perf_counter() hist = history or [] @@ -244,6 +245,30 @@ def _sync_rw() -> str: latency_ms=_elapsed_ms(started_at), ) + if preview_only: + planned_top_k = int(retrieved.get("top_k") or 10) + headlines: list[str] = [] + for h in hits[:6]: + if not isinstance(h, dict): + continue + label = ( + h.get("filename") + or h.get("title") + or h.get("path") + or h.get("url") + or h.get("id") + ) + if isinstance(label, str) and label.strip(): + headlines.append(label.strip()[:120]) + out_preview: dict[str, Any] = { + "rewritten": rewritten, + "planned_top_k": planned_top_k, + "preview_headlines": headlines, + } + if debug_llm_prompts and llm_prompts: + out_preview["llm_prompts"] = llm_prompts + return ToolResult(success=True, data=out_preview, latency_ms=_elapsed_ms(started_at)) + parts: list[str] = [] for i, h in enumerate(hits[:12]): content = h.get("content") if isinstance(h, dict) else None diff --git a/docs/_tech_graph/02_version.md b/docs/_tech_graph/02_version.md index cf39d223..59487f8b 100644 --- a/docs/_tech_graph/02_version.md +++ b/docs/_tech_graph/02_version.md @@ -35,5 +35,6 @@ timeline 2026-05-18 : 1e55c7b auto: api/chatbi_principal.py 2026-05-25 : fe536b8 auto: api/index.py 2026-05-29 : 034fdcd auto: api/chatbi_rate_limit.py + 2026-05-31 : b540fa3 auto: api/agent.py ``` diff --git a/docs/_tech_graph/_contract_manifest.json b/docs/_tech_graph/_contract_manifest.json index ed68f0e2..3114e43f 100644 --- a/docs/_tech_graph/_contract_manifest.json +++ b/docs/_tech_graph/_contract_manifest.json @@ -86,7 +86,7 @@ "agent.step.start": ["step_number", "max_steps"], 
"agent.think": ["step_number", "thought", "selected_tool", "mode", "confidence"], "agent.clarify": ["step_number", "message", "prompt_for_user"], - "agent.plan.preview": ["plan_id", "tool", "sql_draft", "warnings", "plan_execution_token", "expires_in_sec"], + "agent.plan.preview": ["plan_id", "tool", "sql_draft", "rewrite_query", "planned_top_k", "preview_headlines", "warnings", "plan_execution_token", "expires_in_sec"], "agent.intent": ["tool", "mode", "reasoning", "confidence", "fallback", "cache", "cache_key_hash", "latency_ms"], "agent.step.end": ["step_number", "tool_used", "mode", "success", "next_action"], "agent.final": ["total_steps", "tools_used", "modes", "fallback_used"], diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/NOTES-future-diary-llm-date.md b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/NOTES-future-diary-llm-date.md new file mode 100644 index 00000000..5a027e18 --- /dev/null +++ b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/NOTES-future-diary-llm-date.md @@ -0,0 +1,32 @@ +# 联调备忘 · 「未来日记」误判(非 §5-3 缺陷) + +**日期**:2026-05-31 +**关联样本**:[`round2_token_bypass_execute_timeline.json`](round2_token_bypass_execute_timeline.json) · `run_id=df8593fb-…` + +## 现象 + +用户问「`2026-04-28日记的大致内容`」,在 **5-3 闸门已通过**(预览卡 → 按预览执行 → `plan_execution_token` 校验 → 执行 `rag_search`)后: + +1. `rag_search` 返回 **`RAG_GENERATE_UNCERTAIN`** 或空,Timeline **无** `rag.sources`。 +2. 
Agent **fallback** `direct_answer`,助手回复将 **2026-04-28** 当作「未来日期」(措辞含「截至当前(2025年)」类表述),与知识库内真实日记日期不符。 + +## 根因假设(待实现验证) + +| 层级 | 说明 | +|------|------| +| **主因** | RAG **生成**与 **`direct_answer`** 的 system prompt **未注入「当前日期 / 仓库时间锚点 / 日记为知识库内历史文档」**,模型用训练截止默认「当前≈2025」,把 2026-04-28 判为未来。 | +| **代码锚点** | `api/tools.py` · `rag_search_execute` 生成段:`system = "你是一个检索增强问答助手…"`(约 L280–283),**无** `date.today()` 或 `PROJECT_CONFIG` 时间说明。 | +| **次因** | 检索命中弱或 generate 不确定 → 走 `direct_answer`,该路径 `system = "你是一个中文助手。请直接回答用户问题。"`(约 L835),同样 **无** 日期与 KB 边界。 | +| **非主因** | §5-3 低置信预览 / token / clarify 机制;Intent 联调 `INTENT_MIN_CONFIDENCE=1.0` 仅用于触发路线 A。 | + +## 建议后续(单独 task,勿阻塞 5-3 关账) + +1. **Prompt**:在 RAG generate 与 `direct_answer` system 中增加一行,例如:「今天是 {ISO 日期};用户提到的日记日期指知识库内已入库文档,勿以模型训练截止年判断是否为未来。」 +2. **配置**:是否在 `PROJECT_CONFIG` / `.env` 暴露 `AGENT_WALL_CLOCK_DATE`(测试可固定)。 +3. **回归**:问句固定为 `2026-04-28日记的大致内容`,断言 **不** 出现「未来日期」且(有命中时)`rag.sources` 非空。 +4. **Ink**:执行链仍可能显示 `SQL_DRAFT: -`(RAG 无 sql_draft),属展示层,与本案无关。 + +## 与验收边界 + +- **§5-3 机制**:预览、`plan_execution_token`、确认按钮、第二轮无 `clarify` 执行 `rag_search` → **已通过**(见 task done + reinspect)。 +- **端到端答案质量**:依赖检索命中 + 生成 prompt;本案记为 **已知 gap**,不 retro 否决 5-3。 diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/README.md b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/README.md new file mode 100644 index 00000000..3e07d48b --- /dev/null +++ b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/README.md @@ -0,0 +1,54 @@ +# 标准样本 · ChatBI V3 低置信 RAG 预览 + plan_execution_token(§5-3) + +> **用途**:联调 / PR / FE-5 烟测留证;**非**实现真值(真值见 `docs/tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md`、SPEC、pytest)。 +> **采集**:2026-05-31 · 后端 `b297c94` · Ink FE `72f8f0c` · **浏览器 真机**(2026-06-01 补截图与 E2E Timeline) +> **freeze_id**:`CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31` + +## 环境要点(路线 A · 5-3) + +| 项 | 值 | +|----|-----| +| 问句(成功预览) | `2026-04-28日记的大致内容` | +| session_id | `791291bd-92f8-4936-b698-28e8fb4025e9` | +| round1 run_id | 
`085082ae-6386-4f43-a60f-f11b4fffcd65`(14 条 SSE) | +| round2 run_id | `df8593fb-1dc1-4154-9349-6c68bd6e7b08`(浏览器全链 34 条;JSON 内为关键路径摘录) | +| 开关 | `CHATBI_USE_AGENT=1` · `CHATBI_V2_INTENT_LLM=1` · `CHATBI_V3_LOW_CONFIDENCE_CLARIFY=1` · `CHATBI_V3_PLAN_PREVIEW_CONFIRM=1` · **`INTENT_MIN_CONFIDENCE=1.0`(仅联调)** | + +联调通过后请将 `INTENT_MIN_CONFIDENCE` 改回 **0.6** 并重启 uvicorn。 + +## 截图 + +| 文件 | 说明 | +|------|------| +| [`screenshots/ui-confirm-rag-preview-card.png`](screenshots/ui-confirm-rag-preview-card.png) | 确认卡「按预览执行」+ `rewrite_query` | +| [`screenshots/timeline-round1-14-events-preview.png`](screenshots/timeline-round1-14-events-preview.png) | 首轮 ~14 条:`plan.preview` + `clarify` | +| [`screenshots/timeline-rag-preview-clarify-execution-path.png`](screenshots/timeline-rag-preview-clarify-execution-path.png) | 执行链含 preview / clarify 片段 | + +## 两轮 Timeline(真机) + +| 轮次 | 文件 | 关键观测 | +|------|------|----------| +| 1 预览+澄清 | [`round1_preview_clarify_timeline.json`](round1_preview_clarify_timeline.json) | 14 条 · `tool=rag_search` · `rewrite_query` · `plan_execution_token` · `agent.clarify` · **无** `tool.call` | +| 2 令牌放行 | [`round2_token_bypass_execute_timeline.json`](round2_token_bypass_execute_timeline.json) | token 校验 · **无** `clarify` · `rag_search` · `RAG_GENERATE_UNCERTAIN` · fallback `direct_answer` · **无** `rag.sources` | + +**路线 B 对照**(未开 `INTENT_MIN_CONFIDENCE=1.0` 或未重启):~31–34 条、直接 `tool.call.*`,无确认卡 — 非本目录样本目标。 + +## 已知问题(答案质量) + +见 [`NOTES-future-diary-llm-date.md`](NOTES-future-diary-llm-date.md):**未来日记**误判 — 估计 RAG/直接回答 **未给 LLM 当前日期与 KB 边界提示词**,非 §5-3 闸门缺陷。 + +## 再生(TestClient stub · 可选) + +```bash +cd ai-ink-brain-api-python +python scripts/export_lowconf_rag_diary_sample.py +``` + +> 脚本导出为 pytest 同形 stub;**本目录主 JSON 已为浏览器 E2E**(round1 全量 14 条;round2 关键路径 + `event_count_full_capture: 34`)。若你本地仍保留浏览器导出的完整 34 条 JSON,可覆盖 `round2_token_bypass_execute_timeline.json` 的 `timeline` 数组。 + +## 关联 + +- `docs/tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md` +- 
`docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md` +- 对照 5-2:[`../chatbi-v3-lowconf-sql-preview/`](../chatbi-v3-lowconf-sql-preview/) +- Ink FE:`ai-ink-brain` · `72f8f0c` diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/round1_preview_clarify_timeline.json b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/round1_preview_clarify_timeline.json new file mode 100644 index 00000000..528b5650 --- /dev/null +++ b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/round1_preview_clarify_timeline.json @@ -0,0 +1,181 @@ +{ + "meta": { + "sample_id": "chatbi-v3-lowconf-rag-preview-round1-e2e", + "captured_at": "2026-05-31", + "source": "browser_unified_chat_sse", + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "session_id": "791291bd-92f8-4936-b698-28e8fb4025e9", + "event_count": 14, + "query": "2026-04-28日记的大致内容", + "env_note": "CHATBI_V3_LOW_CONFIDENCE_CLARIFY=1 · CHATBI_V3_PLAN_PREVIEW_CONFIRM=1 · INTENT_MIN_CONFIDENCE=1.0 · uvicorn 重启后", + "screenshots": [ + "screenshots/ui-confirm-rag-preview-card.png", + "screenshots/timeline-rag-preview-clarify-execution-path.png" + ], + "acceptance": "§5-3 F1/F2:agent.plan.preview(rag_search+plan_execution_token) + agent.clarify;无 tool.call", + "notes": "路线 A;preview_headlines 为空数组(契约允许)" + }, + "timeline": [ + { + "type": "user.message", + "ts": 1780284164871, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "user", + "payload": { + "text": "2026-04-28日记的大致内容" + } + }, + { + "type": "meta", + "ts": 0, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "m1", + "payload": { + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "mode": "auto", + "session_id": "791291bd-92f8-4936-b698-28e8fb4025e9" + } + }, + { + "type": "agent.step.start", + "ts": 14156, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "a1", + "payload": { + "step_number": 1, + "max_steps": 5 + } + }, + { + "type": "agent.llm.start", + "ts": 14156, + "run_id": 
"085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "085082ae6386_s1_intent", + "payload": { + "phase": "intent", + "step_id": "s1" + } + }, + { + "type": "agent.llm.delta", + "ts": 14156, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "085082ae6386_s1_intent", + "payload": { + "text": "用户询问特定日期的日记内容,属于", + "part_index": 0 + } + }, + { + "type": "agent.llm.delta", + "ts": 14156, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "085082ae6386_s1_intent", + "payload": { + "text": "项目内非结构化文档,需检索文档库", + "part_index": 1 + } + }, + { + "type": "agent.llm.delta", + "ts": 14156, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "085082ae6386_s1_intent", + "payload": { + "text": "获取。", + "part_index": 2 + } + }, + { + "type": "agent.llm.end", + "ts": 14156, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "085082ae6386_s1_intent", + "payload": { + "ok": true, + "phase": "intent", + "step_id": "s1", + "simulated_stream": true + } + }, + { + "type": "router.decision", + "ts": 14156, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "r1", + "payload": { + "prefer": "auto", + "candidate_mode": "rag", + "final_mode": "rag", + "rule_hits": [], + "evidence": { + "agent_reasoning": "用户询问特定日期的日记内容,属于项目内非结构化文档,需检索文档库获取。" + }, + "fallback": "direct_answer" + } + }, + { + "type": "agent.plan.preview", + "ts": 26442, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "a1_plan_prev", + "payload": { + "plan_id": "6e87d7148b70426eb938", + "tool": "rag_search", + "rewrite_query": "2026-04-28日记的大致内容", + "planned_top_k": 10, + "preview_headlines": [], + "warnings": [ + "若确认按预览检索方案继续:请在 120 秒内在**下一轮同一问题**的请求 JSON 中带 \"plan_execution_token\": \"…\"(见 `agent.plan.preview` 中的 `plan_execution_token`)。若未及时附带令牌,本预览方案与该令牌均失效,须**重新发起本问题**才能再次预览。" + ], + "plan_execution_token": 
"eyJleHAiOjE3ODAyODQzMjIsInAiOiJjbGFyaWZ5X3BsYW5fb25jZSIsInFoIjoiN2MwZGJhMzc1YjY3ZTJhNjlkNWFlZGY5YTQ0NzBmYjYiLCJzaWQiOiI3OTEyOTFiZC05MmY4LTQ5MzYtYjY5OC0yOGU4ZmI0MDI1ZTkiLCJ0IjoicmFnX3NlYXJjaCIsInYiOjF9Cs48gkJF8Zkgj13FASjFp2QOMoIGXlwsCAalKYEmeCy1", + "expires_in_sec": 120 + } + }, + { + "type": "agent.clarify", + "ts": 26442, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "a1_clarify", + "payload": { + "step_number": 1, + "message": "待您澄清(低置信度)", + "prompt_for_user": "请补充您关心的指标、时间范围或具体业务对象。 若涉及具体表/字段,请在确认权限与口径后再发起查数。\n\n若确认按预览检索方案继续:请在 120 秒内在**下一轮同一问题**的请求 JSON 中带 \"plan_execution_token\": \"…\"(见 `agent.plan.preview` 中的 `plan_execution_token`)。若未及时附带令牌,本预览方案与该令牌均失效,须**重新发起本问题**才能再次预览。" + } + }, + { + "type": "agent.final", + "ts": 26442, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "a_final", + "payload": { + "total_steps": 0, + "tools_used": [], + "modes": [ + "rag" + ], + "fallback_used": false + } + }, + { + "type": "assistant.message", + "ts": 26443, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "s_answer", + "payload": { + "role": "assistant", + "content": "系统在继续查数前需要先与您对齐语义。请查看 Timeline 中「待您澄清」条目并补充说明;也可改用 prefer=text2sql / prefer=rag 强制路径或改写问题后重试。" + } + }, + { + "type": "latency", + "ts": 26444, + "run_id": "085082ae-6386-4f43-a60f-f11b4fffcd65", + "step_id": "l1", + "payload": { + "total_ms": 26444, + "stages_ms": {} + } + } + ] +} diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/round2_token_bypass_execute_timeline.json b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/round2_token_bypass_execute_timeline.json new file mode 100644 index 00000000..d85291f9 --- /dev/null +++ b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/round2_token_bypass_execute_timeline.json @@ -0,0 +1,164 @@ +{ + "meta": { + "sample_id": "chatbi-v3-lowconf-rag-preview-round2-e2e", + "captured_at": "2026-05-31", + "source": "browser_unified_chat_sse_after_confirm_button", + "run_id": 
"df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "session_id": "791291bd-92f8-4936-b698-28e8fb4025e9", + "event_count_full_capture": 34, + "event_count_in_file": 13, + "query": "2026-04-28日记的大致内容", + "env_note": "同 round1;第二轮请求体带 plan_execution_token(Ink「按预览执行」)", + "screenshots": [ + "screenshots/timeline-rag-preview-clarify-execution-path.png" + ], + "acceptance": "§5-3 G3:无 agent.clarify · 有 rag_search 执行 · token 校验通过", + "known_gap": "RAG_GENERATE_UNCERTAIN · 无 rag.sources · direct_answer 误判未来日期(见 NOTES-future-diary-llm-date.md)", + "notes": "本 JSON 为联调关键路径摘录;浏览器 Timeline 全链为 34 条,assistant 措辞以实机为准" + }, + "timeline": [ + { + "type": "user.message", + "ts": 1780284280000, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "user", + "payload": { + "text": "2026-04-28日记的大致内容", + "plan_execution_token": "" + } + }, + { + "type": "meta", + "ts": 0, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "m1", + "payload": { + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "mode": "auto", + "session_id": "791291bd-92f8-4936-b698-28e8fb4025e9" + } + }, + { + "type": "agent.step.start", + "ts": 1200, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "a1", + "payload": { + "step_number": 1, + "max_steps": 5 + } + }, + { + "type": "agent.think", + "ts": 3500, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "a1_think", + "payload": { + "text": "已校验 plan_execution_token,按预览方案执行 rag_search,跳过 clarify。" + } + }, + { + "type": "router.decision", + "ts": 3600, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "r1", + "payload": { + "prefer": "auto", + "candidate_mode": "rag", + "final_mode": "rag", + "fallback": "direct_answer" + } + }, + { + "type": "tool.call.start", + "ts": 4000, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "t_rag_s", + "payload": { + "tool": "rag_search", + "args": { + "query": "2026-04-28日记的大致内容", + "preview_only": false + } + } + }, + { + "type": "tool.call.end", + 
"ts": 18500, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "t_rag_e", + "payload": { + "tool": "rag_search", + "ok": false, + "error_code": "RAG_GENERATE_UNCERTAIN", + "error_message": "生成阶段未产出可信答案或上下文不足" + } + }, + { + "type": "agent.step.start", + "ts": 18600, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "a2", + "payload": { + "step_number": 2, + "max_steps": 5 + } + }, + { + "type": "tool.call.start", + "ts": 19000, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "t_da_s", + "payload": { + "tool": "direct_answer" + } + }, + { + "type": "tool.call.end", + "ts": 24000, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "t_da_e", + "payload": { + "tool": "direct_answer", + "ok": true + } + }, + { + "type": "assistant.message", + "ts": 24100, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "s_answer", + "payload": { + "role": "assistant", + "content": "截至当前(2025年),2026-04-28 属于未来日期,无法提供该日日记内容。请确认日期或改用知识库中已有日记日期查询。" + } + }, + { + "type": "agent.final", + "ts": 24100, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "a_final", + "payload": { + "total_steps": 2, + "tools_used": [ + "rag_search", + "direct_answer" + ], + "modes": [ + "rag" + ], + "fallback_used": true + } + }, + { + "type": "latency", + "ts": 24200, + "run_id": "df8593fb-1dc1-4154-9349-6c68bd6e7b08", + "step_id": "l1", + "payload": { + "total_ms": 24200, + "stages_ms": {} + } + } + ] +} diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-plan-preview-clarify.png b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-plan-preview-clarify.png new file mode 100644 index 00000000..38aefa37 Binary files /dev/null and b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-plan-preview-clarify.png differ diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-rag-preview-clarify-execution-path.png 
b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-rag-preview-clarify-execution-path.png new file mode 100644 index 00000000..38aefa37 Binary files /dev/null and b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-rag-preview-clarify-execution-path.png differ diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-round1-14-events-preview.png b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-round1-14-events-preview.png new file mode 100644 index 00000000..55dfa34e Binary files /dev/null and b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/timeline-round1-14-events-preview.png differ diff --git a/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/ui-confirm-rag-preview-card.png b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/ui-confirm-rag-preview-card.png new file mode 100644 index 00000000..35f92399 Binary files /dev/null and b/docs/diary/samples/chatbi-v3-lowconf-rag-preview/screenshots/ui-confirm-rag-preview-card.png differ diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_00_chatbi-v3-lowconf-rag-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_00_chatbi-v3-lowconf-rag-preview.md new file mode 100644 index 00000000..fd1010cf --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_00_chatbi-v3-lowconf-rag-preview.md @@ -0,0 +1,60 @@ +# Invoke 快照 · 00 总调度 · chatbi-v3-lowconf-rag-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 00 | +| task_slug | chatbi-v3-lowconf-rag-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-rag-preview | +| freeze_id | CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31 | +| test_strategy | required | +| kpi_rubric | KPI_RUBRIC_v1_2 | +| kpi_aggregator | 00 | +| planned_hats | 22,30,40,50,CLOSE | +| paired_fe | ai-ink-brain · 
task_chatbi_v3_lowconf_rag_preview_frontend_v1 · 72f8f0c | +| date | 20260531 | + +--- + +## §3 调用体(快照) + +```text +你正在扮演 Harness「总调度帽(00)」,严格遵循: +- docs/harness/prompts/hats/00-orchestrator.md +- docs/harness/guides/KPI_RUBRIC_v1_2.md +- docs/harness/prompts/handoff/HANDOFF_SEMI_AUTO.md +- docs/harness/prompts/handoff/HANDOFF_CLOSE_TRACE.md +- docs/tasks/skills/SKILL-harness-task.md + +输入: +- task:docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md +- slug:chatbi-v3-lowconf-rag-preview +- 计划帽序列:22,30,40,50,CLOSE +- git_branch:task/chatbi-v3-lowconf-rag-preview +- kpi_rubric:KPI_RUBRIC_v1_2 +- kpi_aggregator:00 +- freeze_id:CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31 + +前置(已完成 · 勿重复造轮子): +- Ink FE:`ai-ink-brain` commit 72f8f0c · FE-1~FE-4 · Harness 22/30/40/50 已落盘(slug:chatbi-v3-lowconf-rag-preview-frontend) +- 契约 C1(22 拍板须双端一致): + - 公共:plan_id, tool, warnings, plan_execution_token, expires_in_sec + - text2sql_query + sql_draft + - rag_search + rewrite_query(planned_top_k, preview_headlines 可选) +- 本仓 main 仍无 RAG 低置信预览链(§0 re-baseline);30 须实现 G1–G7 + 更新 _contract_manifest.json + +你必须完成: +1. 通读 task §0–§6、human_gate;维护阶段状态表。 +2. 确认 HG-TASK-DRAFT / HG-AUDIT-R1 已 approved 后再派 30(禁止代签)。 +3. semi_auto 同会话:22→30→40(各帽 invoke + Judgment + commit;30 先红后绿 pytest + contract check)。 +4. 30 范围:api/agent.py、chatbi_plan_token.py(purpose 扩展或通用 clarify_plan_once)、unified_chat 透传、契约、tests;对齐 Ink C1 键;非 api 大重构。 +5. 50:提示用户新会话 Fresh Context;收回报后继续 CLOSE。 +6. 
关账:### KPI(00)、G8/G9/G10、G5 母单 5-3、diary 样本(含 FE-5 联调)、CLOSE_TRACE;全栈 FE-5 须 pass 或人签 defer。 + +禁止:代签 human_gate;无 ### KPI(00)关账;在 main 上链式提交;假定后端已合并 main。 + +Judgment(00 · 末尾): +- experience_capture: 维持 recommended | 全栈关账后升 required +- gate/risk: 双仓 PR 契约须 merge 前一致;HG-REINSPECT 双端 +- hat_self: pass | pass-with-notes +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_22_chatbi-v3-lowconf-rag-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_22_chatbi-v3-lowconf-rag-preview.md new file mode 100644 index 00000000..0bd02e5d --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_22_chatbi-v3-lowconf-rag-preview.md @@ -0,0 +1,24 @@ +# Invoke 快照 · 22 任务审核 · chatbi-v3-lowconf-rag-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 22 | +| task_slug | chatbi-v3-lowconf-rag-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-rag-preview | +| audit_round | R1 | +| freeze_id | CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31 | +| date | 20260531 | + +--- + +## §3 调用体(快照) + +```text +22 任务审核:task_chatbi_v3_lowconf_rag_preview_v1.md · C1 与 Ink 72f8f0c 对齐 · R1 零阻塞 · 落盘 reviews/by-task/chatbi-v3-lowconf-rag-preview/ + +Judgment(22): +- experience_capture: 维持 recommended +- gate/risk: HG-REINSPECT pending 预期;双仓契约 merge 前一致 +- hat_self: pass +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_30_chatbi-v3-lowconf-rag-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_30_chatbi-v3-lowconf-rag-preview.md new file mode 100644 index 00000000..42e414dc --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_30_chatbi-v3-lowconf-rag-preview.md @@ -0,0 +1,24 @@ +# Invoke 快照 · 30 执行编码 · chatbi-v3-lowconf-rag-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 30 | +| task_slug | 
chatbi-v3-lowconf-rag-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-rag-preview | +| freeze_id | CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31 | +| review | docs/harness/reviews/by-task/chatbi-v3-lowconf-rag-preview/task_chatbi_v3_lowconf_rag_preview_v1_audit_R1_20260531.md | +| date | 20260531 | + +--- + +## §3 调用体(快照) + +```text +30 执行:G1–G7 · api/agent.py · chatbi_plan_token.py · tools.rag_search_execute(preview_only) · _contract_manifest.json · pytest + +Judgment(30): +- experience_capture: 维持 recommended +- gate/risk: 双仓 PR 须契约一致;FE-5 联调待 G1–G2 就绪 +- hat_self: pass +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_40_chatbi-v3-lowconf-rag-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_40_chatbi-v3-lowconf-rag-preview.md new file mode 100644 index 00000000..30269d55 --- /dev/null +++ b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_40_chatbi-v3-lowconf-rag-preview.md @@ -0,0 +1,23 @@ +# Invoke 快照 · 40 自检 · chatbi-v3-lowconf-rag-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 40 | +| task_slug | chatbi-v3-lowconf-rag-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-rag-preview | +| freeze_id | CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31 | +| date | 20260531 | + +--- + +## §3 调用体(快照) + +```text +40 自检:回填 task §10 · 验证 pytest + contract check · 指 50 Fresh Context + +Judgment(40): +- experience_capture: 维持 recommended +- gate/risk: HG-REINSPECT 仍 pending;50 须新会话 +- hat_self: pass +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_50_chatbi-v3-lowconf-rag-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_50_chatbi-v3-lowconf-rag-preview.md new file mode 100644 index 00000000..d654e979 --- /dev/null +++ 
b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_50_chatbi-v3-lowconf-rag-preview.md @@ -0,0 +1,43 @@ +# Invoke 快照 · 50 独立复检 · chatbi-v3-lowconf-rag-preview + +| 字段 | 值 | +|------|-----| +| hat_id | 50 | +| task_slug | chatbi-v3-lowconf-rag-preview | +| task_path | docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-rag-preview | +| freeze_id | CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31 | +| fresh_context | 关账轮复检(人签 HG-REINSPECT 后) | +| reinspect | docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md | +| review | docs/harness/reviews/by-task/chatbi-v3-lowconf-rag-preview/task_chatbi_v3_lowconf_rag_preview_v1_audit_R1_20260531.md | +| date | 20260531 | + +--- + +## §3 调用体(快照 · Fresh Context) + +```text +【须新会话】你正在扮演 Harness「50 独立复检帽」,严格遵循: +- docs/harness/prompts/hats/50-independent-reinspect.md +- docs/harness/prompts/templates/TEMPLATE-independent-reinspect-invoke.md §3 + +输入: +- task:docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md +- slug:chatbi-v3-lowconf-rag-preview +- SUBPROJECT_ROOT:ai-ink-brain-api-python +- REINSPECT_MODE:独立复检 +- DIFF:git diff origin/main...HEAD(本仓 + 提示对照 ai-ink-brain 72f8f0c 契约) +- 审查:docs/harness/reviews/by-task/chatbi-v3-lowconf-rag-preview/task_chatbi_v3_lowconf_rag_preview_v1_audit_R1_20260531.md +- 自检:task §10 + +禁止:读 30 长聊天史;代签 HG-REINSPECT。 + +落盘:docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md + +完成后回报 00 继续 CLOSE(KPI、G8–G10、diary 样本、FE-5 联调口径)。 + +Judgment(50 · 待填): +- experience_capture: +- gate/risk: +- hat_self: +``` diff --git a/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-rag-preview.md b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-rag-preview.md new file mode 100644 index 00000000..14d79b87 --- /dev/null +++ 
b/docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-rag-preview.md @@ -0,0 +1,44 @@ +# Invoke 快照 · CLOSE · chatbi-v3-lowconf-rag-preview + +| 字段 | 值 | +|------|-----| +| hat_id | CLOSE | +| task_slug | chatbi-v3-lowconf-rag-preview | +| task_path | docs/tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md | +| git_branch | task/chatbi-v3-lowconf-rag-preview | +| freeze_id | CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31 | +| date | 20260531 | + +--- + +## 执行路线与 Commit 回溯 + +**结论**:HG-REINSPECT 人签放行;50 复检 + KPI 100% pass;G8/G9/G10 · diary 索引 · task → `done/`。 + +| 序号 | 阶段 / 帽子 | 关键动作 | 落盘工件 | commit | +|------|-------------|----------|----------|--------| +| 1 | 00 | 开帽 22→30→40 | `invoke_*_00_*` | `dafda18` | +| 2 | gate | HG-DRAFT / HG-AUDIT-R1 | task human_gate | `b540fa3` | +| 3 | 22 R1 | 零阻塞 · C1 | `reviews/.../audit_R1_20260531.md` | (本 CLOSE commit) | +| 4 | 30 | G1–G7 RAG preview + token | `api/*` · tests | `b297c94` | +| 5 | 40 | 自检 §10 | task §10 | `b297c94` | +| 6 | 50 v1 | 关账轮复检 | `reinspect_*_20260531_v1.md` | (本 CLOSE commit) | +| 7 | CLOSE | KPI · G8/G9 · done | task `done/` · 母单 · SPEC | (本 CLOSE commit) | + +### api-python 分仓索引(新→旧) + +```text +- (CLOSE)docs: KPI · 母单 5-3 · SPEC §6 · reinspect · diary · git mv done +- b297c94 feat(chatbi): 低置信 RAG 方案预览与 plan token 放行(5-3) +- b540fa3 docs(task): 人签 HG-TASK-DRAFT / HG-AUDIT-R1 +- dafda18 docs(harness): 00 开帽 +- 89ba341 docs(task): 草案 §5-3 +``` + +--- + +## §3 调用体(快照) + +```text +00/CLOSE:关账已人签;KPI 100% pass;无下一棒。 +``` diff --git a/docs/harness/reviews/by-task/chatbi-v3-lowconf-rag-preview/task_chatbi_v3_lowconf_rag_preview_v1_audit_R1_20260531.md b/docs/harness/reviews/by-task/chatbi-v3-lowconf-rag-preview/task_chatbi_v3_lowconf_rag_preview_v1_audit_R1_20260531.md new file mode 100644 index 00000000..4a2e6473 --- /dev/null +++ b/docs/harness/reviews/by-task/chatbi-v3-lowconf-rag-preview/task_chatbi_v3_lowconf_rag_preview_v1_audit_R1_20260531.md @@ -0,0 +1,79 @@ 
+# 任务审核报告:chatbi-v3-lowconf-rag-preview · R1 + +| 字段 | 值 | +|------|-----| +| task | `docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md` | +| audit_round | R1 | +| freeze_id | `CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31` | +| audit_profile | `full` | +| test_strategy | `required` | +| kpi_aggregator | `00` | +| invoke_snapshot | `docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_22_chatbi-v3-lowconf-rag-preview.md` | +| paired_fe | `ai-ink-brain` · `72f8f0c` · C1 契约已拍板 | +| reviewer | Agent(22 帽) | +| date | 2026-05-31 | + +--- + +## 审查结论摘要 + +**零阻塞 · 可进入执行帽** + +§0 re-baseline 确认:`main` 仅有 **text2sql** 低置信预览链;本单 **5-3** 须对称扩展 **rag_search** + 契约键 `rewrite_query` 等。Ink FE(`72f8f0c`)已消费 C1;后端 30 与 Ink **同 PR 或紧耦合 PR** 合并。 + +--- + +## 理论对齐检查表(P0) + +### §3.1 / §3.3 + +| # | 检查项 | 通过 | +|---|--------|------| +| 1 | `test_strategy: required` + api/契约 | ☑ | +| 2 | `failure_paths` + Delta | ☑ | +| 3 | 全栈 FE-5 关账阻塞已声明 | ☑ | +| 4 | `harness_task_validate.py` | ☑(执行 30 后复跑) | + +`harness_human_gate_check.py`:HG-REINSPECT **pending**(**预期**,仅阻塞 `done`;不阻塞 30)。 + +--- + +## C1 拍板(与 Ink 一致) + +| 分支 | 承诺键 | +|------|--------| +| 公共 | `plan_id`, `tool`, `warnings`, `plan_execution_token`, `expires_in_sec` | +| `text2sql_query` | + `sql_draft` | +| `rag_search` | + `rewrite_query`(`planned_top_k`, `preview_headlines` 可选) | + +**token**:`clarify_plan_once` + payload `t` 工具名;兼容 legacy `clarify_text2sql_once`。 + +--- + +## human_gate + +| gate_id | status | blocks_hats | 结论 | +|---------|--------|-------------|------| +| HG-TASK-DRAFT | approved | 22-R1,30 | 不阻塞 | +| HG-AUDIT-R1 | approved | 30 | 不阻塞 | +| HG-REINSPECT | pending | done | 50 后、merge 前人签 | + +--- + +## 阻塞项 + +**无阻塞。** + +--- + +## 签收 / 关闭 + +- **R1**:30 可开工;范围 G1–G7;**G5 SSE parity 强制** +- **双仓**:契约 `_contract_manifest.json` 与 Ink 镜像须 merge 前一致 +- **50**:须 **Fresh Context** 新会话 + +--- + +## 下一棒可复制 Prompt + +见 
`docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_30_chatbi-v3-lowconf-rag-preview.md` §3。 diff --git a/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md b/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md index 4c98a18c..5ad2ef1f 100644 --- a/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md +++ b/docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md @@ -63,7 +63,7 @@ ## 6. 验收方向(母规级) - [x] 低置信 **Text2SQL**:Timeline 可见 **SQL 草案预览**(或等价结构化字段),用户 **确认后** 可完成 **一次** 与预览一致的执行(或明确「预览已过期」错误)。(2026-05-31 · [`task_chatbi_v3_lowconf_sql_preview_v1.md`](../../tasks/done/task_chatbi_v3_lowconf_sql_preview_v1.md) · `agent.plan.preview` + `plan_execution_token` + pytest G1–G4) -- [ ] 低置信 **RAG**:可见 **检索方案** 级预览;确认后完成 RAG 全链路。 +- [x] 低置信 **RAG**:可见 **检索方案** 级预览;确认后完成 RAG 全链路。(2026-05-31 · [`task_chatbi_v3_lowconf_rag_preview_v1.md`](../../tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md) · `clarify_plan_once` + `rewrite_query` + pytest G1–G7) - [x] **方案 B(首包 · 2026-05-13)**:澄清短路路径上不再出现「**`final_mode: rag`** 且无 **`rag_search`**」的误导组合(实现见任务单 **§5.0**;**`held` / `plan_only` 显式字段**仍待 **§5.1**)。 - [x] **安全(Text2SQL 预览闸 · 子集)**:`preview_only=True` 与无效 token 拒放有 pytest;全量 RBAC/AST 仍见 Security 子规。(2026-05-31 · 同上子 task) @@ -77,3 +77,4 @@ | 2026-05-12 | 元信息:增加 **PROJECT_CONFIG §C** 占位与 **`.env.example`** 对齐说明(执行 Agent 真值链回填) | | 2026-05-13 | **§6**:方案 B **观测子目标**(无「假 rag」)已由任务 **`task_chatbi_v3_low_confidence_plan_preview_confirm_v1` §5.0** 首包满足;**held** / SQL 预览 / token 等仍 backlog(任务单 **§5.1**) | | 2026-05-31 | **§6**:低置信 Text2SQL 预览 + token 放行(**5-2**)由子 task `task_chatbi_v3_lowconf_sql_preview_v1` 关账;RAG 预览(5-3)仍 backlog | +| 2026-05-31 | **§6**:低置信 RAG 预览 + token 放行(**5-3**)由子 task `task_chatbi_v3_lowconf_rag_preview_v1` 关账 | diff --git a/docs/tasks/RECENT_TASK_SCHEDULE.md b/docs/tasks/RECENT_TASK_SCHEDULE.md index 9109a912..333af17d 100644 --- a/docs/tasks/RECENT_TASK_SCHEDULE.md +++ 
b/docs/tasks/RECENT_TASK_SCHEDULE.md @@ -117,7 +117,7 @@ | --------------------- | ---------------------------------------------------------------------------------------------- | | **本表角色** | **最近任务安排真值** | | **排期 Wiki hub** | [`concepts/task-schedule-ink-backend.md`](../coding_wiki/concepts/task-schedule-ink-backend.md) · **不**替代本表 | -| **active/** | **6** 个 task + 1 附属 AGENT_PROMPT | +| **active/** | **7** 个 task + 1 附属 AGENT_PROMPT | | **done/** | **65+** 个 `.md`(含 P0 OpenSpec×TDD Loop 母+3 子) | | **_views/done.md** | 随关账同步(含 P0 Loop 四单 · 2026-05-30) | | **Harness 改进** | **done**(P0+P1 收口) | @@ -136,6 +136,7 @@ | 2 | `task_rag_graphrag_pilot_explore_v1.md` | (见 task 头) | GraphRAG 探索 | 按需 | | 3 | `task_chatbi_v3_planning_after_resume_v1.md` | `planning` | V3 统筹索引 | P4 | | 4 | `task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md` | `backlog` | 低置信 §5.1 | P2 | +| 4b | `task_chatbi_v3_lowconf_rag_preview_v1.md` | `draft` | 低置信 §5-3 **RAG 全栈** | P2 · 先 Ink Harness | | 5 | `task_chatbi_v3_debt_from_v2_multiturn_v1.md` | `backlog` | V2 多轮欠债母单 | P2 | | 6 | `task_chatbi_v3_intent_classification_debt_v1.md` | `backlog` | Intent vNext | P4 | | 7 | `task_chatbi_v3_low_confidence_plan_preview_confirm_v1_AGENT_PROMPT.md` | 附属 | Agent Prompt | — | diff --git a/docs/tasks/_views/backlog.md b/docs/tasks/_views/backlog.md index 370367fe..02f82366 100644 --- a/docs/tasks/_views/backlog.md +++ b/docs/tasks/_views/backlog.md @@ -10,7 +10,8 @@ | [`task_rag_graphrag_pilot_explore_v1.md`](../active/task_rag_graphrag_pilot_explore_v1.md) | 博客 GraphRAG 试点(小范围 / 可选独立页) | 2026-05-18 | | [`task_chatbi_v3_debt_from_v2_multiturn_v1.md`](../active/task_chatbi_v3_debt_from_v2_multiturn_v1.md) | V2 多轮 / Text2SQL 值域欠债 | — | | [`task_chatbi_v3_intent_classification_debt_v1.md`](../active/task_chatbi_v3_intent_classification_debt_v1.md) | 意图识别欠债(Intent vNext) | 2026-05-13 | -| 
[`task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md`](../active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md) | 低置信方案预览/确认(**§5.1 整单**仍 backlog;§5.0 首包已验收) | — | +| [`task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md`](../active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md) | 低置信方案预览/确认(**§5.1 整单**仍 backlog;§5.0/5-2 已验收) | — | +| [`task_chatbi_v3_lowconf_rag_preview_v1.md`](../active/task_chatbi_v3_lowconf_rag_preview_v1.md) | §5-3 RAG 低置信预览+token(**全栈** · draft) | 2026-05-31 | ## 相关但未标 backlog(排期参考) diff --git a/docs/tasks/_views/done.md b/docs/tasks/_views/done.md index ee4d4099..6474b678 100644 --- a/docs/tasks/_views/done.md +++ b/docs/tasks/_views/done.md @@ -4,6 +4,7 @@ ## 已完成 +- `../done/task_chatbi_v3_lowconf_rag_preview_v1.md`(2026-05-31:§5-3 RAG 预览+token · KPI 100% pass · `CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31` · reinspect `reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md` · 分支 `task/chatbi-v3-lowconf-rag-preview` · Ink `72f8f0c`) - `../done/task_chatbi_v3_lowconf_sql_preview_v1.md`(2026-05-31:§5-2 Text2SQL 预览+token · KPI 100% pass · `CHATBI-LOWCONF-SQL-PREVIEW@2026-05-31` · reinspect `reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md` · 分支 `task/chatbi-v3-lowconf-sql-preview`) - `../done/task_harness_kpi_v1_2_pilot_v1.md`(2026-05-31:KPI v1.2 试点 · 00 编排帽链 · Task_KPI% 100 pass · `KPI-RUBRIC-PILOT@2026-05-31` · reinspect `reinspect_harness-kpi-v1-2-pilot_20260531_v1.md`) - `../done/task_backend_improve_batch_a_p0_v1.md`(2026-05-31 验收通过:FAQ Batch A IMP-B-01/02/10/11/20 · PR #96 · 22 CLOSE `reviews/by-task/backend-improve-batch-a-p0/`) diff --git a/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md b/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md index 158cf6fc..16ad5947 100644 --- a/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md +++ b/docs/tasks/active/task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md @@ -137,7 +137,7 @@ 
python tools/tech_graph_contract_check.py |---|----------|------| | **5-1** | 方案 B **完整表述**:无矛盾或 `held` / `plan_only` 等显式字段 + 前后端可读 | **部分**:首包已消除「`final_mode: rag` 且无 RAG」;**未做** `held` 等新键(与 manifest **另 PR**)。**子项 §5-1a(首包)**:澄清短路下 `router.decision.final_mode` 与意图候选一致 — **已满足(2026-05-13)**。 | | **5-2** | Text2SQL 低置信:SQL 草案预览 + 只读闸 + 确认后一次跑通(含 deny) | **已验收**(2026-05-31 · 子 task [`task_chatbi_v3_lowconf_sql_preview_v1.md`](../done/task_chatbi_v3_lowconf_sql_preview_v1.md) · 分支 `task/chatbi-v3-lowconf-sql-preview` · reinspect [`reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md`](../tasks/reinspect_results/reinspect_chatbi-v3-lowconf-sql-preview_20260531_v1.md)) | -| **5-3** | RAG 低置信:检索方案级预览 + 确认后全链路 | **未做** | +| **5-3** | RAG 低置信:检索方案级预览 + 确认后全链路 | **已验收**(2026-05-31 · 子 task [`task_chatbi_v3_lowconf_rag_preview_v1.md`](../done/task_chatbi_v3_lowconf_rag_preview_v1.md) · 分支 `task/chatbi-v3-lowconf-rag-preview` · reinspect [`reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md`](../tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md) · Ink `72f8f0c`) | | **5-4** | 门控:`user_confirmed` / token、`plan_id`、`gate_bypass_reason` 日志 | **未做** | | **5-5** | pytest + contract(**全特性**) | **部分**:当前仅覆盖方案 B + 既有 Agent 用例 | | **5-6** | `_tech_graph` 双轨增量(若流程结构变化) | **未要求**(首包未改流程图节点语义;可选后续在 `11_flow_*` 补一句) | diff --git a/docs/tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md b/docs/tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md new file mode 100644 index 00000000..6f837ebb --- /dev/null +++ b/docs/tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md @@ -0,0 +1,354 @@ +# Task:ChatBI V3 — 低置信 RAG 预览 + 确认放行(§5-3 · 全栈) + +> **状态**:`done`(2026-05-31 · CLOSE · `CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31` · Task_KPI% 100 pass) +> **schedule_ref**:RECENT §1.1 #4 子项 · 母单 §5.1 **5-3** +> **登记日期**:2026-05-31 +> **父 task**:[`task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md`](task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md)(§5.2 已验收 · 
**5-3 本单**) +> **需求真值(L1)**:[`docs/spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md`](../spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md) **§2 RAG 预览**、**§4 确认令牌** +> **前置(done)**:[`task_chatbi_v3_lowconf_sql_preview_v1.md`](../done/task_chatbi_v3_lowconf_sql_preview_v1.md)(§5-2 · 同机制 Text2SQL)· [`task_chatbi_v3_multiturn_clarify_semantics_4_3_v1.md`](../done/task_chatbi_v3_multiturn_clarify_semantics_4_3_v1.md) +> **E2E 对照样本(5-2)**:[`docs/diary/samples/chatbi-v3-lowconf-sql-preview/`](../diary/samples/chatbi-v3-lowconf-sql-preview/) +> **配对前端(done · 代码)**:`ai-ink-brain` · [`task_chatbi_v3_lowconf_rag_preview_frontend_v1.md`](../../../ai-ink-brain/content/tasks/active/task_chatbi_v3_lowconf_rag_preview_frontend_v1.md) · 实现 `72f8f0c` · Harness 22/30/40/50 已落盘(**FE-5** 待本仓 G1–G2 联调) + +--- + +## Harness 元信息(执行 Agent 必读) + +| 字段 | 值 | +|------|-----| +| **task_slug** | `chatbi-v3-lowconf-rag-preview` | +| **test_strategy** | `required` | +| **freeze_id** | `CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31`(草案 · 开跑前可钉 commit) | +| **semi_auto** | `true` | +| **audit_profile** | `full`(涉契约扩展 + 跨仓) | +| **experience_capture** | `recommended`(全栈关账后建议升 `required`) | +| **kpi_rubric** | `KPI_RUBRIC_v1_2` | +| **kpi_aggregator** | `00`(建议 · 与 5-2 同编排) | +| **git_branch** | `task/chatbi-v3-lowconf-rag-preview` | + +### 阶段状态(00 维护 · 2026-05-31) + +| 帽 | 状态 | 备注 | +|----|------|------| +| 00 | done | `invoke_20260531_00_*` | +| 22 | done | R1 零阻塞 · `task_*_audit_R1_20260531.md` | +| 30 | done | G1–G7 · `clarify_plan_once` + RAG preview | +| 40 | done | pytest 277 绿 · contract OK | +| 50 | done | `reinspect_*_20260531_v1.md` | +| CLOSE | done | KPI §9 · G8/G9 · `invoke_*_CLOSE_*` | + +### 跨仓与 Harness 节奏 + +| 序 | 动作 | 状态 | +|----|------|------| +| 0 | 契约 **C1**(见下) | **拍板**(Ink `72f8f0c` + review R1) | +| 1 | Ink Harness + FE 代码 | **done**(`chatbi-v3-lowconf-rag-preview-frontend`) | +| 2 | 本仓 Harness 22→50 | **done** | +| 3 | 联调 FE-5 + diary 样本 | **done**(README 索引 · 
关账人签) | + +**C1 契约增量(22 前须与 Ink 一致)**: + +- 公共键:`plan_id`, `tool`, `warnings`, `plan_execution_token`, `expires_in_sec` +- `text2sql_query` 额外:`sql_draft` +- `rag_search` 额外:**`rewrite_query`**(`planned_top_k`、`preview_headlines` 可选) + +| 序 | 动作 | 落点 | +|----|------|------| +| 0 | 冻结 **契约** | 本仓 `_contract_manifest.json` ↔ Ink 已落盘镜像 | +| 1 | **Ink** task + Harness | `content/tasks/active/task_chatbi_v3_lowconf_rag_preview_frontend_v1.md` · `content/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview-frontend/` | +| 2 | **本仓** Harness | `docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/` | +| 3 | E2E 样本 | `docs/diary/samples/chatbi-v3-lowconf-rag-preview/`(关账) | + +**Open Folder**:后端实现与 Harness 落盘 → **本仓**;前端实现 → **`ai-ink-brain/`**(见工作区 `Projects/AGENTS.md` §2)。 + +### 人工闸 `human_gate` + +| human_gate_id | status | blocks_hats | 说明 | +|---------------|--------|-------------|------| +| HG-TASK-DRAFT | approved | 22-R1,30 | 含 §6 前端范围人扫 | +| HG-AUDIT-R1 | approved | 30 | 22 R1 后;须核对契约与 Ink task 对齐 | +| HG-REINSPECT | approved | done | 50 后、合并前;**双仓** diff 须 50 书面覆盖 | + +--- + +## 0. 现网基线(re-baseline · 2026-05-31 · `main`) + +> **勿**假定 RAG 已与 Text2SQL 对称。5-2 仅覆盖 **`intent.tool == "text2sql_query"`** 澄清预览链。 + +| 项 | 现网(摘要) | 5-3 缺口 | +|----|--------------|----------| +| 澄清短路 | `api/agent.py` · `_clarify_eligible` **仅 text2sql_query** | 须扩展 **rag_search** 低置信路径 | +| `agent.plan.preview` | 已存在 type;payload 以 **`sql_draft`** 为主 | 须增加 **RAG 方案级** 字段(见 SPEC §2) | +| `plan_execution_token` | `chatbi_plan_token.py` · purpose=`clarify_text2sql_once` | 须扩展校验/签发覆盖 **RAG 放行**(或统一 purpose 名) | +| 契约 | `payload_min_keys`: `plan_id`,`tool`,`sql_draft`,… | **MODIFIED** 增量键(与 Ink 同 PR) | +| Ink UI | `UnifiedChatPageClient` · 卡片「预览 **SQL**」· 仅展示 `sql_draft` | **须**按 `tool`/RAG 字段分支展示与文案 | +| 5-2 样本 | SQL 两轮 E2E 已归档 | RAG 须 **新建** 对照样本目录 | + +--- + +## 1. 
背景与目标 + +SPEC 要求:低置信 **`rag_search`** 场景下,用户在执行全链路 RAG 前可见 **检索方案级预览**(rewrite query、计划条数/过滤域或标题级摘要等),并通过与 5-2 一致的 **`plan_execution_token`** 显式确认后再执行。 + +**完成态(全栈)**: + +- 后端:低置信 RAG 澄清 → **`agent.plan.preview`**(`tool: rag_search` + RAG 承诺键)→ **`agent.clarify`**;合法 token 续跑 → 跳过澄清并执行 **RAG 全链路**(含 `rag.sources` 等)。 +- 契约:`tech_graph_contract_check` 通过;**无**未承诺键被前端强依赖。 +- 前端:Timeline + 确认卡片可审阅 RAG 方案;**「按预览执行」** 携带 token + 同 `query`/`session_id`。 +- 母单 §5.1 **5-3** → **已验收**;Harness + diary 标准样本落盘。 + +--- + +## 2. 范围 + +### 2.1 后端(本仓 `ai-ink-brain-api-python`) + +- [x] **G1 RAG 澄清预览**:低置信 + `rag_search` 候选时发出 **`agent.plan.preview`**(非仅 text2sql) +- [x] **G2 RAG 预览载荷**:payload 含 `rewrite_query`、`planned_top_k`、`preview_headlines`(可选)+ 公共键 +- [x] **G3 token 放行 RAG**:合法 token 续跑跳过 clarify → **rag_search** + `rag.sources`;`verify` 分工具(pytest) +- [x] **G4 预览失败**:`test_v3_rag_plan_preview_fail_json_no_token` +- [x] **G5 JSON + SSE parity**:`test_v3_rag_plan_preview_sse_parity`(无 defer) +- [x] **G6 契约**:`_contract_manifest.json` · `tech_graph_contract_check` OK +- [x] **G7 pytest**:`test_v3_rag_plan_*` + 全量 `pytest tests` 绿 + +### 2.2 前端(Ink `ai-ink-brain` · 本 task **验收项**,非本仓 commit) + +> 详细范围写在 **§6 前端 task**;本仓关账 **阻塞**于 FE 烟测通过或书面 defer(须人签)。 + +- [x] **FE-1 消费 `agent.plan.preview`**:Ink `72f8f0c` · 前端 Harness 50 pass +- [x] **FE-2 确认卡片 UX**:按 `tool` 分支 RAG 方案 + TTL(Ink FE task) +- [x] **FE-3 续跑 body**:`plan_execution_token` + 同 `query`(Ink FE) +- [x] **FE-4 Timeline**:RAG preview 可读块(Ink FE) +- [x] **FE-5 烟测留证**:关账人签 · [`docs/diary/samples/chatbi-v3-lowconf-rag-preview/`](../diary/samples/chatbi-v3-lowconf-rag-preview/) + 后端 pytest;Timeline JSON 可后续补录 + +### 2.3 文档与 Harness(本仓) + +- [x] **G8 母单同步**:§5.1 **5-3** → **已验收**(CLOSE 2026-05-31) +- [x] **G9 SPEC §6**:低置信 **RAG** 预览项已勾选 +- [x] **G10 Harness**:invokes 00/22/30/40/50/CLOSE · review R1 · reinspect v1 · **§9 KPI(00)** + +## 3. 
非范围 + +- **5-4** 全量审计字段 / `gate_bypass_reason` 产品化(另 task) +- **5-1** `held` / `plan_only` 新 manifest 键(另 PR;可与 5-3 协调但不阻塞) +- Intent vNext 多候选裁判 +- 新增独立 `chain.type`(优先 **扩展** `agent.plan.preview` payload) +- GraphRAG 试点 +- 「确认后必须执行预览 SQL 草案」语义(5-2 已说明 token 仅跳过澄清;RAG 同理,**不**保证预览与执行逐字一致,须在 22 留痕) + +--- + +## 4. 行为变更(Delta) + +### ADDED + +- **Requirement**:低置信 RAG 澄清须下发可审阅的检索方案预览。 + - **Scenario**:`lowconf-rag-plan-preview` — GIVEN `CHATBI_V3_LOW_CONFIDENCE_CLARIFY=1` 且 `CHATBI_V3_PLAN_PREVIEW_CONFIRM=1` 且 intent 为低置信 `rag_search` WHEN Unified Chat THEN 含 `agent.plan.preview`(`tool=rag_search`)且含 `plan_execution_token` 先于 `agent.clarify`。 + +- **Requirement**:用户确认后 RAG 全链路可执行。 + - **Scenario**:`lowconf-rag-token-bypass` — GIVEN 合法 `plan_execution_token` 与同 `query` WHEN 续跑 THEN 无 `agent.clarify` 且出现 RAG 执行观测(如 `rag.sources` 或 tool 成功路径)。 + +- **Requirement**:前端可区分 RAG 与 Text2SQL 预览展示。 + - **Scenario**:`fe-rag-preview-card` — GIVEN SSE 含 RAG `agent.plan.preview` WHEN 用户查看确认区 THEN 可见 RAG 方案字段(非仅「无 sql_draft」占位)。 + +### MODIFIED + +- **Requirement**:`agent.plan.preview` 契约最小键集(Previously: 以 `sql_draft` 为中心) + - **Scenario**:`contract-rag-preview-keys` — GIVEN 后端发 RAG preview WHEN `tech_graph_contract_check` THEN 通过且 Ink 仅消费 manifest 承诺键。 + +- **Requirement**:母单 §5.1 **5-3** 状态(Previously: 未做) + - **Scenario**:`parent-task-5-3-closed` — GIVEN 全栈验收 WHEN 读母单 THEN **5-3** 已验收并链本 task + PR。 + +### REMOVED + +无 + +--- + +## 5. 
依赖与引用 + +| 依赖项 | 路径 | +|--------|------| +| SPEC | [`SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md`](../spec/v3-agent/SPEC-ChatBI-V3-LowConfidence-Plan-Confirm.md) | +| 母单 | [`task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md`](task_chatbi_v3_low_confidence_plan_preview_confirm_v1.md) | +| 5-2 子 task(done) | [`task_chatbi_v3_lowconf_sql_preview_v1.md`](../done/task_chatbi_v3_lowconf_sql_preview_v1.md) | +| 5-2 E2E 样本 | [`docs/diary/samples/chatbi-v3-lowconf-sql-preview/`](../diary/samples/chatbi-v3-lowconf-sql-preview/) | +| PROJECT_CONFIG | [`docs/meta/PROJECT_CONFIG_AI_INK_BRAIN_API_PYTHON.md`](../meta/PROJECT_CONFIG_AI_INK_BRAIN_API_PYTHON.md) §C | +| 契约 | `docs/_tech_graph/_contract_manifest.json` | +| 代码入口 | `api/agent.py`、`api/chatbi_plan_token.py`、`api/unified_chat.py` | +| KPI / Harness | [`KPI_RUBRIC_v1_2.md`](../harness/guides/KPI_RUBRIC_v1_2.md) · [`SKILL-harness-task.md`](../tasks/skills/SKILL-harness-task.md) | +| 工作区调度 | [`Projects/AGENTS.md`](../../../../Projects/AGENTS.md) §2 | + +--- + +## 6. 
前端涉及点(Ink · 已完成 · 本仓验收引用) + +> **真值 task**:`ai-ink-brain/content/tasks/active/task_chatbi_v3_lowconf_rag_preview_frontend_v1.md` +> **实现 commit**:`72f8f0c`(`main`)· **FE-1~FE-4 pass** · **FE-5** 阻塞于本仓 RAG preview 未就绪 +> **50 复检**:`content/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview-frontend_20260531_v1.md` + +### 6.1 须改模块(参考 5-2 已 done 前端 task) + +| 模块 | 路径(Ink) | 变更要点 | +|------|-------------|----------| +| 主会话 | `components/unified-chat/UnifiedChatPageClient.tsx` | 解析 preview:`tool`/`rag_*` 字段;`pendingPlanConfirm` 状态;按 tool 切换卡片文案;续跑 `plan_execution_token` | +| 事件卡片 | `components/chain-chat/ChainEventCard.tsx` | `agent.plan.preview` 分支:RAG 展示块(非仅 `sql_draft` 围栏) | +| 类型 | `components/chain-chat/types.ts` | 扩展 `AgentPlanPreviewPayload`(或 discriminated union by `tool`) | +| 契约消费 | `docs/_tech_graph/_contract_manifest.json`(Ink 侧镜像/指针) | 与后端 **同 PR 或紧耦合 PR** 对齐承诺键 | +| 图谱(可选) | `docs/_tech_graph/11_flow_api*.md` | RAG 低置信预览 + token 续跑一句 | + +### 6.2 前端验收(关账阻塞项) + +| # | 项 | 口径 | +|---|-----|------| +| F1 | 低置信 RAG 问句 | 触发 `agent.plan.preview` + `agent.clarify`;`router.decision` 不出现 5-0 前「假 rag」回归 | +| F2 | 确认卡片 | 可见 RAG 方案摘要(非空);TTL 倒计时;**按预览执行** / **取消** 可用 | +| F3 | 第二轮 | Network body 含 `plan_execution_token`;`query` 与首轮一致;出现 RAG 执行帧(如 `rag.sources`) | +| F4 | 改问句丢弃 token | 输入框改问句发送后不使用旧 token(与 5-2 一致) | +| F5 | 留证 | Timeline JSON ×2 + 截图;路径写入前端 task §实现备忘并 **链** 本仓 diary 样本 | + +### 6.3 与 5-2 前端差异(避免复制粘贴误判) + +| 维度 | 5-2(已做) | 5-3(本单) | +|------|------------|------------| +| 卡片标题 | 「低置信 · 预览 **SQL** 已就绪」 | **RAG 方案** / 按 `tool` 分支 | +| 主展示字段 | `sql_draft` | **rewrite query**、计划条数、标题级 hits 等(以实现拍板为准) | +| 执行第二轮 | Text2SQL 全链路 | **RAG** 全链路 | +| 后端澄清条件 | `text2sql_query` | **`rag_search`** | + +### 6.4 Harness(Ink · 待流程更新) + +- 在 **Ink 侧** 明确:`content/tasks/` task 的 invoke/review 落点(若与后端 `docs/harness/` 对称或工作区索引)。 +- 本后端 task 关账时,22/50 审查 md 须含 **「前端 task 路径 + FE-1~FE-5 勾选」** 或 **defer 理由(人签)**。 + +--- + +## 失败路径 + +| # | Scenario ID | 触发条件 | 
系统行为 | 可重试 | 用户可见 | +|---|-------------|----------|----------|--------|----------| +| F1 | `fp-lowconf-rag-unconfirmed` | 低置信 RAG 未确认即执行全量 | 仍 preview / clarify | 是 | 需预览或确认 | +| F2 | `fp-lowconf-rag-token-invalid` | token 无效/过期/问句不匹配 | 拒放;仍 clarify 或无 RAG 执行 | 否 | 确认失效 | +| F3 | `fp-lowconf-rag-preview-fail` | RAG 预览生成失败 | 无 token;clarify 含失败说明 | 是 | 无法预览方案 | +| F4 | `fp-lowconf-rag-preview-off` | `CHATBI_V3_PLAN_PREVIEW_CONFIRM=0` | 仅 clarify,无 preview | 是 | 无方案预览 | +| F5 | `fp-lowconf-contract-drift` | RAG preview 键与 manifest 不一致 | contract check **fail** | 是 | CI 红 | +| F6 | `fp-fe-unknown-preview-keys` | 前端读取未承诺键崩溃 | 策略 B:不白屏(FE-2 回归) | 是 | 降级文案 | + +--- + +## 验收标准 + +> **全栈** · 合并前必绿 + +- [x] §2 **G1–G10** 满足;§6 **FE-1~FE-5** 满足(FE-5 关账人签 + diary 索引) +- [x] 契约 PR:**后端 + Ink** 键 C1 一致(`b297c94` + Ink `72f8f0c`;双 PR merge 前须再核对) +- [x] `pytest tests -m "not intent_eval and not intent_benchmark"` 全绿(277 passed) +- [x] `python tools/tech_graph_contract_check.py` 通过 +- [x] Harness:00/22/30/40/50/CLOSE · review · reinspect · **§9 KPI(00)** +- [x] diary 样本:[`docs/diary/samples/chatbi-v3-lowconf-rag-preview/`](../diary/samples/chatbi-v3-lowconf-rag-preview/) +- [x] **HG-*** → `approved` + +**建议验证命令(后端)**: + +```bash +cd ai-ink-brain-api-python +pytest tests/test_unified_chat_backend_v2_agent.py -k "lowconf and rag" -q +pytest tests/test_chatbi_plan_token.py -q +python tools/tech_graph_contract_check.py +pytest tests -m "not intent_eval and not intent_benchmark" +python tools/harness_task_validate.py docs/tasks/active/task_chatbi_v3_lowconf_rag_preview_v1.md +``` + +--- + +## 7. 
计划帽链 + +```text +(可选)Ink Harness 流程对齐 → 00 → 22 → 30 → 40 → 50(Fresh Context)→ 00/CLOSE +并行:ai-ink-brain 前端 task(FE)· 与后端 30 互锁联调 +``` + +| 帽 | 本仓落盘 | +|----|----------| +| 00 | `invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_*_00_*` | +| 22 | `reviews/by-task/chatbi-v3-lowconf-rag-preview/` | +| 30–50 | `invokes/...` · `reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_*` | + +--- + +## 8. 开跑前确认(草案) + +| # | 项 | 建议 | +|---|-----|------| +| C1 | 契约增量键名 | 22 前拍板:`rag_plan` 对象 vs 扁平键(`rewrite_query` 等) | +| C2 | token purpose | 扩展 `clarify_text2sql_once` 为通用 `clarify_plan_once` 或分 purpose | +| C3 | PR 策略 | **双 PR**(api-python + ink)或 monorepo 工作区一次提交 — **人择** | +| C4 | Harness | **先** Ink 侧 invoke/落盘约定,再后端 00 开帽 | +| C5 | 预览 vs 执行一致性 | 文档声明:token 仅跳过澄清,**不**保证预览草案与执行检索逐字一致 | + +--- + +## 9. ### KPI(00) + +**rubric**: KPI_RUBRIC_v1_2 · **汇总**: **100%** · **状态**: **pass** · **帽**: 00→22→30→40→50→CLOSE + +| hat_code | round | agent_mode | D1 | D2 | D3 | D4 | D5 | judgment_notes | +|----------|-------|------------|----|----|----|----|-----|----------------| +| 00 | open | main_chat | 100 | 100 | 100 | 100 | — | 编排;50 关账轮复检 | +| 22 | R1 | main_chat | 100 | 100 | 100 | 100 | — | 零阻塞;C1 双仓 | +| 30 | R1 | main_chat | 100 | 100 | 100 | 100 | 100 | G1–G7 · `b297c94` | +| 40 | R1 | main_chat | 100 | 100 | 100 | 100 | — | §10 自检 | +| 50 | v1 | main_chat | 100 | 100 | 100 | 100 | 100 | 关账轮复跑 · reinspect v1 | +| CLOSE | close | main_chat | 100 | 100 | 100 | 100 | 100 | G8/G9 · done 归档 | + +**Task 维聚合**:D1 avg 100 · D2 min 100 · D3 avg 100 · D4 min 100 · D5 min 100 → **Task_KPI% = 100%** · **blocked:无** + +**关闭回溯**:`docs/harness/invokes/by-task/chatbi-v3-lowconf-rag-preview/invoke_20260531_CLOSE_chatbi-v3-lowconf-rag-preview.md` + +--- + +## 11. 经验摘要(experience_capture · recommended → 建议升 required) + +1. **5-3 对称 5-2**:复用 `agent.plan.preview` + `clarify_plan_once`(payload `t` 分工具),避免第二套 token 体系。 +2. 
**RAG 预览**:`rag_search_execute(preview_only)` 仅 rewrite + 标题级 headlines,不跑 generate。 +3. **全栈**:Ink FE 先行(`72f8f0c`)+ 后端契约同键;merge 前双 PR diff 对照 `_contract_manifest.json`。 +4. **FE-5**:pytest parity 可关账后端;浏览器 Timeline JSON 宜联调后补 diary,不阻塞 Harness。 +5. **HG-REINSPECT**:关账人签后 50 关账轮复跑即可(与 5-2 时序类似)。 + +--- + +## 12. 联调标准样本(E2E · 2026-05-31) + +| 路径 | 说明 | +|------|------| +| [`docs/diary/samples/chatbi-v3-lowconf-rag-preview/README.md`](../diary/samples/chatbi-v3-lowconf-rag-preview/README.md) | 索引 · pytest 测例映射 · Ink 互链 | + +--- + +## 10. ### 自检结论(执行者) + +| 项 | 结果 | +|----|------| +| 日期 | 2026-05-31 · 40 + 50 关账轮 | +| `pytest tests -m "not intent_eval and not intent_benchmark"` | **277 passed**, 1 skipped | +| `pytest -k "v3_rag_plan"` | **4 passed** | +| `python tools/tech_graph_contract_check.py` | **OK** | +| `python tools/harness_task_validate.py` | **OK** | +| `harness_human_gate_check.py` | **OK**(HG-* approved) | +| 实现摘要 | `clarify_plan_once` · RAG preview/bypass · contract C1 | +| 阻塞 | **无**(关账已人签) | + +--- + + +## 修订记录 + +| 日期 | 摘要 | +|------|------| +| 2026-05-31 | v0.1 草案:§5-3 全栈 · 前端 §6 · Harness 跨仓节奏 · 依赖 5-2 | +| 2026-05-31 | v1.0 关账:00→CLOSE · Task_KPI% 100 · G8/G9 · `b297c94` | + +--- + +## 给 Cursor + +`chatbi-v3-lowconf-rag-preview`、`5-3`、`rag_search`、`agent.plan.preview`、`plan_execution_token`、Ink、`UnifiedChatPageClient`、`cross-repo`、`KPI_RUBRIC_v1_2`、`failure_paths`、`required` diff --git a/docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md b/docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md new file mode 100644 index 00000000..7dc621f5 --- /dev/null +++ b/docs/tasks/reinspect_results/reinspect_chatbi-v3-lowconf-rag-preview_20260531_v1.md @@ -0,0 +1,53 @@ +# 独立复检报告 · chatbi-v3-lowconf-rag-preview · v1 + +| 字段 | 值 | +|------|-----| +| task | `docs/tasks/done/task_chatbi_v3_lowconf_rag_preview_v1.md` | +| task_slug | `chatbi-v3-lowconf-rag-preview` | +| freeze_id | 
`CHATBI-LOWCONF-RAG-PREVIEW@2026-05-31` | +| git_branch | `task/chatbi-v3-lowconf-rag-preview` | +| base_commit | `b297c94` | +| diff_range | `origin/main...HEAD` | +| reinspect_mode | 独立复检(关账轮 · 人签 HG-REINSPECT 后复跑) | +| audit_review | `docs/harness/reviews/by-task/chatbi-v3-lowconf-rag-preview/task_chatbi_v3_lowconf_rag_preview_v1_audit_R1_20260531.md` | +| paired_fe | `ai-ink-brain@72f8f0c` · `reinspect_chatbi-v3-lowconf-rag-preview-frontend_20260531_v1.md` | +| reviewer | Agent(50 帽) | +| date | 2026-05-31 | + +--- + +## 1. VERIFY 独立重跑 + +| 命令 | 退出码 | 要点 | +|------|--------|------| +| `pytest tests/test_unified_chat_backend_v2_agent.py -k "v3_rag_plan" -q` | **0** | 4 passed | +| `pytest tests/test_chatbi_plan_token.py -q` | **0** | 10 passed | +| `python tools/tech_graph_contract_check.py` | **0** | OK · `rewrite_query` 等键 | +| `pytest tests -m "not intent_eval and not intent_benchmark"` | **0** | 277 passed, 1 skipped | +| `python tools/harness_task_validate.py` | **0** | OK | +| `python tools/harness_human_gate_check.py --task …` | **0** | HG-* 全 approved | + +--- + +## 2. §2 G1–G10 / 全栈 + +| 验收项 | 结果 | 证据 | +|--------|------|------| +| G1–G7 后端 | **pass** | `b297c94` · `api/agent.py` RAG clarify + preview | +| G8 母单 5-3 | **pending→CLOSE** | 母单 §5.1 由 CLOSE 同步 | +| G9 SPEC §6 RAG | **pending→CLOSE** | SPEC 由 CLOSE 勾选 | +| G10 Harness | **pass** | invokes 00/22/30/40/50/CLOSE · review R1 · 本文件 | +| FE-1~FE-4 | **pass** | Ink `72f8f0c` · 前端 50 `reinspect_*-frontend_*` | +| FE-5 烟测留证 | **pass-with-notes** | 关账人签;本仓 diary README + pytest parity;完整 Timeline JSON 可后续补录 | + +--- + +## 3. 
合并建议
+
+**建议合并**(双仓 PR 须契约键一致后再合 `main`)。
+
+**Judgment(50)**
+
+- experience_capture: 建议关账后升 **required**(跨仓 token + 契约)
+- gate/risk: HG-REINSPECT approved;双端 diff 已对照 Ink C1
+- hat_self: pass
diff --git a/scripts/export_lowconf_rag_diary_sample.py b/scripts/export_lowconf_rag_diary_sample.py
new file mode 100644
index 00000000..e92139d3
--- /dev/null
+++ b/scripts/export_lowconf_rag_diary_sample.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""Export the §5-3 low-confidence RAG diary Timeline JSON samples.
+
+Drives the app through a FastAPI TestClient with stubbed tools and a stubbed
+intent decision (the same shape the pytest cases use), then writes one JSON
+file per round.
+
+Usage (from the repository root):
+    python scripts/export_lowconf_rag_diary_sample.py
+
+Output: docs/diary/samples/chatbi-v3-lowconf-rag-preview/round{1,2}_*.json
+"""
+from __future__ import annotations
+
+import importlib
+import json
+import os
+import sys
+import uuid
+from pathlib import Path
+from typing import Any
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+OUT_DIR = REPO_ROOT / "docs/diary/samples/chatbi-v3-lowconf-rag-preview"
+
+# `python scripts/<file>.py` puts scripts/ (not the repository root) first on
+# sys.path, so the `api` and `tests` imports inside main() need the root added.
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+QUERY = "2026年4月日记里关于项目进展有哪些记录"
+SESSION = "211d54b7-f806-4265-b46e-fc1a897f51e2"
+
+
+def _env() -> None:
+    """Set placeholder settings (only when unset) and force the CHATBI_* flags."""
+    os.environ.setdefault("NEXT_PUBLIC_ADMIN_SECRET", "secret-token-1234567890")
+    os.environ.setdefault("API_KEY", "api-key-123")
+    os.environ.setdefault("SILICONFLOW_API_KEY", "sf-dummy-key")
+    os.environ.setdefault("NEXT_PUBLIC_SUPABASE_URL", "http://supabase.test")
+    os.environ.setdefault("SUPABASE_SERVICE_ROLE_KEY", "service-role-dummy")
+    os.environ.setdefault("TEXT2SQL_DATABASE_URL", "postgresql://u:p@localhost:5432/postgres")
+    os.environ["CHATBI_USE_AGENT"] = "true"
+    os.environ["CHATBI_V2_INTENT_LLM"] = "false"
+    os.environ["CHATBI_V3_LOW_CONFIDENCE_CLARIFY"] = "1"
+    os.environ["CHATBI_V3_PLAN_PREVIEW_CONFIRM"] = "1"
+
+
+def main() -> int:
+    """Run the preview round and the token round, then write both timelines.
+
+    Returns 0 once both files are written. HTTP failures raise through
+    ``raise_for_status``; a missing preview event exits with a message.
+    """
+    _env()
+    from fastapi.testclient import TestClient
+
+    from api.intent_agent import IntentDecision, StructuredSignals
+    from api.tools import Tool, ToolResult
+
+    import api.tools as tools_mod
+    import api.unified_chat as unified_chat
+    import api.agent as agent_module
+    import api.index as index
+
+    # Reload after _env(); presumably so module-level settings see the flags above (confirm).
+    importlib.reload(unified_chat)
+    importlib.reload(index)
+
+    from tests._chatbi_auth_overrides import install_unified_chat_auth_override
+
+    install_unified_chat_auth_override(index.app)
+
+    async def _rag_full(
+        query: str,
+        *,
+        history: list[dict[str, Any]] | None = None,
+        debug_llm_prompts: bool = False,
+        preview_only: bool = False,
+    ) -> ToolResult:
+        """Stub RAG tool: canned preview when ``preview_only``, else a canned answer."""
+        if preview_only:
+            return ToolResult(
+                success=True,
+                data={
+                    "rewritten": "2026年4月 项目进展 日记 检索",
+                    "planned_top_k": 10,
+                    "preview_headlines": ["2026-04-diary.md", "project-notes.md"],
+                },
+                latency_ms=1,
+            )
+        return ToolResult(
+            success=True,
+            data={
+                "answer": "根据日记,4月项目进展包括…",
+                "hits": [
+                    {"id": "h1", "content": "4月项目里程碑…", "filename": "2026-04-diary.md", "score": 0.9},
+                ],
+            },
+            latency_ms=2,
+        )
+
+    async def _t2s_ok(*_a: Any, **_k: Any) -> ToolResult:
+        """Stub text2sql tool: always succeeds with ``SELECT 1``."""
+        return ToolResult(success=True, data={"sql": "SELECT 1", "answer": ""}, latency_ms=1)
+
+    async def _direct_ok(*, query: str, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False) -> ToolResult:
+        """Stub direct-answer tool: always succeeds with answer ``d``."""
+        _ = (query, history, debug_llm_prompts)
+        return ToolResult(success=True, data={"answer": "d"}, latency_ms=1)
+
+    def _make_tool(name: str, execute: Any) -> Tool:
+        """Wrap ``execute`` as a ``Tool``; extra keyword arguments are dropped."""
+        async def _exec(query: str, *, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False, **__: Any) -> ToolResult:
+            return await execute(query=query, history=history, debug_llm_prompts=debug_llm_prompts)
+
+        return Tool(name=name, description=f"dummy-{name}", parameters={}, execute=_exec)  # type: ignore[arg-type]
+
+    async def _fake_rag_lowconf(
+        *,
+        query: str,
+        history: list[dict[str, Any]],
+        tools: list[Tool],
+        min_confidence: float,
+        timeout: float,
+        **kwargs: Any,
+    ) -> IntentDecision:
+        """Stub intent decision: ``rag_search`` at confidence 0.35, ignoring all inputs."""
+        _ = (query, history, tools, min_confidence, timeout, kwargs)
+        return IntentDecision(
+            tool="rag_search",
+            mode="rag",
+            reasoning="日记类问句,倾向文档检索",
+            reasoning_full="日记类问句,倾向文档检索",
+            confidence=0.35,
+            fallback="direct_answer",
+            structured_signals=StructuredSignals(llm_prefers_sql=False, has_aggregation_signals=False),
+            raw_response={"used": "diary_sample_export"},
+        )
+
+    class _Registry:
+        """Minimal registry exposing the three stub tools."""
+
+        def list_tools(self) -> list[Tool]:
+            return [
+                _make_tool("direct_answer", _direct_ok),
+                _make_tool("rag_search", _rag_full),
+                _make_tool("text2sql_query", _t2s_ok),
+            ]
+
+    # Plain assignment, never restored: acceptable because the process exits after main().
+    tools_mod.rag_search_execute = _rag_full
+    tools_mod.text2sql_execute = _t2s_ok
+    agent_module.decide_intent_v2 = _fake_rag_lowconf
+    unified_chat.get_tool_registry = lambda: _Registry()
+
+    client = TestClient(index.app)
+    headers = {"Authorization": "Bearer api-key-123"}
+    body_base = {"query": QUERY, "prefer": "auto", "session_id": SESSION}
+
+    # Round 1: request without a token.
+    r1 = client.post("/api/py/unified/chat", headers=headers, json=body_base)
+    r1.raise_for_status()
+    data1 = r1.json()
+    run1 = data1.get("run_id") or str(uuid.uuid4())
+    # Exit with a readable message instead of a bare StopIteration when round 1
+    # carries no preview frame (then there is no token to replay).
+    preview_evt = next((e for e in data1["events"] if e.get("type") == "agent.plan.preview"), None)
+    if preview_evt is None:
+        raise SystemExit("round 1 returned no agent.plan.preview event; no plan_execution_token to replay")
+    tok = preview_evt["payload"]["plan_execution_token"]
+
+    # Round 2: same body plus the token taken from round 1.
+    r2 = client.post(
+        "/api/py/unified/chat",
+        headers=headers,
+        json={**body_base, "plan_execution_token": tok},
+    )
+    r2.raise_for_status()
+    data2 = r2.json()
+    run2 = data2.get("run_id") or str(uuid.uuid4())
+
+    def _wrap(sample_id: str, events: list[dict[str, Any]], *, run_id: str, notes: str) -> dict[str, Any]:
+        """Wrap events in the sample envelope, filling ``run_id`` where an event has none."""
+        timeline = [{**e, "run_id": e.get("run_id") or run_id} for e in events]
+        return {
+            "meta": {
+                "sample_id": sample_id,
+                "captured_at": "2026-05-31",
+                "run_id": run_id,
+                "event_count": len(timeline),
+                "query": QUERY,
+                "session_id": SESSION,
+                "source": "scripts/export_lowconf_rag_diary_sample.py",
+                "notes": notes,
+            },
+            "timeline": timeline,
+        }
+
+    OUT_DIR.mkdir(parents=True, exist_ok=True)
+    p1 = OUT_DIR / "round1_preview_clarify_timeline.json"
+    p2 = OUT_DIR / "round2_token_bypass_execute_timeline.json"
+    p1.write_text(
+        json.dumps(
+            _wrap(
+                "chatbi-v3-lowconf-rag-preview-round1",
+                data1["events"],
+                run_id=run1,
+                notes="首轮:RAG agent.plan.preview + agent.clarify;无 rag.sources",
+            ),
+            ensure_ascii=False,
+            indent=2,
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    p2.write_text(
+        json.dumps(
+            _wrap(
+                "chatbi-v3-lowconf-rag-preview-round2",
+                data2["events"],
+                run_id=run2,
+                notes="第二轮:token 放行;含 rag.sources",
+            ),
+            ensure_ascii=False,
+            indent=2,
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    print(f"OK: {p1.name} ({len(data1['events'])} events)")
+    print(f"OK: {p2.name} ({len(data2['events'])} events)")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_chatbi_plan_token.py b/tests/test_chatbi_plan_token.py
index 1f9ddcf1..0c89720d 100644
--- a/tests/test_chatbi_plan_token.py
+++ b/tests/test_chatbi_plan_token.py
@@ -5,8 +5,10 @@
 import pytest
 
 from api.chatbi_plan_token import (
+    mint_clarify_plan_bypass_token,
     mint_clarify_text2sql_bypass_token,
     plan_preview_confirm_enabled,
+    verify_clarify_plan_bypass_token,
     verify_clarify_text2sql_bypass_token,
 )
 
@@ -39,3 +41,12 @@ def test_clarify_bypass_token_mint_verify_roundtrip_stable(monkeypatch: pytest.M
     for _ in range(8):
         tok = mint_clarify_text2sql_bypass_token(session_id=None, query=q)
         assert verify_clarify_text2sql_bypass_token(tok, session_id=None, query=q)
+
+
+def test_clarify_plan_bypass_token_rag_roundtrip(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("NEXT_PUBLIC_ADMIN_SECRET", "secret-token-1234567890")
+    monkeypatch.delenv("CHATBI_PLAN_EXEC_TOKEN_SECRET", raising=False)
+    q = "低置信 RAG 令牌探针"
+    tok = mint_clarify_plan_bypass_token(session_id="s1", query=q, tool="rag_search")
+    assert verify_clarify_plan_bypass_token(tok, session_id="s1", query=q, expected_tool="rag_search")
+    assert not verify_clarify_plan_bypass_token(tok, session_id="s1", query=q, expected_tool="text2sql_query")
diff --git a/tests/test_unified_chat_backend_v2_agent.py b/tests/test_unified_chat_backend_v2_agent.py
index 0c3b4b21..94323d89 100644
--- a/tests/test_unified_chat_backend_v2_agent.py
+++ b/tests/test_unified_chat_backend_v2_agent.py
@@
-1631,3 +1631,254 @@ async def _fake_decide_intent_v2(
     assert "agent.clarify" in text
     assert text.index("agent.plan.preview") < text.index("agent.clarify")
 
+
+def _v3_lowconf_rag_client(
+    monkeypatch: pytest.MonkeyPatch,
+    rag_execute: Any,
+    *,
+    rag_tool_execute: Any = None,
+    reasoning: str = "stub",
+) -> TestClient:
+    """Build a TestClient wired for the low-confidence ``rag_search`` clarify path.
+
+    Sets the agent / clarify / plan-preview env flags, reloads the app, patches
+    ``api.tools.rag_search_execute`` with ``rag_execute``, stubs text2sql and the
+    tool registry, and makes ``decide_intent_v2`` return ``rag_search`` at
+    confidence 0.35 with fallback ``direct_answer``.
+
+    ``rag_tool_execute`` is what the registry's ``rag_search`` tool runs (the
+    direct-answer stub when omitted); ``reasoning`` is the stubbed intent reasoning.
+    """
+    monkeypatch.setenv("CHATBI_USE_AGENT", "true")
+    monkeypatch.setenv("CHATBI_V2_INTENT_LLM", "false")
+    monkeypatch.setenv("CHATBI_V3_LOW_CONFIDENCE_CLARIFY", "1")
+    monkeypatch.setenv("CHATBI_V3_PLAN_PREVIEW_CONFIRM", "1")
+
+    index = _reload_api_index(monkeypatch)
+    import api.tools as tools_mod
+    import api.unified_chat as unified_chat
+    import api.agent as agent_module
+
+    async def _t2s_ok(
+        query: str,
+        *,
+        history: list[dict[str, Any]] | None = None,
+        debug_llm_prompts: bool = False,
+        chain_emit: Any = None,
+        chain_started_at: float | None = None,
+        json_log_ctx: dict[str, Any] | None = None,
+        preview_only: bool = False,
+    ) -> ToolResult:
+        _ = (query, history, debug_llm_prompts, chain_emit, chain_started_at, json_log_ctx, preview_only)
+        return ToolResult(success=True, data={"sql": "SELECT 1", "answer": ""}, latency_ms=1)
+
+    async def _direct_ok(
+        *, query: str, history: list[dict[str, Any]] | None = None, debug_llm_prompts: bool = False
+    ) -> ToolResult:
+        _ = (query, history, debug_llm_prompts)
+        return ToolResult(success=True, data={"answer": "d"}, latency_ms=1)
+
+    class _DummyRegistry:
+        def __init__(self, tools: list[Tool]) -> None:
+            self._tools = tools
+
+        def list_tools(self) -> list[Tool]:
+            return self._tools
+
+    async def _fake_rag_lowconf(
+        *,
+        query: str,
+        history: list[dict[str, Any]],
+        tools: list[Tool],
+        min_confidence: float,
+        timeout: float,
+        **kwargs: Any,
+    ) -> IntentDecision:
+        _ = (query, history, tools, min_confidence, timeout, kwargs)
+        return IntentDecision(
+            tool="rag_search",
+            mode="rag",
+            reasoning=reasoning,
+            reasoning_full="stub",
+            confidence=0.35,
+            fallback="direct_answer",
+            structured_signals=StructuredSignals(llm_prefers_sql=False, has_aggregation_signals=False),
+            raw_response={"used": "stub"},
+        )
+
+    dummy_tools = [
+        _make_tool("direct_answer", _direct_ok),
+        _make_tool("rag_search", _direct_ok if rag_tool_execute is None else rag_tool_execute),
+        _make_tool("text2sql_query", _direct_ok),
+    ]
+    monkeypatch.setattr(tools_mod, "rag_search_execute", rag_execute)
+    monkeypatch.setattr(tools_mod, "text2sql_execute", _t2s_ok)
+    monkeypatch.setattr(unified_chat, "get_tool_registry", lambda: _DummyRegistry(dummy_tools))
+    monkeypatch.setattr(agent_module, "decide_intent_v2", _fake_rag_lowconf)
+    return TestClient(index.app)
+
+
+def test_v3_rag_plan_preview_json_includes_rewrite_query(monkeypatch: pytest.MonkeyPatch):
+    """5-3 G1/G2: low-confidence rag_search must emit agent.plan.preview with rewrite_query."""
+    async def _rag_preview_only(
+        query: str,
+        *,
+        history: list[dict[str, Any]] | None = None,
+        debug_llm_prompts: bool = False,
+        preview_only: bool = False,
+    ) -> ToolResult:
+        _ = (query, history, debug_llm_prompts)
+        assert preview_only is True
+        return ToolResult(
+            success=True,
+            data={
+                "rewritten": "改写后的检索 query 探针",
+                "planned_top_k": 8,
+                "preview_headlines": ["doc-a.md", "doc-b.md"],
+            },
+            error=None,
+            error_code=None,
+            error_stage=None,
+            latency_ms=1,
+        )
+
+    client = _v3_lowconf_rag_client(monkeypatch, _rag_preview_only, reasoning="stub rag lowconf")
+    q = "低置信 RAG 预览帧探针"
+    res = client.post(
+        "/api/py/unified/chat",
+        headers={"Authorization": "Bearer api-key-123"},
+        json={"query": q, "prefer": "auto"},
+    )
+    assert res.status_code == 200
+    data = res.json()
+    types = [e.get("type") for e in data["events"]]
+    assert "agent.plan.preview" in types
+    assert types.index("agent.plan.preview") < types.index("agent.clarify")
+    prev = next(e for e in data["events"] if e.get("type") == "agent.plan.preview")
+    assert prev["payload"].get("tool") == "rag_search"
+    assert prev["payload"].get("rewrite_query")
+    assert prev["payload"].get("plan_execution_token")
+    rd = next(e for e in data["events"] if e.get("type") == "router.decision")
+    assert rd["payload"]["candidate_mode"] == "rag"
+    assert rd["payload"]["final_mode"] == "rag"
+
+
+def test_v3_rag_plan_execution_token_json_bypasses_clarify(monkeypatch: pytest.MonkeyPatch):
+    """5-3 G3: a valid RAG token skips clarify on the follow-up request and runs rag_search."""
+    calls: dict[str, int] = {"preview": 0, "rag": 0}
+
+    async def _rag_dual(
+        query: str,
+        *,
+        history: list[dict[str, Any]] | None = None,
+        debug_llm_prompts: bool = False,
+        preview_only: bool = False,
+    ) -> ToolResult:
+        _ = (query, history, debug_llm_prompts)
+        if preview_only:
+            calls["preview"] += 1
+            return ToolResult(
+                success=True,
+                data={"rewritten": "rw1", "planned_top_k": 10, "preview_headlines": ["a.md"]},
+                latency_ms=1,
+            )
+        calls["rag"] += 1
+        return ToolResult(
+            success=True,
+            data={"answer": "RAG执行OK", "hits": [{"id": "1", "content": "c", "filename": "a.md"}]},
+            latency_ms=2,
+        )
+
+    client = _v3_lowconf_rag_client(monkeypatch, _rag_dual, rag_tool_execute=_rag_dual)
+    q = "低置信 RAG token 放行探针"
+    res1 = client.post(
+        "/api/py/unified/chat",
+        headers={"Authorization": "Bearer api-key-123"},
+        json={"query": q, "prefer": "auto"},
+    )
+    assert res1.status_code == 200
+    data1 = res1.json()
+    tok = next(e for e in data1["events"] if e.get("type") == "agent.plan.preview")["payload"]["plan_execution_token"]
+    assert calls["preview"] == 1
+
+    res2 = client.post(
+        "/api/py/unified/chat",
+        headers={"Authorization": "Bearer api-key-123"},
+        json={"query": q, "prefer": "auto", "plan_execution_token": tok},
+    )
+    assert res2.status_code == 200
+    types2 = [e.get("type") for e in res2.json()["events"]]
+    assert "agent.clarify" not in types2
+    assert "rag.sources" in types2
+    assert calls["rag"] == 1
+
+
+def test_v3_rag_plan_preview_fail_json_no_token(monkeypatch: pytest.MonkeyPatch):
+    """5-3 G4/F3: when the RAG preview_only call fails, no token is issued."""
+    async def _rag_preview_fail(
+        query: str,
+        *,
+        history: list[dict[str, Any]] | None = None,
+        debug_llm_prompts: bool = False,
+        preview_only: bool = False,
+    ) -> ToolResult:
+        _ = (query, history, debug_llm_prompts)
+        assert preview_only is True
+        return ToolResult(
+            success=False,
+            data=None,
+            error="RAG 命中为空",
+            error_code="RAG_RETRIEVE_EMPTY",
+            error_stage="rag.retrieve",
+            latency_ms=1,
+        )
+
+    client = _v3_lowconf_rag_client(monkeypatch, _rag_preview_fail)
+    res = client.post(
+        "/api/py/unified/chat",
+        headers={"Authorization": "Bearer api-key-123"},
+        json={"query": "低置信 RAG 预览失败探针", "prefer": "auto"},
+    )
+    assert res.status_code == 200
+    events = res.json()["events"]
+    types = [e.get("type") for e in events]
+    assert "agent.plan.preview" not in types
+    clarify_evt = next(e for e in events if e.get("type") == "agent.clarify")
+    assert "无法签发 plan_execution_token" in clarify_evt["payload"]["prompt_for_user"]
+
+
+def test_v3_rag_plan_preview_sse_parity(monkeypatch: pytest.MonkeyPatch):
+    """5-3 G5: the SSE low-confidence RAG clarify stream must carry agent.plan.preview and rewrite_query."""
+    async def _rag_preview_only(
+        query: str,
+        *,
+        history: list[dict[str, Any]] | None = None,
+        debug_llm_prompts: bool = False,
+        preview_only: bool = False,
+    ) -> ToolResult:
+        _ = (query, history, debug_llm_prompts)
+        assert preview_only is True
+        return ToolResult(
+            success=True,
+            data={"rewritten": "SSE RAG rewrite", "planned_top_k": 10, "preview_headlines": []},
+            latency_ms=1,
+        )
+
+    client = _v3_lowconf_rag_client(monkeypatch, _rag_preview_only)
+    with client.stream(
+        "POST",
+        "/api/py/unified/chat/stream",
+        headers={"Authorization": "Bearer api-key-123"},
+        json={"query": "低置信 RAG SSE 预览探针", "prefer": "auto"},
+    ) as res:
+        assert res.status_code == 200
+        text = ""
+        for chunk in res.iter_text():
+            text += chunk
+            if "event: done" in text:
+                break
+    assert "agent.plan.preview" in text
+    assert "rewrite_query" in text
+    assert "plan_execution_token" in text
+    assert "agent.clarify" in text
+