Skip to content

Commit 82c515f

Browse files
committed
Address review: hoist constants, normalize ToolCallPart args, comments
- Move NONE_RESULT_* / TOOL_LOOP_THRESHOLD / STRUCTURED_OUTPUT_RETRIES into
  opencontractserver/constants/llm.py per the no-magic-numbers rule.
  data_extract_tasks re-exports the NONE_RESULT_* constants so existing
  import paths in tests and dashboards keep working.
- Promote pydantic_ai imports inside _classify_none_result to module level —
  pydantic-ai is a hard dep; the deferred import broke IDE type resolution
  and was inconsistent with the rest of the file.
- Normalise ToolCallPart.args before json.dumps so ArgsJson (str) and
  ArgsDict variants of the same call hash to the same Counter key,
  preventing silent under-counting in tool-loop detection.
- Document why _prepare_pydantic_ai_model_settings can return None and add a
  TODO referencing the temperature pin's safety assumption so a future
  column-configurable-model change doesn't regress issue #1381.
1 parent e599330 commit 82c515f

4 files changed

Lines changed: 74 additions & 27 deletions

File tree

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
LLM / agent integration constants.
3+
4+
Anthropic structured-extraction reliability knobs and the failure-mode
5+
classifier vocabulary that ``data_extract_tasks._classify_none_result``
6+
emits to ``Datacell.stacktrace``. Lives here per CLAUDE.md "no magic
7+
numbers in business code" rule so operators can grep canonical values
8+
instead of chasing literals across modules.
9+
"""
10+
11+
# Retry budget passed to ``PydanticAIAgent`` for structured extraction.
12+
# pydantic-ai's default is 1; Claude/Anthropic models routinely fail to
13+
# call ``final_result`` on the first turn for sparse documents and we
14+
# observed an ~85% failure rate without retries. 3 strikes the right
15+
# balance: enough to absorb a single missed-tool-call attempt with a
16+
# follow-up reminder, without blowing the per-cell wall-clock budget.
17+
STRUCTURED_OUTPUT_RETRIES = 3
18+
19+
# Threshold for declaring a tool loop in ``_classify_none_result``. If
20+
# the same ``(tool_name, args)`` signature appears at least this many
21+
# times in the captured pydantic-ai message log without a matching
22+
# ``final_result`` call, the cell is classified as
23+
# ``NONE_RESULT_TOOL_LOOP`` (integration failure, NOT a "data absent"
24+
# answer). Distinct from ``STRUCTURED_OUTPUT_RETRIES`` despite the
25+
# coincidental equality — the retry budget is a pydantic-ai input,
26+
# this threshold is a post-mortem heuristic.
27+
TOOL_LOOP_THRESHOLD = 3
28+
29+
# Failure-mode classification vocabulary written to ``Datacell.stacktrace``
30+
# when extraction returns ``None``. Operators grep ``failure_mode=`` to
31+
# separate legitimate "data not present" outcomes from pipeline bugs;
32+
# changing these strings is a breaking change for downstream dashboards.
33+
NONE_RESULT_AGENT_COMMITTED = "agent_committed_none"
34+
NONE_RESULT_NO_FINAL = "no_final_response"
35+
NONE_RESULT_TOOL_LOOP = "tool_loop_no_output"
36+
NONE_RESULT_UNKNOWN = "unknown"

opencontractserver/llms/agents/pydantic_ai_agents.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from pydantic_graph import End
3232

3333
from opencontractserver.constants.context_guardrails import COMPACTION_SUMMARY_PREFIX
34+
from opencontractserver.constants.llm import STRUCTURED_OUTPUT_RETRIES
3435
from opencontractserver.conversations.models import Conversation
3536
from opencontractserver.corpuses.models import Corpus
3637
from opencontractserver.documents.models import Document
@@ -103,12 +104,6 @@
103104
# Type variable for structured responses
104105
T = TypeVar("T")
105106

106-
# Number of retries pydantic-ai will attempt when the model fails to produce
107-
# a valid structured response. Bumped above pydantic-ai's default of 1 because
108-
# Anthropic models routinely need an extra round-trip after tool calls before
109-
# they commit to the final result tool. See issue #1381.
110-
STRUCTURED_OUTPUT_RETRIES = 3
111-
112107

113108
def _is_anthropic_model(model_name: Optional[str]) -> bool:
114109
"""Return True if ``model_name`` looks like an Anthropic / Claude model.
@@ -1339,7 +1334,14 @@ async def _structured_response_raw(
13391334
)
13401335

13411336
try:
1342-
# Build model settings with overrides
1337+
# Build model settings with overrides.
1338+
# ``_prepare_pydantic_ai_model_settings`` returns ``None`` when
1339+
# both temperature and max_tokens are unset on ``self.config``
1340+
# (the helper signals "no settings to pass" rather than
1341+
# returning an empty dict). We need a mutable dict here so
1342+
# the function-level ``temperature`` / ``max_tokens`` overrides
1343+
# — and the Anthropic temperature-0 nudge below — have
1344+
# somewhere to land.
13431345
model_settings = _prepare_pydantic_ai_model_settings(self.config)
13441346
if model_settings is None:
13451347
model_settings = {}

opencontractserver/tasks/data_extract_tasks.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,16 @@
66
from typing import Any, Optional
77

88
from asgiref.sync import sync_to_async
9+
from pydantic_ai.messages import ModelResponse, ToolCallPart
910

1011
from opencontractserver.annotations.compact_json import iter_page_annotations
12+
from opencontractserver.constants.llm import (
13+
NONE_RESULT_AGENT_COMMITTED,
14+
NONE_RESULT_NO_FINAL,
15+
NONE_RESULT_TOOL_LOOP,
16+
NONE_RESULT_UNKNOWN,
17+
TOOL_LOOP_THRESHOLD,
18+
)
1119
from opencontractserver.extracts.models import Datacell
1220
from opencontractserver.shared.decorators import celery_task_with_async_to_sync
1321
from opencontractserver.utils.compact_pawls import expand_pawls_pages
@@ -42,16 +50,6 @@
4250
# Operators want to grep ``failure_mode=`` to separate legitimate "data not
4351
# present" outcomes from pipeline bugs.
4452

45-
NONE_RESULT_AGENT_COMMITTED = "agent_committed_none"
46-
NONE_RESULT_NO_FINAL = "no_final_response"
47-
NONE_RESULT_TOOL_LOOP = "tool_loop_no_output"
48-
NONE_RESULT_UNKNOWN = "unknown"
49-
50-
# Threshold for declaring a tool loop. If any single tool name + arguments
51-
# combination appears at least this many times in the captured message log
52-
# without a final structured response, classify as ``tool_loop_no_output``.
53-
_TOOL_LOOP_THRESHOLD = 3
54-
5553

5654
def _classify_none_result(messages: Optional[list[Any]]) -> str:
5755
"""Classify *why* a structured extraction returned ``None``.
@@ -63,11 +61,6 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
6361
if not messages:
6462
return NONE_RESULT_UNKNOWN
6563

66-
try:
67-
from pydantic_ai.messages import ModelResponse, ToolCallPart
68-
except ImportError: # pragma: no cover - pydantic_ai is a hard dep
69-
return NONE_RESULT_UNKNOWN
70-
7164
# Scan the log for ``ModelResponse`` parts. A "final structured response"
7265
# is a ToolCallPart whose tool_name starts with ``final_result`` —
7366
# pydantic-ai routes structured outputs through this synthetic tool.
@@ -84,6 +77,17 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
8477
saw_final_result = True
8578
else:
8679
raw_args = getattr(part, "args", None)
80+
# ``ToolCallPart.args`` is typed ``str | dict`` in
81+
# pydantic-ai (``ArgsJson`` vs ``ArgsDict``). Normalise
82+
# the JSON-string variant so identical calls hash to the
83+
# same Counter key regardless of which variant the model
84+
# returned, otherwise tool-loop detection silently
85+
# under-counts.
86+
if isinstance(raw_args, str):
87+
try:
88+
raw_args = json.loads(raw_args)
89+
except (json.JSONDecodeError, ValueError):
90+
pass
8791
try:
8892
args_repr = json.dumps(raw_args, sort_keys=True, default=str)
8993
except (TypeError, ValueError):
@@ -99,7 +103,7 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
99103
# No final_result anywhere. Look for tool-call repetition.
100104
if tool_call_signatures:
101105
most_common = Counter(tool_call_signatures).most_common(1)
102-
if most_common and most_common[0][1] >= _TOOL_LOOP_THRESHOLD:
106+
if most_common and most_common[0][1] >= TOOL_LOOP_THRESHOLD:
103107
return NONE_RESULT_TOOL_LOOP
104108

105109
return NONE_RESULT_NO_FINAL
@@ -394,9 +398,14 @@ def sync_add_sources(datacell, sources):
394398
# function-level pin bypasses the Anthropic temperature=0
395399
# override in `_structured_response_raw` (issue #1381).
396400
# Safe today because the model is also pinned to
397-
# `openai:gpt-4o-mini`; if the model becomes column-
398-
# configurable, gate this temperature on the model family
399-
# so Claude variants still get the override.
401+
# `openai:gpt-4o-mini`.
402+
#
403+
# TODO(#1381 follow-up): if/when the model becomes
404+
# column-configurable, gate this temperature on the
405+
# model family — passing ``temperature=0.3`` to a
406+
# Claude model silently regresses the structured-output
407+
# reliability fix this PR introduced. Track removal
408+
# of this pin alongside the column-model feature.
400409
temperature=0.3,
401410
similarity_top_k=similarity_top_k,
402411
model="openai:gpt-4o-mini", # Fast and reliable

opencontractserver/tests/test_data_extract_failure_classification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def test_repeated_tool_call_classifies_as_tool_loop(self) -> None:
8989
def test_repeats_below_threshold_are_not_tool_loop(self) -> None:
9090
"""Two repeats (threshold - 1) ⇒ no_final_response, not tool_loop.
9191
92-
Pins the boundary so a future tweak of ``_TOOL_LOOP_THRESHOLD``
92+
Pins the boundary so a future tweak of ``TOOL_LOOP_THRESHOLD``
9393
forces this test to be updated explicitly.
9494
"""
9595
repeated = _tool_call("similarity_search", {"query": "same"})

0 commit comments

Comments
 (0)