Skip to content

Commit 82c515f

Browse files
committed
Address review: hoist constants, normalize ToolCallPart args, comments
- Move NONE_RESULT_* / TOOL_LOOP_THRESHOLD / STRUCTURED_OUTPUT_RETRIES into
  opencontractserver/constants/llm.py per the no-magic-numbers rule.
  data_extract_tasks re-exports the NONE_RESULT_* constants so existing
  import paths in tests and dashboards keep working.
- Promote pydantic_ai imports inside _classify_none_result to module level —
  pydantic-ai is a hard dep; the deferred import broke IDE type resolution
  and was inconsistent with the rest of the file.
- Normalise ToolCallPart.args before json.dumps so ArgsJson (str) and
  ArgsDict variants of the same call hash to the same Counter key,
  preventing silent under-counting in tool-loop detection.
- Document why _prepare_pydantic_ai_model_settings can return None and add a
  TODO referencing the temperature pin's safety assumption so a future
  column-configurable-model change doesn't regress issue #1381.
1 parent e599330 commit 82c515f

4 files changed

Lines changed: 74 additions & 27 deletions

File tree

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
LLM / agent integration constants.
3+
4+
Anthropic structured-extraction reliability knobs and the failure-mode
5+
classifier vocabulary that ``data_extract_tasks._classify_none_result``
6+
emits to ``Datacell.stacktrace``. Lives here per CLAUDE.md "no magic
7+
numbers in business code" rule so operators can grep canonical values
8+
instead of chasing literals across modules.
9+
"""
10+
11+
# Retry budget passed to ``PydanticAIAgent`` for structured extraction.
12+
# pydantic-ai's default is 1; Claude/Anthropic models routinely fail to
13+
# call ``final_result`` on the first turn for sparse documents and we
14+
# observed an ~85% failure rate without retries. 3 strikes the right
15+
# balance: enough to absorb a single missed-tool-call attempt with a
16+
# follow-up reminder, without blowing the per-cell wall-clock budget.
17+
STRUCTURED_OUTPUT_RETRIES = 3
18+
19+
# Threshold for declaring a tool loop in ``_classify_none_result``. If
20+
# the same ``(tool_name, args)`` signature appears at least this many
21+
# times in the captured pydantic-ai message log without a matching
22+
# ``final_result`` call, the cell is classified as
23+
# ``NONE_RESULT_TOOL_LOOP`` (integration failure, NOT a "data absent"
24+
# answer). Distinct from ``STRUCTURED_OUTPUT_RETRIES`` despite the
25+
# coincidental equality — the retry budget is a pydantic-ai input,
26+
# this threshold is a post-mortem heuristic.
27+
TOOL_LOOP_THRESHOLD = 3
28+
29+
# Failure-mode classification vocabulary written to ``Datacell.stacktrace``
30+
# when extraction returns ``None``. Operators grep ``failure_mode=`` to
31+
# separate legitimate "data not present" outcomes from pipeline bugs;
32+
# changing these strings is a breaking change for downstream dashboards.
33+
NONE_RESULT_AGENT_COMMITTED = "agent_committed_none"
34+
NONE_RESULT_NO_FINAL = "no_final_response"
35+
NONE_RESULT_TOOL_LOOP = "tool_loop_no_output"
36+
NONE_RESULT_UNKNOWN = "unknown"

opencontractserver/llms/agents/pydantic_ai_agents.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from pydantic_graph import End
3232

3333
from opencontractserver.constants.context_guardrails import COMPACTION_SUMMARY_PREFIX
34+
from opencontractserver.constants.llm import STRUCTURED_OUTPUT_RETRIES
3435
from opencontractserver.conversations.models import Conversation
3536
from opencontractserver.corpuses.models import Corpus
3637
from opencontractserver.documents.models import Document
@@ -103,12 +104,6 @@
103104
# Type variable for structured responses
104105
T = TypeVar("T")
105106

106-
# Number of retries pydantic-ai will attempt when the model fails to produce
107-
# a valid structured response. Bumped above pydantic-ai's default of 1 because
108-
# Anthropic models routinely need an extra round-trip after tool calls before
109-
# they commit to the final result tool. See issue #1381.
110-
STRUCTURED_OUTPUT_RETRIES = 3
111-
112107

113108
def _is_anthropic_model(model_name: Optional[str]) -> bool:
114109
"""Return True if ``model_name`` looks like an Anthropic / Claude model.
@@ -1339,7 +1334,14 @@ async def _structured_response_raw(
13391334
)
13401335

13411336
try:
1342-
# Build model settings with overrides
1337+
# Build model settings with overrides.
1338+
# ``_prepare_pydantic_ai_model_settings`` returns ``None`` when
1339+
# both temperature and max_tokens are unset on ``self.config``
1340+
# (the helper signals "no settings to pass" rather than
1341+
# returning an empty dict). We need a mutable dict here so
1342+
# the function-level ``temperature`` / ``max_tokens`` overrides
1343+
# — and the Anthropic temperature-0 nudge below — have
1344+
# somewhere to land.
13431345
model_settings = _prepare_pydantic_ai_model_settings(self.config)
13441346
if model_settings is None:
13451347
model_settings = {}

opencontractserver/tasks/data_extract_tasks.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,16 @@
66
from typing import Any, Optional
77

88
from asgiref.sync import sync_to_async
9+
from pydantic_ai.messages import ModelResponse, ToolCallPart
910

1011
from opencontractserver.annotations.compact_json import iter_page_annotations
12+
from opencontractserver.constants.llm import (
13+
NONE_RESULT_AGENT_COMMITTED,
14+
NONE_RESULT_NO_FINAL,
15+
NONE_RESULT_TOOL_LOOP,
16+
NONE_RESULT_UNKNOWN,
17+
TOOL_LOOP_THRESHOLD,
18+
)
1119
from opencontractserver.extracts.models import Datacell
1220
from opencontractserver.shared.decorators import celery_task_with_async_to_sync
1321
from opencontractserver.utils.compact_pawls import expand_pawls_pages
@@ -42,16 +50,6 @@
4250
# Operators want to grep ``failure_mode=`` to separate legitimate "data not
4351
# present" outcomes from pipeline bugs.
4452

45-
NONE_RESULT_AGENT_COMMITTED = "agent_committed_none"
46-
NONE_RESULT_NO_FINAL = "no_final_response"
47-
NONE_RESULT_TOOL_LOOP = "tool_loop_no_output"
48-
NONE_RESULT_UNKNOWN = "unknown"
49-
50-
# Threshold for declaring a tool loop. If any single tool name + arguments
51-
# combination appears at least this many times in the captured message log
52-
# without a final structured response, classify as ``tool_loop_no_output``.
53-
_TOOL_LOOP_THRESHOLD = 3
54-
5553

5654
def _classify_none_result(messages: Optional[list[Any]]) -> str:
5755
"""Classify *why* a structured extraction returned ``None``.
@@ -63,11 +61,6 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
6361
if not messages:
6462
return NONE_RESULT_UNKNOWN
6563

66-
try:
67-
from pydantic_ai.messages import ModelResponse, ToolCallPart
68-
except ImportError: # pragma: no cover - pydantic_ai is a hard dep
69-
return NONE_RESULT_UNKNOWN
70-
7164
# Scan the log for ``ModelResponse`` parts. A "final structured response"
7265
# is a ToolCallPart whose tool_name starts with ``final_result`` —
7366
# pydantic-ai routes structured outputs through this synthetic tool.
@@ -84,6 +77,17 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
8477
saw_final_result = True
8578
else:
8679
raw_args = getattr(part, "args", None)
80+
# ``ToolCallPart.args`` is typed ``str | dict`` in
81+
# pydantic-ai (``ArgsJson`` vs ``ArgsDict``). Normalise
82+
# the JSON-string variant so identical calls hash to the
83+
# same Counter key regardless of which variant the model
84+
# returned, otherwise tool-loop detection silently
85+
# under-counts.
86+
if isinstance(raw_args, str):
87+
try:
88+
raw_args = json.loads(raw_args)
89+
except (json.JSONDecodeError, ValueError):
90+
pass
8791
try:
8892
args_repr = json.dumps(raw_args, sort_keys=True, default=str)
8993
except (TypeError, ValueError):
@@ -99,7 +103,7 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
99103
# No final_result anywhere. Look for tool-call repetition.
100104
if tool_call_signatures:
101105
most_common = Counter(tool_call_signatures).most_common(1)
102-
if most_common and most_common[0][1] >= _TOOL_LOOP_THRESHOLD:
106+
if most_common and most_common[0][1] >= TOOL_LOOP_THRESHOLD:
103107
return NONE_RESULT_TOOL_LOOP
104108

105109
return NONE_RESULT_NO_FINAL
@@ -394,9 +398,14 @@ def sync_add_sources(datacell, sources):
394398
# function-level pin bypasses the Anthropic temperature=0
395399
# override in `_structured_response_raw` (issue #1381).
396400
# Safe today because the model is also pinned to
397-
# `openai:gpt-4o-mini`; if the model becomes column-
398-
# configurable, gate this temperature on the model family
399-
# so Claude variants still get the override.
401+
# `openai:gpt-4o-mini`.
402+
#
403+
# TODO(#1381 follow-up): if/when the model becomes
404+
# column-configurable, gate this temperature on the
405+
# model family — passing ``temperature=0.3`` to a
406+
# Claude model silently regresses the structured-output
407+
# reliability fix this PR introduced. Track removal
408+
# of this pin alongside the column-model feature.
400409
temperature=0.3,
401410
similarity_top_k=similarity_top_k,
402411
model="openai:gpt-4o-mini", # Fast and reliable

opencontractserver/tests/test_data_extract_failure_classification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def test_repeated_tool_call_classifies_as_tool_loop(self) -> None:
8989
def test_repeats_below_threshold_are_not_tool_loop(self) -> None:
9090
"""Two repeats (threshold - 1) ⇒ no_final_response, not tool_loop.
9191
92-
Pins the boundary so a future tweak of ``_TOOL_LOOP_THRESHOLD``
92+
Pins the boundary so a future tweak of ``TOOL_LOOP_THRESHOLD``
9393
forces this test to be updated explicitly.
9494
"""
9595
repeated = _tool_call("similarity_search", {"query": "same"})

0 commit comments

Comments
 (0)