Address review: relocate is_anthropic_model and tighten structured run

JSv4 · JSv4 · commit b960f1ed0823 · 2026-04-29T01:36:37.000-05:00
- Move is_anthropic_model from constants/llm.py to utils/llm.py per CLAUDE.md (utility functions belong in opencontractserver/utils/). Update imports in pydantic_ai_agents.py, data_extract_tasks.py, and the failure-classification test module.
- Trim verbose docstrings/comments in constants/llm.py while keeping the TOOL_LOOP_THRESHOLD ↔ STRUCTURED_OUTPUT_RETRIES coupling note explicit.
- Pass model_settings=None (not {}) to PydanticAIAgent when no temperature/max_tokens pin is set, preserving the pre-issue-#1381 contract for non-Anthropic structured runs without caller pins. Add a regression test in test_pydantic_ai_agents.py.
- Convert the if/if/if return chain in _failure_message_for_classification to if/elif for clarity.
- Add an inline NOTE at the doc_extract_query_task call site so a future refactor that accepts a caller-supplied model passes the same value to _resolve_extract_temperature, keeping model and temperature in lock-step.
diff --git a/opencontractserver/constants/llm.py b/opencontractserver/constants/llm.py
@@ -1,65 +1,22 @@
-"""
-LLM / agent integration constants.
+"""LLM / agent integration constants (issue #1381)."""
 
-Anthropic structured-extraction reliability knobs and the failure-mode
-classifier vocabulary that ``data_extract_tasks._classify_none_result``
-emits to ``Datacell.stacktrace``.  Lives here per CLAUDE.md "no magic
-numbers in business code" rule so operators can grep canonical values
-instead of chasing literals across modules.
-"""
-
-from typing import Optional
-
-# Retry budget passed to ``PydanticAIAgent`` for structured extraction.
-# pydantic-ai's default is 1; Claude/Anthropic models routinely fail to
-# call ``final_result`` on the first turn for sparse documents and we
-# observed an ~85% failure rate without retries.  3 strikes the right
-# balance: enough to absorb a single missed-tool-call attempt with a
-# follow-up reminder, without blowing the per-cell wall-clock budget.
+# pydantic-ai default is 1; Anthropic models often need retries to commit
+# to ``final_result``. Bumping this requires re-checking ``TOOL_LOOP_THRESHOLD``
+# below — a legitimate retried tool call could be mis-classified as a loop
+# if the threshold is lower than the retry budget.
 STRUCTURED_OUTPUT_RETRIES = 3
 
-# Threshold for declaring a tool loop in ``_classify_none_result``.  If
-# the same ``(tool_name, args)`` signature appears at least this many
-# times in the captured pydantic-ai message log without a matching
-# ``final_result`` call, the cell is classified as
-# ``NONE_RESULT_TOOL_LOOP`` (integration failure, NOT a "data absent"
-# answer).  Distinct from ``STRUCTURED_OUTPUT_RETRIES`` despite the
-# coincidental equality — the retry budget is a pydantic-ai input,
-# this threshold is a post-mortem heuristic.
+# Same-call repetition count that ``_classify_none_result`` treats as a
+# pipeline bug (post-mortem heuristic, not a pydantic-ai input). Keep
+# >= STRUCTURED_OUTPUT_RETRIES so legitimate retries don't trip it.
 TOOL_LOOP_THRESHOLD = 3
 
-# Failure-mode classification vocabulary written to ``Datacell.stacktrace``
-# when extraction returns ``None``.  Operators grep ``failure_mode=`` to
-# separate legitimate "data not present" outcomes from pipeline bugs;
-# changing these strings is a breaking change for downstream dashboards.
+# Vocabulary written to ``Datacell.stacktrace`` as ``failure_mode=...``.
+# Operators grep these; changing the strings breaks downstream dashboards.
 NONE_RESULT_AGENT_COMMITTED = "agent_committed_none"
 NONE_RESULT_NO_FINAL = "no_final_response"
 NONE_RESULT_TOOL_LOOP = "tool_loop_no_output"
 NONE_RESULT_UNKNOWN = "unknown"
 
-# Default model for ``doc_extract_query_task``.  Co-located with
-# ``EXTRACT_DEFAULT_TEMPERATURE`` and the ``is_anthropic_model`` helper
-# below so the model/family/temperature relationship lives in one place.
-# Call sites must pass ``temperature=None`` when the model family is
-# Anthropic so the structured-extraction guard in
-# ``_structured_response_raw`` can apply ``temperature=0`` automatically
-# (issue #1381).
 EXTRACT_DEFAULT_MODEL = "openai:gpt-4o-mini"
 EXTRACT_DEFAULT_TEMPERATURE = 0.3
-
-
-def is_anthropic_model(model_name: Optional[str]) -> bool:
-    """Return True if ``model_name`` looks like an Anthropic / Claude model.
-
-    Accepts both pydantic-ai-style ``"anthropic:..."`` prefixes and bare
-    model names containing ``"claude"``.  Lives in ``constants/llm.py``
-    rather than in an agent module because call sites outside the agents
-    layer (notably ``data_extract_tasks.doc_extract_query_task``) need to
-    decide whether to pass ``temperature=None`` so the Anthropic guard in
-    ``_structured_response_raw`` activates.  Pure stateless string check —
-    no imports beyond ``typing``.
-    """
-    if not model_name:
-        return False
-    name = model_name.lower()
-    return name.startswith("anthropic:") or "claude" in name
diff --git a/opencontractserver/llms/agents/pydantic_ai_agents.py b/opencontractserver/llms/agents/pydantic_ai_agents.py
@@ -31,10 +31,7 @@
 from pydantic_graph import End
 
 from opencontractserver.constants.context_guardrails import COMPACTION_SUMMARY_PREFIX
-from opencontractserver.constants.llm import (
-    STRUCTURED_OUTPUT_RETRIES,
-    is_anthropic_model,
-)
+from opencontractserver.constants.llm import STRUCTURED_OUTPUT_RETRIES
 from opencontractserver.conversations.models import Conversation
 from opencontractserver.corpuses.models import Corpus
 from opencontractserver.documents.models import Document
@@ -92,6 +89,7 @@
     PydanticAIAnnotationVectorStore,
 )
 from opencontractserver.utils.embeddings import aget_embedder
+from opencontractserver.utils.llm import is_anthropic_model
 from opencontractserver.utils.prompt_sanitization import (
     UNTRUSTED_CONTENT_NOTICE,
     fence_user_content,
@@ -1387,13 +1385,17 @@ async def _structured_response_raw(
 
             logger.info(f"Structured system prompt: {structured_system_prompt}")
 
+            # Preserve the pre-issue-#1381 behaviour of passing
+            # ``model_settings=None`` to ``PydanticAIAgent`` when nothing
+            # ended up being set, so non-Anthropic structured runs without
+            # caller pins are bit-identical to before.
             structured_agent = PydanticAIAgent(
                 model=effective_model,
                 instructions=structured_system_prompt,
                 output_type=target_type,
                 deps_type=PydanticAIDependencies,
                 tools=final_tools,
-                model_settings=model_settings,
+                model_settings=model_settings or None,
                 # Give pydantic-ai room to retry the structured output when
                 # the model fails to commit on the first pass (issue #1381).
                 output_retries=STRUCTURED_OUTPUT_RETRIES,
diff --git a/opencontractserver/tasks/data_extract_tasks.py b/opencontractserver/tasks/data_extract_tasks.py
@@ -17,14 +17,14 @@
     NONE_RESULT_TOOL_LOOP,
     NONE_RESULT_UNKNOWN,
     TOOL_LOOP_THRESHOLD,
-    is_anthropic_model,
 )
 from opencontractserver.extracts.models import Datacell
 from opencontractserver.shared.decorators import celery_task_with_async_to_sync
 from opencontractserver.utils.compact_pawls import expand_pawls_pages
 from opencontractserver.utils.extraction_grounding import (
     ground_extraction_to_annotations,
 )
+from opencontractserver.utils.llm import is_anthropic_model
 
 logger = logging.getLogger(__name__)
 
@@ -116,13 +116,8 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
 
 
 def _resolve_extract_temperature(model_name: Optional[str]) -> Optional[float]:
-    """Pick the temperature to pass to ``get_structured_response_from_document``.
-
-    Returns ``None`` when ``model_name`` is an Anthropic / Claude model so the
-    Anthropic guard in ``_structured_response_raw`` can apply ``temperature=0``
-    automatically (issue #1381). Returns :data:`EXTRACT_DEFAULT_TEMPERATURE`
-    otherwise. Pulled out as a helper so the model-family→temperature
-    coupling is unit-testable without standing up the full extract task.
+    """Return ``None`` for Anthropic models (so the structured-response guard
+    can force ``temperature=0``), otherwise :data:`EXTRACT_DEFAULT_TEMPERATURE`.
     """
     if is_anthropic_model(model_name):
         return None
@@ -136,15 +131,15 @@ def _failure_message_for_classification(classification: str) -> str:
             "The extraction agent committed to a None result — the requested "
             "information was not found in the document."
         )
-    if classification == NONE_RESULT_NO_FINAL:
+    elif classification == NONE_RESULT_NO_FINAL:
         return (
             "The extraction agent never produced a final structured response. "
             "This is an integration failure (the model exhausted its tool-use "
             "budget without committing to the result tool), not a statement "
             "about the document. Check ``llm_call_log`` for the raw message "
             "history."
         )
-    if classification == NONE_RESULT_TOOL_LOOP:
+    elif classification == NONE_RESULT_TOOL_LOOP:
         return (
             "The extraction agent looped on the same tool call without "
             "producing a final structured response. This is an integration "
@@ -368,7 +363,11 @@ def sync_add_sources(datacell, sources):
         # Anthropic ``temperature=0`` override in
         # ``_structured_response_raw`` activates automatically when
         # ``EXTRACT_DEFAULT_MODEL`` is a Claude model (issue #1381).
-        extract_temperature = _resolve_extract_temperature(EXTRACT_DEFAULT_MODEL)
+        # NOTE: if this task is refactored to accept a caller-supplied
+        # model, pass that same value to ``_resolve_extract_temperature``
+        # so the temperature stays in lock-step with the model family.
+        extract_model = EXTRACT_DEFAULT_MODEL
+        extract_temperature = _resolve_extract_temperature(extract_model)
 
         try:
             # Wrap the agent call in the context manager to capture messages
@@ -381,7 +380,7 @@ def sync_add_sources(datacell, sources):
                     framework=AgentFramework.PYDANTIC_AI,
                     temperature=extract_temperature,
                     similarity_top_k=similarity_top_k,
-                    model=EXTRACT_DEFAULT_MODEL,
+                    model=extract_model,
                     user_id=datacell.creator.id,
                 )
 
diff --git a/opencontractserver/tests/test_data_extract_failure_classification.py b/opencontractserver/tests/test_data_extract_failure_classification.py
@@ -18,13 +18,13 @@
     NONE_RESULT_NO_FINAL,
     NONE_RESULT_TOOL_LOOP,
     NONE_RESULT_UNKNOWN,
-    is_anthropic_model,
 )
 from opencontractserver.tasks.data_extract_tasks import (
     _classify_none_result,
     _failure_message_for_classification,
     _resolve_extract_temperature,
 )
+from opencontractserver.utils.llm import is_anthropic_model
 
 
 def _make_response(*parts: Any) -> Any:
diff --git a/opencontractserver/tests/test_pydantic_ai_agents.py b/opencontractserver/tests/test_pydantic_ai_agents.py
@@ -1547,11 +1547,10 @@ class DummyOutput(BaseModel):
 
         structured_call = self._structured_agent_call(mock_pyd_ai_cls)
         self.assertIsNotNone(structured_call)
-        # No temperature override fired ⇒ model_settings has no temperature key
-        # (or an empty model_settings dict because _prepare returned None).
-        ms = structured_call.kwargs.get("model_settings") or {}
-        self.assertNotIn(
-            "temperature",
-            ms,
-            "OpenAI structured runs must not get the Anthropic temperature override",
+        # When neither the caller nor the config pin temperature/max_tokens,
+        # model_settings should be passed through as ``None`` (matching the
+        # pre-issue-#1381 contract with PydanticAIAgent), not as ``{}``.
+        self.assertIsNone(
+            structured_call.kwargs.get("model_settings"),
+            "OpenAI structured runs without pins must pass model_settings=None",
         )
diff --git a/opencontractserver/utils/llm.py b/opencontractserver/utils/llm.py
@@ -0,0 +1,16 @@
+"""LLM/agent helper utilities."""
+
+from typing import Optional
+
+
+def is_anthropic_model(model_name: Optional[str]) -> bool:
+    """Return True if ``model_name`` looks like an Anthropic / Claude model.
+
+    Accepts pydantic-ai-style ``"anthropic:..."`` prefixes and bare model
+    names containing ``"claude"``. Used to decide whether to apply the
+    Anthropic temperature guard in structured extraction (issue #1381).
+    """
+    if not model_name:
+        return False
+    name = model_name.lower()
+    return name.startswith("anthropic:") or "claude" in name

Original file line number	Diff line number	Diff line change
`@@ -18,13 +18,13 @@`
`18`	`18`	`NONE_RESULT_NO_FINAL,`
`19`	`19`	`NONE_RESULT_TOOL_LOOP,`
`20`	`20`	`NONE_RESULT_UNKNOWN,`
`21`		`- is_anthropic_model,`
`22`	`21`	`)`
`23`	`22`	`from opencontractserver.tasks.data_extract_tasks import (`
`24`	`23`	`_classify_none_result,`
`25`	`24`	`_failure_message_for_classification,`
`26`	`25`	`_resolve_extract_temperature,`
`27`	`26`	`)`
	`27`	`+from opencontractserver.utils.llm import is_anthropic_model`
`28`	`28`
`29`	`29`
`30`	`30`	`def _make_response(*parts: Any) -> Any:`