Skip to content

Commit b960f1e

Browse files
committed
Address review: relocate is_anthropic_model and tighten structured run
- Move is_anthropic_model from constants/llm.py to utils/llm.py per CLAUDE.md (utility functions belong in opencontractserver/utils/). Update imports in pydantic_ai_agents.py, data_extract_tasks.py, and the failure-classification test module. - Trim verbose docstrings/comments in constants/llm.py while keeping the TOOL_LOOP_THRESHOLD ↔ STRUCTURED_OUTPUT_RETRIES coupling note explicit. - Pass model_settings=None (not {}) to PydanticAIAgent when no temperature/max_tokens pin is set, preserving the pre-issue-#1381 contract for non-Anthropic structured runs without caller pins. Add a regression test in test_pydantic_ai_agents.py. - Convert the if/if/if return chain in _failure_message_for_classification to if/elif for clarity. - Add an inline TODO at the doc_extract_query_task call site so a future refactor that accepts a caller-supplied model passes the same value to _resolve_extract_temperature, keeping model and temperature in lock-step.
1 parent 1416951 commit b960f1e

6 files changed

Lines changed: 51 additions & 78 deletions

File tree

Lines changed: 10 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,22 @@
1-
"""
2-
LLM / agent integration constants.
1+
"""LLM / agent integration constants (issue #1381)."""
32

4-
Anthropic structured-extraction reliability knobs and the failure-mode
5-
classifier vocabulary that ``data_extract_tasks._classify_none_result``
6-
emits to ``Datacell.stacktrace``. Lives here per CLAUDE.md "no magic
7-
numbers in business code" rule so operators can grep canonical values
8-
instead of chasing literals across modules.
9-
"""
10-
11-
from typing import Optional
12-
13-
# Retry budget passed to ``PydanticAIAgent`` for structured extraction.
14-
# pydantic-ai's default is 1; Claude/Anthropic models routinely fail to
15-
# call ``final_result`` on the first turn for sparse documents and we
16-
# observed an ~85% failure rate without retries. 3 strikes the right
17-
# balance: enough to absorb a single missed-tool-call attempt with a
18-
# follow-up reminder, without blowing the per-cell wall-clock budget.
3+
# pydantic-ai default is 1; Anthropic models often need retries to commit
4+
# to ``final_result``. Bumping this requires re-checking ``TOOL_LOOP_THRESHOLD``
5+
# below — a legitimate retried tool call could be mis-classified as a loop
6+
# if the threshold is lower than the retry budget.
197
STRUCTURED_OUTPUT_RETRIES = 3
208

21-
# Threshold for declaring a tool loop in ``_classify_none_result``. If
22-
# the same ``(tool_name, args)`` signature appears at least this many
23-
# times in the captured pydantic-ai message log without a matching
24-
# ``final_result`` call, the cell is classified as
25-
# ``NONE_RESULT_TOOL_LOOP`` (integration failure, NOT a "data absent"
26-
# answer). Distinct from ``STRUCTURED_OUTPUT_RETRIES`` despite the
27-
# coincidental equality — the retry budget is a pydantic-ai input,
28-
# this threshold is a post-mortem heuristic.
9+
# Same-call repetition count that ``_classify_none_result`` treats as a
10+
# pipeline bug (post-mortem heuristic, not a pydantic-ai input). Keep
11+
# >= STRUCTURED_OUTPUT_RETRIES so legitimate retries don't trip it.
2912
TOOL_LOOP_THRESHOLD = 3
3013

31-
# Failure-mode classification vocabulary written to ``Datacell.stacktrace``
32-
# when extraction returns ``None``. Operators grep ``failure_mode=`` to
33-
# separate legitimate "data not present" outcomes from pipeline bugs;
34-
# changing these strings is a breaking change for downstream dashboards.
14+
# Vocabulary written to ``Datacell.stacktrace`` as ``failure_mode=...``.
15+
# Operators grep these; changing the strings breaks downstream dashboards.
3516
NONE_RESULT_AGENT_COMMITTED = "agent_committed_none"
3617
NONE_RESULT_NO_FINAL = "no_final_response"
3718
NONE_RESULT_TOOL_LOOP = "tool_loop_no_output"
3819
NONE_RESULT_UNKNOWN = "unknown"
3920

40-
# Default model for ``doc_extract_query_task``. Co-located with
41-
# ``EXTRACT_DEFAULT_TEMPERATURE`` and the ``is_anthropic_model`` helper
42-
# below so the model/family/temperature relationship lives in one place.
43-
# Call sites must pass ``temperature=None`` when the model family is
44-
# Anthropic so the structured-extraction guard in
45-
# ``_structured_response_raw`` can apply ``temperature=0`` automatically
46-
# (issue #1381).
4721
EXTRACT_DEFAULT_MODEL = "openai:gpt-4o-mini"
4822
EXTRACT_DEFAULT_TEMPERATURE = 0.3
49-
50-
51-
def is_anthropic_model(model_name: Optional[str]) -> bool:
52-
"""Return True if ``model_name`` looks like an Anthropic / Claude model.
53-
54-
Accepts both pydantic-ai-style ``"anthropic:..."`` prefixes and bare
55-
model names containing ``"claude"``. Lives in ``constants/llm.py``
56-
rather than in an agent module because call sites outside the agents
57-
layer (notably ``data_extract_tasks.doc_extract_query_task``) need to
58-
decide whether to pass ``temperature=None`` so the Anthropic guard in
59-
``_structured_response_raw`` activates. Pure stateless string check —
60-
no imports beyond ``typing``.
61-
"""
62-
if not model_name:
63-
return False
64-
name = model_name.lower()
65-
return name.startswith("anthropic:") or "claude" in name

opencontractserver/llms/agents/pydantic_ai_agents.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,7 @@
3131
from pydantic_graph import End
3232

3333
from opencontractserver.constants.context_guardrails import COMPACTION_SUMMARY_PREFIX
34-
from opencontractserver.constants.llm import (
35-
STRUCTURED_OUTPUT_RETRIES,
36-
is_anthropic_model,
37-
)
34+
from opencontractserver.constants.llm import STRUCTURED_OUTPUT_RETRIES
3835
from opencontractserver.conversations.models import Conversation
3936
from opencontractserver.corpuses.models import Corpus
4037
from opencontractserver.documents.models import Document
@@ -92,6 +89,7 @@
9289
PydanticAIAnnotationVectorStore,
9390
)
9491
from opencontractserver.utils.embeddings import aget_embedder
92+
from opencontractserver.utils.llm import is_anthropic_model
9593
from opencontractserver.utils.prompt_sanitization import (
9694
UNTRUSTED_CONTENT_NOTICE,
9795
fence_user_content,
@@ -1387,13 +1385,17 @@ async def _structured_response_raw(
13871385

13881386
logger.info(f"Structured system prompt: {structured_system_prompt}")
13891387

1388+
# Preserve the pre-issue-#1381 behaviour of passing
1389+
# ``model_settings=None`` to ``PydanticAIAgent`` when nothing
1390+
# ended up being set, so non-Anthropic structured runs without
1391+
# caller pins are bit-identical to before.
13901392
structured_agent = PydanticAIAgent(
13911393
model=effective_model,
13921394
instructions=structured_system_prompt,
13931395
output_type=target_type,
13941396
deps_type=PydanticAIDependencies,
13951397
tools=final_tools,
1396-
model_settings=model_settings,
1398+
model_settings=model_settings or None,
13971399
# Give pydantic-ai room to retry the structured output when
13981400
# the model fails to commit on the first pass (issue #1381).
13991401
output_retries=STRUCTURED_OUTPUT_RETRIES,

opencontractserver/tasks/data_extract_tasks.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@
1717
NONE_RESULT_TOOL_LOOP,
1818
NONE_RESULT_UNKNOWN,
1919
TOOL_LOOP_THRESHOLD,
20-
is_anthropic_model,
2120
)
2221
from opencontractserver.extracts.models import Datacell
2322
from opencontractserver.shared.decorators import celery_task_with_async_to_sync
2423
from opencontractserver.utils.compact_pawls import expand_pawls_pages
2524
from opencontractserver.utils.extraction_grounding import (
2625
ground_extraction_to_annotations,
2726
)
27+
from opencontractserver.utils.llm import is_anthropic_model
2828

2929
logger = logging.getLogger(__name__)
3030

@@ -116,13 +116,8 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
116116

117117

118118
def _resolve_extract_temperature(model_name: Optional[str]) -> Optional[float]:
119-
"""Pick the temperature to pass to ``get_structured_response_from_document``.
120-
121-
Returns ``None`` when ``model_name`` is an Anthropic / Claude model so the
122-
Anthropic guard in ``_structured_response_raw`` can apply ``temperature=0``
123-
automatically (issue #1381). Returns :data:`EXTRACT_DEFAULT_TEMPERATURE`
124-
otherwise. Pulled out as a helper so the model-family→temperature
125-
coupling is unit-testable without standing up the full extract task.
119+
"""Return ``None`` for Anthropic models (so the structured-response guard
120+
can force ``temperature=0``), otherwise :data:`EXTRACT_DEFAULT_TEMPERATURE`.
126121
"""
127122
if is_anthropic_model(model_name):
128123
return None
@@ -136,15 +131,15 @@ def _failure_message_for_classification(classification: str) -> str:
136131
"The extraction agent committed to a None result — the requested "
137132
"information was not found in the document."
138133
)
139-
if classification == NONE_RESULT_NO_FINAL:
134+
elif classification == NONE_RESULT_NO_FINAL:
140135
return (
141136
"The extraction agent never produced a final structured response. "
142137
"This is an integration failure (the model exhausted its tool-use "
143138
"budget without committing to the result tool), not a statement "
144139
"about the document. Check ``llm_call_log`` for the raw message "
145140
"history."
146141
)
147-
if classification == NONE_RESULT_TOOL_LOOP:
142+
elif classification == NONE_RESULT_TOOL_LOOP:
148143
return (
149144
"The extraction agent looped on the same tool call without "
150145
"producing a final structured response. This is an integration "
@@ -368,7 +363,11 @@ def sync_add_sources(datacell, sources):
368363
# Anthropic ``temperature=0`` override in
369364
# ``_structured_response_raw`` activates automatically when
370365
# ``EXTRACT_DEFAULT_MODEL`` is a Claude model (issue #1381).
371-
extract_temperature = _resolve_extract_temperature(EXTRACT_DEFAULT_MODEL)
366+
# NOTE: if this task is refactored to accept a caller-supplied
367+
# model, pass that same value to ``_resolve_extract_temperature``
368+
# so the temperature stays in lock-step with the model family.
369+
extract_model = EXTRACT_DEFAULT_MODEL
370+
extract_temperature = _resolve_extract_temperature(extract_model)
372371

373372
try:
374373
# Wrap the agent call in the context manager to capture messages
@@ -381,7 +380,7 @@ def sync_add_sources(datacell, sources):
381380
framework=AgentFramework.PYDANTIC_AI,
382381
temperature=extract_temperature,
383382
similarity_top_k=similarity_top_k,
384-
model=EXTRACT_DEFAULT_MODEL,
383+
model=extract_model,
385384
user_id=datacell.creator.id,
386385
)
387386

opencontractserver/tests/test_data_extract_failure_classification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
NONE_RESULT_NO_FINAL,
1919
NONE_RESULT_TOOL_LOOP,
2020
NONE_RESULT_UNKNOWN,
21-
is_anthropic_model,
2221
)
2322
from opencontractserver.tasks.data_extract_tasks import (
2423
_classify_none_result,
2524
_failure_message_for_classification,
2625
_resolve_extract_temperature,
2726
)
27+
from opencontractserver.utils.llm import is_anthropic_model
2828

2929

3030
def _make_response(*parts: Any) -> Any:

opencontractserver/tests/test_pydantic_ai_agents.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1547,11 +1547,10 @@ class DummyOutput(BaseModel):
15471547

15481548
structured_call = self._structured_agent_call(mock_pyd_ai_cls)
15491549
self.assertIsNotNone(structured_call)
1550-
# No temperature override fired ⇒ model_settings has no temperature key
1551-
# (or an empty model_settings dict because _prepare returned None).
1552-
ms = structured_call.kwargs.get("model_settings") or {}
1553-
self.assertNotIn(
1554-
"temperature",
1555-
ms,
1556-
"OpenAI structured runs must not get the Anthropic temperature override",
1550+
# When neither the caller nor the config pin temperature/max_tokens,
1551+
# model_settings should be passed through as ``None`` (matching the
1552+
# pre-issue-#1381 contract with PydanticAIAgent), not as ``{}``.
1553+
self.assertIsNone(
1554+
structured_call.kwargs.get("model_settings"),
1555+
"OpenAI structured runs without pins must pass model_settings=None",
15571556
)

opencontractserver/utils/llm.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""LLM/agent helper utilities."""
2+
3+
from typing import Optional
4+
5+
6+
def is_anthropic_model(model_name: Optional[str]) -> bool:
7+
"""Return True if ``model_name`` looks like an Anthropic / Claude model.
8+
9+
Accepts pydantic-ai-style ``"anthropic:..."`` prefixes and bare model
10+
names containing ``"claude"``. Used to decide whether to apply the
11+
Anthropic temperature guard in structured extraction (issue #1381).
12+
"""
13+
if not model_name:
14+
return False
15+
name = model_name.lower()
16+
return name.startswith("anthropic:") or "claude" in name

0 commit comments

Comments
 (0)