Address review: clarify control flow, respect config.temperature, durable error msgs

JSv4 · JSv4 · commit 81b4c6825451 · 2026-04-27T23:00:31.000-05:00
- Initialize llm_log/messages at top of try-block so outer except branches don't depend on locals() introspection
- Replace 'X in locals()' guards with plain checks on (or direct use of) the None-initialized variables
- Anthropic temperature override now respects a non-zero config.temperature instead of silently overriding it
- Failure messages reference llm_call_log instead of issue #1381 (durable phrasing)
- Tool-loop fingerprinting tries json.dumps(sort_keys=True, default=str) first and falls back to repr on failure
- Test updated to match the durable phrasing
diff --git a/opencontractserver/llms/agents/pydantic_ai_agents.py b/opencontractserver/llms/agents/pydantic_ai_agents.py
@@ -1351,15 +1351,19 @@ async def _structured_response_raw(
             # Anthropic models tend to keep narrating / calling tools instead
             # of committing to the structured output when given any wiggle
             # room (issue #1381). Force temperature down to 0 unless the
-            # caller explicitly asked for something else.
+            # caller explicitly asked for something else (function-level
+            # temperature pin OR a non-zero config.temperature).
             effective_model = model or self.config.model_name
-            if _is_anthropic_model(effective_model) and temperature is None:
-                if self.config.temperature is None or self.config.temperature > 0:
-                    logger.info(
-                        "Forcing temperature=0 for structured extraction with "
-                        "Anthropic model %s (issue #1381).",
-                        effective_model,
-                    )
+            if (
+                _is_anthropic_model(effective_model)
+                and temperature is None
+                and (self.config.temperature is None or self.config.temperature == 0)
+            ):
+                logger.info(
+                    "Forcing temperature=0 for structured extraction with "
+                    "Anthropic model %s (issue #1381).",
+                    effective_model,
+                )
                 model_settings["temperature"] = 0
 
             # Seed tools from the main agent so the structured run has the same capabilities
diff --git a/opencontractserver/tasks/data_extract_tasks.py b/opencontractserver/tasks/data_extract_tasks.py
@@ -83,7 +83,11 @@ def _classify_none_result(messages: Optional[list[Any]]) -> str:
                 if tool_name.startswith("final_result"):
                     saw_final_result = True
                 else:
-                    args_repr = repr(getattr(part, "args", None))
+                    raw_args = getattr(part, "args", None)
+                    try:
+                        args_repr = json.dumps(raw_args, sort_keys=True, default=str)
+                    except (TypeError, ValueError):
+                        args_repr = repr(raw_args)
                     tool_call_signatures.append((tool_name, args_repr))
 
     if saw_final_result:
@@ -113,13 +117,15 @@ def _failure_message_for_classification(classification: str) -> str:
             "The extraction agent never produced a final structured response. "
             "This is an integration failure (the model exhausted its tool-use "
             "budget without committing to the result tool), not a statement "
-            "about the document. See issue #1381."
+            "about the document. Check ``llm_call_log`` for the raw message "
+            "history."
         )
     if classification == NONE_RESULT_TOOL_LOOP:
         return (
             "The extraction agent looped on the same tool call without "
             "producing a final structured response. This is an integration "
-            "failure, not a statement about the document. See issue #1381."
+            "failure, not a statement about the document. Check "
+            "``llm_call_log`` for the repeated tool call."
         )
     return (
         "The extraction returned None and the cause could not be classified. "
@@ -269,8 +275,9 @@ def sync_add_sources(datacell, sources):
             if annotation_ids:
                 datacell.sources.add(*annotation_ids)
 
-    # Initialize datacell to None to avoid UnboundLocalError
+    # Initialize to None to avoid UnboundLocalError in the outer except block
     datacell = None
+    llm_log: Optional[str] = None
 
     logger.info("=" * 60)
     logger.info(f"doc_extract_query_task STARTED for cell_id: {cell_id}")
@@ -371,7 +378,7 @@ def sync_add_sources(datacell, sources):
         logger.info(f"  - Corpus ID: {corpus_id}")
 
         # Capture LLM messages for debugging
-        llm_log = None
+        messages: Optional[list[Any]] = None
 
         try:
             # Wrap the agent call in the context manager to capture messages
@@ -396,7 +403,7 @@ def sync_add_sources(datacell, sources):
 
         except Exception as e:
             # If we have messages, capture them before re-raising
-            if "messages" in locals():
+            if messages:
                 llm_log = ModelMessagesTypeAdapter.dump_json(
                     messages, indent=2
                 ).decode()
@@ -454,9 +461,7 @@ def sync_add_sources(datacell, sources):
             # Extraction returned None — classify *why* so operators can
             # distinguish legitimate "data not present" outcomes from
             # pipeline bugs (issue #1381).
-            classification = _classify_none_result(
-                messages if "messages" in locals() else None
-            )
+            classification = _classify_none_result(messages)
             failure_message = _failure_message_for_classification(classification)
             logger.warning(
                 f"✗ Extraction returned None for cell {cell_id} "
@@ -477,9 +482,7 @@ def sync_add_sources(datacell, sources):
         # Only try to mark failed if we have a datacell
         if datacell:
             # Pass llm_log if we have it
-            await sync_mark_failed(
-                datacell, e, tb, llm_log if "llm_log" in locals() else None
-            )
+            await sync_mark_failed(datacell, e, tb, llm_log)
         else:
             logger.error(f"Failed to get datacell for cell_id {cell_id}: {e}\n{tb}")
         raise
diff --git a/opencontractserver/tests/test_data_extract_failure_classification.py b/opencontractserver/tests/test_data_extract_failure_classification.py
@@ -131,12 +131,12 @@ def test_messages_are_distinct_per_classification(self) -> None:
         self.assertIn("integration failure", messages[NONE_RESULT_NO_FINAL])
         self.assertIn("looped", messages[NONE_RESULT_TOOL_LOOP])
 
-    def test_integration_failure_messages_reference_issue(self) -> None:
-        """Operators need to find the GitHub issue from the cell stacktrace."""
+    def test_integration_failure_messages_reference_log(self) -> None:
+        """Operators need a pointer to the raw conversation in the cell stacktrace."""
         for classification in (NONE_RESULT_NO_FINAL, NONE_RESULT_TOOL_LOOP):
             with self.subTest(classification=classification):
                 self.assertIn(
-                    "#1381",
+                    "llm_call_log",
                     _failure_message_for_classification(classification),
                 )