Skip to content

Commit c50ba12

Browse files
committed
Fix test expectations for intentional behavior changes; address review feedback
Test fixes (8 tests broken by PR's production changes):
- test_openai_embedder: assert max_retries=OPENAI_CLIENT_MAX_RETRIES kwarg on OpenAI() construction; rename rate-limit test to assert re-raise (issue #1380 routes RateLimitError to celery for retry instead of returning None).
- test_embeddings_task.TestArrayFormatHandling: patch _get_session for the sent_transformer microservice (PR routes through shared session instead of bare requests.post).
- test_corpus_forking: compare annotation JSON via compact_annotation_json on both sides; forked annotations are saved through Annotation.save() which now lazily compacts v1 PAWLs to v2.
- test_structural_annotation_sets: rename test to reflect new orphan-GC behavior — SAS is preserved only while another document references it (orphan path covered separately in test_orphan_structural_set_gc.py).

Review feedback (PR #1380):
- openai_embedder: extract magic 30000 to OPENAI_EMBEDDER_MAX_INPUT_CHARS in constants/document_processing.py; reuse at both call sites.
- pydantic_ai_agents: extract duplicated similarity_search closure into _make_similarity_search_tool factory; both document and corpus agent factories now share one citation-capturing implementation.
- data_extract_tasks: hoist _link_retrieval_citations to module level so the @sync_to_async wrapper isn't rebuilt on every Celery invocation; drop unused sync_add_sources helper. Tighten _classify_none_result parameter type from object to Sequence[Any] | None.
1 parent 94c5480 commit c50ba12

8 files changed

Lines changed: 171 additions & 144 deletions

File tree

opencontractserver/constants/document_processing.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,16 @@
5353
# Larger than the single timeout because batches process multiple texts.
5454
EMBEDDER_BATCH_REQUEST_TIMEOUT_SECONDS = 60
5555

56+
# Character-count guard for OpenAI embedding input. The hosted /embeddings
57+
# endpoint caps input at 8192 tokens per text; truncating on the char side
58+
# at ~4x the token budget (English averages ~4 chars/token) keeps us well
59+
# under the cap for any realistic input. Mirrors the silent-tokenizer
60+
# truncation that ``sentence-transformers`` applies locally so OpenAI users
61+
# get the same robustness instead of a fatal 400 "maximum context length"
62+
# from a long whole-document chunk. See ``OpenAIEmbedder._embed_text_impl``
63+
# and ``OpenAIEmbedder.embed_texts_batch``.
64+
OPENAI_EMBEDDER_MAX_INPUT_CHARS = 30_000
65+
5666
# HTTP request timeout (seconds) for reranker microservice calls.
5767
# Reranking typically runs over tens of candidates (top_k * oversample), so
5868
# a modest timeout is sufficient. Retrieval degrades gracefully to the

opencontractserver/llms/agents/pydantic_ai_agents.py

Lines changed: 49 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,48 @@
105105
T = TypeVar("T")
106106

107107

108+
def _make_similarity_search_tool(vector_store: Any) -> Callable:
109+
"""Build the citation-capturing similarity_search tool for a vector store.
110+
111+
Both the document and corpus agent factories used to define this closure
112+
inline; the only thing that differed was which vector store was bound.
113+
Centralising it here keeps the citation-accumulation contract — push
114+
every real annotation PK into ``ctx.deps.retrieved_annotation_ids`` —
115+
in a single place. The tool name remains ``similarity_search`` so
116+
downstream event handlers and source-linking logic are unaffected.
117+
"""
118+
119+
async def similarity_search(
120+
ctx: RunContext[PydanticAIDependencies],
121+
query: str,
122+
k: int = 8,
123+
modalities: Optional[list[str]] = None,
124+
) -> list[dict[str, Any]]:
125+
"""Semantic vector search over the corpus annotations.
126+
127+
Returns the top-k nearest annotations for ``query`` as a list of
128+
dicts with keys ``annotation_id``, ``content``, ``document_id``,
129+
``corpus_id``, ``page``, ``similarity_score``, ``label``, and
130+
``json``. Each real annotation's ID is captured into
131+
``ctx.deps.retrieved_annotation_ids`` so the caller can later link
132+
citations to the owning object (e.g. ``Datacell.sources``).
133+
"""
134+
results = await vector_store.similarity_search(
135+
query, k=k, modalities=modalities
136+
)
137+
for r in results:
138+
if not isinstance(r, dict):
139+
continue
140+
aid = r.get("annotation_id")
141+
# Real annotation PKs are positive ints; synthetic / ad-hoc
142+
# match IDs are negative and must not be persisted.
143+
if isinstance(aid, int) and aid > 0:
144+
ctx.deps.retrieved_annotation_ids.append(aid)
145+
return results
146+
147+
return similarity_search
148+
149+
108150
def _get_function_tools(agent: PydanticAIAgent) -> dict:
109151
"""Return the function-tools dict from a pydantic-ai Agent.
110152
@@ -2059,42 +2101,10 @@ async def create(
20592101
**_vs_kwargs
20602102
)
20612103

2062-
# Default vector search tool: wraps the store's bound method so we can
2063-
# append real annotation IDs returned by the retrieval to the per-run
2064-
# citation accumulator on ``ctx.deps``. Pydantic-AI inspects the
2065-
# signature and injects ``ctx`` because its first parameter is typed
2066-
# as ``RunContext[PydanticAIDependencies]``. The tool name is
2067-
# preserved as ``similarity_search`` so existing event handlers that
2068-
# match on the tool name continue to work.
2069-
async def similarity_search(
2070-
ctx: RunContext[PydanticAIDependencies],
2071-
query: str,
2072-
k: int = 8,
2073-
modalities: Optional[list[str]] = None,
2074-
) -> list[dict[str, Any]]:
2075-
"""Semantic vector search over the corpus annotations.
2076-
2077-
Returns the top-k nearest annotations for ``query`` as a list of
2078-
dicts with keys ``annotation_id``, ``content``, ``document_id``,
2079-
``corpus_id``, ``page``, ``similarity_score``, ``label``, and
2080-
``json``. Each real annotation's ID is captured into
2081-
``ctx.deps.retrieved_annotation_ids`` so the caller can later link
2082-
citations to the owning object (e.g. ``Datacell.sources``).
2083-
"""
2084-
results = await vector_store.similarity_search(
2085-
query, k=k, modalities=modalities
2086-
)
2087-
for r in results:
2088-
if not isinstance(r, dict):
2089-
continue
2090-
aid = r.get("annotation_id")
2091-
# Real annotation PKs are positive ints; synthetic / ad-hoc
2092-
# match IDs are negative and must not be persisted.
2093-
if isinstance(aid, int) and aid > 0:
2094-
ctx.deps.retrieved_annotation_ids.append(aid)
2095-
return results
2096-
2097-
default_vs_tool: Callable = similarity_search
2104+
# See ``_make_similarity_search_tool`` for the citation-accumulation
2105+
# contract; the tool name remains ``similarity_search`` so existing
2106+
# event handlers that match on the tool name continue to work.
2107+
default_vs_tool: Callable = _make_similarity_search_tool(vector_store)
20982108

20992109
# -----------------------------
21002110
# Auto-build pure passthrough tools from registry
@@ -2598,36 +2608,9 @@ async def create(
25982608
**_vs_kwargs
25992609
)
26002610

2601-
# Default vector search tool: wraps the store's bound method to
2602-
# capture real annotation IDs returned during retrieval. See the
2603-
# equivalent wrapper in ``PydanticAIDocumentAgent.create`` for the
2604-
# rationale — we preserve the tool name ``similarity_search`` so
2605-
# downstream event / source handling is unaffected.
2606-
async def similarity_search(
2607-
ctx: RunContext[PydanticAIDependencies],
2608-
query: str,
2609-
k: int = 8,
2610-
modalities: Optional[list[str]] = None,
2611-
) -> list[dict[str, Any]]:
2612-
"""Semantic vector search over the corpus annotations.
2613-
2614-
Returns the top-k nearest annotations for ``query`` as dicts.
2615-
Appends every real annotation PK returned to
2616-
``ctx.deps.retrieved_annotation_ids`` so the caller can link
2617-
citations to the owning object after the run completes.
2618-
"""
2619-
results = await vector_store.similarity_search(
2620-
query, k=k, modalities=modalities
2621-
)
2622-
for r in results:
2623-
if not isinstance(r, dict):
2624-
continue
2625-
aid = r.get("annotation_id")
2626-
if isinstance(aid, int) and aid > 0:
2627-
ctx.deps.retrieved_annotation_ids.append(aid)
2628-
return results
2629-
2630-
default_vs_tool: Callable = similarity_search
2611+
# See ``_make_similarity_search_tool`` for the shared citation-capturing
2612+
# closure used by both the document and corpus agent factories.
2613+
default_vs_tool: Callable = _make_similarity_search_tool(vector_store)
26312614

26322615
# -----------------------------
26332616
# Auto-build passthrough tools from registry

opencontractserver/pipeline/embedders/openai_embedder.py

Lines changed: 9 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,9 @@
44

55
import openai
66

7+
from opencontractserver.constants.document_processing import (
8+
OPENAI_EMBEDDER_MAX_INPUT_CHARS,
9+
)
710
from opencontractserver.constants.embeddings import (
811
DEFAULT_OPENAI_EMBEDDING_DIMENSIONS,
912
DEFAULT_OPENAI_EMBEDDING_MODEL,
@@ -184,23 +187,17 @@ def _embed_text_impl(self, text: str, **all_kwargs) -> Optional[list[float]]:
184187
)
185188
)
186189

187-
# OpenAI embeddings API caps input at 8192 tokens; a 400 "maximum
188-
# context length" is fatal to ingestion pipelines that produce
189-
# long chunks (e.g. whole-document summaries, un-capped paragraph
190-
# chunks of legalese). Local embedders like
191-
# ``sentence-transformers`` silently truncate via the tokenizer,
192-
# so users expect the same robustness here. Truncate on the char
193-
# side at ~4x the token budget (English averages ~4 chars/token)
194-
# to stay well under 8192 tokens for any realistic input.
195-
max_chars = 30000
196-
if len(text) > max_chars:
190+
# See OPENAI_EMBEDDER_MAX_INPUT_CHARS for the rationale behind the
191+
# truncation cap (mirrors the silent tokenizer truncation that
192+
# ``sentence-transformers`` applies locally).
193+
if len(text) > OPENAI_EMBEDDER_MAX_INPUT_CHARS:
197194
logger.warning(
198195
"OpenAIEmbedder truncating input from %d to %d chars to fit "
199196
"the 8192-token context window",
200197
len(text),
201-
max_chars,
198+
OPENAI_EMBEDDER_MAX_INPUT_CHARS,
202199
)
203-
text = text[:max_chars]
200+
text = text[:OPENAI_EMBEDDER_MAX_INPUT_CHARS]
204201

205202
client = self._build_client(**all_kwargs)
206203

opencontractserver/tasks/data_extract_tasks.py

Lines changed: 25 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
import json
33
import logging
44
import os
5+
from collections.abc import Sequence
6+
from typing import Any
57

68
from asgiref.sync import sync_to_async
79

@@ -162,36 +164,6 @@ def sync_get_corpus_id(document):
162164
return doc_path.corpus_id
163165
return None
164166

165-
@sync_to_async
166-
def sync_add_sources(datacell, sources):
167-
"""Add source annotations to datacell."""
168-
if sources:
169-
# Extract annotation IDs from SourceNode objects
170-
annotation_ids = [s.annotation_id for s in sources if s.annotation_id > 0]
171-
if annotation_ids:
172-
datacell.sources.add(*annotation_ids)
173-
174-
@sync_to_async
175-
def _link_retrieval_citations(datacell, annotation_ids):
176-
"""Link raw Annotation PKs retrieved by the agent to ``datacell.sources``.
177-
178-
Filters defensively: only positive ints that correspond to real
179-
Annotation rows are persisted. Duplicates are deduped by the M2M
180-
unique constraint, so ``add(*ids)`` with repeats is safe.
181-
"""
182-
from opencontractserver.annotations.models import Annotation
183-
184-
valid_ids = [a for a in annotation_ids if isinstance(a, int) and a > 0]
185-
if not valid_ids:
186-
return
187-
# Guard against IDs that don't exist (e.g. race with deletion).
188-
existing = set(
189-
Annotation.objects.filter(id__in=valid_ids).values_list("id", flat=True)
190-
)
191-
existing_ids = [aid for aid in valid_ids if aid in existing]
192-
if existing_ids:
193-
datacell.sources.add(*existing_ids)
194-
195167
# Initialize datacell to None to avoid UnboundLocalError
196168
datacell = None
197169

@@ -459,7 +431,29 @@ def _link_retrieval_citations(datacell, annotation_ids):
459431
raise
460432

461433

462-
def _classify_none_result(messages: object) -> tuple[str, str]:
434+
@sync_to_async
435+
def _link_retrieval_citations(datacell, annotation_ids):
436+
"""Link raw Annotation PKs retrieved by the agent to ``datacell.sources``.
437+
438+
Filters defensively: only positive ints that correspond to real
439+
Annotation rows are persisted. Duplicates are deduped by the M2M
440+
unique constraint, so ``add(*ids)`` with repeats is safe.
441+
"""
442+
from opencontractserver.annotations.models import Annotation
443+
444+
valid_ids = [a for a in annotation_ids if isinstance(a, int) and a > 0]
445+
if not valid_ids:
446+
return
447+
# Guard against IDs that don't exist (e.g. race with deletion).
448+
existing = set(
449+
Annotation.objects.filter(id__in=valid_ids).values_list("id", flat=True)
450+
)
451+
existing_ids = [aid for aid in valid_ids if aid in existing]
452+
if existing_ids:
453+
datacell.sources.add(*existing_ids)
454+
455+
456+
def _classify_none_result(messages: Sequence[Any] | None) -> tuple[str, str]:
463457
"""Categorise a ``result is None`` outcome from ``agent.run()``.
464458
465459
Reads the captured pydantic-ai message history (a list of

opencontractserver/tests/test_corpus_forking.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -336,7 +336,18 @@ def test_forked_annotation_field_integrity(self):
336336
# Core data fields must match
337337
self.assertEqual(forked.page, orig.page)
338338
self.assertEqual(forked.raw_text, orig.raw_text)
339-
self.assertEqual(forked.json, orig.json)
339+
# Forked annotations are saved through Annotation.save(), which
340+
# auto-compacts v1 PAWLs JSON to v2 (issue: lazy migration, see
341+
# compact_annotation_json). Compare both sides in compact form so
342+
# the test is format-agnostic.
343+
from opencontractserver.annotations.compact_json import (
344+
compact_annotation_json,
345+
)
346+
347+
self.assertEqual(
348+
compact_annotation_json(forked.json),
349+
compact_annotation_json(orig.json),
350+
)
340351
self.assertEqual(forked.annotation_type, orig.annotation_type)
341352

342353
# Creator should propagate

opencontractserver/tests/test_embeddings_task.py

Lines changed: 36 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1294,16 +1294,22 @@ def test_microservice_embedder_handles_1d_array(self):
12941294
embedder = MicroserviceEmbedder()
12951295

12961296
# Simulate 1D response: [0.1, 0.2, 0.3]
1297-
with patch("requests.post") as mock_post:
1298-
mock_response = MagicMock()
1299-
mock_response.status_code = 200
1300-
mock_response.json.return_value = {"embeddings": [0.1, 0.2, 0.3]}
1301-
mock_post.return_value = mock_response
1302-
1297+
# PR #1380 routes embedder requests through a shared session, so
1298+
# patch the session getter instead of the global requests.post.
1299+
mock_session = MagicMock()
1300+
mock_response = MagicMock()
1301+
mock_response.status_code = 200
1302+
mock_response.json.return_value = {"embeddings": [0.1, 0.2, 0.3]}
1303+
mock_session.post.return_value = mock_response
1304+
1305+
with patch(
1306+
"opencontractserver.pipeline.embedders.sent_transformer_microservice._get_session",
1307+
return_value=mock_session,
1308+
):
13031309
result = embedder.embed_text("test")
13041310

1305-
self.assertEqual(result, [0.1, 0.2, 0.3])
1306-
self.assertIsInstance(result, list)
1311+
self.assertEqual(result, [0.1, 0.2, 0.3])
1312+
self.assertIsInstance(result, list)
13071313

13081314
def test_microservice_embedder_handles_2d_array(self):
13091315
"""Test that MicroserviceEmbedder correctly handles 2D array responses."""
@@ -1314,16 +1320,20 @@ def test_microservice_embedder_handles_2d_array(self):
13141320
embedder = MicroserviceEmbedder()
13151321

13161322
# Simulate 2D response: [[0.1, 0.2, 0.3]]
1317-
with patch("requests.post") as mock_post:
1318-
mock_response = MagicMock()
1319-
mock_response.status_code = 200
1320-
mock_response.json.return_value = {"embeddings": [[0.1, 0.2, 0.3]]}
1321-
mock_post.return_value = mock_response
1322-
1323+
mock_session = MagicMock()
1324+
mock_response = MagicMock()
1325+
mock_response.status_code = 200
1326+
mock_response.json.return_value = {"embeddings": [[0.1, 0.2, 0.3]]}
1327+
mock_session.post.return_value = mock_response
1328+
1329+
with patch(
1330+
"opencontractserver.pipeline.embedders.sent_transformer_microservice._get_session",
1331+
return_value=mock_session,
1332+
):
13231333
result = embedder.embed_text("test")
13241334

1325-
self.assertEqual(result, [0.1, 0.2, 0.3])
1326-
self.assertIsInstance(result, list)
1335+
self.assertEqual(result, [0.1, 0.2, 0.3])
1336+
self.assertIsInstance(result, list)
13271337

13281338
def test_multimodal_embedder_handles_1d_array(self):
13291339
"""Test that CLIPMicroserviceEmbedder correctly handles 1D array responses."""
@@ -1377,15 +1387,19 @@ def test_microservice_embedder_settings_none_fallback(self):
13771387
# Force settings to None to exercise the fallback path
13781388
embedder._settings = None
13791389

1380-
with patch("requests.post") as mock_post:
1381-
mock_response = MagicMock()
1382-
mock_response.status_code = 200
1383-
mock_response.json.return_value = {"embeddings": [0.1, 0.2, 0.3]}
1384-
mock_post.return_value = mock_response
1390+
mock_session = MagicMock()
1391+
mock_response = MagicMock()
1392+
mock_response.status_code = 200
1393+
mock_response.json.return_value = {"embeddings": [0.1, 0.2, 0.3]}
1394+
mock_session.post.return_value = mock_response
13851395

1396+
with patch(
1397+
"opencontractserver.pipeline.embedders.sent_transformer_microservice._get_session",
1398+
return_value=mock_session,
1399+
):
13861400
result = embedder.embed_text("test")
13871401

1388-
self.assertEqual(result, [0.1, 0.2, 0.3])
1402+
self.assertEqual(result, [0.1, 0.2, 0.3])
13891403

13901404
def test_clip_embedder_settings_none_fallback(self):
13911405
"""CLIPMicroserviceEmbedder._get_service_config falls back to Settings()."""

0 commit comments

Comments (0)