Skip to content

Commit deb0dc0

Browse files
committed
Address review: similarity_top_k closure default, fuzzy cap, magic number, kwargs split
- Wire AgentConfig.similarity_top_k into _make_similarity_search_tool's default_k so the configured retrieval depth wins when the LLM omits k (previously hard-coded k=8 partially defeated similarity_top_k plumbing).
- Bump MAX_DOC_LENGTH_FOR_FUZZY 50K -> 200K. Per-query timeout and n-gram anchor pre-filter are the real safety valves; the cap only needs to guard pathological inputs the timeout might miss.
- Replace the second 30000 magic-number site in OpenAIEmbedder.embed_texts_batch with the existing OPENAI_EMBEDDER_MAX_INPUT_CHARS constant.
- Pop similarity_top_k (config-time) from kwargs before forwarding to agent.structured_response (run-time) in api.py to make the routing explicit.
1 parent 97ceb29 commit deb0dc0

4 files changed

Lines changed: 26 additions & 19 deletions

File tree

opencontractserver/constants/extraction.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,12 @@
1717
# attempted. Fuzzy matching is O(n * m) where n = doc length and m = query
1818
# length, so it becomes prohibitively expensive on very large documents.
1919
# Documents exceeding this threshold fall back to exact + normalized only.
20-
# 50 KB covers most production legal documents (avg ~10-25 KB) while keeping
21-
# worst-case fuzzy cost bounded — was 500 KB which let 25 KB privacy-policy
22-
# corpora hang the grounder when an LLM paraphrased its answer past the
23-
# normalized-match tier.
24-
MAX_DOC_LENGTH_FOR_FUZZY = 50_000
20+
# 200 KB comfortably covers most production legal documents (MSAs, ISDA
21+
# schedules, EPC agreements routinely run 100-200 KB). Worst-case fuzzy
22+
# cost is already bounded by FUZZY_PER_QUERY_TIMEOUT_SECONDS and the
23+
# n-gram anchor pre-filter (FUZZY_ANCHOR_MIN_NGRAM_WORDS) — those are
24+
# the real safety valves; this cap is the outer guard.
25+
MAX_DOC_LENGTH_FOR_FUZZY = 200_000
2526

2627
# Maximum query length (in characters) accepted by the fuzzy fallback.
2728
# Some models occasionally return entire paragraphs as a single extracted

opencontractserver/llms/agents/pydantic_ai_agents.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,21 +105,19 @@
105105
T = TypeVar("T")
106106

107107

108-
def _make_similarity_search_tool(vector_store: Any) -> Callable:
108+
def _make_similarity_search_tool(vector_store: Any, default_k: int = 8) -> Callable:
109109
"""Build the citation-capturing similarity_search tool for a vector store.
110110
111-
Both the document and corpus agent factories used to define this closure
112-
inline; the only thing that differed was which vector store was bound.
113-
Centralising it here keeps the citation-accumulation contract — push
114-
every real annotation PK into ``ctx.deps.retrieved_annotation_ids`` —
115-
in a single place. The tool name remains ``similarity_search`` so
116-
downstream event handlers and source-linking logic are unaffected.
111+
``default_k`` is the LLM-facing default when the model does not supply
112+
its own ``k`` argument. Wired through from ``AgentConfig.similarity_top_k``
113+
so callers controlling retrieval depth via the config field actually win
114+
when the model omits ``k``.
117115
"""
118116

119117
async def similarity_search(
120118
ctx: RunContext[PydanticAIDependencies],
121119
query: str,
122-
k: int = 8,
120+
k: int = default_k,
123121
modalities: Optional[list[str]] = None,
124122
) -> list[dict[str, Any]]:
125123
"""Semantic vector search over the corpus annotations.
@@ -2104,7 +2102,9 @@ async def create(
21042102
# See ``_make_similarity_search_tool`` for the citation-accumulation
21052103
# contract; the tool name remains ``similarity_search`` so existing
21062104
# event handlers that match on the tool name continue to work.
2107-
default_vs_tool: Callable = _make_similarity_search_tool(vector_store)
2105+
default_vs_tool: Callable = _make_similarity_search_tool(
2106+
vector_store, default_k=config.similarity_top_k
2107+
)
21082108

21092109
# -----------------------------
21102110
# Auto-build pure passthrough tools from registry
@@ -2610,7 +2610,9 @@ async def create(
26102610

26112611
# See ``_make_similarity_search_tool`` for the shared citation-capturing
26122612
# closure used by both the document and corpus agent factories.
2613-
default_vs_tool: Callable = _make_similarity_search_tool(vector_store)
2613+
default_vs_tool: Callable = _make_similarity_search_tool(
2614+
vector_store, default_k=config.similarity_top_k
2615+
)
26142616

26152617
# -----------------------------
26162618
# Auto-build passthrough tools from registry

opencontractserver/llms/api.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,12 @@ async def get_structured_response_and_sources_from_document(
395395
if framework is None:
396396
framework = AgentFramework.PYDANTIC_AI
397397

398+
# Config-time kwargs (vector store / agent construction) belong on
399+
# ``for_document``; ``structured_response`` only accepts run-time
400+
# kwargs (filtered through its own allowlist), so don't double-pass.
401+
config_only_keys = {"similarity_top_k"}
402+
run_kwargs = {k: v for k, v in kwargs.items() if k not in config_only_keys}
403+
398404
agent = await AgentAPI.for_document(
399405
document=document,
400406
corpus=corpus,
@@ -416,7 +422,7 @@ async def get_structured_response_and_sources_from_document(
416422
model=model,
417423
temperature=temperature,
418424
max_tokens=max_tokens,
419-
**kwargs,
425+
**run_kwargs,
420426
)
421427

422428
# The tool implementations appended to this list during the run.

opencontractserver/pipeline/embedders/openai_embedder.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -287,13 +287,11 @@ def embed_texts_batch( # type: ignore[override]
287287
if not texts:
288288
return []
289289

290-
# Map original positions to the texts that survive the empty filter
291290
kept: list[tuple[int, str]] = []
292-
max_chars = 30000 # mirror _embed_text_impl's 8192-token guard
293291
for i, raw in enumerate(texts):
294292
if not raw or not raw.strip():
295293
continue
296-
kept.append((i, raw[:max_chars]))
294+
kept.append((i, raw[:OPENAI_EMBEDDER_MAX_INPUT_CHARS]))
297295

298296
# Output skeleton — slots for filtered-out texts stay None forever.
299297
out: list[Optional[list[float]]] = [None] * len(texts)

0 commit comments

Comments (0)