Skip to content

Commit deb0dc0

Browse files
committed
Address review: similarity_top_k closure default, fuzzy cap, magic number, kwargs split
- Wire AgentConfig.similarity_top_k into _make_similarity_search_tool's default_k so the configured retrieval depth wins when the LLM omits k (previously hard-coded k=8 partially defeated similarity_top_k plumbing).
- Bump MAX_DOC_LENGTH_FOR_FUZZY 50K -> 200K. Per-query timeout and n-gram anchor pre-filter are the real safety valves; the cap only needs to guard pathological inputs the timeout might miss.
- Replace the second 30000 magic-number site in OpenAIEmbedder.embed_texts_batch with the existing OPENAI_EMBEDDER_MAX_INPUT_CHARS constant.
- Pop similarity_top_k (config-time) from kwargs before forwarding to agent.structured_response (run-time) in api.py to make the routing explicit.
1 parent 97ceb29 commit deb0dc0

4 files changed

Lines changed: 26 additions & 19 deletions

File tree

opencontractserver/constants/extraction.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,12 @@
1717
# attempted. Fuzzy matching is O(n * m) where n = doc length and m = query
1818
# length, so it becomes prohibitively expensive on very large documents.
1919
# Documents exceeding this threshold fall back to exact + normalized only.
20-
# 50 KB covers most production legal documents (avg ~10-25 KB) while keeping
21-
# worst-case fuzzy cost bounded — was 500 KB which let 25 KB privacy-policy
22-
# corpora hang the grounder when an LLM paraphrased its answer past the
23-
# normalized-match tier.
24-
MAX_DOC_LENGTH_FOR_FUZZY = 50_000
20+
# 200 KB comfortably covers most production legal documents (MSAs, ISDA
21+
# schedules, EPC agreements routinely run 100-200 KB). Worst-case fuzzy
22+
# cost is already bounded by FUZZY_PER_QUERY_TIMEOUT_SECONDS and the
23+
# n-gram anchor pre-filter (FUZZY_ANCHOR_MIN_NGRAM_WORDS) — those are
24+
# the real safety valves; this cap is the outer guard.
25+
MAX_DOC_LENGTH_FOR_FUZZY = 200_000
2526

2627
# Maximum query length (in characters) accepted by the fuzzy fallback.
2728
# Some models occasionally return entire paragraphs as a single extracted

opencontractserver/llms/agents/pydantic_ai_agents.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,21 +105,19 @@
105105
T = TypeVar("T")
106106

107107

108-
def _make_similarity_search_tool(vector_store: Any) -> Callable:
108+
def _make_similarity_search_tool(vector_store: Any, default_k: int = 8) -> Callable:
109109
"""Build the citation-capturing similarity_search tool for a vector store.
110110
111-
Both the document and corpus agent factories used to define this closure
112-
inline; the only thing that differed was which vector store was bound.
113-
Centralising it here keeps the citation-accumulation contract — push
114-
every real annotation PK into ``ctx.deps.retrieved_annotation_ids`` —
115-
in a single place. The tool name remains ``similarity_search`` so
116-
downstream event handlers and source-linking logic are unaffected.
111+
``default_k`` is the LLM-facing default when the model does not supply
112+
its own ``k`` argument. Wired through from ``AgentConfig.similarity_top_k``
113+
so callers controlling retrieval depth via the config field actually win
114+
when the model omits ``k``.
117115
"""
118116

119117
async def similarity_search(
120118
ctx: RunContext[PydanticAIDependencies],
121119
query: str,
122-
k: int = 8,
120+
k: int = default_k,
123121
modalities: Optional[list[str]] = None,
124122
) -> list[dict[str, Any]]:
125123
"""Semantic vector search over the corpus annotations.
@@ -2104,7 +2102,9 @@ async def create(
21042102
# See ``_make_similarity_search_tool`` for the citation-accumulation
21052103
# contract; the tool name remains ``similarity_search`` so existing
21062104
# event handlers that match on the tool name continue to work.
2107-
default_vs_tool: Callable = _make_similarity_search_tool(vector_store)
2105+
default_vs_tool: Callable = _make_similarity_search_tool(
2106+
vector_store, default_k=config.similarity_top_k
2107+
)
21082108

21092109
# -----------------------------
21102110
# Auto-build pure passthrough tools from registry
@@ -2610,7 +2610,9 @@ async def create(
26102610

26112611
# See ``_make_similarity_search_tool`` for the shared citation-capturing
26122612
# closure used by both the document and corpus agent factories.
2613-
default_vs_tool: Callable = _make_similarity_search_tool(vector_store)
2613+
default_vs_tool: Callable = _make_similarity_search_tool(
2614+
vector_store, default_k=config.similarity_top_k
2615+
)
26142616

26152617
# -----------------------------
26162618
# Auto-build passthrough tools from registry

opencontractserver/llms/api.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,12 @@ async def get_structured_response_and_sources_from_document(
395395
if framework is None:
396396
framework = AgentFramework.PYDANTIC_AI
397397

398+
# Config-time kwargs (vector store / agent construction) belong on
399+
# ``for_document``; ``structured_response`` only accepts run-time
400+
# kwargs (filtered through its own allowlist), so don't double-pass.
401+
config_only_keys = {"similarity_top_k"}
402+
run_kwargs = {k: v for k, v in kwargs.items() if k not in config_only_keys}
403+
398404
agent = await AgentAPI.for_document(
399405
document=document,
400406
corpus=corpus,
@@ -416,7 +422,7 @@ async def get_structured_response_and_sources_from_document(
416422
model=model,
417423
temperature=temperature,
418424
max_tokens=max_tokens,
419-
**kwargs,
425+
**run_kwargs,
420426
)
421427

422428
# The tool implementations appended to this list during the run.

opencontractserver/pipeline/embedders/openai_embedder.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -287,13 +287,11 @@ def embed_texts_batch( # type: ignore[override]
287287
if not texts:
288288
return []
289289

290-
# Map original positions to the texts that survive the empty filter
291290
kept: list[tuple[int, str]] = []
292-
max_chars = 30000 # mirror _embed_text_impl's 8192-token guard
293291
for i, raw in enumerate(texts):
294292
if not raw or not raw.strip():
295293
continue
296-
kept.append((i, raw[:max_chars]))
294+
kept.append((i, raw[:OPENAI_EMBEDDER_MAX_INPUT_CHARS]))
297295

298296
# Output skeleton — slots for filtered-out texts stay None forever.
299297
out: list[Optional[list[float]]] = [None] * len(texts)

0 commit comments

Comments (0)