Skip to content

Commit a2aa94d

Browse files
committed
Address review: bugs, perf, style fixes
- runner.py: set finished_at before BenchmarkReport snapshot - metrics.py: token_recall("","") now returns 0.0 with warning to mirror char_recall and avoid silently inflating aggregate F1 - legalbench_rag.py: precompute keys with isolated random.Random so the sort no longer mutates global random state - data_extract_tasks.py: document model_override trust assumption - local.yml: comment API_KEY as local-only placeholder - retrieval.py: elevate struct_set→doc cache to module level keyed by (corpus_id, struct_set_id) so a full benchmark run amortises lookups - text_alignment.py: hoist doc_text.lower() out of per-query loop - run_benchmark.py: replace magic 194 with PAPER_MAX_TESTS_PER_BENCHMARK - constants/benchmarks.py: derive TRIM_LEN from MAX_LEN - cross_encoder_reranker.py: rename comprehension var to avoid shadowing - text_chunkers.py: tighten _INVISIBLE_CHARS_RE to format chars only, preserving en/em dash and thin space
1 parent 833f31d commit a2aa94d

11 files changed

Lines changed: 107 additions & 35 deletions

File tree

local.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,17 @@ services:
8686
image: ghcr.io/open-source-legal/docxodus-service:1.1.0-docxodus5.4.2
8787
container_name: docxodus-parser
8888

89+
# API_KEY values below are LOCAL DEVELOPMENT ONLY placeholders. Override
90+
# via environment / .env before exposing these services on any network
91+
# other than the local docker bridge.
8992
vector-embedder:
9093
image: ghcr.io/jsv4/vectorembeddermicroservice:latest
9194
container_name: vector-embedder
9295
environment:
9396
PORT: 8000
9497
TRANSFORMERS_OFFLINE: 1
9598
HF_DATASETS_OFFLINE: 1
96-
API_KEY: abc123
99+
API_KEY: abc123 # local-only placeholder; override before deploying
97100

98101
multimodal-embedder:
99102
image: ghcr.io/jsv4/vectorembeddermicroservice-multimodal:latest
@@ -102,7 +105,7 @@ services:
102105
PORT: 8000
103106
TRANSFORMERS_OFFLINE: 1
104107
HF_DATASETS_OFFLINE: 1
105-
API_KEY: abc123
108+
API_KEY: abc123 # local-only placeholder; override before deploying
106109

107110
celeryworker:
108111
image: opencontractserver_local_django

opencontractserver/benchmarks/adapters/legalbench_rag.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,18 @@ def _paper_sample_tests(
9898
continue
9999
valid.append(test)
100100

101-
def _seed_key(test: dict[str, Any]) -> float:
102-
random.seed(test["snippets"][0]["file_path"])
103-
return random.random()
104-
105-
valid.sort(key=_seed_key)
101+
# Precompute sort keys with an isolated ``random.Random`` instance
102+
# so we don't mutate the process-wide random state on every sort
103+
# comparison. Upstream code seeds the global PRNG inside the sort
104+
# key; we get the same per-test deterministic value without the
105+
# global side effect.
106+
def _seed_value(file_path: str) -> float:
107+
rng = random.Random()
108+
rng.seed(file_path)
109+
return rng.random()
110+
111+
keys = {id(test): _seed_value(test["snippets"][0]["file_path"]) for test in valid}
112+
valid.sort(key=lambda test: keys[id(test)])
106113
return valid[:max_per_subset]
107114

108115

opencontractserver/benchmarks/management/commands/run_benchmark.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
from opencontractserver.benchmarks.adapters.legalbench_rag import (
2626
LEGALBENCH_RAG_SUBSETS,
27+
PAPER_MAX_TESTS_PER_BENCHMARK,
2728
LegalBenchRAGAdapter,
2829
)
2930
from opencontractserver.benchmarks.runner import run_benchmark
@@ -118,10 +119,11 @@ def add_arguments(self, parser: argparse.ArgumentParser) -> None:
118119
parser.add_argument(
119120
"--max-per-subset",
120121
type=int,
121-
default=194,
122+
default=PAPER_MAX_TESTS_PER_BENCHMARK,
122123
help=(
123124
"Per-subset cap when --paper-sampling is on. Defaults to "
124-
"upstream's MAX_TESTS_PER_BENCHMARK = 194."
125+
"upstream's MAX_TESTS_PER_BENCHMARK "
126+
f"= {PAPER_MAX_TESTS_PER_BENCHMARK}."
125127
),
126128
)
127129
parser.add_argument(
@@ -194,7 +196,9 @@ def handle(self, *args, **options) -> None:
194196
# Today only LegalBenchRAGAdapter does; future adapters can opt in.
195197
if benchmark_name == "legalbench-rag":
196198
adapter_kwargs["paper_sampling"] = options.get("paper_sampling", True)
197-
adapter_kwargs["max_per_subset"] = options.get("max_per_subset", 194)
199+
adapter_kwargs["max_per_subset"] = options.get(
200+
"max_per_subset", PAPER_MAX_TESTS_PER_BENCHMARK
201+
)
198202
# mypy can't statically narrow ``adapter_kwargs: dict[str, object]``
199203
# against each adapter subclass's specific parameter types. The dict
200204
# values are sourced from argparse, which already validated them.

opencontractserver/benchmarks/metrics.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,13 @@
1212

1313
from __future__ import annotations
1414

15+
import logging
1516
import re
1617
import string
1718
from collections.abc import Iterable, Sequence
1819

20+
logger = logging.getLogger(__name__)
21+
1922
Span = tuple[int, int]
2023

2124
# --------------------------------------------------------------------------- #
@@ -97,7 +100,15 @@ def token_recall(prediction: str, gold: str) -> float:
97100
pred_tokens = set(normalize_answer(prediction).split())
98101
gold_tokens = list(normalize_answer(gold).split())
99102
if not gold_tokens:
100-
return 0.0 if pred_tokens else 1.0
103+
# Empty gold is almost always a data-quality bug; treating
104+
# ``token_recall("", "")`` as 1.0 silently inflates aggregate
105+
# F1. Mirror :func:`char_recall` and return 0.0 with a warning
106+
# so the operator sees the offending row.
107+
logger.warning(
108+
"token_recall called with empty gold; returning 0.0. "
109+
"This usually indicates a data-quality issue in the dataset."
110+
)
111+
return 0.0
101112
unique_gold = set(gold_tokens)
102113
if not pred_tokens:
103114
return 0.0

opencontractserver/benchmarks/retrieval.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,25 @@
2727
Span = tuple[int, int]
2828

2929

30+
# Module-level cache for ``StructuralAnnotationSet → Document`` resolution
31+
# inside a benchmark process. A full LegalBench-RAG run probes ~776
32+
# queries × top_k=32 hits and reuses a small set of structural sets, so
33+
# making the cache call-local would re-issue the same query hundreds of
34+
# times. Keying on ``(corpus_id, struct_set_id)`` keeps separate corpora
35+
# isolated when the same struct_set appears in multiple corpora.
36+
_STRUCT_SET_TO_DOC: dict[tuple[int, int], int | None] = {}
37+
38+
39+
def _clear_struct_set_cache() -> None:
40+
"""Test hook: drop cached struct-set→document resolutions.
41+
42+
The cache is process-wide; tests that recreate corpora between
43+
cases need to clear it so a stale ``Document.id`` from a torn-down
44+
fixture isn't returned.
45+
"""
46+
_STRUCT_SET_TO_DOC.clear()
47+
48+
3049
@dataclass
3150
class RetrievalResult:
3251
"""Retrieval probe output for a single (query, document) pair.
@@ -113,19 +132,19 @@ def probe_retrieval(
113132
# off ``StructuralAnnotationSet`` (shared across documents with the
114133
# same content hash). To produce a per-result document_id we resolve
115134
# ``structural_set_id`` → ``Document.id`` via the reverse FK on
116-
# Document. Cache lookups per-call so a top-k of 64 doesn't trigger
117-
# 64 queries when many hits share the same set.
135+
# Document. The cache is module-level (keyed by corpus_id) so a
136+
# full benchmark run amortises the lookup across all probe calls
137+
# rather than re-querying once per-call.
118138
from opencontractserver.documents.models import Document
119139

120-
struct_set_to_doc: dict[int, int | None] = {}
121-
122140
def _resolve_doc_id(annotation: Annotation) -> int | None:
123141
if annotation.document_id is not None:
124142
return annotation.document_id
125143
struct_set_id = annotation.structural_set_id
126144
if struct_set_id is None:
127145
return None
128-
if struct_set_id not in struct_set_to_doc:
146+
cache_key = (corpus_id, struct_set_id)
147+
if cache_key not in _STRUCT_SET_TO_DOC:
129148
# Resolve to the Document in the *target corpus* — across
130149
# benchmark runs the same content_hash can reappear, and
131150
# ``CoreAnnotationVectorStore``'s structural-annotation
@@ -142,8 +161,8 @@ def _resolve_doc_id(annotation: Annotation) -> int | None:
142161
.distinct()
143162
.first()
144163
)
145-
struct_set_to_doc[struct_set_id] = doc.id if doc else None
146-
return struct_set_to_doc[struct_set_id]
164+
_STRUCT_SET_TO_DOC[cache_key] = doc.id if doc else None
165+
return _STRUCT_SET_TO_DOC[cache_key]
147166

148167
for hit in results:
149168
annotation: Annotation = hit.annotation

opencontractserver/benchmarks/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ def run_benchmark(
175175
corpus_wide=corpus_wide,
176176
)
177177

178+
config["finished_at"] = timezone.now().isoformat()
178179
report = BenchmarkReport(
179180
adapter=loaded.adapter_description,
180181
config=dict(config), # snapshot; avoid aliasing with the mutable local
@@ -183,7 +184,6 @@ def run_benchmark(
183184
task_results=task_results,
184185
)
185186
# compute_aggregates() is auto-called unconditionally by __post_init__.
186-
config["finished_at"] = timezone.now().isoformat()
187187

188188
if write_report:
189189
resolved_run_dir = _resolve_run_dir(

opencontractserver/constants/benchmarks.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
BENCHMARK_QUERY_PREVIEW_MAX_LEN = 64
2525

2626
# Character budget left for the query preview after reserving room for the
27-
# trailing "…" ellipsis marker. Equals
28-
# ``BENCHMARK_QUERY_PREVIEW_MAX_LEN - 1``.
29-
BENCHMARK_QUERY_PREVIEW_TRIM_LEN = 63
27+
# trailing "…" ellipsis marker. Derived from
28+
# ``BENCHMARK_QUERY_PREVIEW_MAX_LEN`` so changing the cap auto-updates the
29+
# trim point — defining as a literal here invites silent skew when the
30+
# cap moves.
31+
BENCHMARK_QUERY_PREVIEW_TRIM_LEN = BENCHMARK_QUERY_PREVIEW_MAX_LEN - 1

opencontractserver/pipeline/parsers/text_chunkers.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -242,15 +242,18 @@ def chunk(self, text: str) -> Iterator[TextChunk]:
242242
# even when the separator width varies.
243243
_PARAGRAPH_SEPARATOR_RE = re.compile(r"\n[ \t]*(?:\n[ \t]*)+")
244244

245-
# Characters that look like whitespace to a human but aren't matched by
246-
# Python's ``str.strip()``: zero-width spaces, BOM, soft hyphens, etc. A
247-
# paragraph composed only of these characters has no embeddable content
248-
# and must be dropped — otherwise the embedding microservice tokenises
249-
# it down to an empty input and computes mean-of-empty, which returns
250-
# NaN and aborts the entire ingest pipeline. Observed in CUAD documents
251-
# (e.g. JuniperPharmaceuticalsInc_…) where copy-paste artifacts left
252-
# runs of ``​`` characters between real paragraphs.
253-
_INVISIBLE_CHARS_RE = re.compile(r"[<raw invisible Unicode characters — unrenderable in this view>]")
245+
# Targeted allowlist of format characters (Unicode category Cf) — NOT
246+
# the whole General Punctuation block. Typographic characters with
247+
# content meaning (en dash, em dash, thin space, …) must not be
248+
# stripped. A paragraph composed only of these characters has no
249+
# embeddable content and must be dropped — otherwise the embedding
250+
# microservice tokenises it down to an empty input and computes
251+
# mean-of-empty, which returns NaN and aborts ingest. Observed in
252+
# CUAD documents where copy-paste artifacts left runs of U+200B
253+
# (ZWSP) characters between real paragraphs.
254+
_INVISIBLE_CHARS_RE = re.compile(
255+
"[\u00ad\u180e\u200b-\u200f\u202a-\u202e\u2060-\u2064\u2066-\u2069\ufeff]"
256+
)
254257

255258

256259
@register_chunker

opencontractserver/pipeline/rerankers/cross_encoder_reranker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def _rerank_impl(
183183
# Some cross-encoders (activation=sigmoid) return 0D arrays per pair
184184
# or numpy floats — normalize to Python floats.
185185
try:
186-
score_list = [float(s) for s in scores]
186+
score_list = [float(score) for score in scores]
187187
except TypeError:
188188
# Single-pair responses may come back as a scalar
189189
score_list = [float(scores)]

opencontractserver/tasks/data_extract_tasks.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,15 @@ async def doc_extract_query_task(
105105
``"anthropic:claude-opus-4-6"``). When provided it overrides the
106106
default extraction model. Used by the benchmark runner to sweep
107107
models without touching production defaults.
108+
109+
Trust assumption: this string is passed straight to the agent
110+
factory and ultimately to the model registry. Current call
111+
sites (CLI ``run_benchmark`` command, internal benchmark
112+
runner) are operator-controlled. If this task is ever
113+
exposed to user-controlled input (webhook, public API), gate
114+
it behind an allowlist of approved model identifiers — an
115+
arbitrary string here can redirect extraction traffic to an
116+
unintended model endpoint.
108117
"""
109118
import traceback
110119
from typing import get_origin

0 commit comments

Comments
 (0)