Skip to content

Commit a2aa94d

Browse files
committed
Address review: bugs, perf, style fixes
- runner.py: set finished_at before BenchmarkReport snapshot - metrics.py: token_recall("","") now returns 0.0 with warning to mirror char_recall and avoid silently inflating aggregate F1 - legalbench_rag.py: precompute keys with isolated random.Random so the sort no longer mutates global random state - data_extract_tasks.py: document model_override trust assumption - local.yml: comment API_KEY as local-only placeholder - retrieval.py: elevate struct_set→doc cache to module level keyed by (corpus_id, struct_set_id) so a full benchmark run amortises lookups - text_alignment.py: hoist doc_text.lower() out of per-query loop - run_benchmark.py: replace magic 194 with PAPER_MAX_TESTS_PER_BENCHMARK - constants/benchmarks.py: derive TRIM_LEN from MAX_LEN - cross_encoder_reranker.py: rename comprehension var to avoid shadowing - text_chunkers.py: tighten _INVISIBLE_CHARS_RE to format chars only, preserving en/em dash and thin space
1 parent 833f31d commit a2aa94d

11 files changed

Lines changed: 107 additions & 35 deletions

File tree

local.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,17 @@ services:
8686
image: ghcr.io/open-source-legal/docxodus-service:1.1.0-docxodus5.4.2
8787
container_name: docxodus-parser
8888

89+
# API_KEY values below are LOCAL DEVELOPMENT ONLY placeholders. Override
90+
# via environment / .env before exposing these services on any network
91+
# other than the local docker bridge.
8992
vector-embedder:
9093
image: ghcr.io/jsv4/vectorembeddermicroservice:latest
9194
container_name: vector-embedder
9295
environment:
9396
PORT: 8000
9497
TRANSFORMERS_OFFLINE: 1
9598
HF_DATASETS_OFFLINE: 1
96-
API_KEY: abc123
99+
API_KEY: abc123 # local-only placeholder; override before deploying
97100

98101
multimodal-embedder:
99102
image: ghcr.io/jsv4/vectorembeddermicroservice-multimodal:latest
@@ -102,7 +105,7 @@ services:
102105
PORT: 8000
103106
TRANSFORMERS_OFFLINE: 1
104107
HF_DATASETS_OFFLINE: 1
105-
API_KEY: abc123
108+
API_KEY: abc123 # local-only placeholder; override before deploying
106109

107110
celeryworker:
108111
image: opencontractserver_local_django

opencontractserver/benchmarks/adapters/legalbench_rag.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,18 @@ def _paper_sample_tests(
9898
continue
9999
valid.append(test)
100100

101-
def _seed_key(test: dict[str, Any]) -> float:
102-
random.seed(test["snippets"][0]["file_path"])
103-
return random.random()
104-
105-
valid.sort(key=_seed_key)
101+
# Precompute sort keys with an isolated ``random.Random`` instance
102+
# so we don't mutate the process-wide random state on every sort
103+
# comparison. Upstream code seeds the global PRNG inside the sort
104+
# key; we get the same per-test deterministic value without the
105+
# global side effect.
106+
def _seed_value(file_path: str) -> float:
107+
rng = random.Random()
108+
rng.seed(file_path)
109+
return rng.random()
110+
111+
keys = {id(test): _seed_value(test["snippets"][0]["file_path"]) for test in valid}
112+
valid.sort(key=lambda test: keys[id(test)])
106113
return valid[:max_per_subset]
107114

108115

opencontractserver/benchmarks/management/commands/run_benchmark.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
from opencontractserver.benchmarks.adapters.legalbench_rag import (
2626
LEGALBENCH_RAG_SUBSETS,
27+
PAPER_MAX_TESTS_PER_BENCHMARK,
2728
LegalBenchRAGAdapter,
2829
)
2930
from opencontractserver.benchmarks.runner import run_benchmark
@@ -118,10 +119,11 @@ def add_arguments(self, parser: argparse.ArgumentParser) -> None:
118119
parser.add_argument(
119120
"--max-per-subset",
120121
type=int,
121-
default=194,
122+
default=PAPER_MAX_TESTS_PER_BENCHMARK,
122123
help=(
123124
"Per-subset cap when --paper-sampling is on. Defaults to "
124-
"upstream's MAX_TESTS_PER_BENCHMARK = 194."
125+
"upstream's MAX_TESTS_PER_BENCHMARK "
126+
f"= {PAPER_MAX_TESTS_PER_BENCHMARK}."
125127
),
126128
)
127129
parser.add_argument(
@@ -194,7 +196,9 @@ def handle(self, *args, **options) -> None:
194196
# Today only LegalBenchRAGAdapter does; future adapters can opt in.
195197
if benchmark_name == "legalbench-rag":
196198
adapter_kwargs["paper_sampling"] = options.get("paper_sampling", True)
197-
adapter_kwargs["max_per_subset"] = options.get("max_per_subset", 194)
199+
adapter_kwargs["max_per_subset"] = options.get(
200+
"max_per_subset", PAPER_MAX_TESTS_PER_BENCHMARK
201+
)
198202
# mypy can't statically narrow ``adapter_kwargs: dict[str, object]``
199203
# against each adapter subclass's specific parameter types. The dict
200204
# values are sourced from argparse, which already validated them.

opencontractserver/benchmarks/metrics.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,13 @@
1212

1313
from __future__ import annotations
1414

15+
import logging
1516
import re
1617
import string
1718
from collections.abc import Iterable, Sequence
1819

20+
logger = logging.getLogger(__name__)
21+
1922
Span = tuple[int, int]
2023

2124
# --------------------------------------------------------------------------- #
@@ -97,7 +100,15 @@ def token_recall(prediction: str, gold: str) -> float:
97100
pred_tokens = set(normalize_answer(prediction).split())
98101
gold_tokens = list(normalize_answer(gold).split())
99102
if not gold_tokens:
100-
return 0.0 if pred_tokens else 1.0
103+
# Empty gold is almost always a data-quality bug; treating
104+
# ``token_recall("", "")`` as 1.0 silently inflates aggregate
105+
# F1. Mirror :func:`char_recall` and return 0.0 with a warning
106+
# so the operator sees the offending row.
107+
logger.warning(
108+
"token_recall called with empty gold; returning 0.0. "
109+
"This usually indicates a data-quality issue in the dataset."
110+
)
111+
return 0.0
101112
unique_gold = set(gold_tokens)
102113
if not pred_tokens:
103114
return 0.0

opencontractserver/benchmarks/retrieval.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,25 @@
2727
Span = tuple[int, int]
2828

2929

30+
# Module-level cache for ``StructuralAnnotationSet → Document`` resolution
31+
# inside a benchmark process. A full LegalBench-RAG run probes ~776
32+
# queries × top_k=32 hits and reuses a small set of structural sets, so
33+
# making the cache call-local would re-issue the same query hundreds of
34+
# times. Keying on ``(corpus_id, struct_set_id)`` keeps separate corpora
35+
# isolated when the same struct_set appears in multiple corpora.
36+
_STRUCT_SET_TO_DOC: dict[tuple[int, int], int | None] = {}
37+
38+
39+
def _clear_struct_set_cache() -> None:
40+
"""Test hook: drop cached struct-set→document resolutions.
41+
42+
The cache is process-wide; tests that recreate corpora between
43+
cases need to clear it so a stale ``Document.id`` from a torn-down
44+
fixture isn't returned.
45+
"""
46+
_STRUCT_SET_TO_DOC.clear()
47+
48+
3049
@dataclass
3150
class RetrievalResult:
3251
"""Retrieval probe output for a single (query, document) pair.
@@ -113,19 +132,19 @@ def probe_retrieval(
113132
# off ``StructuralAnnotationSet`` (shared across documents with the
114133
# same content hash). To produce a per-result document_id we resolve
115134
# ``structural_set_id`` → ``Document.id`` via the reverse FK on
116-
# Document. Cache lookups per-call so a top-k of 64 doesn't trigger
117-
# 64 queries when many hits share the same set.
135+
# Document. The cache is module-level (keyed by corpus_id) so a
136+
# full benchmark run amortises the lookup across all probe calls
137+
# rather than re-querying once per-call.
118138
from opencontractserver.documents.models import Document
119139

120-
struct_set_to_doc: dict[int, int | None] = {}
121-
122140
def _resolve_doc_id(annotation: Annotation) -> int | None:
123141
if annotation.document_id is not None:
124142
return annotation.document_id
125143
struct_set_id = annotation.structural_set_id
126144
if struct_set_id is None:
127145
return None
128-
if struct_set_id not in struct_set_to_doc:
146+
cache_key = (corpus_id, struct_set_id)
147+
if cache_key not in _STRUCT_SET_TO_DOC:
129148
# Resolve to the Document in the *target corpus* — across
130149
# benchmark runs the same content_hash can reappear, and
131150
# ``CoreAnnotationVectorStore``'s structural-annotation
@@ -142,8 +161,8 @@ def _resolve_doc_id(annotation: Annotation) -> int | None:
142161
.distinct()
143162
.first()
144163
)
145-
struct_set_to_doc[struct_set_id] = doc.id if doc else None
146-
return struct_set_to_doc[struct_set_id]
164+
_STRUCT_SET_TO_DOC[cache_key] = doc.id if doc else None
165+
return _STRUCT_SET_TO_DOC[cache_key]
147166

148167
for hit in results:
149168
annotation: Annotation = hit.annotation

opencontractserver/benchmarks/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ def run_benchmark(
175175
corpus_wide=corpus_wide,
176176
)
177177

178+
config["finished_at"] = timezone.now().isoformat()
178179
report = BenchmarkReport(
179180
adapter=loaded.adapter_description,
180181
config=dict(config), # snapshot; avoid aliasing with the mutable local
@@ -183,7 +184,6 @@ def run_benchmark(
183184
task_results=task_results,
184185
)
185186
# compute_aggregates() is auto-called unconditionally by __post_init__.
186-
config["finished_at"] = timezone.now().isoformat()
187187

188188
if write_report:
189189
resolved_run_dir = _resolve_run_dir(

opencontractserver/constants/benchmarks.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
BENCHMARK_QUERY_PREVIEW_MAX_LEN = 64
2525

2626
# Character budget left for the query preview after reserving room for the
27-
# trailing "…" ellipsis marker. Equals
28-
# ``BENCHMARK_QUERY_PREVIEW_MAX_LEN - 1``.
29-
BENCHMARK_QUERY_PREVIEW_TRIM_LEN = 63
27+
# trailing "…" ellipsis marker. Derived from
28+
# ``BENCHMARK_QUERY_PREVIEW_MAX_LEN`` so changing the cap auto-updates the
29+
# trim point — defining as a literal here invites silent skew when the
30+
# cap moves.
31+
BENCHMARK_QUERY_PREVIEW_TRIM_LEN = BENCHMARK_QUERY_PREVIEW_MAX_LEN - 1

opencontractserver/pipeline/parsers/text_chunkers.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -242,15 +242,18 @@ def chunk(self, text: str) -> Iterator[TextChunk]:
242242
# even when the separator width varies.
243243
_PARAGRAPH_SEPARATOR_RE = re.compile(r"\n[ \t]*(?:\n[ \t]*)+")
244244

245-
# Characters that look like whitespace to a human but aren't matched by
246-
# Python's ``str.strip()``: zero-width spaces, BOM, soft hyphens, etc. A
247-
# paragraph composed only of these characters has no embeddable content
248-
# and must be dropped — otherwise the embedding microservice tokenises
249-
# it down to an empty input and computes mean-of-empty, which returns
250-
# NaN and aborts the entire ingest pipeline. Observed in CUAD documents
251-
# (e.g. JuniperPharmaceuticalsInc_…) where copy-paste artifacts left
252-
# runs of ``​`` characters between real paragraphs.
253-
_INVISIBLE_CHARS_RE = re.compile(r"[<raw invisible Unicode characters — unrenderable in this view>]")
245+
# Targeted allowlist of format characters (Unicode category Cf) — NOT
246+
# the whole General Punctuation block. Typographic characters with
247+
# content meaning (en dash, em dash, thin space, …) must not be
248+
# stripped. A paragraph composed only of these characters has no
249+
# embeddable content and must be dropped — otherwise the embedding
250+
# microservice tokenises it down to an empty input and computes
251+
# mean-of-empty, which returns NaN and aborts ingest. Observed in
252+
# CUAD documents where copy-paste artifacts left runs of U+200B
253+
# (ZWSP) characters between real paragraphs.
254+
_INVISIBLE_CHARS_RE = re.compile(
255+
"[\u00ad\u180e\u200b-\u200f\u202a-\u202e\u2060-\u2064\u2066-\u2069\ufeff]"
256+
)
254257

255258

256259
@register_chunker

opencontractserver/pipeline/rerankers/cross_encoder_reranker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def _rerank_impl(
183183
# Some cross-encoders (activation=sigmoid) return 0D arrays per pair
184184
# or numpy floats — normalize to Python floats.
185185
try:
186-
score_list = [float(s) for s in scores]
186+
score_list = [float(score) for score in scores]
187187
except TypeError:
188188
# Single-pair responses may come back as a scalar
189189
score_list = [float(scores)]

opencontractserver/tasks/data_extract_tasks.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,15 @@ async def doc_extract_query_task(
105105
``"anthropic:claude-opus-4-6"``). When provided it overrides the
106106
default extraction model. Used by the benchmark runner to sweep
107107
models without touching production defaults.
108+
109+
Trust assumption: this string is passed straight to the agent
110+
factory and ultimately to the model registry. Current call
111+
sites (CLI ``run_benchmark`` command, internal benchmark
112+
runner) are operator-controlled. If this task is ever
113+
exposed to user-controlled input (webhook, public API), gate
114+
it behind an allowlist of approved model identifiers — an
115+
arbitrary string here can redirect extraction traffic to an
116+
unintended model endpoint.
108117
"""
109118
import traceback
110119
from typing import get_origin

0 commit comments

Comments
 (0)