Skip to content

Commit f2eaccf

Browse files
committed
Address review: tighten return type, cache label, document key asymmetry
- _create_pdf_annotation return type Annotation (not Annotation | None) and remove dead 'if annot is not None' guard in caller — function raises on failure - Cache ensure_label_and_labelset across loop iterations; reset on savepoint rollback so the happy path is one DB lookup, not N - Document why json is in defaults for PDFs but a lookup key for spans - Simplify test stub so it doesn't depend on SpanAnnotation internals - Fix test page assertion: PlasmaPDF returns 0-indexed pages
1 parent 14a20b8 commit f2eaccf

2 files changed

Lines changed: 37 additions & 27 deletions

File tree

opencontractserver/tests/test_extraction_grounding.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -514,17 +514,18 @@ def test_ground_pdf_creates_token_label_annotations(self):
514514
self.assertIsNotNone(annot.annotation_label)
515515
assert annot.annotation_label is not None # narrow for mypy
516516
self.assertEqual(annot.annotation_label.text, OC_EXTRACT_SOURCE_LABEL)
517-
# Page must be a positive integer; never the silent default of 1
518-
# for a span that actually lives on page 2.
517+
# PlasmaPDF returns 0-indexed pages; valid range is
518+
# [0, len(pages) - 1]. The bug we're guarding against is the
519+
# silent fallback that would have saved everything on a single
520+
# default page even when the span lives on a different one.
519521
self.assertIsInstance(annot.page, int)
520-
self.assertGreaterEqual(annot.page, 1)
521-
self.assertLessEqual(annot.page, len(self.pages_text))
522+
self.assertGreaterEqual(annot.page, 0)
523+
self.assertLess(annot.page, len(self.pages_text))
522524
self.assertTrue(annot.raw_text)
523525

524-
# "Acme Holdings" is on page 1 (1-indexed) and
525-
# "Global Acquisitions" on page 2 — confirm the per-page mapping
526-
# actually works by checking we got at least one annotation off
527-
# page 1.
526+
# "Acme Holdings" is on page 0 (0-indexed) and "Global Acquisitions"
527+
# on page 1 — confirm the per-page mapping actually works by
528+
# checking we got annotations on more than one page.
528529
pages_seen = {a.page for a in annotations}
529530
self.assertGreater(
530531
len(pages_seen),
@@ -587,11 +588,7 @@ def test_ground_pdf_skips_when_page_is_none(self):
587588
def stub_create(self, span_annotation):
588589
# Mimic PlasmaPDF's payload but force page=None so the grounding
589590
# pipeline must take the skip-rather-than-fallback path.
590-
return {
591-
"page": None,
592-
"rawText": span_annotation.span["text"],
593-
"annotation_json": {},
594-
}
591+
return {"page": None, "rawText": "stub", "annotation_json": {}}
595592

596593
with patch(
597594
"plasmapdf.models.PdfDataLayer.PdfDataLayer."

opencontractserver/utils/extraction_grounding.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -207,16 +207,21 @@ def _create_grounding_annotations(
207207

208208
annotations: list[Annotation] = []
209209

210+
# Cache the label across iterations so the happy path is one DB lookup,
211+
# not N. Reset to None on savepoint rollback so a transient labelset
212+
# failure can be retried on the next annotation.
213+
cached_label: AnnotationLabel | None = None
214+
210215
for result in alignment_results:
211216
try:
212217
with transaction.atomic():
213-
# Resolve label inside the savepoint so a labelset failure
214-
# only skips this annotation, not the whole batch.
215-
label_obj = corpus.ensure_label_and_labelset(
216-
label_text=OC_EXTRACT_SOURCE_LABEL,
217-
creator_id=creator_id,
218-
label_type=annotation_type,
219-
)
218+
if cached_label is None:
219+
cached_label = corpus.ensure_label_and_labelset(
220+
label_text=OC_EXTRACT_SOURCE_LABEL,
221+
creator_id=creator_id,
222+
label_type=annotation_type,
223+
)
224+
label_obj = cached_label
220225

221226
if annotation_type == TOKEN_LABEL and pdf_layer is not None:
222227
annot = _create_pdf_annotation(
@@ -229,10 +234,10 @@ def _create_grounding_annotations(
229234
else:
230235
continue
231236

232-
if annot is not None:
233-
annotations.append(annot)
237+
annotations.append(annot)
234238

235239
except Exception:
240+
cached_label = None
236241
logger.warning(
237242
"Failed to create grounding annotation for %r "
238243
"at [%d:%d] in document %d",
@@ -253,15 +258,16 @@ def _create_pdf_annotation(
253258
creator_id: int,
254259
pdf_layer: Any,
255260
label_obj: AnnotationLabel,
256-
) -> Annotation | None:
261+
) -> Annotation:
257262
"""Create (or fetch) a TOKEN_LABEL annotation for a PDF document via PlasmaPDF.
258263
259264
Idempotent: returns the existing annotation if one with the same
260-
document, label, and span coordinates already exists.
265+
document, label, page, and raw text already exists.
261266
262-
Returns ``None`` if the annotation cannot be created safely (e.g.
263-
PlasmaPDF could not determine the page) so the caller can skip it.
264-
Raises inside the savepoint to roll back on hard failures.
267+
Raises ``ValueError`` inside the savepoint when the annotation cannot
268+
be created safely (e.g. PlasmaPDF could not determine the page); the
269+
caller's per-annotation ``try/except`` rolls back the savepoint and
270+
logs the skip.
265271
"""
266272
from plasmapdf.models.types import SpanAnnotation, TextSpan
267273

@@ -289,6 +295,13 @@ def _create_pdf_annotation(
289295
f"{document.id}; skipping grounding annotation."
290296
)
291297

298+
# Note: ``json`` (bounding boxes) is in ``defaults``, NOT a lookup key.
299+
# For PDFs the (document, label, page, raw_text) tuple already uniquely
300+
# identifies the span; PlasmaPDF's bounding-box layout is deterministic
301+
# for stable input, so on Celery retry we want to reuse the existing
302+
# annotation rather than create a near-duplicate that differs only by
303+
# bounding-box reformatting. Span annotations key on ``json`` because
304+
# the char offsets ARE the identity for a text/DOCX document.
292305
annot, _ = Annotation.objects.get_or_create(
293306
document=document,
294307
annotation_label=label_obj,

0 commit comments

Comments
 (0)