Skip to content

Commit f5a0bdf

Browse files
committed
Address review: corpus in lookup key, dedup annotations, jsonb note
- Move corpus from defaults into the get_or_create lookup so a document shared across multiple corpora gets a distinct annotation row per corpus; previously the second corpus's grounding silently reused the first corpus's row, leaving datacell.sources pointing at an annotation whose corpus mismatched the extract (breaking MIN(document_permission, corpus_permission)). Applies to both PDF and text/DOCX paths. - Deduplicate the returned annotations list by primary key so len(annotations) == datacell.sources.count() when the same span resolves to a single get_or_create row from multiple alignment hits. - Update span-annotation docstring: NullableJSONField → jsonb in Postgres, so dict key order is moot for the get_or_create lookup — we still construct the dict in stable order for forward compatibility. - Add regression test verifying that grounding the same document under two corpora produces disjoint annotation sets with correct corpus FKs.
1 parent 73e3198 commit f5a0bdf

2 files changed

Lines changed: 112 additions & 14 deletions

File tree

opencontractserver/tests/test_extraction_grounding.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,3 +616,77 @@ def stub_create(self, span_annotation):
616616
).count(),
617617
0,
618618
)
619+
620+
def test_ground_pdf_separate_corpora_create_separate_annotations(self):
621+
"""A document shared across two corpora must NOT collapse its
622+
grounding annotations into a single shared row.
623+
624+
Regression for issue raised in PR review: ``corpus`` was previously
625+
only in ``defaults`` so the second corpus's grounding would silently
626+
return the first corpus's annotation, producing a ``datacell.sources``
627+
FK whose ``corpus`` mismatched the extract. Fixing the lookup key
628+
to include ``corpus`` means each corpus now owns a distinct row
629+
with the correct FK.
630+
"""
631+
from opencontractserver.corpuses.models import Corpus
632+
from opencontractserver.extracts.models import (
633+
Datacell,
634+
Extract,
635+
)
636+
from opencontractserver.utils.extraction_grounding import (
637+
ground_extraction_to_annotations,
638+
)
639+
640+
# Re-add the document to a SECOND corpus so it lives in both.
641+
other_corpus = Corpus.objects.create(
642+
title="Second PDF Grounding Corpus", creator=self.user
643+
)
644+
other_corpus.add_document(document=self.document, user=self.user)
645+
646+
# Build a parallel extract+datacell anchored to the OTHER corpus.
647+
other_extract = Extract.objects.create(
648+
name="Second PDF Extract",
649+
corpus=other_corpus,
650+
fieldset=self.fieldset,
651+
creator=self.user,
652+
)
653+
other_datacell = Datacell.objects.create(
654+
extract=other_extract,
655+
column=self.column,
656+
document=self.document,
657+
creator=self.user,
658+
data={"data": ["Acme Holdings", "Global Acquisitions"]},
659+
)
660+
661+
first_corpus_annotations = async_to_sync(ground_extraction_to_annotations)(
662+
datacell=self.datacell,
663+
document=self.document,
664+
corpus=self.corpus,
665+
user_id=self.user.id,
666+
enable_fuzzy=False,
667+
)
668+
second_corpus_annotations = async_to_sync(ground_extraction_to_annotations)(
669+
datacell=other_datacell,
670+
document=self.document,
671+
corpus=other_corpus,
672+
user_id=self.user.id,
673+
enable_fuzzy=False,
674+
)
675+
676+
self.assertGreater(len(first_corpus_annotations), 0)
677+
self.assertGreater(len(second_corpus_annotations), 0)
678+
679+
# The two corpora's grounding annotations must be DISJOINT.
680+
first_ids = {a.id for a in first_corpus_annotations}
681+
second_ids = {a.id for a in second_corpus_annotations}
682+
self.assertTrue(
683+
first_ids.isdisjoint(second_ids),
684+
"Annotations leaked between corpora — corpus is missing from "
685+
"the get_or_create lookup key.",
686+
)
687+
688+
# Each annotation should point to its own corpus, not the other one.
689+
for annot in first_corpus_annotations:
690+
self.assertEqual(annot.corpus_id, self.corpus.id)
691+
for annot in second_corpus_annotations:
692+
self.assertEqual(annot.corpus_id, other_corpus.id)

opencontractserver/utils/extraction_grounding.py

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,20 @@ def _create_grounding_annotations(
251251
exc_info=True,
252252
)
253253

254-
return annotations
254+
# Deduplicate by primary key, preserving first-seen order: two
255+
# alignment results for the same phrase on the same page produce a
256+
# single ``get_or_create`` row, but each iteration appends it.
257+
# Returning duplicates makes ``len(annotations)`` diverge from
258+
# ``datacell.sources.count()`` and breaks idempotency invariants
259+
# for downstream consumers.
260+
seen_pks: set[int] = set()
261+
deduped: list[Annotation] = []
262+
for annot in annotations:
263+
if annot.pk in seen_pks:
264+
continue
265+
seen_pks.add(annot.pk)
266+
deduped.append(annot)
267+
return deduped
255268

256269

257270
def _create_pdf_annotation(
@@ -299,21 +312,28 @@ def _create_pdf_annotation(
299312
)
300313

301314
# Note: ``json`` (bounding boxes) is in ``defaults``, NOT a lookup key.
302-
# For PDFs the (document, label, page, raw_text) tuple already uniquely
303-
# identifies the span; PlasmaPDF's bounding-box layout is deterministic
304-
# for stable input, so on Celery retry we want to reuse the existing
305-
# annotation rather than create a near-duplicate that differs only by
306-
# bounding-box reformatting. Span annotations key on ``json`` because
307-
# the char offsets ARE the identity for a text/DOCX document.
315+
# For PDFs the (document, corpus, label, page, raw_text) tuple already
316+
# uniquely identifies the span; PlasmaPDF's bounding-box layout is
317+
# deterministic for stable input, so on Celery retry we want to reuse
318+
# the existing annotation rather than create a near-duplicate that
319+
# differs only by bounding-box reformatting. Span annotations key on
320+
# ``json`` because the char offsets ARE the identity for a text/DOCX
321+
# document.
322+
#
323+
# ``corpus`` IS in the lookup so a multi-corpus document doesn't share
324+
# a single annotation between unrelated corpora — datacell.sources
325+
# must point to an annotation whose ``corpus`` matches the extract's
326+
# corpus, otherwise ``MIN(document_permission, corpus_permission)``
327+
# falls back to the wrong corpus's permissions.
308328
annot, _ = Annotation.objects.get_or_create(
309329
document=document,
330+
corpus=corpus,
310331
annotation_label=label_obj,
311332
page=page,
312333
annotation_type=TOKEN_LABEL,
313334
raw_text=oc_ann["rawText"],
314335
defaults={
315336
"json": oc_ann["annotation_json"],
316-
"corpus": corpus,
317337
"creator_id": creator_id,
318338
"structural": False,
319339
},
@@ -340,23 +360,27 @@ def _create_span_annotation(
340360
serves as a placeholder and the actual location is encoded by the
341361
character offsets in ``json``.
342362
343-
Identity key uses ``json={"start": ..., "end": ...}``. PostgreSQL
344-
JSON equality is order-sensitive, so the key order in this literal
345-
must remain stable for ``get_or_create`` to deduplicate on retry.
346-
Python 3.7+ guarantees dict-literal insertion order, and this is the
347-
only construction site, so the ordering is locally enforced.
363+
Identity key uses ``json={"start": ..., "end": ...}``. The ``json``
364+
column is a Django ``JSONField``, which maps to PostgreSQL ``jsonb``
365+
— equality compares structurally, so key order does not affect
366+
``get_or_create``'s lookup. We still construct the dict in a stable
367+
order to avoid surprises if the column type ever changes.
368+
369+
``corpus`` IS in the lookup so a multi-corpus document doesn't share
370+
a single annotation between unrelated corpora — see the parallel
371+
docstring on ``_create_pdf_annotation`` for the permission rationale.
348372
"""
349373
from opencontractserver.annotations.models import SPAN_LABEL, Annotation
350374

351375
annot, _ = Annotation.objects.get_or_create(
352376
document=document,
377+
corpus=corpus,
353378
annotation_label=label_obj,
354379
annotation_type=SPAN_LABEL,
355380
raw_text=result.matched_text,
356381
json={"start": result.char_start, "end": result.char_end},
357382
defaults={
358383
"page": 1,
359-
"corpus": corpus,
360384
"creator_id": creator_id,
361385
"structural": False,
362386
},

0 commit comments

Comments
 (0)