Address review: corpus in lookup key, dedup annotations, jsonb note

JSv4 · JSv4 · commit f5a0bdf99369 · 2026-04-28T19:02:59.000-05:00
- Move corpus from defaults into the get_or_create lookup so a document
  shared across multiple corpora gets a distinct annotation row per
  corpus. Previously the second corpus's grounding silently reused the
  first corpus's row, leaving datacell.sources pointing at an annotation
  whose corpus did not match the extract's corpus, so
  MIN(document_permission, corpus_permission) was evaluated against the
  wrong corpus. Applies to both PDF and text/DOCX paths.
- Deduplicate the returned annotations list by primary key so
  len(annotations) == datacell.sources.count() when the same span
  resolves to a single get_or_create row from multiple alignment hits.
- Update span-annotation docstring: the json column (NullableJSONField)
  is stored as jsonb in Postgres, so dict key order is moot for the
  get_or_create lookup; we still construct the dict in a stable order
  for forward compatibility.
- Add regression test verifying that grounding the same document under
  two corpora produces disjoint annotation sets with correct corpus FKs.
diff --git a/opencontractserver/tests/test_extraction_grounding.py b/opencontractserver/tests/test_extraction_grounding.py
@@ -616,3 +616,77 @@ def stub_create(self, span_annotation):
             ).count(),
             0,
         )
+
+    def test_ground_pdf_separate_corpora_create_separate_annotations(self):
+        """A document shared across two corpora must NOT collapse its
+        grounding annotations into a single shared row.
+
+        Regression for issue raised in PR review: ``corpus`` was previously
+        only in ``defaults`` so the second corpus's grounding would silently
+        return the first corpus's annotation, producing a ``datacell.sources``
+        FK whose ``corpus`` mismatched the extract.  Fixing the lookup key
+        to include ``corpus`` means each corpus now owns a distinct row
+        with the correct FK.
+        """
+        from opencontractserver.corpuses.models import Corpus
+        from opencontractserver.extracts.models import (
+            Datacell,
+            Extract,
+        )
+        from opencontractserver.utils.extraction_grounding import (
+            ground_extraction_to_annotations,
+        )
+
+        # Re-add the document to a SECOND corpus so it lives in both.
+        other_corpus = Corpus.objects.create(
+            title="Second PDF Grounding Corpus", creator=self.user
+        )
+        other_corpus.add_document(document=self.document, user=self.user)
+
+        # Build a parallel extract+datacell anchored to the OTHER corpus.
+        other_extract = Extract.objects.create(
+            name="Second PDF Extract",
+            corpus=other_corpus,
+            fieldset=self.fieldset,
+            creator=self.user,
+        )
+        other_datacell = Datacell.objects.create(
+            extract=other_extract,
+            column=self.column,
+            document=self.document,
+            creator=self.user,
+            data={"data": ["Acme Holdings", "Global Acquisitions"]},
+        )
+
+        first_corpus_annotations = async_to_sync(ground_extraction_to_annotations)(
+            datacell=self.datacell,
+            document=self.document,
+            corpus=self.corpus,
+            user_id=self.user.id,
+            enable_fuzzy=False,
+        )
+        second_corpus_annotations = async_to_sync(ground_extraction_to_annotations)(
+            datacell=other_datacell,
+            document=self.document,
+            corpus=other_corpus,
+            user_id=self.user.id,
+            enable_fuzzy=False,
+        )
+
+        self.assertGreater(len(first_corpus_annotations), 0)
+        self.assertGreater(len(second_corpus_annotations), 0)
+
+        # The two corpora's grounding annotations must be DISJOINT.
+        first_ids = {a.id for a in first_corpus_annotations}
+        second_ids = {a.id for a in second_corpus_annotations}
+        self.assertTrue(
+            first_ids.isdisjoint(second_ids),
+            "Annotations leaked between corpora — corpus is missing from "
+            "the get_or_create lookup key.",
+        )
+
+        # Each annotation should point to its own corpus, not the other one.
+        for annot in first_corpus_annotations:
+            self.assertEqual(annot.corpus_id, self.corpus.id)
+        for annot in second_corpus_annotations:
+            self.assertEqual(annot.corpus_id, other_corpus.id)
diff --git a/opencontractserver/utils/extraction_grounding.py b/opencontractserver/utils/extraction_grounding.py
@@ -251,7 +251,20 @@ def _create_grounding_annotations(
                 exc_info=True,
             )
 
-    return annotations
+    # Deduplicate by primary key, preserving first-seen order: two
+    # alignment results for the same phrase on the same page produce a
+    # single ``get_or_create`` row, but each iteration appends it.
+    # Returning duplicates makes ``len(annotations)`` diverge from
+    # ``datacell.sources.count()`` and breaks idempotency invariants
+    # for downstream consumers.
+    seen_pks: set[int] = set()
+    deduped: list[Annotation] = []
+    for annot in annotations:
+        if annot.pk in seen_pks:
+            continue
+        seen_pks.add(annot.pk)
+        deduped.append(annot)
+    return deduped
 
 
 def _create_pdf_annotation(
@@ -299,21 +312,28 @@ def _create_pdf_annotation(
         )
 
     # Note: ``json`` (bounding boxes) is in ``defaults``, NOT a lookup key.
-    # For PDFs the (document, label, page, raw_text) tuple already uniquely
-    # identifies the span; PlasmaPDF's bounding-box layout is deterministic
-    # for stable input, so on Celery retry we want to reuse the existing
-    # annotation rather than create a near-duplicate that differs only by
-    # bounding-box reformatting. Span annotations key on ``json`` because
-    # the char offsets ARE the identity for a text/DOCX document.
+    # For PDFs the (document, corpus, label, page, annotation_type,
+    # raw_text) lookup is treated as the span's identity; PlasmaPDF's
+    # bounding-box layout is deterministic for stable input, so on Celery
+    # retry we want to reuse the existing annotation rather than create a
+    # near-duplicate that differs only by bounding-box reformatting. Span
+    # annotations key on ``json`` because the char offsets ARE the
+    # identity for a text/DOCX document.
+    #
+    # ``corpus`` IS in the lookup so a multi-corpus document doesn't share
+    # a single annotation between unrelated corpora — datacell.sources
+    # must point to an annotation whose ``corpus`` matches the extract's
+    # corpus, otherwise ``MIN(document_permission, corpus_permission)``
+    # is evaluated against the wrong corpus's permissions.
     annot, _ = Annotation.objects.get_or_create(
         document=document,
+        corpus=corpus,
         annotation_label=label_obj,
         page=page,
         annotation_type=TOKEN_LABEL,
         raw_text=oc_ann["rawText"],
         defaults={
             "json": oc_ann["annotation_json"],
-            "corpus": corpus,
             "creator_id": creator_id,
             "structural": False,
         },
@@ -340,23 +360,27 @@ def _create_span_annotation(
     serves as a placeholder and the actual location is encoded by the
     character offsets in ``json``.
 
-    Identity key uses ``json={"start": ..., "end": ...}``. PostgreSQL
-    JSON equality is order-sensitive, so the key order in this literal
-    must remain stable for ``get_or_create`` to deduplicate on retry.
-    Python 3.7+ guarantees dict-literal insertion order, and this is the
-    only construction site, so the ordering is locally enforced.
+    Identity key uses ``json={"start": ..., "end": ...}``. The ``json``
+    column is a Django ``JSONField``, which maps to PostgreSQL ``jsonb``
+    — equality compares structurally, so key order does not affect
+    ``get_or_create``'s lookup.  We still construct the dict in a stable
+    order to avoid surprises if the column type ever changes.
+
+    ``corpus`` IS in the lookup so a multi-corpus document doesn't share
+    a single annotation between unrelated corpora — see the parallel
+    docstring on ``_create_pdf_annotation`` for the permission rationale.
     """
     from opencontractserver.annotations.models import SPAN_LABEL, Annotation
 
     annot, _ = Annotation.objects.get_or_create(
         document=document,
+        corpus=corpus,
         annotation_label=label_obj,
         annotation_type=SPAN_LABEL,
         raw_text=result.matched_text,
         json={"start": result.char_start, "end": result.char_end},
         defaults={
             "page": 1,
-            "corpus": corpus,
             "creator_id": creator_id,
             "structural": False,
         },