|
5 | 5 | (integration with Django models). |
6 | 6 | """ |
7 | 7 |
|
| 8 | +import json |
| 9 | + |
8 | 10 | from asgiref.sync import async_to_sync |
9 | 11 | from django.test import SimpleTestCase, TestCase |
10 | 12 |
|
11 | 13 | from opencontractserver.utils.extraction_grounding import extract_groundable_strings |
12 | 14 |
|
13 | 15 |
|
def _build_pawls_for_text(
    pages_text: list[str], page_width: float = 612.0, page_height: float = 792.0
) -> str:
    """Serialize ``pages_text`` into a v1 PAWLS JSON token payload.

    Each page's text is whitespace-tokenized and laid out as a single
    horizontal row of tokens whose x-coordinates increase monotonically.
    The resulting JSON is consumable by ``build_translation_layer``,
    which lets integration tests exercise the PDF grounding path
    without needing a real PDF file.
    """
    char_width = 6.0  # synthetic per-character width
    token_gap = 4.0  # horizontal space between consecutive tokens
    pages: list[dict] = []
    for index, page_text in enumerate(pages_text):
        cursor = 10.0
        tokens: list[dict] = []
        for word in page_text.split():
            word_width = float(len(word)) * char_width
            tokens.append(
                {
                    "x": cursor,
                    "y": 100.0,
                    "width": word_width,
                    "height": 12.0,
                    "text": word,
                }
            )
            cursor += word_width + token_gap
        pages.append(
            {
                "page": {
                    "width": page_width,
                    "height": page_height,
                    "index": index,
                },
                "tokens": tokens,
            }
        )
    return json.dumps(pages)
| 52 | + |
| 53 | + |
14 | 54 | class TestExtractGroundableStrings(SimpleTestCase): |
15 | 55 | """Unit tests for extract_groundable_strings() — no Django DB needed.""" |
16 | 56 |
|
@@ -327,3 +367,251 @@ def test_ground_no_matches_returns_empty(self): |
327 | 367 | ) |
328 | 368 |
|
329 | 369 | self.assertEqual(len(annotations), 0) |
| 370 | + |
| 371 | + def test_ground_text_document_is_idempotent(self): |
| 372 | + """Running grounding twice should not create duplicate annotations. |
| 373 | +
|
| 374 | + Simulates a Celery retry after a partial failure. The second call |
| 375 | + must reuse existing OC_EXTRACT_SOURCE annotations rather than |
| 376 | + bloating ``datacell.sources`` with duplicates. |
| 377 | + """ |
| 378 | + from opencontractserver.annotations.models import Annotation |
| 379 | + from opencontractserver.utils.extraction_grounding import ( |
| 380 | + ground_extraction_to_annotations, |
| 381 | + ) |
| 382 | + |
| 383 | + first = async_to_sync(ground_extraction_to_annotations)( |
| 384 | + datacell=self.datacell, |
| 385 | + document=self.document, |
| 386 | + corpus=self.corpus, |
| 387 | + user_id=self.user.id, |
| 388 | + enable_fuzzy=False, |
| 389 | + ) |
| 390 | + self.assertGreater(len(first), 0) |
| 391 | + first_count = Annotation.objects.filter(document=self.document).count() |
| 392 | + first_ids = sorted(a.id for a in first) |
| 393 | + |
| 394 | + second = async_to_sync(ground_extraction_to_annotations)( |
| 395 | + datacell=self.datacell, |
| 396 | + document=self.document, |
| 397 | + corpus=self.corpus, |
| 398 | + user_id=self.user.id, |
| 399 | + enable_fuzzy=False, |
| 400 | + ) |
| 401 | + second_count = Annotation.objects.filter(document=self.document).count() |
| 402 | + second_ids = sorted(a.id for a in second) |
| 403 | + |
| 404 | + self.assertEqual( |
| 405 | + first_count, |
| 406 | + second_count, |
| 407 | + "Re-running grounding created duplicate annotations.", |
| 408 | + ) |
| 409 | + self.assertEqual( |
| 410 | + first_ids, |
| 411 | + second_ids, |
| 412 | + "Re-running grounding returned annotations with different IDs.", |
| 413 | + ) |
| 414 | + |
| 415 | + self.datacell.refresh_from_db() |
| 416 | + self.assertEqual(self.datacell.sources.count(), first_count) |
| 417 | + |
| 418 | + |
class TestGroundingPipelinePDFIntegration(TestCase):
    """Integration tests for grounding against a PDF-shaped document.

    A synthetic multi-page PAWLS payload stands in for a real PDF and
    drives the TOKEN_LABEL path through PlasmaPDF's translation layer.
    """

    def setUp(self):
        from django.contrib.auth import get_user_model
        from django.core.files.base import ContentFile

        from opencontractserver.corpuses.models import Corpus
        from opencontractserver.documents.models import Document
        from opencontractserver.extracts.models import (
            Column,
            Datacell,
            Extract,
            Fieldset,
        )
        from opencontractserver.notifications.models import Notification

        user_model = get_user_model()
        self.user = user_model.objects.create_user(
            username="grounding_pdf_user", password="testpass"
        )
        # Start from a clean slate in case user creation triggered any
        # notification side effects.
        Notification.objects.filter(recipient=self.user).delete()

        self.corpus = Corpus.objects.create(
            title="PDF Grounding Corpus", creator=self.user
        )

        # Two-page synthetic document; "Acme Holdings" is on page 0,
        # "Global Acquisitions" on page 1.
        self.pages_text = [
            "ASSET PURCHASE AGREEMENT between Acme Holdings Inc and others",
            "Global Acquisitions LLC shall serve as the Buyer of record",
        ]

        self.document = Document.objects.create(
            title="PDF Grounding Test",
            creator=self.user,
            file_type="application/pdf",
        )
        self.document.pawls_parse_file.save(
            "test.pawls",
            ContentFile(_build_pawls_for_text(self.pages_text).encode()),
        )
        self.corpus.add_document(document=self.document, user=self.user)

        self.fieldset = Fieldset.objects.create(name="PDF Fieldset", creator=self.user)
        self.column = Column.objects.create(
            fieldset=self.fieldset,
            name="Parties",
            query="Extract parties",
            output_type="str",
            creator=self.user,
        )
        self.extract = Extract.objects.create(
            name="PDF Extract",
            corpus=self.corpus,
            fieldset=self.fieldset,
            creator=self.user,
        )
        self.datacell = Datacell.objects.create(
            extract=self.extract,
            column=self.column,
            document=self.document,
            creator=self.user,
            data={"data": ["Acme Holdings", "Global Acquisitions"]},
        )

    def _run_grounding(self):
        """Invoke the async grounding pipeline synchronously for self.datacell."""
        from opencontractserver.utils.extraction_grounding import (
            ground_extraction_to_annotations,
        )

        return async_to_sync(ground_extraction_to_annotations)(
            datacell=self.datacell,
            document=self.document,
            corpus=self.corpus,
            user_id=self.user.id,
            enable_fuzzy=False,
        )

    def test_ground_pdf_creates_token_label_annotations(self):
        """PDF grounding should create TOKEN_LABEL annotations with valid pages."""
        from opencontractserver.annotations.models import TOKEN_LABEL
        from opencontractserver.constants.annotations import OC_EXTRACT_SOURCE_LABEL

        annotations = self._run_grounding()

        self.assertGreater(len(annotations), 0)
        for annotation in annotations:
            self.assertEqual(annotation.annotation_type, TOKEN_LABEL)
            self.assertEqual(annotation.document, self.document)
            self.assertEqual(annotation.corpus, self.corpus)
            self.assertFalse(annotation.structural)
            self.assertEqual(
                annotation.annotation_label.text, OC_EXTRACT_SOURCE_LABEL
            )
            # Page must be a positive integer; never the silent default of 1
            # for a span that actually lives on page 2.
            self.assertIsInstance(annotation.page, int)
            self.assertGreaterEqual(annotation.page, 1)
            self.assertLessEqual(annotation.page, len(self.pages_text))
            self.assertTrue(annotation.raw_text)

        # "Acme Holdings" sits on page 1 (1-indexed) and "Global
        # Acquisitions" on page 2 — seeing more than one distinct page
        # proves the per-page mapping actually works.
        distinct_pages = {annotation.page for annotation in annotations}
        self.assertGreater(
            len(distinct_pages),
            1,
            "Expected grounding to span multiple PDF pages.",
        )

        self.datacell.refresh_from_db()
        self.assertEqual(self.datacell.sources.count(), len(annotations))

    def test_ground_pdf_is_idempotent(self):
        """Re-running PDF grounding must not duplicate TOKEN_LABEL annotations."""
        from opencontractserver.annotations.models import Annotation

        first = self._run_grounding()
        self.assertGreater(len(first), 0)
        count_after_first = Annotation.objects.filter(document=self.document).count()
        ids_after_first = sorted(annotation.id for annotation in first)

        second = self._run_grounding()
        count_after_second = Annotation.objects.filter(document=self.document).count()
        ids_after_second = sorted(annotation.id for annotation in second)

        self.assertEqual(count_after_first, count_after_second)
        self.assertEqual(ids_after_first, ids_after_second)

        self.datacell.refresh_from_db()
        self.assertEqual(self.datacell.sources.count(), count_after_first)

    def test_ground_pdf_skips_when_page_is_none(self):
        """If PlasmaPDF returns page=None, the annotation must be skipped.

        Regression for the silent ``page=1`` fallback bug: a missing page
        on a multi-page PDF should result in *no* annotation being saved
        rather than a structurally incorrect one anchored to page 1.
        """
        from unittest.mock import patch

        from opencontractserver.annotations.models import Annotation
        from opencontractserver.constants.annotations import OC_EXTRACT_SOURCE_LABEL

        def stub_create(self, span_annotation):
            # Mimic PlasmaPDF's payload but force page=None so the grounding
            # pipeline must take the skip-rather-than-fallback path.
            return {
                "page": None,
                "rawText": span_annotation.span["text"],
                "annotation_json": {},
            }

        patch_target = (
            "plasmapdf.models.PdfDataLayer.PdfDataLayer."
            "create_opencontract_annotation_from_span"
        )
        with patch(patch_target, new=stub_create):
            annotations = self._run_grounding()

        self.assertEqual(
            len(annotations),
            0,
            "Annotations with page=None must be skipped, not saved on page 1.",
        )
        # Nothing should have been persisted to the database either.
        self.assertEqual(
            Annotation.objects.filter(
                document=self.document,
                annotation_label__text=OC_EXTRACT_SOURCE_LABEL,
            ).count(),
            0,
        )
0 commit comments