Skip to content

Commit cc4348f

Browse files
committed
Address PR #1379 review: import consistency, exists tradeoff, partial deletion test
- Move ``DocumentPath`` into the same method-local import statement as ``Document`` (the existing pattern for documents.models inside ``_build_base_queryset``). Reconciles the inconsistency flagged in review where ``DocumentPath`` was the only inline-deeper import after ``StructuralAnnotationSet`` was promoted module-level. - Document the ``active_doc_ids_qs.exists()`` trade-off: one extra boolean SELECT on the happy path buys an early ``Annotation.objects.none()`` short-circuit for empty/all-deleted corpora and preserves the operational warning that the previous list-materialisation provided. - Add ``test_deletion_aware_path_partial_deletion_keeps_active_drops_deleted`` to cover the actual subquery-filter branch (one active doc + one deleted doc in the same corpus). The existing ``test_deletion_aware_path_excludes_deleted_document_structural`` only exercises the early-exit branch; this test exercises the ``Q(document_id__in=...) | Q(structural=True, structural_set_id__in=...)`` filter — the case most likely to regress silently if the OR clause is rewritten.
1 parent d9e2178 commit cc4348f

2 files changed

Lines changed: 90 additions & 3 deletions

File tree

opencontractserver/llms/vector_stores/core_vector_stores.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
208208
# enumeration attacks.
209209
# -------------------------------------------------------------------------
210210
from opencontractserver.corpuses.models import Corpus
211-
from opencontractserver.documents.models import Document
211+
from opencontractserver.documents.models import Document, DocumentPath
212212

213213
user = None
214214
if self.user_id:
@@ -289,8 +289,6 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
289289

290290
# Check for deleted documents in corpus
291291
if self.check_corpus_deletion and self.corpus_id and not self.document_id:
292-
from opencontractserver.documents.models import DocumentPath
293-
294292
# Lazy subquery — never round-trips through Python, so the
295293
# generated SQL stays a single statement with a real subquery
296294
# rather than a giant ``IN (val, val, ...)`` literal even for
@@ -302,6 +300,15 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
302300
.values("document_id")
303301
.distinct()
304302
)
303+
# Trade-off: this ``EXISTS`` adds one extra round-trip on the
304+
# happy path, but lets us short-circuit the entire vector search
305+
# for empty/all-deleted corpora (returning ``Annotation.none()``
306+
# spares a downstream HNSW probe and keeps the existing
307+
# operational warning). For corpora with at least one active
308+
# document the cost is a single boolean SELECT and is dwarfed
309+
# by the main query. Removing the check would also remove the
310+
# debug log of the active-doc count that the materialised list
311+
# used to provide.
305312
has_active_docs = await sync_to_async(active_doc_ids_qs.exists)()
306313

307314
if has_active_docs:

opencontractserver/tests/test_corpus_isolation_vector_store.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,86 @@ def test_deletion_aware_path_excludes_deleted_document_structural(self) -> None:
294294
"leaked through the deletion-aware corpus-only path",
295295
)
296296

297+
def test_deletion_aware_path_partial_deletion_keeps_active_drops_deleted(
298+
self,
299+
) -> None:
300+
"""Partial deletion: with two docs in one corpus, only the active stays.
301+
302+
The previous test stresses the early-exit branch (every doc deleted
303+
→ ``has_active_docs`` is False → ``Annotation.none()``). This test
304+
stresses the actual subquery filter:
305+
306+
active_filters &= Q(document_id__in=active_doc_ids_qs) | Q(
307+
structural=True, structural_set_id__in=active_struct_set_ids
308+
)
309+
310+
With one active document and one deleted document in the same
311+
corpus, ``active_doc_ids_qs`` is non-empty but partial. The
312+
deleted document's structural annotation must be excluded while
313+
the active document's must come through. This is the scenario
314+
most likely to regress silently if the OR clause is rebuilt or
315+
the subquery is rewritten incorrectly.
316+
"""
317+
# Add a second document to corpus_a with its own structural set,
318+
# then mark its DocumentPath deleted while doc_a's stays active.
319+
deleted_set = StructuralAnnotationSet.objects.create(
320+
content_hash=hashlib.sha256(b"corpus-a-deleted-doc").hexdigest(),
321+
creator=self.user,
322+
parser_name="TestParser",
323+
parser_version="1.0",
324+
)
325+
deleted_doc = Document.objects.create(
326+
title="Doc A2 (deleted)",
327+
creator=self.user,
328+
is_public=True,
329+
structural_annotation_set=deleted_set,
330+
)
331+
DocumentPath.objects.create(
332+
document=deleted_doc,
333+
corpus=self.corpus_a,
334+
path="/doc_a2.pdf",
335+
version_number=1,
336+
is_current=True,
337+
is_deleted=True,
338+
creator=self.user,
339+
)
340+
deleted_struct = Annotation.objects.create(
341+
structural_set=deleted_set,
342+
annotation_label=self.label,
343+
creator=self.user,
344+
raw_text="Deleted-doc structural paragraph",
345+
structural=True,
346+
page=1,
347+
)
348+
deleted_struct.add_embedding(
349+
get_default_embedder_path(), _constant_vector(384, 0.15)
350+
)
351+
352+
store = CoreAnnotationVectorStore(
353+
user_id=self.user.id,
354+
corpus_id=self.corpus_a.id,
355+
document_id=None,
356+
check_corpus_deletion=True,
357+
)
358+
query = VectorSearchQuery(
359+
query_embedding=_constant_vector(384, 0.5), similarity_top_k=10
360+
)
361+
results = store.search(query)
362+
returned_ids = {r.annotation.id for r in results}
363+
364+
self.assertIn(
365+
self.struct_a.id,
366+
returned_ids,
367+
"Active document's structural annotation was excluded by the "
368+
"partial-deletion subquery filter",
369+
)
370+
self.assertNotIn(
371+
deleted_struct.id,
372+
returned_ids,
373+
"Deleted document's structural annotation leaked through the "
374+
"partial-deletion subquery filter",
375+
)
376+
297377
def test_document_scoped_search_still_returns_structural(self) -> None:
298378
"""Document-scoped retrieval is the existing well-tested path.
299379

0 commit comments

Comments
 (0)