Skip to content

Commit d9e2178

Browse files
committed
Address PR #1376 follow-up: lazy subqueries, top-level import, test gap
Follow-up to issue #1377: - Move ``StructuralAnnotationSet`` to the existing top-level ``opencontractserver.annotations.models`` import. ``Annotation`` was already imported at module scope from the same module, so the inline imports inside ``_build_base_queryset`` ("avoid circular imports") had no rationale. - Convert ``active_doc_ids`` and ``visible_corpus_doc_ids`` from Python-materialised lists to fully lazy querysets so the SQL planner receives ``IN (SELECT ...)`` subqueries rather than ``IN (val, val, ...)`` literals — important for corpora with tens of thousands of documents. The deletion-aware empty-corpus short-circuit now uses ``.exists()`` instead of list truthiness. - Add ``.distinct()`` to ``visible_corpus_set_ids`` and ``active_struct_set_ids`` for consistency with ``visible_corpus_doc_ids`` and to avoid duplicate IDs when a structural set is shared across multiple documents in the corpus. - Drop stale ``(line 211)`` line-number anchor from the corpus-only branch comment. - Add ``test_deletion_aware_path_excludes_deleted_document_structural`` to close the previously uncovered branch where ``check_corpus_deletion=True`` AND a document's only ``DocumentPath`` has ``is_deleted=True`` — the structural annotation must be excluded.
1 parent c7f6732 commit d9e2178

2 files changed

Lines changed: 71 additions & 30 deletions

File tree

opencontractserver/llms/vector_stores/core_vector_stores.py

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from django.contrib.auth import get_user_model
1010
from django.db.models import Q, QuerySet
1111

12-
from opencontractserver.annotations.models import Annotation
12+
from opencontractserver.annotations.models import Annotation, StructuralAnnotationSet
1313
from opencontractserver.constants.search import (
1414
FTS_CONFIG,
1515
HYBRID_SEARCH_OVERSAMPLE_FACTOR,
@@ -289,20 +289,22 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
289289

290290
# Check for deleted documents in corpus
291291
if self.check_corpus_deletion and self.corpus_id and not self.document_id:
292-
# Note: sync_to_async already imported at module level
293-
from opencontractserver.annotations.models import StructuralAnnotationSet
294292
from opencontractserver.documents.models import DocumentPath
295293

296-
# Get documents with active (non-deleted) paths in corpus
297-
active_doc_ids = await sync_to_async(
298-
lambda: list(
299-
DocumentPath.objects.filter(
300-
corpus_id=self.corpus_id, is_current=True, is_deleted=False
301-
).values_list("document_id", flat=True)
294+
# Lazy subquery — never round-trips through Python, so the
295+
# generated SQL stays a single statement with a real subquery
296+
# rather than a giant ``IN (val, val, ...)`` literal even for
297+
# corpora with tens of thousands of documents.
298+
active_doc_ids_qs = (
299+
DocumentPath.objects.filter(
300+
corpus_id=self.corpus_id, is_current=True, is_deleted=False
302301
)
303-
)()
302+
.values("document_id")
303+
.distinct()
304+
)
305+
has_active_docs = await sync_to_async(active_doc_ids_qs.exists)()
304306

305-
if active_doc_ids:
307+
if has_active_docs:
306308
# Two annotation shapes pass this filter:
307309
# 1. Direct: Annotation.document_id is in the active set.
308310
# 2. Structural: Annotation.document_id is NULL but the
@@ -311,13 +313,16 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
311313
# structural annotations need an explicit OR clause —
312314
# otherwise every parser-produced structural row is
313315
# silently dropped on this corpus-wide path.
314-
active_struct_set_ids = StructuralAnnotationSet.objects.filter(
315-
documents__in=active_doc_ids
316-
).values("id")
317-
active_filters &= Q(document_id__in=active_doc_ids) | Q(
316+
active_struct_set_ids = (
317+
StructuralAnnotationSet.objects.filter(
318+
documents__in=active_doc_ids_qs
319+
)
320+
.values("id")
321+
.distinct()
322+
)
323+
active_filters &= Q(document_id__in=active_doc_ids_qs) | Q(
318324
structural=True, structural_set_id__in=active_struct_set_ids
319325
)
320-
_logger.debug(f"Found {len(active_doc_ids)} active documents in corpus")
321326
else:
322327
_logger.warning(f"No active documents found in corpus {self.corpus_id}")
323328
return Annotation.objects.none()
@@ -396,21 +401,23 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
396401
# ``Annotation.corpus_id``. Per-document visibility for these
397402
# rows is enforced by the visibility filter further below
398403
# plus the upfront IDOR check on ``corpus_id``.
399-
# Document is imported earlier in this method (line 211); reusing
400-
# the local binding avoids an F811 redefinition warning.
401-
from opencontractserver.annotations.models import StructuralAnnotationSet
402-
403-
visible_corpus_doc_ids = await sync_to_async(
404-
lambda: list(
405-
Document.objects.visible_to_user(user)
406-
.filter(path_records__corpus_id=self.corpus_id)
407-
.values_list("id", flat=True)
408-
.distinct()
404+
# Both subqueries below stay lazy so the SQL planner sees a
405+
# nested ``IN (SELECT ...)`` rather than a Python-materialised
406+
# ``IN (val, val, ...)`` literal — important for corpora with
407+
# tens of thousands of documents.
408+
visible_corpus_doc_ids_qs = (
409+
Document.objects.visible_to_user(user)
410+
.filter(path_records__corpus_id=self.corpus_id)
411+
.values("id")
412+
.distinct()
413+
)
414+
visible_corpus_set_ids = (
415+
StructuralAnnotationSet.objects.filter(
416+
documents__in=visible_corpus_doc_ids_qs
409417
)
410-
)()
411-
visible_corpus_set_ids = StructuralAnnotationSet.objects.filter(
412-
documents__in=visible_corpus_doc_ids
413-
).values("id")
418+
.values("id")
419+
.distinct()
420+
)
414421
active_filters &= Q(
415422
structural=True, structural_set_id__in=visible_corpus_set_ids
416423
) | Q(structural=False, corpus_id=self.corpus_id)

opencontractserver/tests/test_corpus_isolation_vector_store.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,40 @@ def test_corpus_wide_search_excludes_orphan_structural_set(self) -> None:
260260
# Document-scoped retrieval — must continue to work post-fix
261261
# ------------------------------------------------------------------ #
262262

263+
def test_deletion_aware_path_excludes_deleted_document_structural(self) -> None:
264+
"""Structural rows whose only ``DocumentPath`` is deleted are dropped.
265+
266+
Covers the deletion side of ``check_corpus_deletion=True``: the
267+
``DocumentPath`` row for ``doc_a`` is flipped to ``is_deleted=True``,
268+
so ``active_doc_ids`` in
269+
``CoreAnnotationVectorStore._build_base_queryset`` becomes empty and
270+
the corpus-only branch must short-circuit to no results — the
271+
document's structural annotation must NOT come back, even though the
272+
annotation row itself is otherwise unchanged.
273+
"""
274+
DocumentPath.objects.filter(document=self.doc_a, corpus=self.corpus_a).update(
275+
is_deleted=True
276+
)
277+
278+
store = CoreAnnotationVectorStore(
279+
user_id=self.user.id,
280+
corpus_id=self.corpus_a.id,
281+
document_id=None,
282+
check_corpus_deletion=True,
283+
)
284+
query = VectorSearchQuery(
285+
query_embedding=_constant_vector(384, 0.5), similarity_top_k=10
286+
)
287+
results = store.search(query)
288+
returned_ids = {r.annotation.id for r in results}
289+
290+
self.assertNotIn(
291+
self.struct_a.id,
292+
returned_ids,
293+
"Structural annotation whose only DocumentPath is deleted "
294+
"leaked through the deletion-aware corpus-only path",
295+
)
296+
263297
def test_document_scoped_search_still_returns_structural(self) -> None:
264298
"""Document-scoped retrieval is the existing well-tested path.
265299

0 commit comments

Comments
 (0)