Skip to content

Commit 49309d5

Browse files
authored
Merge pull request #1379 from Open-Source-Legal/claude/fix-issue-1377-gTvaU
Optimize corpus deletion checks with lazy subqueries
2 parents c7f6732 + cc4348f commit 49309d5

2 files changed

Lines changed: 162 additions & 34 deletions

File tree

opencontractserver/llms/vector_stores/core_vector_stores.py

Lines changed: 48 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from django.contrib.auth import get_user_model
1010
from django.db.models import Q, QuerySet
1111

12-
from opencontractserver.annotations.models import Annotation
12+
from opencontractserver.annotations.models import Annotation, StructuralAnnotationSet
1313
from opencontractserver.constants.search import (
1414
FTS_CONFIG,
1515
HYBRID_SEARCH_OVERSAMPLE_FACTOR,
@@ -208,7 +208,7 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
208208
# enumeration attacks.
209209
# -------------------------------------------------------------------------
210210
from opencontractserver.corpuses.models import Corpus
211-
from opencontractserver.documents.models import Document
211+
from opencontractserver.documents.models import Document, DocumentPath
212212

213213
user = None
214214
if self.user_id:
@@ -289,20 +289,29 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
289289

290290
# Check for deleted documents in corpus
291291
if self.check_corpus_deletion and self.corpus_id and not self.document_id:
292-
# Note: sync_to_async already imported at module level
293-
from opencontractserver.annotations.models import StructuralAnnotationSet
294-
from opencontractserver.documents.models import DocumentPath
295-
296-
# Get documents with active (non-deleted) paths in corpus
297-
active_doc_ids = await sync_to_async(
298-
lambda: list(
299-
DocumentPath.objects.filter(
300-
corpus_id=self.corpus_id, is_current=True, is_deleted=False
301-
).values_list("document_id", flat=True)
292+
# Lazy subquery — never round-trips through Python, so the
293+
# generated SQL stays a single statement with a real subquery
294+
# rather than a giant ``IN (val, val, ...)`` literal even for
295+
# corpora with tens of thousands of documents.
296+
active_doc_ids_qs = (
297+
DocumentPath.objects.filter(
298+
corpus_id=self.corpus_id, is_current=True, is_deleted=False
302299
)
303-
)()
304-
305-
if active_doc_ids:
300+
.values("document_id")
301+
.distinct()
302+
)
303+
# Trade-off: this ``EXISTS`` adds one extra round-trip on the
304+
# happy path, but lets us short-circuit the entire vector search
305+
# for empty/all-deleted corpora (returning ``Annotation.none()``
306+
# spares a downstream HNSW probe and keeps the existing
307+
# operational warning). For corpora with at least one active
308+
# document the cost is a single boolean SELECT and is dwarfed
309+
# by the main query. Removing the check would also remove the
310+
# debug log of the active-doc count that the materialised list
311+
# used to provide.
312+
has_active_docs = await sync_to_async(active_doc_ids_qs.exists)()
313+
314+
if has_active_docs:
306315
# Two annotation shapes pass this filter:
307316
# 1. Direct: Annotation.document_id is in the active set.
308317
# 2. Structural: Annotation.document_id is NULL but the
@@ -311,13 +320,16 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
311320
# structural annotations need an explicit OR clause —
312321
# otherwise every parser-produced structural row is
313322
# silently dropped on this corpus-wide path.
314-
active_struct_set_ids = StructuralAnnotationSet.objects.filter(
315-
documents__in=active_doc_ids
316-
).values("id")
317-
active_filters &= Q(document_id__in=active_doc_ids) | Q(
323+
active_struct_set_ids = (
324+
StructuralAnnotationSet.objects.filter(
325+
documents__in=active_doc_ids_qs
326+
)
327+
.values("id")
328+
.distinct()
329+
)
330+
active_filters &= Q(document_id__in=active_doc_ids_qs) | Q(
318331
structural=True, structural_set_id__in=active_struct_set_ids
319332
)
320-
_logger.debug(f"Found {len(active_doc_ids)} active documents in corpus")
321333
else:
322334
_logger.warning(f"No active documents found in corpus {self.corpus_id}")
323335
return Annotation.objects.none()
@@ -396,21 +408,23 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
396408
# ``Annotation.corpus_id``. Per-document visibility for these
397409
# rows is enforced by the visibility filter further below
398410
# plus the upfront IDOR check on ``corpus_id``.
399-
# Document is imported earlier in this method (line 211); reusing
400-
# the local binding avoids an F811 redefinition warning.
401-
from opencontractserver.annotations.models import StructuralAnnotationSet
402-
403-
visible_corpus_doc_ids = await sync_to_async(
404-
lambda: list(
405-
Document.objects.visible_to_user(user)
406-
.filter(path_records__corpus_id=self.corpus_id)
407-
.values_list("id", flat=True)
408-
.distinct()
411+
# Both subqueries below stay lazy so the SQL planner sees a
412+
# nested ``IN (SELECT ...)`` rather than a Python-materialised
413+
# ``IN (val, val, ...)`` literal — important for corpora with
414+
# tens of thousands of documents.
415+
visible_corpus_doc_ids_qs = (
416+
Document.objects.visible_to_user(user)
417+
.filter(path_records__corpus_id=self.corpus_id)
418+
.values("id")
419+
.distinct()
420+
)
421+
visible_corpus_set_ids = (
422+
StructuralAnnotationSet.objects.filter(
423+
documents__in=visible_corpus_doc_ids_qs
409424
)
410-
)()
411-
visible_corpus_set_ids = StructuralAnnotationSet.objects.filter(
412-
documents__in=visible_corpus_doc_ids
413-
).values("id")
425+
.values("id")
426+
.distinct()
427+
)
414428
active_filters &= Q(
415429
structural=True, structural_set_id__in=visible_corpus_set_ids
416430
) | Q(structural=False, corpus_id=self.corpus_id)

opencontractserver/tests/test_corpus_isolation_vector_store.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,120 @@ def test_corpus_wide_search_excludes_orphan_structural_set(self) -> None:
260260
# Document-scoped retrieval — must continue to work post-fix
261261
# ------------------------------------------------------------------ #
262262

263+
def test_deletion_aware_path_excludes_deleted_document_structural(self) -> None:
264+
"""Structural rows whose only ``DocumentPath`` is deleted are dropped.
265+
266+
Covers the deletion side of ``check_corpus_deletion=True``: the
267+
``DocumentPath`` row for ``doc_a`` is flipped to ``is_deleted=True``,
268+
so ``active_doc_ids`` in
269+
``CoreAnnotationVectorStore._build_base_queryset`` becomes empty and
270+
the corpus-only branch must short-circuit to no results — the
271+
document's structural annotation must NOT come back, even though the
272+
annotation row itself is otherwise unchanged.
273+
"""
274+
DocumentPath.objects.filter(document=self.doc_a, corpus=self.corpus_a).update(
275+
is_deleted=True
276+
)
277+
278+
store = CoreAnnotationVectorStore(
279+
user_id=self.user.id,
280+
corpus_id=self.corpus_a.id,
281+
document_id=None,
282+
check_corpus_deletion=True,
283+
)
284+
query = VectorSearchQuery(
285+
query_embedding=_constant_vector(384, 0.5), similarity_top_k=10
286+
)
287+
results = store.search(query)
288+
returned_ids = {r.annotation.id for r in results}
289+
290+
self.assertNotIn(
291+
self.struct_a.id,
292+
returned_ids,
293+
"Structural annotation whose only DocumentPath is deleted "
294+
"leaked through the deletion-aware corpus-only path",
295+
)
296+
297+
def test_deletion_aware_path_partial_deletion_keeps_active_drops_deleted(
298+
self,
299+
) -> None:
300+
"""Partial deletion: with two docs in one corpus, only the active stays.
301+
302+
The previous test stresses the early-exit branch (every doc deleted
303+
→ ``has_active_docs`` is False → ``Annotation.none()``). This test
304+
stresses the actual subquery filter:
305+
306+
active_filters &= Q(document_id__in=active_doc_ids_qs) | Q(
307+
structural=True, structural_set_id__in=active_struct_set_ids
308+
)
309+
310+
With one active document and one deleted document in the same
311+
corpus, ``active_doc_ids_qs`` is non-empty but partial. The
312+
deleted document's structural annotation must be excluded while
313+
the active document's must come through. This is the scenario
314+
most likely to regress silently if the OR clause is rebuilt or
315+
the subquery is rewritten incorrectly.
316+
"""
317+
# Add a second document to corpus_a with its own structural set,
318+
# then mark its DocumentPath deleted while doc_a's stays active.
319+
deleted_set = StructuralAnnotationSet.objects.create(
320+
content_hash=hashlib.sha256(b"corpus-a-deleted-doc").hexdigest(),
321+
creator=self.user,
322+
parser_name="TestParser",
323+
parser_version="1.0",
324+
)
325+
deleted_doc = Document.objects.create(
326+
title="Doc A2 (deleted)",
327+
creator=self.user,
328+
is_public=True,
329+
structural_annotation_set=deleted_set,
330+
)
331+
DocumentPath.objects.create(
332+
document=deleted_doc,
333+
corpus=self.corpus_a,
334+
path="/doc_a2.pdf",
335+
version_number=1,
336+
is_current=True,
337+
is_deleted=True,
338+
creator=self.user,
339+
)
340+
deleted_struct = Annotation.objects.create(
341+
structural_set=deleted_set,
342+
annotation_label=self.label,
343+
creator=self.user,
344+
raw_text="Deleted-doc structural paragraph",
345+
structural=True,
346+
page=1,
347+
)
348+
deleted_struct.add_embedding(
349+
get_default_embedder_path(), _constant_vector(384, 0.15)
350+
)
351+
352+
store = CoreAnnotationVectorStore(
353+
user_id=self.user.id,
354+
corpus_id=self.corpus_a.id,
355+
document_id=None,
356+
check_corpus_deletion=True,
357+
)
358+
query = VectorSearchQuery(
359+
query_embedding=_constant_vector(384, 0.5), similarity_top_k=10
360+
)
361+
results = store.search(query)
362+
returned_ids = {r.annotation.id for r in results}
363+
364+
self.assertIn(
365+
self.struct_a.id,
366+
returned_ids,
367+
"Active document's structural annotation was excluded by the "
368+
"partial-deletion subquery filter",
369+
)
370+
self.assertNotIn(
371+
deleted_struct.id,
372+
returned_ids,
373+
"Deleted document's structural annotation leaked through the "
374+
"partial-deletion subquery filter",
375+
)
376+
263377
def test_document_scoped_search_still_returns_structural(self) -> None:
264378
"""Document-scoped retrieval is the existing well-tested path.
265379

0 commit comments

Comments
 (0)