99from django .contrib .auth import get_user_model
1010from django .db .models import Q , QuerySet
1111
12- from opencontractserver .annotations .models import Annotation
12+ from opencontractserver .annotations .models import Annotation , StructuralAnnotationSet
1313from opencontractserver .constants .search import (
1414 FTS_CONFIG ,
1515 HYBRID_SEARCH_OVERSAMPLE_FACTOR ,
@@ -208,7 +208,7 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
208208 # enumeration attacks.
209209 # -------------------------------------------------------------------------
210210 from opencontractserver .corpuses .models import Corpus
211- from opencontractserver .documents .models import Document
211+ from opencontractserver .documents .models import Document , DocumentPath
212212
213213 user = None
214214 if self .user_id :
@@ -289,20 +289,29 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
289289
290290 # Check for deleted documents in corpus
291291 if self .check_corpus_deletion and self .corpus_id and not self .document_id :
292- # Note: sync_to_async already imported at module level
293- from opencontractserver .annotations .models import StructuralAnnotationSet
294- from opencontractserver .documents .models import DocumentPath
295-
296- # Get documents with active (non-deleted) paths in corpus
297- active_doc_ids = await sync_to_async (
298- lambda : list (
299- DocumentPath .objects .filter (
300- corpus_id = self .corpus_id , is_current = True , is_deleted = False
301- ).values_list ("document_id" , flat = True )
292+ # Lazy subquery — never round-trips through Python, so the
293+ # generated SQL stays a single statement with a real subquery
294+ # rather than a giant ``IN (val, val, ...)`` literal even for
295+ # corpora with tens of thousands of documents.
296+ active_doc_ids_qs = (
297+ DocumentPath .objects .filter (
298+ corpus_id = self .corpus_id , is_current = True , is_deleted = False
302299 )
303- )()
304-
305- if active_doc_ids :
300+ .values ("document_id" )
301+ .distinct ()
302+ )
303+ # Trade-off: this ``EXISTS`` adds one extra round-trip on the
304+ # happy path, but lets us short-circuit the entire vector search
305+ # for empty/all-deleted corpora (returning ``Annotation.objects.none()``
306+ # spares a downstream HNSW probe and keeps the existing
307+ # operational warning). For corpora with at least one active
308+ # document the cost is a single boolean SELECT and is dwarfed
309+ # by the main query. The debug log of the active-doc count is
310+ # dropped: it relied on the materialised list, and an ``EXISTS``
311+ # probe returns only a boolean, not a count.
312+ has_active_docs = await sync_to_async (active_doc_ids_qs .exists )()
313+
314+ if has_active_docs :
306315 # Two annotation shapes pass this filter:
307316 # 1. Direct: Annotation.document_id is in the active set.
308317 # 2. Structural: Annotation.document_id is NULL but the
@@ -311,13 +320,16 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
311320 # structural annotations need an explicit OR clause —
312321 # otherwise every parser-produced structural row is
313322 # silently dropped on this corpus-wide path.
314- active_struct_set_ids = StructuralAnnotationSet .objects .filter (
315- documents__in = active_doc_ids
316- ).values ("id" )
317- active_filters &= Q (document_id__in = active_doc_ids ) | Q (
323+ active_struct_set_ids = (
324+ StructuralAnnotationSet .objects .filter (
325+ documents__in = active_doc_ids_qs
326+ )
327+ .values ("id" )
328+ .distinct ()
329+ )
330+ active_filters &= Q (document_id__in = active_doc_ids_qs ) | Q (
318331 structural = True , structural_set_id__in = active_struct_set_ids
319332 )
320- _logger .debug (f"Found { len (active_doc_ids )} active documents in corpus" )
321333 else :
322334 _logger .warning (f"No active documents found in corpus { self .corpus_id } " )
323335 return Annotation .objects .none ()
@@ -396,21 +408,23 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]:
396408 # ``Annotation.corpus_id``. Per-document visibility for these
397409 # rows is enforced by the visibility filter further below
398410 # plus the upfront IDOR check on ``corpus_id``.
399- # Document is imported earlier in this method (line 211); reusing
400- # the local binding avoids an F811 redefinition warning.
401- from opencontractserver .annotations .models import StructuralAnnotationSet
402-
403- visible_corpus_doc_ids = await sync_to_async (
404- lambda : list (
405- Document .objects .visible_to_user (user )
406- .filter (path_records__corpus_id = self .corpus_id )
407- .values_list ("id" , flat = True )
408- .distinct ()
411+ # Both subqueries below stay lazy so the SQL planner sees a
412+ # nested ``IN (SELECT ...)`` rather than a Python-materialised
413+ # ``IN (val, val, ...)`` literal — important for corpora with
414+ # tens of thousands of documents.
415+ visible_corpus_doc_ids_qs = (
416+ Document .objects .visible_to_user (user )
417+ .filter (path_records__corpus_id = self .corpus_id )
418+ .values ("id" )
419+ .distinct ()
420+ )
421+ visible_corpus_set_ids = (
422+ StructuralAnnotationSet .objects .filter (
423+ documents__in = visible_corpus_doc_ids_qs
409424 )
410- )()
411- visible_corpus_set_ids = StructuralAnnotationSet .objects .filter (
412- documents__in = visible_corpus_doc_ids
413- ).values ("id" )
425+ .values ("id" )
426+ .distinct ()
427+ )
414428 active_filters &= Q (
415429 structural = True , structural_set_id__in = visible_corpus_set_ids
416430 ) | Q (structural = False , corpus_id = self .corpus_id )
0 commit comments