|
9 | 9 | from django.contrib.auth import get_user_model |
10 | 10 | from django.db.models import Q, QuerySet |
11 | 11 |
|
12 | | -from opencontractserver.annotations.models import Annotation |
| 12 | +from opencontractserver.annotations.models import Annotation, StructuralAnnotationSet |
13 | 13 | from opencontractserver.constants.search import ( |
14 | 14 | FTS_CONFIG, |
15 | 15 | HYBRID_SEARCH_OVERSAMPLE_FACTOR, |
@@ -208,7 +208,7 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]: |
208 | 208 | # enumeration attacks. |
209 | 209 | # ------------------------------------------------------------------------- |
210 | 210 | from opencontractserver.corpuses.models import Corpus |
211 | | - from opencontractserver.documents.models import Document |
| 211 | + from opencontractserver.documents.models import Document, DocumentPath |
212 | 212 |
|
213 | 213 | user = None |
214 | 214 | if self.user_id: |
@@ -289,22 +289,47 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]: |
289 | 289 |
|
290 | 290 | # Check for deleted documents in corpus |
291 | 291 | if self.check_corpus_deletion and self.corpus_id and not self.document_id: |
292 | | - # Note: sync_to_async already imported at module level |
293 | | - from opencontractserver.documents.models import DocumentPath |
294 | | - |
295 | | - # Get documents with active (non-deleted) paths in corpus |
296 | | - active_doc_ids = await sync_to_async( |
297 | | - lambda: list( |
298 | | - DocumentPath.objects.filter( |
299 | | - corpus_id=self.corpus_id, is_current=True, is_deleted=False |
300 | | - ).values_list("document_id", flat=True) |
| 292 | + # Lazy subquery — never round-trips through Python, so the |
| 293 | + # generated SQL stays a single statement with a real subquery |
| 294 | + # rather than a giant ``IN (val, val, ...)`` literal even for |
| 295 | + # corpora with tens of thousands of documents. |
| 296 | + active_doc_ids_qs = ( |
| 297 | + DocumentPath.objects.filter( |
| 298 | + corpus_id=self.corpus_id, is_current=True, is_deleted=False |
| 299 | + ) |
| 300 | + .values("document_id") |
| 301 | + .distinct() |
| 302 | + ) |
| 303 | + # Trade-off: this ``EXISTS`` adds one extra round-trip on the |
| 304 | + # happy path, but lets us short-circuit the entire vector search |
| 305 | + # for empty/all-deleted corpora (returning ``Annotation.none()`` |
| 306 | + # spares a downstream HNSW probe and keeps the existing |
| 307 | + # operational warning). For corpora with at least one active |
| 308 | + # document the cost is a single boolean SELECT and is dwarfed |
| 309 | + # by the main query. Removing the check would also remove the |
| 310 | + # debug log of the active-doc count that the materialised list |
| 311 | + # used to provide. |
| 312 | + has_active_docs = await sync_to_async(active_doc_ids_qs.exists)() |
| 313 | + |
| 314 | + if has_active_docs: |
| 315 | + # Two annotation shapes pass this filter: |
| 316 | + # 1. Direct: Annotation.document_id is in the active set. |
| 317 | + # 2. Structural: Annotation.document_id is NULL but the |
| 318 | + # structural_set links to one of those active documents. |
| 319 | + # ``document_id__in`` cannot match NULL on its own, so |
| 320 | + # structural annotations need an explicit OR clause — |
| 321 | + # otherwise every parser-produced structural row is |
| 322 | + # silently dropped on this corpus-wide path. |
| 323 | + active_struct_set_ids = ( |
| 324 | + StructuralAnnotationSet.objects.filter( |
| 325 | + documents__in=active_doc_ids_qs |
| 326 | + ) |
| 327 | + .values("id") |
| 328 | + .distinct() |
| 329 | + ) |
| 330 | + active_filters &= Q(document_id__in=active_doc_ids_qs) | Q( |
| 331 | + structural=True, structural_set_id__in=active_struct_set_ids |
301 | 332 | ) |
302 | | - )() |
303 | | - |
304 | | - if active_doc_ids: |
305 | | - # Ensure we only search documents with active paths |
306 | | - active_filters &= Q(document_id__in=active_doc_ids) |
307 | | - _logger.debug(f"Found {len(active_doc_ids)} active documents in corpus") |
308 | 333 | else: |
309 | 334 | _logger.warning(f"No active documents found in corpus {self.corpus_id}") |
310 | 335 | return Annotation.objects.none() |
@@ -361,11 +386,48 @@ async def _build_base_queryset(self) -> QuerySet[Annotation]: |
361 | 386 | # --- Corpus-only context (no document_id specified) --- |
362 | 387 | _logger.debug(f"Corpus-only context: corpus_id={self.corpus_id}") |
363 | 388 | # Annotations must be either: |
364 | | - # a) Structural (their Annotation.corpus_id might be null, included by nature) |
365 | | - # b) Non-structural AND directly linked to this corpus via Annotation.corpus_id. |
366 | | - active_filters &= Q(structural=True) | Q( |
367 | | - structural=False, corpus_id=self.corpus_id |
| 389 | + # a) Structural — restricted to ``structural_set``s reachable from |
| 390 | + # a document in this corpus that is *visible to the requesting |
| 391 | + # user*. This collapses two checks into one filter: |
| 392 | + # |
| 393 | + # • Corpus boundary (was missing): a parser-produced |
| 394 | + # structural annotation has ``document_id = corpus_id = |
| 395 | + # NULL``; its corpus membership is only knowable via |
| 396 | + # ``structural_set → Document.structural_annotation_set |
| 397 | + # (reverse FK) → DocumentPath.corpus_id``. |
| 398 | + # • Per-document permission (was bypassed): structural rows |
| 399 | + # previously matched ``Q(structural=True)`` unconditionally, |
| 400 | + # so a user with permission on the *corpus* could be served |
| 401 | + # rows whose underlying documents they have no permission |
| 402 | + # on. |
| 403 | + # |
| 404 | + # Without this, the corpus-wide path leaked structural rows from |
| 405 | + # every other corpus in the database (and from inaccessible |
| 406 | + # documents within the same corpus). |
| 407 | + # b) Non-structural — directly linked to this corpus via |
| 408 | + # ``Annotation.corpus_id``. Per-document visibility for these |
| 409 | + # rows is enforced by the visibility filter further below |
| 410 | + # plus the upfront IDOR check on ``corpus_id``. |
| 411 | + # Both subqueries below stay lazy so the SQL planner sees a |
| 412 | + # nested ``IN (SELECT ...)`` rather than a Python-materialised |
| 413 | + # ``IN (val, val, ...)`` literal — important for corpora with |
| 414 | + # tens of thousands of documents. |
| 415 | + visible_corpus_doc_ids_qs = ( |
| 416 | + Document.objects.visible_to_user(user) |
| 417 | + .filter(path_records__corpus_id=self.corpus_id) |
| 418 | + .values("id") |
| 419 | + .distinct() |
| 420 | + ) |
| 421 | + visible_corpus_set_ids = ( |
| 422 | + StructuralAnnotationSet.objects.filter( |
| 423 | + documents__in=visible_corpus_doc_ids_qs |
| 424 | + ) |
| 425 | + .values("id") |
| 426 | + .distinct() |
368 | 427 | ) |
| 428 | + active_filters &= Q( |
| 429 | + structural=True, structural_set_id__in=visible_corpus_set_ids |
| 430 | + ) | Q(structural=False, corpus_id=self.corpus_id) |
369 | 431 |
|
370 | 432 | # ------------------------------------------------------------------ # |
371 | 433 | # Apply accumulated document/corpus scope filters if any were added |
|
0 commit comments