Skip to content

Commit d503dda

Browse files
committed
Address review feedback: keyset pagination + sanity-check log
- Switch the backfill loop from `.iterator(chunk_size=500)` to keyset pagination (re-query each chunk filtered by `embedder_path__isnull=True` AND `pk > last_pk`). The previous `.iterator()` form is unsafe because every row we visit is mutated or deleted, and OFFSET-based chunking against a shrinking NULL result set would silently skip rows.
- Add a sanity-check warning if `backfilled + deleted != initial NULL count`, in case any row took an unexpected exception path or new NULL rows arrived during the migration.
1 parent 070e530 commit d503dda

1 file changed

Lines changed: 49 additions & 25 deletions

File tree

opencontractserver/annotations/migrations/0068_enforce_embedder_path_not_null.py

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@
3131
def backfill_null_embedder_paths(apps, schema_editor):
3232
Embedding = apps.get_model("annotations", "Embedding")
3333

34-
null_rows = Embedding.objects.filter(embedder_path__isnull=True)
35-
total = null_rows.count()
34+
total = Embedding.objects.filter(embedder_path__isnull=True).count()
3635
if total == 0:
3736
logger.info("No Embedding rows with NULL embedder_path — nothing to backfill.")
3837
return
@@ -50,31 +49,56 @@ def backfill_null_embedder_paths(apps, schema_editor):
5049
backfilled = 0
5150
deleted = 0
5251

53-
# Use .iterator() to avoid loading the full set into memory on large tables.
54-
for emb in null_rows.iterator(chunk_size=500):
55-
emb.embedder_path = default_embedder_path
56-
try:
57-
with transaction.atomic():
58-
emb.save(update_fields=["embedder_path"])
59-
backfilled += 1
60-
except IntegrityError:
61-
# A (default_embedder_path, parent) row already exists and is
62-
# covered by the partial unique constraint. The legacy NULL row
63-
# cannot be queried (no call site filters on NULL), so dropping
64-
# it is a lossless cleanup.
65-
logger.info(
66-
"Dropping NULL-embedder_path Embedding id=%s: backfill to %r "
67-
"would duplicate an existing row under the partial unique "
68-
"constraint.",
69-
emb.pk,
70-
default_embedder_path,
71-
)
72-
emb.delete()
73-
deleted += 1
74-
52+
# Keyset pagination: re-query each chunk for rows that still match the
53+
# NULL predicate AND have pk > the previous batch's max. Using
54+
# `.iterator(chunk_size=N)` here would be unsafe because we mutate or
55+
# delete every row we visit, and OFFSET-based chunking against a
56+
# shrinking result set would silently skip rows.
57+
chunk_size = 500
58+
last_pk = 0
59+
while True:
60+
batch = list(
61+
Embedding.objects.filter(
62+
embedder_path__isnull=True, pk__gt=last_pk
63+
).order_by("pk")[:chunk_size]
64+
)
65+
if not batch:
66+
break
67+
for emb in batch:
68+
emb.embedder_path = default_embedder_path
69+
try:
70+
with transaction.atomic():
71+
emb.save(update_fields=["embedder_path"])
72+
backfilled += 1
73+
except IntegrityError:
74+
# A (default_embedder_path, parent) row already exists and is
75+
# covered by the partial unique constraint. The legacy NULL row
76+
# cannot be queried (no call site filters on NULL), so dropping
77+
# it is a lossless cleanup.
78+
logger.info(
79+
"Dropping NULL-embedder_path Embedding id=%s: backfill to %r "
80+
"would duplicate an existing row under the partial unique "
81+
"constraint.",
82+
emb.pk,
83+
default_embedder_path,
84+
)
85+
emb.delete()
86+
deleted += 1
87+
last_pk = batch[-1].pk
88+
89+
if backfilled + deleted != total:
90+
logger.warning(
91+
"Embedding.embedder_path backfill: processed %s != initial NULL count %s "
92+
"(backfilled=%s, deleted=%s). Some rows may have been added/removed by "
93+
"concurrent traffic during the migration.",
94+
backfilled + deleted,
95+
total,
96+
backfilled,
97+
deleted,
98+
)
7599
logger.info(
76100
"Embedding.embedder_path backfill complete: backfilled=%s, deleted=%s, "
77-
"total=%s.",
101+
"initial_null_count=%s.",
78102
backfilled,
79103
deleted,
80104
total,

0 commit comments

Comments (0)