Skip to content

Commit d503dda

Browse files
committed
Address review feedback: keyset pagination + sanity-check log
- Switch the backfill loop from `.iterator(chunk_size=500)` to keyset pagination (re-query each chunk filtered by `embedder_path__isnull=True` AND `pk > last_pk`). The previous `.iterator()` form is unsafe because every row we visit is mutated or deleted, and OFFSET-based chunking against a shrinking NULL result set would silently skip rows.
- Add a sanity-check warning if `backfilled + deleted != initial NULL count`, in case any row took an unexpected exception path or new NULL rows arrived during the migration.
1 parent 070e530 commit d503dda

1 file changed

Lines changed: 49 additions & 25 deletions

File tree

opencontractserver/annotations/migrations/0068_enforce_embedder_path_not_null.py

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@
3131
def backfill_null_embedder_paths(apps, schema_editor):
3232
Embedding = apps.get_model("annotations", "Embedding")
3333

34-
null_rows = Embedding.objects.filter(embedder_path__isnull=True)
35-
total = null_rows.count()
34+
total = Embedding.objects.filter(embedder_path__isnull=True).count()
3635
if total == 0:
3736
logger.info("No Embedding rows with NULL embedder_path — nothing to backfill.")
3837
return
@@ -50,31 +49,56 @@ def backfill_null_embedder_paths(apps, schema_editor):
5049
backfilled = 0
5150
deleted = 0
5251

53-
# Use .iterator() to avoid loading the full set into memory on large tables.
54-
for emb in null_rows.iterator(chunk_size=500):
55-
emb.embedder_path = default_embedder_path
56-
try:
57-
with transaction.atomic():
58-
emb.save(update_fields=["embedder_path"])
59-
backfilled += 1
60-
except IntegrityError:
61-
# A (default_embedder_path, parent) row already exists and is
62-
# covered by the partial unique constraint. The legacy NULL row
63-
# cannot be queried (no call site filters on NULL), so dropping
64-
# it is a lossless cleanup.
65-
logger.info(
66-
"Dropping NULL-embedder_path Embedding id=%s: backfill to %r "
67-
"would duplicate an existing row under the partial unique "
68-
"constraint.",
69-
emb.pk,
70-
default_embedder_path,
71-
)
72-
emb.delete()
73-
deleted += 1
74-
52+
# Keyset pagination: re-query each chunk for rows that still match the
53+
# NULL predicate AND have pk > the previous batch's max. Using
54+
# `.iterator(chunk_size=N)` here would be unsafe because we mutate or
55+
# delete every row we visit, and OFFSET-based chunking against a
56+
# shrinking result set would silently skip rows.
57+
chunk_size = 500
58+
last_pk = 0
59+
while True:
60+
batch = list(
61+
Embedding.objects.filter(
62+
embedder_path__isnull=True, pk__gt=last_pk
63+
).order_by("pk")[:chunk_size]
64+
)
65+
if not batch:
66+
break
67+
for emb in batch:
68+
emb.embedder_path = default_embedder_path
69+
try:
70+
with transaction.atomic():
71+
emb.save(update_fields=["embedder_path"])
72+
backfilled += 1
73+
except IntegrityError:
74+
# A (default_embedder_path, parent) row already exists and is
75+
# covered by the partial unique constraint. The legacy NULL row
76+
# cannot be queried (no call site filters on NULL), so dropping
77+
# it is a lossless cleanup.
78+
logger.info(
79+
"Dropping NULL-embedder_path Embedding id=%s: backfill to %r "
80+
"would duplicate an existing row under the partial unique "
81+
"constraint.",
82+
emb.pk,
83+
default_embedder_path,
84+
)
85+
emb.delete()
86+
deleted += 1
87+
last_pk = batch[-1].pk
88+
89+
if backfilled + deleted != total:
90+
logger.warning(
91+
"Embedding.embedder_path backfill: processed %s != initial NULL count %s "
92+
"(backfilled=%s, deleted=%s). Some rows may have been added/removed by "
93+
"concurrent traffic during the migration.",
94+
backfilled + deleted,
95+
total,
96+
backfilled,
97+
deleted,
98+
)
7599
logger.info(
76100
"Embedding.embedder_path backfill complete: backfilled=%s, deleted=%s, "
77-
"total=%s.",
101+
"initial_null_count=%s.",
78102
backfilled,
79103
deleted,
80104
total,

0 commit comments

Comments (0)