3131def backfill_null_embedder_paths (apps , schema_editor ):
3232 Embedding = apps .get_model ("annotations" , "Embedding" )
3333
34- null_rows = Embedding .objects .filter (embedder_path__isnull = True )
35- total = null_rows .count ()
34+ total = Embedding .objects .filter (embedder_path__isnull = True ).count ()
3635 if total == 0 :
3736 logger .info ("No Embedding rows with NULL embedder_path — nothing to backfill." )
3837 return
@@ -50,31 +49,56 @@ def backfill_null_embedder_paths(apps, schema_editor):
5049 backfilled = 0
5150 deleted = 0
5251
53- # Use .iterator() to avoid loading the full set into memory on large tables.
54- for emb in null_rows .iterator (chunk_size = 500 ):
55- emb .embedder_path = default_embedder_path
56- try :
57- with transaction .atomic ():
58- emb .save (update_fields = ["embedder_path" ])
59- backfilled += 1
60- except IntegrityError :
61- # A (default_embedder_path, parent) row already exists and is
62- # covered by the partial unique constraint. The legacy NULL row
63- # cannot be queried (no call site filters on NULL), so dropping
64- # it is a lossless cleanup.
65- logger .info (
66- "Dropping NULL-embedder_path Embedding id=%s: backfill to %r "
67- "would duplicate an existing row under the partial unique "
68- "constraint." ,
69- emb .pk ,
70- default_embedder_path ,
71- )
72- emb .delete ()
73- deleted += 1
74-
52+ # Keyset pagination: re-query each chunk for rows that still match the
53+ # NULL predicate AND have pk > the previous batch's max. Using
54+ # `.iterator(chunk_size=N)` here would be unsafe because we mutate or
55+ # delete every row we visit, and OFFSET-based chunking against a
56+ # shrinking result set would silently skip rows.
57+ chunk_size = 500
58+ last_pk = 0
59+ while True :
60+ batch = list (
61+ Embedding .objects .filter (
62+ embedder_path__isnull = True , pk__gt = last_pk
63+ ).order_by ("pk" )[:chunk_size ]
64+ )
65+ if not batch :
66+ break
67+ for emb in batch :
68+ emb .embedder_path = default_embedder_path
69+ try :
70+ with transaction .atomic ():
71+ emb .save (update_fields = ["embedder_path" ])
72+ backfilled += 1
73+ except IntegrityError :
74+ # A (default_embedder_path, parent) row already exists and is
75+ # covered by the partial unique constraint. The legacy NULL row
76+ # cannot be queried (no call site filters on NULL), so dropping
77+ # it is a lossless cleanup.
78+ logger .info (
79+ "Dropping NULL-embedder_path Embedding id=%s: backfill to %r "
80+ "would duplicate an existing row under the partial unique "
81+ "constraint." ,
82+ emb .pk ,
83+ default_embedder_path ,
84+ )
85+ emb .delete ()
86+ deleted += 1
87+ last_pk = batch [- 1 ].pk
88+
89+ if backfilled + deleted != total :
90+ logger .warning (
91+ "Embedding.embedder_path backfill: processed %s != initial NULL count %s "
92+ "(backfilled=%s, deleted=%s). Some rows may have been added/removed by "
93+ "concurrent traffic during the migration." ,
94+ backfilled + deleted ,
95+ total ,
96+ backfilled ,
97+ deleted ,
98+ )
7599 logger .info (
76100 "Embedding.embedder_path backfill complete: backfilled=%s, deleted=%s, "
77- "total =%s." ,
101+ "initial_null_count =%s." ,
78102 backfilled ,
79103 deleted ,
80104 total ,
0 commit comments