|
| 1 | +""" |
| 2 | +Backfill any legacy Embedding rows with NULL ``embedder_path`` and then |
| 3 | +tighten the column to ``NOT NULL``. |
| 4 | +
|
| 5 | +Context (issue #1357): ``Embedding.embedder_path`` was declared |
| 6 | +``null=True, blank=True`` on the Django field while its Python annotation |
| 7 | +claimed ``str``. The partial unique constraints added in migration 0059 |
| 8 | +reference ``embedder_path`` with ``condition=Q(<parent>__isnull=False)``, |
| 9 | +meaning any row where ``embedder_path`` is NULL silently bypasses duplicate |
| 10 | +prevention. Every production creation path (store_embedding, add_embedding, |
| 11 | +worker_uploads) already supplies a concrete value, so we enforce the |
| 12 | +invariant at the DB level to match. |
| 13 | +
|
| 14 | +Backfill strategy: |
| 15 | + 1. For each NULL-embedder_path row, set ``embedder_path`` to |
| 16 | + ``settings.DEFAULT_EMBEDDER``. |
| 17 | + 2. If that assignment would collide with an existing (embedder_path, |
| 18 | + parent) row under a partial unique constraint, delete the NULL row |
| 19 | + instead — it cannot be matched by any query (all call sites filter |
| 20 | + on a concrete ``embedder_path``) so it was effectively dead data. |
| 21 | +""" |
| 22 | + |
| 23 | +import logging |
| 24 | + |
| 25 | +from django.conf import settings |
| 26 | +from django.db import IntegrityError, migrations, models, transaction |
| 27 | + |
| 28 | +logger = logging.getLogger(__name__) |
| 29 | + |
| 30 | + |
def backfill_null_embedder_paths(apps, schema_editor):
    """Give every Embedding row with a NULL ``embedder_path`` a concrete value.

    Each NULL row is updated to ``settings.DEFAULT_EMBEDDER``. If that write
    raises ``IntegrityError`` (the backfilled value would duplicate an existing
    row under a partial unique constraint), the NULL row is deleted instead.

    Args:
        apps: Historical app registry passed in by ``migrations.RunPython``;
            used to look up the ``annotations.Embedding`` model.
        schema_editor: Passed in by ``migrations.RunPython``; not used here.

    Raises:
        ValueError: If NULL rows exist but ``settings.DEFAULT_EMBEDDER`` is
            missing or empty, so there is no value to backfill with.
    """
    Embedding = apps.get_model("annotations", "Embedding")

    total = Embedding.objects.filter(embedder_path__isnull=True).count()
    if total == 0:
        logger.info("No Embedding rows with NULL embedder_path — nothing to backfill.")
        return

    # Refuse to run if there's no default to backfill with — silently deleting
    # embedding rows because of a misconfigured env var would be irreversible.
    default_embedder_path = getattr(settings, "DEFAULT_EMBEDDER", "") or ""
    if not default_embedder_path:
        raise ValueError(
            f"settings.DEFAULT_EMBEDDER is empty but {total} Embedding row(s) "
            "have NULL embedder_path. Set DEFAULT_EMBEDDER (or manually clean "
            "up the NULL rows) before running this migration."
        )

    backfilled = 0
    deleted = 0

    # Keyset pagination: each chunk is a fresh query for rows that still match
    # the NULL predicate AND have pk > the previous batch's max. Every visited
    # row is either updated or deleted, so the result set shrinks as we go;
    # re-querying by pk keeps the walk from skipping rows.
    chunk_size = 500
    last_pk = 0
    while True:
        batch = list(
            Embedding.objects.filter(
                embedder_path__isnull=True, pk__gt=last_pk
            ).order_by("pk")[:chunk_size]
        )
        if not batch:
            break

        # Record the cursor BEFORE touching any row: Model.delete() resets the
        # in-memory instance's pk to None, so reading batch[-1].pk after the
        # loop would yield None whenever the last row of the batch was dropped,
        # and the next ``pk__gt=None`` filter would fail.
        last_pk = batch[-1].pk

        for emb in batch:
            emb.embedder_path = default_embedder_path
            try:
                # Per-row atomic block so a constraint violation rolls back
                # only this row's write and the connection stays usable.
                with transaction.atomic():
                    emb.save(update_fields=["embedder_path"])
                backfilled += 1
            except IntegrityError:
                # A (default_embedder_path, parent) row already exists and is
                # covered by the partial unique constraint. The legacy NULL row
                # cannot be queried (no call site filters on NULL), so dropping
                # it is a lossless cleanup.
                logger.info(
                    "Dropping NULL-embedder_path Embedding id=%s: backfill to %r "
                    "would duplicate an existing row under the partial unique "
                    "constraint.",
                    emb.pk,
                    default_embedder_path,
                )
                emb.delete()
                deleted += 1

    if backfilled + deleted != total:
        logger.warning(
            "Embedding.embedder_path backfill: processed %s != initial NULL count %s "
            "(backfilled=%s, deleted=%s). Some rows may have been added/removed by "
            "concurrent traffic during the migration.",
            backfilled + deleted,
            total,
            backfilled,
            deleted,
        )
    logger.info(
        "Embedding.embedder_path backfill complete: backfilled=%s, deleted=%s, "
        "initial_null_count=%s.",
        backfilled,
        deleted,
        total,
    )
| 106 | + |
| 107 | + |
def reverse_backfill(apps, schema_editor):
    """Do nothing when the migration is reversed.

    Rows deleted by the forward backfill cannot be restored, and rows that
    were backfilled cannot be told apart from rows whose ``embedder_path``
    has always been ``settings.DEFAULT_EMBEDDER``, so there is nothing that
    can safely be undone.
    """
    return None
| 112 | + |
| 113 | + |
class Migration(migrations.Migration):
    # Run without a single migration-wide transaction.
    # NOTE(review): presumably so each row's ``transaction.atomic()`` block in
    # the backfill commits or rolls back on its own — confirm.
    atomic = False

    dependencies = [("annotations", "0067_merge_20260316_0312")]

    operations = [
        # Data step first: the column can only lose its nullability once no
        # NULL values remain.
        migrations.RunPython(
            code=backfill_null_embedder_paths,
            reverse_code=reverse_backfill,
        ),
        # Schema step: redeclare the field without ``null=True``.
        migrations.AlterField(
            model_name="embedding",
            name="embedder_path",
            field=models.CharField(
                max_length=256,
                help_text=(
                    "Identifier for the embedding model or pipeline used "
                    "(e.g. 'openai/text-embedding-ada-002')."
                ),
            ),
        ),
    ]
0 commit comments