arabold · arabold · May 18, 2026 · May 18, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/db/migrations/002-normalize-library-table.sql b/db/migrations/002-normalize-library-table.sql
@@ -1,5 +1,6 @@
 -- Migration: Normalize schema by introducing libraries and versions tables
 
+-- @migration-step create normalized tables
 -- 1. Create libraries table
 CREATE TABLE IF NOT EXISTS libraries (
   id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -16,10 +17,12 @@ CREATE TABLE IF NOT EXISTS versions (
   UNIQUE(library_id, name) -- NOTE: SQLite treats NULLs as distinct in UNIQUE, so NULL names are not deduplicated
 );
 
+-- @migration-step add document foreign keys
 -- 3. Add foreign key columns to documents
 ALTER TABLE documents ADD COLUMN library_id INTEGER REFERENCES libraries(id);
 ALTER TABLE documents ADD COLUMN version_id INTEGER REFERENCES versions(id);
 
+-- @migration-step populate libraries and versions
 -- 4. Populate libraries table from existing documents
 INSERT OR IGNORE INTO libraries (name)
 SELECT DISTINCT library FROM documents;
@@ -32,6 +35,7 @@ SELECT DISTINCT
 FROM documents d
 JOIN libraries l ON l.name = d.library;
 
+-- @migration-step backfill document references
 -- 6. Update documents with foreign key references
 UPDATE documents
 SET library_id = (SELECT id FROM libraries WHERE libraries.name = documents.library),
@@ -42,6 +46,7 @@ SET library_id = (SELECT id FROM libraries WHERE libraries.name = documents.libr
       AND COALESCE(v.name, '') = COALESCE(documents.version, '')
     );
 
+-- @migration-step create normalization indexes
 -- 7. Add indexes for performance
 CREATE INDEX IF NOT EXISTS idx_documents_library_id ON documents(library_id);
 CREATE INDEX IF NOT EXISTS idx_documents_version_id ON documents(version_id);

diff --git a/db/migrations/003-normalize-vector-table.sql b/db/migrations/003-normalize-vector-table.sql
@@ -1,9 +1,11 @@
 -- Migration: Normalize documents_vec table to use library_id and version_id
 -- Optimized for large datasets (1GB+)
 
+-- @migration-step prepare vector join index
 -- 1. Ensure optimal indexes for the migration JOIN
 CREATE INDEX IF NOT EXISTS idx_documents_id_lib_ver ON documents(id, library_id, version_id);
 
+-- @migration-step preserve vectors with normalized keys
 -- 2. Create temporary table to store vector data with foreign key IDs
 CREATE TEMPORARY TABLE temp_vector_migration AS
 SELECT 
@@ -14,6 +16,7 @@ SELECT
 FROM documents_vec dv
 JOIN documents d ON dv.rowid = d.id;
 
+-- @migration-step rebuild vector table
 -- 3. Drop the old virtual table
 DROP TABLE documents_vec;
 
@@ -24,10 +27,12 @@ CREATE VIRTUAL TABLE documents_vec USING vec0(
   embedding FLOAT[1536]
 );
 
+-- @migration-step restore normalized vectors
 -- 5. Restore vector data using foreign key IDs
 INSERT INTO documents_vec (rowid, library_id, version_id, embedding)
 SELECT rowid, library_id, version_id, embedding
 FROM temp_vector_migration;
 
+-- @migration-step cleanup vector staging data
 -- 6. Clean up temporary table
 DROP TABLE temp_vector_migration;
diff --git a/db/migrations/004-complete-normalization.sql b/db/migrations/004-complete-normalization.sql
@@ -2,6 +2,7 @@
 -- This migration finalizes the schema normalization process
 -- Note: Must recreate table because obsolete columns are part of UNIQUE constraint
 
+-- @migration-step create normalized documents table
 -- 1. Create new documents table with only foreign key references
 CREATE TABLE documents_new (
   id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -15,22 +16,26 @@ CREATE TABLE documents_new (
   UNIQUE(url, library_id, version_id, sort_order)
 );
 
+-- @migration-step copy normalized documents
 -- 2. Copy data from old table (excluding obsolete library and version columns)
 INSERT INTO documents_new (id, library_id, version_id, url, content, metadata, sort_order, indexed_at)
 SELECT id, library_id, version_id, url, content, metadata, sort_order, indexed_at
 FROM documents;
 
+-- @migration-step replace documents table
 -- 3. Drop the old documents table
 DROP TABLE documents;
 
 -- 4. Rename the new table to documents
 ALTER TABLE documents_new RENAME TO documents;
 
+-- @migration-step recreate document indexes
 -- 5. Recreate indexes that were lost when dropping the table
 CREATE INDEX IF NOT EXISTS idx_documents_library_id ON documents(library_id);
 CREATE INDEX IF NOT EXISTS idx_documents_version_id ON documents(version_id);
 CREATE INDEX IF NOT EXISTS idx_documents_lib_ver_id ON documents(library_id, version_id);
 
+-- @migration-step recreate fts schema
 -- 6. Recreate FTS5 virtual table (gets dropped when main table is dropped)
 -- Using external content approach - FTS index is maintained entirely through triggers
 CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
@@ -41,6 +46,7 @@ CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
   tokenize='porter unicode61'
 );
 
+-- @migration-step recreate fts triggers
 -- 7. Recreate FTS triggers to maintain the index
 -- Note: Triggers work directly with documents table, no JOIN needed for FTS content
 CREATE TRIGGER IF NOT EXISTS documents_fts_after_delete AFTER DELETE ON documents BEGIN
@@ -60,6 +66,7 @@ CREATE TRIGGER IF NOT EXISTS documents_fts_after_insert AFTER INSERT ON document
   VALUES(new.id, new.content, json_extract(new.metadata, '$.title'), new.url, json_extract(new.metadata, '$.path'));
 END;
 
+-- @migration-step rebuild fts index
 -- 8. Rebuild FTS index from existing documents data
 -- Manually populate the FTS index since we're using external content approach
 INSERT INTO documents_fts(rowid, content, title, url, path)

diff --git a/db/migrations/007-dedupe-unversioned-versions.sql b/db/migrations/007-dedupe-unversioned-versions.sql
@@ -14,13 +14,15 @@
 -- across multiple subsequent statements. All TEMP objects are connection-scoped
 -- and vanish automatically; safe for repeated runs (we DROP IF EXISTS first).
 
+-- @migration-step collect null-name versions
 DROP TABLE IF EXISTS temp_null_versions;
 CREATE TEMP TABLE temp_null_versions AS
 SELECT v.id, v.library_id,
        (SELECT COUNT(*) FROM documents d WHERE d.version_id = v.id) AS doc_count
 FROM versions v
 WHERE v.name IS NULL;
 
+-- @migration-step choose canonical versions
 -- Build canonical mapping per library (one row per library_id)
 DROP TABLE IF EXISTS temp_canonical_versions;
 CREATE TEMP TABLE temp_canonical_versions AS
@@ -40,6 +42,7 @@ SELECT nv.library_id,
 FROM temp_null_versions nv
 GROUP BY nv.library_id;
 
+-- @migration-step repoint documents to canonical versions
 -- Repoint documents from non-canonical NULL-name versions
 UPDATE documents
 SET version_id = (
@@ -49,12 +52,14 @@ SET version_id = (
 WHERE version_id IN (SELECT id FROM versions WHERE name IS NULL)
   AND version_id NOT IN (SELECT keep_id FROM temp_canonical_versions);
 
+-- @migration-step remove surplus versions
 -- 3: Delete surplus NULL-name rows now unreferenced
 DELETE FROM versions
 WHERE name IS NULL
   AND id NOT IN (SELECT keep_id FROM temp_canonical_versions)
   AND (SELECT COUNT(*) FROM documents d WHERE d.version_id = versions.id) = 0;
 
+-- @migration-step normalize remaining version names
 -- 4: Normalize remaining NULL names to ''
 UPDATE versions SET name = '' WHERE name IS NULL;
 

diff --git a/db/migrations/009-add-pages-table.sql b/db/migrations/009-add-pages-table.sql
@@ -2,6 +2,7 @@
 -- This migration introduces a pages table to store page-level metadata once per URL
 -- and links document chunks to their parent pages via page_id foreign key
 
+-- @migration-step create pages table
 -- 1. Create pages table to store unique page-level metadata
 CREATE TABLE IF NOT EXISTS pages (
   id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -16,11 +17,13 @@ CREATE TABLE IF NOT EXISTS pages (
   UNIQUE(version_id, url)
 );
 
+-- @migration-step create page indexes
 -- 2. Add indexes for efficient querying
 CREATE INDEX IF NOT EXISTS idx_pages_version_id ON pages(version_id);
 CREATE INDEX IF NOT EXISTS idx_pages_url ON pages(url);
 CREATE INDEX IF NOT EXISTS idx_pages_etag ON pages(etag);
 
+-- @migration-step create page-based documents table
 -- 3. Create new documents table with page_id foreign key
 CREATE TABLE documents_new (
   id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -32,10 +35,12 @@ CREATE TABLE documents_new (
   created_at DATETIME DEFAULT CURRENT_TIMESTAMP
 );
 
+-- @migration-step create new document indexes
 -- 4. Create indexes for the new documents table
 CREATE INDEX IF NOT EXISTS idx_documents_page_id ON documents_new(page_id);
 CREATE INDEX IF NOT EXISTS idx_documents_sort_order ON documents_new(page_id, sort_order);
 
+-- @migration-step populate pages
 -- 5. Migrate data from old documents table to new structure
 -- First, populate pages table with unique page data from existing documents
 -- Group by version_id and url to ensure uniqueness, using MAX() to handle any duplicates
@@ -49,6 +54,7 @@ SELECT
 FROM documents
 GROUP BY version_id, url;
 
+-- @migration-step migrate document chunks
 -- 6. Migrate document chunks to new table structure
 -- Preserve all existing metadata except page-level fields (url, title, library, version)
 -- that are now stored in pages and versions tables
@@ -72,12 +78,14 @@ SELECT
 FROM documents d
 JOIN pages p ON d.version_id = p.version_id AND d.url = p.url;
 
+-- @migration-step replace documents table
 -- 7. Drop the old documents table
 DROP TABLE documents;
 
 -- 8. Rename the new table to documents
 ALTER TABLE documents_new RENAME TO documents;
 
+-- @migration-step replace fts schema
 -- 9. Recreate FTS5 virtual table to work with new structure
 -- Drop existing FTS table and triggers
 DROP TRIGGER IF EXISTS documents_fts_after_delete;
@@ -94,6 +102,7 @@ CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
   tokenize='porter unicode61'
 );
 
+-- @migration-step recreate fts and page triggers
 -- 10. Create new FTS triggers that join with pages table
 CREATE TRIGGER IF NOT EXISTS documents_fts_after_delete AFTER DELETE ON documents BEGIN
   DELETE FROM documents_fts WHERE rowid = old.id;
@@ -117,6 +126,7 @@ CREATE TRIGGER IF NOT EXISTS pages_updated_at_trigger AFTER UPDATE ON pages BEGI
   UPDATE pages SET updated_at = CURRENT_TIMESTAMP WHERE id = new.id;
 END;
 
+-- @migration-step rebuild fts index
 -- 12. Rebuild FTS index from migrated data
 INSERT INTO documents_fts(rowid, content, title, url, path)
 SELECT d.id, d.content, p.title, p.url, json_extract(d.metadata, '$.path')

diff --git a/db/migrations/014-rebuild-vector-partition-keys.sql b/db/migrations/014-rebuild-vector-partition-keys.sql
@@ -0,0 +1,56 @@
+-- Migration: rebuild documents_vec with sqlite-vec partition keys
+-- This enables selective KNN queries by library_id and version_id.
+
+-- @migration-step preserve existing vectors
+-- Stage vectors whose length matches the target dimension and whose document
+-- still joins to a page and version. The migration runner replaces
+-- __DOCUMENTS_VEC_DIMENSION__ with the current documents_vec dimension so
+-- databases reconciled to a custom embedding dimension keep their vector size.
+-- Staging is a regular (disk-backed) table as large indexes can exceed memory.
+DROP TABLE IF EXISTS _documents_vec_partition_migration;
+
+CREATE TABLE _documents_vec_partition_migration AS
+SELECT
+  d.id AS rowid, -- vec rowids are document ids (joined below on dv.rowid = d.id)
+  v.library_id,
+  v.id AS version_id,
+  dv.embedding
+FROM documents_vec dv
+JOIN documents d ON dv.rowid = d.id
+JOIN pages p ON d.page_id = p.id
+JOIN versions v ON p.version_id = v.id -- inner joins drop vectors with no document, page, or version
+WHERE vec_length(dv.embedding) = __DOCUMENTS_VEC_DIMENSION__;
+
+-- @migration-step rebuild vector table
+DROP TABLE documents_vec;
+
+CREATE VIRTUAL TABLE documents_vec USING vec0(
+  library_id INTEGER partition key,
+  version_id INTEGER partition key,
+  embedding FLOAT[__DOCUMENTS_VEC_DIMENSION__]
+);
+
+-- @migration-step restore existing vectors
+INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding)
+SELECT rowid, library_id, version_id, embedding
+FROM _documents_vec_partition_migration;
+
+-- @migration-step backfill missing vectors
+-- Backfill vectors stored on documents.embedding that have no documents_vec row.
+INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding)
+SELECT
+  d.id,
+  v.library_id,
+  v.id AS version_id,
+  json_extract(d.embedding, '$') AS embedding -- NOTE(review): assumes documents.embedding is JSON text, confirm
+FROM documents d
+JOIN pages p ON d.page_id = p.id
+JOIN versions v ON p.version_id = v.id
+WHERE d.embedding IS NOT NULL
+  AND vec_length(json_extract(d.embedding, '$')) = __DOCUMENTS_VEC_DIMENSION__ -- same length filter as the staging step
+  AND NOT EXISTS ( -- skip documents that already have a row in documents_vec
+    SELECT 1 FROM documents_vec existing WHERE existing.rowid = d.id
+  );
+
+-- @migration-step cleanup staging data
+DROP TABLE _documents_vec_partition_migration;
diff --git a/docs/concepts/data-storage.md b/docs/concepts/data-storage.md
@@ -315,6 +315,26 @@ Database transactions ensure consistency:
 - Batch operations for performance
 - Automatic rollback on errors
 
+### Migration Safety
+
+Schema migrations run inside an IMMEDIATE transaction with rollback-capable
+SQLite journaling enabled. The migration runner does not use
+`journal_mode = OFF` during migration execution because destructive migrations
+may need to drop and recreate tables or virtual tables, and rollback must
+preserve the pre-migration database if a later step fails.
+
+The runner still applies rollback-safe tuning for large migrations, including
+cache, memory mapping, temporary storage, and `synchronous = NORMAL` settings.
+After migrations finish, it configures production settings such as WAL mode,
+bounded WAL checkpointing, busy timeout, foreign keys, and normal synchronous
+durability.
+
+Large or destructive migrations should be validated against a copy of important
+local databases before running them against the live store. Use SQLite's backup
+API or an application-level export/copy workflow so the copy is consistent, then
+run the new version against that copy and verify expected table counts and
+search behavior before migrating high-value data.
+
 ### Concurrent Access
 
 Safe concurrent database access:

diff --git a/openspec/changes/harden-migration-workflow/.openspec.yaml b/openspec/changes/harden-migration-workflow/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-06-06