Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions db/migrations/002-normalize-library-table.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-- Migration: Normalize schema by introducing libraries and versions tables

-- @migration-step create normalized tables
-- 1. Create libraries table
CREATE TABLE IF NOT EXISTS libraries (
id INTEGER PRIMARY KEY AUTOINCREMENT,
Expand All @@ -16,10 +17,12 @@ CREATE TABLE IF NOT EXISTS versions (
UNIQUE(library_id, name) -- Allows one NULL version per library
);

-- @migration-step add document foreign keys
-- 3. Give documents two nullable foreign key columns that point at the
--    normalized libraries and versions tables. No default is declared, so
--    existing rows hold NULL in both columns until they are backfilled.
ALTER TABLE documents
  ADD COLUMN library_id INTEGER REFERENCES libraries(id);
ALTER TABLE documents
  ADD COLUMN version_id INTEGER REFERENCES versions(id);

-- @migration-step populate libraries and versions
-- 4. Seed libraries with one row per distinct documents.library value.
--    OR IGNORE skips any row that a constraint on libraries rejects
--    (presumably a name that is already present - confirm against the
--    libraries table definition).
INSERT OR IGNORE INTO libraries (name)
SELECT DISTINCT d.library
FROM documents AS d;
Expand All @@ -32,6 +35,7 @@ SELECT DISTINCT
FROM documents d
JOIN libraries l ON l.name = d.library;

-- @migration-step backfill document references
-- 6. Update documents with foreign key references
UPDATE documents
SET library_id = (SELECT id FROM libraries WHERE libraries.name = documents.library),
Expand All @@ -42,6 +46,7 @@ SET library_id = (SELECT id FROM libraries WHERE libraries.name = documents.libr
AND COALESCE(v.name, '') = COALESCE(documents.version, '')
);

-- @migration-step create normalization indexes
-- 7. Index the new foreign key columns on documents to speed up lookups
--    by library_id and by version_id.
CREATE INDEX IF NOT EXISTS idx_documents_library_id
  ON documents (library_id);
CREATE INDEX IF NOT EXISTS idx_documents_version_id
  ON documents (version_id);
Expand Down
5 changes: 5 additions & 0 deletions db/migrations/003-normalize-vector-table.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
-- Migration: Normalize documents_vec table to use library_id and version_id
-- Optimized for large datasets (1GB+)

-- @migration-step prepare vector join index
-- 1. Composite index on (id, library_id, version_id) intended to support
--    the JOIN between documents_vec and documents used by this migration.
CREATE INDEX IF NOT EXISTS idx_documents_id_lib_ver
  ON documents (id, library_id, version_id);

-- @migration-step preserve vectors with normalized keys
-- 2. Create temporary table to store vector data with foreign key IDs
CREATE TEMPORARY TABLE temp_vector_migration AS
SELECT
Expand All @@ -14,6 +16,7 @@ SELECT
FROM documents_vec dv
JOIN documents d ON dv.rowid = d.id;

-- @migration-step rebuild vector table
-- 3. Drop the old virtual table
-- Vector rows that joined to documents were staged in temp_vector_migration
-- by the preceding step, and documents_vec is recreated right after this drop.
DROP TABLE documents_vec;

Expand All @@ -24,10 +27,12 @@ CREATE VIRTUAL TABLE documents_vec USING vec0(
embedding FLOAT[1536]
);

-- @migration-step restore normalized vectors
-- 5. Copy the staged vectors back into the rebuilt documents_vec table,
--    keeping each rowid and attaching the library_id and version_id that
--    were captured in temp_vector_migration.
INSERT INTO documents_vec (rowid, library_id, version_id, embedding)
SELECT
  staged.rowid,
  staged.library_id,
  staged.version_id,
  staged.embedding
FROM temp_vector_migration AS staged;

-- @migration-step cleanup vector staging data
-- 6. Clean up temporary table
-- The staged rows were already inserted back into documents_vec by the
-- preceding step, so the staging copy is no longer needed.
DROP TABLE temp_vector_migration;
7 changes: 7 additions & 0 deletions db/migrations/004-complete-normalization.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
-- This migration finalizes the schema normalization process
-- Note: Must recreate table because obsolete columns are part of UNIQUE constraint

-- @migration-step create normalized documents table
-- 1. Create new documents table with only foreign key references
CREATE TABLE documents_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
Expand All @@ -15,22 +16,26 @@ CREATE TABLE documents_new (
UNIQUE(url, library_id, version_id, sort_order)
);

-- @migration-step copy normalized documents
-- 2. Copy every row of the old documents table into documents_new.
--    Only the columns listed below are carried over, which leaves out the
--    obsolete library and version text columns. The original id is kept.
INSERT INTO documents_new (
  id,
  library_id,
  version_id,
  url,
  content,
  metadata,
  sort_order,
  indexed_at
)
SELECT
  old_doc.id,
  old_doc.library_id,
  old_doc.version_id,
  old_doc.url,
  old_doc.content,
  old_doc.metadata,
  old_doc.sort_order,
  old_doc.indexed_at
FROM documents AS old_doc;

-- @migration-step replace documents table
-- 3. Drop the old documents table
-- Its rows were already copied into documents_new by the preceding step.
DROP TABLE documents;

-- 4. Rename the new table to documents
-- Runs after the drop so that the name documents is free to reuse.
ALTER TABLE documents_new RENAME TO documents;

-- @migration-step recreate document indexes
-- 5. Rebuild the indexes on documents, since the ones attached to the old
--    table went away when it was dropped. Two single-column indexes plus
--    one composite index over (library_id, version_id).
CREATE INDEX IF NOT EXISTS idx_documents_library_id
  ON documents (library_id);
CREATE INDEX IF NOT EXISTS idx_documents_version_id
  ON documents (version_id);
CREATE INDEX IF NOT EXISTS idx_documents_lib_ver_id
  ON documents (library_id, version_id);

-- @migration-step recreate fts schema
-- 6. Recreate FTS5 virtual table (gets dropped when main table is dropped)
-- Using external content approach - FTS index is maintained entirely through triggers
CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
Expand All @@ -41,6 +46,7 @@ CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
tokenize='porter unicode61'
);

-- @migration-step recreate fts triggers
-- 7. Recreate FTS triggers to maintain the index
-- Note: Triggers work directly with documents table, no JOIN needed for FTS content
CREATE TRIGGER IF NOT EXISTS documents_fts_after_delete AFTER DELETE ON documents BEGIN
Expand All @@ -60,6 +66,7 @@ CREATE TRIGGER IF NOT EXISTS documents_fts_after_insert AFTER INSERT ON document
VALUES(new.id, new.content, json_extract(new.metadata, '$.title'), new.url, json_extract(new.metadata, '$.path'));
END;

-- @migration-step rebuild fts index
-- 8. Rebuild FTS index from existing documents data
-- Manually populate the FTS index since we're using external content approach
INSERT INTO documents_fts(rowid, content, title, url, path)
Expand Down
5 changes: 5 additions & 0 deletions db/migrations/007-dedupe-unversioned-versions.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@
-- across multiple subsequent statements. All TEMP objects are connection-scoped
-- and vanish automatically; safe for repeated runs (we DROP IF EXISTS first).

-- @migration-step collect null-name versions
-- Snapshot every version whose name is NULL, together with the number of
-- documents currently pointing at it (doc_count).
-- Any leftover copy of the table is dropped first so the step can be re-run.
DROP TABLE IF EXISTS temp_null_versions;
CREATE TEMP TABLE temp_null_versions AS
SELECT
  ver.id AS id,
  ver.library_id AS library_id,
  (
    SELECT COUNT(*)
    FROM documents AS doc
    WHERE doc.version_id = ver.id
  ) AS doc_count
FROM versions AS ver
WHERE ver.name IS NULL;

-- @migration-step choose canonical versions
-- Build canonical mapping per library (one row per library_id)
DROP TABLE IF EXISTS temp_canonical_versions;
CREATE TEMP TABLE temp_canonical_versions AS
Expand All @@ -40,6 +42,7 @@ SELECT nv.library_id,
FROM temp_null_versions nv
GROUP BY nv.library_id;

-- @migration-step repoint documents to canonical versions
-- Repoint documents from non-canonical NULL-name versions
UPDATE documents
SET version_id = (
Expand All @@ -49,12 +52,14 @@ SET version_id = (
WHERE version_id IN (SELECT id FROM versions WHERE name IS NULL)
AND version_id NOT IN (SELECT keep_id FROM temp_canonical_versions);

-- @migration-step remove surplus versions
-- 3: Delete surplus NULL-name rows now unreferenced
-- A row is surplus when its name is NULL, its id is not one of the keep_id
-- values in temp_canonical_versions, and no document references it.
-- Both checks use NOT EXISTS. Unlike NOT IN, the keep_id check still matches
-- rows when temp_canonical_versions holds a NULL keep_id (NOT IN would then
-- match nothing and silently leave every surplus row in place). Unlike
-- COUNT(*) = 0, the document check can stop at the first referencing row.
DELETE FROM versions
WHERE name IS NULL
  AND NOT EXISTS (
    SELECT 1
    FROM temp_canonical_versions AS cv
    WHERE cv.keep_id = versions.id
  )
  AND NOT EXISTS (
    SELECT 1
    FROM documents AS d
    WHERE d.version_id = versions.id
  );

-- @migration-step normalize remaining version names
-- 4: Replace every version name that is still NULL with the empty string.
UPDATE versions
SET name = ''
WHERE name IS NULL;

Expand Down
10 changes: 10 additions & 0 deletions db/migrations/009-add-pages-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
-- This migration introduces a pages table to store page-level metadata once per URL
-- and links document chunks to their parent pages via page_id foreign key

-- @migration-step create pages table
-- 1. Create pages table to store unique page-level metadata
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
Expand All @@ -16,11 +17,13 @@ CREATE TABLE IF NOT EXISTS pages (
UNIQUE(version_id, url)
);

-- @migration-step create page indexes
-- 2. Single-column indexes on pages for querying by version_id, url and etag.
CREATE INDEX IF NOT EXISTS idx_pages_version_id
  ON pages (version_id);
CREATE INDEX IF NOT EXISTS idx_pages_url
  ON pages (url);
CREATE INDEX IF NOT EXISTS idx_pages_etag
  ON pages (etag);

-- @migration-step create page-based documents table
-- 3. Create new documents table with page_id foreign key
CREATE TABLE documents_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
Expand All @@ -32,10 +35,12 @@ CREATE TABLE documents_new (
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

-- @migration-step create new document indexes
-- 4. Index documents_new by page_id, and by (page_id, sort_order).
--    The indexes are created on documents_new, which a later step of this
--    migration renames to documents.
CREATE INDEX IF NOT EXISTS idx_documents_page_id
  ON documents_new (page_id);
CREATE INDEX IF NOT EXISTS idx_documents_sort_order
  ON documents_new (page_id, sort_order);

-- @migration-step populate pages
-- 5. Migrate data from old documents table to new structure
-- First, populate pages table with unique page data from existing documents
-- Group by version_id and url to ensure uniqueness, using MAX() to handle any duplicates
Expand All @@ -49,6 +54,7 @@ SELECT
FROM documents
GROUP BY version_id, url;

-- @migration-step migrate document chunks
-- 6. Migrate document chunks to new table structure
-- Preserve all existing metadata except page-level fields (url, title, library, version)
-- that are now stored in pages and versions tables
Expand All @@ -72,12 +78,14 @@ SELECT
FROM documents d
JOIN pages p ON d.version_id = p.version_id AND d.url = p.url;

-- @migration-step replace documents table
-- 7. Drop the old documents table
-- Step 6 above migrated the document chunks into the new table structure.
DROP TABLE documents;

-- 8. Rename the new table to documents
-- Runs after the drop so that the name documents is free to reuse.
ALTER TABLE documents_new RENAME TO documents;

-- @migration-step replace fts schema
-- 9. Recreate FTS5 virtual table to work with new structure
-- Drop existing FTS table and triggers
DROP TRIGGER IF EXISTS documents_fts_after_delete;
Expand All @@ -94,6 +102,7 @@ CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
tokenize='porter unicode61'
);

-- @migration-step recreate fts and page triggers
-- 10. Create new FTS triggers that join with pages table
CREATE TRIGGER IF NOT EXISTS documents_fts_after_delete AFTER DELETE ON documents BEGIN
DELETE FROM documents_fts WHERE rowid = old.id;
Expand All @@ -117,6 +126,7 @@ CREATE TRIGGER IF NOT EXISTS pages_updated_at_trigger AFTER UPDATE ON pages BEGI
UPDATE pages SET updated_at = CURRENT_TIMESTAMP WHERE id = new.id;
END;

-- @migration-step rebuild fts index
-- 12. Rebuild FTS index from migrated data
INSERT INTO documents_fts(rowid, content, title, url, path)
SELECT d.id, d.content, p.title, p.url, json_extract(d.metadata, '$.path')
Expand Down
56 changes: 56 additions & 0 deletions db/migrations/014-rebuild-vector-partition-keys.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
-- Migration: rebuild documents_vec with sqlite-vec partition keys
-- This enables selective KNN queries by library_id and version_id.

-- @migration-step preserve existing vectors
-- Stage every vector that can be carried over into the rebuilt vec table.
-- Each documents_vec row is matched to its document by rowid, and the owning
-- library_id and version_id are resolved through documents -> pages -> versions.
-- Rows that do not resolve through those joins are not staged.
-- The migration runner replaces __DOCUMENTS_VEC_DIMENSION__ with the current
-- documents_vec dimension, so only embeddings of that length are kept and
-- databases already reconciled to a custom embedding dimension keep their
-- existing vector size.
-- The staging table is disk-backed because large vector indexes can exceed
-- memory.
DROP TABLE IF EXISTS _documents_vec_partition_migration;

CREATE TABLE _documents_vec_partition_migration AS
SELECT
  doc.id AS rowid,
  ver.library_id,
  ver.id AS version_id,
  old_vec.embedding
FROM documents_vec AS old_vec
INNER JOIN documents AS doc ON old_vec.rowid = doc.id
INNER JOIN pages AS pg ON doc.page_id = pg.id
INNER JOIN versions AS ver ON pg.version_id = ver.id
WHERE vec_length(old_vec.embedding) = __DOCUMENTS_VEC_DIMENSION__;

-- @migration-step rebuild vector table
-- Drop the existing vec table. The vectors that can be carried over were
-- copied into _documents_vec_partition_migration by the preceding step.
DROP TABLE documents_vec;

-- Recreate documents_vec with library_id and version_id declared as vec0
-- partition keys. The migration runner substitutes
-- __DOCUMENTS_VEC_DIMENSION__ with the current embedding dimension before
-- this statement runs.
CREATE VIRTUAL TABLE documents_vec USING vec0(
library_id INTEGER partition key,
version_id INTEGER partition key,
embedding FLOAT[__DOCUMENTS_VEC_DIMENSION__]
);
Comment thread
arabold marked this conversation as resolved.

-- @migration-step restore existing vectors
-- Load the staged vectors into the rebuilt documents_vec table, keeping the
-- staged rowid (the document id) and its library_id / version_id keys.
INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding)
SELECT
  staged.rowid,
  staged.library_id,
  staged.version_id,
  staged.embedding
FROM _documents_vec_partition_migration AS staged;

-- @migration-step backfill missing vectors
-- Backfill: add a documents_vec row for every document that has a stored
-- embedding of the expected dimension but no documents_vec row yet.
-- The embedding is read from documents.embedding via json_extract, and the
-- library_id / version_id keys are resolved through pages and versions.
INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding)
SELECT
  doc.id,
  ver.library_id,
  ver.id AS version_id,
  json_extract(doc.embedding, '$') AS embedding
FROM documents AS doc
INNER JOIN pages AS pg ON doc.page_id = pg.id
INNER JOIN versions AS ver ON pg.version_id = ver.id
WHERE doc.embedding IS NOT NULL
  AND vec_length(json_extract(doc.embedding, '$')) = __DOCUMENTS_VEC_DIMENSION__
  AND NOT EXISTS (
    SELECT 1
    FROM documents_vec AS present
    WHERE present.rowid = doc.id
  );

-- @migration-step cleanup staging data
-- Remove the staging table now that its rows have been restored into the
-- rebuilt documents_vec table.
DROP TABLE _documents_vec_partition_migration;
20 changes: 20 additions & 0 deletions docs/concepts/data-storage.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,26 @@ Database transactions ensure consistency:
- Batch operations for performance
- Automatic rollback on errors

### Migration Safety

Schema migrations run inside an IMMEDIATE transaction with rollback-capable
SQLite journaling enabled. The migration runner does not use
`journal_mode = OFF` during migration execution because destructive migrations
may need to drop and recreate tables or virtual tables, and rollback must
preserve the pre-migration database if a later step fails.

The runner still applies rollback-safe tuning for large migrations, including
cache, memory mapping, temporary storage, and `synchronous = NORMAL` settings.
After migrations finish, it configures production settings such as WAL mode,
bounded WAL checkpointing, busy timeout, foreign keys, and normal synchronous
durability.

Large or destructive migrations should be validated against a copy of important
local databases before running them against the live store. Use SQLite's backup
API or an application-level export/copy workflow so the copy is consistent, then
run the new version against that copy and verify expected table counts and
search behavior before migrating high-value data.

### Concurrent Access

Safe concurrent database access:
Expand Down
2 changes: 2 additions & 0 deletions openspec/changes/harden-migration-workflow/.openspec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-06-06
Loading
Loading