context_harness/migrate.rs
1//! Database schema migrations.
2//!
3//! Creates all required tables and ensures idempotent execution.
4//! Designed to be run via `ctx init`.
5//!
6//! # Schema
7//!
8//! ```text
9//! ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
10//! │ documents │──┐ │ chunks │──┐ │ embeddings │
11//! │ │ │ │ │ │ │ │
12//! │ id (PK) │ │ │ id (PK) │ │ │ chunk_id(PK) │
13//! │ source │ └──│ document_id │ └──│ model │
14//! │ source_id │ │ chunk_index │ │ dims │
15//! │ source_url │ │ text │ │ created_at │
16//! │ title │ │ hash │ │ hash │
17//! │ author │ └──────────────┘ └──────────────┘
18//! │ created_at │
19//! │ updated_at │ ┌──────────────┐ ┌──────────────┐
20//! │ content_type │ │ chunks_fts │ │chunk_vectors │
21//! │ body │ │ (FTS5) │ │ │
22//! │ metadata_json│ │ chunk_id │ │ chunk_id(PK) │
23//! │ raw_json │ │ document_id │ │ document_id │
24//! │ dedup_hash │ │ text │ │ embedding │
25//! └──────────────┘ └──────────────┘ └──────────────┘
26//!
27//! ┌──────────────┐
28//! │ checkpoints │
29//! │ │
30//! │ source (PK) │
31//! │ cursor │
32//! │ updated_at │
33//! └──────────────┘
34//! ```
35//!
36//! # Tables
37//!
38//! | Table | Purpose |
39//! |-------|---------|
40//! | `documents` | Normalized document metadata and body text |
41//! | `chunks` | Text segments with content hashes |
42//! | `checkpoints` | Incremental sync cursors per connector |
43//! | `chunks_fts` | FTS5 full-text index over chunk text (BM25) |
44//! | `embeddings` | Embedding metadata (model, dims, hash) |
45//! | `chunk_vectors` | Embedding vectors stored as BLOBs |
46//!
47//! # Indexes
48//!
49//! - `idx_chunks_document_id` — fast chunk lookup by document
50//! - `idx_documents_source` — fast document filtering by connector
51//! - `idx_documents_updated_at` — efficient date range queries
52//! - `idx_chunk_vectors_document_id` — fast vector lookup by document
53//!
54//! # Idempotency
55//!
56//! All operations use `CREATE TABLE IF NOT EXISTS` or check for existing
57//! objects before creation. Running `ctx init` multiple times is safe.
58
59use anyhow::Result;
60
61use crate::config::Config;
62use crate::db;
63
64/// Run all database migrations.
65///
66/// Creates all tables, indexes, and virtual tables required by Context
67/// Harness. Safe to call multiple times — all operations are idempotent.
68///
69/// # Tables Created
70///
71/// - `documents` — normalized document storage
72/// - `chunks` — text segments with content hashes
73/// - `checkpoints` — incremental sync cursors
74/// - `chunks_fts` — FTS5 full-text search index
75/// - `embeddings` — embedding metadata (model, dims, staleness hash)
76/// - `chunk_vectors` — embedding vector BLOBs
77///
78/// # Errors
79///
80/// Returns an error if the database connection fails or any SQL statement
81/// cannot be executed.
82pub async fn run_migrations(config: &Config) -> Result<()> {
83 let pool = db::connect(config).await?;
84
85 // Create documents table
86 sqlx::query(
87 r#"
88 CREATE TABLE IF NOT EXISTS documents (
89 id TEXT PRIMARY KEY,
90 source TEXT NOT NULL,
91 source_id TEXT NOT NULL,
92 source_url TEXT,
93 title TEXT,
94 author TEXT,
95 created_at INTEGER NOT NULL,
96 updated_at INTEGER NOT NULL,
97 content_type TEXT NOT NULL DEFAULT 'text/plain',
98 body TEXT NOT NULL,
99 metadata_json TEXT NOT NULL DEFAULT '{}',
100 raw_json TEXT,
101 dedup_hash TEXT NOT NULL,
102 UNIQUE(source, source_id)
103 )
104 "#,
105 )
106 .execute(&pool)
107 .await?;
108
109 // Create chunks table
110 sqlx::query(
111 r#"
112 CREATE TABLE IF NOT EXISTS chunks (
113 id TEXT PRIMARY KEY,
114 document_id TEXT NOT NULL,
115 chunk_index INTEGER NOT NULL,
116 text TEXT NOT NULL,
117 hash TEXT NOT NULL,
118 UNIQUE(document_id, chunk_index),
119 FOREIGN KEY (document_id) REFERENCES documents(id)
120 )
121 "#,
122 )
123 .execute(&pool)
124 .await?;
125
126 // Create checkpoints table
127 sqlx::query(
128 r#"
129 CREATE TABLE IF NOT EXISTS checkpoints (
130 source TEXT PRIMARY KEY,
131 cursor TEXT NOT NULL,
132 updated_at INTEGER NOT NULL
133 )
134 "#,
135 )
136 .execute(&pool)
137 .await?;
138
139 // Create FTS5 virtual table over chunks (not idempotent natively, check first)
140 let fts_exists: bool = sqlx::query_scalar(
141 "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name='chunks_fts'",
142 )
143 .fetch_one(&pool)
144 .await?;
145
146 if !fts_exists {
147 sqlx::query(
148 r#"
149 CREATE VIRTUAL TABLE chunks_fts USING fts5(
150 chunk_id UNINDEXED,
151 document_id UNINDEXED,
152 text
153 )
154 "#,
155 )
156 .execute(&pool)
157 .await?;
158 }
159
160 // Embeddings metadata table
161 sqlx::query(
162 r#"
163 CREATE TABLE IF NOT EXISTS embeddings (
164 chunk_id TEXT PRIMARY KEY,
165 model TEXT NOT NULL,
166 dims INTEGER NOT NULL,
167 created_at INTEGER NOT NULL,
168 hash TEXT NOT NULL,
169 FOREIGN KEY (chunk_id) REFERENCES chunks(id)
170 )
171 "#,
172 )
173 .execute(&pool)
174 .await?;
175
176 // Chunk vectors table (stores embedding BLOBs)
177 sqlx::query(
178 r#"
179 CREATE TABLE IF NOT EXISTS chunk_vectors (
180 chunk_id TEXT PRIMARY KEY,
181 document_id TEXT NOT NULL,
182 embedding BLOB NOT NULL,
183 FOREIGN KEY (chunk_id) REFERENCES chunks(id),
184 FOREIGN KEY (document_id) REFERENCES documents(id)
185 )
186 "#,
187 )
188 .execute(&pool)
189 .await?;
190
191 // Create indexes for common query patterns
192 sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)")
193 .execute(&pool)
194 .await?;
195 sqlx::query("CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)")
196 .execute(&pool)
197 .await?;
198 sqlx::query(
199 "CREATE INDEX IF NOT EXISTS idx_documents_updated_at ON documents(updated_at DESC)",
200 )
201 .execute(&pool)
202 .await?;
203 sqlx::query(
204 "CREATE INDEX IF NOT EXISTS idx_chunk_vectors_document_id ON chunk_vectors(document_id)",
205 )
206 .execute(&pool)
207 .await?;
208
209 pool.close().await;
210 Ok(())
211}