context_harness/
migrate.rs

1//! Database schema migrations.
2//!
3//! Creates all required tables and ensures idempotent execution.
4//! Designed to be run via `ctx init`.
5//!
6//! # Schema
7//!
8//! ```text
9//! ┌──────────────┐     ┌──────────────┐     ┌──────────────┐
10//! │  documents   │──┐  │   chunks     │──┐  │  embeddings  │
11//! │              │  │  │              │  │  │              │
12//! │ id (PK)      │  │  │ id (PK)      │  │  │ chunk_id(PK) │
13//! │ source       │  └──│ document_id  │  └──│ model        │
14//! │ source_id    │     │ chunk_index  │     │ dims         │
15//! │ source_url   │     │ text         │     │ created_at   │
16//! │ title        │     │ hash         │     │ hash         │
17//! │ author       │     └──────────────┘     └──────────────┘
18//! │ created_at   │
19//! │ updated_at   │     ┌──────────────┐     ┌──────────────┐
20//! │ content_type │     │  chunks_fts  │     │chunk_vectors │
21//! │ body         │     │  (FTS5)      │     │              │
22//! │ metadata_json│     │ chunk_id     │     │ chunk_id(PK) │
23//! │ raw_json     │     │ document_id  │     │ document_id  │
24//! │ dedup_hash   │     │ text         │     │ embedding    │
25//! └──────────────┘     └──────────────┘     └──────────────┘
26//!
27//! ┌──────────────┐
28//! │ checkpoints  │
29//! │              │
30//! │ source (PK)  │
31//! │ cursor       │
32//! │ updated_at   │
33//! └──────────────┘
34//! ```
35//!
36//! # Tables
37//!
38//! | Table | Purpose |
39//! |-------|---------|
40//! | `documents` | Normalized document metadata and body text |
41//! | `chunks` | Text segments with content hashes |
42//! | `checkpoints` | Incremental sync cursors per connector |
43//! | `chunks_fts` | FTS5 full-text index over chunk text (BM25) |
44//! | `embeddings` | Embedding metadata (model, dims, hash) |
45//! | `chunk_vectors` | Embedding vectors stored as BLOBs |
46//!
47//! # Indexes
48//!
49//! - `idx_chunks_document_id` — fast chunk lookup by document
50//! - `idx_documents_source` — fast document filtering by connector
51//! - `idx_documents_updated_at` — efficient date range queries
52//! - `idx_chunk_vectors_document_id` — fast vector lookup by document
53//!
54//! # Idempotency
55//!
56//! All operations use `CREATE TABLE IF NOT EXISTS` or check for existing
57//! objects before creation. Running `ctx init` multiple times is safe.
58
59use anyhow::Result;
60
61use crate::config::Config;
62use crate::db;
63
64/// Run all database migrations.
65///
66/// Creates all tables, indexes, and virtual tables required by Context
67/// Harness. Safe to call multiple times — all operations are idempotent.
68///
69/// # Tables Created
70///
71/// - `documents` — normalized document storage
72/// - `chunks` — text segments with content hashes
73/// - `checkpoints` — incremental sync cursors
74/// - `chunks_fts` — FTS5 full-text search index
75/// - `embeddings` — embedding metadata (model, dims, staleness hash)
76/// - `chunk_vectors` — embedding vector BLOBs
77///
78/// # Errors
79///
80/// Returns an error if the database connection fails or any SQL statement
81/// cannot be executed.
82pub async fn run_migrations(config: &Config) -> Result<()> {
83    let pool = db::connect(config).await?;
84
85    // Create documents table
86    sqlx::query(
87        r#"
88        CREATE TABLE IF NOT EXISTS documents (
89            id TEXT PRIMARY KEY,
90            source TEXT NOT NULL,
91            source_id TEXT NOT NULL,
92            source_url TEXT,
93            title TEXT,
94            author TEXT,
95            created_at INTEGER NOT NULL,
96            updated_at INTEGER NOT NULL,
97            content_type TEXT NOT NULL DEFAULT 'text/plain',
98            body TEXT NOT NULL,
99            metadata_json TEXT NOT NULL DEFAULT '{}',
100            raw_json TEXT,
101            dedup_hash TEXT NOT NULL,
102            UNIQUE(source, source_id)
103        )
104        "#,
105    )
106    .execute(&pool)
107    .await?;
108
109    // Create chunks table
110    sqlx::query(
111        r#"
112        CREATE TABLE IF NOT EXISTS chunks (
113            id TEXT PRIMARY KEY,
114            document_id TEXT NOT NULL,
115            chunk_index INTEGER NOT NULL,
116            text TEXT NOT NULL,
117            hash TEXT NOT NULL,
118            UNIQUE(document_id, chunk_index),
119            FOREIGN KEY (document_id) REFERENCES documents(id)
120        )
121        "#,
122    )
123    .execute(&pool)
124    .await?;
125
126    // Create checkpoints table
127    sqlx::query(
128        r#"
129        CREATE TABLE IF NOT EXISTS checkpoints (
130            source TEXT PRIMARY KEY,
131            cursor TEXT NOT NULL,
132            updated_at INTEGER NOT NULL
133        )
134        "#,
135    )
136    .execute(&pool)
137    .await?;
138
139    // Create FTS5 virtual table over chunks (not idempotent natively, check first)
140    let fts_exists: bool = sqlx::query_scalar(
141        "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name='chunks_fts'",
142    )
143    .fetch_one(&pool)
144    .await?;
145
146    if !fts_exists {
147        sqlx::query(
148            r#"
149            CREATE VIRTUAL TABLE chunks_fts USING fts5(
150                chunk_id UNINDEXED,
151                document_id UNINDEXED,
152                text
153            )
154            "#,
155        )
156        .execute(&pool)
157        .await?;
158    }
159
160    // Embeddings metadata table
161    sqlx::query(
162        r#"
163        CREATE TABLE IF NOT EXISTS embeddings (
164            chunk_id TEXT PRIMARY KEY,
165            model TEXT NOT NULL,
166            dims INTEGER NOT NULL,
167            created_at INTEGER NOT NULL,
168            hash TEXT NOT NULL,
169            FOREIGN KEY (chunk_id) REFERENCES chunks(id)
170        )
171        "#,
172    )
173    .execute(&pool)
174    .await?;
175
176    // Chunk vectors table (stores embedding BLOBs)
177    sqlx::query(
178        r#"
179        CREATE TABLE IF NOT EXISTS chunk_vectors (
180            chunk_id TEXT PRIMARY KEY,
181            document_id TEXT NOT NULL,
182            embedding BLOB NOT NULL,
183            FOREIGN KEY (chunk_id) REFERENCES chunks(id),
184            FOREIGN KEY (document_id) REFERENCES documents(id)
185        )
186        "#,
187    )
188    .execute(&pool)
189    .await?;
190
191    // Create indexes for common query patterns
192    sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)")
193        .execute(&pool)
194        .await?;
195    sqlx::query("CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)")
196        .execute(&pool)
197        .await?;
198    sqlx::query(
199        "CREATE INDEX IF NOT EXISTS idx_documents_updated_at ON documents(updated_at DESC)",
200    )
201    .execute(&pool)
202    .await?;
203    sqlx::query(
204        "CREATE INDEX IF NOT EXISTS idx_chunk_vectors_document_id ON chunk_vectors(document_id)",
205    )
206    .execute(&pool)
207    .await?;
208
209    pool.close().await;
210    Ok(())
211}