context_harness_core/models.rs
1//! Core data models used throughout Context Harness.
2//!
3//! These types represent the documents, chunks, and search results that flow
4//! through the ingestion and retrieval pipeline. The data lifecycle is:
5//!
6//! ```text
//! Connector → SourceItem → normalize() → Document → chunk() → Chunk
//!                                                               ↓
//!                                                   embed() → Embedding
//!                                                               ↓
//!                                                  search() → SearchResult
12//! ```
13//!
14//! # Type Relationships
15//!
16//! - A **[`SourceItem`]** is produced by a connector (filesystem, Git, S3)
17//! before any normalization or storage.
18//! - A **[`Document`]** is the normalized, stored representation with a
19//! deduplication hash and Unix timestamps.
20//! - A **[`Chunk`]** is a segment of a document's body, stored alongside
21//! a content hash for embedding staleness detection.
22//! - A **[`SearchResult`]** is returned by the query engine with a
23//! relevance score and snippet.
24
25use chrono::{DateTime, Utc};
26
27/// Raw item produced by a connector before normalization.
28///
29/// Connectors (filesystem, Git, S3) emit `SourceItem`s that are then
30/// normalized into [`Document`]s during the ingestion pipeline.
31///
32/// # Fields
33///
34/// | Field | Description |
35/// |-------|-------------|
36/// | `source` | Connector name, e.g. `"filesystem"`, `"git"`, `"s3"` |
37/// | `source_id` | Unique identifier within the source (e.g. relative file path, S3 key) |
38/// | `source_url` | Optional web-browsable URL (e.g. GitHub blob URL, `s3://` URI) |
39/// | `title` | Human-readable title, typically the filename |
40/// | `author` | Author extracted from source metadata (e.g. last Git committer) |
41/// | `created_at` / `updated_at` | Timestamps from the source (commit time, mtime, S3 `LastModified`) |
42/// | `content_type` | MIME type, e.g. `"text/plain"`, `"text/markdown"` |
43/// | `body` | Full text content of the document |
44/// | `metadata_json` | Connector-specific metadata as a JSON string |
45/// | `raw_json` | Optional raw API response for debugging |
46/// | `raw_bytes` | When set, the pipeline runs extraction and sets `body` before upsert; content_type identifies the format |
47#[derive(Debug, Clone)]
48pub struct SourceItem {
49 /// Connector name: `"filesystem"`, `"git"`, or `"s3"`.
50 pub source: String,
51 /// Unique identifier within the source (e.g. relative file path or S3 object key).
52 pub source_id: String,
53 /// Web-browsable URL for the source item, if available.
54 pub source_url: Option<String>,
55 /// Human-readable title (typically the filename).
56 pub title: Option<String>,
57 /// Author extracted from source metadata (e.g. last Git committer).
58 pub author: Option<String>,
59 /// Creation timestamp from the source.
60 pub created_at: DateTime<Utc>,
61 /// Last modification timestamp from the source.
62 pub updated_at: DateTime<Utc>,
63 /// MIME content type (e.g. `"text/plain"`, `"text/markdown"`).
64 pub content_type: String,
65 /// Full text content of the document.
66 pub body: String,
67 /// Connector-specific metadata serialized as JSON.
68 pub metadata_json: String,
69 /// Optional raw API/connector response for debugging.
70 pub raw_json: Option<String>,
71 /// When set, the pipeline runs extraction and sets body from the result before upsert; content_type identifies the format.
72 pub raw_bytes: Option<Vec<u8>>,
73}
74
/// A normalized document as persisted in the `documents` table.
///
/// Ingestion builds a `Document` by normalizing a [`SourceItem`]. The
/// `(source, source_id)` pair identifies a document uniquely, while
/// `dedup_hash` — a SHA-256 over source + source_id + updated_at + body —
/// makes content changes detectable.
///
/// Timestamps are kept as Unix epoch seconds (`i64`) so that they are cheap
/// to compare and index.
// NOTE(review): `dead_code` is allowed here, presumably because not every
// field is read by current callers — confirm the allowance is still needed.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct Document {
    /// Primary key: a UUID v4.
    pub id: String,
    /// Name of the connector this document came from.
    pub source: String,
    /// Identifier that is unique within `source`.
    pub source_id: String,
    /// URL at which the document can be viewed in a browser, when one exists.
    pub source_url: Option<String>,
    /// Title meant for humans.
    pub title: Option<String>,
    /// Author taken from the source's metadata.
    pub author: Option<String>,
    /// Creation time, in Unix epoch seconds.
    pub created_at: i64,
    /// Time of last modification, in Unix epoch seconds.
    pub updated_at: i64,
    /// MIME type of the content.
    pub content_type: String,
    /// The complete text body.
    pub body: String,
    /// Metadata specific to the connector, as JSON.
    pub metadata_json: String,
    /// The connector's raw response, if it was kept.
    pub raw_json: Option<String>,
    /// Deduplication hash (SHA-256):
    /// `H(source || source_id || updated_at || body)`.
    pub dedup_hash: String,
}
114
/// One segment of a document's body text, persisted in the `chunks` table.
///
/// The [`crate::chunk`] module splits each document into chunks so that
/// retrieval and embedding can work at a finer granularity than whole
/// documents. Every chunk carries:
///
/// - an `id` that uniquely identifies the chunk (a UUID in string form),
/// - a contiguous `chunk_index` that starts at 0 within its parent document,
/// - a SHA-256 `hash` of its text content, which the embedding pipeline uses
///   to detect when re-embedding is needed (staleness detection).
///
/// Equality (`PartialEq`/`Eq`) is field-wise: two chunks are equal only when
/// every field, including `id`, matches.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    // NOTE(review): this id has been described both as deterministic (derived
    // from document_id + chunk_index) and as a UUID v4. A v4 UUID is random,
    // so both cannot hold — confirm which scheme `crate::chunk` actually uses
    // and state it in the doc comment below.
    /// UUID primary key, stored as a string.
    pub id: String,
    /// Foreign key to the parent [`Document`].
    pub document_id: String,
    /// Zero-based position within the parent document's chunk sequence.
    pub chunk_index: i64,
    /// The chunk's text content.
    pub text: String,
    /// SHA-256 hash of `text`, used for embedding staleness detection.
    pub hash: String,
}
137
/// A single hit produced by the query engine.
///
/// It bundles the matching document's metadata with a relevance `score`
/// normalized to `[0.0, 1.0]` and a `snippet` taken from the best-matching
/// chunk.
///
/// This type is used internally by the CLI. The HTTP server instead uses
/// [`crate::search::SearchResultItem`], which has the same shape but derives
/// `Serialize`.
// NOTE(review): `dead_code` is allowed here, presumably because some fields
// are not read by current callers — confirm the allowance is still needed.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct SearchResult {
    /// UUID of the matching document.
    pub id: String,
    /// Title of the document, when it has one.
    pub title: Option<String>,
    /// Name of the connector the document came from.
    pub source: String,
    /// Identifier of the document within its source.
    pub source_id: String,
    /// Time of last modification, in Unix epoch seconds.
    pub updated_at: i64,
    /// Relevance score, within `[0.0, 1.0]`.
    pub score: f64,
    /// Excerpt of text from the best-matching chunk.
    pub snippet: String,
    /// URL at which the document can be viewed in a browser, when one exists.
    pub source_url: Option<String>,
}