context_harness_core/
models.rs

//! Core data models used throughout Context Harness.
//!
//! These types represent the documents, chunks, and search results that flow
//! through the ingestion and retrieval pipeline. The data lifecycle is:
//!
//! ```text
//! Connector → SourceItem → normalize() → Document → chunk() → Chunk
//!                                                       ↓
//!                                                  embed() → Embedding
//!                                                       ↓
//!                                                  search() → SearchResult
//! ```
//!
//! # Type Relationships
//!
//! - A **[`SourceItem`]** is produced by a connector (filesystem, Git, S3)
//!   before any normalization or storage.
//! - A **[`Document`]** is the normalized, stored representation with a
//!   deduplication hash and Unix timestamps.
//! - A **[`Chunk`]** is a segment of a document's body, stored alongside
//!   a content hash for embedding staleness detection.
//! - A **[`SearchResult`]** is returned by the query engine with a
//!   relevance score and snippet.
24
25use chrono::{DateTime, Utc};
26
27/// Raw item produced by a connector before normalization.
28///
29/// Connectors (filesystem, Git, S3) emit `SourceItem`s that are then
30/// normalized into [`Document`]s during the ingestion pipeline.
31///
32/// # Fields
33///
34/// | Field | Description |
35/// |-------|-------------|
36/// | `source` | Connector name, e.g. `"filesystem"`, `"git"`, `"s3"` |
37/// | `source_id` | Unique identifier within the source (e.g. relative file path, S3 key) |
38/// | `source_url` | Optional web-browsable URL (e.g. GitHub blob URL, `s3://` URI) |
39/// | `title` | Human-readable title, typically the filename |
40/// | `author` | Author extracted from source metadata (e.g. last Git committer) |
41/// | `created_at` / `updated_at` | Timestamps from the source (commit time, mtime, S3 `LastModified`) |
42/// | `content_type` | MIME type, e.g. `"text/plain"`, `"text/markdown"` |
43/// | `body` | Full text content of the document |
44/// | `metadata_json` | Connector-specific metadata as a JSON string |
45/// | `raw_json` | Optional raw API response for debugging |
46/// | `raw_bytes` | When set, the pipeline runs extraction and sets `body` before upsert; content_type identifies the format |
47#[derive(Debug, Clone)]
48pub struct SourceItem {
49    /// Connector name: `"filesystem"`, `"git"`, or `"s3"`.
50    pub source: String,
51    /// Unique identifier within the source (e.g. relative file path or S3 object key).
52    pub source_id: String,
53    /// Web-browsable URL for the source item, if available.
54    pub source_url: Option<String>,
55    /// Human-readable title (typically the filename).
56    pub title: Option<String>,
57    /// Author extracted from source metadata (e.g. last Git committer).
58    pub author: Option<String>,
59    /// Creation timestamp from the source.
60    pub created_at: DateTime<Utc>,
61    /// Last modification timestamp from the source.
62    pub updated_at: DateTime<Utc>,
63    /// MIME content type (e.g. `"text/plain"`, `"text/markdown"`).
64    pub content_type: String,
65    /// Full text content of the document.
66    pub body: String,
67    /// Connector-specific metadata serialized as JSON.
68    pub metadata_json: String,
69    /// Optional raw API/connector response for debugging.
70    pub raw_json: Option<String>,
71    /// When set, the pipeline runs extraction and sets body from the result before upsert; content_type identifies the format.
72    pub raw_bytes: Option<Vec<u8>>,
73}
74
/// Normalized document stored in the `documents` table.
///
/// Created during ingestion by normalizing a [`SourceItem`]. Each document
/// is uniquely identified by the `(source, source_id)` pair, and carries
/// a `dedup_hash` (SHA-256 of source + source_id + updated_at + body) to
/// detect content changes.
///
/// Timestamps are stored as Unix epoch seconds (i64) for efficient
/// comparison and indexing.
///
/// Equality is field-wise (derived): two documents compare equal only when
/// every field, including `id` and `dedup_hash`, matches.
#[derive(Debug, Clone, PartialEq, Eq)]
// NOTE(review): the reason for this allow is not visible in this file —
// presumably not every field is read by current callers. Confirm, then
// either document the reason here or drop the attribute.
#[allow(dead_code)]
pub struct Document {
    /// UUID v4 primary key.
    pub id: String,
    /// Connector name that produced this document.
    pub source: String,
    /// Unique identifier within the source.
    pub source_id: String,
    /// Web-browsable URL, if available.
    pub source_url: Option<String>,
    /// Human-readable title.
    pub title: Option<String>,
    /// Author from source metadata.
    pub author: Option<String>,
    /// Creation timestamp (Unix epoch seconds).
    pub created_at: i64,
    /// Last modification timestamp (Unix epoch seconds).
    pub updated_at: i64,
    /// MIME content type.
    pub content_type: String,
    /// Full text body.
    pub body: String,
    /// Connector-specific metadata as JSON.
    pub metadata_json: String,
    /// Raw connector response, if one was kept.
    pub raw_json: Option<String>,
    /// SHA-256 hash for deduplication: `H(source || source_id || updated_at || body)`.
    pub dedup_hash: String,
}
114
/// A chunk of a document's body text, stored in the `chunks` table.
///
/// Documents are split into chunks by the [`crate::chunk`] module to enable
/// granular retrieval and embedding. Each chunk has:
///
/// - A deterministic UUID (derived from document_id + chunk_index)
/// - A contiguous `chunk_index` starting at 0
/// - A SHA-256 `hash` of its text content, used by the embedding pipeline
///   to detect when re-embedding is needed (staleness detection)
///
/// Equality is field-wise (derived), so two chunks with the same text but
/// different `id` or `document_id` are not equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// UUID primary key.
    ///
    /// The type-level docs describe this id as deterministic (derived from
    /// `document_id` + `chunk_index`), which does not match the "UUID v4"
    /// (random) label this field previously carried.
    /// NOTE(review): confirm which UUID version the chunker actually emits.
    pub id: String,
    /// Foreign key to the parent [`Document`].
    pub document_id: String,
    /// Zero-based index within the document's chunk sequence.
    pub chunk_index: i64,
    /// Chunk text content.
    pub text: String,
    /// SHA-256 hash of `text`, used for embedding staleness detection.
    pub hash: String,
}
137
/// A search result returned from the query engine.
///
/// Contains the document metadata, a relevance `score` normalized to
/// `[0.0, 1.0]`, and a `snippet` extracted from the best-matching chunk.
///
/// Used internally by the CLI; the HTTP server uses [`crate::search::SearchResultItem`]
/// which has the same shape but derives `Serialize`.
///
/// Equality is field-wise (derived). Because `score` is an `f64`, only
/// `PartialEq` is available: a result whose score is NaN does not compare
/// equal to anything, including itself.
#[derive(Debug, Clone, PartialEq)]
// NOTE(review): the reason for this allow is not visible in this file —
// presumably not every field is read by current callers. Confirm, then
// either document the reason here or drop the attribute.
#[allow(dead_code)]
pub struct SearchResult {
    /// Document UUID.
    pub id: String,
    /// Document title.
    pub title: Option<String>,
    /// Connector name.
    pub source: String,
    /// Source identifier.
    pub source_id: String,
    /// Last modification timestamp (Unix epoch seconds).
    pub updated_at: i64,
    /// Relevance score in `[0.0, 1.0]`.
    pub score: f64,
    /// Text excerpt from the best-matching chunk.
    pub snippet: String,
    /// Web-browsable URL, if available.
    pub source_url: Option<String>,
}