// context_harness_core/chunk.rs

1//! Paragraph-boundary text chunker.
2//!
3//! Splits document body text into [`Chunk`]s that respect a configurable
4//! `max_tokens` limit. Splitting occurs on paragraph boundaries (`\n\n`)
5//! to preserve semantic coherence within each chunk.
6//!
7//! Each chunk receives a deterministic UUID derived from its document ID
8//! and index, plus a SHA-256 hash of its text for staleness detection
9//! in the embedding pipeline.
10//!
11//! # Algorithm
12//!
13//! 1. Convert `max_tokens` to `max_chars` using a 4 chars/token ratio.
14//! 2. Split text on `\n\n` paragraph boundaries.
15//! 3. Accumulate paragraphs into a buffer until adding the next paragraph
16//!    would exceed `max_chars`.
17//! 4. When exceeded, flush the buffer as a chunk and start a new one.
18//! 5. If a single paragraph exceeds `max_chars`, perform a hard split at
19//!    the nearest newline or space boundary.
20//! 6. Guarantee at least one chunk per document (even for empty text).
21//!
22//! # Example
23//!
24//! ```rust
25//! use context_harness_core::chunk::chunk_text;
26//!
27//! let chunks = chunk_text("doc-123", "Hello world.\n\nSecond paragraph.", 700);
28//! assert_eq!(chunks.len(), 1);
29//! assert_eq!(chunks[0].chunk_index, 0);
30//! ```
31
32use sha2::{Digest, Sha256};
33use uuid::Uuid;
34
35use crate::models::Chunk;
36
/// Approximate characters-per-token ratio.
///
/// This is a rough heuristic (4 chars ≈ 1 token) used for Phase 1.
/// Note: chunk sizes are compared in *bytes* via `str::len`, so multibyte
/// UTF-8 text yields fewer visible characters per chunk than ASCII.
/// Future versions may use a proper tokenizer.
const CHARS_PER_TOKEN: usize = 4;
42
43/// Split text into chunks on paragraph boundaries, respecting `max_tokens`.
44///
45/// Returns chunks with contiguous indices starting at 0. Each chunk's
46/// `hash` is the SHA-256 of its text content, used for embedding
47/// staleness detection.
48///
49/// # Arguments
50///
51/// * `document_id` — The parent document's UUID (used in chunk metadata).
52/// * `text` — The full document body to chunk.
53/// * `max_tokens` — Maximum tokens per chunk (converted to chars via `× 4`).
54///
55/// # Guarantees
56///
57/// - At least one chunk is always returned (even for empty text).
58/// - Chunk indices are contiguous: `0, 1, 2, …, N-1`.
59/// - Chunks are split on `\n\n` boundaries when possible.
60/// - Oversized paragraphs are hard-split at space/newline boundaries.
61pub fn chunk_text(document_id: &str, text: &str, max_tokens: usize) -> Vec<Chunk> {
62    let max_chars = max_tokens * CHARS_PER_TOKEN;
63
64    if text.is_empty() {
65        return vec![make_chunk(document_id, 0, text)];
66    }
67
68    let paragraphs: Vec<&str> = text.split("\n\n").collect();
69    let mut chunks = Vec::new();
70    let mut current_buf = String::new();
71    let mut chunk_index: i64 = 0;
72
73    for para in paragraphs {
74        let trimmed = para.trim();
75        if trimmed.is_empty() {
76            continue;
77        }
78
79        let would_be = if current_buf.is_empty() {
80            trimmed.len()
81        } else {
82            current_buf.len() + 2 + trimmed.len()
83        };
84
85        if would_be > max_chars && !current_buf.is_empty() {
86            chunks.push(make_chunk(document_id, chunk_index, &current_buf));
87            chunk_index += 1;
88            current_buf.clear();
89        }
90
91        if trimmed.len() > max_chars {
92            if !current_buf.is_empty() {
93                chunks.push(make_chunk(document_id, chunk_index, &current_buf));
94                chunk_index += 1;
95                current_buf.clear();
96            }
97            let mut remaining = trimmed;
98            while !remaining.is_empty() {
99                let split_at = snap_to_char_boundary(remaining, remaining.len().min(max_chars));
100                let split_at = if split_at == 0 && !remaining.is_empty() {
101                    remaining
102                        .char_indices()
103                        .nth(1)
104                        .map(|(i, _)| i)
105                        .unwrap_or(remaining.len())
106                } else {
107                    split_at
108                };
109                let actual_split = if split_at < remaining.len() {
110                    remaining[..split_at]
111                        .rfind('\n')
112                        .or_else(|| remaining[..split_at].rfind(' '))
113                        .map(|pos| pos + 1)
114                        .unwrap_or(split_at)
115                } else {
116                    split_at
117                };
118                let actual_split = snap_to_char_boundary(remaining, actual_split);
119                let actual_split = if actual_split == 0 && !remaining.is_empty() {
120                    remaining
121                        .char_indices()
122                        .nth(1)
123                        .map(|(i, _)| i)
124                        .unwrap_or(remaining.len())
125                } else {
126                    actual_split
127                };
128                let piece = &remaining[..actual_split];
129                if !piece.trim().is_empty() {
130                    chunks.push(make_chunk(document_id, chunk_index, piece.trim()));
131                    chunk_index += 1;
132                }
133                remaining = &remaining[actual_split..];
134            }
135        } else {
136            if !current_buf.is_empty() {
137                current_buf.push_str("\n\n");
138            }
139            current_buf.push_str(trimmed);
140        }
141    }
142
143    if !current_buf.is_empty() {
144        chunks.push(make_chunk(document_id, chunk_index, &current_buf));
145    }
146
147    if chunks.is_empty() {
148        chunks.push(make_chunk(document_id, 0, text.trim()));
149    }
150
151    chunks
152}
153
/// Snap a byte index back to the nearest valid UTF-8 char boundary.
///
/// Returns `index` unchanged when it already lies on a boundary, scans
/// backwards otherwise, and clamps indices past the end to `s.len()`.
fn snap_to_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        s.len()
    } else {
        (0..=index)
            .rev()
            .find(|&candidate| s.is_char_boundary(candidate))
            .expect("index 0 is always a char boundary")
    }
}
165
166/// Create a single [`Chunk`] with a UUID and SHA-256 content hash.
167fn make_chunk(document_id: &str, index: i64, text: &str) -> Chunk {
168    let mut hasher = Sha256::new();
169    hasher.update(text.as_bytes());
170    let hash = format!("{:x}", hasher.finalize());
171
172    Chunk {
173        id: Uuid::new_v4().to_string(),
174        document_id: document_id.to_string(),
175        chunk_index: index,
176        text: text.to_string(),
177        hash,
178    }
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_small_text_single_chunk() {
        let result = chunk_text("doc1", "Hello, world!", 700);
        assert_eq!(result.len(), 1);
        let only = &result[0];
        assert_eq!(only.chunk_index, 0);
        assert_eq!(only.text, "Hello, world!");
    }

    #[test]
    fn test_empty_text() {
        let result = chunk_text("doc1", "", 700);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].chunk_index, 0);
    }

    #[test]
    fn test_multiple_paragraphs_under_limit() {
        let body = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let result = chunk_text("doc1", body, 700);
        assert_eq!(result.len(), 1);
        assert!(result[0].text.contains("First paragraph."));
        assert!(result[0].text.contains("Third paragraph."));
    }

    #[test]
    fn test_multiple_paragraphs_exceed_limit() {
        let body = "This is paragraph one.\n\nThis is paragraph two.\n\nThis is paragraph three.";
        let result = chunk_text("doc1", body, 5);
        assert!(result.len() > 1);
        for (expected, chunk) in result.iter().enumerate() {
            assert_eq!(chunk.chunk_index, expected as i64);
        }
    }

    #[test]
    fn test_chunk_indices_contiguous() {
        let body: String = (0..50)
            .map(|i| format!("Paragraph number {}.", i))
            .collect::<Vec<_>>()
            .join("\n\n");
        let result = chunk_text("doc1", &body, 10);
        for (i, chunk) in result.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i as i64, "Index mismatch at position {}", i);
        }
    }

    #[test]
    fn test_multibyte_utf8_chars() {
        let body = "┌──────────────────┐\n│ Hello world      │\n└──────────────────┘";
        let result = chunk_text("doc1", body, 3);
        assert!(!result.is_empty());
        for chunk in &result {
            assert!(!chunk.text.is_empty() || chunk.chunk_index == 0);
        }
    }

    #[test]
    fn test_deterministic() {
        let body = "Alpha\n\nBeta\n\nGamma\n\nDelta";
        let first = chunk_text("doc1", body, 5);
        let second = chunk_text("doc1", body, 5);
        assert_eq!(first.len(), second.len());
        for (a, b) in first.iter().zip(&second) {
            assert_eq!(a.text, b.text);
            assert_eq!(a.hash, b.hash);
            assert_eq!(a.chunk_index, b.chunk_index);
        }
    }
}