// context_harness_core/chunk.rs
use sha2::{Digest, Sha256};
use uuid::Uuid;

use crate::models::Chunk;

/// Rough heuristic: ~4 characters (bytes) per token.
const CHARS_PER_TOKEN: usize = 4;

43pub fn chunk_text(document_id: &str, text: &str, max_tokens: usize) -> Vec<Chunk> {
62 let max_chars = max_tokens * CHARS_PER_TOKEN;
63
64 if text.is_empty() {
65 return vec![make_chunk(document_id, 0, text)];
66 }
67
68 let paragraphs: Vec<&str> = text.split("\n\n").collect();
69 let mut chunks = Vec::new();
70 let mut current_buf = String::new();
71 let mut chunk_index: i64 = 0;
72
73 for para in paragraphs {
74 let trimmed = para.trim();
75 if trimmed.is_empty() {
76 continue;
77 }
78
79 let would_be = if current_buf.is_empty() {
80 trimmed.len()
81 } else {
82 current_buf.len() + 2 + trimmed.len()
83 };
84
85 if would_be > max_chars && !current_buf.is_empty() {
86 chunks.push(make_chunk(document_id, chunk_index, ¤t_buf));
87 chunk_index += 1;
88 current_buf.clear();
89 }
90
91 if trimmed.len() > max_chars {
92 if !current_buf.is_empty() {
93 chunks.push(make_chunk(document_id, chunk_index, ¤t_buf));
94 chunk_index += 1;
95 current_buf.clear();
96 }
97 let mut remaining = trimmed;
98 while !remaining.is_empty() {
99 let split_at = snap_to_char_boundary(remaining, remaining.len().min(max_chars));
100 let split_at = if split_at == 0 && !remaining.is_empty() {
101 remaining
102 .char_indices()
103 .nth(1)
104 .map(|(i, _)| i)
105 .unwrap_or(remaining.len())
106 } else {
107 split_at
108 };
109 let actual_split = if split_at < remaining.len() {
110 remaining[..split_at]
111 .rfind('\n')
112 .or_else(|| remaining[..split_at].rfind(' '))
113 .map(|pos| pos + 1)
114 .unwrap_or(split_at)
115 } else {
116 split_at
117 };
118 let actual_split = snap_to_char_boundary(remaining, actual_split);
119 let actual_split = if actual_split == 0 && !remaining.is_empty() {
120 remaining
121 .char_indices()
122 .nth(1)
123 .map(|(i, _)| i)
124 .unwrap_or(remaining.len())
125 } else {
126 actual_split
127 };
128 let piece = &remaining[..actual_split];
129 if !piece.trim().is_empty() {
130 chunks.push(make_chunk(document_id, chunk_index, piece.trim()));
131 chunk_index += 1;
132 }
133 remaining = &remaining[actual_split..];
134 }
135 } else {
136 if !current_buf.is_empty() {
137 current_buf.push_str("\n\n");
138 }
139 current_buf.push_str(trimmed);
140 }
141 }
142
143 if !current_buf.is_empty() {
144 chunks.push(make_chunk(document_id, chunk_index, ¤t_buf));
145 }
146
147 if chunks.is_empty() {
148 chunks.push(make_chunk(document_id, 0, text.trim()));
149 }
150
151 chunks
152}
153
/// Return the largest index `<= index` that lies on a UTF-8 char boundary
/// of `s`. Indices at or past the end of `s` clamp to `s.len()`.
///
/// Never panics: index 0 is always a boundary, so the scan always terminates
/// on a valid position.
fn snap_to_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }
    let mut i = index;
    // Walk backwards until we land on a boundary (0 is always one).
    while i > 0 && !s.is_char_boundary(i) {
        i -= 1;
    }
    i
}
165
166fn make_chunk(document_id: &str, index: i64, text: &str) -> Chunk {
168 let mut hasher = Sha256::new();
169 hasher.update(text.as_bytes());
170 let hash = format!("{:x}", hasher.finalize());
171
172 Chunk {
173 id: Uuid::new_v4().to_string(),
174 document_id: document_id.to_string(),
175 chunk_index: index,
176 text: text.to_string(),
177 hash,
178 }
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// A short text well under the limit becomes a single chunk.
    #[test]
    fn test_small_text_single_chunk() {
        let chunks = chunk_text("doc1", "Hello, world!", 700);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[0].text, "Hello, world!");
    }

    /// Empty input still yields exactly one chunk at index 0.
    #[test]
    fn test_empty_text() {
        let chunks = chunk_text("doc1", "", 700);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_index, 0);
    }

    /// Several small paragraphs get packed into one chunk.
    #[test]
    fn test_multiple_paragraphs_under_limit() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunk_text("doc1", text, 700);
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text.contains("First paragraph."));
        assert!(chunks[0].text.contains("Third paragraph."));
    }

    /// A tiny limit forces multiple chunks with contiguous indices.
    #[test]
    fn test_multiple_paragraphs_exceed_limit() {
        let text = "This is paragraph one.\n\nThis is paragraph two.\n\nThis is paragraph three.";
        let chunks = chunk_text("doc1", text, 5);
        assert!(chunks.len() > 1);
        for (i, c) in chunks.iter().enumerate() {
            assert_eq!(c.chunk_index, i as i64);
        }
    }

    /// Indices stay contiguous across many flush/split cycles.
    #[test]
    fn test_chunk_indices_contiguous() {
        let text = (0..50)
            .map(|i| format!("Paragraph number {}.", i))
            .collect::<Vec<_>>()
            .join("\n\n");
        let chunks = chunk_text("doc1", &text, 10);
        for (i, c) in chunks.iter().enumerate() {
            assert_eq!(c.chunk_index, i as i64, "Index mismatch at position {}", i);
        }
    }

    /// Multi-byte box-drawing characters must not cause a mid-char split panic.
    #[test]
    fn test_multibyte_utf8_chars() {
        let text = "┌──────────────────┐\n│ Hello world │\n└──────────────────┘";
        let chunks = chunk_text("doc1", text, 3);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(!c.text.is_empty() || c.chunk_index == 0);
        }
    }

    /// Same input produces the same text, hash, and indices every time
    /// (only the random UUID id differs between runs).
    #[test]
    fn test_deterministic() {
        let text = "Alpha\n\nBeta\n\nGamma\n\nDelta";
        let c1 = chunk_text("doc1", text, 5);
        let c2 = chunk_text("doc1", text, 5);
        assert_eq!(c1.len(), c2.len());
        for (a, b) in c1.iter().zip(c2.iter()) {
            assert_eq!(a.text, b.text);
            assert_eq!(a.hash, b.hash);
            assert_eq!(a.chunk_index, b.chunk_index);
        }
    }
}