context_harness/
extract.rs1use std::io::Read;
7
/// Content type that [`extract_text`] routes to the PDF extractor.
pub const MIME_PDF: &str = "application/pdf";
/// Content type that [`extract_text`] routes to the DOCX (Word) extractor.
pub const MIME_DOCX: &str =
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
/// Content type that [`extract_text`] routes to the PPTX (PowerPoint) extractor.
pub const MIME_PPTX: &str =
    "application/vnd.openxmlformats-officedocument.presentationml.presentation";
/// Content type that [`extract_text`] routes to the XLSX (Excel) extractor.
pub const MIME_XLSX: &str =
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

/// Upper bound on the number of worksheets read from a single workbook.
const XLSX_MAX_SHEETS: usize = 100;
/// Upper bound on the number of `<c>` cell elements scanned per worksheet.
const XLSX_MAX_CELLS_PER_SHEET: usize = 100_000;
/// Upper bound, in bytes, on the decompressed size of any XML entry read from an archive.
const MAX_XML_ENTRY_BYTES: u64 = 50 * 1024 * 1024;
22
/// Failure modes of [`extract_text`].
#[derive(Debug)]
pub enum ExtractError {
    /// The supplied content type is not one of the supported MIME constants;
    /// the payload is the content type exactly as it was passed in.
    UnsupportedContentType(String),
    /// PDF extraction failed; the payload is the underlying error rendered as text.
    Pdf(String),
    /// Reading or parsing an OOXML (DOCX/PPTX/XLSX) archive failed; the payload is a
    /// human-readable description of the cause.
    Ooxml(String),
}

impl std::fmt::Display for ExtractError {
    /// Renders a short, variant-specific prefix followed by the carried detail string.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnsupportedContentType(mime) => {
                f.write_fmt(format_args!("unsupported content-type: {}", mime))
            }
            Self::Pdf(reason) => f.write_fmt(format_args!("PDF extraction failed: {}", reason)),
            Self::Ooxml(reason) => {
                f.write_fmt(format_args!("OOXML extraction failed: {}", reason))
            }
        }
    }
}

// No `source()` override: every variant stores its cause as an already-formatted string.
impl std::error::Error for ExtractError {}
44
45pub fn extract_text(bytes: &[u8], content_type: &str) -> Result<String, ExtractError> {
47 match content_type {
48 MIME_PDF => extract_pdf(bytes),
49 MIME_DOCX => extract_docx(bytes),
50 MIME_PPTX => extract_pptx(bytes),
51 MIME_XLSX => extract_xlsx(bytes),
52 _ => Err(ExtractError::UnsupportedContentType(
53 content_type.to_string(),
54 )),
55 }
56}
57
58fn extract_pdf(bytes: &[u8]) -> Result<String, ExtractError> {
59 pdf_extract::extract_text_from_mem(bytes).map_err(|e| ExtractError::Pdf(e.to_string()))
60}
61
62fn read_zip_entry_bounded(
63 archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
64 name: &str,
65 max_bytes: u64,
66) -> Result<Vec<u8>, ExtractError> {
67 let entry = archive
68 .by_name(name)
69 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
70 let mut out = Vec::new();
71 entry
72 .take(max_bytes)
73 .read_to_end(&mut out)
74 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
75 if out.len() as u64 >= max_bytes {
76 return Err(ExtractError::Ooxml(format!(
77 "ZIP entry {} exceeds size limit ({} bytes)",
78 name, max_bytes
79 )));
80 }
81 Ok(out)
82}
83
84fn extract_docx(bytes: &[u8]) -> Result<String, ExtractError> {
85 let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes))
86 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
87 let mut doc_xml = Vec::new();
88 let mut found = false;
89 for i in 0..archive.len() {
90 let entry = archive
91 .by_index(i)
92 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
93 if entry.name() == "word/document.xml" {
94 entry
95 .take(MAX_XML_ENTRY_BYTES)
96 .read_to_end(&mut doc_xml)
97 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
98 if doc_xml.len() as u64 >= MAX_XML_ENTRY_BYTES {
99 return Err(ExtractError::Ooxml(
100 "word/document.xml exceeds size limit".to_string(),
101 ));
102 }
103 found = true;
104 break;
105 }
106 }
107 if !found {
108 return Err(ExtractError::Ooxml(
109 "word/document.xml not found".to_string(),
110 ));
111 }
112 extract_w_t_elements(&doc_xml)
113}
114
115fn extract_w_t_elements(xml: &[u8]) -> Result<String, ExtractError> {
116 let mut out = String::new();
117 let mut reader = quick_xml::Reader::from_reader(xml);
118 reader.config_mut().trim_text(true);
119 let mut buf = Vec::new();
120 loop {
121 match reader.read_event_into(&mut buf) {
122 Ok(quick_xml::events::Event::Start(e)) => {
123 let name = e.local_name();
124 if name.as_ref() == b"t" {
125 if let Ok(quick_xml::events::Event::Text(te)) = reader.read_event_into(&mut buf)
126 {
127 out.push_str(te.unescape().unwrap_or_default().as_ref());
128 }
129 }
130 }
131 Ok(quick_xml::events::Event::Empty(e)) => {
132 if e.local_name().as_ref() == b"t" {
133 }
135 }
136 Ok(quick_xml::events::Event::Eof) => break,
137 Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
138 _ => {}
139 }
140 buf.clear();
141 }
142 Ok(out)
143}
144
145fn extract_pptx(bytes: &[u8]) -> Result<String, ExtractError> {
146 let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes))
147 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
148 let mut slide_names: Vec<String> = archive
149 .file_names()
150 .filter(|n| n.starts_with("ppt/slides/slide") && n.ends_with(".xml"))
151 .map(|s| s.to_string())
152 .collect();
153 slide_names.sort_by_key(|name| {
154 name.trim_start_matches("ppt/slides/slide")
155 .trim_end_matches(".xml")
156 .parse::<u32>()
157 .unwrap_or(u32::MAX)
158 });
159 let mut out = String::new();
160 for name in slide_names {
161 let xml = read_zip_entry_bounded(&mut archive, &name, MAX_XML_ENTRY_BYTES)?;
162 let text = extract_a_t_elements(&xml)?;
163 if !out.is_empty() && !text.is_empty() {
164 out.push(' ');
165 }
166 out.push_str(&text);
167 }
168 Ok(out)
169}
170
171fn extract_a_t_elements(xml: &[u8]) -> Result<String, ExtractError> {
172 let mut out = String::new();
173 let mut reader = quick_xml::Reader::from_reader(xml);
174 reader.config_mut().trim_text(true);
175 let mut buf = Vec::new();
176 loop {
177 match reader.read_event_into(&mut buf) {
178 Ok(quick_xml::events::Event::Start(e)) => {
179 if e.local_name().as_ref() == b"t" {
180 if let Ok(quick_xml::events::Event::Text(te)) = reader.read_event_into(&mut buf)
181 {
182 out.push_str(te.unescape().unwrap_or_default().as_ref());
183 }
184 }
185 }
186 Ok(quick_xml::events::Event::Eof) => break,
187 Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
188 _ => {}
189 }
190 buf.clear();
191 }
192 Ok(out)
193}
194
195fn extract_xlsx(bytes: &[u8]) -> Result<String, ExtractError> {
196 let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes))
197 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
198 let shared_strings = read_shared_strings(&mut archive)?;
199 let sheet_names = list_worksheet_names(&mut archive)?;
200 let mut out = String::new();
201 for (idx, name) in sheet_names.into_iter().take(XLSX_MAX_SHEETS).enumerate() {
202 let sheet_xml = read_zip_entry_bounded(&mut archive, &name, MAX_XML_ENTRY_BYTES)?;
203 let cell_texts = extract_xlsx_sheet_cells(&sheet_xml, &shared_strings)?;
204 if idx > 0 && !out.is_empty() {
205 out.push(' ');
206 }
207 out.push_str(&cell_texts);
208 }
209 Ok(out)
210}
211
212fn read_shared_strings(
213 archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
214) -> Result<Vec<String>, ExtractError> {
215 let xml = match archive.by_name("xl/sharedStrings.xml") {
217 Ok(entry) => {
218 let mut buf = Vec::new();
219 entry
220 .take(MAX_XML_ENTRY_BYTES)
221 .read_to_end(&mut buf)
222 .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
223 if buf.len() as u64 >= MAX_XML_ENTRY_BYTES {
224 return Err(ExtractError::Ooxml(
225 "xl/sharedStrings.xml exceeds size limit".to_string(),
226 ));
227 }
228 buf
229 }
230 Err(e) => {
231 if matches!(e, zip::result::ZipError::FileNotFound) {
232 return Ok(Vec::new());
233 }
234 return Err(ExtractError::Ooxml(e.to_string()));
235 }
236 };
237 let mut strings = Vec::new();
238 let mut reader = quick_xml::Reader::from_reader(xml.as_slice());
239 reader.config_mut().trim_text(true);
240 let mut buf = Vec::new();
241 let mut in_si = false;
242 loop {
243 match reader.read_event_into(&mut buf) {
244 Ok(quick_xml::events::Event::Start(e)) => {
245 if e.local_name().as_ref() == b"si" {
246 in_si = true;
247 } else if in_si && e.local_name().as_ref() == b"t" {
248 if let Ok(quick_xml::events::Event::Text(te)) = reader.read_event_into(&mut buf)
249 {
250 strings.push(te.unescape().unwrap_or_default().into_owned());
251 }
252 }
253 }
254 Ok(quick_xml::events::Event::End(e)) => {
255 if e.local_name().as_ref() == b"si" {
256 in_si = false;
257 }
258 }
259 Ok(quick_xml::events::Event::Eof) => break,
260 Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
261 _ => {}
262 }
263 buf.clear();
264 }
265 Ok(strings)
266}
267
268fn list_worksheet_names(
269 archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
270) -> Result<Vec<String>, ExtractError> {
271 let mut names: Vec<String> = archive
272 .file_names()
273 .filter(|n| n.starts_with("xl/worksheets/sheet") && n.ends_with(".xml"))
274 .map(|s| s.to_string())
275 .collect();
276 names.sort_by_key(|name| {
277 name.trim_start_matches("xl/worksheets/sheet")
278 .trim_end_matches(".xml")
279 .parse::<u32>()
280 .unwrap_or(u32::MAX)
281 });
282 Ok(names)
283}
284
285fn extract_xlsx_sheet_cells(xml: &[u8], shared_strings: &[String]) -> Result<String, ExtractError> {
287 let mut cells: Vec<String> = Vec::new();
288 let mut reader = quick_xml::Reader::from_reader(xml);
289 reader.config_mut().trim_text(true);
290 let mut buf = Vec::new();
291 let mut in_v = false;
292 let mut cell_is_shared_str = false;
293 let mut cell_count = 0usize;
294 loop {
295 if cell_count >= XLSX_MAX_CELLS_PER_SHEET {
296 break;
297 }
298 match reader.read_event_into(&mut buf) {
299 Ok(quick_xml::events::Event::Start(e)) => {
300 if e.local_name().as_ref() == b"c" {
301 cell_count += 1;
302 cell_is_shared_str = e.attributes().any(|a| {
303 a.as_ref()
304 .map(|a| a.key.as_ref() == b"t" && a.value.as_ref() == b"s")
305 .unwrap_or(false)
306 });
307 } else if e.local_name().as_ref() == b"v" {
308 in_v = true;
309 }
310 }
311 Ok(quick_xml::events::Event::Text(te)) if in_v => {
312 let v = te.unescape().unwrap_or_default();
313 let s = v.trim();
314 if !s.is_empty() && cell_is_shared_str {
315 if let Ok(i) = s.parse::<usize>() {
316 if i < shared_strings.len() {
317 cells.push(shared_strings[i].clone());
318 }
319 }
320 }
321 in_v = false;
322 }
323 Ok(quick_xml::events::Event::End(e)) => {
324 if e.local_name().as_ref() == b"v" {
325 in_v = false;
326 } else if e.local_name().as_ref() == b"c" {
327 cell_is_shared_str = false;
328 }
329 }
330 Ok(quick_xml::events::Event::Eof) => break,
331 Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
332 _ => {}
333 }
334 buf.clear();
335 }
336 Ok(cells.join(" "))
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    /// A content type outside the supported set is rejected before any parsing.
    #[test]
    fn unsupported_content_type_returns_error() {
        let result = extract_text(b"foo", "application/octet-stream");
        assert!(matches!(
            result,
            Err(ExtractError::UnsupportedContentType(_))
        ));
    }

    /// Bytes that are not a PDF surface as a `Pdf` error.
    #[test]
    fn invalid_pdf_returns_error() {
        let result = extract_text(b"not a pdf", MIME_PDF);
        assert!(matches!(result, Err(ExtractError::Pdf(_))));
    }

    /// Bytes that are not a ZIP archive surface as an `Ooxml` error for DOCX input.
    #[test]
    fn invalid_zip_returns_error_for_docx() {
        let result = extract_text(b"not a zip", MIME_DOCX);
        assert!(matches!(result, Err(ExtractError::Ooxml(_))));
    }
}