context_harness/
extract.rs

1//! Multi-format text extraction for binary documents (PDF, OOXML).
2//!
3//! Conforms to [FILE_SUPPORT.md](../docs/FILE_SUPPORT.md). Extraction is pipeline-layer:
4//! connectors supply bytes + content-type; this module returns plain UTF-8 text.
5
6use std::io::Read;
7
/// Supported MIME types for extraction (spec §1.1).
///
/// MIME type for PDF documents; dispatched to the PDF extractor by `extract_text`.
pub const MIME_PDF: &str = "application/pdf";
/// MIME type for WordprocessingML (`.docx`) documents.
pub const MIME_DOCX: &str =
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
/// MIME type for PresentationML (`.pptx`) presentations.
pub const MIME_PPTX: &str =
    "application/vnd.openxmlformats-officedocument.presentationml.presentation";
/// MIME type for SpreadsheetML (`.xlsx`) workbooks.
pub const MIME_XLSX: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
15
/// Maximum sheets to process in an xlsx (spec §5.2: implementation MAY limit).
/// Worksheets beyond this count are not read.
const XLSX_MAX_SHEETS: usize = 100;
/// Maximum cells to process per sheet (avoids unbounded memory).
/// Parsing of a sheet stops once this many cells have been seen.
const XLSX_MAX_CELLS_PER_SHEET: usize = 100_000;
/// Maximum decompressed bytes to read from a single ZIP entry (zip-bomb protection).
/// The value is 50 MiB; an entry that reaches this cap is rejected with an error.
const MAX_XML_ENTRY_BYTES: u64 = 50 * 1024 * 1024;
22
/// Failure modes of text extraction (spec §5.1: extraction never panics;
/// it reports an error so the pipeline can skip the item).
#[derive(Debug)]
pub enum ExtractError {
    /// The supplied content-type is not one this module can extract; carries
    /// the content-type string as given by the caller.
    UnsupportedContentType(String),
    /// PDF parsing or text extraction failed; carries the underlying message.
    Pdf(String),
    /// ZIP or XML processing of an OOXML document failed; carries the
    /// underlying message.
    Ooxml(String),
}

impl std::fmt::Display for ExtractError {
    /// Renders a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnsupportedContentType(mime) => {
                write!(f, "unsupported content-type: {}", mime)
            }
            Self::Pdf(reason) => write!(f, "PDF extraction failed: {}", reason),
            Self::Ooxml(reason) => write!(f, "OOXML extraction failed: {}", reason),
        }
    }
}

impl std::error::Error for ExtractError {}
44
45/// Extracts plain text from binary content. Returns UTF-8 string or error (spec §5, §6).
46pub fn extract_text(bytes: &[u8], content_type: &str) -> Result<String, ExtractError> {
47    match content_type {
48        MIME_PDF => extract_pdf(bytes),
49        MIME_DOCX => extract_docx(bytes),
50        MIME_PPTX => extract_pptx(bytes),
51        MIME_XLSX => extract_xlsx(bytes),
52        _ => Err(ExtractError::UnsupportedContentType(
53            content_type.to_string(),
54        )),
55    }
56}
57
58fn extract_pdf(bytes: &[u8]) -> Result<String, ExtractError> {
59    pdf_extract::extract_text_from_mem(bytes).map_err(|e| ExtractError::Pdf(e.to_string()))
60}
61
62fn read_zip_entry_bounded(
63    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
64    name: &str,
65    max_bytes: u64,
66) -> Result<Vec<u8>, ExtractError> {
67    let entry = archive
68        .by_name(name)
69        .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
70    let mut out = Vec::new();
71    entry
72        .take(max_bytes)
73        .read_to_end(&mut out)
74        .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
75    if out.len() as u64 >= max_bytes {
76        return Err(ExtractError::Ooxml(format!(
77            "ZIP entry {} exceeds size limit ({} bytes)",
78            name, max_bytes
79        )));
80    }
81    Ok(out)
82}
83
84fn extract_docx(bytes: &[u8]) -> Result<String, ExtractError> {
85    let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes))
86        .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
87    let mut doc_xml = Vec::new();
88    let mut found = false;
89    for i in 0..archive.len() {
90        let entry = archive
91            .by_index(i)
92            .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
93        if entry.name() == "word/document.xml" {
94            entry
95                .take(MAX_XML_ENTRY_BYTES)
96                .read_to_end(&mut doc_xml)
97                .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
98            if doc_xml.len() as u64 >= MAX_XML_ENTRY_BYTES {
99                return Err(ExtractError::Ooxml(
100                    "word/document.xml exceeds size limit".to_string(),
101                ));
102            }
103            found = true;
104            break;
105        }
106    }
107    if !found {
108        return Err(ExtractError::Ooxml(
109            "word/document.xml not found".to_string(),
110        ));
111    }
112    extract_w_t_elements(&doc_xml)
113}
114
115fn extract_w_t_elements(xml: &[u8]) -> Result<String, ExtractError> {
116    let mut out = String::new();
117    let mut reader = quick_xml::Reader::from_reader(xml);
118    reader.config_mut().trim_text(true);
119    let mut buf = Vec::new();
120    loop {
121        match reader.read_event_into(&mut buf) {
122            Ok(quick_xml::events::Event::Start(e)) => {
123                let name = e.local_name();
124                if name.as_ref() == b"t" {
125                    if let Ok(quick_xml::events::Event::Text(te)) = reader.read_event_into(&mut buf)
126                    {
127                        out.push_str(te.unescape().unwrap_or_default().as_ref());
128                    }
129                }
130            }
131            Ok(quick_xml::events::Event::Empty(e)) => {
132                if e.local_name().as_ref() == b"t" {
133                    // empty t, nothing to add
134                }
135            }
136            Ok(quick_xml::events::Event::Eof) => break,
137            Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
138            _ => {}
139        }
140        buf.clear();
141    }
142    Ok(out)
143}
144
145fn extract_pptx(bytes: &[u8]) -> Result<String, ExtractError> {
146    let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes))
147        .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
148    let mut slide_names: Vec<String> = archive
149        .file_names()
150        .filter(|n| n.starts_with("ppt/slides/slide") && n.ends_with(".xml"))
151        .map(|s| s.to_string())
152        .collect();
153    slide_names.sort_by_key(|name| {
154        name.trim_start_matches("ppt/slides/slide")
155            .trim_end_matches(".xml")
156            .parse::<u32>()
157            .unwrap_or(u32::MAX)
158    });
159    let mut out = String::new();
160    for name in slide_names {
161        let xml = read_zip_entry_bounded(&mut archive, &name, MAX_XML_ENTRY_BYTES)?;
162        let text = extract_a_t_elements(&xml)?;
163        if !out.is_empty() && !text.is_empty() {
164            out.push(' ');
165        }
166        out.push_str(&text);
167    }
168    Ok(out)
169}
170
171fn extract_a_t_elements(xml: &[u8]) -> Result<String, ExtractError> {
172    let mut out = String::new();
173    let mut reader = quick_xml::Reader::from_reader(xml);
174    reader.config_mut().trim_text(true);
175    let mut buf = Vec::new();
176    loop {
177        match reader.read_event_into(&mut buf) {
178            Ok(quick_xml::events::Event::Start(e)) => {
179                if e.local_name().as_ref() == b"t" {
180                    if let Ok(quick_xml::events::Event::Text(te)) = reader.read_event_into(&mut buf)
181                    {
182                        out.push_str(te.unescape().unwrap_or_default().as_ref());
183                    }
184                }
185            }
186            Ok(quick_xml::events::Event::Eof) => break,
187            Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
188            _ => {}
189        }
190        buf.clear();
191    }
192    Ok(out)
193}
194
195fn extract_xlsx(bytes: &[u8]) -> Result<String, ExtractError> {
196    let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes))
197        .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
198    let shared_strings = read_shared_strings(&mut archive)?;
199    let sheet_names = list_worksheet_names(&mut archive)?;
200    let mut out = String::new();
201    for (idx, name) in sheet_names.into_iter().take(XLSX_MAX_SHEETS).enumerate() {
202        let sheet_xml = read_zip_entry_bounded(&mut archive, &name, MAX_XML_ENTRY_BYTES)?;
203        let cell_texts = extract_xlsx_sheet_cells(&sheet_xml, &shared_strings)?;
204        if idx > 0 && !out.is_empty() {
205            out.push(' ');
206        }
207        out.push_str(&cell_texts);
208    }
209    Ok(out)
210}
211
212fn read_shared_strings(
213    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
214) -> Result<Vec<String>, ExtractError> {
215    // xl/sharedStrings.xml is optional in valid .xlsx (e.g. no shared strings or only inline/numeric cells).
216    let xml = match archive.by_name("xl/sharedStrings.xml") {
217        Ok(entry) => {
218            let mut buf = Vec::new();
219            entry
220                .take(MAX_XML_ENTRY_BYTES)
221                .read_to_end(&mut buf)
222                .map_err(|e| ExtractError::Ooxml(e.to_string()))?;
223            if buf.len() as u64 >= MAX_XML_ENTRY_BYTES {
224                return Err(ExtractError::Ooxml(
225                    "xl/sharedStrings.xml exceeds size limit".to_string(),
226                ));
227            }
228            buf
229        }
230        Err(e) => {
231            if matches!(e, zip::result::ZipError::FileNotFound) {
232                return Ok(Vec::new());
233            }
234            return Err(ExtractError::Ooxml(e.to_string()));
235        }
236    };
237    let mut strings = Vec::new();
238    let mut reader = quick_xml::Reader::from_reader(xml.as_slice());
239    reader.config_mut().trim_text(true);
240    let mut buf = Vec::new();
241    let mut in_si = false;
242    loop {
243        match reader.read_event_into(&mut buf) {
244            Ok(quick_xml::events::Event::Start(e)) => {
245                if e.local_name().as_ref() == b"si" {
246                    in_si = true;
247                } else if in_si && e.local_name().as_ref() == b"t" {
248                    if let Ok(quick_xml::events::Event::Text(te)) = reader.read_event_into(&mut buf)
249                    {
250                        strings.push(te.unescape().unwrap_or_default().into_owned());
251                    }
252                }
253            }
254            Ok(quick_xml::events::Event::End(e)) => {
255                if e.local_name().as_ref() == b"si" {
256                    in_si = false;
257                }
258            }
259            Ok(quick_xml::events::Event::Eof) => break,
260            Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
261            _ => {}
262        }
263        buf.clear();
264    }
265    Ok(strings)
266}
267
268fn list_worksheet_names(
269    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
270) -> Result<Vec<String>, ExtractError> {
271    let mut names: Vec<String> = archive
272        .file_names()
273        .filter(|n| n.starts_with("xl/worksheets/sheet") && n.ends_with(".xml"))
274        .map(|s| s.to_string())
275        .collect();
276    names.sort_by_key(|name| {
277        name.trim_start_matches("xl/worksheets/sheet")
278            .trim_end_matches(".xml")
279            .parse::<u32>()
280            .unwrap_or(u32::MAX)
281    });
282    Ok(names)
283}
284
285/// Limit bounds parsing work (cells considered), not only text cells emitted.
286fn extract_xlsx_sheet_cells(xml: &[u8], shared_strings: &[String]) -> Result<String, ExtractError> {
287    let mut cells: Vec<String> = Vec::new();
288    let mut reader = quick_xml::Reader::from_reader(xml);
289    reader.config_mut().trim_text(true);
290    let mut buf = Vec::new();
291    let mut in_v = false;
292    let mut cell_is_shared_str = false;
293    let mut cell_count = 0usize;
294    loop {
295        if cell_count >= XLSX_MAX_CELLS_PER_SHEET {
296            break;
297        }
298        match reader.read_event_into(&mut buf) {
299            Ok(quick_xml::events::Event::Start(e)) => {
300                if e.local_name().as_ref() == b"c" {
301                    cell_count += 1;
302                    cell_is_shared_str = e.attributes().any(|a| {
303                        a.as_ref()
304                            .map(|a| a.key.as_ref() == b"t" && a.value.as_ref() == b"s")
305                            .unwrap_or(false)
306                    });
307                } else if e.local_name().as_ref() == b"v" {
308                    in_v = true;
309                }
310            }
311            Ok(quick_xml::events::Event::Text(te)) if in_v => {
312                let v = te.unescape().unwrap_or_default();
313                let s = v.trim();
314                if !s.is_empty() && cell_is_shared_str {
315                    if let Ok(i) = s.parse::<usize>() {
316                        if i < shared_strings.len() {
317                            cells.push(shared_strings[i].clone());
318                        }
319                    }
320                }
321                in_v = false;
322            }
323            Ok(quick_xml::events::Event::End(e)) => {
324                if e.local_name().as_ref() == b"v" {
325                    in_v = false;
326                } else if e.local_name().as_ref() == b"c" {
327                    cell_is_shared_str = false;
328                }
329            }
330            Ok(quick_xml::events::Event::Eof) => break,
331            Err(e) => return Err(ExtractError::Ooxml(e.to_string())),
332            _ => {}
333        }
334        buf.clear();
335    }
336    Ok(cells.join(" "))
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    /// A content-type outside the supported set is rejected up front.
    #[test]
    fn unsupported_content_type_returns_error() {
        let outcome = extract_text(b"foo", "application/octet-stream");
        assert!(matches!(
            outcome,
            Err(ExtractError::UnsupportedContentType(_))
        ));
    }

    /// Bytes that are not a PDF surface as a PDF error.
    #[test]
    fn invalid_pdf_returns_error() {
        let outcome = extract_text(b"not a pdf", MIME_PDF);
        assert!(matches!(outcome, Err(ExtractError::Pdf(_))));
    }

    /// Bytes that are not a ZIP archive surface as an OOXML error.
    #[test]
    fn invalid_zip_returns_error_for_docx() {
        let outcome = extract_text(b"not a zip", MIME_DOCX);
        assert!(matches!(outcome, Err(ExtractError::Ooxml(_))));
    }
}