// context_harness/connector_fs.rs

//! Filesystem connector.
//!
//! Walks a local directory, applies glob include/exclude patterns, and produces
//! [`SourceItem`]s with filesystem metadata (modification time, file path).
//!
//! # Configuration
//!
//! ```toml
//! [connectors.filesystem.docs]
//! root = "./docs"
//! include_globs = ["**/*.md", "**/*.txt"]
//! exclude_globs = ["**/drafts/**"]
//! follow_symlinks = false
//! ```
//!
//! # Default Excludes
//!
//! The following patterns are always excluded, in addition to any configured
//! `exclude_globs`:
//! - `**/.git/**`
//! - `**/target/**`
//! - `**/node_modules/**`
//!
//! # Output
//!
//! Each file becomes a [`SourceItem`] with:
//! - `source`: `"filesystem:<name>"` (e.g. `"filesystem:docs"`)
//! - `source_id`: path relative to the root (e.g. `"guides/deploy.md"`)
//! - `source_url`: `file://` URI
//! - `created_at` / `updated_at`: filesystem modification time
//! - `body`: file contents as UTF-8; empty for `.pdf`, `.docx`, `.pptx` and
//!   `.xlsx` files, whose raw bytes are carried in `raw_bytes` instead
31
32use anyhow::{bail, Result};
33use async_trait::async_trait;
34use chrono::{TimeZone, Utc};
35use globset::{Glob, GlobSet, GlobSetBuilder};
36use std::path::Path;
37use walkdir::WalkDir;
38
39use crate::config::FilesystemConnectorConfig;
40use crate::models::SourceItem;
41use crate::traits::Connector;
42
/// File extensions (leading dot, lowercase) whose contents are read as raw
/// bytes for later extraction instead of being decoded as UTF-8 (spec §2.2).
const BINARY_EXTENSIONS: &[&str] = &[".pdf", ".docx", ".pptx", ".xlsx"];

/// Look up the MIME type for a binary document extension (spec §4.1).
///
/// `ext` must include the leading dot (e.g. `".pdf"`); the comparison ignores
/// case. Returns `None` for any extension that is not one of the supported
/// binary document formats.
fn binary_content_type(ext: &str) -> Option<&'static str> {
    // (extension, MIME type) pairs; extensions are stored lowercase.
    const MIME_BY_EXTENSION: &[(&str, &str)] = &[
        (".pdf", "application/pdf"),
        (
            ".docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            ".pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        (
            ".xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
    ];

    let normalized = ext.to_lowercase();
    let hit = MIME_BY_EXTENSION
        .iter()
        .find(|(known, _)| *known == normalized.as_str());
    hit.map(|&(_, mime)| mime)
}
58
// ═══════════════════════════════════════════════════════════════════════
// Connector trait implementation
// ═══════════════════════════════════════════════════════════════════════
62
63/// A filesystem connector instance that implements the [`Connector`] trait.
64///
65/// Wraps the [`scan_filesystem`] function, allowing filesystem connectors
66/// to be used through the unified trait-based dispatch.
67///
68/// # Example
69///
70/// ```rust,no_run
71/// use context_harness::connector_fs::FilesystemConnector;
72/// use context_harness::config::FilesystemConnectorConfig;
73/// use context_harness::traits::Connector;
74///
75/// let config: FilesystemConnectorConfig = toml::from_str(r#"
76///     root = "./docs"
77///     include_globs = ["**/*.md"]
78/// "#).unwrap();
79/// let connector = FilesystemConnector::new("docs".into(), config);
80/// assert_eq!(connector.source_label(), "filesystem:docs");
81/// ```
82pub struct FilesystemConnector {
83    /// Instance name (e.g. `"docs"`).
84    name: String,
85    /// Configuration for this filesystem connector instance.
86    config: FilesystemConnectorConfig,
87}
88
89impl FilesystemConnector {
90    /// Create a new filesystem connector instance.
91    pub fn new(name: String, config: FilesystemConnectorConfig) -> Self {
92        Self { name, config }
93    }
94}
95
96#[async_trait]
97impl Connector for FilesystemConnector {
98    fn name(&self) -> &str {
99        &self.name
100    }
101
102    fn description(&self) -> &str {
103        "Walk local directories with glob patterns"
104    }
105
106    fn connector_type(&self) -> &str {
107        "filesystem"
108    }
109
110    async fn scan(&self) -> Result<Vec<SourceItem>> {
111        scan_filesystem(&self.name, &self.config)
112    }
113}
114
115/// Scan a local directory and produce [`SourceItem`]s.
116///
117/// Walks the configured `root` directory, applies include/exclude globs,
118/// reads each matching file, and returns a sorted list of `SourceItem`s.
119///
120/// # Arguments
121///
122/// - `name` — the instance name (e.g. `"docs"`). Used as part of the source
123///   identifier: `"filesystem:<name>"`.
124/// - `fs_config` — the filesystem connector configuration for this instance.
125///
126/// # Errors
127///
128/// Returns an error if:
129/// - The root directory does not exist
130/// - A glob pattern is invalid
131/// - A directory entry cannot be read
132pub fn scan_filesystem(
133    name: &str,
134    fs_config: &FilesystemConnectorConfig,
135) -> Result<Vec<SourceItem>> {
136    let root = &fs_config.root;
137    if !root.exists() {
138        bail!(
139            "Filesystem connector root does not exist: {}",
140            root.display()
141        );
142    }
143
144    let include_set = build_globset(&fs_config.include_globs)?;
145
146    let mut default_excludes = vec![
147        "**/.git/**".to_string(),
148        "**/target/**".to_string(),
149        "**/node_modules/**".to_string(),
150    ];
151    default_excludes.extend(fs_config.exclude_globs.clone());
152    let exclude_set = build_globset(&default_excludes)?;
153
154    let mut items = Vec::new();
155
156    let walker = WalkDir::new(root).follow_links(fs_config.follow_symlinks);
157    for entry in walker {
158        let entry = entry?;
159        if !entry.file_type().is_file() {
160            continue;
161        }
162
163        let path = entry.path();
164        let relative = path.strip_prefix(root).unwrap_or(path);
165        let rel_str = relative.to_string_lossy().to_string();
166
167        // Apply exclude patterns
168        if exclude_set.is_match(&rel_str) {
169            continue;
170        }
171
172        // Apply include patterns
173        if !include_set.is_match(&rel_str) {
174            continue;
175        }
176
177        let source_label = format!("filesystem:{}", name);
178        if let Some(item) = file_to_source_item(path, &rel_str, &source_label, fs_config)? {
179            items.push(item);
180        }
181    }
182
183    // Sort for deterministic ordering
184    items.sort_by(|a, b| a.source_id.cmp(&b.source_id));
185
186    Ok(items)
187}
188
189/// Convert a single file to a [`SourceItem`], or `None` if the file should be skipped (spec §2.2).
190///
191/// For files with a supported binary extension (.pdf, .docx, .pptx, .xlsx), reads raw bytes
192/// and returns an item with `raw_bytes` set and empty `body`. Otherwise reads as UTF-8; on
193/// decode failure and binary extension, falls back to raw bytes; else skips (returns `None`).
194fn file_to_source_item(
195    path: &Path,
196    relative_path: &str,
197    source: &str,
198    fs_config: &FilesystemConnectorConfig,
199) -> Result<Option<SourceItem>> {
200    let metadata = std::fs::metadata(path)?;
201    let modified = metadata
202        .modified()
203        .unwrap_or(std::time::SystemTime::UNIX_EPOCH);
204    let modified_secs = modified
205        .duration_since(std::time::SystemTime::UNIX_EPOCH)
206        .unwrap_or_default()
207        .as_secs() as i64;
208
209    let title = path
210        .file_name()
211        .map(|n| n.to_string_lossy().to_string())
212        .unwrap_or_default();
213
214    let ext = path
215        .extension()
216        .map(|e| format!(".{}", e.to_string_lossy()))
217        .unwrap_or_default()
218        .to_lowercase();
219    let is_binary_ext = BINARY_EXTENSIONS.contains(&ext.as_str());
220    let content_type_from_ext = binary_content_type(&ext);
221
222    if let (true, Some(mime)) = (is_binary_ext, content_type_from_ext) {
223        if metadata.len() > fs_config.max_extract_bytes {
224            return Ok(None);
225        }
226        let bytes = std::fs::read(path)?;
227        return Ok(Some(SourceItem {
228            source: source.to_string(),
229            source_id: relative_path.to_string(),
230            source_url: Some(format!("file://{}", path.display())),
231            title: Some(title),
232            author: None,
233            created_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
234            updated_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
235            content_type: mime.to_string(),
236            body: String::new(),
237            metadata_json: "{}".to_string(),
238            raw_json: None,
239            raw_bytes: Some(bytes),
240        }));
241    }
242
243    match std::fs::read_to_string(path) {
244        Ok(body) => Ok(Some(SourceItem {
245            source: source.to_string(),
246            source_id: relative_path.to_string(),
247            source_url: Some(format!("file://{}", path.display())),
248            title: Some(title),
249            author: None,
250            created_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
251            updated_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
252            content_type: "text/plain".to_string(),
253            body,
254            metadata_json: "{}".to_string(),
255            raw_json: None,
256            raw_bytes: None,
257        })),
258        Err(_) => {
259            if let (true, Some(mime)) = (is_binary_ext, content_type_from_ext) {
260                let bytes = std::fs::read(path)?;
261                Ok(Some(SourceItem {
262                    source: source.to_string(),
263                    source_id: relative_path.to_string(),
264                    source_url: Some(format!("file://{}", path.display())),
265                    title: Some(title),
266                    author: None,
267                    created_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
268                    updated_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
269                    content_type: mime.to_string(),
270                    body: String::new(),
271                    metadata_json: "{}".to_string(),
272                    raw_json: None,
273                    raw_bytes: Some(bytes),
274                }))
275            } else {
276                Ok(None)
277            }
278        }
279    }
280}
281
282/// Build a [`GlobSet`] from a list of glob pattern strings.
283fn build_globset(patterns: &[String]) -> Result<GlobSet> {
284    let mut builder = GlobSetBuilder::new();
285    for pattern in patterns {
286        builder.add(Glob::new(pattern)?);
287    }
288    Ok(builder.build()?)
289}