context_harness/
connector_fs.rs1use anyhow::{bail, Result};
33use async_trait::async_trait;
34use chrono::{TimeZone, Utc};
35use globset::{Glob, GlobSet, GlobSetBuilder};
36use std::path::Path;
37use walkdir::WalkDir;
38
39use crate::config::FilesystemConnectorConfig;
40use crate::models::SourceItem;
41use crate::traits::Connector;
42
/// Dot-prefixed, lower-case file extensions that this module treats as binary documents.
///
/// `file_to_source_item` checks a file's lower-cased extension against this list; matching
/// files are returned as raw bytes instead of being read as UTF-8 text.
const BINARY_EXTENSIONS: &[&str] = &[".pdf", ".docx", ".pptx", ".xlsx"];
45
/// Looks up the MIME type for a dot-prefixed binary document extension.
///
/// The comparison is case-insensitive (`".PDF"` and `".pdf"` are equivalent). The leading
/// dot is required: `"pdf"` does not match. Returns `None` for any extension that is not one
/// of the known binary document formats.
fn binary_content_type(ext: &str) -> Option<&'static str> {
    // (extension, MIME type) pairs for every supported binary document format.
    const MIME_BY_EXTENSION: &[(&str, &str)] = &[
        (".pdf", "application/pdf"),
        (
            ".docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            ".pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        (
            ".xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
    ];

    let normalized = ext.to_lowercase();
    MIME_BY_EXTENSION
        .iter()
        .find(|(known, _)| *known == normalized.as_str())
        .map(|&(_, mime)| mime)
}
58
/// Connector that produces [`SourceItem`]s from files under a local directory.
///
/// The struct only stores its inputs; the scanning itself is done by [`scan_filesystem`].
pub struct FilesystemConnector {
    /// Instance name; returned by `Connector::name` and passed to [`scan_filesystem`].
    name: String,
    /// Filesystem settings handed to [`scan_filesystem`] on every scan.
    config: FilesystemConnectorConfig,
}
88
impl FilesystemConnector {
    /// Creates a connector from an instance name and its filesystem configuration.
    ///
    /// No I/O or validation happens here; both values are stored as given.
    pub fn new(name: String, config: FilesystemConnectorConfig) -> Self {
        Self { name, config }
    }
}
95
/// [`Connector`] implementation that delegates scanning to [`scan_filesystem`].
#[async_trait]
impl Connector for FilesystemConnector {
    /// Returns the instance name this connector was constructed with.
    fn name(&self) -> &str {
        &self.name
    }

    /// Returns a fixed, human-readable summary of what this connector does.
    fn description(&self) -> &str {
        "Walk local directories with glob patterns"
    }

    /// Returns the fixed connector type identifier, `"filesystem"`.
    fn connector_type(&self) -> &str {
        "filesystem"
    }

    /// Scans the configured root by calling [`scan_filesystem`] with this connector's
    /// name and configuration, returning its result unchanged.
    ///
    /// NOTE(review): `scan_filesystem` is a plain synchronous function and is called here
    /// without an `.await`, so the directory walk and file reads appear to run inline on
    /// whatever task polls this future — confirm that is acceptable for the executor in use.
    async fn scan(&self) -> Result<Vec<SourceItem>> {
        scan_filesystem(&self.name, &self.config)
    }
}
114
115pub fn scan_filesystem(
133 name: &str,
134 fs_config: &FilesystemConnectorConfig,
135) -> Result<Vec<SourceItem>> {
136 let root = &fs_config.root;
137 if !root.exists() {
138 bail!(
139 "Filesystem connector root does not exist: {}",
140 root.display()
141 );
142 }
143
144 let include_set = build_globset(&fs_config.include_globs)?;
145
146 let mut default_excludes = vec![
147 "**/.git/**".to_string(),
148 "**/target/**".to_string(),
149 "**/node_modules/**".to_string(),
150 ];
151 default_excludes.extend(fs_config.exclude_globs.clone());
152 let exclude_set = build_globset(&default_excludes)?;
153
154 let mut items = Vec::new();
155
156 let walker = WalkDir::new(root).follow_links(fs_config.follow_symlinks);
157 for entry in walker {
158 let entry = entry?;
159 if !entry.file_type().is_file() {
160 continue;
161 }
162
163 let path = entry.path();
164 let relative = path.strip_prefix(root).unwrap_or(path);
165 let rel_str = relative.to_string_lossy().to_string();
166
167 if exclude_set.is_match(&rel_str) {
169 continue;
170 }
171
172 if !include_set.is_match(&rel_str) {
174 continue;
175 }
176
177 let source_label = format!("filesystem:{}", name);
178 if let Some(item) = file_to_source_item(path, &rel_str, &source_label, fs_config)? {
179 items.push(item);
180 }
181 }
182
183 items.sort_by(|a, b| a.source_id.cmp(&b.source_id));
185
186 Ok(items)
187}
188
189fn file_to_source_item(
195 path: &Path,
196 relative_path: &str,
197 source: &str,
198 fs_config: &FilesystemConnectorConfig,
199) -> Result<Option<SourceItem>> {
200 let metadata = std::fs::metadata(path)?;
201 let modified = metadata
202 .modified()
203 .unwrap_or(std::time::SystemTime::UNIX_EPOCH);
204 let modified_secs = modified
205 .duration_since(std::time::SystemTime::UNIX_EPOCH)
206 .unwrap_or_default()
207 .as_secs() as i64;
208
209 let title = path
210 .file_name()
211 .map(|n| n.to_string_lossy().to_string())
212 .unwrap_or_default();
213
214 let ext = path
215 .extension()
216 .map(|e| format!(".{}", e.to_string_lossy()))
217 .unwrap_or_default()
218 .to_lowercase();
219 let is_binary_ext = BINARY_EXTENSIONS.contains(&ext.as_str());
220 let content_type_from_ext = binary_content_type(&ext);
221
222 if let (true, Some(mime)) = (is_binary_ext, content_type_from_ext) {
223 if metadata.len() > fs_config.max_extract_bytes {
224 return Ok(None);
225 }
226 let bytes = std::fs::read(path)?;
227 return Ok(Some(SourceItem {
228 source: source.to_string(),
229 source_id: relative_path.to_string(),
230 source_url: Some(format!("file://{}", path.display())),
231 title: Some(title),
232 author: None,
233 created_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
234 updated_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
235 content_type: mime.to_string(),
236 body: String::new(),
237 metadata_json: "{}".to_string(),
238 raw_json: None,
239 raw_bytes: Some(bytes),
240 }));
241 }
242
243 match std::fs::read_to_string(path) {
244 Ok(body) => Ok(Some(SourceItem {
245 source: source.to_string(),
246 source_id: relative_path.to_string(),
247 source_url: Some(format!("file://{}", path.display())),
248 title: Some(title),
249 author: None,
250 created_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
251 updated_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
252 content_type: "text/plain".to_string(),
253 body,
254 metadata_json: "{}".to_string(),
255 raw_json: None,
256 raw_bytes: None,
257 })),
258 Err(_) => {
259 if let (true, Some(mime)) = (is_binary_ext, content_type_from_ext) {
260 let bytes = std::fs::read(path)?;
261 Ok(Some(SourceItem {
262 source: source.to_string(),
263 source_id: relative_path.to_string(),
264 source_url: Some(format!("file://{}", path.display())),
265 title: Some(title),
266 author: None,
267 created_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
268 updated_at: Utc.timestamp_opt(modified_secs, 0).unwrap(),
269 content_type: mime.to_string(),
270 body: String::new(),
271 metadata_json: "{}".to_string(),
272 raw_json: None,
273 raw_bytes: Some(bytes),
274 }))
275 } else {
276 Ok(None)
277 }
278 }
279 }
280}
281
282fn build_globset(patterns: &[String]) -> Result<GlobSet> {
284 let mut builder = GlobSetBuilder::new();
285 for pattern in patterns {
286 builder.add(Glob::new(pattern)?);
287 }
288 Ok(builder.build()?)
289}