1use anyhow::{bail, Context, Result};
45use async_trait::async_trait;
46use chrono::{TimeZone, Utc};
47use globset::{Glob, GlobSet, GlobSetBuilder};
48use sha2::{Digest, Sha256};
49use std::path::{Path, PathBuf};
50use std::process::Command;
51use walkdir::WalkDir;
52
53use crate::config::GitConnectorConfig;
54use crate::models::SourceItem;
55use crate::traits::Connector;
56
57pub struct GitConnector {
66 name: String,
68 config: GitConnectorConfig,
70 db_path: PathBuf,
72}
73
74impl GitConnector {
75 pub fn new(name: String, config: GitConnectorConfig, db_path: PathBuf) -> Self {
77 Self {
78 name,
79 config,
80 db_path,
81 }
82 }
83}
84
85#[async_trait]
86impl Connector for GitConnector {
87 fn name(&self) -> &str {
88 &self.name
89 }
90
91 fn description(&self) -> &str {
92 "Clone/pull Git repos and walk files"
93 }
94
95 fn connector_type(&self) -> &str {
96 "git"
97 }
98
99 async fn scan(&self) -> Result<Vec<SourceItem>> {
100 scan_git(&self.name, &self.config, &self.db_path)
101 }
102}
103
104pub fn scan_git(
129 name: &str,
130 git_config: &GitConnectorConfig,
131 db_path: &Path,
132) -> Result<Vec<SourceItem>> {
133 let cache_dir = match &git_config.cache_dir {
135 Some(dir) => dir.clone(),
136 None => {
137 let db_parent = db_path.parent().unwrap_or_else(|| Path::new("."));
139 let url_hash = short_hash(&git_config.url);
140 db_parent.join(".git-cache").join(url_hash)
141 }
142 };
143
144 if cache_dir.join(".git").exists() {
146 git_pull(&cache_dir, &git_config.branch)?;
147 } else {
148 git_clone(
149 &git_config.url,
150 &git_config.branch,
151 git_config.shallow,
152 &cache_dir,
153 )?;
154 }
155
156 let scan_root = if git_config.root == "." {
158 cache_dir.clone()
159 } else {
160 cache_dir.join(&git_config.root)
161 };
162
163 if !scan_root.exists() {
164 bail!(
165 "Git connector root '{}' does not exist in repo {}",
166 git_config.root,
167 git_config.url
168 );
169 }
170
171 let head_sha = git_head_sha(&cache_dir).unwrap_or_else(|_| "unknown".to_string());
173
174 let include_set = build_globset(&git_config.include_globs)?;
176
177 let mut default_excludes = vec![
178 "**/.git/**".to_string(),
179 "**/target/**".to_string(),
180 "**/node_modules/**".to_string(),
181 ];
182 default_excludes.extend(git_config.exclude_globs.clone());
183 let exclude_set = build_globset(&default_excludes)?;
184
185 let mut items = Vec::new();
186
187 for entry in WalkDir::new(&scan_root) {
188 let entry = entry?;
189 if !entry.file_type().is_file() {
190 continue;
191 }
192
193 let path = entry.path();
194 let relative = path.strip_prefix(&scan_root).unwrap_or(path);
195 let rel_str = relative.to_string_lossy().to_string();
196
197 if exclude_set.is_match(&rel_str) {
198 continue;
199 }
200 if !include_set.is_match(&rel_str) {
201 continue;
202 }
203
204 let source_label = format!("git:{}", name);
205 let item = file_to_source_item(
206 path,
207 &rel_str,
208 &cache_dir,
209 &git_config.url,
210 &head_sha,
211 &source_label,
212 )?;
213 items.push(item);
214 }
215
216 items.sort_by(|a, b| a.source_id.cmp(&b.source_id));
217 Ok(items)
218}
219
220fn git_clone(url: &str, branch: &str, shallow: bool, dest: &Path) -> Result<()> {
224 std::fs::create_dir_all(dest)
225 .with_context(|| format!("Failed to create cache directory: {}", dest.display()))?;
226
227 let mut cmd = Command::new("git");
228 cmd.args(["clone", "--branch", branch, "--single-branch"]);
229 if shallow {
230 cmd.args(["--depth", "1"]);
231 }
232 cmd.arg(url);
233 cmd.arg(dest);
234
235 let output = cmd
236 .output()
237 .with_context(|| "Failed to execute 'git clone'. Is git installed?")?;
238
239 if !output.status.success() {
240 let stderr = String::from_utf8_lossy(&output.stderr);
241 bail!("git clone failed: {}", stderr.trim());
242 }
243
244 Ok(())
245}
246
247fn git_pull(repo_dir: &Path, branch: &str) -> Result<()> {
249 let output = Command::new("git")
251 .args(["fetch", "origin", branch])
252 .current_dir(repo_dir)
253 .output()
254 .with_context(|| "Failed to execute 'git fetch'")?;
255
256 if !output.status.success() {
257 let stderr = String::from_utf8_lossy(&output.stderr);
258 bail!("git fetch failed: {}", stderr.trim());
259 }
260
261 let remote_ref = format!("origin/{}", branch);
263 let output = Command::new("git")
264 .args(["reset", "--hard", &remote_ref])
265 .current_dir(repo_dir)
266 .output()
267 .with_context(|| "Failed to execute 'git reset'")?;
268
269 if !output.status.success() {
270 let stderr = String::from_utf8_lossy(&output.stderr);
271 bail!("git reset failed: {}", stderr.trim());
272 }
273
274 Ok(())
275}
276
277fn git_head_sha(repo_dir: &Path) -> Result<String> {
279 let output = Command::new("git")
280 .args(["rev-parse", "HEAD"])
281 .current_dir(repo_dir)
282 .output()
283 .with_context(|| "Failed to get HEAD SHA")?;
284
285 if !output.status.success() {
286 bail!("git rev-parse HEAD failed");
287 }
288
289 Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
290}
291
/// Asks git for the committer timestamp (`%ct`, Unix seconds) of the most
/// recent commit touching `file_path`, running `git log` inside `repo_dir`.
///
/// Returns `None` when git cannot be run, exits unsuccessfully, or prints
/// something that does not parse as an integer (for example nothing at all).
fn git_file_last_commit_time(repo_dir: &Path, file_path: &Path) -> Option<i64> {
    let mut log_cmd = Command::new("git");
    log_cmd
        .args(["log", "-1", "--format=%ct", "--"])
        .arg(file_path)
        .current_dir(repo_dir);

    let result = log_cmd.output().ok()?;
    if result.status.success() {
        String::from_utf8_lossy(&result.stdout)
            .trim()
            .parse::<i64>()
            .ok()
    } else {
        None
    }
}
310
/// Asks git for the author name (`%an`) of the most recent commit touching
/// `file_path`, running `git log` inside `repo_dir`.
///
/// Returns `None` when git cannot be run, exits unsuccessfully, or prints
/// only whitespace.
fn git_file_last_author(repo_dir: &Path, file_path: &Path) -> Option<String> {
    let mut log_cmd = Command::new("git");
    log_cmd
        .args(["log", "-1", "--format=%an", "--"])
        .arg(file_path)
        .current_dir(repo_dir);

    let result = log_cmd.output().ok()?;
    if !result.status.success() {
        return None;
    }

    let name = String::from_utf8_lossy(&result.stdout).trim().to_string();
    Some(name).filter(|n| !n.is_empty())
}
333
334fn file_to_source_item(
339 path: &Path,
340 relative_path: &str,
341 repo_dir: &Path,
342 repo_url: &str,
343 head_sha: &str,
344 source: &str,
345) -> Result<SourceItem> {
346 let body = std::fs::read_to_string(path).unwrap_or_default();
347
348 let title = path
349 .file_name()
350 .map(|n| n.to_string_lossy().to_string())
351 .unwrap_or_default();
352
353 let commit_ts = git_file_last_commit_time(repo_dir, path);
355 let updated_secs = commit_ts.unwrap_or_else(|| {
356 let metadata = std::fs::metadata(path).ok();
357 metadata
358 .and_then(|m| m.modified().ok())
359 .and_then(|t| t.duration_since(std::time::SystemTime::UNIX_EPOCH).ok())
360 .map(|d| d.as_secs() as i64)
361 .unwrap_or(0)
362 });
363
364 let author = git_file_last_author(repo_dir, path);
365
366 let source_url = build_web_url(repo_url, head_sha, relative_path);
368
369 let metadata = serde_json::json!({
370 "git_sha": head_sha,
371 "repo_url": repo_url,
372 });
373
374 Ok(SourceItem {
375 source: source.to_string(),
376 source_id: relative_path.to_string(),
377 source_url: Some(source_url),
378 title: Some(title),
379 author,
380 created_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
381 updated_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
382 content_type: "text/plain".to_string(),
383 body,
384 metadata_json: metadata.to_string(),
385 raw_json: None,
386 raw_bytes: None,
387 })
388}
389
/// Builds a browsable URL for `relative_path` at commit `sha` of `repo_url`.
///
/// Recognised remotes:
/// * `git@github.com:<repo>` and any URL containing `github.com`
///   → `<base>/blob/<sha>/<path>`
/// * `git@gitlab.com:<repo>` and `http(s)://gitlab.com/<repo>`
///   → `<base>/-/blob/<sha>/<path>`
///
/// Trailing slashes and a single trailing `.git` are removed from the
/// repository part. Any other remote falls back to
/// `git://<repo_url>/<relative_path>`.
fn build_web_url(repo_url: &str, sha: &str, relative_path: &str) -> String {
    /// Drops trailing slashes and at most one `.git` suffix, so a repository
    /// whose name itself ends in `.git` keeps its name.
    fn repo_base(raw: &str) -> &str {
        let trimmed = raw.trim_end_matches('/');
        trimmed.strip_suffix(".git").unwrap_or(trimmed)
    }

    if let Some(rest) = repo_url.strip_prefix("git@github.com:") {
        return format!(
            "https://github.com/{}/blob/{}/{}",
            repo_base(rest),
            sha,
            relative_path
        );
    }

    if repo_url.contains("github.com") {
        return format!("{}/blob/{}/{}", repo_base(repo_url), sha, relative_path);
    }

    if let Some(rest) = repo_url.strip_prefix("git@gitlab.com:") {
        return format!(
            "https://gitlab.com/{}/-/blob/{}/{}",
            repo_base(rest),
            sha,
            relative_path
        );
    }

    // HTTP(S) GitLab remotes get the same treatment as their SSH form.
    if repo_url.starts_with("https://gitlab.com/") || repo_url.starts_with("http://gitlab.com/") {
        return format!("{}/-/blob/{}/{}", repo_base(repo_url), sha, relative_path);
    }

    format!("git://{}/{}", repo_url, relative_path)
}
419
420fn short_hash(input: &str) -> String {
422 let mut hasher = Sha256::new();
423 hasher.update(input.as_bytes());
424 format!("{:x}", hasher.finalize())[..12].to_string()
425}
426
427fn build_globset(patterns: &[String]) -> Result<GlobSet> {
429 let mut builder = GlobSetBuilder::new();
430 for pattern in patterns {
431 builder.add(Glob::new(pattern)?);
432 }
433 Ok(builder.build()?)
434}