1use anyhow::{bail, Context, Result};
46use async_trait::async_trait;
47use chrono::{TimeZone, Utc};
48use globset::{Glob, GlobSet, GlobSetBuilder};
49use sha2::{Digest, Sha256};
50use std::path::{Path, PathBuf};
51use std::process::Command;
52use walkdir::WalkDir;
53
54use crate::config::GitConnectorConfig;
55use crate::ctx_dirs;
56use crate::models::SourceItem;
57use crate::traits::Connector;
58
59pub struct GitConnector {
68 name: String,
70 config: GitConnectorConfig,
72 db_path: PathBuf,
74}
75
76impl GitConnector {
77 pub fn new(name: String, config: GitConnectorConfig, db_path: PathBuf) -> Self {
79 Self {
80 name,
81 config,
82 db_path,
83 }
84 }
85}
86
87#[async_trait]
88impl Connector for GitConnector {
89 fn name(&self) -> &str {
90 &self.name
91 }
92
93 fn description(&self) -> &str {
94 "Clone/pull Git repos and walk files"
95 }
96
97 fn connector_type(&self) -> &str {
98 "git"
99 }
100
101 async fn scan(&self) -> Result<Vec<SourceItem>> {
102 scan_git(&self.name, &self.config, &self.db_path)
103 }
104}
105
106pub fn scan_git(
131 name: &str,
132 git_config: &GitConnectorConfig,
133 db_path: &Path,
134) -> Result<Vec<SourceItem>> {
135 let cache_dir = match &git_config.cache_dir {
137 Some(dir) => dir.clone(),
138 None => {
139 let url_hash = short_hash(&git_config.url);
140 if ctx_dirs::is_default_workspace_db_path(db_path) {
141 ctx_dirs::workspace_git_cache_dir().join(url_hash)
142 } else {
143 let db_parent = db_path.parent().unwrap_or_else(|| Path::new("."));
145 db_parent.join(".git-cache").join(url_hash)
146 }
147 }
148 };
149
150 if cache_dir.join(".git").exists() {
152 git_pull(&cache_dir, &git_config.branch)?;
153 } else {
154 git_clone(
155 &git_config.url,
156 &git_config.branch,
157 git_config.shallow,
158 &cache_dir,
159 )?;
160 }
161
162 let scan_root = if git_config.root == "." {
164 cache_dir.clone()
165 } else {
166 cache_dir.join(&git_config.root)
167 };
168
169 if !scan_root.exists() {
170 bail!(
171 "Git connector root '{}' does not exist in repo {}",
172 git_config.root,
173 git_config.url
174 );
175 }
176
177 let head_sha = git_head_sha(&cache_dir).unwrap_or_else(|_| "unknown".to_string());
179
180 let include_set = build_globset(&git_config.include_globs)?;
182
183 let mut default_excludes = vec![
184 "**/.git/**".to_string(),
185 "**/target/**".to_string(),
186 "**/node_modules/**".to_string(),
187 ];
188 default_excludes.extend(git_config.exclude_globs.clone());
189 let exclude_set = build_globset(&default_excludes)?;
190
191 let mut items = Vec::new();
192
193 for entry in WalkDir::new(&scan_root) {
194 let entry = entry?;
195 if !entry.file_type().is_file() {
196 continue;
197 }
198
199 let path = entry.path();
200 let relative = path.strip_prefix(&scan_root).unwrap_or(path);
201 let rel_str = relative.to_string_lossy().to_string();
202
203 if exclude_set.is_match(&rel_str) {
204 continue;
205 }
206 if !include_set.is_match(&rel_str) {
207 continue;
208 }
209
210 let source_label = format!("git:{}", name);
211 let item = file_to_source_item(
212 path,
213 &rel_str,
214 &cache_dir,
215 &git_config.url,
216 &head_sha,
217 &source_label,
218 )?;
219 items.push(item);
220 }
221
222 items.sort_by(|a, b| a.source_id.cmp(&b.source_id));
223 Ok(items)
224}
225
226fn git_clone(url: &str, branch: &str, shallow: bool, dest: &Path) -> Result<()> {
230 std::fs::create_dir_all(dest)
231 .with_context(|| format!("Failed to create cache directory: {}", dest.display()))?;
232
233 let mut cmd = Command::new("git");
234 cmd.args(["clone", "--branch", branch, "--single-branch"]);
235 if shallow {
236 cmd.args(["--depth", "1"]);
237 }
238 cmd.arg(url);
239 cmd.arg(dest);
240
241 let output = cmd
242 .output()
243 .with_context(|| "Failed to execute 'git clone'. Is git installed?")?;
244
245 if !output.status.success() {
246 let stderr = String::from_utf8_lossy(&output.stderr);
247 bail!("git clone failed: {}", stderr.trim());
248 }
249
250 Ok(())
251}
252
253fn git_pull(repo_dir: &Path, branch: &str) -> Result<()> {
255 let output = Command::new("git")
257 .args(["fetch", "origin", branch])
258 .current_dir(repo_dir)
259 .output()
260 .with_context(|| "Failed to execute 'git fetch'")?;
261
262 if !output.status.success() {
263 let stderr = String::from_utf8_lossy(&output.stderr);
264 bail!("git fetch failed: {}", stderr.trim());
265 }
266
267 let remote_ref = format!("origin/{}", branch);
269 let output = Command::new("git")
270 .args(["reset", "--hard", &remote_ref])
271 .current_dir(repo_dir)
272 .output()
273 .with_context(|| "Failed to execute 'git reset'")?;
274
275 if !output.status.success() {
276 let stderr = String::from_utf8_lossy(&output.stderr);
277 bail!("git reset failed: {}", stderr.trim());
278 }
279
280 Ok(())
281}
282
283fn git_head_sha(repo_dir: &Path) -> Result<String> {
285 let output = Command::new("git")
286 .args(["rev-parse", "HEAD"])
287 .current_dir(repo_dir)
288 .output()
289 .with_context(|| "Failed to get HEAD SHA")?;
290
291 if !output.status.success() {
292 bail!("git rev-parse HEAD failed");
293 }
294
295 Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
296}
297
/// Returns the committer timestamp (seconds since the Unix epoch) of the most
/// recent commit that touched `file_path`, or `None` when it cannot be
/// determined.
///
/// `None` is returned when `git` cannot be started, when `git log` exits
/// unsuccessfully, or when its output is not an integer (for example because
/// the file has no commits).
fn git_file_last_commit_time(repo_dir: &Path, file_path: &Path) -> Option<i64> {
    // The child runs with `repo_dir` as its working directory, so the
    // pathspec has to be relative to `repo_dir`. Passing the caller's path
    // unchanged breaks when `repo_dir` is itself relative: git would resolve
    // it against the wrong directory and report no commits.
    let pathspec = file_path
        .strip_prefix(repo_dir)
        .ok()
        .filter(|rel| !rel.as_os_str().is_empty())
        .unwrap_or(file_path);

    let output = Command::new("git")
        .args(["log", "-1", "--format=%ct", "--"])
        .arg(pathspec)
        .current_dir(repo_dir)
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let ts_str = String::from_utf8_lossy(&output.stdout);
    ts_str.trim().parse::<i64>().ok()
}
316
/// Returns the author name of the most recent commit that touched
/// `file_path`, or `None` when it cannot be determined.
///
/// `None` is returned when `git` cannot be started, when `git log` exits
/// unsuccessfully, or when it prints nothing (for example because the file
/// has no commits).
fn git_file_last_author(repo_dir: &Path, file_path: &Path) -> Option<String> {
    // The child runs with `repo_dir` as its working directory, so the
    // pathspec has to be relative to `repo_dir`. Passing the caller's path
    // unchanged breaks when `repo_dir` is itself relative: git would resolve
    // it against the wrong directory and report no commits.
    let pathspec = file_path
        .strip_prefix(repo_dir)
        .ok()
        .filter(|rel| !rel.as_os_str().is_empty())
        .unwrap_or(file_path);

    let output = Command::new("git")
        .args(["log", "-1", "--format=%an", "--"])
        .arg(pathspec)
        .current_dir(repo_dir)
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let author = String::from_utf8_lossy(&output.stdout).trim().to_string();
    if author.is_empty() {
        None
    } else {
        Some(author)
    }
}
339
340fn file_to_source_item(
345 path: &Path,
346 relative_path: &str,
347 repo_dir: &Path,
348 repo_url: &str,
349 head_sha: &str,
350 source: &str,
351) -> Result<SourceItem> {
352 let body = std::fs::read_to_string(path).unwrap_or_default();
353
354 let title = path
355 .file_name()
356 .map(|n| n.to_string_lossy().to_string())
357 .unwrap_or_default();
358
359 let commit_ts = git_file_last_commit_time(repo_dir, path);
361 let updated_secs = commit_ts.unwrap_or_else(|| {
362 let metadata = std::fs::metadata(path).ok();
363 metadata
364 .and_then(|m| m.modified().ok())
365 .and_then(|t| t.duration_since(std::time::SystemTime::UNIX_EPOCH).ok())
366 .map(|d| d.as_secs() as i64)
367 .unwrap_or(0)
368 });
369
370 let author = git_file_last_author(repo_dir, path);
371
372 let source_url = build_web_url(repo_url, head_sha, relative_path);
374
375 let metadata = serde_json::json!({
376 "git_sha": head_sha,
377 "repo_url": repo_url,
378 });
379
380 Ok(SourceItem {
381 source: source.to_string(),
382 source_id: relative_path.to_string(),
383 source_url: Some(source_url),
384 title: Some(title),
385 author,
386 created_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
387 updated_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
388 content_type: "text/plain".to_string(),
389 body,
390 metadata_json: metadata.to_string(),
391 raw_json: None,
392 raw_bytes: None,
393 })
394}
395
/// Builds a browsable URL for `relative_path` at commit `sha` of `repo_url`.
///
/// Recognised remotes:
/// * GitHub SSH (`git@github.com:owner/repo.git`) and any URL containing
///   `github.com` produce `<repo>/blob/<sha>/<path>`.
/// * GitLab SSH (`git@gitlab.com:owner/repo.git`) and GitLab HTTP(S) URLs
///   produce `<repo>/-/blob/<sha>/<path>`.
///
/// Any other remote falls back to `git://<repo_url>/<path>`.
fn build_web_url(repo_url: &str, sha: &str, relative_path: &str) -> String {
    /// Removes trailing slashes and then at most one `.git` suffix, so a
    /// repository whose name itself ends in `.git` is not truncated.
    fn repo_base(raw: &str) -> &str {
        let trimmed = raw.trim_end_matches('/');
        trimmed.strip_suffix(".git").unwrap_or(trimmed)
    }

    if let Some(rest) = repo_url.strip_prefix("git@github.com:") {
        return format!(
            "https://github.com/{}/blob/{}/{}",
            repo_base(rest),
            sha,
            relative_path
        );
    }

    if repo_url.contains("github.com") {
        return format!("{}/blob/{}/{}", repo_base(repo_url), sha, relative_path);
    }

    if let Some(rest) = repo_url.strip_prefix("git@gitlab.com:") {
        return format!(
            "https://gitlab.com/{}/-/blob/{}/{}",
            repo_base(rest),
            sha,
            relative_path
        );
    }

    // HTTP(S) GitLab remotes get the same treatment as the SSH form above.
    if repo_url.starts_with("https://gitlab.com/") || repo_url.starts_with("http://gitlab.com/") {
        return format!("{}/-/blob/{}/{}", repo_base(repo_url), sha, relative_path);
    }

    format!("git://{}/{}", repo_url, relative_path)
}
425
426fn short_hash(input: &str) -> String {
428 let mut hasher = Sha256::new();
429 hasher.update(input.as_bytes());
430 format!("{:x}", hasher.finalize())[..12].to_string()
431}
432
433fn build_globset(patterns: &[String]) -> Result<GlobSet> {
435 let mut builder = GlobSetBuilder::new();
436 for pattern in patterns {
437 builder.add(Glob::new(pattern)?);
438 }
439 Ok(builder.build()?)
440}