context_harness/
connector_git.rs

1//! Git repository connector.
2//!
3//! Clones or updates a Git repository and walks files within a configurable
4//! subdirectory. Extracts rich metadata from `git log`: per-file commit
5//! timestamps, authors, and the HEAD commit SHA. Automatically generates
6//! web-browsable URLs for GitHub and GitLab repositories.
7//!
8//! # Configuration
9//!
10//! ```toml
11//! [connectors.git.platform]
12//! url = "https://github.com/acme/platform.git"
13//! branch = "main"
14//! root = "docs/"
15//! include_globs = ["**/*.md"]
16//! shallow = true
17//! ```
18//!
19//! # Cache Directory
20//!
//! Cloned repos are cached locally (default: alongside the SQLite DB in
//! `data/.git-cache/<url-hash>/`). Subsequent syncs run `git fetch` followed
//! by `git reset --hard` instead of cloning again.
23//!
24//! # Metadata Extraction
25//!
26//! For each file, the connector extracts:
27//! - **`updated_at`** — last commit timestamp from `git log -1 --format=%ct`
//! - **`author`** — author name of the last commit, from `git log -1 --format=%an`
29//! - **`source_url`** — web URL (GitHub/GitLab blob link) for the file
30//! - **`metadata_json`** — JSON with `git_sha` and `repo_url`
31//!
32//! # Web URL Generation
33//!
34//! The connector auto-detects GitHub and GitLab URLs and generates
35//! browsable blob links:
36//!
37//! | Input URL | Generated URL |
38//! |-----------|--------------|
39//! | `git@github.com:org/repo.git` | `https://github.com/org/repo/blob/<sha>/<path>` |
40//! | `https://github.com/org/repo.git` | `https://github.com/org/repo/blob/<sha>/<path>` |
41//! | `git@gitlab.com:org/repo.git` | `https://gitlab.com/org/repo/-/blob/<sha>/<path>` |
42//! | Other | `git://<url>/<path>` |
43
44use anyhow::{bail, Context, Result};
45use async_trait::async_trait;
46use chrono::{TimeZone, Utc};
47use globset::{Glob, GlobSet, GlobSetBuilder};
48use sha2::{Digest, Sha256};
49use std::path::{Path, PathBuf};
50use std::process::Command;
51use walkdir::WalkDir;
52
53use crate::config::GitConnectorConfig;
54use crate::models::SourceItem;
55use crate::traits::Connector;
56
57// ═══════════════════════════════════════════════════════════════════════
58// Connector trait implementation
59// ═══════════════════════════════════════════════════════════════════════
60
61/// A Git connector instance that implements the [`Connector`] trait.
62///
63/// Wraps the [`scan_git`] function, allowing Git connectors to be used
64/// through the unified trait-based dispatch.
65pub struct GitConnector {
66    /// Instance name (e.g. `"platform"`).
67    name: String,
68    /// Configuration for this Git connector instance.
69    config: GitConnectorConfig,
70    /// Path to the SQLite database, used to derive the default cache directory.
71    db_path: PathBuf,
72}
73
74impl GitConnector {
75    /// Create a new Git connector instance.
76    pub fn new(name: String, config: GitConnectorConfig, db_path: PathBuf) -> Self {
77        Self {
78            name,
79            config,
80            db_path,
81        }
82    }
83}
84
85#[async_trait]
86impl Connector for GitConnector {
87    fn name(&self) -> &str {
88        &self.name
89    }
90
91    fn description(&self) -> &str {
92        "Clone/pull Git repos and walk files"
93    }
94
95    fn connector_type(&self) -> &str {
96        "git"
97    }
98
99    async fn scan(&self) -> Result<Vec<SourceItem>> {
100        scan_git(&self.name, &self.config, &self.db_path)
101    }
102}
103
104/// Scan a Git repository and produce [`SourceItem`]s.
105///
106/// # Workflow
107///
108/// 1. Determine a local cache directory for the clone.
109/// 2. Clone (shallow if configured) or pull to update.
110/// 3. Walk files under the configured `root` subdirectory.
111/// 4. Apply include/exclude globs.
112/// 5. Extract per-file metadata from `git log`.
113/// 6. Generate web-browsable URLs.
114///
115/// # Arguments
116///
117/// - `name` — the instance name (e.g. `"platform"`). Used as part of the
118///   source identifier: `"git:<name>"`.
119/// - `git_config` — the Git connector configuration for this instance.
120/// - `db_path` — path to the SQLite database, used to derive the default cache directory.
121///
122/// # Errors
123///
124/// Returns an error if:
125/// - `git` binary is not available
126/// - Clone or pull fails
127/// - The configured `root` subdirectory does not exist in the repo
128pub fn scan_git(
129    name: &str,
130    git_config: &GitConnectorConfig,
131    db_path: &Path,
132) -> Result<Vec<SourceItem>> {
133    // Determine cache directory
134    let cache_dir = match &git_config.cache_dir {
135        Some(dir) => dir.clone(),
136        None => {
137            // Default: sibling to the DB file
138            let db_parent = db_path.parent().unwrap_or_else(|| Path::new("."));
139            let url_hash = short_hash(&git_config.url);
140            db_parent.join(".git-cache").join(url_hash)
141        }
142    };
143
144    // Clone or pull
145    if cache_dir.join(".git").exists() {
146        git_pull(&cache_dir, &git_config.branch)?;
147    } else {
148        git_clone(
149            &git_config.url,
150            &git_config.branch,
151            git_config.shallow,
152            &cache_dir,
153        )?;
154    }
155
156    // Resolve the scan root within the cloned repo
157    let scan_root = if git_config.root == "." {
158        cache_dir.clone()
159    } else {
160        cache_dir.join(&git_config.root)
161    };
162
163    if !scan_root.exists() {
164        bail!(
165            "Git connector root '{}' does not exist in repo {}",
166            git_config.root,
167            git_config.url
168        );
169    }
170
171    // Get the HEAD commit SHA for metadata
172    let head_sha = git_head_sha(&cache_dir).unwrap_or_else(|_| "unknown".to_string());
173
174    // Build glob sets
175    let include_set = build_globset(&git_config.include_globs)?;
176
177    let mut default_excludes = vec![
178        "**/.git/**".to_string(),
179        "**/target/**".to_string(),
180        "**/node_modules/**".to_string(),
181    ];
182    default_excludes.extend(git_config.exclude_globs.clone());
183    let exclude_set = build_globset(&default_excludes)?;
184
185    let mut items = Vec::new();
186
187    for entry in WalkDir::new(&scan_root) {
188        let entry = entry?;
189        if !entry.file_type().is_file() {
190            continue;
191        }
192
193        let path = entry.path();
194        let relative = path.strip_prefix(&scan_root).unwrap_or(path);
195        let rel_str = relative.to_string_lossy().to_string();
196
197        if exclude_set.is_match(&rel_str) {
198            continue;
199        }
200        if !include_set.is_match(&rel_str) {
201            continue;
202        }
203
204        let source_label = format!("git:{}", name);
205        let item = file_to_source_item(
206            path,
207            &rel_str,
208            &cache_dir,
209            &git_config.url,
210            &head_sha,
211            &source_label,
212        )?;
213        items.push(item);
214    }
215
216    items.sort_by(|a, b| a.source_id.cmp(&b.source_id));
217    Ok(items)
218}
219
220/// Clone a Git repository into the cache directory.
221///
222/// Supports shallow clones (`--depth 1`) and single-branch checkout.
223fn git_clone(url: &str, branch: &str, shallow: bool, dest: &Path) -> Result<()> {
224    std::fs::create_dir_all(dest)
225        .with_context(|| format!("Failed to create cache directory: {}", dest.display()))?;
226
227    let mut cmd = Command::new("git");
228    cmd.args(["clone", "--branch", branch, "--single-branch"]);
229    if shallow {
230        cmd.args(["--depth", "1"]);
231    }
232    cmd.arg(url);
233    cmd.arg(dest);
234
235    let output = cmd
236        .output()
237        .with_context(|| "Failed to execute 'git clone'. Is git installed?")?;
238
239    if !output.status.success() {
240        let stderr = String::from_utf8_lossy(&output.stderr);
241        bail!("git clone failed: {}", stderr.trim());
242    }
243
244    Ok(())
245}
246
247/// Update an existing cached repository via fetch + hard reset.
248fn git_pull(repo_dir: &Path, branch: &str) -> Result<()> {
249    // Fetch the latest changes
250    let output = Command::new("git")
251        .args(["fetch", "origin", branch])
252        .current_dir(repo_dir)
253        .output()
254        .with_context(|| "Failed to execute 'git fetch'")?;
255
256    if !output.status.success() {
257        let stderr = String::from_utf8_lossy(&output.stderr);
258        bail!("git fetch failed: {}", stderr.trim());
259    }
260
261    // Reset to the fetched branch
262    let remote_ref = format!("origin/{}", branch);
263    let output = Command::new("git")
264        .args(["reset", "--hard", &remote_ref])
265        .current_dir(repo_dir)
266        .output()
267        .with_context(|| "Failed to execute 'git reset'")?;
268
269    if !output.status.success() {
270        let stderr = String::from_utf8_lossy(&output.stderr);
271        bail!("git reset failed: {}", stderr.trim());
272    }
273
274    Ok(())
275}
276
277/// Get the HEAD commit SHA of a repository.
278fn git_head_sha(repo_dir: &Path) -> Result<String> {
279    let output = Command::new("git")
280        .args(["rev-parse", "HEAD"])
281        .current_dir(repo_dir)
282        .output()
283        .with_context(|| "Failed to get HEAD SHA")?;
284
285    if !output.status.success() {
286        bail!("git rev-parse HEAD failed");
287    }
288
289    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
290}
291
/// Look up the committer timestamp (Unix epoch seconds) of the most recent
/// commit that touched `file_path`.
///
/// Runs `git log -1 --format=%ct -- <file_path>` with `repo_dir` as the
/// working directory. Returns `None` when `git` cannot be started, exits
/// unsuccessfully, or prints nothing that parses as an integer (for example
/// because the path has no history).
fn git_file_last_commit_time(repo_dir: &Path, file_path: &Path) -> Option<i64> {
    let mut log = Command::new("git");
    log.current_dir(repo_dir)
        .arg("log")
        .arg("-1")
        .arg("--format=%ct")
        .arg("--")
        .arg(file_path);

    match log.output() {
        Ok(result) if result.status.success() => {
            let printed = String::from_utf8_lossy(&result.stdout);
            printed.trim().parse::<i64>().ok()
        }
        _ => None,
    }
}
310
/// Look up the author name of the most recent commit that touched
/// `file_path`.
///
/// Runs `git log -1 --format=%an -- <file_path>` with `repo_dir` as the
/// working directory. Returns `None` when `git` cannot be started, exits
/// unsuccessfully, or prints only whitespace (for example because the path
/// has no history).
fn git_file_last_author(repo_dir: &Path, file_path: &Path) -> Option<String> {
    let result = Command::new("git")
        .current_dir(repo_dir)
        .arg("log")
        .arg("-1")
        .arg("--format=%an")
        .arg("--")
        .arg(file_path)
        .output()
        .ok()
        .filter(|out| out.status.success())?;

    let name = String::from_utf8_lossy(&result.stdout).trim().to_string();
    Some(name).filter(|n| !n.is_empty())
}
333
334/// Convert a file in the cloned repo to a [`SourceItem`].
335///
336/// Extracts Git metadata (commit timestamp, author) and generates
337/// a web-browsable URL for GitHub/GitLab repositories.
338fn file_to_source_item(
339    path: &Path,
340    relative_path: &str,
341    repo_dir: &Path,
342    repo_url: &str,
343    head_sha: &str,
344    source: &str,
345) -> Result<SourceItem> {
346    let body = std::fs::read_to_string(path).unwrap_or_default();
347
348    let title = path
349        .file_name()
350        .map(|n| n.to_string_lossy().to_string())
351        .unwrap_or_default();
352
353    // Try to get the git commit timestamp for this file; fall back to filesystem mtime
354    let commit_ts = git_file_last_commit_time(repo_dir, path);
355    let updated_secs = commit_ts.unwrap_or_else(|| {
356        let metadata = std::fs::metadata(path).ok();
357        metadata
358            .and_then(|m| m.modified().ok())
359            .and_then(|t| t.duration_since(std::time::SystemTime::UNIX_EPOCH).ok())
360            .map(|d| d.as_secs() as i64)
361            .unwrap_or(0)
362    });
363
364    let author = git_file_last_author(repo_dir, path);
365
366    // Build a web URL if this looks like a GitHub/GitLab repo
367    let source_url = build_web_url(repo_url, head_sha, relative_path);
368
369    let metadata = serde_json::json!({
370        "git_sha": head_sha,
371        "repo_url": repo_url,
372    });
373
374    Ok(SourceItem {
375        source: source.to_string(),
376        source_id: relative_path.to_string(),
377        source_url: Some(source_url),
378        title: Some(title),
379        author,
380        created_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
381        updated_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
382        content_type: "text/plain".to_string(),
383        body,
384        metadata_json: metadata.to_string(),
385        raw_json: None,
386        raw_bytes: None,
387    })
388}
389
/// Attempt to build a web-browsable URL from the git remote URL.
///
/// Recognises GitHub and GitLab remotes in scp-like form
/// (`git@github.com:org/repo.git`) and in URL form
/// (`https://…`, `ssh://…`, with optional user info and port). The host is
/// matched exactly, so a repository that merely has `github.com` somewhere
/// in its path is not mistaken for a GitHub remote, and user info embedded in
/// the remote never ends up in the generated link.
///
/// | Host | Result |
/// |------|--------|
/// | `github.com` | `https://github.com/<repo>/blob/<sha>/<path>` |
/// | `gitlab.com` | `https://gitlab.com/<repo>/-/blob/<sha>/<path>` |
/// | anything else | `git://<repo_url>/<path>` |
///
/// The fallback embeds `repo_url` verbatim.
fn build_web_url(repo_url: &str, sha: &str, relative_path: &str) -> String {
    // Split a remote into `(host, repo_path)`; the repo path has no leading
    // or trailing `/` and no `.git` suffix. Returns `None` for anything that
    // is neither `scheme://authority/path` nor scp-like `authority:path`.
    fn split_remote(remote: &str) -> Option<(&str, &str)> {
        let (authority, path) = match remote.split_once("://") {
            Some((_scheme, rest)) => rest.split_once('/')?,
            None => remote.split_once(':')?,
        };

        // Drop `user[:password]@` and, in URL form, `:port`.
        let host_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);
        let host = host_port.split_once(':').map_or(host_port, |(h, _)| h);

        let path = path.trim_matches('/');
        let path = path.strip_suffix(".git").unwrap_or(path);

        if host.is_empty() || path.is_empty() {
            None
        } else {
            Some((host, path))
        }
    }

    // Host comparison that tolerates case differences and a `www.` prefix.
    fn is_host(host: &str, expected: &str) -> bool {
        let bare = if host.len() > 4 && host[..4].eq_ignore_ascii_case("www.") {
            &host[4..]
        } else {
            host
        };
        bare.eq_ignore_ascii_case(expected)
    }

    match split_remote(repo_url) {
        Some((host, repo)) if is_host(host, "github.com") => {
            format!("https://github.com/{}/blob/{}/{}", repo, sha, relative_path)
        }
        Some((host, repo)) if is_host(host, "gitlab.com") => {
            format!(
                "https://gitlab.com/{}/-/blob/{}/{}",
                repo, sha, relative_path
            )
        }
        // Fallback: just reference the relative path
        _ => format!("git://{}/{}", repo_url, relative_path),
    }
}
419
420/// Generate a short (12-char) SHA-256 hash of input, used for cache directory naming.
421fn short_hash(input: &str) -> String {
422    let mut hasher = Sha256::new();
423    hasher.update(input.as_bytes());
424    format!("{:x}", hasher.finalize())[..12].to_string()
425}
426
427/// Build a [`GlobSet`] from a list of glob pattern strings.
428fn build_globset(patterns: &[String]) -> Result<GlobSet> {
429    let mut builder = GlobSetBuilder::new();
430    for pattern in patterns {
431        builder.add(Glob::new(pattern)?);
432    }
433    Ok(builder.build()?)
434}