Skip to main content

context_harness/
connector_git.rs

1//! Git repository connector.
2//!
3//! Clones or updates a Git repository and walks files within a configurable
4//! subdirectory. Extracts rich metadata from `git log`: per-file commit
5//! timestamps, authors, and the HEAD commit SHA. Automatically generates
6//! web-browsable URLs for GitHub and GitLab repositories.
7//!
8//! # Configuration
9//!
10//! ```toml
11//! [connectors.git.platform]
12//! url = "https://github.com/acme/platform.git"
13//! branch = "main"
14//! root = "docs/"
15//! include_globs = ["**/*.md"]
16//! shallow = true
17//! ```
18//!
19//! # Cache Directory
20//!
21//! Cloned repos are cached locally (default: alongside the SQLite DB in
22//! `.ctx/cache/git/<url-hash>/` for workspace-default configs). Subsequent
23//! syncs do `git fetch && reset`.
24//!
25//! # Metadata Extraction
26//!
27//! For each file, the connector extracts:
28//! - **`updated_at`** — last commit timestamp from `git log -1 --format=%ct`
//! - **`author`** — author name of the last commit from `git log -1 --format=%an`
30//! - **`source_url`** — web URL (GitHub/GitLab blob link) for the file
31//! - **`metadata_json`** — JSON with `git_sha` and `repo_url`
32//!
33//! # Web URL Generation
34//!
35//! The connector auto-detects GitHub and GitLab URLs and generates
36//! browsable blob links:
37//!
38//! | Input URL | Generated URL |
39//! |-----------|--------------|
40//! | `git@github.com:org/repo.git` | `https://github.com/org/repo/blob/<sha>/<path>` |
41//! | `https://github.com/org/repo.git` | `https://github.com/org/repo/blob/<sha>/<path>` |
42//! | `git@gitlab.com:org/repo.git` | `https://gitlab.com/org/repo/-/blob/<sha>/<path>` |
43//! | Other | `git://<url>/<path>` |
44
45use anyhow::{bail, Context, Result};
46use async_trait::async_trait;
47use chrono::{TimeZone, Utc};
48use globset::{Glob, GlobSet, GlobSetBuilder};
49use sha2::{Digest, Sha256};
50use std::path::{Path, PathBuf};
51use std::process::Command;
52use walkdir::WalkDir;
53
54use crate::config::GitConnectorConfig;
55use crate::ctx_dirs;
56use crate::models::SourceItem;
57use crate::traits::Connector;
58
59// ═══════════════════════════════════════════════════════════════════════
60// Connector trait implementation
61// ═══════════════════════════════════════════════════════════════════════
62
63/// A Git connector instance that implements the [`Connector`] trait.
64///
65/// Wraps the [`scan_git`] function, allowing Git connectors to be used
66/// through the unified trait-based dispatch.
67pub struct GitConnector {
68    /// Instance name (e.g. `"platform"`).
69    name: String,
70    /// Configuration for this Git connector instance.
71    config: GitConnectorConfig,
72    /// Path to the SQLite database, used to derive the default cache directory.
73    db_path: PathBuf,
74}
75
76impl GitConnector {
77    /// Create a new Git connector instance.
78    pub fn new(name: String, config: GitConnectorConfig, db_path: PathBuf) -> Self {
79        Self {
80            name,
81            config,
82            db_path,
83        }
84    }
85}
86
87#[async_trait]
88impl Connector for GitConnector {
89    fn name(&self) -> &str {
90        &self.name
91    }
92
93    fn description(&self) -> &str {
94        "Clone/pull Git repos and walk files"
95    }
96
97    fn connector_type(&self) -> &str {
98        "git"
99    }
100
101    async fn scan(&self) -> Result<Vec<SourceItem>> {
102        scan_git(&self.name, &self.config, &self.db_path)
103    }
104}
105
106/// Scan a Git repository and produce [`SourceItem`]s.
107///
108/// # Workflow
109///
110/// 1. Determine a local cache directory for the clone.
111/// 2. Clone (shallow if configured) or pull to update.
112/// 3. Walk files under the configured `root` subdirectory.
113/// 4. Apply include/exclude globs.
114/// 5. Extract per-file metadata from `git log`.
115/// 6. Generate web-browsable URLs.
116///
117/// # Arguments
118///
119/// - `name` — the instance name (e.g. `"platform"`). Used as part of the
120///   source identifier: `"git:<name>"`.
121/// - `git_config` — the Git connector configuration for this instance.
122/// - `db_path` — path to the SQLite database, used to derive the default cache directory.
123///
124/// # Errors
125///
126/// Returns an error if:
127/// - `git` binary is not available
128/// - Clone or pull fails
129/// - The configured `root` subdirectory does not exist in the repo
130pub fn scan_git(
131    name: &str,
132    git_config: &GitConnectorConfig,
133    db_path: &Path,
134) -> Result<Vec<SourceItem>> {
135    // Determine cache directory
136    let cache_dir = match &git_config.cache_dir {
137        Some(dir) => dir.clone(),
138        None => {
139            let url_hash = short_hash(&git_config.url);
140            if ctx_dirs::is_default_workspace_db_path(db_path) {
141                ctx_dirs::workspace_git_cache_dir().join(url_hash)
142            } else {
143                // Legacy fallback: sibling to the DB file.
144                let db_parent = db_path.parent().unwrap_or_else(|| Path::new("."));
145                db_parent.join(".git-cache").join(url_hash)
146            }
147        }
148    };
149
150    // Clone or pull
151    if cache_dir.join(".git").exists() {
152        git_pull(&cache_dir, &git_config.branch)?;
153    } else {
154        git_clone(
155            &git_config.url,
156            &git_config.branch,
157            git_config.shallow,
158            &cache_dir,
159        )?;
160    }
161
162    // Resolve the scan root within the cloned repo
163    let scan_root = if git_config.root == "." {
164        cache_dir.clone()
165    } else {
166        cache_dir.join(&git_config.root)
167    };
168
169    if !scan_root.exists() {
170        bail!(
171            "Git connector root '{}' does not exist in repo {}",
172            git_config.root,
173            git_config.url
174        );
175    }
176
177    // Get the HEAD commit SHA for metadata
178    let head_sha = git_head_sha(&cache_dir).unwrap_or_else(|_| "unknown".to_string());
179
180    // Build glob sets
181    let include_set = build_globset(&git_config.include_globs)?;
182
183    let mut default_excludes = vec![
184        "**/.git/**".to_string(),
185        "**/target/**".to_string(),
186        "**/node_modules/**".to_string(),
187    ];
188    default_excludes.extend(git_config.exclude_globs.clone());
189    let exclude_set = build_globset(&default_excludes)?;
190
191    let mut items = Vec::new();
192
193    for entry in WalkDir::new(&scan_root) {
194        let entry = entry?;
195        if !entry.file_type().is_file() {
196            continue;
197        }
198
199        let path = entry.path();
200        let relative = path.strip_prefix(&scan_root).unwrap_or(path);
201        let rel_str = relative.to_string_lossy().to_string();
202
203        if exclude_set.is_match(&rel_str) {
204            continue;
205        }
206        if !include_set.is_match(&rel_str) {
207            continue;
208        }
209
210        let source_label = format!("git:{}", name);
211        let item = file_to_source_item(
212            path,
213            &rel_str,
214            &cache_dir,
215            &git_config.url,
216            &head_sha,
217            &source_label,
218        )?;
219        items.push(item);
220    }
221
222    items.sort_by(|a, b| a.source_id.cmp(&b.source_id));
223    Ok(items)
224}
225
226/// Clone a Git repository into the cache directory.
227///
228/// Supports shallow clones (`--depth 1`) and single-branch checkout.
229fn git_clone(url: &str, branch: &str, shallow: bool, dest: &Path) -> Result<()> {
230    std::fs::create_dir_all(dest)
231        .with_context(|| format!("Failed to create cache directory: {}", dest.display()))?;
232
233    let mut cmd = Command::new("git");
234    cmd.args(["clone", "--branch", branch, "--single-branch"]);
235    if shallow {
236        cmd.args(["--depth", "1"]);
237    }
238    cmd.arg(url);
239    cmd.arg(dest);
240
241    let output = cmd
242        .output()
243        .with_context(|| "Failed to execute 'git clone'. Is git installed?")?;
244
245    if !output.status.success() {
246        let stderr = String::from_utf8_lossy(&output.stderr);
247        bail!("git clone failed: {}", stderr.trim());
248    }
249
250    Ok(())
251}
252
253/// Update an existing cached repository via fetch + hard reset.
254fn git_pull(repo_dir: &Path, branch: &str) -> Result<()> {
255    // Fetch the latest changes
256    let output = Command::new("git")
257        .args(["fetch", "origin", branch])
258        .current_dir(repo_dir)
259        .output()
260        .with_context(|| "Failed to execute 'git fetch'")?;
261
262    if !output.status.success() {
263        let stderr = String::from_utf8_lossy(&output.stderr);
264        bail!("git fetch failed: {}", stderr.trim());
265    }
266
267    // Reset to the fetched branch
268    let remote_ref = format!("origin/{}", branch);
269    let output = Command::new("git")
270        .args(["reset", "--hard", &remote_ref])
271        .current_dir(repo_dir)
272        .output()
273        .with_context(|| "Failed to execute 'git reset'")?;
274
275    if !output.status.success() {
276        let stderr = String::from_utf8_lossy(&output.stderr);
277        bail!("git reset failed: {}", stderr.trim());
278    }
279
280    Ok(())
281}
282
283/// Get the HEAD commit SHA of a repository.
284fn git_head_sha(repo_dir: &Path) -> Result<String> {
285    let output = Command::new("git")
286        .args(["rev-parse", "HEAD"])
287        .current_dir(repo_dir)
288        .output()
289        .with_context(|| "Failed to get HEAD SHA")?;
290
291    if !output.status.success() {
292        bail!("git rev-parse HEAD failed");
293    }
294
295    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
296}
297
/// Look up when `file_path` was last committed, as seconds since the Unix
/// epoch.
///
/// Runs `git log -1 --format=%ct -- <file_path>` with `repo_dir` as the
/// working directory. Yields `None` when `git` cannot be started, exits
/// unsuccessfully, or prints something that is not an integer (for example
/// nothing at all, when the file has no history).
fn git_file_last_commit_time(repo_dir: &Path, file_path: &Path) -> Option<i64> {
    let mut git = Command::new("git");
    git.current_dir(repo_dir)
        .arg("log")
        .arg("-1")
        .arg("--format=%ct")
        .arg("--")
        .arg(file_path);

    match git.output() {
        Ok(result) if result.status.success() => {
            let stdout = String::from_utf8_lossy(&result.stdout);
            stdout.trim().parse().ok()
        }
        _ => None,
    }
}
316
/// Look up the author name of the most recent commit touching `file_path`.
///
/// Runs `git log -1 --format=%an -- <file_path>` with `repo_dir` as the
/// working directory. Yields `None` when `git` cannot be started, exits
/// unsuccessfully, or prints only whitespace (for example when the file has
/// no history).
fn git_file_last_author(repo_dir: &Path, file_path: &Path) -> Option<String> {
    let result = Command::new("git")
        .current_dir(repo_dir)
        .arg("log")
        .arg("-1")
        .arg("--format=%an")
        .arg("--")
        .arg(file_path)
        .output()
        .ok()
        .filter(|out| out.status.success())?;

    let stdout = String::from_utf8_lossy(&result.stdout);
    let name = stdout.trim();
    (!name.is_empty()).then(|| name.to_string())
}
339
340/// Convert a file in the cloned repo to a [`SourceItem`].
341///
342/// Extracts Git metadata (commit timestamp, author) and generates
343/// a web-browsable URL for GitHub/GitLab repositories.
344fn file_to_source_item(
345    path: &Path,
346    relative_path: &str,
347    repo_dir: &Path,
348    repo_url: &str,
349    head_sha: &str,
350    source: &str,
351) -> Result<SourceItem> {
352    let body = std::fs::read_to_string(path).unwrap_or_default();
353
354    let title = path
355        .file_name()
356        .map(|n| n.to_string_lossy().to_string())
357        .unwrap_or_default();
358
359    // Try to get the git commit timestamp for this file; fall back to filesystem mtime
360    let commit_ts = git_file_last_commit_time(repo_dir, path);
361    let updated_secs = commit_ts.unwrap_or_else(|| {
362        let metadata = std::fs::metadata(path).ok();
363        metadata
364            .and_then(|m| m.modified().ok())
365            .and_then(|t| t.duration_since(std::time::SystemTime::UNIX_EPOCH).ok())
366            .map(|d| d.as_secs() as i64)
367            .unwrap_or(0)
368    });
369
370    let author = git_file_last_author(repo_dir, path);
371
372    // Build a web URL if this looks like a GitHub/GitLab repo
373    let source_url = build_web_url(repo_url, head_sha, relative_path);
374
375    let metadata = serde_json::json!({
376        "git_sha": head_sha,
377        "repo_url": repo_url,
378    });
379
380    Ok(SourceItem {
381        source: source.to_string(),
382        source_id: relative_path.to_string(),
383        source_url: Some(source_url),
384        title: Some(title),
385        author,
386        created_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
387        updated_at: Utc.timestamp_opt(updated_secs, 0).unwrap(),
388        content_type: "text/plain".to_string(),
389        body,
390        metadata_json: metadata.to_string(),
391        raw_json: None,
392        raw_bytes: None,
393    })
394}
395
/// Attempt to build a web-browsable URL from the git remote URL.
///
/// The remote is recognized by its **host**, so every common spelling of a
/// GitHub or GitLab remote yields the same link:
///
/// | Remote | Result |
/// |--------|--------|
/// | `git@github.com:org/repo.git` | `https://github.com/org/repo/blob/<sha>/<path>` |
/// | `https://github.com/org/repo.git` | `https://github.com/org/repo/blob/<sha>/<path>` |
/// | `ssh://git@github.com/org/repo.git` | `https://github.com/org/repo/blob/<sha>/<path>` |
/// | `git@gitlab.com:org/repo.git` | `https://gitlab.com/org/repo/-/blob/<sha>/<path>` |
/// | `https://gitlab.com/org/repo.git` | `https://gitlab.com/org/repo/-/blob/<sha>/<path>` |
/// | anything else | `git://<repo_url>/<path>` |
///
/// Matching on the host (instead of a substring search for `github.com`)
/// avoids non-browsable `ssh://…/blob/…` links and false matches on URLs
/// that merely mention `github.com` in their path. Rebuilding the link from
/// host + repository path drops any `user:password@` part of the remote, as
/// well as a trailing slash.
///
/// The fallback still embeds `repo_url` verbatim.
fn build_web_url(repo_url: &str, sha: &str, relative_path: &str) -> String {
    if let Some((host, repo)) = split_remote_host_and_repo(repo_url) {
        if host.eq_ignore_ascii_case("github.com") {
            return format!("https://github.com/{}/blob/{}/{}", repo, sha, relative_path);
        }
        if host.eq_ignore_ascii_case("gitlab.com") {
            return format!(
                "https://gitlab.com/{}/-/blob/{}/{}",
                repo, sha, relative_path
            );
        }
    }

    // Fallback: just reference the relative path
    format!("git://{}/{}", repo_url, relative_path)
}

/// Split a git remote into `(host, repository path)`.
///
/// Understands URL form (`scheme://[user[:pw]@]host[:port]/path`) and the
/// scp-like form (`[user@]host:path`). The repository path is returned
/// without surrounding slashes and without one trailing `.git`. Returns
/// `None` when no host or no path can be identified (e.g. local paths).
fn split_remote_host_and_repo(repo_url: &str) -> Option<(&str, &str)> {
    let remote = repo_url.trim();

    let (authority, path) = match remote.split_once("://") {
        Some((_scheme, rest)) => rest.split_once('/')?,
        // scp-like syntax: everything before the first ':' is `[user@]host`.
        None => remote.split_once(':')?,
    };

    // Drop optional user info, then an optional port.
    let host_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);
    let host = host_port.split_once(':').map_or(host_port, |(h, _)| h);

    let repo = path.trim_matches('/');
    let repo = repo.strip_suffix(".git").unwrap_or(repo);

    if host.is_empty() || repo.is_empty() {
        None
    } else {
        Some((host, repo))
    }
}
425
426/// Generate a short (12-char) SHA-256 hash of input, used for cache directory naming.
427fn short_hash(input: &str) -> String {
428    let mut hasher = Sha256::new();
429    hasher.update(input.as_bytes());
430    format!("{:x}", hasher.finalize())[..12].to_string()
431}
432
433/// Build a [`GlobSet`] from a list of glob pattern strings.
434fn build_globset(patterns: &[String]) -> Result<GlobSet> {
435    let mut builder = GlobSetBuilder::new();
436    for pattern in patterns {
437        builder.add(Glob::new(pattern)?);
438    }
439    Ok(builder.build()?)
440}