context_harness/
config.rs

1//! Configuration parsing and validation.
2//!
3//! Context Harness is configured via a TOML file (default: `config/ctx.toml`).
4//! The config defines database paths, chunking parameters, embedding provider
5//! settings, retrieval tuning, server bind address, and connector configurations.
6//!
7//! # Example Configuration
8//!
9//! ```toml
10//! [db]
11//! path = ".ctx/data/ctx.sqlite"
12//!
13//! [chunking]
14//! max_tokens = 700
15//! overlap_tokens = 80
16//!
17//! [embedding]
//! provider = "openai"           # "disabled" | "openai" | "ollama" | "local"
19//! model = "text-embedding-3-small"
20//! dims = 1536
21//!
22//! [vector_index]
23//! backend = "auto"
24//! path = "auto"
25//! metric = "cosine"
26//! index = "hnsw"
27//! fallback = "sqlite"
28//!
29//! [retrieval]
30//! final_limit = 12
31//! hybrid_alpha = 0.6            # 0.0 = keyword only, 1.0 = semantic only
32//!
33//! [server]
34//! bind = "127.0.0.1:7331"
35//!
36//! [connectors.filesystem.docs]
37//! root = "./docs"
38//! include_globs = ["**/*.md", "**/*.txt"]
39//!
40//! [connectors.git.platform]
41//! url = "https://github.com/acme/platform.git"
42//! branch = "main"
43//! ```
44//!
45//! # Connectors
46//!
47//! All connector types are **named** — you can configure multiple instances of each:
48//! - **Filesystem** (`[connectors.filesystem.<name>]`) — scan a local directory
49//! - **Git** (`[connectors.git.<name>]`) — clone/pull a Git repository
50//! - **S3** (`[connectors.s3.<name>]`) — list and download from an S3 bucket
51//! - **Script** (`[connectors.script.<name>]`) — custom Lua-scripted data sources
52//!
53//! # Validation
54//!
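//! [`load_config`] performs the following validations:
//! - `chunking.max_tokens > 0`
//! - `retrieval.final_limit >= 1`
//! - `retrieval.hybrid_alpha ∈ [0.0, 1.0]`
//! - Embedding provider must be one of: `"disabled"`, `"openai"`, `"ollama"`, `"local"`
//! - When embedding provider is `openai` or `ollama`: `model` must be set and `dims` must be `> 0`
//! - `vector_index.backend` must be one of: `"auto"`, `"zvec"`, `"sqlite"`, `"disabled"`
//! - `vector_index.metric` must be `"cosine"`
//! - `vector_index.index` must be one of: `"hnsw"`, `"flat"`
//! - `vector_index.fallback` must be one of: `"sqlite"`, `"disabled"`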
61
62use anyhow::{Context, Result};
63use serde::Deserialize;
64use std::collections::HashMap;
65use std::path::{Path, PathBuf};
66
67use crate::ctx_dirs::{self, ConfigSourceKind};
68
69/// Top-level configuration structure.
70///
71/// Deserialized from the TOML config file. All sections are required
72/// except `connectors`, which defaults to an empty set.
73#[derive(Debug, Deserialize, Clone)]
74pub struct Config {
75    /// Database connection settings.
76    pub db: DbConfig,
77    /// Text chunking parameters.
78    pub chunking: ChunkingConfig,
79    /// Search and retrieval tuning.
80    pub retrieval: RetrievalConfig,
81    /// Embedding provider settings (defaults to disabled).
82    #[serde(default)]
83    pub embedding: EmbeddingConfig,
84    /// Optional vector-index acceleration settings (defaults to auto with SQLite fallback).
85    #[serde(default)]
86    #[allow(dead_code)]
87    pub vector_index: VectorIndexConfig,
88    /// HTTP server bind address.
89    #[allow(dead_code)]
90    pub server: ServerConfig,
91    /// Connector configurations (all optional).
92    #[serde(default)]
93    pub connectors: ConnectorsConfig,
94    /// Tool script configurations (all optional).
95    #[serde(default)]
96    pub tools: ToolsConfig,
97    /// Agent configurations (all optional).
98    #[serde(default)]
99    pub agents: AgentsConfig,
100    /// Extension registry configurations (all optional).
101    #[serde(default)]
102    pub registries: HashMap<String, RegistryConfig>,
103}
104
105impl Config {
106    /// Create a minimal config suitable for commands that don't need
107    /// database or connector settings (e.g., `ctx connector test`).
108    pub fn minimal() -> Self {
109        Self {
110            db: DbConfig {
111                path: ctx_dirs::workspace_db_path(),
112            },
113            chunking: ChunkingConfig {
114                max_tokens: 700,
115                overlap_tokens: 0,
116            },
117            retrieval: RetrievalConfig {
118                hybrid_alpha: default_hybrid_alpha(),
119                candidate_k_keyword: default_candidate_k(),
120                candidate_k_vector: default_candidate_k(),
121                final_limit: default_final_limit(),
122                group_by: default_group_by(),
123                doc_agg: default_doc_agg(),
124                max_chunks_per_doc: default_max_chunks_per_doc(),
125            },
126            embedding: EmbeddingConfig::default(),
127            vector_index: VectorIndexConfig::default(),
128            server: ServerConfig {
129                bind: "127.0.0.1:7331".to_string(),
130            },
131            connectors: ConnectorsConfig::default(),
132            tools: ToolsConfig::default(),
133            agents: AgentsConfig::default(),
134            registries: HashMap::new(),
135        }
136    }
137}
138
139#[derive(Debug, Clone)]
140pub struct ResolvedConfig {
141    pub config: Config,
142    pub path: Option<PathBuf>,
143    #[allow(dead_code)]
144    pub source: ConfigSourceKind,
145}
146
147/// Database configuration.
148///
149/// Specifies the path to the SQLite database file. The file and its
150/// parent directories are created automatically on first use.
151#[derive(Debug, Deserialize, Clone)]
152pub struct DbConfig {
153    /// Path to the SQLite database file (e.g. `".ctx/data/ctx.sqlite"`).
154    pub path: PathBuf,
155}
156
157/// Text chunking parameters.
158///
159/// Controls how document bodies are split into chunks for indexing
160/// and embedding. See [`crate::chunk`] for the chunking algorithm.
161#[derive(Debug, Deserialize, Clone)]
162pub struct ChunkingConfig {
163    /// Maximum tokens per chunk. Chunks are split on paragraph boundaries
164    /// to stay within this limit. Converted to characters via `max_tokens × 4`.
165    pub max_tokens: usize,
166    /// Number of overlapping tokens between adjacent chunks (reserved for future use).
167    #[serde(default = "default_overlap")]
168    #[allow(dead_code)]
169    pub overlap_tokens: usize,
170}
171
/// Serde default for `chunking.overlap_tokens`: no overlap.
fn default_overlap() -> usize {
    0_usize
}
175
176/// Search and retrieval tuning parameters.
177///
178/// These settings control how keyword and semantic search results are
179/// merged in hybrid mode, and the overall result limits.
180///
181/// # Hybrid Scoring
182///
183/// The `hybrid_alpha` weight determines the blend between keyword (BM25)
184/// and semantic (cosine similarity) scores:
185///
186/// ```text
187/// hybrid_score = (1 - α) × keyword_score + α × semantic_score
188/// ```
189///
190/// - `α = 0.0` → pure keyword search
191/// - `α = 1.0` → pure semantic search
192/// - `α = 0.6` (default) → 60% semantic, 40% keyword
193///
194/// See `docs/HYBRID_SCORING.md` for the full specification.
195#[derive(Debug, Deserialize, Clone)]
196pub struct RetrievalConfig {
197    /// Weight for semantic vs. keyword scores in hybrid mode.
198    /// Range: `[0.0, 1.0]`. Default: `0.6`.
199    #[serde(default = "default_hybrid_alpha")]
200    pub hybrid_alpha: f64,
201    /// Number of keyword candidates to fetch before merging. Default: `80`.
202    #[serde(default = "default_candidate_k")]
203    pub candidate_k_keyword: i64,
204    /// Number of vector candidates to fetch before merging. Default: `80`.
205    #[serde(default = "default_candidate_k")]
206    pub candidate_k_vector: i64,
207    /// Maximum number of results to return after merging and ranking. Default: `12`.
208    #[serde(default = "default_final_limit")]
209    pub final_limit: i64,
210    /// Grouping strategy for results. Default: `"document"`.
211    #[serde(default = "default_group_by")]
212    #[allow(dead_code)]
213    pub group_by: String,
214    /// Aggregation method for document-level scores. Default: `"max"`.
215    #[serde(default = "default_doc_agg")]
216    #[allow(dead_code)]
217    pub doc_agg: String,
218    /// Maximum chunks per document in results. Default: `3`.
219    #[serde(default = "default_max_chunks_per_doc")]
220    #[allow(dead_code)]
221    pub max_chunks_per_doc: usize,
222}
223
/// Serde default for `retrieval.hybrid_alpha`: 60% semantic, 40% keyword.
fn default_hybrid_alpha() -> f64 {
    0.6_f64
}

/// Serde default shared by `retrieval.candidate_k_keyword` and
/// `retrieval.candidate_k_vector`.
fn default_candidate_k() -> i64 {
    80_i64
}

/// Serde default for `retrieval.final_limit`.
fn default_final_limit() -> i64 {
    12_i64
}

/// Serde default for `retrieval.group_by`.
fn default_group_by() -> String {
    String::from("document")
}

/// Serde default for `retrieval.doc_agg`.
fn default_doc_agg() -> String {
    String::from("max")
}

/// Serde default for `retrieval.max_chunks_per_doc`.
fn default_max_chunks_per_doc() -> usize {
    3_usize
}
242
243/// Embedding provider configuration.
244///
245/// Controls which embedding provider is used and its parameters.
246/// When `provider = "disabled"`, no embeddings are generated and
247/// semantic/hybrid search modes will return errors.
248///
249/// # Providers
250///
251/// | Provider | Description |
252/// |----------|-------------|
253/// | `"disabled"` | No embeddings (default) |
254/// | `"openai"` | OpenAI API (`text-embedding-3-small`, etc.) |
255/// | `"ollama"` | Local Ollama instance (`nomic-embed-text`, etc.) |
256/// | `"local"` | Built-in models via fastembed (primary) or tract (musl/Intel Mac) (`all-minilm-l6-v2`, etc.) |
257///
258/// When using `"openai"`, the `OPENAI_API_KEY` environment variable must be set.
259/// When using `"ollama"`, an Ollama instance must be running (default: `http://localhost:11434`).
260/// When using `"local"`, the model is downloaded on first use and cached in `~/.cache/huggingface/`.
261#[derive(Debug, Deserialize, Clone)]
262pub struct EmbeddingConfig {
263    /// Provider name: `"disabled"`, `"openai"`, `"ollama"`, or `"local"`. Default: `"disabled"`.
264    #[serde(default = "default_provider")]
265    pub provider: String,
266    /// Embedding model name (e.g. `"text-embedding-3-small"`, `"nomic-embed-text"`,
267    /// `"all-minilm-l6-v2"`). Required for `openai` and `ollama`; optional for `local`
268    /// (defaults to `"all-minilm-l6-v2"`).
269    #[serde(default)]
270    pub model: Option<String>,
271    /// Embedding vector dimensionality (e.g. `1536` for `text-embedding-3-small`).
272    /// Required for `openai` and `ollama`; auto-detected for `local`.
273    #[serde(default)]
274    pub dims: Option<usize>,
275    /// Number of texts to embed per batch. Default: `64`.
276    #[serde(default = "default_batch_size")]
277    pub batch_size: usize,
278    /// Maximum retry attempts for transient API errors. Default: `5`.
279    #[serde(default = "default_max_retries")]
280    pub max_retries: u32,
281    /// HTTP timeout per request in seconds. Default: `30`.
282    #[serde(default = "default_timeout_secs")]
283    pub timeout_secs: u64,
284    /// Base URL for Ollama API. Default: `"http://localhost:11434"`.
285    #[serde(default)]
286    pub url: Option<String>,
287}
288
289impl Default for EmbeddingConfig {
290    fn default() -> Self {
291        Self {
292            provider: "disabled".to_string(),
293            model: None,
294            dims: None,
295            batch_size: 64,
296            max_retries: 5,
297            timeout_secs: 30,
298            url: None,
299        }
300    }
301}
302
/// Serde default for `embedding.provider`: embeddings are off.
fn default_provider() -> String {
    String::from("disabled")
}

/// Serde default for `embedding.batch_size`.
fn default_batch_size() -> usize {
    64_usize
}

/// Serde default for `embedding.max_retries`.
fn default_max_retries() -> u32 {
    5_u32
}

/// Serde default for `embedding.timeout_secs`.
fn default_timeout_secs() -> u64 {
    30_u64
}
315
316/// Optional vector-index acceleration configuration.
317///
318/// The default is automatic: use the built-in vector accelerator when the
319/// binary supports one and fall back to the exact SQLite vector scan otherwise.
320#[derive(Debug, Deserialize, Clone)]
321pub struct VectorIndexConfig {
322    /// Backend name. Default: `"auto"`.
323    #[serde(default = "default_vector_backend")]
324    #[allow(dead_code)]
325    pub backend: String,
326    /// Filesystem path for external vector-index state. Default: `"auto"`.
327    #[serde(default = "default_vector_path")]
328    #[allow(dead_code)]
329    pub path: PathBuf,
330    /// Distance metric. Default: `"cosine"`.
331    #[serde(default = "default_vector_metric")]
332    #[allow(dead_code)]
333    pub metric: String,
334    /// Index kind. Default: `"hnsw"`.
335    #[serde(default = "default_vector_index")]
336    #[allow(dead_code)]
337    pub index: String,
338    /// Fallback strategy when acceleration is unavailable. Default: `"sqlite"`.
339    #[serde(default = "default_vector_fallback")]
340    #[allow(dead_code)]
341    pub fallback: String,
342}
343
344impl Default for VectorIndexConfig {
345    fn default() -> Self {
346        Self {
347            backend: default_vector_backend(),
348            path: default_vector_path(),
349            metric: default_vector_metric(),
350            index: default_vector_index(),
351            fallback: default_vector_fallback(),
352        }
353    }
354}
355
/// Serde default for `vector_index.backend`.
fn default_vector_backend() -> String {
    String::from("auto")
}

/// Serde default for `vector_index.path`.
fn default_vector_path() -> PathBuf {
    let mut location = PathBuf::new();
    location.push("auto");
    location
}

/// Serde default for `vector_index.metric`.
fn default_vector_metric() -> String {
    String::from("cosine")
}

/// Serde default for `vector_index.index`.
fn default_vector_index() -> String {
    String::from("hnsw")
}

/// Serde default for `vector_index.fallback`.
fn default_vector_fallback() -> String {
    String::from("sqlite")
}
375
376/// HTTP server configuration.
377#[derive(Debug, Deserialize, Clone)]
378pub struct ServerConfig {
379    /// Socket address to bind to (e.g. `"127.0.0.1:7331"`).
380    pub bind: String,
381}
382
383/// Container for all connector configurations.
384///
385/// All connector types use named instances — you can configure multiple
386/// of each type. For example:
387///
388/// ```toml
389/// [connectors.git.platform]
390/// url = "https://github.com/acme/platform.git"
391///
392/// [connectors.git.auth-service]
393/// url = "https://github.com/acme/auth-service.git"
394/// ```
395///
396/// Use `ctx sync git` to sync all git connectors, or `ctx sync git:platform`
397/// for a specific one. `ctx sync all` syncs everything.
398#[derive(Debug, Deserialize, Clone, Default)]
399pub struct ConnectorsConfig {
400    /// Named filesystem connectors: walk local directories.
401    #[serde(default)]
402    pub filesystem: HashMap<String, FilesystemConnectorConfig>,
403    /// Named Git connectors: clone and scan Git repositories.
404    #[serde(default)]
405    pub git: HashMap<String, GitConnectorConfig>,
406    /// Named S3 connectors: list and download from S3 buckets.
407    #[serde(default)]
408    pub s3: HashMap<String, S3ConnectorConfig>,
409    /// Named Lua script connectors.
410    /// Each key is a connector name, each value contains the script path
411    /// and arbitrary config keys passed to the Lua `connector.scan()` function.
412    /// See `docs/LUA_CONNECTORS.md` for the full specification.
413    #[serde(default)]
414    pub script: HashMap<String, ScriptConnectorConfig>,
415}
416
417/// Lua script connector configuration.
418///
419/// Points to a `.lua` file implementing the connector interface. All fields
420/// except `path` and `timeout` are passed as a config table to the script's
421/// `connector.scan(config)` function.
422///
423/// Values containing `${VAR_NAME}` are expanded from the process environment.
424///
425/// # Example
426///
427/// ```toml
428/// [connectors.script.jira]
429/// path = "connectors/jira.lua"
430/// timeout = 600
431/// url = "https://mycompany.atlassian.net"
432/// api_token = "${JIRA_API_TOKEN}"
433/// project_key = "ENG"
434/// ```
435#[derive(Debug, Deserialize, Clone)]
436pub struct ScriptConnectorConfig {
437    /// Path to the `.lua` connector script.
438    pub path: PathBuf,
439    /// Maximum execution time in seconds. Default: `300`.
440    #[serde(default = "default_script_timeout")]
441    pub timeout: u64,
442    /// All other config keys — passed to the Lua `connector.scan()` function.
443    #[serde(flatten)]
444    pub extra: toml::Table,
445}
446
/// Serde default for a script connector's `timeout`, in seconds.
fn default_script_timeout() -> u64 {
    300_u64
}
450
451/// Container for all tool script configurations.
452///
453/// Tool scripts are Lua files that define MCP tools agents can discover
454/// and call via the HTTP server. See `docs/LUA_TOOLS.md` for the full
455/// specification.
456#[derive(Debug, Deserialize, Clone, Default)]
457pub struct ToolsConfig {
458    /// Named Lua tool scripts.
459    /// Each key is the tool name, each value contains the script path
460    /// and arbitrary config keys accessible via `context.config` in the script.
461    #[serde(default)]
462    pub script: HashMap<String, ScriptToolConfig>,
463}
464
465/// Lua tool script configuration.
466///
467/// Points to a `.lua` file implementing the tool interface. All fields
468/// except `path` and `timeout` are passed as `context.config` to the
469/// script's `tool.execute(params, context)` function.
470///
471/// Values containing `${VAR_NAME}` are expanded from the process environment.
472///
473/// # Example
474///
475/// ```toml
476/// [tools.script.create_jira_ticket]
477/// path = "tools/create-jira-ticket.lua"
478/// timeout = 30
479/// url = "https://mycompany.atlassian.net"
480/// api_token = "${JIRA_API_TOKEN}"
481/// ```
482#[derive(Debug, Deserialize, Clone)]
483pub struct ScriptToolConfig {
484    /// Path to the `.lua` tool script.
485    pub path: PathBuf,
486    /// Maximum execution time in seconds. Default: `30`.
487    #[serde(default = "default_tool_timeout")]
488    pub timeout: u64,
489    /// All other config keys — accessible via `context.config` in the script.
490    #[serde(flatten)]
491    pub extra: toml::Table,
492}
493
/// Serde default for a tool script's `timeout`, in seconds.
fn default_tool_timeout() -> u64 {
    30_u64
}
497
498/// Container for all agent configurations.
499///
500/// Agents are named personas that combine a system prompt, scoped tools,
501/// and optional dynamic context injection. They can be defined inline
502/// in TOML or via Lua scripts.
503///
504/// # Example
505///
506/// ```toml
507/// [agents.inline.code-reviewer]
508/// description = "Reviews code against project conventions"
509/// tools = ["search", "get"]
510/// system_prompt = "You are a senior code reviewer..."
511///
512/// [agents.script.incident-responder]
513/// path = "agents/incident-responder.lua"
514/// timeout = 30
515/// ```
516#[derive(Debug, Deserialize, Clone, Default)]
517pub struct AgentsConfig {
518    /// Inline TOML agents with static system prompts.
519    /// Each key is the agent name, each value contains the prompt and tool list.
520    #[serde(default)]
521    pub inline: HashMap<String, InlineAgentConfig>,
522    /// Lua script agents with dynamic prompt resolution.
523    /// Each key is the agent name, each value contains the script path
524    /// and arbitrary config keys passed to `agent.resolve()`.
525    #[serde(default)]
526    pub script: HashMap<String, ScriptAgentConfig>,
527}
528
529/// Inline (TOML) agent configuration.
530///
531/// Defines an agent with a static system prompt and fixed tool list.
532/// The simplest way to create an agent — no Lua or Rust code needed.
533///
534/// # Example
535///
536/// ```toml
537/// [agents.inline.architect]
538/// description = "Answers architecture questions"
539/// tools = ["search", "get", "sources"]
540/// system_prompt = """
541/// You are a software architect. Search for ADRs and design
542/// docs to ground your recommendations.
543/// """
544/// ```
545#[derive(Debug, Deserialize, Clone)]
546pub struct InlineAgentConfig {
547    /// One-line description for agent discovery.
548    pub description: String,
549    /// List of tool names this agent should expose.
550    pub tools: Vec<String>,
551    /// The system prompt text.
552    pub system_prompt: String,
553}
554
555/// Lua script agent configuration.
556///
557/// Points to a `.lua` file implementing the agent interface. All fields
558/// except `path` and `timeout` are passed as config to the script's
559/// `agent.resolve(args, config, context)` function.
560///
561/// Values containing `${VAR_NAME}` are expanded from the process environment.
562///
563/// # Example
564///
565/// ```toml
566/// [agents.script.incident-responder]
567/// path = "agents/incident-responder.lua"
568/// timeout = 30
569/// search_limit = 5
570/// priority_sources = ["runbooks"]
571/// ```
572#[derive(Debug, Deserialize, Clone)]
573pub struct ScriptAgentConfig {
574    /// Path to the `.lua` agent script.
575    pub path: PathBuf,
576    /// Maximum execution time in seconds. Default: `30`.
577    #[serde(default = "default_agent_timeout")]
578    pub timeout: u64,
579    /// All other config keys — passed to the Lua `agent.resolve()` function.
580    #[serde(flatten)]
581    pub extra: toml::Table,
582}
583
/// Serde default for an agent script's `timeout`, in seconds.
fn default_agent_timeout() -> u64 {
    30_u64
}
587
588/// Extension registry configuration.
589///
590/// Points to a local directory (optionally backed by a Git repository)
591/// containing Lua connector, tool, and agent scripts described by a
592/// `registry.toml` manifest.
593///
594/// # Example
595///
596/// ```toml
597/// [registries.community]
598/// url = "https://github.com/parallax-labs/ctx-registry.git"
599/// branch = "main"
600/// path = "~/.local/share/ctx/registries/community"
601/// readonly = true
602/// auto_update = true
603/// ```
604#[derive(Debug, Deserialize, Clone)]
605pub struct RegistryConfig {
606    /// Git repository URL to clone from. `None` means local-only (no git).
607    pub url: Option<String>,
608    /// Git branch or tag to track. Default: `"main"`.
609    pub branch: Option<String>,
610    /// Local filesystem path where the registry is (or will be) stored.
611    pub path: PathBuf,
612    /// If `true`, extensions on this path cannot be edited in place;
613    /// overrides are copied to a writable registry.
614    #[serde(default)]
615    pub readonly: bool,
616    /// If `true`, `ctx registry update` (and optionally `ctx sync`) will
617    /// `git pull` this registry automatically.
618    #[serde(default)]
619    #[allow(dead_code)]
620    pub auto_update: bool,
621}
622
623/// Filesystem connector configuration.
624///
625/// Scans a local directory tree, applying glob include/exclude filters.
626/// See [`crate::connector_fs`] for the scanning implementation.
627///
628/// # Example
629///
630/// ```toml
631/// [connectors.filesystem.docs]
632/// root = "./docs"
633/// include_globs = ["**/*.md", "**/*.txt"]
634/// exclude_globs = ["**/drafts/**"]
635/// follow_symlinks = false
636/// max_extract_bytes = 50_000_000
637/// ```
638#[derive(Debug, Deserialize, Clone)]
639pub struct FilesystemConnectorConfig {
640    /// Root directory to scan.
641    pub root: PathBuf,
642    /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
643    #[serde(default = "default_include_globs")]
644    pub include_globs: Vec<String>,
645    /// Glob patterns for files to exclude. Default: `[]`.
646    #[serde(default)]
647    pub exclude_globs: Vec<String>,
648    /// Whether to follow symbolic links. Default: `false`.
649    #[serde(default)]
650    pub follow_symlinks: bool,
651    /// Files larger than this (bytes) are not extracted; they are skipped and counted in extraction skipped. Default: 50_000_000.
652    #[serde(default = "default_max_extract_bytes")]
653    pub max_extract_bytes: u64,
654}
655
656/// Git connector configuration.
657///
658/// Clones (or pulls) a Git repository and scans files within a configurable
659/// subdirectory. Extracts per-file metadata from `git log`.
660/// See [`crate::connector_git`] for the full implementation.
661///
662/// # Example
663///
664/// ```toml
665/// [connectors.git.platform]
666/// url = "https://github.com/acme/platform.git"
667/// branch = "main"
668/// root = "docs/"
669/// include_globs = ["**/*.md"]
670/// shallow = true
671/// ```
672#[derive(Debug, Deserialize, Clone)]
673pub struct GitConnectorConfig {
674    /// Git repository URL (`https://`, `git@`, or local path).
675    pub url: String,
676    /// Branch to clone/pull. Default: `"main"`.
677    #[serde(default = "default_git_branch")]
678    pub branch: String,
679    /// Subdirectory within the repo to scan. Default: `"."` (entire repo).
680    #[serde(default = "default_git_root")]
681    pub root: String,
682    /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
683    #[serde(default = "default_include_globs")]
684    pub include_globs: Vec<String>,
685    /// Glob patterns for files to exclude. Default: `[]`.
686    #[serde(default)]
687    pub exclude_globs: Vec<String>,
688    /// Use shallow clone (`--depth 1`) to save disk space. Default: `true`.
689    #[serde(default = "default_true")]
690    pub shallow: bool,
691    /// Directory to cache cloned repos. Default: `.ctx/cache/git/<url-hash>/`
692    /// when using the workspace DB, otherwise `<db-dir>/.git-cache/<url-hash>/`.
693    #[serde(default)]
694    pub cache_dir: Option<PathBuf>,
695}
696
697/// Amazon S3 connector configuration.
698///
699/// Lists and downloads objects from an S3 bucket using the REST API with
700/// AWS Signature V4. Supports custom endpoints for S3-compatible services.
701/// See [`crate::connector_s3`] for the full implementation.
702///
703/// # Environment Variables
704///
705/// - `AWS_ACCESS_KEY_ID` — required
706/// - `AWS_SECRET_ACCESS_KEY` — required
707/// - `AWS_SESSION_TOKEN` — optional (for temporary credentials)
708///
709/// # Example
710///
711/// ```toml
712/// [connectors.s3.runbooks]
713/// bucket = "acme-docs"
714/// prefix = "engineering/runbooks/"
715/// region = "us-east-1"
716/// include_globs = ["**/*.md"]
717/// # endpoint_url = "http://localhost:9000"   # for MinIO
718/// ```
719#[derive(Debug, Deserialize, Clone)]
720pub struct S3ConnectorConfig {
721    /// S3 bucket name.
722    pub bucket: String,
723    /// Key prefix to filter objects. Default: `""` (entire bucket).
724    #[serde(default)]
725    pub prefix: String,
726    /// AWS region. Default: `"us-east-1"`.
727    #[serde(default = "default_s3_region")]
728    pub region: String,
729    /// Glob patterns for object keys to include. Default: `["**/*.md", "**/*.txt"]`.
730    #[serde(default = "default_include_globs")]
731    pub include_globs: Vec<String>,
732    /// Glob patterns for object keys to exclude. Default: `[]`.
733    #[serde(default)]
734    pub exclude_globs: Vec<String>,
735    /// Custom endpoint URL for S3-compatible services (MinIO, LocalStack).
736    #[serde(default)]
737    pub endpoint_url: Option<String>,
738}
739
/// Serde default for a git connector's `branch`.
fn default_git_branch() -> String {
    String::from("main")
}

/// Serde default for a git connector's `root`: the repository root.
fn default_git_root() -> String {
    String::from(".")
}

/// Serde default for boolean options that are on unless disabled.
fn default_true() -> bool {
    true
}

/// Serde default for an S3 connector's `region`.
fn default_s3_region() -> String {
    String::from("us-east-1")
}

/// Serde default for `include_globs`: Markdown and plain-text files at any
/// depth.
fn default_include_globs() -> Vec<String> {
    ["**/*.md", "**/*.txt"]
        .iter()
        .map(|pattern| String::from(*pattern))
        .collect()
}

/// Serde default for a filesystem connector's `max_extract_bytes`.
fn default_max_extract_bytes() -> u64 {
    50_000_000_u64
}
763
764impl EmbeddingConfig {
765    /// Returns `true` if an embedding provider is configured (not `"disabled"`).
766    pub fn is_enabled(&self) -> bool {
767        self.provider != "disabled"
768    }
769}
770
771/// Load and validate a configuration file from disk.
772///
773/// # Arguments
774///
775/// * `path` — Path to a TOML configuration file.
776///
777/// # Errors
778///
779/// Returns an error if:
780/// - The file cannot be read or parsed
781/// - `chunking.max_tokens` is zero
782/// - `retrieval.final_limit` is less than 1
783/// - `retrieval.hybrid_alpha` is outside `[0.0, 1.0]`
784/// - Embedding provider is enabled but `model` or `dims` is missing/zero
785/// - Unknown embedding provider name
786#[allow(dead_code)]
787pub fn load_config(path: &Path) -> Result<Config> {
788    load_config_file(path)
789}
790
791pub fn load_config_for_cli(explicit_path: Option<PathBuf>) -> Result<ResolvedConfig> {
792    let paths = ctx_dirs::config_paths(explicit_path);
793    let source = paths.resolve();
794
795    match source.kind {
796        ConfigSourceKind::Explicit | ConfigSourceKind::Env | ConfigSourceKind::Global => {
797            let path = source
798                .path
799                .clone()
800                .expect("path-backed config source must include path");
801            let config = load_config_file(&path)?;
802            Ok(ResolvedConfig {
803                config,
804                path: Some(path),
805                source: source.kind,
806            })
807        }
808        ConfigSourceKind::Workspace | ConfigSourceKind::LegacyWorkspace => {
809            let workspace_path = source
810                .path
811                .clone()
812                .expect("path-backed config source must include path");
813            let workspace_value = load_config_value(&workspace_path)?;
814            let merged_value = if paths.global.exists() {
815                let mut global_value = load_config_value(&paths.global)?;
816                merge_toml(&mut global_value, workspace_value);
817                global_value
818            } else {
819                workspace_value
820            };
821            let config = config_from_value(merged_value)?;
822            Ok(ResolvedConfig {
823                config,
824                path: Some(workspace_path),
825                source: source.kind,
826            })
827        }
828        ConfigSourceKind::BuiltIn => Ok(ResolvedConfig {
829            config: Config::minimal(),
830            path: None,
831            source: ConfigSourceKind::BuiltIn,
832        }),
833    }
834}
835
836pub fn ensure_workspace_config_for_init(explicit_path: Option<&Path>) -> Result<Option<PathBuf>> {
837    let paths = ctx_dirs::config_paths(explicit_path.map(Path::to_path_buf));
838    if paths.has_explicit_source() || paths.has_workspace_source() {
839        return Ok(paths.resolve().path);
840    }
841
842    let ctx_dir = ctx_dirs::workspace_dir();
843    std::fs::create_dir_all(ctx_dirs::workspace_data_dir())?;
844    std::fs::create_dir_all(ctx_dirs::workspace_cache_dir())?;
845    std::fs::write(ctx_dir.join(".gitignore"), "data/\ncache/\n")?;
846
847    let config_path = ctx_dirs::workspace_config_path();
848    if !config_path.exists() {
849        std::fs::write(&config_path, default_workspace_config_toml())?;
850    }
851    Ok(Some(config_path))
852}
853
854fn load_config_file(path: &Path) -> Result<Config> {
855    let content = std::fs::read_to_string(path)
856        .with_context(|| format!("Failed to read config file: {}", path.display()))?;
857
858    let config: Config = toml::from_str(&content).with_context(|| "Failed to parse config file")?;
859    validate_config(config)
860}
861
862fn load_config_value(path: &Path) -> Result<toml::Value> {
863    let content = std::fs::read_to_string(path)
864        .with_context(|| format!("Failed to read config file: {}", path.display()))?;
865    toml::from_str(&content).with_context(|| "Failed to parse config file")
866}
867
868fn config_from_value(value: toml::Value) -> Result<Config> {
869    let config: Config = value
870        .try_into()
871        .with_context(|| "Failed to parse config file")?;
872    validate_config(config)
873}
874
875fn validate_config(config: Config) -> Result<Config> {
876    // Validate chunking
877    if config.chunking.max_tokens == 0 {
878        anyhow::bail!("chunking.max_tokens must be > 0");
879    }
880
881    // Validate retrieval
882    if config.retrieval.final_limit < 1 {
883        anyhow::bail!("retrieval.final_limit must be >= 1");
884    }
885
886    if !(0.0..=1.0).contains(&config.retrieval.hybrid_alpha) {
887        anyhow::bail!("retrieval.hybrid_alpha must be in [0.0, 1.0]");
888    }
889
890    // Validate embedding
891    match config.embedding.provider.as_str() {
892        "disabled" => {}
893        "openai" | "ollama" => {
894            if config.embedding.dims.is_none() || config.embedding.dims == Some(0) {
895                anyhow::bail!(
896                    "embedding.dims must be > 0 when provider is '{}'",
897                    config.embedding.provider
898                );
899            }
900            if config.embedding.model.is_none() {
901                anyhow::bail!(
902                    "embedding.model must be specified when provider is '{}'",
903                    config.embedding.provider
904                );
905            }
906        }
907        "local" => {
908            // model and dims are optional for local — defaults applied at runtime
909        }
910        other => anyhow::bail!(
911            "Unknown embedding provider: '{}'. Must be disabled, openai, ollama, or local.",
912            other
913        ),
914    }
915
916    match config.vector_index.backend.as_str() {
917        "auto" | "zvec" | "sqlite" | "disabled" => {}
918        other => anyhow::bail!(
919            "Unknown vector_index.backend: '{}'. Must be auto, zvec, sqlite, or disabled.",
920            other
921        ),
922    }
923
924    match config.vector_index.metric.as_str() {
925        "cosine" => {}
926        other => anyhow::bail!("Unknown vector_index.metric: '{}'. Must be cosine.", other),
927    }
928
929    match config.vector_index.index.as_str() {
930        "hnsw" | "flat" => {}
931        other => anyhow::bail!(
932            "Unknown vector_index.index: '{}'. Must be hnsw or flat.",
933            other
934        ),
935    }
936
937    match config.vector_index.fallback.as_str() {
938        "sqlite" | "disabled" => {}
939        other => anyhow::bail!(
940            "Unknown vector_index.fallback: '{}'. Must be sqlite or disabled.",
941            other
942        ),
943    }
944
945    Ok(config)
946}
947
948fn merge_toml(base: &mut toml::Value, overlay: toml::Value) {
949    match (base, overlay) {
950        (toml::Value::Table(base_table), toml::Value::Table(overlay_table)) => {
951            for (key, overlay_value) in overlay_table {
952                match base_table.get_mut(&key) {
953                    Some(base_value) => merge_toml(base_value, overlay_value),
954                    None => {
955                        base_table.insert(key, overlay_value);
956                    }
957                }
958            }
959        }
960        (base_value, overlay_value) => {
961            *base_value = overlay_value;
962        }
963    }
964}
965
/// TOML text written to a freshly scaffolded workspace config file.
///
/// Contains only the tables a [`Config`] requires (`db`, `chunking`,
/// `retrieval`, `server`); everything else is left to its default.
pub fn default_workspace_config_toml() -> &'static str {
    concat!(
        "[db]\n",
        "path = \".ctx/data/ctx.sqlite\"\n",
        "\n",
        "[chunking]\n",
        "max_tokens = 700\n",
        "overlap_tokens = 0\n",
        "\n",
        "[retrieval]\n",
        "final_limit = 12\n",
        "\n",
        "[server]\n",
        "bind = \"127.0.0.1:7331\"\n",
    )
}
context_harness/config.rs

context_harness/
config.rs