context_harness/config.rs
1//! Configuration parsing and validation.
2//!
3//! Context Harness is configured via a TOML file (default: `config/ctx.toml`).
4//! The config defines database paths, chunking parameters, embedding provider
5//! settings, retrieval tuning, server bind address, and connector configurations.
6//!
7//! # Example Configuration
8//!
9//! ```toml
10//! [db]
11//! path = ".ctx/data/ctx.sqlite"
12//!
13//! [chunking]
14//! max_tokens = 700
15//! overlap_tokens = 80
16//!
17//! [embedding]
//! provider = "openai" # "disabled" | "openai" | "ollama" | "local"
19//! model = "text-embedding-3-small"
20//! dims = 1536
21//!
22//! [vector_index]
23//! backend = "auto"
24//! path = "auto"
25//! metric = "cosine"
26//! index = "hnsw"
27//! fallback = "sqlite"
28//!
29//! [retrieval]
30//! final_limit = 12
31//! hybrid_alpha = 0.6 # 0.0 = keyword only, 1.0 = semantic only
32//!
33//! [server]
34//! bind = "127.0.0.1:7331"
35//!
36//! [connectors.filesystem.docs]
37//! root = "./docs"
38//! include_globs = ["**/*.md", "**/*.txt"]
39//!
40//! [connectors.git.platform]
41//! url = "https://github.com/acme/platform.git"
42//! branch = "main"
43//! ```
44//!
45//! # Connectors
46//!
47//! All connector types are **named** — you can configure multiple instances of each:
48//! - **Filesystem** (`[connectors.filesystem.<name>]`) — scan a local directory
49//! - **Git** (`[connectors.git.<name>]`) — clone/pull a Git repository
50//! - **S3** (`[connectors.s3.<name>]`) — list and download from an S3 bucket
51//! - **Script** (`[connectors.script.<name>]`) — custom Lua-scripted data sources
52//!
53//! # Validation
54//!
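//! [`load_config`] performs the following validations:
//! - `chunking.max_tokens > 0`
//! - `retrieval.final_limit >= 1`
//! - `retrieval.hybrid_alpha ∈ [0.0, 1.0]`
//! - When embedding provider is `openai` or `ollama`: `model` must be set and `dims` must be set and non-zero
//! - Embedding provider must be one of: `"disabled"`, `"openai"`, `"ollama"`, `"local"`
//! - `vector_index.backend` must be one of: `"auto"`, `"zvec"`, `"sqlite"`, `"disabled"`
//! - `vector_index.metric` must be `"cosine"`
//! - `vector_index.index` must be one of: `"hnsw"`, `"flat"`
//! - `vector_index.fallback` must be one of: `"sqlite"`, `"disabled"`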
61
62use anyhow::{Context, Result};
63use serde::Deserialize;
64use std::collections::HashMap;
65use std::path::{Path, PathBuf};
66
67use crate::ctx_dirs::{self, ConfigSourceKind};
68
69/// Top-level configuration structure.
70///
71/// Deserialized from the TOML config file. All sections are required
72/// except `connectors`, which defaults to an empty set.
73#[derive(Debug, Deserialize, Clone)]
74pub struct Config {
75 /// Database connection settings.
76 pub db: DbConfig,
77 /// Text chunking parameters.
78 pub chunking: ChunkingConfig,
79 /// Search and retrieval tuning.
80 pub retrieval: RetrievalConfig,
81 /// Embedding provider settings (defaults to disabled).
82 #[serde(default)]
83 pub embedding: EmbeddingConfig,
84 /// Optional vector-index acceleration settings (defaults to auto with SQLite fallback).
85 #[serde(default)]
86 #[allow(dead_code)]
87 pub vector_index: VectorIndexConfig,
88 /// HTTP server bind address.
89 #[allow(dead_code)]
90 pub server: ServerConfig,
91 /// Connector configurations (all optional).
92 #[serde(default)]
93 pub connectors: ConnectorsConfig,
94 /// Tool script configurations (all optional).
95 #[serde(default)]
96 pub tools: ToolsConfig,
97 /// Agent configurations (all optional).
98 #[serde(default)]
99 pub agents: AgentsConfig,
100 /// Extension registry configurations (all optional).
101 #[serde(default)]
102 pub registries: HashMap<String, RegistryConfig>,
103}
104
105impl Config {
106 /// Create a minimal config suitable for commands that don't need
107 /// database or connector settings (e.g., `ctx connector test`).
108 pub fn minimal() -> Self {
109 Self {
110 db: DbConfig {
111 path: ctx_dirs::workspace_db_path(),
112 },
113 chunking: ChunkingConfig {
114 max_tokens: 700,
115 overlap_tokens: 0,
116 },
117 retrieval: RetrievalConfig {
118 hybrid_alpha: default_hybrid_alpha(),
119 candidate_k_keyword: default_candidate_k(),
120 candidate_k_vector: default_candidate_k(),
121 final_limit: default_final_limit(),
122 group_by: default_group_by(),
123 doc_agg: default_doc_agg(),
124 max_chunks_per_doc: default_max_chunks_per_doc(),
125 },
126 embedding: EmbeddingConfig::default(),
127 vector_index: VectorIndexConfig::default(),
128 server: ServerConfig {
129 bind: "127.0.0.1:7331".to_string(),
130 },
131 connectors: ConnectorsConfig::default(),
132 tools: ToolsConfig::default(),
133 agents: AgentsConfig::default(),
134 registries: HashMap::new(),
135 }
136 }
137}
138
139#[derive(Debug, Clone)]
140pub struct ResolvedConfig {
141 pub config: Config,
142 pub path: Option<PathBuf>,
143 #[allow(dead_code)]
144 pub source: ConfigSourceKind,
145}
146
147/// Database configuration.
148///
149/// Specifies the path to the SQLite database file. The file and its
150/// parent directories are created automatically on first use.
151#[derive(Debug, Deserialize, Clone)]
152pub struct DbConfig {
153 /// Path to the SQLite database file (e.g. `".ctx/data/ctx.sqlite"`).
154 pub path: PathBuf,
155}
156
157/// Text chunking parameters.
158///
159/// Controls how document bodies are split into chunks for indexing
160/// and embedding. See [`crate::chunk`] for the chunking algorithm.
161#[derive(Debug, Deserialize, Clone)]
162pub struct ChunkingConfig {
163 /// Maximum tokens per chunk. Chunks are split on paragraph boundaries
164 /// to stay within this limit. Converted to characters via `max_tokens × 4`.
165 pub max_tokens: usize,
166 /// Number of overlapping tokens between adjacent chunks (reserved for future use).
167 #[serde(default = "default_overlap")]
168 #[allow(dead_code)]
169 pub overlap_tokens: usize,
170}
171
/// Serde default for `chunking.overlap_tokens`: no overlap.
fn default_overlap() -> usize {
    const NO_OVERLAP: usize = 0;
    NO_OVERLAP
}
175
176/// Search and retrieval tuning parameters.
177///
178/// These settings control how keyword and semantic search results are
179/// merged in hybrid mode, and the overall result limits.
180///
181/// # Hybrid Scoring
182///
183/// The `hybrid_alpha` weight determines the blend between keyword (BM25)
184/// and semantic (cosine similarity) scores:
185///
186/// ```text
187/// hybrid_score = (1 - α) × keyword_score + α × semantic_score
188/// ```
189///
190/// - `α = 0.0` → pure keyword search
191/// - `α = 1.0` → pure semantic search
192/// - `α = 0.6` (default) → 60% semantic, 40% keyword
193///
194/// See `docs/HYBRID_SCORING.md` for the full specification.
195#[derive(Debug, Deserialize, Clone)]
196pub struct RetrievalConfig {
197 /// Weight for semantic vs. keyword scores in hybrid mode.
198 /// Range: `[0.0, 1.0]`. Default: `0.6`.
199 #[serde(default = "default_hybrid_alpha")]
200 pub hybrid_alpha: f64,
201 /// Number of keyword candidates to fetch before merging. Default: `80`.
202 #[serde(default = "default_candidate_k")]
203 pub candidate_k_keyword: i64,
204 /// Number of vector candidates to fetch before merging. Default: `80`.
205 #[serde(default = "default_candidate_k")]
206 pub candidate_k_vector: i64,
207 /// Maximum number of results to return after merging and ranking. Default: `12`.
208 #[serde(default = "default_final_limit")]
209 pub final_limit: i64,
210 /// Grouping strategy for results. Default: `"document"`.
211 #[serde(default = "default_group_by")]
212 #[allow(dead_code)]
213 pub group_by: String,
214 /// Aggregation method for document-level scores. Default: `"max"`.
215 #[serde(default = "default_doc_agg")]
216 #[allow(dead_code)]
217 pub doc_agg: String,
218 /// Maximum chunks per document in results. Default: `3`.
219 #[serde(default = "default_max_chunks_per_doc")]
220 #[allow(dead_code)]
221 pub max_chunks_per_doc: usize,
222}
223
/// Serde default for `retrieval.hybrid_alpha`: 60% semantic, 40% keyword.
fn default_hybrid_alpha() -> f64 {
    const ALPHA: f64 = 0.6;
    ALPHA
}
/// Serde default for both `retrieval.candidate_k_keyword` and
/// `retrieval.candidate_k_vector`.
fn default_candidate_k() -> i64 {
    const CANDIDATES: i64 = 80;
    CANDIDATES
}
/// Serde default for `retrieval.final_limit`.
fn default_final_limit() -> i64 {
    const LIMIT: i64 = 12;
    LIMIT
}
/// Serde default for `retrieval.group_by`.
fn default_group_by() -> String {
    String::from("document")
}
/// Serde default for `retrieval.doc_agg`.
fn default_doc_agg() -> String {
    String::from("max")
}
/// Serde default for `retrieval.max_chunks_per_doc`.
fn default_max_chunks_per_doc() -> usize {
    const CHUNKS: usize = 3;
    CHUNKS
}
242
243/// Embedding provider configuration.
244///
245/// Controls which embedding provider is used and its parameters.
246/// When `provider = "disabled"`, no embeddings are generated and
247/// semantic/hybrid search modes will return errors.
248///
249/// # Providers
250///
251/// | Provider | Description |
252/// |----------|-------------|
253/// | `"disabled"` | No embeddings (default) |
254/// | `"openai"` | OpenAI API (`text-embedding-3-small`, etc.) |
255/// | `"ollama"` | Local Ollama instance (`nomic-embed-text`, etc.) |
256/// | `"local"` | Built-in models via fastembed (primary) or tract (musl/Intel Mac) (`all-minilm-l6-v2`, etc.) |
257///
258/// When using `"openai"`, the `OPENAI_API_KEY` environment variable must be set.
259/// When using `"ollama"`, an Ollama instance must be running (default: `http://localhost:11434`).
260/// When using `"local"`, the model is downloaded on first use and cached in `~/.cache/huggingface/`.
261#[derive(Debug, Deserialize, Clone)]
262pub struct EmbeddingConfig {
263 /// Provider name: `"disabled"`, `"openai"`, `"ollama"`, or `"local"`. Default: `"disabled"`.
264 #[serde(default = "default_provider")]
265 pub provider: String,
266 /// Embedding model name (e.g. `"text-embedding-3-small"`, `"nomic-embed-text"`,
267 /// `"all-minilm-l6-v2"`). Required for `openai` and `ollama`; optional for `local`
268 /// (defaults to `"all-minilm-l6-v2"`).
269 #[serde(default)]
270 pub model: Option<String>,
271 /// Embedding vector dimensionality (e.g. `1536` for `text-embedding-3-small`).
272 /// Required for `openai` and `ollama`; auto-detected for `local`.
273 #[serde(default)]
274 pub dims: Option<usize>,
275 /// Number of texts to embed per batch. Default: `64`.
276 #[serde(default = "default_batch_size")]
277 pub batch_size: usize,
278 /// Maximum retry attempts for transient API errors. Default: `5`.
279 #[serde(default = "default_max_retries")]
280 pub max_retries: u32,
281 /// HTTP timeout per request in seconds. Default: `30`.
282 #[serde(default = "default_timeout_secs")]
283 pub timeout_secs: u64,
284 /// Base URL for Ollama API. Default: `"http://localhost:11434"`.
285 #[serde(default)]
286 pub url: Option<String>,
287}
288
289impl Default for EmbeddingConfig {
290 fn default() -> Self {
291 Self {
292 provider: "disabled".to_string(),
293 model: None,
294 dims: None,
295 batch_size: 64,
296 max_retries: 5,
297 timeout_secs: 30,
298 url: None,
299 }
300 }
301}
302
/// Serde default for `embedding.provider`: embeddings are off.
fn default_provider() -> String {
    String::from("disabled")
}
/// Serde default for `embedding.batch_size`.
fn default_batch_size() -> usize {
    const BATCH: usize = 64;
    BATCH
}
/// Serde default for `embedding.max_retries`.
fn default_max_retries() -> u32 {
    const RETRIES: u32 = 5;
    RETRIES
}
/// Serde default for `embedding.timeout_secs`.
fn default_timeout_secs() -> u64 {
    const SECONDS: u64 = 30;
    SECONDS
}
315
316/// Optional vector-index acceleration configuration.
317///
318/// The default is automatic: use the built-in vector accelerator when the
319/// binary supports one and fall back to the exact SQLite vector scan otherwise.
320#[derive(Debug, Deserialize, Clone)]
321pub struct VectorIndexConfig {
322 /// Backend name. Default: `"auto"`.
323 #[serde(default = "default_vector_backend")]
324 #[allow(dead_code)]
325 pub backend: String,
326 /// Filesystem path for external vector-index state. Default: `"auto"`.
327 #[serde(default = "default_vector_path")]
328 #[allow(dead_code)]
329 pub path: PathBuf,
330 /// Distance metric. Default: `"cosine"`.
331 #[serde(default = "default_vector_metric")]
332 #[allow(dead_code)]
333 pub metric: String,
334 /// Index kind. Default: `"hnsw"`.
335 #[serde(default = "default_vector_index")]
336 #[allow(dead_code)]
337 pub index: String,
338 /// Fallback strategy when acceleration is unavailable. Default: `"sqlite"`.
339 #[serde(default = "default_vector_fallback")]
340 #[allow(dead_code)]
341 pub fallback: String,
342}
343
344impl Default for VectorIndexConfig {
345 fn default() -> Self {
346 Self {
347 backend: default_vector_backend(),
348 path: default_vector_path(),
349 metric: default_vector_metric(),
350 index: default_vector_index(),
351 fallback: default_vector_fallback(),
352 }
353 }
354}
355
/// Serde default for `vector_index.backend`.
fn default_vector_backend() -> String {
    String::from("auto")
}
359
/// Serde default for `vector_index.path`: the literal path `auto`.
fn default_vector_path() -> PathBuf {
    "auto".into()
}
363
/// Serde default for `vector_index.metric`.
fn default_vector_metric() -> String {
    String::from("cosine")
}
367
/// Serde default for `vector_index.index`.
fn default_vector_index() -> String {
    String::from("hnsw")
}
371
/// Serde default for `vector_index.fallback`.
fn default_vector_fallback() -> String {
    String::from("sqlite")
}
375
376/// HTTP server configuration.
377#[derive(Debug, Deserialize, Clone)]
378pub struct ServerConfig {
379 /// Socket address to bind to (e.g. `"127.0.0.1:7331"`).
380 pub bind: String,
381}
382
383/// Container for all connector configurations.
384///
385/// All connector types use named instances — you can configure multiple
386/// of each type. For example:
387///
388/// ```toml
389/// [connectors.git.platform]
390/// url = "https://github.com/acme/platform.git"
391///
392/// [connectors.git.auth-service]
393/// url = "https://github.com/acme/auth-service.git"
394/// ```
395///
396/// Use `ctx sync git` to sync all git connectors, or `ctx sync git:platform`
397/// for a specific one. `ctx sync all` syncs everything.
398#[derive(Debug, Deserialize, Clone, Default)]
399pub struct ConnectorsConfig {
400 /// Named filesystem connectors: walk local directories.
401 #[serde(default)]
402 pub filesystem: HashMap<String, FilesystemConnectorConfig>,
403 /// Named Git connectors: clone and scan Git repositories.
404 #[serde(default)]
405 pub git: HashMap<String, GitConnectorConfig>,
406 /// Named S3 connectors: list and download from S3 buckets.
407 #[serde(default)]
408 pub s3: HashMap<String, S3ConnectorConfig>,
409 /// Named Lua script connectors.
410 /// Each key is a connector name, each value contains the script path
411 /// and arbitrary config keys passed to the Lua `connector.scan()` function.
412 /// See `docs/LUA_CONNECTORS.md` for the full specification.
413 #[serde(default)]
414 pub script: HashMap<String, ScriptConnectorConfig>,
415}
416
417/// Lua script connector configuration.
418///
419/// Points to a `.lua` file implementing the connector interface. All fields
420/// except `path` and `timeout` are passed as a config table to the script's
421/// `connector.scan(config)` function.
422///
423/// Values containing `${VAR_NAME}` are expanded from the process environment.
424///
425/// # Example
426///
427/// ```toml
428/// [connectors.script.jira]
429/// path = "connectors/jira.lua"
430/// timeout = 600
431/// url = "https://mycompany.atlassian.net"
432/// api_token = "${JIRA_API_TOKEN}"
433/// project_key = "ENG"
434/// ```
435#[derive(Debug, Deserialize, Clone)]
436pub struct ScriptConnectorConfig {
437 /// Path to the `.lua` connector script.
438 pub path: PathBuf,
439 /// Maximum execution time in seconds. Default: `300`.
440 #[serde(default = "default_script_timeout")]
441 pub timeout: u64,
442 /// All other config keys — passed to the Lua `connector.scan()` function.
443 #[serde(flatten)]
444 pub extra: toml::Table,
445}
446
/// Serde default for a script connector's `timeout`, in seconds.
fn default_script_timeout() -> u64 {
    const SECONDS: u64 = 300;
    SECONDS
}
450
451/// Container for all tool script configurations.
452///
453/// Tool scripts are Lua files that define MCP tools agents can discover
454/// and call via the HTTP server. See `docs/LUA_TOOLS.md` for the full
455/// specification.
456#[derive(Debug, Deserialize, Clone, Default)]
457pub struct ToolsConfig {
458 /// Named Lua tool scripts.
459 /// Each key is the tool name, each value contains the script path
460 /// and arbitrary config keys accessible via `context.config` in the script.
461 #[serde(default)]
462 pub script: HashMap<String, ScriptToolConfig>,
463}
464
465/// Lua tool script configuration.
466///
467/// Points to a `.lua` file implementing the tool interface. All fields
468/// except `path` and `timeout` are passed as `context.config` to the
469/// script's `tool.execute(params, context)` function.
470///
471/// Values containing `${VAR_NAME}` are expanded from the process environment.
472///
473/// # Example
474///
475/// ```toml
476/// [tools.script.create_jira_ticket]
477/// path = "tools/create-jira-ticket.lua"
478/// timeout = 30
479/// url = "https://mycompany.atlassian.net"
480/// api_token = "${JIRA_API_TOKEN}"
481/// ```
482#[derive(Debug, Deserialize, Clone)]
483pub struct ScriptToolConfig {
484 /// Path to the `.lua` tool script.
485 pub path: PathBuf,
486 /// Maximum execution time in seconds. Default: `30`.
487 #[serde(default = "default_tool_timeout")]
488 pub timeout: u64,
489 /// All other config keys — accessible via `context.config` in the script.
490 #[serde(flatten)]
491 pub extra: toml::Table,
492}
493
/// Serde default for a tool script's `timeout`, in seconds.
fn default_tool_timeout() -> u64 {
    const SECONDS: u64 = 30;
    SECONDS
}
497
498/// Container for all agent configurations.
499///
500/// Agents are named personas that combine a system prompt, scoped tools,
501/// and optional dynamic context injection. They can be defined inline
502/// in TOML or via Lua scripts.
503///
504/// # Example
505///
506/// ```toml
507/// [agents.inline.code-reviewer]
508/// description = "Reviews code against project conventions"
509/// tools = ["search", "get"]
510/// system_prompt = "You are a senior code reviewer..."
511///
512/// [agents.script.incident-responder]
513/// path = "agents/incident-responder.lua"
514/// timeout = 30
515/// ```
516#[derive(Debug, Deserialize, Clone, Default)]
517pub struct AgentsConfig {
518 /// Inline TOML agents with static system prompts.
519 /// Each key is the agent name, each value contains the prompt and tool list.
520 #[serde(default)]
521 pub inline: HashMap<String, InlineAgentConfig>,
522 /// Lua script agents with dynamic prompt resolution.
523 /// Each key is the agent name, each value contains the script path
524 /// and arbitrary config keys passed to `agent.resolve()`.
525 #[serde(default)]
526 pub script: HashMap<String, ScriptAgentConfig>,
527}
528
529/// Inline (TOML) agent configuration.
530///
531/// Defines an agent with a static system prompt and fixed tool list.
532/// The simplest way to create an agent — no Lua or Rust code needed.
533///
534/// # Example
535///
536/// ```toml
537/// [agents.inline.architect]
538/// description = "Answers architecture questions"
539/// tools = ["search", "get", "sources"]
540/// system_prompt = """
541/// You are a software architect. Search for ADRs and design
542/// docs to ground your recommendations.
543/// """
544/// ```
545#[derive(Debug, Deserialize, Clone)]
546pub struct InlineAgentConfig {
547 /// One-line description for agent discovery.
548 pub description: String,
549 /// List of tool names this agent should expose.
550 pub tools: Vec<String>,
551 /// The system prompt text.
552 pub system_prompt: String,
553}
554
555/// Lua script agent configuration.
556///
557/// Points to a `.lua` file implementing the agent interface. All fields
558/// except `path` and `timeout` are passed as config to the script's
559/// `agent.resolve(args, config, context)` function.
560///
561/// Values containing `${VAR_NAME}` are expanded from the process environment.
562///
563/// # Example
564///
565/// ```toml
566/// [agents.script.incident-responder]
567/// path = "agents/incident-responder.lua"
568/// timeout = 30
569/// search_limit = 5
570/// priority_sources = ["runbooks"]
571/// ```
572#[derive(Debug, Deserialize, Clone)]
573pub struct ScriptAgentConfig {
574 /// Path to the `.lua` agent script.
575 pub path: PathBuf,
576 /// Maximum execution time in seconds. Default: `30`.
577 #[serde(default = "default_agent_timeout")]
578 pub timeout: u64,
579 /// All other config keys — passed to the Lua `agent.resolve()` function.
580 #[serde(flatten)]
581 pub extra: toml::Table,
582}
583
/// Serde default for a script agent's `timeout`, in seconds.
fn default_agent_timeout() -> u64 {
    const SECONDS: u64 = 30;
    SECONDS
}
587
588/// Extension registry configuration.
589///
590/// Points to a local directory (optionally backed by a Git repository)
591/// containing Lua connector, tool, and agent scripts described by a
592/// `registry.toml` manifest.
593///
594/// # Example
595///
596/// ```toml
597/// [registries.community]
598/// url = "https://github.com/parallax-labs/ctx-registry.git"
599/// branch = "main"
600/// path = "~/.local/share/ctx/registries/community"
601/// readonly = true
602/// auto_update = true
603/// ```
604#[derive(Debug, Deserialize, Clone)]
605pub struct RegistryConfig {
606 /// Git repository URL to clone from. `None` means local-only (no git).
607 pub url: Option<String>,
608 /// Git branch or tag to track. Default: `"main"`.
609 pub branch: Option<String>,
610 /// Local filesystem path where the registry is (or will be) stored.
611 pub path: PathBuf,
612 /// If `true`, extensions on this path cannot be edited in place;
613 /// overrides are copied to a writable registry.
614 #[serde(default)]
615 pub readonly: bool,
616 /// If `true`, `ctx registry update` (and optionally `ctx sync`) will
617 /// `git pull` this registry automatically.
618 #[serde(default)]
619 #[allow(dead_code)]
620 pub auto_update: bool,
621}
622
623/// Filesystem connector configuration.
624///
625/// Scans a local directory tree, applying glob include/exclude filters.
626/// See [`crate::connector_fs`] for the scanning implementation.
627///
628/// # Example
629///
630/// ```toml
631/// [connectors.filesystem.docs]
632/// root = "./docs"
633/// include_globs = ["**/*.md", "**/*.txt"]
634/// exclude_globs = ["**/drafts/**"]
635/// follow_symlinks = false
636/// max_extract_bytes = 50_000_000
637/// ```
638#[derive(Debug, Deserialize, Clone)]
639pub struct FilesystemConnectorConfig {
640 /// Root directory to scan.
641 pub root: PathBuf,
642 /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
643 #[serde(default = "default_include_globs")]
644 pub include_globs: Vec<String>,
645 /// Glob patterns for files to exclude. Default: `[]`.
646 #[serde(default)]
647 pub exclude_globs: Vec<String>,
648 /// Whether to follow symbolic links. Default: `false`.
649 #[serde(default)]
650 pub follow_symlinks: bool,
651 /// Files larger than this (bytes) are not extracted; they are skipped and counted in extraction skipped. Default: 50_000_000.
652 #[serde(default = "default_max_extract_bytes")]
653 pub max_extract_bytes: u64,
654}
655
656/// Git connector configuration.
657///
658/// Clones (or pulls) a Git repository and scans files within a configurable
659/// subdirectory. Extracts per-file metadata from `git log`.
660/// See [`crate::connector_git`] for the full implementation.
661///
662/// # Example
663///
664/// ```toml
665/// [connectors.git.platform]
666/// url = "https://github.com/acme/platform.git"
667/// branch = "main"
668/// root = "docs/"
669/// include_globs = ["**/*.md"]
670/// shallow = true
671/// ```
672#[derive(Debug, Deserialize, Clone)]
673pub struct GitConnectorConfig {
674 /// Git repository URL (`https://`, `git@`, or local path).
675 pub url: String,
676 /// Branch to clone/pull. Default: `"main"`.
677 #[serde(default = "default_git_branch")]
678 pub branch: String,
679 /// Subdirectory within the repo to scan. Default: `"."` (entire repo).
680 #[serde(default = "default_git_root")]
681 pub root: String,
682 /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
683 #[serde(default = "default_include_globs")]
684 pub include_globs: Vec<String>,
685 /// Glob patterns for files to exclude. Default: `[]`.
686 #[serde(default)]
687 pub exclude_globs: Vec<String>,
688 /// Use shallow clone (`--depth 1`) to save disk space. Default: `true`.
689 #[serde(default = "default_true")]
690 pub shallow: bool,
691 /// Directory to cache cloned repos. Default: `.ctx/cache/git/<url-hash>/`
692 /// when using the workspace DB, otherwise `<db-dir>/.git-cache/<url-hash>/`.
693 #[serde(default)]
694 pub cache_dir: Option<PathBuf>,
695}
696
697/// Amazon S3 connector configuration.
698///
699/// Lists and downloads objects from an S3 bucket using the REST API with
700/// AWS Signature V4. Supports custom endpoints for S3-compatible services.
701/// See [`crate::connector_s3`] for the full implementation.
702///
703/// # Environment Variables
704///
705/// - `AWS_ACCESS_KEY_ID` — required
706/// - `AWS_SECRET_ACCESS_KEY` — required
707/// - `AWS_SESSION_TOKEN` — optional (for temporary credentials)
708///
709/// # Example
710///
711/// ```toml
712/// [connectors.s3.runbooks]
713/// bucket = "acme-docs"
714/// prefix = "engineering/runbooks/"
715/// region = "us-east-1"
716/// include_globs = ["**/*.md"]
717/// # endpoint_url = "http://localhost:9000" # for MinIO
718/// ```
719#[derive(Debug, Deserialize, Clone)]
720pub struct S3ConnectorConfig {
721 /// S3 bucket name.
722 pub bucket: String,
723 /// Key prefix to filter objects. Default: `""` (entire bucket).
724 #[serde(default)]
725 pub prefix: String,
726 /// AWS region. Default: `"us-east-1"`.
727 #[serde(default = "default_s3_region")]
728 pub region: String,
729 /// Glob patterns for object keys to include. Default: `["**/*.md", "**/*.txt"]`.
730 #[serde(default = "default_include_globs")]
731 pub include_globs: Vec<String>,
732 /// Glob patterns for object keys to exclude. Default: `[]`.
733 #[serde(default)]
734 pub exclude_globs: Vec<String>,
735 /// Custom endpoint URL for S3-compatible services (MinIO, LocalStack).
736 #[serde(default)]
737 pub endpoint_url: Option<String>,
738}
739
/// Serde default for a git connector's `branch`.
fn default_git_branch() -> String {
    String::from("main")
}
743
/// Serde default for a git connector's `root`: the whole repository.
fn default_git_root() -> String {
    String::from(".")
}
747
/// Serde default for boolean options that are on unless switched off
/// (used by a git connector's `shallow`).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
751
/// Serde default for an S3 connector's `region`.
fn default_s3_region() -> String {
    String::from("us-east-1")
}
755
/// Serde default for every connector's `include_globs`: Markdown and plain
/// text files at any depth.
fn default_include_globs() -> Vec<String> {
    ["**/*.md", "**/*.txt"]
        .iter()
        .map(|pattern| pattern.to_string())
        .collect()
}
759
/// Serde default for a filesystem connector's `max_extract_bytes`
/// (fifty million bytes).
fn default_max_extract_bytes() -> u64 {
    const LIMIT_BYTES: u64 = 50 * 1_000_000;
    LIMIT_BYTES
}
763
764impl EmbeddingConfig {
765 /// Returns `true` if an embedding provider is configured (not `"disabled"`).
766 pub fn is_enabled(&self) -> bool {
767 self.provider != "disabled"
768 }
769}
770
771/// Load and validate a configuration file from disk.
772///
773/// # Arguments
774///
775/// * `path` — Path to a TOML configuration file.
776///
777/// # Errors
778///
779/// Returns an error if:
780/// - The file cannot be read or parsed
781/// - `chunking.max_tokens` is zero
782/// - `retrieval.final_limit` is less than 1
783/// - `retrieval.hybrid_alpha` is outside `[0.0, 1.0]`
784/// - Embedding provider is enabled but `model` or `dims` is missing/zero
785/// - Unknown embedding provider name
786#[allow(dead_code)]
787pub fn load_config(path: &Path) -> Result<Config> {
788 load_config_file(path)
789}
790
791pub fn load_config_for_cli(explicit_path: Option<PathBuf>) -> Result<ResolvedConfig> {
792 let paths = ctx_dirs::config_paths(explicit_path);
793 let source = paths.resolve();
794
795 match source.kind {
796 ConfigSourceKind::Explicit | ConfigSourceKind::Env | ConfigSourceKind::Global => {
797 let path = source
798 .path
799 .clone()
800 .expect("path-backed config source must include path");
801 let config = load_config_file(&path)?;
802 Ok(ResolvedConfig {
803 config,
804 path: Some(path),
805 source: source.kind,
806 })
807 }
808 ConfigSourceKind::Workspace | ConfigSourceKind::LegacyWorkspace => {
809 let workspace_path = source
810 .path
811 .clone()
812 .expect("path-backed config source must include path");
813 let workspace_value = load_config_value(&workspace_path)?;
814 let merged_value = if paths.global.exists() {
815 let mut global_value = load_config_value(&paths.global)?;
816 merge_toml(&mut global_value, workspace_value);
817 global_value
818 } else {
819 workspace_value
820 };
821 let config = config_from_value(merged_value)?;
822 Ok(ResolvedConfig {
823 config,
824 path: Some(workspace_path),
825 source: source.kind,
826 })
827 }
828 ConfigSourceKind::BuiltIn => Ok(ResolvedConfig {
829 config: Config::minimal(),
830 path: None,
831 source: ConfigSourceKind::BuiltIn,
832 }),
833 }
834}
835
836pub fn ensure_workspace_config_for_init(explicit_path: Option<&Path>) -> Result<Option<PathBuf>> {
837 let paths = ctx_dirs::config_paths(explicit_path.map(Path::to_path_buf));
838 if paths.has_explicit_source() || paths.has_workspace_source() {
839 return Ok(paths.resolve().path);
840 }
841
842 let ctx_dir = ctx_dirs::workspace_dir();
843 std::fs::create_dir_all(ctx_dirs::workspace_data_dir())?;
844 std::fs::create_dir_all(ctx_dirs::workspace_cache_dir())?;
845 std::fs::write(ctx_dir.join(".gitignore"), "data/\ncache/\n")?;
846
847 let config_path = ctx_dirs::workspace_config_path();
848 if !config_path.exists() {
849 std::fs::write(&config_path, default_workspace_config_toml())?;
850 }
851 Ok(Some(config_path))
852}
853
854fn load_config_file(path: &Path) -> Result<Config> {
855 let content = std::fs::read_to_string(path)
856 .with_context(|| format!("Failed to read config file: {}", path.display()))?;
857
858 let config: Config = toml::from_str(&content).with_context(|| "Failed to parse config file")?;
859 validate_config(config)
860}
861
862fn load_config_value(path: &Path) -> Result<toml::Value> {
863 let content = std::fs::read_to_string(path)
864 .with_context(|| format!("Failed to read config file: {}", path.display()))?;
865 toml::from_str(&content).with_context(|| "Failed to parse config file")
866}
867
868fn config_from_value(value: toml::Value) -> Result<Config> {
869 let config: Config = value
870 .try_into()
871 .with_context(|| "Failed to parse config file")?;
872 validate_config(config)
873}
874
875fn validate_config(config: Config) -> Result<Config> {
876 // Validate chunking
877 if config.chunking.max_tokens == 0 {
878 anyhow::bail!("chunking.max_tokens must be > 0");
879 }
880
881 // Validate retrieval
882 if config.retrieval.final_limit < 1 {
883 anyhow::bail!("retrieval.final_limit must be >= 1");
884 }
885
886 if !(0.0..=1.0).contains(&config.retrieval.hybrid_alpha) {
887 anyhow::bail!("retrieval.hybrid_alpha must be in [0.0, 1.0]");
888 }
889
890 // Validate embedding
891 match config.embedding.provider.as_str() {
892 "disabled" => {}
893 "openai" | "ollama" => {
894 if config.embedding.dims.is_none() || config.embedding.dims == Some(0) {
895 anyhow::bail!(
896 "embedding.dims must be > 0 when provider is '{}'",
897 config.embedding.provider
898 );
899 }
900 if config.embedding.model.is_none() {
901 anyhow::bail!(
902 "embedding.model must be specified when provider is '{}'",
903 config.embedding.provider
904 );
905 }
906 }
907 "local" => {
908 // model and dims are optional for local — defaults applied at runtime
909 }
910 other => anyhow::bail!(
911 "Unknown embedding provider: '{}'. Must be disabled, openai, ollama, or local.",
912 other
913 ),
914 }
915
916 match config.vector_index.backend.as_str() {
917 "auto" | "zvec" | "sqlite" | "disabled" => {}
918 other => anyhow::bail!(
919 "Unknown vector_index.backend: '{}'. Must be auto, zvec, sqlite, or disabled.",
920 other
921 ),
922 }
923
924 match config.vector_index.metric.as_str() {
925 "cosine" => {}
926 other => anyhow::bail!("Unknown vector_index.metric: '{}'. Must be cosine.", other),
927 }
928
929 match config.vector_index.index.as_str() {
930 "hnsw" | "flat" => {}
931 other => anyhow::bail!(
932 "Unknown vector_index.index: '{}'. Must be hnsw or flat.",
933 other
934 ),
935 }
936
937 match config.vector_index.fallback.as_str() {
938 "sqlite" | "disabled" => {}
939 other => anyhow::bail!(
940 "Unknown vector_index.fallback: '{}'. Must be sqlite or disabled.",
941 other
942 ),
943 }
944
945 Ok(config)
946}
947
948fn merge_toml(base: &mut toml::Value, overlay: toml::Value) {
949 match (base, overlay) {
950 (toml::Value::Table(base_table), toml::Value::Table(overlay_table)) => {
951 for (key, overlay_value) in overlay_table {
952 match base_table.get_mut(&key) {
953 Some(base_value) => merge_toml(base_value, overlay_value),
954 None => {
955 base_table.insert(key, overlay_value);
956 }
957 }
958 }
959 }
960 (base_value, overlay_value) => {
961 *base_value = overlay_value;
962 }
963 }
964}
965
/// TOML text written as the initial workspace config file.
///
/// Contains only the four required sections (`db`, `chunking`, `retrieval`,
/// `server`); everything else relies on defaults.
pub fn default_workspace_config_toml() -> &'static str {
    concat!(
        "[db]\n",
        "path = \".ctx/data/ctx.sqlite\"\n",
        "\n",
        "[chunking]\n",
        "max_tokens = 700\n",
        "overlap_tokens = 0\n",
        "\n",
        "[retrieval]\n",
        "final_limit = 12\n",
        "\n",
        "[server]\n",
        "bind = \"127.0.0.1:7331\"\n",
    )
}