context_harness/
config.rs

1//! Configuration parsing and validation.
2//!
3//! Context Harness is configured via a TOML file (default: `config/ctx.toml`).
4//! The config defines database paths, chunking parameters, embedding provider
5//! settings, retrieval tuning, server bind address, and connector configurations.
6//!
7//! # Example Configuration
8//!
9//! ```toml
10//! [db]
11//! path = "./data/ctx.sqlite"
12//!
13//! [chunking]
14//! max_tokens = 700
15//! overlap_tokens = 80
16//!
17//! [embedding]
//! provider = "openai"           # "disabled" | "openai" | "ollama" | "local"
19//! model = "text-embedding-3-small"
20//! dims = 1536
21//!
22//! [retrieval]
23//! final_limit = 12
24//! hybrid_alpha = 0.6            # 0.0 = keyword only, 1.0 = semantic only
25//!
26//! [server]
27//! bind = "127.0.0.1:7331"
28//!
29//! [connectors.filesystem.docs]
30//! root = "./docs"
31//! include_globs = ["**/*.md", "**/*.txt"]
32//!
33//! [connectors.git.platform]
34//! url = "https://github.com/acme/platform.git"
35//! branch = "main"
36//! ```
37//!
38//! # Connectors
39//!
40//! All connector types are **named** — you can configure multiple instances of each:
41//! - **Filesystem** (`[connectors.filesystem.<name>]`) — scan a local directory
42//! - **Git** (`[connectors.git.<name>]`) — clone/pull a Git repository
43//! - **S3** (`[connectors.s3.<name>]`) — list and download from an S3 bucket
44//! - **Script** (`[connectors.script.<name>]`) — custom Lua-scripted data sources
45//!
46//! # Validation
47//!
48//! [`load_config`] performs the following validations:
49//! - `chunking.max_tokens > 0`
50//! - `retrieval.final_limit >= 1`
51//! - `retrieval.hybrid_alpha ∈ [0.0, 1.0]`
//! - When embedding provider is `openai` or `ollama`: `model` must be set and `dims` must be set and non-zero
53//! - Embedding provider must be one of: `"disabled"`, `"openai"`, `"ollama"`, `"local"`
54
55use anyhow::{Context, Result};
56use serde::Deserialize;
57use std::collections::HashMap;
58use std::path::{Path, PathBuf};
59
/// Top-level configuration structure.
///
/// Deserialized from the TOML config file. The `db`, `chunking`, `retrieval`,
/// and `server` sections are required. The `embedding`, `connectors`, `tools`,
/// `agents`, and `registries` sections are marked `#[serde(default)]` and may
/// be omitted, in which case their `Default` value is used.
///
/// Note that `[retrieval]` must be present as a table even though every one
/// of its fields has a default, so an empty `[retrieval]` table is valid.
#[derive(Debug, Deserialize, Clone)]
pub struct Config {
    /// Database connection settings.
    pub db: DbConfig,
    /// Text chunking parameters.
    pub chunking: ChunkingConfig,
    /// Search and retrieval tuning.
    pub retrieval: RetrievalConfig,
    /// Embedding provider settings (defaults to disabled).
    #[serde(default)]
    pub embedding: EmbeddingConfig,
    /// HTTP server bind address.
    // NOTE(review): `dead_code` is allowed here presumably because the field
    // is not read in every build configuration — confirm against callers.
    #[allow(dead_code)]
    pub server: ServerConfig,
    /// Connector configurations (all optional).
    #[serde(default)]
    pub connectors: ConnectorsConfig,
    /// Tool script configurations (all optional).
    #[serde(default)]
    pub tools: ToolsConfig,
    /// Agent configurations (all optional).
    #[serde(default)]
    pub agents: AgentsConfig,
    /// Extension registry configurations (all optional).
    /// Each key is the registry name taken from `[registries.<name>]`.
    #[serde(default)]
    pub registries: HashMap<String, RegistryConfig>,
}
91
92impl Config {
93    /// Create a minimal config suitable for commands that don't need
94    /// database or connector settings (e.g., `ctx connector test`).
95    pub fn minimal() -> Self {
96        Self {
97            db: DbConfig {
98                path: PathBuf::from("./data/ctx.sqlite"),
99            },
100            chunking: ChunkingConfig {
101                max_tokens: 700,
102                overlap_tokens: 0,
103            },
104            retrieval: RetrievalConfig {
105                hybrid_alpha: default_hybrid_alpha(),
106                candidate_k_keyword: default_candidate_k(),
107                candidate_k_vector: default_candidate_k(),
108                final_limit: default_final_limit(),
109                group_by: default_group_by(),
110                doc_agg: default_doc_agg(),
111                max_chunks_per_doc: default_max_chunks_per_doc(),
112            },
113            embedding: EmbeddingConfig::default(),
114            server: ServerConfig {
115                bind: "127.0.0.1:7331".to_string(),
116            },
117            connectors: ConnectorsConfig::default(),
118            tools: ToolsConfig::default(),
119            agents: AgentsConfig::default(),
120            registries: HashMap::new(),
121        }
122    }
123}
124
/// Database configuration.
///
/// Specifies the path to the SQLite database file. The file and its
/// parent directories are created automatically on first use.
///
/// The `[db]` section and its `path` key are both required: neither has a
/// serde default, so deserialization fails when either is missing.
#[derive(Debug, Deserialize, Clone)]
pub struct DbConfig {
    /// Path to the SQLite database file (e.g. `"./data/ctx.sqlite"`).
    pub path: PathBuf,
}
134
/// Text chunking parameters.
///
/// Controls how document bodies are split into chunks for indexing
/// and embedding. See [`crate::chunk`] for the chunking algorithm.
///
/// `max_tokens` is a required key; [`load_config`] rejects a value of `0`.
#[derive(Debug, Deserialize, Clone)]
pub struct ChunkingConfig {
    /// Maximum tokens per chunk. Chunks are split on paragraph boundaries
    /// to stay within this limit. Converted to characters via `max_tokens × 4`.
    pub max_tokens: usize,
    /// Number of overlapping tokens between adjacent chunks (reserved for future use).
    /// Default: `0`.
    #[serde(default = "default_overlap")]
    // The field is parsed but, per the note above, not consumed yet; the
    // allow keeps the compiler from warning about it.
    #[allow(dead_code)]
    pub overlap_tokens: usize,
}
149
/// Serde fallback for `chunking.overlap_tokens`: no overlap between chunks.
fn default_overlap() -> usize {
    const NO_OVERLAP: usize = 0;
    NO_OVERLAP
}
153
/// Search and retrieval tuning parameters.
///
/// These settings control how keyword and semantic search results are
/// merged in hybrid mode, and the overall result limits.
///
/// Every field has a serde default, so an empty `[retrieval]` table is
/// accepted. [`load_config`] additionally checks `hybrid_alpha` and
/// `final_limit`; the remaining fields are not validated in this module.
///
/// # Hybrid Scoring
///
/// The `hybrid_alpha` weight determines the blend between keyword (BM25)
/// and semantic (cosine similarity) scores:
///
/// ```text
/// hybrid_score = (1 - α) × keyword_score + α × semantic_score
/// ```
///
/// - `α = 0.0` → pure keyword search
/// - `α = 1.0` → pure semantic search
/// - `α = 0.6` (default) → 60% semantic, 40% keyword
///
/// See `docs/HYBRID_SCORING.md` for the full specification.
#[derive(Debug, Deserialize, Clone)]
pub struct RetrievalConfig {
    /// Weight for semantic vs. keyword scores in hybrid mode.
    /// Range: `[0.0, 1.0]` (enforced by [`load_config`]). Default: `0.6`.
    #[serde(default = "default_hybrid_alpha")]
    pub hybrid_alpha: f64,
    /// Number of keyword candidates to fetch before merging. Default: `80`.
    #[serde(default = "default_candidate_k")]
    pub candidate_k_keyword: i64,
    /// Number of vector candidates to fetch before merging. Default: `80`.
    #[serde(default = "default_candidate_k")]
    pub candidate_k_vector: i64,
    /// Maximum number of results to return after merging and ranking.
    /// Must be `>= 1` (enforced by [`load_config`]). Default: `12`.
    #[serde(default = "default_final_limit")]
    pub final_limit: i64,
    /// Grouping strategy for results. Default: `"document"`.
    #[serde(default = "default_group_by")]
    // NOTE(review): parsed but apparently not read yet — confirm before removing the allow.
    #[allow(dead_code)]
    pub group_by: String,
    /// Aggregation method for document-level scores. Default: `"max"`.
    #[serde(default = "default_doc_agg")]
    // NOTE(review): parsed but apparently not read yet — confirm before removing the allow.
    #[allow(dead_code)]
    pub doc_agg: String,
    /// Maximum chunks per document in results. Default: `3`.
    #[serde(default = "default_max_chunks_per_doc")]
    // NOTE(review): parsed but apparently not read yet — confirm before removing the allow.
    #[allow(dead_code)]
    pub max_chunks_per_doc: usize,
}
201
/// Serde fallback for `retrieval.hybrid_alpha` (60% semantic, 40% keyword).
fn default_hybrid_alpha() -> f64 {
    const HYBRID_ALPHA: f64 = 0.6;
    HYBRID_ALPHA
}
/// Serde fallback shared by `retrieval.candidate_k_keyword` and
/// `retrieval.candidate_k_vector`.
fn default_candidate_k() -> i64 {
    const CANDIDATE_K: i64 = 80;
    CANDIDATE_K
}
/// Serde fallback for `retrieval.final_limit`.
fn default_final_limit() -> i64 {
    const FINAL_LIMIT: i64 = 12;
    FINAL_LIMIT
}
/// Serde fallback for `retrieval.group_by`.
fn default_group_by() -> String {
    String::from("document")
}
/// Serde fallback for `retrieval.doc_agg`.
fn default_doc_agg() -> String {
    String::from("max")
}
/// Serde fallback for `retrieval.max_chunks_per_doc`.
fn default_max_chunks_per_doc() -> usize {
    const MAX_CHUNKS_PER_DOC: usize = 3;
    MAX_CHUNKS_PER_DOC
}
220
/// Embedding provider configuration.
///
/// Controls which embedding provider is used and its parameters.
/// When `provider = "disabled"`, no embeddings are generated and
/// semantic/hybrid search modes will return errors.
///
/// The whole `[embedding]` section may be omitted; `Config` then falls back
/// to `EmbeddingConfig::default()`, which disables embeddings.
///
/// # Providers
///
/// | Provider | Description |
/// |----------|-------------|
/// | `"disabled"` | No embeddings (default) |
/// | `"openai"` | OpenAI API (`text-embedding-3-small`, etc.) |
/// | `"ollama"` | Local Ollama instance (`nomic-embed-text`, etc.) |
/// | `"local"` | Built-in models via fastembed (primary) or tract (musl/Intel Mac) (`all-minilm-l6-v2`, etc.) |
///
/// When using `"openai"`, the `OPENAI_API_KEY` environment variable must be set.
/// When using `"ollama"`, an Ollama instance must be running (default: `http://localhost:11434`).
/// When using `"local"`, the model is downloaded on first use and cached in `~/.cache/huggingface/`.
///
/// # Validation
///
/// [`load_config`] rejects any provider name other than the four above, and
/// for `"openai"` / `"ollama"` requires `model` to be present and `dims` to
/// be present and non-zero.
#[derive(Debug, Deserialize, Clone)]
pub struct EmbeddingConfig {
    /// Provider name: `"disabled"`, `"openai"`, `"ollama"`, or `"local"`. Default: `"disabled"`.
    #[serde(default = "default_provider")]
    pub provider: String,
    /// Embedding model name (e.g. `"text-embedding-3-small"`, `"nomic-embed-text"`,
    /// `"all-minilm-l6-v2"`). Required for `openai` and `ollama`; optional for `local`
    /// (defaults to `"all-minilm-l6-v2"`).
    #[serde(default)]
    pub model: Option<String>,
    /// Embedding vector dimensionality (e.g. `1536` for `text-embedding-3-small`).
    /// Required for `openai` and `ollama`; auto-detected for `local`.
    #[serde(default)]
    pub dims: Option<usize>,
    /// Number of texts to embed per batch. Default: `64`.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
    /// Maximum retry attempts for transient API errors. Default: `5`.
    #[serde(default = "default_max_retries")]
    pub max_retries: u32,
    /// HTTP timeout per request in seconds. Default: `30`.
    #[serde(default = "default_timeout_secs")]
    pub timeout_secs: u64,
    /// Base URL for Ollama API. Default: `"http://localhost:11434"`.
    // The serde default here is `None`; the documented URL is presumably
    // substituted by the code that consumes this field — confirm there.
    #[serde(default)]
    pub url: Option<String>,
}
266
267impl Default for EmbeddingConfig {
268    fn default() -> Self {
269        Self {
270            provider: "disabled".to_string(),
271            model: None,
272            dims: None,
273            batch_size: 64,
274            max_retries: 5,
275            timeout_secs: 30,
276            url: None,
277        }
278    }
279}
280
/// Serde fallback for `embedding.provider`: embeddings are off unless requested.
fn default_provider() -> String {
    String::from("disabled")
}
/// Serde fallback for `embedding.batch_size`.
fn default_batch_size() -> usize {
    const BATCH_SIZE: usize = 64;
    BATCH_SIZE
}
/// Serde fallback for `embedding.max_retries`.
fn default_max_retries() -> u32 {
    const MAX_RETRIES: u32 = 5;
    MAX_RETRIES
}
/// Serde fallback for `embedding.timeout_secs` (seconds per request).
fn default_timeout_secs() -> u64 {
    const TIMEOUT_SECS: u64 = 30;
    TIMEOUT_SECS
}
293
/// HTTP server configuration.
///
/// The `[server]` section and its `bind` key are required (no serde
/// defaults). The value is stored as a plain string; this module does not
/// parse or validate it as a socket address.
#[derive(Debug, Deserialize, Clone)]
pub struct ServerConfig {
    /// Socket address to bind to (e.g. `"127.0.0.1:7331"`).
    pub bind: String,
}
300
/// Container for all connector configurations.
///
/// All connector types use named instances — you can configure multiple
/// of each type. For example:
///
/// ```toml
/// [connectors.git.platform]
/// url = "https://github.com/acme/platform.git"
///
/// [connectors.git.auth-service]
/// url = "https://github.com/acme/auth-service.git"
/// ```
///
/// Use `ctx sync git` to sync all git connectors, or `ctx sync git:platform`
/// for a specific one. `ctx sync all` syncs everything.
///
/// Every map defaults to empty, so any connector type (or the whole
/// `[connectors]` section) may be omitted. Map keys are the `<name>` part of
/// the TOML table header.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct ConnectorsConfig {
    /// Named filesystem connectors: walk local directories.
    #[serde(default)]
    pub filesystem: HashMap<String, FilesystemConnectorConfig>,
    /// Named Git connectors: clone and scan Git repositories.
    #[serde(default)]
    pub git: HashMap<String, GitConnectorConfig>,
    /// Named S3 connectors: list and download from S3 buckets.
    #[serde(default)]
    pub s3: HashMap<String, S3ConnectorConfig>,
    /// Named Lua script connectors.
    /// Each key is a connector name, each value contains the script path
    /// and arbitrary config keys passed to the Lua `connector.scan()` function.
    /// See `docs/LUA_CONNECTORS.md` for the full specification.
    #[serde(default)]
    pub script: HashMap<String, ScriptConnectorConfig>,
}
334
/// Lua script connector configuration.
///
/// Points to a `.lua` file implementing the connector interface. All fields
/// except `path` and `timeout` are passed as a config table to the script's
/// `connector.scan(config)` function.
///
/// Values containing `${VAR_NAME}` are expanded from the process environment.
///
/// # Example
///
/// ```toml
/// [connectors.script.jira]
/// path = "connectors/jira.lua"
/// timeout = 600
/// url = "https://mycompany.atlassian.net"
/// api_token = "${JIRA_API_TOKEN}"
/// project_key = "ENG"
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct ScriptConnectorConfig {
    /// Path to the `.lua` connector script. Required.
    pub path: PathBuf,
    /// Maximum execution time in seconds. Default: `300`.
    #[serde(default = "default_script_timeout")]
    pub timeout: u64,
    /// All other config keys — passed to the Lua `connector.scan()` function.
    // `flatten` collects every key of the table that is not claimed by the
    // named fields above (`path`, `timeout`) into this TOML table.
    #[serde(flatten)]
    pub extra: toml::Table,
}
364
/// Serde fallback for a script connector's `timeout`, in seconds.
fn default_script_timeout() -> u64 {
    const SCRIPT_TIMEOUT_SECS: u64 = 300;
    SCRIPT_TIMEOUT_SECS
}
368
/// Container for all tool script configurations.
///
/// Tool scripts are Lua files that define MCP tools agents can discover
/// and call via the HTTP server. See `docs/LUA_TOOLS.md` for the full
/// specification.
///
/// The `script` map defaults to empty, so the `[tools]` section may be
/// omitted or left without any `[tools.script.<name>]` tables.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct ToolsConfig {
    /// Named Lua tool scripts.
    /// Each key is the tool name, each value contains the script path
    /// and arbitrary config keys accessible via `context.config` in the script.
    #[serde(default)]
    pub script: HashMap<String, ScriptToolConfig>,
}
382
/// Lua tool script configuration.
///
/// Points to a `.lua` file implementing the tool interface. All fields
/// except `path` and `timeout` are passed as `context.config` to the
/// script's `tool.execute(params, context)` function.
///
/// Values containing `${VAR_NAME}` are expanded from the process environment.
///
/// # Example
///
/// ```toml
/// [tools.script.create_jira_ticket]
/// path = "tools/create-jira-ticket.lua"
/// timeout = 30
/// url = "https://mycompany.atlassian.net"
/// api_token = "${JIRA_API_TOKEN}"
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct ScriptToolConfig {
    /// Path to the `.lua` tool script. Required.
    pub path: PathBuf,
    /// Maximum execution time in seconds. Default: `30`.
    #[serde(default = "default_tool_timeout")]
    pub timeout: u64,
    /// All other config keys — accessible via `context.config` in the script.
    // `flatten` collects every key of the table that is not claimed by the
    // named fields above (`path`, `timeout`) into this TOML table.
    #[serde(flatten)]
    pub extra: toml::Table,
}
411
/// Serde fallback for a tool script's `timeout`, in seconds.
fn default_tool_timeout() -> u64 {
    const TOOL_TIMEOUT_SECS: u64 = 30;
    TOOL_TIMEOUT_SECS
}
415
/// Container for all agent configurations.
///
/// Agents are named personas that combine a system prompt, scoped tools,
/// and optional dynamic context injection. They can be defined inline
/// in TOML or via Lua scripts.
///
/// Both maps default to empty, so either kind of agent (or the whole
/// `[agents]` section) may be omitted.
///
/// # Example
///
/// ```toml
/// [agents.inline.code-reviewer]
/// description = "Reviews code against project conventions"
/// tools = ["search", "get"]
/// system_prompt = "You are a senior code reviewer..."
///
/// [agents.script.incident-responder]
/// path = "agents/incident-responder.lua"
/// timeout = 30
/// ```
#[derive(Debug, Deserialize, Clone, Default)]
pub struct AgentsConfig {
    /// Inline TOML agents with static system prompts.
    /// Each key is the agent name, each value contains the prompt and tool list.
    #[serde(default)]
    pub inline: HashMap<String, InlineAgentConfig>,
    /// Lua script agents with dynamic prompt resolution.
    /// Each key is the agent name, each value contains the script path
    /// and arbitrary config keys passed to `agent.resolve()`.
    #[serde(default)]
    pub script: HashMap<String, ScriptAgentConfig>,
}
446
/// Inline (TOML) agent configuration.
///
/// Defines an agent with a static system prompt and fixed tool list.
/// The simplest way to create an agent — no Lua or Rust code needed.
///
/// All three keys are required: none of the fields has a serde default.
///
/// # Example
///
/// ```toml
/// [agents.inline.architect]
/// description = "Answers architecture questions"
/// tools = ["search", "get", "sources"]
/// system_prompt = """
/// You are a software architect. Search for ADRs and design
/// docs to ground your recommendations.
/// """
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct InlineAgentConfig {
    /// One-line description for agent discovery.
    pub description: String,
    /// List of tool names this agent should expose.
    pub tools: Vec<String>,
    /// The system prompt text.
    pub system_prompt: String,
}
472
/// Lua script agent configuration.
///
/// Points to a `.lua` file implementing the agent interface. All fields
/// except `path` and `timeout` are passed as config to the script's
/// `agent.resolve(args, config, context)` function.
///
/// Values containing `${VAR_NAME}` are expanded from the process environment.
///
/// # Example
///
/// ```toml
/// [agents.script.incident-responder]
/// path = "agents/incident-responder.lua"
/// timeout = 30
/// search_limit = 5
/// priority_sources = ["runbooks"]
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct ScriptAgentConfig {
    /// Path to the `.lua` agent script. Required.
    pub path: PathBuf,
    /// Maximum execution time in seconds. Default: `30`.
    #[serde(default = "default_agent_timeout")]
    pub timeout: u64,
    /// All other config keys — passed to the Lua `agent.resolve()` function.
    // `flatten` collects every key of the table that is not claimed by the
    // named fields above (`path`, `timeout`) into this TOML table.
    #[serde(flatten)]
    pub extra: toml::Table,
}
501
/// Serde fallback for a script agent's `timeout`, in seconds.
fn default_agent_timeout() -> u64 {
    const AGENT_TIMEOUT_SECS: u64 = 30;
    AGENT_TIMEOUT_SECS
}
505
/// Extension registry configuration.
///
/// Points to a local directory (optionally backed by a Git repository)
/// containing Lua connector, tool, and agent scripts described by a
/// `registry.toml` manifest.
///
/// Only `path` is required. `url` and `branch` are `Option`s that
/// deserialize to `None` when absent, and both booleans default to `false`.
///
/// # Example
///
/// ```toml
/// [registries.community]
/// url = "https://github.com/parallax-labs/ctx-registry.git"
/// branch = "main"
/// path = "~/.ctx/registries/community"
/// readonly = true
/// auto_update = true
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct RegistryConfig {
    /// Git repository URL to clone from. `None` means local-only (no git).
    pub url: Option<String>,
    /// Git branch or tag to track. Default: `"main"`.
    // This struct stores `None` when the key is omitted; the `"main"`
    // fallback is presumably applied by the consumer — confirm there.
    pub branch: Option<String>,
    /// Local filesystem path where the registry is (or will be) stored.
    // NOTE(review): the example uses a `~/` path; no tilde expansion happens
    // during deserialization in this module — verify it is handled by the consumer.
    pub path: PathBuf,
    /// If `true`, extensions on this path cannot be edited in place;
    /// overrides are copied to a writable registry.
    #[serde(default)]
    pub readonly: bool,
    /// If `true`, `ctx registry update` (and optionally `ctx sync`) will
    /// `git pull` this registry automatically.
    #[serde(default)]
    // NOTE(review): parsed but apparently not read yet — confirm before removing the allow.
    #[allow(dead_code)]
    pub auto_update: bool,
}
540
/// Filesystem connector configuration.
///
/// Scans a local directory tree, applying glob include/exclude filters.
/// See [`crate::connector_fs`] for the scanning implementation.
///
/// Only `root` is required; every other key has a default.
///
/// # Example
///
/// ```toml
/// [connectors.filesystem.docs]
/// root = "./docs"
/// include_globs = ["**/*.md", "**/*.txt"]
/// exclude_globs = ["**/drafts/**"]
/// follow_symlinks = false
/// max_extract_bytes = 50_000_000
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct FilesystemConnectorConfig {
    /// Root directory to scan.
    pub root: PathBuf,
    /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
    #[serde(default = "default_include_globs")]
    pub include_globs: Vec<String>,
    /// Glob patterns for files to exclude. Default: `[]`.
    #[serde(default)]
    pub exclude_globs: Vec<String>,
    /// Whether to follow symbolic links. Default: `false`.
    #[serde(default)]
    pub follow_symlinks: bool,
    /// Size limit, in bytes, for content extraction. Files larger than this
    /// are not extracted; they are skipped and counted as skipped
    /// extractions. Default: `50_000_000`.
    #[serde(default = "default_max_extract_bytes")]
    pub max_extract_bytes: u64,
}
573
/// Git connector configuration.
///
/// Clones (or pulls) a Git repository and scans files within a configurable
/// subdirectory. Extracts per-file metadata from `git log`.
/// See [`crate::connector_git`] for the full implementation.
///
/// Only `url` is required; every other key has a default.
///
/// # Example
///
/// ```toml
/// [connectors.git.platform]
/// url = "https://github.com/acme/platform.git"
/// branch = "main"
/// root = "docs/"
/// include_globs = ["**/*.md"]
/// shallow = true
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct GitConnectorConfig {
    /// Git repository URL (`https://`, `git@`, or local path).
    pub url: String,
    /// Branch to clone/pull. Default: `"main"`.
    #[serde(default = "default_git_branch")]
    pub branch: String,
    /// Subdirectory within the repo to scan. Default: `"."` (entire repo).
    #[serde(default = "default_git_root")]
    pub root: String,
    /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
    #[serde(default = "default_include_globs")]
    pub include_globs: Vec<String>,
    /// Glob patterns for files to exclude. Default: `[]`.
    #[serde(default)]
    pub exclude_globs: Vec<String>,
    /// Use shallow clone (`--depth 1`) to save disk space. Default: `true`.
    #[serde(default = "default_true")]
    pub shallow: bool,
    /// Directory to cache cloned repos. Default: `<db-dir>/.git-cache/<url-hash>/`.
    // Deserializes to `None` when omitted; the documented default location
    // is presumably computed by the connector — confirm there.
    #[serde(default)]
    pub cache_dir: Option<PathBuf>,
}
613
/// Amazon S3 connector configuration.
///
/// Lists and downloads objects from an S3 bucket using the REST API with
/// AWS Signature V4. Supports custom endpoints for S3-compatible services.
/// See [`crate::connector_s3`] for the full implementation.
///
/// Only `bucket` is required; every other key has a default.
///
/// # Environment Variables
///
/// - `AWS_ACCESS_KEY_ID` — required
/// - `AWS_SECRET_ACCESS_KEY` — required
/// - `AWS_SESSION_TOKEN` — optional (for temporary credentials)
///
/// # Example
///
/// ```toml
/// [connectors.s3.runbooks]
/// bucket = "acme-docs"
/// prefix = "engineering/runbooks/"
/// region = "us-east-1"
/// include_globs = ["**/*.md"]
/// # endpoint_url = "http://localhost:9000"   # for MinIO
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct S3ConnectorConfig {
    /// S3 bucket name.
    pub bucket: String,
    /// Key prefix to filter objects. Default: `""` (entire bucket).
    #[serde(default)]
    pub prefix: String,
    /// AWS region. Default: `"us-east-1"`.
    #[serde(default = "default_s3_region")]
    pub region: String,
    /// Glob patterns for object keys to include. Default: `["**/*.md", "**/*.txt"]`.
    #[serde(default = "default_include_globs")]
    pub include_globs: Vec<String>,
    /// Glob patterns for object keys to exclude. Default: `[]`.
    #[serde(default)]
    pub exclude_globs: Vec<String>,
    /// Custom endpoint URL for S3-compatible services (MinIO, LocalStack).
    /// `None` when omitted.
    #[serde(default)]
    pub endpoint_url: Option<String>,
}
656
/// Serde fallback for a Git connector's `branch`.
fn default_git_branch() -> String {
    String::from("main")
}
660
/// Serde fallback for a Git connector's `root`: scan the entire repository.
fn default_git_root() -> String {
    String::from(".")
}
664
/// Serde fallback for boolean options that are on unless explicitly
/// disabled (used by the Git connector's `shallow` flag).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
668
/// Serde fallback for an S3 connector's `region`.
fn default_s3_region() -> String {
    String::from("us-east-1")
}
672
/// Serde fallback for `include_globs` on the filesystem, Git, and S3
/// connectors: Markdown and plain-text files at any depth.
fn default_include_globs() -> Vec<String> {
    const PATTERNS: [&str; 2] = ["**/*.md", "**/*.txt"];
    PATTERNS.iter().map(|glob| (*glob).to_owned()).collect()
}
676
/// Serde fallback for a filesystem connector's `max_extract_bytes`.
fn default_max_extract_bytes() -> u64 {
    const MAX_EXTRACT_BYTES: u64 = 50_000_000;
    MAX_EXTRACT_BYTES
}
680
681impl EmbeddingConfig {
682    /// Returns `true` if an embedding provider is configured (not `"disabled"`).
683    pub fn is_enabled(&self) -> bool {
684        self.provider != "disabled"
685    }
686}
687
688/// Load and validate a configuration file from disk.
689///
690/// # Arguments
691///
692/// * `path` — Path to a TOML configuration file.
693///
694/// # Errors
695///
696/// Returns an error if:
697/// - The file cannot be read or parsed
698/// - `chunking.max_tokens` is zero
699/// - `retrieval.final_limit` is less than 1
700/// - `retrieval.hybrid_alpha` is outside `[0.0, 1.0]`
701/// - Embedding provider is enabled but `model` or `dims` is missing/zero
702/// - Unknown embedding provider name
703pub fn load_config(path: &Path) -> Result<Config> {
704    let content = std::fs::read_to_string(path)
705        .with_context(|| format!("Failed to read config file: {}", path.display()))?;
706
707    let config: Config = toml::from_str(&content).with_context(|| "Failed to parse config file")?;
708
709    // Validate chunking
710    if config.chunking.max_tokens == 0 {
711        anyhow::bail!("chunking.max_tokens must be > 0");
712    }
713
714    // Validate retrieval
715    if config.retrieval.final_limit < 1 {
716        anyhow::bail!("retrieval.final_limit must be >= 1");
717    }
718
719    if !(0.0..=1.0).contains(&config.retrieval.hybrid_alpha) {
720        anyhow::bail!("retrieval.hybrid_alpha must be in [0.0, 1.0]");
721    }
722
723    // Validate embedding
724    match config.embedding.provider.as_str() {
725        "disabled" => {}
726        "openai" | "ollama" => {
727            if config.embedding.dims.is_none() || config.embedding.dims == Some(0) {
728                anyhow::bail!(
729                    "embedding.dims must be > 0 when provider is '{}'",
730                    config.embedding.provider
731                );
732            }
733            if config.embedding.model.is_none() {
734                anyhow::bail!(
735                    "embedding.model must be specified when provider is '{}'",
736                    config.embedding.provider
737                );
738            }
739        }
740        "local" => {
741            // model and dims are optional for local — defaults applied at runtime
742        }
743        other => anyhow::bail!(
744            "Unknown embedding provider: '{}'. Must be disabled, openai, ollama, or local.",
745            other
746        ),
747    }
748
749    Ok(config)
750}