context_harness/config.rs
1//! Configuration parsing and validation.
2//!
3//! Context Harness is configured via a TOML file (default: `config/ctx.toml`).
4//! The config defines database paths, chunking parameters, embedding provider
5//! settings, retrieval tuning, server bind address, and connector configurations.
6//!
7//! # Example Configuration
8//!
9//! ```toml
10//! [db]
11//! path = "./data/ctx.sqlite"
12//!
13//! [chunking]
14//! max_tokens = 700
15//! overlap_tokens = 80
16//!
17//! [embedding]
//! provider = "openai" # "disabled" | "openai" | "ollama" | "local"
19//! model = "text-embedding-3-small"
20//! dims = 1536
21//!
22//! [retrieval]
23//! final_limit = 12
24//! hybrid_alpha = 0.6 # 0.0 = keyword only, 1.0 = semantic only
25//!
26//! [server]
27//! bind = "127.0.0.1:7331"
28//!
29//! [connectors.filesystem.docs]
30//! root = "./docs"
31//! include_globs = ["**/*.md", "**/*.txt"]
32//!
33//! [connectors.git.platform]
34//! url = "https://github.com/acme/platform.git"
35//! branch = "main"
36//! ```
37//!
38//! # Connectors
39//!
40//! All connector types are **named** — you can configure multiple instances of each:
41//! - **Filesystem** (`[connectors.filesystem.<name>]`) — scan a local directory
42//! - **Git** (`[connectors.git.<name>]`) — clone/pull a Git repository
43//! - **S3** (`[connectors.s3.<name>]`) — list and download from an S3 bucket
44//! - **Script** (`[connectors.script.<name>]`) — custom Lua-scripted data sources
45//!
46//! # Validation
47//!
48//! [`load_config`] performs the following validations:
49//! - `chunking.max_tokens > 0`
50//! - `retrieval.final_limit >= 1`
51//! - `retrieval.hybrid_alpha ∈ [0.0, 1.0]`
//! - When embedding provider is `openai` or `ollama`: `model` must be set and `dims` must be greater than zero
//! - When embedding provider is `local`: `model` and `dims` are optional
53//! - Embedding provider must be one of: `"disabled"`, `"openai"`, `"ollama"`, `"local"`
54
55use anyhow::{Context, Result};
56use serde::Deserialize;
57use std::collections::HashMap;
58use std::path::{Path, PathBuf};
59
/// Top-level configuration structure.
///
/// Deserialized from the TOML config file. The `db`, `chunking`, `retrieval`,
/// and `server` sections are required (although `[retrieval]` may be an empty
/// table, since each of its keys has a default). The `embedding`,
/// `connectors`, `tools`, `agents`, and `registries` sections are marked
/// `#[serde(default)]` and fall back to their `Default` values when omitted.
#[derive(Debug, Deserialize, Clone)]
pub struct Config {
    /// Database connection settings.
    pub db: DbConfig,
    /// Text chunking parameters.
    pub chunking: ChunkingConfig,
    /// Search and retrieval tuning.
    pub retrieval: RetrievalConfig,
    /// Embedding provider settings (defaults to disabled).
    #[serde(default)]
    pub embedding: EmbeddingConfig,
    /// HTTP server bind address.
    // NOTE(review): the `dead_code` allowance suggests some build targets
    // never read this field — confirm.
    #[allow(dead_code)]
    pub server: ServerConfig,
    /// Connector configurations (all optional).
    #[serde(default)]
    pub connectors: ConnectorsConfig,
    /// Tool script configurations (all optional).
    #[serde(default)]
    pub tools: ToolsConfig,
    /// Agent configurations (all optional).
    #[serde(default)]
    pub agents: AgentsConfig,
    /// Extension registry configurations (all optional), keyed by the name
    /// used in the `[registries.<name>]` table header.
    #[serde(default)]
    pub registries: HashMap<String, RegistryConfig>,
}
91
92impl Config {
93 /// Create a minimal config suitable for commands that don't need
94 /// database or connector settings (e.g., `ctx connector test`).
95 pub fn minimal() -> Self {
96 Self {
97 db: DbConfig {
98 path: PathBuf::from("./data/ctx.sqlite"),
99 },
100 chunking: ChunkingConfig {
101 max_tokens: 700,
102 overlap_tokens: 0,
103 },
104 retrieval: RetrievalConfig {
105 hybrid_alpha: default_hybrid_alpha(),
106 candidate_k_keyword: default_candidate_k(),
107 candidate_k_vector: default_candidate_k(),
108 final_limit: default_final_limit(),
109 group_by: default_group_by(),
110 doc_agg: default_doc_agg(),
111 max_chunks_per_doc: default_max_chunks_per_doc(),
112 },
113 embedding: EmbeddingConfig::default(),
114 server: ServerConfig {
115 bind: "127.0.0.1:7331".to_string(),
116 },
117 connectors: ConnectorsConfig::default(),
118 tools: ToolsConfig::default(),
119 agents: AgentsConfig::default(),
120 registries: HashMap::new(),
121 }
122 }
123}
124
/// Database configuration.
///
/// Specifies the path to the SQLite database file. The file and its
/// parent directories are created automatically on first use.
#[derive(Debug, Deserialize, Clone)]
pub struct DbConfig {
    /// Path to the SQLite database file (e.g. `"./data/ctx.sqlite"`).
    ///
    /// Required: the field has no serde default, so `[db] path` must be
    /// present in the config file.
    pub path: PathBuf,
}
134
/// Text chunking parameters.
///
/// Controls how document bodies are split into chunks for indexing
/// and embedding. See [`crate::chunk`] for the chunking algorithm.
#[derive(Debug, Deserialize, Clone)]
pub struct ChunkingConfig {
    /// Maximum tokens per chunk. Chunks are split on paragraph boundaries
    /// to stay within this limit. Converted to characters via `max_tokens × 4`.
    ///
    /// Required (no serde default); [`load_config`] rejects a value of `0`.
    pub max_tokens: usize,
    /// Number of overlapping tokens between adjacent chunks (reserved for future use).
    /// Default: `0`.
    #[serde(default = "default_overlap")]
    // Parsed but, being reserved for future use, not consumed yet.
    #[allow(dead_code)]
    pub overlap_tokens: usize,
}
149
/// Serde default for `chunking.overlap_tokens`: adjacent chunks share nothing.
fn default_overlap() -> usize {
    const NO_OVERLAP: usize = 0;
    NO_OVERLAP
}
153
/// Search and retrieval tuning parameters.
///
/// These settings control how keyword and semantic search results are
/// merged in hybrid mode, and the overall result limits. Every field has a
/// serde default, so an empty `[retrieval]` table is valid.
///
/// # Hybrid Scoring
///
/// The `hybrid_alpha` weight determines the blend between keyword (BM25)
/// and semantic (cosine similarity) scores:
///
/// ```text
/// hybrid_score = (1 - α) × keyword_score + α × semantic_score
/// ```
///
/// - `α = 0.0` → pure keyword search
/// - `α = 1.0` → pure semantic search
/// - `α = 0.6` (default) → 60% semantic, 40% keyword
///
/// See `docs/HYBRID_SCORING.md` for the full specification.
#[derive(Debug, Deserialize, Clone)]
pub struct RetrievalConfig {
    /// Weight for semantic vs. keyword scores in hybrid mode.
    /// Range: `[0.0, 1.0]` (enforced by [`load_config`]). Default: `0.6`.
    #[serde(default = "default_hybrid_alpha")]
    pub hybrid_alpha: f64,
    /// Number of keyword candidates to fetch before merging. Default: `80`.
    ///
    /// Signed, and not range-checked by [`load_config`].
    #[serde(default = "default_candidate_k")]
    pub candidate_k_keyword: i64,
    /// Number of vector candidates to fetch before merging. Default: `80`.
    ///
    /// Signed, and not range-checked by [`load_config`].
    #[serde(default = "default_candidate_k")]
    pub candidate_k_vector: i64,
    /// Maximum number of results to return after merging and ranking. Default: `12`.
    ///
    /// [`load_config`] rejects values below `1`.
    #[serde(default = "default_final_limit")]
    pub final_limit: i64,
    /// Grouping strategy for results. Default: `"document"`.
    // NOTE(review): `dead_code` is allowed here, so this setting is
    // presumably parsed but not consumed yet — confirm.
    #[serde(default = "default_group_by")]
    #[allow(dead_code)]
    pub group_by: String,
    /// Aggregation method for document-level scores. Default: `"max"`.
    // NOTE(review): presumably parsed but not consumed yet — confirm.
    #[serde(default = "default_doc_agg")]
    #[allow(dead_code)]
    pub doc_agg: String,
    /// Maximum chunks per document in results. Default: `3`.
    // NOTE(review): presumably parsed but not consumed yet — confirm.
    #[serde(default = "default_max_chunks_per_doc")]
    #[allow(dead_code)]
    pub max_chunks_per_doc: usize,
}
201
/// Serde default for `retrieval.hybrid_alpha`: 60% semantic, 40% keyword.
fn default_hybrid_alpha() -> f64 {
    const SEMANTIC_WEIGHT: f64 = 0.6;
    SEMANTIC_WEIGHT
}
/// Serde default shared by `retrieval.candidate_k_keyword` and
/// `retrieval.candidate_k_vector`.
fn default_candidate_k() -> i64 {
    const CANDIDATES_PER_SOURCE: i64 = 80;
    CANDIDATES_PER_SOURCE
}
/// Serde default for `retrieval.final_limit`.
fn default_final_limit() -> i64 {
    const RESULTS_RETURNED: i64 = 12;
    RESULTS_RETURNED
}
/// Serde default for `retrieval.group_by`.
fn default_group_by() -> String {
    String::from("document")
}
/// Serde default for `retrieval.doc_agg`.
fn default_doc_agg() -> String {
    String::from("max")
}
/// Serde default for `retrieval.max_chunks_per_doc`.
fn default_max_chunks_per_doc() -> usize {
    const CHUNKS_PER_DOC: usize = 3;
    CHUNKS_PER_DOC
}
220
/// Embedding provider configuration.
///
/// Controls which embedding provider is used and its parameters.
/// When `provider = "disabled"`, no embeddings are generated and
/// semantic/hybrid search modes will return errors.
///
/// # Providers
///
/// | Provider | Description |
/// |----------|-------------|
/// | `"disabled"` | No embeddings (default) |
/// | `"openai"` | OpenAI API (`text-embedding-3-small`, etc.) |
/// | `"ollama"` | Local Ollama instance (`nomic-embed-text`, etc.) |
/// | `"local"` | Built-in models via fastembed (primary) or tract (musl/Intel Mac) (`all-minilm-l6-v2`, etc.) |
///
/// When using `"openai"`, the `OPENAI_API_KEY` environment variable must be set.
/// When using `"ollama"`, an Ollama instance must be running (default: `http://localhost:11434`).
/// When using `"local"`, the model is downloaded on first use and cached in `~/.cache/huggingface/`.
///
/// The provider name is kept as a plain string; [`load_config`] rejects any
/// value outside the four listed above.
#[derive(Debug, Deserialize, Clone)]
pub struct EmbeddingConfig {
    /// Provider name: `"disabled"`, `"openai"`, `"ollama"`, or `"local"`. Default: `"disabled"`.
    #[serde(default = "default_provider")]
    pub provider: String,
    /// Embedding model name (e.g. `"text-embedding-3-small"`, `"nomic-embed-text"`,
    /// `"all-minilm-l6-v2"`). Required for `openai` and `ollama`; optional for `local`
    /// (defaults to `"all-minilm-l6-v2"`).
    #[serde(default)]
    pub model: Option<String>,
    /// Embedding vector dimensionality (e.g. `1536` for `text-embedding-3-small`).
    /// Required (and non-zero) for `openai` and `ollama`; auto-detected for `local`.
    #[serde(default)]
    pub dims: Option<usize>,
    /// Number of texts to embed per batch. Default: `64`.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
    /// Maximum retry attempts for transient API errors. Default: `5`.
    #[serde(default = "default_max_retries")]
    pub max_retries: u32,
    /// HTTP timeout per request in seconds. Default: `30`.
    #[serde(default = "default_timeout_secs")]
    pub timeout_secs: u64,
    /// Base URL for Ollama API. Default: `"http://localhost:11434"`.
    // NOTE(review): an omitted key deserializes to `None`; the documented
    // default URL is presumably applied by the provider code — confirm.
    #[serde(default)]
    pub url: Option<String>,
}
266
267impl Default for EmbeddingConfig {
268 fn default() -> Self {
269 Self {
270 provider: "disabled".to_string(),
271 model: None,
272 dims: None,
273 batch_size: 64,
274 max_retries: 5,
275 timeout_secs: 30,
276 url: None,
277 }
278 }
279}
280
/// Serde default for `embedding.provider`: embeddings are off.
fn default_provider() -> String {
    String::from("disabled")
}
/// Serde default for `embedding.batch_size`.
fn default_batch_size() -> usize {
    const TEXTS_PER_BATCH: usize = 64;
    TEXTS_PER_BATCH
}
/// Serde default for `embedding.max_retries`.
fn default_max_retries() -> u32 {
    const RETRY_ATTEMPTS: u32 = 5;
    RETRY_ATTEMPTS
}
/// Serde default for `embedding.timeout_secs`.
fn default_timeout_secs() -> u64 {
    const REQUEST_TIMEOUT_SECS: u64 = 30;
    REQUEST_TIMEOUT_SECS
}
293
/// HTTP server configuration.
#[derive(Debug, Deserialize, Clone)]
pub struct ServerConfig {
    /// Socket address to bind to (e.g. `"127.0.0.1:7331"`).
    ///
    /// Required (no serde default). Kept as a plain string; [`load_config`]
    /// does not parse or validate it.
    pub bind: String,
}
300
/// Container for all connector configurations.
///
/// All connector types use named instances — you can configure multiple
/// of each type. For example:
///
/// ```toml
/// [connectors.git.platform]
/// url = "https://github.com/acme/platform.git"
///
/// [connectors.git.auth-service]
/// url = "https://github.com/acme/auth-service.git"
/// ```
///
/// Use `ctx sync git` to sync all git connectors, or `ctx sync git:platform`
/// for a specific one. `ctx sync all` syncs everything.
///
/// Each map is keyed by the instance name from the table header and is
/// empty when its connector type does not appear in the config file.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct ConnectorsConfig {
    /// Named filesystem connectors: walk local directories.
    #[serde(default)]
    pub filesystem: HashMap<String, FilesystemConnectorConfig>,
    /// Named Git connectors: clone and scan Git repositories.
    #[serde(default)]
    pub git: HashMap<String, GitConnectorConfig>,
    /// Named S3 connectors: list and download from S3 buckets.
    #[serde(default)]
    pub s3: HashMap<String, S3ConnectorConfig>,
    /// Named Lua script connectors.
    /// Each key is a connector name, each value contains the script path
    /// and arbitrary config keys passed to the Lua `connector.scan()` function.
    /// See `docs/LUA_CONNECTORS.md` for the full specification.
    #[serde(default)]
    pub script: HashMap<String, ScriptConnectorConfig>,
}
334
/// Lua script connector configuration.
///
/// Points to a `.lua` file implementing the connector interface. All fields
/// except `path` and `timeout` are passed as a config table to the script's
/// `connector.scan(config)` function.
///
/// Values containing `${VAR_NAME}` are expanded from the process environment.
// NOTE(review): deserialization here stores the raw strings; the expansion is
// presumably applied where the table is handed to Lua — confirm.
///
/// # Example
///
/// ```toml
/// [connectors.script.jira]
/// path = "connectors/jira.lua"
/// timeout = 600
/// url = "https://mycompany.atlassian.net"
/// api_token = "${JIRA_API_TOKEN}"
/// project_key = "ENG"
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct ScriptConnectorConfig {
    /// Path to the `.lua` connector script. Required (no serde default).
    pub path: PathBuf,
    /// Maximum execution time in seconds. Default: `300`.
    #[serde(default = "default_script_timeout")]
    pub timeout: u64,
    /// All other config keys — passed to the Lua `connector.scan()` function.
    ///
    /// `#[serde(flatten)]` collects every key of the table that is not
    /// claimed by `path` or `timeout`.
    #[serde(flatten)]
    pub extra: toml::Table,
}
364
/// Serde default for a script connector's `timeout`, in seconds.
fn default_script_timeout() -> u64 {
    const CONNECTOR_TIMEOUT_SECS: u64 = 300;
    CONNECTOR_TIMEOUT_SECS
}
368
/// Container for all tool script configurations.
///
/// Tool scripts are Lua files that define MCP tools agents can discover
/// and call via the HTTP server. See `docs/LUA_TOOLS.md` for the full
/// specification.
///
/// Configured under `[tools.script.<name>]`; the map is empty when no such
/// table is present.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct ToolsConfig {
    /// Named Lua tool scripts.
    /// Each key is the tool name, each value contains the script path
    /// and arbitrary config keys accessible via `context.config` in the script.
    #[serde(default)]
    pub script: HashMap<String, ScriptToolConfig>,
}
382
/// Lua tool script configuration.
///
/// Points to a `.lua` file implementing the tool interface. All fields
/// except `path` and `timeout` are passed as `context.config` to the
/// script's `tool.execute(params, context)` function.
///
/// Values containing `${VAR_NAME}` are expanded from the process environment.
// NOTE(review): deserialization here stores the raw strings; the expansion is
// presumably applied where the table is handed to Lua — confirm.
///
/// # Example
///
/// ```toml
/// [tools.script.create_jira_ticket]
/// path = "tools/create-jira-ticket.lua"
/// timeout = 30
/// url = "https://mycompany.atlassian.net"
/// api_token = "${JIRA_API_TOKEN}"
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct ScriptToolConfig {
    /// Path to the `.lua` tool script. Required (no serde default).
    pub path: PathBuf,
    /// Maximum execution time in seconds. Default: `30`.
    #[serde(default = "default_tool_timeout")]
    pub timeout: u64,
    /// All other config keys — accessible via `context.config` in the script.
    ///
    /// `#[serde(flatten)]` collects every key of the table that is not
    /// claimed by `path` or `timeout`.
    #[serde(flatten)]
    pub extra: toml::Table,
}
411
/// Serde default for a tool script's `timeout`, in seconds.
fn default_tool_timeout() -> u64 {
    const TOOL_TIMEOUT_SECS: u64 = 30;
    TOOL_TIMEOUT_SECS
}
415
/// Container for all agent configurations.
///
/// Agents are named personas that combine a system prompt, scoped tools,
/// and optional dynamic context injection. They can be defined inline
/// in TOML or via Lua scripts.
///
/// Both maps are empty when the corresponding tables are absent.
///
/// # Example
///
/// ```toml
/// [agents.inline.code-reviewer]
/// description = "Reviews code against project conventions"
/// tools = ["search", "get"]
/// system_prompt = "You are a senior code reviewer..."
///
/// [agents.script.incident-responder]
/// path = "agents/incident-responder.lua"
/// timeout = 30
/// ```
#[derive(Debug, Deserialize, Clone, Default)]
pub struct AgentsConfig {
    /// Inline TOML agents with static system prompts.
    /// Each key is the agent name, each value contains the prompt and tool list.
    #[serde(default)]
    pub inline: HashMap<String, InlineAgentConfig>,
    /// Lua script agents with dynamic prompt resolution.
    /// Each key is the agent name, each value contains the script path
    /// and arbitrary config keys passed to `agent.resolve()`.
    #[serde(default)]
    pub script: HashMap<String, ScriptAgentConfig>,
}
446
/// Inline (TOML) agent configuration.
///
/// Defines an agent with a static system prompt and fixed tool list.
/// The simplest way to create an agent — no Lua or Rust code needed.
///
/// All three fields are required: none of them carries a serde default.
///
/// # Example
///
/// ```toml
/// [agents.inline.architect]
/// description = "Answers architecture questions"
/// tools = ["search", "get", "sources"]
/// system_prompt = """
/// You are a software architect. Search for ADRs and design
/// docs to ground your recommendations.
/// """
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct InlineAgentConfig {
    /// One-line description for agent discovery.
    pub description: String,
    /// List of tool names this agent should expose.
    pub tools: Vec<String>,
    /// The system prompt text.
    pub system_prompt: String,
}
472
/// Lua script agent configuration.
///
/// Points to a `.lua` file implementing the agent interface. All fields
/// except `path` and `timeout` are passed as config to the script's
/// `agent.resolve(args, config, context)` function.
///
/// Values containing `${VAR_NAME}` are expanded from the process environment.
// NOTE(review): deserialization here stores the raw strings; the expansion is
// presumably applied where the table is handed to Lua — confirm.
///
/// # Example
///
/// ```toml
/// [agents.script.incident-responder]
/// path = "agents/incident-responder.lua"
/// timeout = 30
/// search_limit = 5
/// priority_sources = ["runbooks"]
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct ScriptAgentConfig {
    /// Path to the `.lua` agent script. Required (no serde default).
    pub path: PathBuf,
    /// Maximum execution time in seconds. Default: `30`.
    #[serde(default = "default_agent_timeout")]
    pub timeout: u64,
    /// All other config keys — passed to the Lua `agent.resolve()` function.
    ///
    /// `#[serde(flatten)]` collects every key of the table that is not
    /// claimed by `path` or `timeout`.
    #[serde(flatten)]
    pub extra: toml::Table,
}
501
/// Serde default for an agent script's `timeout`, in seconds.
fn default_agent_timeout() -> u64 {
    const AGENT_TIMEOUT_SECS: u64 = 30;
    AGENT_TIMEOUT_SECS
}
505
/// Extension registry configuration.
///
/// Points to a local directory (optionally backed by a Git repository)
/// containing Lua connector, tool, and agent scripts described by a
/// `registry.toml` manifest.
///
/// # Example
///
/// ```toml
/// [registries.community]
/// url = "https://github.com/parallax-labs/ctx-registry.git"
/// branch = "main"
/// path = "~/.ctx/registries/community"
/// readonly = true
/// auto_update = true
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct RegistryConfig {
    /// Git repository URL to clone from. `None` means local-only (no git).
    pub url: Option<String>,
    /// Git branch or tag to track. Default: `"main"`.
    // NOTE(review): an omitted key deserializes to `None`; the `"main"`
    // fallback is presumably applied by the registry code — confirm.
    pub branch: Option<String>,
    /// Local filesystem path where the registry is (or will be) stored.
    /// Required (no serde default).
    // NOTE(review): the example uses a `~/` path; deserialization stores it
    // verbatim, so tilde expansion must happen in the consumer — confirm.
    pub path: PathBuf,
    /// If `true`, extensions on this path cannot be edited in place;
    /// overrides are copied to a writable registry. Default: `false`.
    #[serde(default)]
    pub readonly: bool,
    /// If `true`, `ctx registry update` (and optionally `ctx sync`) will
    /// `git pull` this registry automatically. Default: `false`.
    // NOTE(review): `dead_code` is allowed, so this flag is presumably parsed
    // but not consumed yet — confirm.
    #[serde(default)]
    #[allow(dead_code)]
    pub auto_update: bool,
}
540
/// Filesystem connector configuration.
///
/// Scans a local directory tree, applying glob include/exclude filters.
/// See [`crate::connector_fs`] for the scanning implementation.
///
/// Only `root` is required; every other key has a default.
///
/// # Example
///
/// ```toml
/// [connectors.filesystem.docs]
/// root = "./docs"
/// include_globs = ["**/*.md", "**/*.txt"]
/// exclude_globs = ["**/drafts/**"]
/// follow_symlinks = false
/// max_extract_bytes = 50_000_000
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct FilesystemConnectorConfig {
    /// Root directory to scan.
    pub root: PathBuf,
    /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
    #[serde(default = "default_include_globs")]
    pub include_globs: Vec<String>,
    /// Glob patterns for files to exclude. Default: `[]`.
    #[serde(default)]
    pub exclude_globs: Vec<String>,
    /// Whether to follow symbolic links. Default: `false`.
    #[serde(default)]
    pub follow_symlinks: bool,
    /// Size limit, in bytes, for content extraction. Files larger than this
    /// are not extracted; they are skipped and counted among the skipped
    /// extractions. Default: `50_000_000`.
    #[serde(default = "default_max_extract_bytes")]
    pub max_extract_bytes: u64,
}
573
/// Git connector configuration.
///
/// Clones (or pulls) a Git repository and scans files within a configurable
/// subdirectory. Extracts per-file metadata from `git log`.
/// See [`crate::connector_git`] for the full implementation.
///
/// Only `url` is required; every other key has a default.
///
/// # Example
///
/// ```toml
/// [connectors.git.platform]
/// url = "https://github.com/acme/platform.git"
/// branch = "main"
/// root = "docs/"
/// include_globs = ["**/*.md"]
/// shallow = true
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct GitConnectorConfig {
    /// Git repository URL (`https://`, `git@`, or local path).
    pub url: String,
    /// Branch to clone/pull. Default: `"main"`.
    #[serde(default = "default_git_branch")]
    pub branch: String,
    /// Subdirectory within the repo to scan. Default: `"."` (entire repo).
    #[serde(default = "default_git_root")]
    pub root: String,
    /// Glob patterns for files to include. Default: `["**/*.md", "**/*.txt"]`.
    #[serde(default = "default_include_globs")]
    pub include_globs: Vec<String>,
    /// Glob patterns for files to exclude. Default: `[]`.
    #[serde(default)]
    pub exclude_globs: Vec<String>,
    /// Use shallow clone (`--depth 1`) to save disk space. Default: `true`.
    #[serde(default = "default_true")]
    pub shallow: bool,
    /// Directory to cache cloned repos. Default: `<db-dir>/.git-cache/<url-hash>/`.
    // NOTE(review): an omitted key deserializes to `None`; the documented
    // default path is presumably computed by the connector — confirm.
    #[serde(default)]
    pub cache_dir: Option<PathBuf>,
}
613
/// Amazon S3 connector configuration.
///
/// Lists and downloads objects from an S3 bucket using the REST API with
/// AWS Signature V4. Supports custom endpoints for S3-compatible services.
/// See [`crate::connector_s3`] for the full implementation.
///
/// Only `bucket` is required; every other key has a default.
///
/// # Environment Variables
///
/// - `AWS_ACCESS_KEY_ID` — required
/// - `AWS_SECRET_ACCESS_KEY` — required
/// - `AWS_SESSION_TOKEN` — optional (for temporary credentials)
///
/// # Example
///
/// ```toml
/// [connectors.s3.runbooks]
/// bucket = "acme-docs"
/// prefix = "engineering/runbooks/"
/// region = "us-east-1"
/// include_globs = ["**/*.md"]
/// # endpoint_url = "http://localhost:9000" # for MinIO
/// ```
#[derive(Debug, Deserialize, Clone)]
pub struct S3ConnectorConfig {
    /// S3 bucket name.
    pub bucket: String,
    /// Key prefix to filter objects. Default: `""` (entire bucket).
    #[serde(default)]
    pub prefix: String,
    /// AWS region. Default: `"us-east-1"`.
    #[serde(default = "default_s3_region")]
    pub region: String,
    /// Glob patterns for object keys to include. Default: `["**/*.md", "**/*.txt"]`.
    #[serde(default = "default_include_globs")]
    pub include_globs: Vec<String>,
    /// Glob patterns for object keys to exclude. Default: `[]`.
    #[serde(default)]
    pub exclude_globs: Vec<String>,
    /// Custom endpoint URL for S3-compatible services (MinIO, LocalStack).
    /// `None` when omitted.
    #[serde(default)]
    pub endpoint_url: Option<String>,
}
656
/// Serde default for a Git connector's `branch`.
fn default_git_branch() -> String {
    String::from("main")
}
660
/// Serde default for a Git connector's `root`: the repository top level.
fn default_git_root() -> String {
    String::from(".")
}
664
/// Serde default for boolean options that are on unless disabled
/// (used by the Git connector's `shallow` flag).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
668
/// Serde default for an S3 connector's `region`.
fn default_s3_region() -> String {
    String::from("us-east-1")
}
672
/// Serde default for `include_globs` on the filesystem, Git, and S3
/// connectors: Markdown and plain-text files at any depth.
fn default_include_globs() -> Vec<String> {
    ["**/*.md", "**/*.txt"]
        .iter()
        .map(|glob| (*glob).to_owned())
        .collect()
}
676
/// Serde default for a filesystem connector's `max_extract_bytes`.
fn default_max_extract_bytes() -> u64 {
    const EXTRACT_LIMIT_BYTES: u64 = 50_000_000;
    EXTRACT_LIMIT_BYTES
}
680
681impl EmbeddingConfig {
682 /// Returns `true` if an embedding provider is configured (not `"disabled"`).
683 pub fn is_enabled(&self) -> bool {
684 self.provider != "disabled"
685 }
686}
687
688/// Load and validate a configuration file from disk.
689///
690/// # Arguments
691///
692/// * `path` — Path to a TOML configuration file.
693///
694/// # Errors
695///
696/// Returns an error if:
697/// - The file cannot be read or parsed
698/// - `chunking.max_tokens` is zero
699/// - `retrieval.final_limit` is less than 1
700/// - `retrieval.hybrid_alpha` is outside `[0.0, 1.0]`
701/// - Embedding provider is enabled but `model` or `dims` is missing/zero
702/// - Unknown embedding provider name
703pub fn load_config(path: &Path) -> Result<Config> {
704 let content = std::fs::read_to_string(path)
705 .with_context(|| format!("Failed to read config file: {}", path.display()))?;
706
707 let config: Config = toml::from_str(&content).with_context(|| "Failed to parse config file")?;
708
709 // Validate chunking
710 if config.chunking.max_tokens == 0 {
711 anyhow::bail!("chunking.max_tokens must be > 0");
712 }
713
714 // Validate retrieval
715 if config.retrieval.final_limit < 1 {
716 anyhow::bail!("retrieval.final_limit must be >= 1");
717 }
718
719 if !(0.0..=1.0).contains(&config.retrieval.hybrid_alpha) {
720 anyhow::bail!("retrieval.hybrid_alpha must be in [0.0, 1.0]");
721 }
722
723 // Validate embedding
724 match config.embedding.provider.as_str() {
725 "disabled" => {}
726 "openai" | "ollama" => {
727 if config.embedding.dims.is_none() || config.embedding.dims == Some(0) {
728 anyhow::bail!(
729 "embedding.dims must be > 0 when provider is '{}'",
730 config.embedding.provider
731 );
732 }
733 if config.embedding.model.is_none() {
734 anyhow::bail!(
735 "embedding.model must be specified when provider is '{}'",
736 config.embedding.provider
737 );
738 }
739 }
740 "local" => {
741 // model and dims are optional for local — defaults applied at runtime
742 }
743 other => anyhow::bail!(
744 "Unknown embedding provider: '{}'. Must be disabled, openai, ollama, or local.",
745 other
746 ),
747 }
748
749 Ok(config)
750}