diff --git a/src/core/index_filter.rs b/src/core/index_filter.rs new file mode 100644 index 0000000..ab0c139 --- /dev/null +++ b/src/core/index_filter.rs @@ -0,0 +1,118 @@ +use std::fs; +use std::path::Path; + +pub(crate) fn should_index_path(path: &Path, max_file_size: u64) -> bool { + if let Ok(metadata) = fs::metadata(path) { + if metadata.len() > max_file_size { + return false; + } + } + + let ext = path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("") + .to_lowercase(); + + matches!( + ext.as_str(), + "rs" | "py" + | "js" + | "ts" + | "tsx" + | "jsx" + | "go" + | "c" + | "cpp" + | "h" + | "hpp" + | "java" + | "kt" + | "swift" + | "rb" + | "php" + | "cs" + | "fs" + | "scala" + | "clj" + | "ex" + | "exs" + | "erl" + | "hs" + | "ml" + | "lua" + | "r" + | "jl" + | "dart" + | "vue" + | "svelte" + | "astro" + | "html" + | "htm" + | "css" + | "scss" + | "sass" + | "less" + | "json" + | "yaml" + | "yml" + | "toml" + | "xml" + | "md" + | "markdown" + | "txt" + | "rst" + | "tex" + | "sh" + | "bash" + | "zsh" + | "fish" + | "ps1" + | "bat" + | "cmd" + | "sql" + | "graphql" + | "proto" + ) || path.file_name().is_some_and(|n| { + let name = n.to_string_lossy().to_lowercase(); + matches!( + name.as_str(), + "dockerfile" + | "makefile" + | "cmakelists.txt" + | "rakefile" + | "gemfile" + | "podfile" + | "vagrantfile" + | ".gitignore" + | ".dockerignore" + | ".env.example" + | "readme" + | "license" + | "changelog" + ) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn does_not_index_extensionless_files_by_default() { + assert!(!should_index_path(Path::new("my_binary"), 512 * 1024)); + } + + #[test] + fn indexes_known_extensionless_filenames() { + assert!(should_index_path(Path::new("Makefile"), 512 * 1024)); + assert!(should_index_path(Path::new("Dockerfile"), 512 * 1024)); + assert!(should_index_path(Path::new("README"), 512 * 1024)); + } + + #[test] + fn indexes_known_extensions() { + assert!(should_index_path(Path::new("src/main.rs"), 512 * 1024)); + assert!(should_index_path(Path::new("script.SH"), 512 * 1024)); + } +} diff --git a/src/core/indexer.rs b/src/core/indexer.rs index 469c120..34bb880 100644 --- a/src/core/indexer.rs +++ b/src/core/indexer.rs @@ -236,97 +236,7 @@ impl Indexer { } fn should_index(&self, path: &Path) -> bool { - if let Ok(metadata) = fs::metadata(path) { - if metadata.len() > self.max_file_size { - return false; - } - } - - let ext = path - .extension() - .and_then(|e| e.to_str()) - .unwrap_or("") - .to_lowercase(); - - matches!( - ext.as_str(), - "rs" | "py" - | "js" - | "ts" - | "tsx" - | "jsx" - | "go" - | "c" - | "cpp" - | "h" - | "hpp" - | "java" - | "kt" - | "swift" - | "rb" - | "php" - | "cs" - | "fs" - | "scala" - | "clj" - | "ex" - | "exs" - | "erl" - | "hs" - | "ml" - | "lua" - | "r" - | "jl" - | "dart" - | "vue" - | "svelte" - | "astro" - | "html" - | "htm" - | "css" - | "scss" - | "sass" - | "less" - | "json" - | "yaml" - | "yml" - | "toml" - | "xml" - | "md" - | "markdown" - | "txt" - | "rst" - | "tex" - | "sh" - | "bash" - | "zsh" - | "fish" - | "ps1" - | "bat" - | "cmd" - | "sql" - | "graphql" - | "proto" - | "" - ) || path.file_name().is_some_and(|n| { - let name = n.to_string_lossy().to_lowercase(); - matches!( - name.as_str(), - "dockerfile" - | "makefile" - | "cmakelists.txt" - | "rakefile" - | "gemfile" - | "podfile" - | "vagrantfile" - | ".gitignore" - | ".dockerignore" - | ".env.example" - | "readme" - | "license" - | "changelog" - ) - }) + super::index_filter::should_index_path(path, self.max_file_size) } fn chunk_content(&self, content: &str) -> Vec { @@ -612,97 +522,7 @@ impl ServerIndexer { } fn should_index(&self, path: &Path) -> bool { - if let Ok(metadata) = fs::metadata(path) { - if metadata.len() > self.max_file_size { - return false; - } - } - - let ext = path - .extension() - .and_then(|e| e.to_str()) - .unwrap_or("") - .to_lowercase(); - - matches!( - ext.as_str(), - "rs" | "py" - | "js" - | "ts" - | "tsx" - | "jsx" - | "go" - | "c" - | "cpp" - | "h" - | "hpp" - | "java" - | "kt" - | "swift" - | "rb" - | "php" - | "cs" - | "fs" - | "scala" - | "clj" - | "ex" - | "exs" - | "erl" - | "hs" - | "ml" - | "lua" - | "r" - | "jl" - | "dart" - | "vue" - | "svelte" - | "astro" - | "html" - | "htm" - | "css" - | "scss" - | "sass" - | "less" - | "json" - | "yaml" - | "yml" - | "toml" - | "xml" - | "md" - | "markdown" - | "txt" - | "rst" - | "tex" - | "sh" - | "bash" - | "zsh" - | "fish" - | "ps1" - | "bat" - | "cmd" - | "sql" - | "graphql" - | "proto" - | "" - ) || path.file_name().is_some_and(|n| { - let name = n.to_string_lossy().to_lowercase(); - matches!( - name.as_str(), - "dockerfile" - | "makefile" - | "cmakelists.txt" - | "rakefile" - | "gemfile" - | "podfile" - | "vagrantfile" - | ".gitignore" - | ".dockerignore" - | ".env.example" - | "readme" - | "license" - | "changelog" - ) - }) + super::index_filter::should_index_path(path, self.max_file_size) } fn chunk_content(&self, content: &str) -> Vec { diff --git a/src/core/mod.rs b/src/core/mod.rs index ba223a5..b04fdfa 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -2,6 +2,7 @@ mod db; mod embeddings; +pub(crate) mod index_filter; mod indexer; mod search; diff --git a/src/watcher.rs b/src/watcher.rs index 816219f..9f42b5f 100644 --- a/src/watcher.rs +++ b/src/watcher.rs @@ -168,99 +168,7 @@ impl FileWatcher { } } - // Check file size - if let Ok(metadata) = std::fs::metadata(path) { - if metadata.len() > self.config.max_file_size { - return false; - } - } - - // Check extension - let ext = path - .extension() - .and_then(|e| e.to_str()) - .unwrap_or("") - .to_lowercase(); - - matches!( - ext.as_str(), - "rs" | "py" - | "js" - | "ts" - | "tsx" - | "jsx" - | "go" - | "c" - | "cpp" - | "h" - | "hpp" - | "java" - | "kt" - | "swift" - | "rb" - | "php" - | "cs" - | "fs" - | "scala" - | "clj" - | "ex" - | "exs" - | "erl" - | "hs" - | "ml" - | "lua" - | "r" - | "jl" - | "dart" - | "vue" - | "svelte" - | "astro" - | "html" - | "htm" - | "css" - | "scss" - | "sass" - | "less" - | "json" - | "yaml" - | "yml" - | "toml" - | "xml" - | "md" - | "markdown" - | "txt" - | "rst" - | "tex" - | "sh" - | "bash" - | "zsh" - | "fish" - | "ps1" - | "bat" - | "cmd" - | "sql" - | "graphql" - | "proto" - | "" - ) || path.file_name().is_some_and(|n| { - let name = n.to_string_lossy().to_lowercase(); - matches!( - name.as_str(), - "dockerfile" - | "makefile" - | "cmakelists.txt" - | "rakefile" - | "gemfile" - | "podfile" - | "vagrantfile" - | ".gitignore" - | ".dockerignore" - | ".env.example" - | "readme" - | "license" - | "changelog" - ) - }) + crate::core::index_filter::should_index_path(path, self.config.max_file_size) } fn index_all(&self) -> Result<()> { @@ -453,3 +361,20 @@ struct FileChunk { start_line: i32, end_line: i32, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn should_not_index_extensionless_files() { + let watcher = FileWatcher::new(Config::default(), PathBuf::from(".")); + assert!(!watcher.should_index(Path::new("my_binary"))); + } + + #[test] + fn should_index_known_extensionless_filenames() { + let watcher = FileWatcher::new(Config::default(), PathBuf::from(".")); + assert!(watcher.should_index(Path::new("Makefile"))); + } +}