diff --git a/crates/fff-core/src/background_watcher.rs b/crates/fff-core/src/background_watcher.rs index 26c03eaa..d38ffe5a 100644 --- a/crates/fff-core/src/background_watcher.rs +++ b/crates/fff-core/src/background_watcher.rs @@ -655,7 +655,7 @@ fn handle_debounced_events( // git status query even with a pathspec could be really slow, if we do this syncrhronously // within the event handler, we actually risk of forming a snow ball of conflicting events - crate::file_picker::BACKGROUND_THREAD_POOL.spawn(move || { + crate::parallelism::BACKGROUND_THREAD_POOL.spawn(move || { let Some(git_path) = git_workdir else { return }; let Ok(repo) = Repository::open(&git_path) else { error!("Failed to open git repo for async status update"); diff --git a/crates/fff-core/src/bigram_filter.rs b/crates/fff-core/src/bigram_filter.rs index 18ba6f9c..f9b99452 100644 --- a/crates/fff-core/src/bigram_filter.rs +++ b/crates/fff-core/src/bigram_filter.rs @@ -17,6 +17,14 @@ const MAX_BIGRAM_COLUMNS: usize = 5000; /// Sentinel value: bigram has no allocated column. const NO_COLUMN: u16 = u16::MAX; +/// 1024 × u64 = 8 KB covers all 65536 possible bigram keys. +const SEEN_WORDS: usize = 1024; + +thread_local! { + static NORM_BUF: std::cell::RefCell> = + std::cell::RefCell::new(Vec::with_capacity(4096)); +} + /// Temporary sync dense builder for the bigram index. /// Builds from the many threads reading file contents in parallel pub struct BigramIndexBuilder { @@ -90,17 +98,6 @@ impl BigramIndexBuilder { } } - /// SAFETY: caller must not access the same `word_idx` slot from - /// another thread concurrently. Partitioning in - /// `file_picker::build_bigram_index` enforces this. - #[inline(always)] - unsafe fn column_word_ptr(&self, col: u16, word_idx: usize) -> *mut u64 { - unsafe { - self.col_data_ptr() - .add(col as usize * self.words + word_idx) - } - } - /// Test/bench accessor for a column's raw bitset words. Assumes the /// caller has joined all writers (no concurrent mutation). 
#[cfg(test)] @@ -122,35 +119,67 @@ impl BigramIndexBuilder { // Stack-local dedup bitsets: 1024 × u64 = 8 KB each, covers all 65536 // bigram keys with margin. Has to fit in L1 cache. - let mut seen_consec = [0u64; 1024]; - let mut seen_skip = [0u64; 1024]; - - let bytes = content; - let len = bytes.len(); - - let mut n0 = normalize_byte_scalar(bytes[0]); - let mut n1 = normalize_byte_scalar(bytes[1]); + let mut seen_consec = [0u64; SEEN_WORDS]; + let mut seen_skip = [0u64; SEEN_WORDS]; + + let consec_base = self.col_data_ptr(); + let consec_words = self.words; + let skip_base = skip_builder.col_data_ptr(); + let skip_words = skip_builder.words; + + NORM_BUF.with_borrow_mut(|buf| { + let len = content.len(); + if buf.len() < len { + buf.resize(len.next_power_of_two().max(4096), 0); + } - if n0 != u16::MAX && n1 != u16::MAX { - let key = (n0 << 8) | n1; - self.record_bigram(&mut seen_consec, key, word_idx, bit_mask); - } + normalize_bytes(content, &mut buf[..len]); + let n = &buf[..len]; + + let mut n0 = n[0]; + let mut n1 = n[1]; + + if n0 != 0 && n1 != 0 { + let key = (n0 as u16) << 8 | n1 as u16; + self.record_bigram( + &mut seen_consec, + key, + word_idx, + bit_mask, + consec_base, + consec_words, + ); + } - for &b in &bytes[2..len] { - let cur = normalize_byte_scalar(b); - if cur != u16::MAX { - if n1 != u16::MAX { - let key = (n1 << 8) | cur; - self.record_bigram(&mut seen_consec, key, word_idx, bit_mask); - } - if n0 != u16::MAX { - let key = (n0 << 8) | cur; - skip_builder.record_bigram(&mut seen_skip, key, word_idx, bit_mask); + for &cur in &n[2..] 
{
+                if cur != 0 {
+                    if n1 != 0 {
+                        let key = (n1 as u16) << 8 | cur as u16;
+                        self.record_bigram(
+                            &mut seen_consec,
+                            key,
+                            word_idx,
+                            bit_mask,
+                            consec_base,
+                            consec_words,
+                        );
+                    }
+                    if n0 != 0 {
+                        let key = (n0 as u16) << 8 | cur as u16;
+                        skip_builder.record_bigram(
+                            &mut seen_skip,
+                            key,
+                            word_idx,
+                            bit_mask,
+                            skip_base,
+                            skip_words,
+                        );
+                    }
                 }
+                n0 = n1;
+                n1 = cur;
             }
-            n0 = n1;
-            n1 = cur;
-        }
+        });
 
         self.populated.fetch_add(1, Ordering::Relaxed);
         skip_builder.populated.fetch_add(1, Ordering::Relaxed);
@@ -160,22 +189,42 @@ impl BigramIndexBuilder {
     /// and bit position is `bit_mask`, de-duplicating via the caller-owned
     /// `seen` bitmap so we only touch the shared column slab at most once
     /// per unique bigram per file.
     ///
+    /// `col_base` and `words` must be this builder's own `col_data_ptr()`
+    /// and `words`; the caller hoists them out of its per-byte loop.
+    ///
     /// SAFETY: under the partitioning invariant on `add_file_content`
     /// the `word_idx` slot this touches is owned exclusively by the
     /// current thread, so a plain `|=` through the raw pointer is
     /// race-free (no atomic RMW needed).
     #[inline(always)]
-    fn record_bigram(&self, seen: &mut [u64; 1024], key: u16, word_idx: usize, bit_mask: u64) {
+    fn record_bigram(
+        &self,
+        seen: &mut [u64; SEEN_WORDS],
+        key: u16,
+        word_idx: usize,
+        bit_mask: u64,
+        col_base: *mut u64,
+        words: usize,
+    ) {
         let k = key as usize;
         let w = k >> 6;
         let bit = 1u64 << (k & 63);
-        if seen[w] & bit == 0 {
-            seen[w] |= bit;
+        debug_assert_eq!(col_base, self.col_data_ptr());
+        debug_assert_eq!(words, self.words);
+        // SAFETY: w = key/64 with key: u16, so w < 1024 = SEEN_WORDS.
+        let prev = unsafe { *seen.get_unchecked(w) };
+        if prev & bit == 0 {
+            unsafe {
+                *seen.get_unchecked_mut(w) = prev | bit;
+            }
             let col = self.get_or_alloc_column(key);
             if col != NO_COLUMN {
                 unsafe {
-                    let p = self.column_word_ptr(col, word_idx);
+                    // SAFETY: `col_base`/`words` are this builder's slab pointer and
+                    // per-column word count (checked above in debug builds); see the
+                    // function-level SAFETY note for why the plain `|=` is race-free.
+                    let p = col_base.add(col as usize * words + word_idx);
                     *p |= bit_mask;
                 }
             }
@@ -468,22 +504,117 @@ impl BigramFilter {
     }
 }
 
-/// Map a single input byte to its normalised form used by the bigram
-/// builder: `u16::MAX` when not printable ASCII (outside `32..=126`),
-/// otherwise the lowercased byte value in `0..=126`. The `u16::MAX`
-/// sentinel can never collide with a printable-ASCII byte so the consumer
-/// can test `!= u16::MAX` without false positives.
-///
-/// Branchless and `#[inline(always)]`: LLVM lifts the ASCII-range check
-/// and the conditional-lowercase OR into a handful of instructions per
-/// call, so calling this inside a hot loop matches a hand-unrolled
-/// equivalent.
+/// Single-byte normalize: 0 for non-printable, lowercased byte otherwise.
+/// 0 is a safe sentinel: lowered printable bytes are 32..=126.
 #[inline(always)]
-fn normalize_byte_scalar(b: u8) -> u16 {
+fn normalize_byte_scalar(b: u8) -> u8 {
     let printable = b.wrapping_sub(32) <= 94;
-    // Branchless lowercase: OR 0x20 iff byte is in 'A'..='Z'.
     let lower = b | ((b.wrapping_sub(b'A') < 26) as u8 * 0x20);
-    if printable { lower as u16 } else { u16::MAX }
+    if printable { lower } else { 0 }
+}
+
+/// Bulk version: write `dst[i]` = `normalize_byte_scalar(src[i])` for `i`
+/// in `0..src.len()`. Inlined-scalar so LLVM auto-vectorises with the
+/// build's baseline SIMD; on x86_64 we runtime-dispatch to AVX2.
+/// Caller guarantees `dst.len() >= src.len()`.
+#[inline(always)]
+fn normalize_bytes(src: &[u8], dst: &mut [u8]) {
+    assert!(dst.len() >= src.len()); // hard check: SIMD paths store through raw pointers
+    #[cfg(target_arch = "x86_64")]
+    {
+        if std::is_x86_feature_detected!("avx2") {
+            unsafe { normalize_bytes_avx2(src, dst) }; // SAFETY: AVX2 detected; lengths asserted
+            return;
+        }
+    }
+
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    {
+        unsafe { normalize_bytes_neon(src, dst) }; // SAFETY: NEON enabled at compile time
+        return;
+    }
+
+    #[allow(unused)]
+    normalize_bytes_scalar(src, dst);
+}
+
+#[inline(always)]
+fn normalize_bytes_scalar(src: &[u8], dst: &mut [u8]) {
+    for (i, &b) in src.iter().enumerate() {
+        dst[i] = normalize_byte_scalar(b);
+    }
+}
+
+/// AVX2 normalize: 32 bytes/iter. AVX2 only has signed cmp, so unsigned
+/// range checks use `min(max(v, lo), hi) == v`.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn normalize_bytes_avx2(src: &[u8], dst: &mut [u8]) {
+    use std::arch::x86_64::*; // x86_64-only module, hence the x86_64-only cfg above
+    let len = src.len();
+    let mut i = 0;
+    let p_lo = _mm256_set1_epi8(32);
+    let p_hi = _mm256_set1_epi8(126u8 as i8);
+    let u_lo = _mm256_set1_epi8(b'A' as i8);
+    let u_hi = _mm256_set1_epi8(b'Z' as i8);
+    let or20 = _mm256_set1_epi8(0x20);
+    while i + 32 <= len {
+        unsafe {
+            let v = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
+            // printable_mask: v in [32, 126]
+            let clamp_p = _mm256_min_epu8(_mm256_max_epu8(v, p_lo), p_hi);
+            let printable = _mm256_cmpeq_epi8(v, clamp_p);
+            // is_upper_mask: v in [65, 90]
+            let clamp_u = _mm256_min_epu8(_mm256_max_epu8(v, u_lo), u_hi);
+            let is_upper = _mm256_cmpeq_epi8(v, clamp_u);
+            let or_bits = _mm256_and_si256(is_upper, or20);
+            let lower = _mm256_or_si256(v, or_bits);
+            let out = _mm256_and_si256(lower, printable);
+            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, out);
+        }
+        i += 32;
+    }
+    while i < len {
+        dst[i] = normalize_byte_scalar(src[i]);
+        i += 1;
+    }
+}
+
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+#[target_feature(enable = "neon")]
+unsafe fn normalize_bytes_neon(src: &[u8], dst: &mut [u8]) { + use std::arch::aarch64::*; + let len = src.len(); + let mut i = 0; + let v32 = vdupq_n_u8(32); + let v127 = vdupq_n_u8(127); + let va = vdupq_n_u8(b'A'); + let vz1 = vdupq_n_u8(b'Z' + 1); + let v20 = vdupq_n_u8(0x20); + + while i + 16 <= len { + unsafe { + let v = vld1q_u8(src.as_ptr().add(i)); + // printable: v >= 32 AND v < 127 + let ge32 = vcgeq_u8(v, v32); + let lt127 = vcltq_u8(v, v127); + let print_mask = vandq_u8(ge32, lt127); + // is_upper: v >= 'A' AND v < 'Z'+1 + let ge_a = vcgeq_u8(v, va); + let lt_z1 = vcltq_u8(v, vz1); + let upper_mask = vandq_u8(ge_a, lt_z1); + let or_bits = vandq_u8(upper_mask, v20); + let lower = vorrq_u8(v, or_bits); + let out = vandq_u8(lower, print_mask); + + vst1q_u8(dst.as_mut_ptr().add(i), out); + } + i += 16; + } + while i < len { + dst[i] = normalize_byte_scalar(src[i]); + i += 1; + } } pub fn extract_bigrams(content: &[u8]) -> Vec { @@ -640,7 +771,7 @@ pub(crate) fn build_bigram_index( // pass runs detached on the background pool without holding the picker // read lock, so a watcher event mutating a `FileItem` would race any // borrow we took from a cached `Mmap`. - crate::file_picker::BACKGROUND_THREAD_POOL.install(|| { + crate::parallelism::BACKGROUND_THREAD_POOL.install(|| { files .par_chunks(BIGRAM_CHUNK_FILES) .enumerate() diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs index ea526dd5..04692e2a 100644 --- a/crates/fff-core/src/file_picker.rs +++ b/crates/fff-core/src/file_picker.rs @@ -56,43 +56,14 @@ use std::fmt::Debug; use std::ops::ControlFlow; use std::path::{Path, PathBuf}; use std::sync::{ - Arc, LazyLock, + Arc, atomic::{AtomicBool, AtomicUsize, Ordering}, }; use std::thread::JoinHandle; use std::time::SystemTime; use tracing::{Level, debug, error, info, warn}; -/// Dedicated thread pool for background work (scan, warmup, bigram build). 
-/// Uses fewer threads than the global rayon pool so Neovim's event loop -/// and search queries can still get CPU time. -pub(crate) static BACKGROUND_THREAD_POOL: LazyLock = LazyLock::new(|| { - let total = std::thread::available_parallelism() - .map(|p| p.get()) - .unwrap_or(4); - - // benchmarks show that most of the work background tasks spend on waiting for syscalls, - // by halfing available parallelism we loose some performance, but it is mostly nothing - let bg_threads = (total / 2).max(2); - rayon::ThreadPoolBuilder::new() - .num_threads(bg_threads) - .thread_name(|i| format!("fff-bg-{i}")) - .start_handler(|_| { - // Pin workers to the USER_INITIATED QoS class on macOS so the - // scheduler keeps them on P-cores. Without this the kernel is - // free to drift them to E-cores, which are ~2× slower for the - // bigram scan and per-file syscalls. - #[cfg(target_os = "macos")] - unsafe { - let _ = libc::pthread_set_qos_class_self_np( - libc::qos_class_t::QOS_CLASS_USER_INITIATED, - 0, - ); - } - }) - .build() - .expect("failed to create background rayon pool") -}); +use crate::parallelism::{BACKGROUND_THREAD_POOL, SEARCH_THREAD_POOL}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub enum FFFMode { @@ -1208,18 +1179,20 @@ impl FilePicker { .as_deref() .unwrap_or(&self.signals.cancelled); - grep_search( - self.get_files(), - query, - options, - self.cache_budget(), - self.sync_data.bigram_index.as_deref(), - overlay_guard.as_deref(), - cancel, - &self.base_path, - arena, - overflow_arena, - ) + SEARCH_THREAD_POOL.install(|| { + grep_search( + self.get_files(), + query, + options, + self.cache_budget(), + self.sync_data.bigram_index.as_deref(), + overlay_guard.as_deref(), + cancel, + &self.base_path, + arena, + overflow_arena, + ) + }) } /// Multi-pattern grep search across indexed files. 
@@ -1237,19 +1210,21 @@ impl FilePicker { .as_deref() .unwrap_or(&self.signals.cancelled); - multi_grep_search( - self.get_files(), - patterns, - constraints, - options, - self.cache_budget(), - self.sync_data.bigram_index.as_deref(), - overlay_guard.as_deref(), - cancel, - &self.base_path, - arena, - overflow_arena, - ) + SEARCH_THREAD_POOL.install(|| { + multi_grep_search( + self.get_files(), + patterns, + constraints, + options, + self.cache_budget(), + self.sync_data.bigram_index.as_deref(), + overlay_guard.as_deref(), + cancel, + &self.base_path, + arena, + overflow_arena, + ) + }) } // Returns an ongoing or finisshed scan progress diff --git a/crates/fff-core/src/grep.rs b/crates/fff-core/src/grep.rs index f4620b1d..9c69711b 100644 --- a/crates/fff-core/src/grep.rs +++ b/crates/fff-core/src/grep.rs @@ -1213,8 +1213,28 @@ where let mut files_consumed: usize = 0; let mut page_filled = false; - let chunk_size = rayon::current_num_threads() * 4; - for chunk in files_to_search.chunks(chunk_size) { + // Each chunk is a rayon barrier. A flat small chunk over 500k files = ~7800 + // barriers; ×2 growth makes it logarithmic. But a too-aggressive growth + // over-scans: when a page fills mid-chunk, the whole submitted chunk still + // runs. So only grow when the prefilter is weak (large candidate set); + // when bigram cut the set in half, keep fixed small chunks for cheap + // page-fill termination. 
+ let base_chunk = rayon::current_num_threads() * 4; + let prefilter_strong = ctx.total_files > 0 && files_to_search.len() * 2 < ctx.total_files; + let max_chunk = if prefilter_strong { + base_chunk + } else { + (base_chunk * 256).max(8 * 1024) + }; + let growth = if prefilter_strong { 1 } else { 2 }; + let mut chunk_size = base_chunk; + let mut chunk_start = 0; + + while chunk_start < files_to_search.len() { + let chunk_end = (chunk_start + chunk_size).min(files_to_search.len()); + let chunk = &files_to_search[chunk_start..chunk_end]; + chunk_start = chunk_end; + chunk_size = (chunk_size * growth).min(max_chunk); let chunk_offset = files_consumed; let chunk_results: Vec<(usize, &'a FileItem, Vec)> = chunk @@ -1226,17 +1246,21 @@ where // scoped threads with a predefined local scratch buffers because of spawn cost || (Vec::with_capacity(64 * 1024), MmapSlot::default()), |(buf, mmap_slot), (local_idx, file)| { - if ctx.abort_signal.load(Ordering::Relaxed) { - budget_exceeded.store(true, Ordering::Relaxed); - return None; - } + // perform all the atomic machinery on every 8th + if local_idx % 8 == 0 { + let mut need_abort = ctx.abort_signal.load(Ordering::Relaxed); + if !need_abort + && let Some(budget) = time_budget + && all_matches.len() > 1 + && search_start.elapsed() > budget + { + need_abort = true; + } - if let Some(budget) = time_budget - && all_matches.len() > 1 - && search_start.elapsed() > budget - { - budget_exceeded.store(true, Ordering::Relaxed); - return None; + if need_abort { + budget_exceeded.store(true, Ordering::Relaxed); + return None; + } } let content = file.get_content_for_search( diff --git a/crates/fff-core/src/lib.rs b/crates/fff-core/src/lib.rs index f575427f..d45b4f1d 100644 --- a/crates/fff-core/src/lib.rs +++ b/crates/fff-core/src/lib.rs @@ -92,6 +92,7 @@ //! 
```

 mod background_watcher;
+pub(crate) mod parallelism;
 mod scan;
 // public only for benchmarks — the inverted index is still re-exported via
 // `pub use bigram_filter::*` below for external consumers.
diff --git a/crates/fff-core/src/parallelism.rs b/crates/fff-core/src/parallelism.rs
new file mode 100644
index 00000000..7f6c3bbd
--- /dev/null
+++ b/crates/fff-core/src/parallelism.rs
@@ -0,0 +1,84 @@
+//! Dedicated rayon pools. The global pool spans every logical core, which
+//! oversubscribes asymmetric chips (Apple P+E): E-cores are ~2× slower and
+//! `open()` contends on a per-VFS lock past P-core count, so a larger pool is
+//! slower on file-heavy work.
+
+use std::sync::LazyLock;
+
+/// Logical core count reported by the OS, or 4 when it cannot be queried.
+fn logical_core_count() -> usize {
+    std::thread::available_parallelism()
+        .map(|p| p.get())
+        .unwrap_or(4)
+}
+
+/// Worker start hook shared by both pools. On macOS it requests the
+/// USER_INITIATED QoS class for the calling thread: the QoS pin keeps workers
+/// on P-cores; the kernel otherwise drifts them to ~2× slower E-cores.
+/// Compiles to a no-op on other targets.
+fn pin_worker_qos() {
+    #[cfg(target_os = "macos")]
+    // SAFETY: takes no pointers and only changes the scheduling class of the
+    // calling thread; a failure status is deliberately ignored (best effort).
+    unsafe {
+        let _ = libc::pthread_set_qos_class_self_np(
+            libc::qos_class_t::QOS_CLASS_USER_INITIATED,
+            0,
+        );
+    }
+}
+
+/// Dedicated thread pool for background work (scan, warmup, bigram build).
+pub static BACKGROUND_THREAD_POOL: LazyLock<rayon::ThreadPool> = LazyLock::new(|| {
+    // Background work is mostly syscall-bound; halving parallelism leaves
+    // cores for search/UI at negligible throughput cost.
+    let bg_threads = (logical_core_count() / 2).max(2);
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(bg_threads)
+        .thread_name(|i| format!("fff-bg-{i}"))
+        .start_handler(|_| pin_worker_qos())
+        .build()
+        .expect("failed to create background rayon pool")
+});
+
+/// Physical performance-core count via sysctl, falling back to logical cores.
+/// On a 12P+4E M4 Max, grep runs 16t=6.2s vs 13t=4.9s — fewer threads win.
+#[cfg(target_os = "macos")]
+fn performance_core_count() -> usize {
+    let mut count: libc::c_int = 0;
+    let mut size = std::mem::size_of::<libc::c_int>();
+    let name = c"hw.perflevel0.physicalcpu";
+    // SAFETY: `name` is a NUL-terminated literal, `count`/`size` are live
+    // locals with `size` set to `count`'s byte size, and no new value is
+    // written (null pointer, length 0).
+    let ok = unsafe {
+        libc::sysctlbyname(
+            name.as_ptr(),
+            &mut count as *mut _ as *mut libc::c_void,
+            &mut size,
+            std::ptr::null_mut(),
+            0,
+        )
+    };
+    if ok == 0 && count > 0 {
+        count as usize
+    } else {
+        logical_core_count()
+    }
+}
+
+/// Pool for grep content search: P-core sized and QoS-pinned on macOS, full
+/// parallelism elsewhere. Avoids E-core drag and VFS-lock contention.
+pub static SEARCH_THREAD_POOL: LazyLock<rayon::ThreadPool> = LazyLock::new(|| {
+    #[cfg(target_os = "macos")]
+    let threads = performance_core_count();
+    #[cfg(not(target_os = "macos"))]
+    let threads = logical_core_count();
+
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(threads)
+        .thread_name(|i| format!("fff-search-{i}"))
+        .start_handler(|_| pin_worker_qos())
+        .build()
+        .expect("failed to create search rayon pool")
+});
diff --git a/crates/fff-core/src/scan.rs b/crates/fff-core/src/scan.rs
index d13bc020..d39073c0 100644
--- a/crates/fff-core/src/scan.rs
+++ b/crates/fff-core/src/scan.rs
@@ -9,8 +9,9 @@ use crate::FileSync;
 use crate::background_watcher::BackgroundWatcher;
 use crate::bigram_filter::{build_bigram_index, sniff_binary_for_non_indexable};
 use crate::error::Error;
-use crate::file_picker::{BACKGROUND_THREAD_POOL, FFFMode};
+use crate::file_picker::FFFMode;
 use crate::git::GitStatusCache;
+use crate::parallelism::BACKGROUND_THREAD_POOL;
 use crate::shared::{SharedFilePicker, SharedFrecency};
 use crate::simd_path::ArenaPtr;
 use crate::types::ContentCacheBudget;
diff --git a/crates/fff-nvim/benches/grep_bench.rs b/crates/fff-nvim/benches/grep_bench.rs
index
7e5ebbd6..64939946 100644 --- a/crates/fff-nvim/benches/grep_bench.rs +++ b/crates/fff-nvim/benches/grep_bench.rs @@ -12,6 +12,7 @@ struct TestData { } static SETUP: OnceLock = OnceLock::new(); +static SETUP_NO_INDEX: OnceLock = OnceLock::new(); fn big_repo_path() -> String { if let Some(path) = std::env::var_os("BIG_REPO_PATH") { @@ -71,27 +72,38 @@ fn setup() -> &'static TestData { }) } -fn setup_cold() -> SharedFilePicker { - let path = big_repo_path(); - let shared_picker = SharedFilePicker::default(); - let shared_frecency = SharedFrecency::default(); - - FilePicker::new_with_shared_state( - shared_picker.clone(), - shared_frecency.clone(), - FilePickerOptions { - base_path: path, - enable_mmap_cache: false, - enable_content_indexing: false, - mode: FFFMode::Neovim, - watch: false, - ..Default::default() - }, - ) - .expect("create picker"); - - shared_picker.wait_for_scan(Duration::from_secs(120)); - shared_picker +/// Persistent picker with the bigram index disabled — every grep scans all +/// candidate files. Isolates raw scan throughput from bigram prefilter wins. 
+fn setup_no_index() -> &'static TestData { + SETUP_NO_INDEX.get_or_init(|| { + let path = big_repo_path(); + let shared_picker = SharedFilePicker::default(); + let shared_frecency = SharedFrecency::default(); + + eprintln!("Initializing FilePicker (no bigram) for {:?}...", path); + FilePicker::new_with_shared_state( + shared_picker.clone(), + shared_frecency.clone(), + FilePickerOptions { + base_path: path, + enable_mmap_cache: false, + enable_content_indexing: false, + mode: FFFMode::Neovim, + watch: false, + ..Default::default() + }, + ) + .expect("create picker"); + + shared_picker.wait_for_scan(Duration::from_secs(120)); + let file_count = { + let guard = shared_picker.read().expect("read lock"); + guard.as_ref().expect("picker present").get_files().len() + }; + eprintln!("Ready (no bigram): {} files indexed", file_count); + + TestData { shared_picker } + }) } fn plain_options() -> GrepSearchOptions { @@ -118,35 +130,22 @@ fn fuzzy_options() -> GrepSearchOptions { } } +/// One query per selectivity bucket: single-char, common, medium, rare, +/// multi-word, path-constrained. const PLAIN_QUERIES: &[(&str, &str)] = &[ - ("2char_if", "if"), + ("single_char_x", "x"), ("common_return", "return"), ("func_mutex_lock", "mutex_lock"), - ("struct_inode_ops", "inode_operations"), - ("define_MODULE_LICENSE", "MODULE_LICENSE"), ("rare_phylink_ethtool", "phylink_ethtool"), - ("include", "#include"), - ("comment_TODO", "TODO"), - ("type_struct_file", "struct file"), - ("error_EINVAL", "err = -EINVAL"), ("long_static_int_init", "static int __init"), - ("very_common_int", "int"), - ("single_char_x", "x"), ("path_printk_c", "printk *.c"), - ("dir_mutex_kernel", "mutex /kernel/"), ]; +/// Fuzzy is expensive (>1s/iter even on warm). Keep three: exact, typo, abbrev. 
const FUZZY_QUERIES: &[(&str, &str)] = &[ ("exact_mutex_lock", "mutex_lock"), ("typo_mutx_lock", "mutx_lock"), - ("camel_InodeOps", "InodeOps"), ("abbrev_sched_rt", "sched_rt"), - ("short_kfr", "kfr"), - ("common_return", "return"), - ("define_MODULE_LICENSE", "MODULE_LICENSE"), - ("struct_file_ops", "file_operations"), - ("long_static_int_init", "static_int_init"), - ("path_printk_c", "printk *.c"), ]; fn bench_plain_warm(c: &mut Criterion) { @@ -154,9 +153,9 @@ fn bench_plain_warm(c: &mut Criterion) { let opts = plain_options(); let mut group = c.benchmark_group("plain_warm"); - group.sample_size(30); - group.warm_up_time(Duration::from_secs(2)); - group.measurement_time(Duration::from_secs(5)); + group.sample_size(15); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(3)); for (name, query) in PLAIN_QUERIES { group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { @@ -177,9 +176,10 @@ fn bench_fuzzy_warm(c: &mut Criterion) { let opts = fuzzy_options(); let mut group = c.benchmark_group("fuzzy_warm"); + // Fuzzy iters cost >1s; small sample + tight window keeps the suite fast. group.sample_size(10); - group.warm_up_time(Duration::from_secs(2)); - group.measurement_time(Duration::from_secs(8)); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(5)); for (name, query) in FUZZY_QUERIES { group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { @@ -195,37 +195,31 @@ fn bench_fuzzy_warm(c: &mut Criterion) { group.finish(); } -fn bench_plain_cold(c: &mut Criterion) { - let _ = setup(); +/// `bench_plain_warm` with the bigram index off. Side-by-side with the warm +/// group it shows the per-query bigram-prefilter contribution. +fn bench_plain_no_index(c: &mut Criterion) { + let data = setup_no_index(); let opts = plain_options(); + // Common + medium only; rare queries cost 10-15s/iter without bigram. 
let queries: &[(&str, &str)] = &[ - ("2char_if", "if"), ("common_return", "return"), ("func_mutex_lock", "mutex_lock"), - ("struct_inode_ops", "inode_operations"), - ("define_MODULE_LICENSE", "MODULE_LICENSE"), - ("rare_phylink_ethtool", "phylink_ethtool"), - ("long_static_int_init", "static int __init"), ]; - let mut group = c.benchmark_group("plain_cold"); + let mut group = c.benchmark_group("plain_no_index"); group.sample_size(10); - group.warm_up_time(Duration::from_millis(500)); - group.measurement_time(Duration::from_secs(10)); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(5)); for (name, query) in queries { group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { - b.iter_with_setup( - || setup_cold(), - |cold_picker| { - let guard = cold_picker.read().expect("read lock"); - let picker = guard.as_ref().expect("picker present"); - let parsed = parse_grep_query(q); - let result = picker.grep(&parsed, &opts); - black_box(result.matches.len()) - }, - ); + let guard = data.shared_picker.read().expect("read lock"); + let picker = guard.as_ref().expect("picker present"); + b.iter(|| { + let parsed = parse_grep_query(q); + black_box(picker.grep(&parsed, &opts)) + }); }); } @@ -236,7 +230,7 @@ criterion_group!( benches, bench_plain_warm, bench_fuzzy_warm, - bench_plain_cold, + bench_plain_no_index, ); criterion_main!(benches);