diff --git a/crates/fff-core/src/background_watcher.rs b/crates/fff-core/src/background_watcher.rs
index 26c03eaa..d38ffe5a 100644
--- a/crates/fff-core/src/background_watcher.rs
+++ b/crates/fff-core/src/background_watcher.rs
@@ -655,7 +655,7 @@ fn handle_debounced_events(
 
         // git status query even with a pathspec could be really slow, if we do this syncrhronously
         // within the event handler, we actually risk of forming a snow ball of conflicting events
-        crate::file_picker::BACKGROUND_THREAD_POOL.spawn(move || {
+        crate::parallelism::BACKGROUND_THREAD_POOL.spawn(move || {
             let Some(git_path) = git_workdir else { return };
             let Ok(repo) = Repository::open(&git_path) else {
                 error!("Failed to open git repo for async status update");
diff --git a/crates/fff-core/src/bigram_filter.rs b/crates/fff-core/src/bigram_filter.rs
index 18ba6f9c..f9b99452 100644
--- a/crates/fff-core/src/bigram_filter.rs
+++ b/crates/fff-core/src/bigram_filter.rs
@@ -17,6 +17,14 @@ const MAX_BIGRAM_COLUMNS: usize = 5000;
 /// Sentinel value: bigram has no allocated column.
 const NO_COLUMN: u16 = u16::MAX;
 
+/// Length of a dedup bitset in `u64` words: 1024 × 64 bits = one bit per `u16` key (8 KB).
+const SEEN_WORDS: usize = 1024;
+
+thread_local! {
+    static NORM_BUF: std::cell::RefCell<Vec<u8>> =
+        std::cell::RefCell::new(Vec::with_capacity(4096));
+}
+
 /// Temporary sync dense builder for the bigram index.
 /// Builds from the many threads reading file contents in parallel
 pub struct BigramIndexBuilder {
@@ -90,17 +98,6 @@ impl BigramIndexBuilder {
         }
     }
 
-    /// SAFETY: caller must not access the same `word_idx` slot from
-    /// another thread concurrently. Partitioning in
-    /// `file_picker::build_bigram_index` enforces this.
-    #[inline(always)]
-    unsafe fn column_word_ptr(&self, col: u16, word_idx: usize) -> *mut u64 {
-        unsafe {
-            self.col_data_ptr()
-                .add(col as usize * self.words + word_idx)
-        }
-    }
-
     /// Test/bench accessor for a column's raw bitset words. Assumes the
     /// caller has joined all writers (no concurrent mutation).
     #[cfg(test)]
@@ -122,35 +119,67 @@ impl BigramIndexBuilder {
 
         // Stack-local dedup bitsets: 1024 × u64 = 8 KB each, covers all 65536
         // bigram keys with margin. Has to fit in L1 cache.
-        let mut seen_consec = [0u64; 1024];
-        let mut seen_skip = [0u64; 1024];
-
-        let bytes = content;
-        let len = bytes.len();
-
-        let mut n0 = normalize_byte_scalar(bytes[0]);
-        let mut n1 = normalize_byte_scalar(bytes[1]);
+        let mut seen_consec = [0u64; SEEN_WORDS];
+        let mut seen_skip = [0u64; SEEN_WORDS];
+
+        let consec_base = self.col_data_ptr();
+        let consec_words = self.words;
+        let skip_base = skip_builder.col_data_ptr();
+        let skip_words = skip_builder.words;
+
+        NORM_BUF.with_borrow_mut(|buf| {
+            let len = content.len();
+            if buf.len() < len {
+                buf.resize(len.next_power_of_two().max(4096), 0);
+            }
 
-        if n0 != u16::MAX && n1 != u16::MAX {
-            let key = (n0 << 8) | n1;
-            self.record_bigram(&mut seen_consec, key, word_idx, bit_mask);
-        }
+            normalize_bytes(content, &mut buf[..len]);
+            let n = &buf[..len];
+
+            let mut n0 = n[0];
+            let mut n1 = n[1];
+
+            if n0 != 0 && n1 != 0 {
+                let key = (n0 as u16) << 8 | n1 as u16;
+                self.record_bigram(
+                    &mut seen_consec,
+                    key,
+                    word_idx,
+                    bit_mask,
+                    consec_base,
+                    consec_words,
+                );
+            }
 
-        for &b in &bytes[2..len] {
-            let cur = normalize_byte_scalar(b);
-            if cur != u16::MAX {
-                if n1 != u16::MAX {
-                    let key = (n1 << 8) | cur;
-                    self.record_bigram(&mut seen_consec, key, word_idx, bit_mask);
-                }
-                if n0 != u16::MAX {
-                    let key = (n0 << 8) | cur;
-                    skip_builder.record_bigram(&mut seen_skip, key, word_idx, bit_mask);
+            for &cur in &n[2..] {
+                if cur != 0 {
+                    if n1 != 0 {
+                        let key = (n1 as u16) << 8 | cur as u16;
+                        self.record_bigram(
+                            &mut seen_consec,
+                            key,
+                            word_idx,
+                            bit_mask,
+                            consec_base,
+                            consec_words,
+                        );
+                    }
+                    if n0 != 0 {
+                        let key = (n0 as u16) << 8 | cur as u16;
+                        skip_builder.record_bigram(
+                            &mut seen_skip,
+                            key,
+                            word_idx,
+                            bit_mask,
+                            skip_base,
+                            skip_words,
+                        );
+                    }
                 }
+                n0 = n1;
+                n1 = cur;
             }
-            n0 = n1;
-            n1 = cur;
-        }
+        });
 
         self.populated.fetch_add(1, Ordering::Relaxed);
         skip_builder.populated.fetch_add(1, Ordering::Relaxed);
@@ -160,22 +189,29 @@ impl BigramIndexBuilder {
     /// and bit position is `bit_mask`, de-duplicating via the caller-owned
     /// `seen` bitmap so we only touch the shared column slab at most once
     /// per unique bigram per file.
-    ///
-    /// SAFETY: under the partitioning invariant on `add_file_content`
-    /// the `word_idx` slot this touches is owned exclusively by the
-    /// current thread, so a plain `|=` through the raw pointer is
-    /// race-free (no atomic RMW needed).
     #[inline(always)]
-    fn record_bigram(&self, seen: &mut [u64; 1024], key: u16, word_idx: usize, bit_mask: u64) {
+    fn record_bigram(
+        &self,
+        seen: &mut [u64; SEEN_WORDS],
+        key: u16,
+        word_idx: usize,
+        bit_mask: u64,
+        col_base: *mut u64,
+        words: usize,
+    ) {
         let k = key as usize;
         let w = k >> 6;
         let bit = 1u64 << (k & 63);
-        if seen[w] & bit == 0 {
-            seen[w] |= bit;
+        // SAFETY: w = key/64 with key: u16, so w < 1024 = SEEN_WORDS.
+        let prev = unsafe { *seen.get_unchecked(w) };
+        if prev & bit == 0 {
+            unsafe {
+                *seen.get_unchecked_mut(w) = prev | bit;
+            }
             let col = self.get_or_alloc_column(key);
             if col != NO_COLUMN {
                 unsafe {
-                    let p = self.column_word_ptr(col, word_idx);
+                    let p = col_base.add(col as usize * words + word_idx);
                     *p |= bit_mask;
                 }
             }
@@ -468,22 +504,117 @@ impl BigramFilter {
     }
 }
 
-/// Map a single input byte to its normalised form used by the bigram
-/// builder: `u16::MAX` when not printable ASCII (outside `32..=126`),
-/// otherwise the lowercased byte value in `0..=126`. The `u16::MAX`
-/// sentinel can never collide with a printable-ASCII byte so the consumer
-/// can test `!= u16::MAX` without false positives.
-///
-/// Branchless and `#[inline(always)]`: LLVM lifts the ASCII-range check
-/// and the conditional-lowercase OR into a handful of instructions per
-/// call, so calling this inside a hot loop matches a hand-unrolled
-/// equivalent.
+/// Normalizes one byte: printable ASCII (32..=126) is lowercased, anything else becomes 0.
+/// 0 is a safe sentinel because every normalized printable byte stays within 32..=126.
 #[inline(always)]
-fn normalize_byte_scalar(b: u8) -> u16 {
+fn normalize_byte_scalar(b: u8) -> u8 {
     let printable = b.wrapping_sub(32) <= 94;
-    // Branchless lowercase: OR 0x20 iff byte is in 'A'..='Z'.
     let lower = b | ((b.wrapping_sub(b'A') < 26) as u8 * 0x20);
-    if printable { lower as u16 } else { u16::MAX }
+    if printable { lower } else { 0 }
+}
+
+/// Bulk `normalize_byte_scalar`: `dst[i] = normalize_byte_scalar(src[i])` for every `i` in
+/// `0..src.len()`, via AVX2 (runtime-detected on x86), NEON (aarch64) or the scalar loop.
+/// Panics if `dst.len() < src.len()`; `dst` bytes past `src.len()` are left untouched.
+#[inline(always)]
+fn normalize_bytes(src: &[u8], dst: &mut [u8]) {
+    // Hard check: SIMD paths store via raw pointers, so a short `dst` must panic, not overflow.
+    let dst = &mut dst[..src.len()];
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    {
+        if std::is_x86_feature_detected!("avx2") {
+            // SAFETY: AVX2 support was just detected and `dst.len() == src.len()`.
+            unsafe { normalize_bytes_avx2(src, dst) };
+            return;
+        }
+    }
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    {
+        // SAFETY: NEON is enabled at compile time and `dst.len() == src.len()`.
+        unsafe { normalize_bytes_neon(src, dst) };
+        return;
+    }
+    #[allow(unused)] // unreachable when the NEON block above returns unconditionally
+    normalize_bytes_scalar(src, dst);
+}
+
+/// Scalar fallback; subslices `dst` once (panics if shorter than `src`), no per-byte check.
+#[inline(always)]
+fn normalize_bytes_scalar(src: &[u8], dst: &mut [u8]) {
+    let dst = &mut dst[..src.len()];
+    dst.iter_mut().zip(src).for_each(|(out, &b)| *out = normalize_byte_scalar(b));
+}
+
+/// AVX2 normalize, 32 bytes per iteration. AVX2 only has signed byte compares, so unsigned
+/// range checks use `min(max(v, lo), hi) == v`. Panics if `dst` is shorter than `src`.
+///
+/// # Safety
+/// The executing CPU must support AVX2.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[target_feature(enable = "avx2")]
+unsafe fn normalize_bytes_avx2(src: &[u8], dst: &mut [u8]) {
+    #[cfg(target_arch = "x86")]
+    use std::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    use std::arch::x86_64::*;
+    let dst = &mut dst[..src.len()]; // equal lengths keep every 32-byte store in bounds
+    let (p_lo, p_hi) = (_mm256_set1_epi8(32), _mm256_set1_epi8(126u8 as i8));
+    let (u_lo, u_hi) = (_mm256_set1_epi8(b'A' as i8), _mm256_set1_epi8(b'Z' as i8));
+    let or20 = _mm256_set1_epi8(0x20);
+    let simd_len = src.len() & !31;
+    for i in (0..simd_len).step_by(32) {
+        // SAFETY: i + 32 <= simd_len <= src.len() == dst.len(); loadu/storeu are unaligned ops.
+        unsafe {
+            let v = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
+            // printable: v in [32, 126]
+            let printable = _mm256_cmpeq_epi8(v, _mm256_min_epu8(_mm256_max_epu8(v, p_lo), p_hi));
+            // is_upper: v in [65, 90]
+            let is_upper = _mm256_cmpeq_epi8(v, _mm256_min_epu8(_mm256_max_epu8(v, u_lo), u_hi));
+            let lower = _mm256_or_si256(v, _mm256_and_si256(is_upper, or20));
+            let out = _mm256_and_si256(lower, printable);
+            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, out);
+        }
+    }
+    for (out, &b) in dst[simd_len..].iter_mut().zip(&src[simd_len..]) {
+        *out = normalize_byte_scalar(b);
+    }
+}
+
+/// NEON normalize, 16 bytes per iteration. Panics if `dst` is shorter than `src`.
+///
+/// # Safety
+/// The executing CPU must support NEON.
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+#[target_feature(enable = "neon")]
+unsafe fn normalize_bytes_neon(src: &[u8], dst: &mut [u8]) {
+    use std::arch::aarch64::*;
+    let dst = &mut dst[..src.len()]; // equal lengths keep every 16-byte store in bounds
+    let v32 = vdupq_n_u8(32);
+    let v127 = vdupq_n_u8(127);
+    let va = vdupq_n_u8(b'A');
+    let vz1 = vdupq_n_u8(b'Z' + 1);
+    let v20 = vdupq_n_u8(0x20);
+    let simd_len = src.len() & !15;
+    for i in (0..simd_len).step_by(16) {
+        // SAFETY: i + 16 <= simd_len <= src.len() == dst.len(); vld1q/vst1q need no alignment.
+        unsafe {
+            let v = vld1q_u8(src.as_ptr().add(i));
+            // printable: v >= 32 AND v < 127
+            let ge32 = vcgeq_u8(v, v32);
+            let lt127 = vcltq_u8(v, v127);
+            let print_mask = vandq_u8(ge32, lt127);
+            // is_upper: v >= 'A' AND v < 'Z'+1
+            let ge_a = vcgeq_u8(v, va);
+            let lt_z1 = vcltq_u8(v, vz1);
+            let upper_mask = vandq_u8(ge_a, lt_z1);
+            let lower = vorrq_u8(v, vandq_u8(upper_mask, v20));
+            let out = vandq_u8(lower, print_mask);
+            vst1q_u8(dst.as_mut_ptr().add(i), out);
+        }
+    }
+    for (out, &b) in dst[simd_len..].iter_mut().zip(&src[simd_len..]) {
+        *out = normalize_byte_scalar(b);
+    }
 }
 
 pub fn extract_bigrams(content: &[u8]) -> Vec<u16> {
@@ -640,7 +771,7 @@ pub(crate) fn build_bigram_index(
     // pass runs detached on the background pool without holding the picker
     // read lock, so a watcher event mutating a `FileItem` would race any
     // borrow we took from a cached `Mmap`.
-    crate::file_picker::BACKGROUND_THREAD_POOL.install(|| {
+    crate::parallelism::BACKGROUND_THREAD_POOL.install(|| {
         files
             .par_chunks(BIGRAM_CHUNK_FILES)
             .enumerate()
diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs
index ea526dd5..04692e2a 100644
--- a/crates/fff-core/src/file_picker.rs
+++ b/crates/fff-core/src/file_picker.rs
@@ -56,43 +56,14 @@ use std::fmt::Debug;
 use std::ops::ControlFlow;
 use std::path::{Path, PathBuf};
 use std::sync::{
-    Arc, LazyLock,
+    Arc,
     atomic::{AtomicBool, AtomicUsize, Ordering},
 };
 use std::thread::JoinHandle;
 use std::time::SystemTime;
 use tracing::{Level, debug, error, info, warn};
 
-/// Dedicated thread pool for background work (scan, warmup, bigram build).
-/// Uses fewer threads than the global rayon pool so Neovim's event loop
-/// and search queries can still get CPU time.
-pub(crate) static BACKGROUND_THREAD_POOL: LazyLock<rayon::ThreadPool> = LazyLock::new(|| {
-    let total = std::thread::available_parallelism()
-        .map(|p| p.get())
-        .unwrap_or(4);
-
-    // benchmarks show that most of the work background tasks spend on waiting for syscalls,
-    // by halfing available parallelism we loose some performance, but it is mostly nothing
-    let bg_threads = (total / 2).max(2);
-    rayon::ThreadPoolBuilder::new()
-        .num_threads(bg_threads)
-        .thread_name(|i| format!("fff-bg-{i}"))
-        .start_handler(|_| {
-            // Pin workers to the USER_INITIATED QoS class on macOS so the
-            // scheduler keeps them on P-cores. Without this the kernel is
-            // free to drift them to E-cores, which are ~2× slower for the
-            // bigram scan and per-file syscalls.
-            #[cfg(target_os = "macos")]
-            unsafe {
-                let _ = libc::pthread_set_qos_class_self_np(
-                    libc::qos_class_t::QOS_CLASS_USER_INITIATED,
-                    0,
-                );
-            }
-        })
-        .build()
-        .expect("failed to create background rayon pool")
-});
+use crate::parallelism::{BACKGROUND_THREAD_POOL, SEARCH_THREAD_POOL};
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
 pub enum FFFMode {
@@ -1208,18 +1179,20 @@ impl FilePicker {
             .as_deref()
             .unwrap_or(&self.signals.cancelled);
 
-        grep_search(
-            self.get_files(),
-            query,
-            options,
-            self.cache_budget(),
-            self.sync_data.bigram_index.as_deref(),
-            overlay_guard.as_deref(),
-            cancel,
-            &self.base_path,
-            arena,
-            overflow_arena,
-        )
+        SEARCH_THREAD_POOL.install(|| {
+            grep_search(
+                self.get_files(),
+                query,
+                options,
+                self.cache_budget(),
+                self.sync_data.bigram_index.as_deref(),
+                overlay_guard.as_deref(),
+                cancel,
+                &self.base_path,
+                arena,
+                overflow_arena,
+            )
+        })
     }
 
     /// Multi-pattern grep search across indexed files.
@@ -1237,19 +1210,21 @@ impl FilePicker {
             .as_deref()
             .unwrap_or(&self.signals.cancelled);
 
-        multi_grep_search(
-            self.get_files(),
-            patterns,
-            constraints,
-            options,
-            self.cache_budget(),
-            self.sync_data.bigram_index.as_deref(),
-            overlay_guard.as_deref(),
-            cancel,
-            &self.base_path,
-            arena,
-            overflow_arena,
-        )
+        SEARCH_THREAD_POOL.install(|| {
+            multi_grep_search(
+                self.get_files(),
+                patterns,
+                constraints,
+                options,
+                self.cache_budget(),
+                self.sync_data.bigram_index.as_deref(),
+                overlay_guard.as_deref(),
+                cancel,
+                &self.base_path,
+                arena,
+                overflow_arena,
+            )
+        })
     }
 
     // Returns an ongoing or finisshed scan progress
diff --git a/crates/fff-core/src/grep.rs b/crates/fff-core/src/grep.rs
index f4620b1d..9c69711b 100644
--- a/crates/fff-core/src/grep.rs
+++ b/crates/fff-core/src/grep.rs
@@ -1213,8 +1213,28 @@ where
     let mut files_consumed: usize = 0;
     let mut page_filled = false;
 
-    let chunk_size = rayon::current_num_threads() * 4;
-    for chunk in files_to_search.chunks(chunk_size) {
+    // Each chunk is a rayon barrier. A flat small chunk over 500k files = ~7800
+    // barriers; ×2 growth makes it logarithmic. But a too-aggressive growth
+    // over-scans: when a page fills mid-chunk, the whole submitted chunk still
+    // runs. So only grow when the prefilter is weak (large candidate set);
+    // when the candidate set is under half of all files, keep fixed small
+    // chunks for cheap page-fill termination.
+    let base_chunk = rayon::current_num_threads() * 4;
+    let prefilter_strong = ctx.total_files > 0 && files_to_search.len() * 2 < ctx.total_files;
+    let max_chunk = if prefilter_strong {
+        base_chunk
+    } else {
+        (base_chunk * 256).max(8 * 1024)
+    };
+    let growth = if prefilter_strong { 1 } else { 2 };
+    let mut chunk_size = base_chunk;
+    let mut chunk_start = 0;
+
+    while chunk_start < files_to_search.len() {
+        let chunk_end = (chunk_start + chunk_size).min(files_to_search.len());
+        let chunk = &files_to_search[chunk_start..chunk_end];
+        chunk_start = chunk_end;
+        chunk_size = (chunk_size * growth).min(max_chunk);
         let chunk_offset = files_consumed;
 
         let chunk_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = chunk
@@ -1226,17 +1246,21 @@ where
                 // scoped threads with a predefined local scratch buffers because of spawn cost
                 || (Vec::with_capacity(64 * 1024), MmapSlot::default()),
                 |(buf, mmap_slot), (local_idx, file)| {
-                    if ctx.abort_signal.load(Ordering::Relaxed) {
-                        budget_exceeded.store(true, Ordering::Relaxed);
-                        return None;
-                    }
+                    // Check the abort flag and time budget only on every 8th local_idx.
+                    if local_idx % 8 == 0 {
+                        let mut need_abort = ctx.abort_signal.load(Ordering::Relaxed);
+                        if !need_abort
+                            && let Some(budget) = time_budget
+                            && all_matches.len() > 1
+                            && search_start.elapsed() > budget
+                        {
+                            need_abort = true;
+                        }
 
-                    if let Some(budget) = time_budget
-                        && all_matches.len() > 1
-                        && search_start.elapsed() > budget
-                    {
-                        budget_exceeded.store(true, Ordering::Relaxed);
-                        return None;
+                        if need_abort {
+                            budget_exceeded.store(true, Ordering::Relaxed);
+                            return None;
+                        }
                     }
 
                     let content = file.get_content_for_search(
diff --git a/crates/fff-core/src/lib.rs b/crates/fff-core/src/lib.rs
index f575427f..d45b4f1d 100644
--- a/crates/fff-core/src/lib.rs
+++ b/crates/fff-core/src/lib.rs
@@ -92,6 +92,7 @@
 //! ```
 
 mod background_watcher;
+pub(crate) mod parallelism;
 mod scan;
 // public only for benchmarks — the inverted index is still re-exported via
 // `pub use bigram_filter::*` below for external consumers.
diff --git a/crates/fff-core/src/parallelism.rs b/crates/fff-core/src/parallelism.rs
new file mode 100644
index 00000000..7f6c3bbd
--- /dev/null
+++ b/crates/fff-core/src/parallelism.rs
@@ -0,0 +1,84 @@
+//! Dedicated rayon pools. The global pool spans every logical core, which
+//! oversubscribes asymmetric chips (Apple P+E): E-cores are ~2× slower and
+//! `open()` contends on a per-VFS lock past P-core count, so a larger pool is
+//! slower on file-heavy work.
+
+use std::sync::LazyLock;
+
+/// Background pool (scan, warmup, bigram build): half of `available_parallelism`, minimum 2.
+pub static BACKGROUND_THREAD_POOL: LazyLock<rayon::ThreadPool> = LazyLock::new(|| {
+    let total = std::thread::available_parallelism()
+        .map(|p| p.get())
+        .unwrap_or(4);
+
+    // Background work is mostly syscall-bound; halving parallelism leaves
+    // cores for search/UI at negligible throughput cost.
+    let bg_threads = (total / 2).max(2);
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(bg_threads)
+        .thread_name(|i| format!("fff-bg-{i}"))
+        .start_handler(|_| {
+            // QoS pin keeps workers on P-cores (E-cores are ~2× slower).
+            // SAFETY: plain FFI call taking two by-value constants; no pointers are passed.
+            #[cfg(target_os = "macos")]
+            unsafe {
+                let _ = libc::pthread_set_qos_class_self_np(
+                    libc::qos_class_t::QOS_CLASS_USER_INITIATED,
+                    0,
+                );
+            }
+        })
+        .build()
+        .expect("failed to create background rayon pool")
+});
+
+/// Physical performance-core count via sysctl, falling back to logical cores.
+/// On a 12P+4E M4 Max, grep runs 16t=6.2s vs 13t=4.9s — fewer threads win.
+#[cfg(target_os = "macos")]
+fn performance_core_count() -> usize {
+    let mut count: libc::c_int = 0;
+    let mut size = std::mem::size_of::<libc::c_int>();
+    let name = c"hw.perflevel0.physicalcpu";
+    // SAFETY: `name` is NUL-terminated, `count`/`size` are live locals and `size` holds the
+    // byte length of `count`; the null new-value pointer with length 0 makes this a read.
+    let rc = unsafe {
+        libc::sysctlbyname(
+            name.as_ptr(),
+            &mut count as *mut _ as *mut libc::c_void,
+            &mut size,
+            std::ptr::null_mut(),
+            0,
+        )
+    };
+    if rc == 0 && count > 0 {
+        count as usize
+    } else {
+        std::thread::available_parallelism().map_or(4, |p| p.get())
+    }
+}
+
+/// Pool for grep content search: P-core sized and QoS-pinned on macOS, full
+/// parallelism elsewhere. Avoids E-core drag and VFS-lock contention.
+pub static SEARCH_THREAD_POOL: LazyLock<rayon::ThreadPool> = LazyLock::new(|| {
+    #[cfg(target_os = "macos")]
+    let threads = performance_core_count();
+    #[cfg(not(target_os = "macos"))]
+    let threads = std::thread::available_parallelism().map_or(4, |p| p.get());
+
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(threads)
+        .thread_name(|i| format!("fff-search-{i}"))
+        .start_handler(|_| {
+            // QoS pin keeps workers on P-cores (E-cores are ~2× slower).
+            // SAFETY: plain FFI call taking two by-value constants; no pointers are passed.
+            #[cfg(target_os = "macos")]
+            unsafe {
+                let _ = libc::pthread_set_qos_class_self_np(
+                    libc::qos_class_t::QOS_CLASS_USER_INITIATED,
+                    0,
+                );
+            }
+        })
+        .build()
+        .expect("failed to create search rayon pool")
+});
diff --git a/crates/fff-core/src/scan.rs b/crates/fff-core/src/scan.rs
index d13bc020..d39073c0 100644
--- a/crates/fff-core/src/scan.rs
+++ b/crates/fff-core/src/scan.rs
@@ -9,8 +9,9 @@ use crate::FileSync;
 use crate::background_watcher::BackgroundWatcher;
 use crate::bigram_filter::{build_bigram_index, sniff_binary_for_non_indexable};
 use crate::error::Error;
-use crate::file_picker::{BACKGROUND_THREAD_POOL, FFFMode};
+use crate::file_picker::FFFMode;
 use crate::git::GitStatusCache;
+use crate::parallelism::BACKGROUND_THREAD_POOL;
 use crate::shared::{SharedFilePicker, SharedFrecency};
 use crate::simd_path::ArenaPtr;
 use crate::types::ContentCacheBudget;
diff --git a/crates/fff-nvim/benches/grep_bench.rs b/crates/fff-nvim/benches/grep_bench.rs
index 7e5ebbd6..64939946 100644
--- a/crates/fff-nvim/benches/grep_bench.rs
+++ b/crates/fff-nvim/benches/grep_bench.rs
@@ -12,6 +12,7 @@ struct TestData {
 }
 
 static SETUP: OnceLock<TestData> = OnceLock::new();
+static SETUP_NO_INDEX: OnceLock<TestData> = OnceLock::new();
 
 fn big_repo_path() -> String {
     if let Some(path) = std::env::var_os("BIG_REPO_PATH") {
@@ -71,27 +72,38 @@ fn setup() -> &'static TestData {
     })
 }
 
-fn setup_cold() -> SharedFilePicker {
-    let path = big_repo_path();
-    let shared_picker = SharedFilePicker::default();
-    let shared_frecency = SharedFrecency::default();
-
-    FilePicker::new_with_shared_state(
-        shared_picker.clone(),
-        shared_frecency.clone(),
-        FilePickerOptions {
-            base_path: path,
-            enable_mmap_cache: false,
-            enable_content_indexing: false,
-            mode: FFFMode::Neovim,
-            watch: false,
-            ..Default::default()
-        },
-    )
-    .expect("create picker");
-
-    shared_picker.wait_for_scan(Duration::from_secs(120));
-    shared_picker
+/// Shared picker with content indexing (the bigram prefilter) turned off, built once and cached
+/// in `SETUP_NO_INDEX`. Every grep scans all candidate files, isolating raw scan throughput.
+fn setup_no_index() -> &'static TestData {
+    SETUP_NO_INDEX.get_or_init(|| {
+        let path = big_repo_path();
+        let shared_picker = SharedFilePicker::default();
+        let shared_frecency = SharedFrecency::default();
+
+        eprintln!("Initializing FilePicker (no bigram) for {:?}...", path);
+        FilePicker::new_with_shared_state(
+            shared_picker.clone(),
+            shared_frecency.clone(),
+            FilePickerOptions {
+                base_path: path,
+                enable_mmap_cache: false,
+                enable_content_indexing: false,
+                mode: FFFMode::Neovim,
+                watch: false,
+                ..Default::default()
+            },
+        )
+        .expect("create picker");
+
+        shared_picker.wait_for_scan(Duration::from_secs(120));
+        let file_count = {
+            let guard = shared_picker.read().expect("read lock");
+            guard.as_ref().expect("picker present").get_files().len()
+        };
+        eprintln!("Ready (no bigram): {} files indexed", file_count);
+
+        TestData { shared_picker }
+    })
 }
 
 fn plain_options() -> GrepSearchOptions {
@@ -118,35 +130,22 @@ fn fuzzy_options() -> GrepSearchOptions {
     }
 }
 
+/// One query per selectivity bucket: single-char, common, medium, rare,
+/// multi-word, path-constrained.
 const PLAIN_QUERIES: &[(&str, &str)] = &[
-    ("2char_if", "if"),
+    ("single_char_x", "x"),
     ("common_return", "return"),
     ("func_mutex_lock", "mutex_lock"),
-    ("struct_inode_ops", "inode_operations"),
-    ("define_MODULE_LICENSE", "MODULE_LICENSE"),
     ("rare_phylink_ethtool", "phylink_ethtool"),
-    ("include", "#include"),
-    ("comment_TODO", "TODO"),
-    ("type_struct_file", "struct file"),
-    ("error_EINVAL", "err = -EINVAL"),
     ("long_static_int_init", "static int __init"),
-    ("very_common_int", "int"),
-    ("single_char_x", "x"),
     ("path_printk_c", "printk *.c"),
-    ("dir_mutex_kernel", "mutex /kernel/"),
 ];
 
+/// Fuzzy is expensive (>1s/iter even on warm). Keep three: exact, typo, abbrev.
 const FUZZY_QUERIES: &[(&str, &str)] = &[
     ("exact_mutex_lock", "mutex_lock"),
     ("typo_mutx_lock", "mutx_lock"),
-    ("camel_InodeOps", "InodeOps"),
     ("abbrev_sched_rt", "sched_rt"),
-    ("short_kfr", "kfr"),
-    ("common_return", "return"),
-    ("define_MODULE_LICENSE", "MODULE_LICENSE"),
-    ("struct_file_ops", "file_operations"),
-    ("long_static_int_init", "static_int_init"),
-    ("path_printk_c", "printk *.c"),
 ];
 
 fn bench_plain_warm(c: &mut Criterion) {
@@ -154,9 +153,9 @@ fn bench_plain_warm(c: &mut Criterion) {
     let opts = plain_options();
 
     let mut group = c.benchmark_group("plain_warm");
-    group.sample_size(30);
-    group.warm_up_time(Duration::from_secs(2));
-    group.measurement_time(Duration::from_secs(5));
+    group.sample_size(15);
+    group.warm_up_time(Duration::from_secs(1));
+    group.measurement_time(Duration::from_secs(3));
 
     for (name, query) in PLAIN_QUERIES {
         group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
@@ -177,9 +176,10 @@ fn bench_fuzzy_warm(c: &mut Criterion) {
     let opts = fuzzy_options();
 
     let mut group = c.benchmark_group("fuzzy_warm");
+    // Fuzzy iters cost >1s; small sample + tight window keeps the suite fast.
     group.sample_size(10);
-    group.warm_up_time(Duration::from_secs(2));
-    group.measurement_time(Duration::from_secs(8));
+    group.warm_up_time(Duration::from_secs(1));
+    group.measurement_time(Duration::from_secs(5));
 
     for (name, query) in FUZZY_QUERIES {
         group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
@@ -195,37 +195,31 @@ fn bench_fuzzy_warm(c: &mut Criterion) {
     group.finish();
 }
 
-fn bench_plain_cold(c: &mut Criterion) {
-    let _ = setup();
+/// `bench_plain_warm` with the bigram index off. Side-by-side with the warm
+/// group it shows the per-query bigram-prefilter contribution.
+fn bench_plain_no_index(c: &mut Criterion) {
+    let data = setup_no_index();
     let opts = plain_options();
 
+    // Common + medium only; rare queries cost 10-15s/iter without bigram.
     let queries: &[(&str, &str)] = &[
-        ("2char_if", "if"),
         ("common_return", "return"),
         ("func_mutex_lock", "mutex_lock"),
-        ("struct_inode_ops", "inode_operations"),
-        ("define_MODULE_LICENSE", "MODULE_LICENSE"),
-        ("rare_phylink_ethtool", "phylink_ethtool"),
-        ("long_static_int_init", "static int __init"),
     ];
 
-    let mut group = c.benchmark_group("plain_cold");
+    let mut group = c.benchmark_group("plain_no_index");
     group.sample_size(10);
-    group.warm_up_time(Duration::from_millis(500));
-    group.measurement_time(Duration::from_secs(10));
+    group.warm_up_time(Duration::from_secs(1));
+    group.measurement_time(Duration::from_secs(5));
 
     for (name, query) in queries {
         group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
-            b.iter_with_setup(
-                || setup_cold(),
-                |cold_picker| {
-                    let guard = cold_picker.read().expect("read lock");
-                    let picker = guard.as_ref().expect("picker present");
-                    let parsed = parse_grep_query(q);
-                    let result = picker.grep(&parsed, &opts);
-                    black_box(result.matches.len())
-                },
-            );
+            let guard = data.shared_picker.read().expect("read lock");
+            let picker = guard.as_ref().expect("picker present");
+            b.iter(|| {
+                let parsed = parse_grep_query(q);
+                black_box(picker.grep(&parsed, &opts))
+            });
         });
     }
 
@@ -236,7 +230,7 @@ criterion_group!(
     benches,
     bench_plain_warm,
     bench_fuzzy_warm,
-    bench_plain_cold,
+    bench_plain_no_index,
 );
 
 criterion_main!(benches);