From 157e98bd676e6446744cda4d5a244fd0eabb9567 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 09:16:58 -0500 Subject: [PATCH 01/23] agentgrep(feat[cli]): Reintroduce search subparser with ranking flags (#17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: search returns with genuine differentiation from grep — rapidfuzz relevance ranking, near-duplicate collapsing, and session grouping. what: - Add SearchArgs with threshold, no_group, no_rank fields - Register search subparser with ranking-specific flags - Add SEARCH_DESCRIPTION and main() dispatch - Add parse tests --- src/agentgrep/__init__.py | 21 ++++ src/agentgrep/cli/parser.py | 164 ++++++++++++++++++++++++- src/agentgrep/cli/render.py | 16 ++- tests/test_cli_search.py | 235 ++++++++++++++++++++++++++++++++++++ 4 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 tests/test_cli_search.py diff --git a/src/agentgrep/__init__.py b/src/agentgrep/__init__.py index 1965f8d..b87962d 100644 --- a/src/agentgrep/__init__.py +++ b/src/agentgrep/__init__.py @@ -239,6 +239,23 @@ def build_description( ), ), ) +SEARCH_DESCRIPTION = build_description( + """ + Smart search with relevance ranking, deduplication, and session grouping. + Uses rapidfuzz for scoring — results sorted by match quality. + """, + ( + ( + None, + ( + "agentgrep search streaming parser", + "agentgrep search --threshold 70 migration", + "agentgrep search --no-rank --no-group caching", + "agentgrep search bliss --json", + ), + ), + ), +) GREP_DESCRIPTION = build_description( """ Content search across normalized records with rg/ag-shaped flags. 
@dataclasses.dataclass(slots=True)
class SearchArgs:
    """Typed arguments for ``agentgrep search``.

    Differentiates from ``grep`` by applying rapidfuzz relevance scoring,
    near-duplicate collapsing (WRatio >= 90 by default), and session
    grouping to produce a best-first result set.
    """

    # Positional TERM values; when the terms compiled into a query, only
    # the residual (non-compiled) terms are stored here.
    terms: tuple[str, ...]
    # Agents selected through the shared agent options.
    agents: tuple[AgentName, ...]
    # ``--type``: one of "prompts", "history", "all" (default "prompts").
    search_type: SearchType
    # ``--any``: match any term (OR) instead of all terms (AND).
    any_term: bool
    # ``--regex``: treat terms as regex patterns.
    regex: bool
    # ``--case-sensitive``: force case-sensitive matching.
    case_sensitive: bool
    # ``--limit N``: cap on results, applied after ranking; ``None`` = no
    # cap. Validated to be >= 1 when given.
    limit: int | None
    output_mode: OutputMode
    color_mode: ColorMode
    # ``--progress`` / ``--no-progress``: "auto", "always", or "never".
    progress_mode: ProgressMode
    # ``--threshold N``: minimum fuzzy score, validated to 0-100;
    # 0 keeps every match.
    threshold: int = 0
    # ``--no-group``: flat results, no session grouping.
    no_group: bool = False
    # ``--no-rank``: discovery order, no relevance scoring.
    no_rank: bool = False
    # Compiled form of the terms when ``_maybe_compile_query`` produced
    # one, otherwise ``None``.
    compiled: CompiledQuery | None = None
    # The original terms joined with single spaces.
    raw_query: str = ""
def _build_search_args(
    namespace: argparse.Namespace,
    *,
    agents: tuple[AgentName, ...],
    output_mode: OutputMode,
    color_mode: ColorMode,
    bundle: ParserBundle,
) -> SearchArgs:
    """Build :class:`SearchArgs` from a parsed argparse namespace.

    Parameters
    ----------
    namespace : argparse.Namespace
        Namespace produced by the ``search`` subparser.
    agents : tuple[AgentName, ...]
        Agents already resolved by the caller.
    output_mode : OutputMode
        Output mode already resolved by the caller.
    color_mode : ColorMode
        Color mode; also applied while an argparse error is emitted.
    bundle : ParserBundle
        Parser bundle; its ``search_parser`` reports validation errors.

    Returns
    -------
    SearchArgs
        Typed arguments for ``agentgrep search``.

    Notes
    -----
    Validation failures go through ``ArgumentParser.error`` on the
    ``search`` subparser, which prints that subcommand's usage and exits.
    """
    terms_list = t.cast("list[str]", namespace.terms)

    limit = t.cast("int | None", namespace.limit)
    if limit is not None and limit < 1:
        with configured_color_environment(color_mode):
            # Report on the search subparser (not the root parser) so the
            # usage line matches the subcommand, like --threshold below.
            bundle.search_parser.error("--limit must be greater than 0")

    threshold = t.cast("int", namespace.threshold)
    if not 0 <= threshold <= 100:
        with configured_color_environment(color_mode):
            bundle.search_parser.error("--threshold must be between 0 and 100")

    search_compiled, residual_terms = _maybe_compile_query(
        terms_list,
        bundle=bundle,
        color_mode=color_mode,
        subparser=bundle.search_parser,
    )
    # When the terms compiled into a query, only the residual terms are
    # kept as plain terms; otherwise every positional term is kept.
    final_terms: tuple[str, ...] = (
        residual_terms if search_compiled is not None else tuple(terms_list)
    )

    return SearchArgs(
        terms=final_terms,
        agents=agents,
        search_type=t.cast("SearchType", namespace.search_type),
        any_term=t.cast("bool", namespace.any_term),
        regex=t.cast("bool", namespace.regex),
        case_sensitive=t.cast("bool", namespace.case_sensitive),
        limit=limit,
        output_mode=output_mode,
        color_mode=color_mode,
        progress_mode=t.cast("ProgressMode", namespace.progress),
        threshold=threshold,
        no_group=t.cast("bool", namespace.no_group),
        no_rank=t.cast("bool", namespace.no_rank),
        compiled=search_compiled,
        raw_query=" ".join(terms_list),
    )
+ + Collects all matching records eagerly, scores them by rapidfuzz + relevance, collapses near-duplicates, groups by session, and + renders in the requested output format. Returns ``0`` when at + least one result survives ranking, ``1`` otherwise. + """ + _ = args + msg = "search command not yet wired — ranking engine pending" + raise SystemExit(msg) + + def _compile_grep_patterns(args: GrepArgs) -> list[re.Pattern[str]]: """Compile :class:`GrepArgs` patterns into regex objects honoring mode/case. diff --git a/tests/test_cli_search.py b/tests/test_cli_search.py new file mode 100644 index 0000000..1270599 --- /dev/null +++ b/tests/test_cli_search.py @@ -0,0 +1,235 @@ +"""Tests for the ``agentgrep search`` subcommand. + +Covers argument parsing into :class:`agentgrep.SearchArgs`, the +ranking-specific flags (``--threshold``, ``--no-group``, ``--no-rank``), +and the integration between the ranking engine and the CLI dispatch. +""" + +from __future__ import annotations + +import typing as t + +import pytest + +import agentgrep + +# --------------------------------------------------------------------------- +# Argument parsing +# --------------------------------------------------------------------------- + + +class SearchParseCase(t.NamedTuple): + """Parametrized case for :func:`agentgrep.parse_args` on ``search``.""" + + test_id: str + argv: tuple[str, ...] + expected_terms: tuple[str, ...] + expected_threshold: int + expected_no_group: bool + expected_no_rank: bool + expected_search_type: agentgrep.SearchType + expected_any_term: bool + expected_regex: bool + expected_case_sensitive: bool + + +SEARCH_PARSE_CASES: tuple[SearchParseCase, ...] 
= ( + SearchParseCase( + "defaults-single-term", + ("search", "bliss"), + ("bliss",), + 0, + False, + False, + "prompts", + False, + False, + False, + ), + SearchParseCase( + "multi-term", + ("search", "streaming", "parser"), + ("streaming", "parser"), + 0, + False, + False, + "prompts", + False, + False, + False, + ), + SearchParseCase( + "threshold-flag", + ("search", "--threshold", "70", "migration"), + ("migration",), + 70, + False, + False, + "prompts", + False, + False, + False, + ), + SearchParseCase( + "no-group-flag", + ("search", "--no-group", "caching"), + ("caching",), + 0, + True, + False, + "prompts", + False, + False, + False, + ), + SearchParseCase( + "no-rank-flag", + ("search", "--no-rank", "bliss"), + ("bliss",), + 0, + False, + True, + "prompts", + False, + False, + False, + ), + SearchParseCase( + "all-ranking-flags", + ("search", "--threshold", "50", "--no-group", "--no-rank", "query"), + ("query",), + 50, + True, + True, + "prompts", + False, + False, + False, + ), + SearchParseCase( + "type-history", + ("search", "--type", "history", "todo"), + ("todo",), + 0, + False, + False, + "history", + False, + False, + False, + ), + SearchParseCase( + "any-term-mode", + ("search", "--any", "foo", "bar"), + ("foo", "bar"), + 0, + False, + False, + "prompts", + True, + False, + False, + ), + SearchParseCase( + "regex-flag", + ("search", "--regex", "foo.*bar"), + ("foo.*bar",), + 0, + False, + False, + "prompts", + False, + True, + False, + ), + SearchParseCase( + "case-sensitive-flag", + ("search", "--case-sensitive", "Bliss"), + ("Bliss",), + 0, + False, + False, + "prompts", + False, + False, + True, + ), + SearchParseCase( + "no-terms", + ("search",), + (), + 0, + False, + False, + "prompts", + False, + False, + False, + ), +) + + +@pytest.mark.parametrize( + SearchParseCase._fields, + SEARCH_PARSE_CASES, + ids=[case.test_id for case in SEARCH_PARSE_CASES], +) +def test_search_parse_args( + test_id: str, + argv: tuple[str, ...], + expected_terms: 
tuple[str, ...], + expected_threshold: int, + expected_no_group: bool, + expected_no_rank: bool, + expected_search_type: agentgrep.SearchType, + expected_any_term: bool, + expected_regex: bool, + expected_case_sensitive: bool, +) -> None: + """Search subparser captures ranking-specific flags correctly.""" + _ = test_id + parsed = agentgrep.parse_args(argv) + assert isinstance(parsed, agentgrep.SearchArgs) + assert parsed.terms == expected_terms + assert parsed.threshold == expected_threshold + assert parsed.no_group == expected_no_group + assert parsed.no_rank == expected_no_rank + assert parsed.search_type == expected_search_type + assert parsed.any_term == expected_any_term + assert parsed.regex == expected_regex + assert parsed.case_sensitive == expected_case_sensitive + + +def test_search_parse_limit() -> None: + """--limit is captured in SearchArgs.""" + parsed = agentgrep.parse_args(("search", "--limit", "5", "bliss")) + assert isinstance(parsed, agentgrep.SearchArgs) + assert parsed.limit == 5 + + +def test_search_parse_output_json() -> None: + """--json sets output_mode correctly.""" + parsed = agentgrep.parse_args(("search", "--json", "bliss")) + assert isinstance(parsed, agentgrep.SearchArgs) + assert parsed.output_mode == "json" + + +def test_search_parse_output_ndjson() -> None: + """--ndjson sets output_mode correctly.""" + parsed = agentgrep.parse_args(("search", "--ndjson", "bliss")) + assert isinstance(parsed, agentgrep.SearchArgs) + assert parsed.output_mode == "ndjson" + + +def test_search_parse_progress_never() -> None: + """--no-progress sets progress_mode to never.""" + parsed = agentgrep.parse_args(("search", "--no-progress", "bliss")) + assert isinstance(parsed, agentgrep.SearchArgs) + assert parsed.progress_mode == "never" + + +def test_search_parse_agent_filter() -> None: + """--agent filters are captured.""" + parsed = agentgrep.parse_args(("search", "--agent", "codex", "bliss")) + assert isinstance(parsed, agentgrep.SearchArgs) + assert 
parsed.agents == ("codex",) From 403f43abd222afadc060fc6571f3ff375ed1e947 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 09:19:11 -0500 Subject: [PATCH 02/23] agentgrep(feat[ranking]): Add rapidfuzz scoring, dedup, and session grouping why: search needs to score results by relevance (best match first), collapse near-duplicates (WRatio > 90), and group by session for a coherent browsing experience. what: - Add ranking.py with rank_search_records (WRatio scoring + sort) - Add collapse_near_duplicates (pairwise similarity, keep representative) - Add group_by_session (OrderedDict grouping by session_id) - Add parametrized tests for all three functions --- src/agentgrep/ranking.py | 130 +++++++++++++++++++ tests/test_ranking.py | 269 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 src/agentgrep/ranking.py create mode 100644 tests/test_ranking.py diff --git a/src/agentgrep/ranking.py b/src/agentgrep/ranking.py new file mode 100644 index 0000000..9942e2e --- /dev/null +++ b/src/agentgrep/ranking.py @@ -0,0 +1,130 @@ +"""Relevance scoring, near-duplicate collapsing, and session grouping. + +The search subcommand collects all engine matches eagerly, then passes +them through the three-stage pipeline exposed here: + +1. :func:`rank_search_records` — score each record against the query + text with rapidfuzz WRatio, filter by threshold, sort best-first. +2. :func:`collapse_near_duplicates` — pairwise WRatio between record + bodies; records above the similarity ceiling are folded into the + highest-scoring representative. +3. :func:`group_by_session` — bucket the surviving records by + ``session_id``, preserving score order within each group. 
def rank_search_records(
    records: list[SearchRecord],
    query_text: str,
    *,
    threshold: int = 0,
) -> list[tuple[SearchRecord, float]]:
    """Score records by relevance and sort best-first.

    Parameters
    ----------
    records : list[SearchRecord]
        Engine-matched records in discovery order.
    query_text : str
        The space-joined search terms for WRatio scoring.
    threshold : int
        Minimum fuzzy score (0-100). Records scoring below it are
        dropped; ``0`` (or less) disables filtering.

    Returns
    -------
    list[tuple[SearchRecord, float]]
        ``(record, score)`` pairs sorted by descending score. Records
        with equal scores keep their discovery order.
    """
    if not records:
        # Nothing to score; return before importing rapidfuzz.
        return []

    import rapidfuzz.fuzz

    wratio = rapidfuzz.fuzz.WRatio
    scored: list[tuple[SearchRecord, float]] = [
        (record, float(wratio(query_text, record.text))) for record in records
    ]
    if threshold > 0:
        scored = [pair for pair in scored if pair[1] >= threshold]
    # list.sort is stable (also with reverse=True), so ties stay in
    # discovery order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


def collapse_near_duplicates(
    scored: list[tuple[SearchRecord, float]],
    *,
    similarity_threshold: float = 90.0,
) -> list[tuple[SearchRecord, float, int]]:
    """Collapse near-duplicate records, keeping the earliest representative.

    Each not-yet-collapsed record is compared with every later
    not-yet-collapsed record, so the worst case is quadratic in
    ``len(scored)``.

    Parameters
    ----------
    scored : list[tuple[SearchRecord, float]]
        Pre-sorted ``(record, score)`` pairs (best-first). Because the
        first member of a duplicate cluster is kept, best-first input
        makes the representative the highest-scored one.
    similarity_threshold : float
        WRatio cutoff: a later record whose text scores at or above
        this value against a representative is folded into it.

    Returns
    -------
    list[tuple[SearchRecord, float, int]]
        ``(record, score, similar_count)`` triples in input order.
        ``similar_count`` is the number of collapsed duplicates.
    """
    if not scored:
        # Return before importing rapidfuzz when there is nothing to do.
        return []

    import rapidfuzz.fuzz

    wratio = rapidfuzz.fuzz.WRatio
    result: list[tuple[SearchRecord, float, int]] = []
    consumed: set[int] = set()
    for i, (record_i, score_i) in enumerate(scored):
        if i in consumed:
            continue
        similar_count = 0
        for j in range(i + 1, len(scored)):
            if j in consumed:
                continue
            # rapidfuzz reports 0 for any pair scoring below score_cutoff,
            # so the inclusive comparison below gives the same outcome
            # while letting the library skip work on clear non-matches.
            sim = float(
                wratio(
                    record_i.text,
                    scored[j][0].text,
                    score_cutoff=similarity_threshold,
                ),
            )
            if sim >= similarity_threshold:
                similar_count += 1
                consumed.add(j)
        result.append((record_i, score_i, similar_count))
    return result


def group_by_session(
    records: list[tuple[SearchRecord, float, int]],
) -> list[tuple[str | None, list[tuple[SearchRecord, float, int]]]]:
    """Group records by session_id, preserving input order within groups.

    Parameters
    ----------
    records : list[tuple[SearchRecord, float, int]]
        Collapsed ``(record, score, similar_count)`` triples.

    Returns
    -------
    list[tuple[str | None, list[...]]]
        ``(session_id, entries)`` pairs in first-seen order. Records
        whose ``session_id`` is ``None`` share a single ``None`` group.
    """
    # Dict subclasses keep insertion order, so groups come out in the
    # order their session was first seen.
    groups: collections.defaultdict[
        str | None,
        list[tuple[SearchRecord, float, int]],
    ] = collections.defaultdict(list)
    for entry in records:
        groups[entry[0].session_id].append(entry)
    return list(groups.items())
+""" + +from __future__ import annotations + +import pathlib +import typing as t + +import pytest + +import agentgrep +from agentgrep.ranking import ( + collapse_near_duplicates, + group_by_session, + rank_search_records, +) + + +def _record( + text: str, + *, + session_id: str | None = None, + agent: agentgrep.AgentName = "codex", +) -> agentgrep.SearchRecord: + """Build a minimal SearchRecord for ranking tests.""" + return agentgrep.SearchRecord( + kind="prompt", + agent=agent, + store="test", + adapter_id="test.v1", + path=pathlib.Path("/tmp/test"), + text=text, + session_id=session_id, + ) + + +# --------------------------------------------------------------------------- +# rank_search_records +# --------------------------------------------------------------------------- + + +class RankCase(t.NamedTuple): + """Parametrized case for :func:`rank_search_records`.""" + + test_id: str + texts: list[str] + query: str + threshold: int + expected_first_text: str | None + expected_min_count: int + + +RANK_CASES: tuple[RankCase, ...] 
= ( + RankCase( + "higher-match-scores-first", + ["unrelated noise", "the streaming parser is fast", "streaming"], + "streaming", + 0, + "streaming", + 3, + ), + RankCase( + "threshold-filters-low", + ["unrelated noise", "streaming parser"], + "streaming", + 80, + "streaming parser", + 1, + ), + RankCase( + "empty-input", + [], + "anything", + 0, + None, + 0, + ), +) + + +@pytest.mark.parametrize( + RankCase._fields, + RANK_CASES, + ids=[case.test_id for case in RANK_CASES], +) +def test_rank_search_records( + test_id: str, + texts: list[str], + query: str, + threshold: int, + expected_first_text: str | None, + expected_min_count: int, +) -> None: + """rank_search_records scores, filters, and sorts correctly.""" + _ = test_id + records = [_record(text) for text in texts] + result = rank_search_records(records, query, threshold=threshold) + assert len(result) >= expected_min_count + if expected_first_text is not None: + assert result[0][0].text == expected_first_text + + +def test_rank_scores_are_descending() -> None: + """Scores are in non-increasing order.""" + records = [ + _record("unrelated noise here"), + _record("the streaming parser approach"), + _record("streaming"), + _record("fully streaming parser engine"), + ] + result = rank_search_records(records, "streaming parser") + scores = [score for _, score in result] + assert scores == sorted(scores, reverse=True) + + +# --------------------------------------------------------------------------- +# collapse_near_duplicates +# --------------------------------------------------------------------------- + + +class CollapseCase(t.NamedTuple): + """Parametrized case for :func:`collapse_near_duplicates`.""" + + test_id: str + texts: list[str] + expected_count: int + expected_any_similar: bool + + +COLLAPSE_CASES: tuple[CollapseCase, ...] 
= ( + CollapseCase( + "identical-texts-collapse", + ["hello world", "hello world", "hello world"], + 1, + True, + ), + CollapseCase( + "different-texts-stay", + ["apple pie recipe", "quantum mechanics lecture", "jazz improvisation"], + 3, + False, + ), + CollapseCase( + "empty-input", + [], + 0, + False, + ), + CollapseCase( + "near-identical-collapse", + ["hello world today", "hello world today!"], + 1, + True, + ), +) + + +@pytest.mark.parametrize( + CollapseCase._fields, + COLLAPSE_CASES, + ids=[case.test_id for case in COLLAPSE_CASES], +) +def test_collapse_near_duplicates( + test_id: str, + texts: list[str], + expected_count: int, + expected_any_similar: bool, +) -> None: + """Near-duplicate collapsing produces expected representative count.""" + _ = test_id + scored = [(r, 50.0) for r in (_record(text) for text in texts)] + result = collapse_near_duplicates(scored) + assert len(result) == expected_count + if expected_any_similar: + assert any(similar > 0 for _, _, similar in result) + elif result: + assert all(similar == 0 for _, _, similar in result) + + +def test_collapse_preserves_score_order() -> None: + """Collapsed output preserves the pre-sorted score order.""" + scored: list[tuple[agentgrep.SearchRecord, float]] = [ + (_record("best match"), 95.0), + (_record("good match"), 80.0), + (_record("okay match"), 60.0), + ] + result = collapse_near_duplicates(scored) + result_scores = [score for _, score, _ in result] + assert result_scores == sorted(result_scores, reverse=True) + + +# --------------------------------------------------------------------------- +# group_by_session +# --------------------------------------------------------------------------- + + +class GroupCase(t.NamedTuple): + """Parametrized case for :func:`group_by_session`.""" + + test_id: str + session_ids: list[str | None] + expected_group_count: int + expected_keys: list[str | None] + + +GROUP_CASES: tuple[GroupCase, ...] 
= ( + GroupCase( + "groups-by-session", + ["sess-a", "sess-a", "sess-b", "sess-b"], + 2, + ["sess-a", "sess-b"], + ), + GroupCase( + "none-sessions-grouped-together", + [None, None, "sess-a"], + 2, + [None, "sess-a"], + ), + GroupCase( + "preserves-first-seen-order", + ["sess-b", "sess-a", "sess-b"], + 2, + ["sess-b", "sess-a"], + ), + GroupCase( + "empty-input", + [], + 0, + [], + ), +) + + +@pytest.mark.parametrize( + GroupCase._fields, + GROUP_CASES, + ids=[case.test_id for case in GROUP_CASES], +) +def test_group_by_session( + test_id: str, + session_ids: list[str | None], + expected_group_count: int, + expected_keys: list[str | None], +) -> None: + """Session grouping produces expected buckets.""" + _ = test_id + records: list[tuple[agentgrep.SearchRecord, float, int]] = [ + (_record(f"text-{i}", session_id=sid), 50.0, 0) for i, sid in enumerate(session_ids) + ] + result = group_by_session(records) + assert len(result) == expected_group_count + assert [key for key, _ in result] == expected_keys + + +def test_group_preserves_within_group_order() -> None: + """Records within a group keep score-descending order.""" + records: list[tuple[agentgrep.SearchRecord, float, int]] = [ + (_record("first", session_id="s1"), 95.0, 0), + (_record("second", session_id="s1"), 80.0, 0), + (_record("third", session_id="s1"), 60.0, 0), + ] + result = group_by_session(records) + assert len(result) == 1 + _, entries = result[0] + entry_scores = [score for _, score, _ in entries] + assert entry_scores == [95.0, 80.0, 60.0] From 91de1f7ba831e864bb200a150ccb354da516b27c Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 09:21:58 -0500 Subject: [PATCH 03/23] agentgrep(feat[search]): Wire search command with progress and pretty output why: Complete the search command by connecting the ranking engine to the CLI with progress feedback and pretty-style output. 
def run_search_command(args: SearchArgs) -> int:
    """Run ``agentgrep search``: fetch, rank, collapse, group, render.

    In UI mode the query is handed to ``agentgrep.run_ui`` and ``0`` is
    returned. Otherwise records come from ``agentgrep.run_search_query``
    (the engine query carries ``limit=None``; ``--limit`` is applied
    after collapsing), are scored unless ``--no-rank`` is set (in which
    case every score is ``0.0`` and ``--threshold`` is not applied),
    collapsed, optionally grouped by session, and printed as text or
    JSON/NDJSON.

    Returns
    -------
    int
        ``0`` when at least one result remains after collapsing and
        limiting (or after the UI was launched), ``1`` otherwise.

    Raises
    ------
    SystemExit
        When no terms were given and the output mode is not ``ui``.
    """
    ui_requested = args.output_mode == "ui"
    if not (args.terms or ui_requested):
        msg = "search requires at least one term unless --ui is used"
        raise SystemExit(msg)

    query = agentgrep.SearchQuery(
        terms=args.terms,
        search_type=args.search_type,
        any_term=args.any_term,
        regex=args.regex,
        case_sensitive=args.case_sensitive,
        agents=args.agents,
        limit=None,
        compiled=args.compiled,
    )

    if ui_requested:
        agentgrep.run_ui(
            pathlib.Path.home(),
            query,
            control=agentgrep.SearchControl(),
            initial_search_text=args.raw_query or None,
        )
        return 0

    control = agentgrep.SearchControl()
    # "auto" only shows progress for human-facing output modes.
    show_progress = args.progress_mode == "always" or (
        args.progress_mode == "auto" and args.output_mode in {"text", "ui"}
    )
    progress: agentgrep.SearchProgress = (
        agentgrep.ConsoleSearchProgress(
            enabled=True,
            color_mode=args.color_mode,
            answer_now_hint=False,
        )
        if show_progress
        else agentgrep.noop_search_progress()
    )
    records = agentgrep.run_search_query(
        pathlib.Path.home(),
        query,
        progress=progress,
        control=control,
    )

    from agentgrep.ranking import (
        collapse_near_duplicates,
        group_by_session,
        rank_search_records,
    )

    scored: list[tuple[agentgrep.SearchRecord, float]]
    if args.no_rank:
        scored = [(record, 0.0) for record in records]
    else:
        scored = rank_search_records(
            records,
            " ".join(args.terms),
            threshold=args.threshold,
        )

    collapsed = collapse_near_duplicates(scored)
    if args.limit is not None:
        collapsed = collapsed[: args.limit]

    groups: list[tuple[str | None, list[tuple[agentgrep.SearchRecord, float, int]]]] = (
        [(None, collapsed)] if args.no_group else group_by_session(collapsed)
    )

    render = _print_search_json if args.output_mode in ("json", "ndjson") else _print_search_text
    render(groups, args)
    return 0 if collapsed else 1


def _print_search_text(
    groups: list[tuple[str | None, list[tuple[agentgrep.SearchRecord, float, int]]]],
    args: SearchArgs,
) -> None:
    """Print grouped results to stdout with scores and duplicate counts.

    Groups are separated by a blank line. A ``[session ...]`` heading
    (first 12 characters of the id) is printed for groups with a session
    id unless ``--no-group`` is set. Each record prints a score/snippet
    line followed by a path/agent/timestamp line.
    """
    colors = agentgrep.AnsiColors.for_stream(args.color_mode, sys.stdout)
    for index, (session_id, entries) in enumerate(groups):
        if index:
            # Blank separator before every group except the first.
            print()
        if not args.no_group and session_id is not None:
            print(colors.heading(f"[session {session_id[:12]}]"))
        for record, score, similar_count in entries:
            path = agentgrep.format_display_path(record.path)
            score_label = colors.warning(f"{score:.0f}")
            # Snippet: first 120 characters, newlines flattened to spaces.
            snippet = record.text[:120].replace("\n", " ")
            similar_label = (
                colors.muted(f" (+{similar_count} similar)") if similar_count > 0 else ""
            )
            header = f" {colors.path(path)} {colors.muted(record.agent)}"
            if record.timestamp:
                header += f" {colors.muted(record.timestamp)}"
            print(f"{score_label} {snippet}{similar_label}")
            print(header)


def _print_search_json(
    groups: list[tuple[str | None, list[tuple[agentgrep.SearchRecord, float, int]]]],
    args: SearchArgs,
) -> None:
    """Print results as a JSON envelope or as NDJSON, including scores.

    Every serialized record gains ``score`` and ``similar_count`` keys,
    plus ``group_session_id`` when its group has a session id. In
    ``json`` mode a single indented envelope is printed; otherwise one
    compact JSON object is printed per line.
    """
    serialize_search, _, serialize_envelope = maybe_build_pydantic()

    results: list[dict[str, object]] = []
    for session_id, entries in groups:
        for record, score, similar_count in entries:
            entry = serialize_search(record)
            entry["score"] = score
            entry["similar_count"] = similar_count
            if session_id is not None:
                entry["group_session_id"] = session_id
            results.append(entry)

    if args.output_mode != "json":
        # NDJSON: one object per line, no envelope.
        for result in results:
            print(json.dumps(result, ensure_ascii=False))
        return

    query_data: dict[str, object] = {
        "terms": list(args.terms),
        "agents": list(args.agents),
        "threshold": args.threshold,
        "no_rank": args.no_rank,
        "no_group": args.no_group,
    }
    payload = serialize_envelope("search", query_data, results)
    print(json.dumps(payload, ensure_ascii=False, indent=2))
agentgrep.AGENT_CHOICES, + "search_type": "prompts", + "any_term": False, + "regex": False, + "case_sensitive": False, + "limit": None, + "output_mode": "text", + "color_mode": "never", + "progress_mode": "never", + "threshold": 0, + "no_group": False, + "no_rank": False, + "compiled": None, + "raw_query": "", + } + base.update(overrides) + return agentgrep.SearchArgs(**base) + + +def _canned_records() -> list[agentgrep.SearchRecord]: + """Return a small set of canned records for search integration tests.""" + return [ + agentgrep.SearchRecord( + kind="prompt", + agent="codex", + store="test", + adapter_id="test.v1", + path=pathlib.Path("/tmp/test-a"), + text="the bliss of streaming parsers", + session_id="sess-1", + ), + agentgrep.SearchRecord( + kind="prompt", + agent="codex", + store="test", + adapter_id="test.v1", + path=pathlib.Path("/tmp/test-b"), + text="unrelated noise about caching", + session_id="sess-2", + ), + agentgrep.SearchRecord( + kind="prompt", + agent="claude", + store="test", + adapter_id="test.v1", + path=pathlib.Path("/tmp/test-c"), + text="bliss in every line of code", + session_id="sess-1", + ), + ] + + +def test_search_command_no_terms_raises() -> None: + """Search without terms and without --ui raises SystemExit.""" + args = _make_search_args(terms=()) + with pytest.raises(SystemExit, match="search requires at least one term"): + run_search_command(args) + + +def test_search_routes_through_ranking( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """Search dispatches through the ranking pipeline and produces output.""" + canned = _canned_records() + monkeypatch.setattr( + agentgrep, + "run_search_query", + lambda *_args, **_kwargs: canned, + ) + args = _make_search_args(terms=("bliss",)) + code = run_search_command(args) + assert code == 0 + captured = capsys.readouterr() + assert "bliss" in captured.out + + +def test_search_no_rank_preserves_order( + monkeypatch: pytest.MonkeyPatch, + capsys: 
pytest.CaptureFixture[str], +) -> None: + """--no-rank skips scoring and preserves discovery order.""" + canned = _canned_records() + monkeypatch.setattr( + agentgrep, + "run_search_query", + lambda *_args, **_kwargs: canned, + ) + args = _make_search_args(terms=("bliss",), no_rank=True) + code = run_search_command(args) + assert code == 0 + captured = capsys.readouterr() + lines = captured.out.strip().splitlines() + # With no_rank, scores are 0 — all matching records appear + score_lines = [line for line in lines if line.startswith("0")] + assert len(score_lines) >= 1 + + +def test_search_threshold_filters_low_scores( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """--threshold filters records below the minimum score.""" + canned = _canned_records() + monkeypatch.setattr( + agentgrep, + "run_search_query", + lambda *_args, **_kwargs: canned, + ) + # Very high threshold should filter most records + args = _make_search_args(terms=("bliss",), threshold=99) + code = run_search_command(args) + captured = capsys.readouterr() + # With threshold=99, only near-exact matches survive (or none) + # The exit code reflects whether any results remain + assert code in (0, 1) + if code == 1: + assert captured.out.strip() == "" + + +def test_search_json_includes_scores( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """--json output includes score and similar_count fields.""" + canned = _canned_records() + monkeypatch.setattr( + agentgrep, + "run_search_query", + lambda *_args, **_kwargs: canned, + ) + args = _make_search_args(terms=("bliss",), output_mode="json", no_group=True) + code = run_search_command(args) + assert code == 0 + captured = capsys.readouterr() + payload = json.loads(captured.out) + assert "results" in payload + for result in payload["results"]: + assert "score" in result + assert "similar_count" in result + assert isinstance(result["score"], (int, float)) + assert 
isinstance(result["similar_count"], int) + + +def test_search_ndjson_includes_scores( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """--ndjson output includes score and similar_count in each line.""" + canned = _canned_records() + monkeypatch.setattr( + agentgrep, + "run_search_query", + lambda *_args, **_kwargs: canned, + ) + args = _make_search_args(terms=("bliss",), output_mode="ndjson", no_group=True) + code = run_search_command(args) + assert code == 0 + captured = capsys.readouterr() + lines = [line for line in captured.out.strip().splitlines() if line] + assert len(lines) >= 1 + for line in lines: + obj = json.loads(line) + assert "score" in obj + assert "similar_count" in obj + + +def test_search_empty_results_returns_1( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Search with no matches returns exit code 1.""" + monkeypatch.setattr( + agentgrep, + "run_search_query", + lambda *_args, **_kwargs: [], + ) + args = _make_search_args(terms=("nonexistent",)) + code = run_search_command(args) + assert code == 1 + + +def test_search_limit_caps_results( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """--limit caps the number of results after ranking.""" + canned = _canned_records() + monkeypatch.setattr( + agentgrep, + "run_search_query", + lambda *_args, **_kwargs: canned, + ) + args = _make_search_args(terms=("bliss",), limit=1, no_group=True, output_mode="json") + code = run_search_command(args) + assert code == 0 + captured = capsys.readouterr() + payload = json.loads(captured.out) + assert len(payload["results"]) == 1 From bf2dfd4f14b9c06f218dd2e15cf62761236e221a Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 09:59:54 -0500 Subject: [PATCH 04/23] =?UTF-8?q?agentgrep(fix[ranking]):=20Skip=20O(n?= =?UTF-8?q?=C2=B2)=20collapse=20when=20--no-rank,=20add=20size=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit why: collapse_near_duplicates runs pairwise WRatio between all records — O(n²) with expensive C calls. It was called unconditionally even with --no-rank, hanging on large result sets. Users who pass --no-rank explicitly want fast unranked output. what: - Skip collapse_near_duplicates entirely when --no-rank is set; emit records with score=0, similar_count=0 - Add size guard in collapse_near_duplicates: if len(scored) > 500, skip pairwise comparison and return records as-is - Move rank + collapse imports inside the else branch (lazy load only when ranking is active) --- src/agentgrep/cli/render.py | 7 ++++--- src/agentgrep/ranking.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/agentgrep/cli/render.py b/src/agentgrep/cli/render.py index efaeeed..3c7c499 100644 --- a/src/agentgrep/cli/render.py +++ b/src/agentgrep/cli/render.py @@ -479,13 +479,14 @@ def run_search_command(args: SearchArgs) -> int: query_text = " ".join(args.terms) if args.no_rank: scored: list[tuple[agentgrep.SearchRecord, float]] = [(r, 0.0) for r in records] + collapsed: list[tuple[agentgrep.SearchRecord, float, int]] = [(r, 0.0, 0) for r in records] else: - from agentgrep.ranking import rank_search_records + from agentgrep.ranking import collapse_near_duplicates, rank_search_records scored = rank_search_records(records, query_text, threshold=args.threshold) - from agentgrep.ranking import collapse_near_duplicates, group_by_session + collapsed = collapse_near_duplicates(scored) + from agentgrep.ranking import group_by_session - collapsed = collapse_near_duplicates(scored) if args.limit is not None: collapsed = collapsed[: args.limit] if args.no_group: diff --git a/src/agentgrep/ranking.py b/src/agentgrep/ranking.py index 9942e2e..32eeec7 100644 --- a/src/agentgrep/ranking.py +++ b/src/agentgrep/ranking.py @@ -85,6 +85,8 @@ def collapse_near_duplicates( if not scored: return [] + if len(scored) > 500: + return [(r, s, 0) for r, s in scored] result: 
list[tuple[SearchRecord, float, int]] = [] consumed: set[int] = set() for i, (record_i, score_i) in enumerate(scored): From 965df5451561bc43cb6b4f324e067df6c1f37313 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 10:01:52 -0500 Subject: [PATCH 05/23] agentgrep(fix[parser]): Add collision detection for search flag/field mix why: grep and find both reject mixing --agent with agent: inline predicates (via _grep_explicit_flags / _find_explicit_flags). The reintroduced search subparser was missing this validation, silently accepting nonsensical queries like `agentgrep search --agent codex agent:claude bliss`. what: - Add _search_explicit_flags() mapping --agent and --type flags - Pass explicit_flags to _maybe_compile_query in _build_search_args - Parse-time error now raised on flag/field conflicts --- src/agentgrep/cli/parser.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/agentgrep/cli/parser.py b/src/agentgrep/cli/parser.py index 9f4f187..99a79be 100644 --- a/src/agentgrep/cli/parser.py +++ b/src/agentgrep/cli/parser.py @@ -761,6 +761,16 @@ def build_docs_parser() -> argparse.ArgumentParser: return create_parser("never").parser +def _search_explicit_flags(namespace: argparse.Namespace) -> dict[str, str]: + """Map query-field name → CLI flag name for `search` flag/field collisions.""" + flags: dict[str, str] = {} + if t.cast("list[str]", namespace.agent): + flags["agent"] = "--agent" + if t.cast("str", namespace.search_type) != "prompts": + flags["type"] = "--type" + return flags + + def _grep_explicit_flags(namespace: argparse.Namespace) -> dict[str, str]: """Map query-field name → CLI flag name for `grep` flag/field collisions.""" flags: dict[str, str] = {} @@ -1146,6 +1156,7 @@ def _build_search_args( bundle=bundle, color_mode=color_mode, subparser=bundle.search_parser, + explicit_flags=_search_explicit_flags(namespace), ) final_terms: tuple[str, ...] 
= ( residual_terms if search_compiled is not None else tuple(terms_list) From e094ad826a67d80688db8838ef22e1d7532cfa7e Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 10:29:39 -0500 Subject: [PATCH 06/23] agentgrep(fix[parser]): Reject --threshold with --no-rank why: --threshold only takes effect inside rank_search_records, which is skipped when --no-rank is set. Silently accepting both flags misleads the user into thinking their threshold filter is active. what: - Add parse-time error when both --no-rank and --threshold > 0 - Split all-ranking-flags test into two valid cases --- src/agentgrep/cli/parser.py | 6 ++++++ tests/test_cli_search.py | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/agentgrep/cli/parser.py b/src/agentgrep/cli/parser.py index 99a79be..0e63536 100644 --- a/src/agentgrep/cli/parser.py +++ b/src/agentgrep/cli/parser.py @@ -1150,6 +1150,12 @@ def _build_search_args( if threshold < 0 or threshold > 100: with configured_color_environment(color_mode): bundle.search_parser.error("--threshold must be between 0 and 100") + no_rank = t.cast("bool", namespace.no_rank) + if no_rank and threshold > 0: + with configured_color_environment(color_mode): + bundle.search_parser.error( + "--threshold has no effect with --no-rank (ranking is disabled)", + ) search_compiled, residual_terms = _maybe_compile_query( terms_list, diff --git a/tests/test_cli_search.py b/tests/test_cli_search.py index d3a1f4b..bc8805c 100644 --- a/tests/test_cli_search.py +++ b/tests/test_cli_search.py @@ -98,12 +98,24 @@ class SearchParseCase(t.NamedTuple): False, ), SearchParseCase( - "all-ranking-flags", - ("search", "--threshold", "50", "--no-group", "--no-rank", "query"), + "no-group-and-no-rank", + ("search", "--no-group", "--no-rank", "query"), ("query",), - 50, + 0, + True, True, + "prompts", + False, + False, + False, + ), + SearchParseCase( + "threshold-with-ranking", + ("search", "--threshold", "50", "--no-group", 
"query"), + ("query",), + 50, True, + False, "prompts", False, False, From 2cb5a810f4ce79ec22594d231dfec4a75a6f630a Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 10:29:39 -0500 Subject: [PATCH 07/23] agentgrep(docs[cli]): Add search to CLI_DESCRIPTION why: search subcommand was reintroduced but CLI_DESCRIPTION only listed grep/fuzzy/find/ui. what: - Add search description to the CLI help intro text --- src/agentgrep/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentgrep/__init__.py b/src/agentgrep/__init__.py index b87962d..4e260dc 100644 --- a/src/agentgrep/__init__.py +++ b/src/agentgrep/__init__.py @@ -150,7 +150,8 @@ def build_description( CLI_DESCRIPTION = build_description( """ Read-only search across Codex, Claude, Cursor, and Gemini local - stores. Pick a subcommand from the list below: ``grep`` for + stores. Pick a subcommand from the list below: ``search`` for + ranked results with dedup and session grouping, ``grep`` for rg-shaped content search, ``fuzzy`` for fzf-style filtering, ``find`` for store enumeration, ``ui`` for the interactive Textual explorer. From 0b50453b43b6e0e1bfe722bae91c6f2a280ad5f1 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 12:12:59 -0500 Subject: [PATCH 08/23] agentgrep(fix[search]): Restore Enter-to-answer-now for search why: run_search_command created a SearchControl but never wired up the AnswerNowInputListener thread, so pressing Enter during a long search had no effect and the progress hint was hidden. 
what: - Wire AnswerNowInputListener with start/stop around run_search_query - Set answer_now_hint based on TTY detection (stdin + stderr) - Wrap run_search_query in try/finally to ensure listener.stop() --- src/agentgrep/cli/render.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/agentgrep/cli/render.py b/src/agentgrep/cli/render.py index 3c7c499..eb42527 100644 --- a/src/agentgrep/cli/render.py +++ b/src/agentgrep/cli/render.py @@ -461,6 +461,13 @@ def run_search_command(args: SearchArgs) -> int: progress_enabled = args.progress_mode == "always" or ( args.progress_mode == "auto" and human_output ) + answer_now_enabled = ( + progress_enabled + and human_output + and bool(getattr(sys.stdin, "isatty", lambda: False)()) + and bool(getattr(sys.stderr, "isatty", lambda: False)()) + ) + listener = agentgrep.AnswerNowInputListener(control) if answer_now_enabled else None progress: agentgrep.SearchProgress if not progress_enabled: progress = agentgrep.noop_search_progress() @@ -468,14 +475,20 @@ def run_search_command(args: SearchArgs) -> int: progress = agentgrep.ConsoleSearchProgress( enabled=True, color_mode=args.color_mode, - answer_now_hint=False, + answer_now_hint=answer_now_enabled, ) - records = agentgrep.run_search_query( - pathlib.Path.home(), - query, - progress=progress, - control=control, - ) + if listener is not None: + listener.start() + try: + records = agentgrep.run_search_query( + pathlib.Path.home(), + query, + progress=progress, + control=control, + ) + finally: + if listener is not None: + listener.stop() query_text = " ".join(args.terms) if args.no_rank: scored: list[tuple[agentgrep.SearchRecord, float]] = [(r, 0.0) for r in records] From 73ae939a84078bdfdd8886a12d92e309b6144072 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 14:46:50 -0500 Subject: [PATCH 09/23] agentgrep(fix[search]): Keep progress live during large parses why: Large Codex and Claude-style JSONL sources can 
spend seconds inside parsing work before any deduped result is emitted, which leaves the CLI progress line looking frozen. Huge Codex tool-output records make this worse because they can hold the GIL while producing no searchable prompt record. what: - Add optional in-source progress updates with cooperative parser yields while preserving final deduped result semantics. - Show source detail in CLI and TUI progress snapshots alongside source counters. - Skip large Codex function_call_output lines before JSON decoding, discarding them cooperatively because they cannot produce prompt records. - Cover progress callbacks, JSONL yielding, raw tool-output skipping, and progress-line formatting in tests. --- src/agentgrep/__init__.py | 187 +++++++++++++++++++++++++++++- src/agentgrep/ui/app.py | 2 + tests/test_agentgrep.py | 233 +++++++++++++++++++++++++++++++++++++- 3 files changed, 412 insertions(+), 10 deletions(-) diff --git a/src/agentgrep/__init__.py b/src/agentgrep/__init__.py index 4e260dc..3d336cf 100644 --- a/src/agentgrep/__init__.py +++ b/src/agentgrep/__init__.py @@ -1117,6 +1117,24 @@ class SourceHandle: mtime_ns: int +type SourceProgressCallback = cabc.Callable[[int, int, SourceHandle, int, int], None] + +_SOURCE_PROGRESS_RECORD_INTERVAL = 128 +"""Parsed-record cadence for in-source progress updates and GIL yields.""" + +_JSONL_YIELD_LINE_INTERVAL = 128 +"""Decoded-line cadence for cooperative JSONL parser yields.""" + +_JSONL_PREFIX_BYTES = 4096 +"""Bytes read up front when a raw-line skip predicate is active.""" + +_JSONL_SKIP_CHUNK_BYTES = 1024 * 1024 +"""Chunk size for discarding skipped oversized JSONL lines.""" + +_CODEX_RAW_SKIP_MIN_BYTES = 1024 * 1024 +"""Minimum Codex session size before enabling raw-line output skipping.""" + + @dataclasses.dataclass(slots=True) class SearchRecord: """Normalized prompt/history record.""" @@ -1451,6 +1469,22 @@ def source_finished( detail=f"{records} records, {format_match_count(matches)} in {source.path.name}", ) + 
def source_progress( + self, + index: int, + total: int, + source: SourceHandle, + records: int, + matches: int, + ) -> None: + """Report in-source scan progress.""" + self.set_status( + "scanning", + current=index, + total=total, + detail=format_source_progress_detail(records, matches), + ) + def result_added(self, count: int) -> None: """Report deduped result count.""" if not self._enabled: @@ -1663,7 +1697,10 @@ def _status_text(self) -> str: detail = self._detail if current is not None and total is not None: count = self._colors.warning(f"{current}/{total}") - return f"{self._colors.heading(phase)} {count} {self._colors.muted('sources')}" + text = f"{self._colors.heading(phase)} {count} {self._colors.muted('sources')}" + if detail: + return f"{text} | {self._colors.muted(detail)}" + return text if detail: return f"{self._colors.heading(phase)} {self._colors.muted(detail)}" return self._colors.heading(phase) @@ -1682,6 +1719,12 @@ def format_match_count(count: int) -> str: return f"{count} {suffix}" +def format_source_progress_detail(records: int, matches: int) -> str: + """Return a concise in-source progress detail.""" + match_suffix = "source match" if matches == 1 else "source matches" + return f"{records} records, {matches} {match_suffix}" + + @dataclasses.dataclass(frozen=True) class ProgressSnapshot: """Immutable view of search-progress state for one render pass.""" @@ -1719,19 +1762,27 @@ def format_search_progress_line( each segment styled through ``colors``. 
""" label_part = f"{colors.heading('Searching')} {colors.highlight(snapshot.query_label)}" + detail_part = colors.muted(snapshot.detail) if snapshot.detail else None if snapshot.current is not None and snapshot.total is not None: count = colors.warning(f"{snapshot.current}/{snapshot.total}") status_part = f"{colors.heading(snapshot.phase)} {count} {colors.muted('sources')}" elif snapshot.detail: status_part = f"{colors.heading(snapshot.phase)} {colors.muted(snapshot.detail)}" + detail_part = None else: status_part = colors.heading(snapshot.phase) parts = [ label_part, status_part, - colors.warning(format_match_count(snapshot.matches)), - colors.muted(f"{snapshot.elapsed:.1f}s"), ] + if detail_part: + parts.append(detail_part) + parts.extend( + [ + colors.warning(format_match_count(snapshot.matches)), + colors.muted(f"{snapshot.elapsed:.1f}s"), + ], + ) if answer_now_hint: parts.append(colors.white("[Press enter, answer now]")) return " | ".join(parts) @@ -1742,6 +1793,20 @@ def noop_search_progress() -> SearchProgress: return NoopSearchProgress() +def _report_source_progress( + progress: SearchProgress, + index: int, + total: int, + source: SourceHandle, + records: int, + matches: int, +) -> None: + """Call the optional in-source progress hook when a reporter exposes it.""" + callback = getattr(progress, "source_progress", None) + if callable(callback): + t.cast("SourceProgressCallback", callback)(index, total, source, records, matches) + + @dataclasses.dataclass(frozen=True) class StreamingRecordsBatch: """Batch of newly deduped records emitted by :meth:`StreamingSearchProgress.flush`.""" @@ -1896,6 +1961,22 @@ def source_finished( self._detail = f"{records} records, {format_match_count(matches)} in {source.path.name}" self._emit_progress() + def source_progress( + self, + index: int, + total: int, + source: SourceHandle, + records: int, + matches: int, + ) -> None: + """Report in-source scan progress.""" + with self._lock: + self._phase = "scanning" + 
self._current = index + self._total = total + self._detail = format_source_progress_detail(records, matches) + self._emit_progress() + def result_added(self, count: int) -> None: """Update the cumulative match counter.""" with self._lock: @@ -2079,6 +2160,14 @@ def file_mtime_ns(path: pathlib.Path) -> int: return 0 +def _file_size(path: pathlib.Path) -> int: + """Return file size in bytes, falling back to zero on stat failure.""" + try: + return path.stat().st_size + except OSError: + return 0 + + def resolve_env_root(env_var: str, default: pathlib.Path) -> pathlib.Path: """Resolve a base directory from an environment variable, with safety. @@ -2634,6 +2723,16 @@ def current_count() -> int: if matches_record(record, query): matches_seen += 1 matching_records.append(record) + if records_seen % _SOURCE_PROGRESS_RECORD_INTERVAL == 0: + _report_source_progress( + active_progress, + index, + total, + source, + records_seen, + matches_seen, + ) + time.sleep(0) active_progress.source_finished(index, total, source, records_seen, matches_seen) matching_records.sort(key=search_record_sort_key, reverse=True) for record in matching_records: @@ -2745,7 +2844,12 @@ def parse_codex_session_file( """Parse Codex session JSONL files.""" session_id = source.path.stem session_model: str | None = None - for event in iter_jsonl(source.path): + events = ( + _iter_jsonl(source.path, skip_line=_is_codex_function_call_output_line) + if _file_size(source.path) >= _CODEX_RAW_SKIP_MIN_BYTES + else iter_jsonl(source.path) + ) + for event in events: if not isinstance(event, dict): continue event_type = str(event.get("type", "")) @@ -3285,10 +3389,69 @@ def read_json_file(path: pathlib.Path) -> JSONValue | None: def iter_jsonl(path: pathlib.Path) -> cabc.Iterator[JSONValue]: """Yield decoded JSON objects from a JSONL file.""" + yield from _iter_jsonl(path) + + +def _iter_jsonl( + path: pathlib.Path, + *, + skip_line: cabc.Callable[[str], bool] | None = None, +) -> cabc.Iterator[JSONValue]: + 
"""Yield decoded JSON objects from a JSONL file with an optional raw-line filter.""" + if skip_line is not None: + yield from _iter_jsonl_with_raw_skip(path, skip_line) + return try: with path.open(encoding="utf-8") as handle: + decoded_lines = 0 for line in handle: stripped = line.strip() + if not stripped: + continue + decoded_lines += 1 + if decoded_lines % _JSONL_YIELD_LINE_INTERVAL == 0: + time.sleep(0) + if skip_line is not None and skip_line(stripped): + continue + try: + parsed = t.cast("object", json.loads(stripped)) + except json.JSONDecodeError: + continue + if isinstance(parsed, (dict, list, str, int, float, bool)) or parsed is None: + yield t.cast("JSONValue", parsed) + except OSError: + return + + +def _iter_jsonl_with_raw_skip( + path: pathlib.Path, + skip_line: cabc.Callable[[str], bool], +) -> cabc.Iterator[JSONValue]: + """Yield decoded JSON objects while skipping matched raw lines in chunks.""" + try: + with path.open("rb") as handle: + decoded_lines = 0 + while True: + prefix = handle.readline(_JSONL_PREFIX_BYTES) + if not prefix: + break + if not prefix.strip(): + continue + decoded_lines += 1 + if decoded_lines % _JSONL_YIELD_LINE_INTERVAL == 0: + time.sleep(0) + prefix_text = prefix.decode("utf-8", errors="replace") + if skip_line(prefix_text): + _discard_rest_of_line(handle, prefix) + continue + raw_line = bytearray(prefix) + while raw_line and not raw_line.endswith(b"\n"): + chunk = handle.readline(_JSONL_SKIP_CHUNK_BYTES) + if not chunk: + break + raw_line.extend(chunk) + time.sleep(0) + stripped = raw_line.decode("utf-8", errors="replace").strip() if not stripped: continue try: @@ -3301,6 +3464,22 @@ def iter_jsonl(path: pathlib.Path) -> cabc.Iterator[JSONValue]: return +def _discard_rest_of_line(handle: t.BinaryIO, prefix: bytes) -> None: + """Discard the unread remainder of the current physical line.""" + chunk = prefix + while chunk and not chunk.endswith(b"\n"): + chunk = handle.readline(_JSONL_SKIP_CHUNK_BYTES) + time.sleep(0) + + 
+def _is_codex_function_call_output_line(line: str) -> bool: + """Return whether a Codex JSONL line is a tool output record.""" + prefix = line[:512].replace(" ", "") + return ( + '"type":"response_item"' in prefix and '"payload":{"type":"function_call_output"' in prefix + ) + + def candidate_from_mapping( mapping: dict[str, object], *, diff --git a/src/agentgrep/ui/app.py b/src/agentgrep/ui/app.py index 6be8c42..8b4c16c 100644 --- a/src/agentgrep/ui/app.py +++ b/src/agentgrep/ui/app.py @@ -1173,6 +1173,8 @@ def _apply_progress(self, snapshot: ProgressSnapshot) -> None: f"Searching {label} | " f"{snapshot.phase} {snapshot.current}/{snapshot.total} sources" ) + if snapshot.detail: + status = f"{status} | {snapshot.detail}" elif snapshot.detail: status = f"Searching {label} | {snapshot.phase} {snapshot.detail}" else: diff --git a/tests/test_agentgrep.py b/tests/test_agentgrep.py index 2042e83..d12d2c4 100644 --- a/tests/test_agentgrep.py +++ b/tests/test_agentgrep.py @@ -621,6 +621,160 @@ def iter_records(source: object) -> cabc.Iterator[object]: assert progress.counts == [1] +def test_collect_search_records_reports_in_source_progress_and_yields_gil( + tmp_path: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Large source scans report parser progress and cooperatively yield.""" + agentgrep = t.cast("t.Any", load_agentgrep_module()) + source = agentgrep.SourceHandle( + agent="codex", + store="codex.sessions", + adapter_id="codex.sessions_jsonl.v1", + path=tmp_path / "session.jsonl", + path_kind="session_file", + source_kind="jsonl", + search_root=None, + mtime_ns=1, + ) + query = agentgrep.SearchQuery( + terms=("bliss",), + search_type="prompts", + any_term=False, + regex=False, + case_sensitive=False, + agents=("codex",), + limit=None, + dedupe=False, + ) + + class CapturingProgress: + def __init__(self) -> None: + self.source_progress_events: list[tuple[int, int, int, int]] = [] + + def source_started(self, index: int, total: int, source: object) 
-> None: ... + def source_finished( + self, + index: int, + total: int, + source: object, + records: int, + matches: int, + ) -> None: ... + def result_added(self, count: int) -> None: ... + def record_added(self, record: object) -> None: ... + + def source_progress( + self, + index: int, + total: int, + source: object, + records: int, + matches: int, + ) -> None: + self.source_progress_events.append((index, total, records, matches)) + + def iter_records(source: object) -> cabc.Iterator[object]: + for index in range(agentgrep._SOURCE_PROGRESS_RECORD_INTERVAL + 1): + yield agentgrep.SearchRecord( + kind="prompt", + agent="codex", + store="codex.sessions", + adapter_id="codex.sessions_jsonl.v1", + path=tmp_path / "session.jsonl", + text=f"bliss {index}", + ) + + sleep_calls: list[float] = [] + monkeypatch.setattr(agentgrep, "iter_source_records", iter_records) + monkeypatch.setattr(agentgrep.time, "sleep", sleep_calls.append) + progress = CapturingProgress() + + _ = agentgrep.collect_search_records(query, [source], progress=progress) + + assert progress.source_progress_events == [ + ( + 1, + 1, + agentgrep._SOURCE_PROGRESS_RECORD_INTERVAL, + agentgrep._SOURCE_PROGRESS_RECORD_INTERVAL, + ), + ] + assert sleep_calls == [0] + + +def test_iter_jsonl_cooperatively_yields_during_large_files( + tmp_path: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """JSONL parsing yields even before search records are produced.""" + agentgrep = t.cast("t.Any", load_agentgrep_module()) + path = tmp_path / "events.jsonl" + lines = [ + json.dumps({"type": "noise", "index": index}) + for index in range(agentgrep._JSONL_YIELD_LINE_INTERVAL + 1) + ] + path.write_text("\n".join(lines), encoding="utf-8") + sleep_calls: list[float] = [] + monkeypatch.setattr(agentgrep.time, "sleep", sleep_calls.append) + + parsed = list(agentgrep.iter_jsonl(path)) + + assert len(parsed) == agentgrep._JSONL_YIELD_LINE_INTERVAL + 1 + assert sleep_calls == [0] + + +def 
test_parse_codex_session_skips_function_call_output_before_json_decode( + tmp_path: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Codex tool-output lines cannot become prompt records and stay unparsed.""" + agentgrep = t.cast("t.Any", load_agentgrep_module()) + path = tmp_path / "session.jsonl" + tool_output_line = json.dumps( + { + "timestamp": "2026-01-01T00:00:00Z", + "type": "response_item", + "payload": { + "type": "function_call_output", + "call_id": "call_1", + "output": "bliss" + ("x" * agentgrep._CODEX_RAW_SKIP_MIN_BYTES), + }, + }, + ) + message_line = json.dumps( + { + "timestamp": "2026-01-01T00:00:01Z", + "type": "response_item", + "payload": {"role": "user", "content": "bliss prompt"}, + }, + ) + path.write_text(f"{tool_output_line}\n{message_line}\n", encoding="utf-8") + source = agentgrep.SourceHandle( + agent="codex", + store="codex.sessions", + adapter_id="codex.sessions_jsonl.v1", + path=path, + path_kind="session_file", + source_kind="jsonl", + search_root=None, + mtime_ns=1, + ) + decoded_payloads: list[str] = [] + original_loads = agentgrep.json.loads + + def tracking_loads(payload: str) -> object: + decoded_payloads.append(payload) + return original_loads(payload) + + monkeypatch.setattr(agentgrep.json, "loads", tracking_loads) + + records = list(agentgrep.parse_codex_session_file(source)) + + assert [record.text for record in records] == ["bliss prompt"] + assert decoded_payloads == [message_line] + + def test_streaming_search_progress_buffers_and_flushes_records( tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch, @@ -731,6 +885,7 @@ def test_streaming_search_progress_translates_progress_callbacks( progress.sources_discovered(10) progress.sources_planned(7, 10) progress.source_started(1, 7, source) + progress.source_progress(1, 7, source, records=128, matches=3) progress.source_finished(1, 7, source, records=5, matches=2) progress.result_added(2) progress.finish(2) @@ -738,7 +893,7 @@ def 
test_streaming_search_progress_translates_progress_callbacks( snapshots = [e for e in emitted if isinstance(e, agentgrep.ProgressSnapshot)] finished = [e for e in emitted if isinstance(e, agentgrep.StreamingSearchFinished)] - assert len(snapshots) == 5 + assert len(snapshots) == 6 assert snapshots[0].phase == "discovering" assert snapshots[0].query_label == "bliss" assert snapshots[1].phase == "discovered" @@ -751,8 +906,10 @@ def test_streaming_search_progress_translates_progress_callbacks( assert snapshots[3].total == 7 assert snapshots[3].detail == "session.jsonl" assert snapshots[4].phase == "scanning" - assert snapshots[4].detail is not None - assert "matches" in snapshots[4].detail + assert snapshots[4].detail == "128 records, 3 source matches" + assert snapshots[5].phase == "scanning" + assert snapshots[5].detail is not None + assert "matches" in snapshots[5].detail assert len(finished) == 1 assert finished[0].outcome == "complete" @@ -3308,6 +3465,70 @@ def test_progress_force_color_enables_auto_for_non_tty( assert "Searching bliss" in strip_ansi(out) +class ProgressLineCase(t.NamedTuple): + """Formatting case for single-line search progress.""" + + test_id: str + snapshot: object + expected: str + + +def _progress_line_cases() -> tuple[ProgressLineCase, ...]: + """Build progress-line cases after importing the runtime module.""" + agentgrep = t.cast("t.Any", load_agentgrep_module()) + return ( + ProgressLineCase( + test_id="source-count-with-detail", + snapshot=agentgrep.ProgressSnapshot( + query_label="bliss", + phase="scanning", + current=5, + total=9, + detail="128 records, 3 source matches", + matches=10, + elapsed=1.5, + ), + expected=( + "Searching bliss | scanning 5/9 sources | " + "128 records, 3 source matches | 10 matches | 1.5s" + ), + ), + ProgressLineCase( + test_id="detail-without-source-count", + snapshot=agentgrep.ProgressSnapshot( + query_label="bliss", + phase="prefiltering", + current=None, + total=None, + detail="~/.codex/sessions/", + 
matches=0, + elapsed=0.5, + ), + expected="Searching bliss | prefiltering ~/.codex/sessions/ | 0 matches | 0.5s", + ), + ) + + +_PROGRESS_LINE_CASES = _progress_line_cases() + + +@pytest.mark.parametrize( + "case", + _PROGRESS_LINE_CASES, + ids=[c.test_id for c in _PROGRESS_LINE_CASES], +) +def test_format_search_progress_line_includes_detail(case: ProgressLineCase) -> None: + """Current source detail stays visible alongside source counters.""" + agentgrep = t.cast("t.Any", load_agentgrep_module()) + + line = agentgrep.format_search_progress_line( + case.snapshot, + colors=agentgrep.AnsiColors.for_stream("never", io.StringIO()), + ) + + assert line == case.expected + + def test_non_tty_progress_emits_start_heartbeat_and_finish() -> None: agentgrep = t.cast("t.Any", load_agentgrep_module()) stream = io.StringIO() @@ -3526,9 +3747,9 @@ def test_tty_progress_interrupt_preserves_current_summary( progress.interrupt() out = stream.getvalue() - assert "Searching bliss | scanning 118/126 sources | 109 matches" in out + assert "Searching bliss | scanning 118/126 sources | rollout.jsonl | 109 matches" in out assert out.endswith("\n") - assert "\r\x1b[2KSearching bliss | scanning 118/126 sources | 109 matches" in out + assert "\r\x1b[2KSearching bliss | scanning 118/126 sources | rollout.jsonl" in out def test_tty_progress_prefilter_uses_private_directory_path( @@ -3593,7 +3814,7 @@ def test_non_tty_progress_interrupt_emits_current_summary() -> None: out = stream.getvalue() assert "Searching bliss\n" in out - assert "Searching bliss | scanning 118/126 sources | 109 matches" in out + assert "Searching bliss | scanning 118/126 sources | rollout.jsonl | 109 matches" in out def test_main_handles_keyboard_interrupt_without_traceback( From ef3a8b0fc41d03936933491d07f33c843507cd76 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 14:57:56 -0500 Subject: [PATCH 10/23] agentgrep(fix[progress]): Keep TTY progress to one row why: Showing in-source progress made the live 
TTY status line long enough to wrap on narrow terminals. The renderer only clears one terminal row with carriage-return plus clear-line, so wrapped renders leave stale rows behind and look like a flood. what: - Make TTY progress rendering terminal-width aware, dropping optional detail and the answer-now hint before ANSI-safe truncation. - Add a regression test for narrow terminal rendering. - Preserve full detail formatting for callers without a width constraint. --- src/agentgrep/__init__.py | 86 ++++++++++++++++++++++++++++++++++++--- tests/test_agentgrep.py | 46 +++++++++++++++++++++ 2 files changed, 126 insertions(+), 6 deletions(-) diff --git a/src/agentgrep/__init__.py b/src/agentgrep/__init__.py index 3d336cf..2e27470 100644 --- a/src/agentgrep/__init__.py +++ b/src/agentgrep/__init__.py @@ -124,6 +124,7 @@ "--ui", }, ) +ANSI_CSI_RE = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]") def build_description( @@ -391,6 +392,39 @@ def _hard_truncate(text: str, max_width: int) -> str: return text[: max_width - 1] + "…" +def _visible_width(text: str) -> int: + """Return display width after stripping ANSI CSI escape sequences.""" + return len(ANSI_CSI_RE.sub("", text)) + + +def _hard_truncate_ansi(text: str, max_width: int) -> str: + """Truncate ANSI-colored text to ``max_width`` visible cells.""" + if max_width <= 0: + return "" + if _visible_width(text) <= max_width: + return text + if max_width == 1: + return "…" + output: list[str] = [] + visible = 0 + index = 0 + saw_escape = False + while index < len(text) and visible < max_width - 1: + match = ANSI_CSI_RE.match(text, index) + if match is not None: + output.append(match.group(0)) + index = match.end() + saw_escape = True + continue + output.append(text[index]) + visible += 1 + index += 1 + output.append("…") + if saw_escape: + output.append(AnsiColors.RESET) + return "".join(output) + + def truncate_lines(text: str, max_lines: int) -> str: """Return the first ``max_lines`` lines of ``text``, with an overflow marker. 
@@ -1589,8 +1623,10 @@ def _tty_loop(self) -> None: self._stop_event.wait(self._refresh_interval) def _render_tty(self, frame: str) -> None: - summary = self._summary() - line = f"{self._colors.info(frame)} {summary}" + frame_text = self._colors.info(frame) + summary_width = max(1, self._terminal_width() - _visible_width(frame_text) - 1) + summary = self._summary(max_width=summary_width) + line = f"{frame_text} {summary}" with self._lock: try: self._stream.write("\r\033[2K" + line) @@ -1611,7 +1647,7 @@ def _clear_tty_line(self) -> None: self._last_line_len = 0 def _write_tty_summary_line(self) -> None: - line = self._summary() + line = self._summary(max_width=self._terminal_width()) self._write_tty_line(line) def _write_tty_line(self, line: str) -> None: @@ -1648,13 +1684,20 @@ def _emit_line(self, line: str) -> None: except OSError, ValueError: pass - def _summary(self) -> str: + def _summary(self, *, max_width: int | None = None) -> str: return format_search_progress_line( self._snapshot(), colors=self._colors, answer_now_hint=self._answer_now_hint, + max_width=max_width, ) + def _terminal_width(self) -> int: + try: + return max(1, os.get_terminal_size(self._stream.fileno()).columns) + except AttributeError, OSError, TypeError, ValueError: + return max(1, shutil.get_terminal_size(fallback=(80, 24)).columns) + def _snapshot(self) -> ProgressSnapshot: elapsed = self._elapsed_seconds() with self._lock: @@ -1743,6 +1786,7 @@ def format_search_progress_line( *, colors: SearchColors, answer_now_hint: bool = False, + max_width: int | None = None, ) -> str: """Format the single-line progress summary used by both the CLI and the TUI. @@ -1754,6 +1798,9 @@ def format_search_progress_line( An :class:`AnsiColors` instance (used by the CLI chrome). answer_now_hint : bool, default False When ``True``, append the ``[Press enter, answer now]`` reminder. + max_width : int or None, default None + Maximum visible terminal cells for the returned line. 
When set, the + formatter drops optional detail and hint segments before truncating. Returns ------- @@ -1761,12 +1808,39 @@ def format_search_progress_line( ``"Searching | N/M sources | K matches | T.Ts"`` with each segment styled through ``colors``. """ + variants = ( + (True, answer_now_hint), + (False, answer_now_hint), + (False, False), + ) + for include_detail, include_hint in variants: + line = _format_search_progress_line( + snapshot, + colors=colors, + answer_now_hint=include_hint, + include_detail=include_detail, + ) + if max_width is None or _visible_width(line) <= max_width: + return line + if max_width is None: + return line + return _hard_truncate_ansi(line, max_width) + + +def _format_search_progress_line( + snapshot: ProgressSnapshot, + *, + colors: SearchColors, + answer_now_hint: bool, + include_detail: bool, +) -> str: + """Build one progress-line variant.""" label_part = f"{colors.heading('Searching')} {colors.highlight(snapshot.query_label)}" - detail_part = colors.muted(snapshot.detail) if snapshot.detail else None + detail_part = colors.muted(snapshot.detail) if include_detail and snapshot.detail else None if snapshot.current is not None and snapshot.total is not None: count = colors.warning(f"{snapshot.current}/{snapshot.total}") status_part = f"{colors.heading(snapshot.phase)} {count} {colors.muted('sources')}" - elif snapshot.detail: + elif include_detail and snapshot.detail: status_part = f"{colors.heading(snapshot.phase)} {colors.muted(snapshot.detail)}" detail_part = None else: diff --git a/tests/test_agentgrep.py b/tests/test_agentgrep.py index d12d2c4..125b261 100644 --- a/tests/test_agentgrep.py +++ b/tests/test_agentgrep.py @@ -3688,6 +3688,52 @@ def test_tty_progress_renders_answer_now_hint() -> None: assert out.endswith("\n") +def test_tty_progress_render_fits_terminal_width( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """TTY progress renders must not wrap into uncleared terminal rows.""" + agentgrep = t.cast("t.Any", 
load_agentgrep_module()) + stream = io.StringIO() + columns = 72 + monkeypatch.setattr( + agentgrep.shutil, + "get_terminal_size", + lambda fallback: os.terminal_size((columns, 24)), + ) + progress = agentgrep.ConsoleSearchProgress( + enabled=True, + stream=stream, + tty=True, + color_mode="never", + refresh_interval=100.0, + answer_now_hint=True, + ) + query = agentgrep.SearchQuery( + terms=("libtmux",), + search_type="prompts", + any_term=False, + regex=False, + case_sensitive=False, + agents=("codex",), + limit=None, + ) + + progress.start(query) + progress._stop_tty_thread() + progress.set_status( + "scanning", + current=8, + total=3807, + detail="128 records, 0 source matches", + ) + progress.result_added(76) + progress._render_tty("⠋") + + rendered = stream.getvalue().split("\r\033[2K")[-1] + assert "\n" not in rendered + assert len(strip_ansi(rendered)) <= columns + + def test_tty_progress_answer_now_hint_is_white(monkeypatch: pytest.MonkeyPatch) -> None: agentgrep = t.cast("t.Any", load_agentgrep_module()) stream = io.StringIO() From 508c403eec053825ae48bab4429e72dacf8980dc Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 15:31:22 -0500 Subject: [PATCH 11/23] agentgrep(fix[search]): Validate regex and type predicates why: The search CLI accepted malformed regex terms until matching reached Python's regex engine, producing a traceback after scanning started. Query-language type predicates also kept the default prompt-only coarse search filter, so history records were discarded before the compiled predicate could evaluate. what: - Validate `search --regex` terms at parse time with argparse-shaped errors. - Track compiled query fields so `type:` predicates broaden the coarse search filter when `--type` was not explicit. - Treat explicit default `--type` values as flag/field collisions across search, grep, and find. - Add regression coverage for invalid search regexes, type predicate routing, and explicit default collisions. 
--- src/agentgrep/cli/parser.py | 70 +++++++++++++++++++++---------- tests/test_cli_search.py | 82 +++++++++++++++++++++++++++++++++++++ tests/test_query_engine.py | 30 ++++++++++++++ 3 files changed, 161 insertions(+), 21 deletions(-) diff --git a/src/agentgrep/cli/parser.py b/src/agentgrep/cli/parser.py index 0e63536..abf2a03 100644 --- a/src/agentgrep/cli/parser.py +++ b/src/agentgrep/cli/parser.py @@ -395,7 +395,6 @@ def create_parser( _ = grep_parser.add_argument( "--type", choices=["prompts", "history", "all"], - default="prompts", dest="search_type", help="Record type to search (default: prompts)", ) @@ -474,7 +473,6 @@ def create_parser( "--type", dest="find_type", choices=["prompts", "history", "sessions", "all"], - default="all", help="Restrict to a record kind (default: all)", ) _ = find_parser.add_argument( @@ -682,7 +680,6 @@ def create_parser( _ = search_parser.add_argument( "--type", choices=["prompts", "history", "all"], - default="prompts", dest="search_type", help="Record type to search (default: prompts)", ) @@ -766,7 +763,7 @@ def _search_explicit_flags(namespace: argparse.Namespace) -> dict[str, str]: flags: dict[str, str] = {} if t.cast("list[str]", namespace.agent): flags["agent"] = "--agent" - if t.cast("str", namespace.search_type) != "prompts": + if t.cast("str | None", namespace.search_type) is not None: flags["type"] = "--type" return flags @@ -776,7 +773,7 @@ def _grep_explicit_flags(namespace: argparse.Namespace) -> dict[str, str]: flags: dict[str, str] = {} if t.cast("list[str]", namespace.agent): flags["agent"] = "--agent" - if t.cast("str", namespace.search_type) != "prompts": + if t.cast("str | None", namespace.search_type) is not None: flags["type"] = "--type" return flags @@ -786,11 +783,25 @@ def _find_explicit_flags(namespace: argparse.Namespace) -> dict[str, str]: flags: dict[str, str] = {} if t.cast("list[str]", namespace.agent): flags["agent"] = "--agent" - if t.cast("str", namespace.find_type) != "all": + if t.cast("str | 
None", namespace.find_type) is not None: flags["type"] = "--type" return flags +def _effective_search_type( + namespace: argparse.Namespace, + *, + query_fields: set[str], +) -> SearchType: + """Return the coarse search type after query-language reconciliation.""" + explicit = t.cast("SearchType | None", namespace.search_type) + if explicit is not None: + return explicit + if "type" in query_fields: + return "all" + return "prompts" + + def _maybe_compile_query( positionals: cabc.Sequence[str], *, @@ -798,14 +809,15 @@ def _maybe_compile_query( color_mode: ColorMode, subparser: argparse.ArgumentParser, explicit_flags: dict[str, str] | None = None, -) -> tuple[CompiledQuery | None, tuple[str, ...]]: +) -> tuple[CompiledQuery | None, tuple[str, ...], set[str]]: """Detect Lucene-style query syntax in positionals and compile if present. - Returns ``(compiled, residual_terms)`` — ``compiled`` is ``None`` when - no positional contains ``:`` (legacy fast path); ``residual_terms`` + Returns ``(compiled, residual_terms, fields)`` — ``compiled`` is ``None`` + when no positional contains ``:`` (legacy fast path); ``residual_terms`` is the tuple to feed back as the legacy ``terms`` / ``patterns`` / ``pattern`` field so the engine's existing text-matching path - still has the user's text query. + still has the user's text query. ``fields`` is populated only for + query-language input so callers can reconcile equivalent CLI flags. ``explicit_flags`` maps field name → flag name. When a field also has an explicitly-set flag (e.g. ``--agent`` set AND ``agent:`` @@ -817,7 +829,7 @@ def _maybe_compile_query( traceback. 
""" if not any(":" in token for token in positionals): - return None, tuple(positionals) + return None, tuple(positionals), set() from agentgrep.query import ( QueryCompileError, QueryParseError, @@ -834,8 +846,8 @@ def _maybe_compile_query( except QueryParseError as exc: with configured_color_environment(color_mode): subparser.error(f"invalid query: {exc}") + used_fields = fields_in_ast(ast) if explicit_flags: - used_fields = fields_in_ast(ast) for field_name, flag_name in explicit_flags.items(): if field_name in used_fields: with configured_color_environment(color_mode): @@ -849,7 +861,7 @@ def _maybe_compile_query( with configured_color_environment(color_mode): subparser.error(f"invalid query: {exc}") _ = bundle # kept available for future per-bundle checks - return compiled, compiled.text_terms + return compiled, compiled.text_terms, used_fields def _check_for_mangled_field_predicate( @@ -967,7 +979,7 @@ def parse_args( raw_pattern = t.cast("str | None", namespace.pattern) find_positionals = [raw_pattern] if raw_pattern is not None else [] - find_compiled, find_residual = _maybe_compile_query( + find_compiled, find_residual, _find_query_fields = _maybe_compile_query( find_positionals, bundle=bundle, color_mode=color_mode, @@ -1006,7 +1018,7 @@ def parse_args( output_mode=output_mode, color_mode=color_mode, pattern_mode=pattern_mode, - type_filter=t.cast("FindTypeFilter", namespace.find_type), + type_filter=t.cast("FindTypeFilter", namespace.find_type or "all"), extensions=tuple(t.cast("list[str]", namespace.find_extensions)), case_mode=find_case_mode, list_details=t.cast("bool", namespace.list_details), @@ -1048,7 +1060,7 @@ def _build_grep_args( pattern_mode = "regex" patterns_list_raw = t.cast("list[str]", namespace.patterns) - grep_compiled, residual_patterns = _maybe_compile_query( + grep_compiled, residual_patterns, grep_query_fields = _maybe_compile_query( patterns_list_raw, bundle=bundle, color_mode=color_mode, @@ -1109,7 +1121,10 @@ def 
_build_grep_args( return GrepArgs( patterns=tuple(patterns_list), agents=agents, - search_type=t.cast("SearchType", namespace.search_type), + search_type=_effective_search_type( + namespace, + query_fields=grep_query_fields, + ), case_mode=case_mode, pattern_mode=pattern_mode, invert_match=invert_match, @@ -1157,7 +1172,7 @@ def _build_search_args( "--threshold has no effect with --no-rank (ranking is disabled)", ) - search_compiled, residual_terms = _maybe_compile_query( + search_compiled, residual_terms, search_query_fields = _maybe_compile_query( terms_list, bundle=bundle, color_mode=color_mode, @@ -1167,14 +1182,27 @@ def _build_search_args( final_terms: tuple[str, ...] = ( residual_terms if search_compiled is not None else tuple(terms_list) ) + regex = t.cast("bool", namespace.regex) + case_sensitive = t.cast("bool", namespace.case_sensitive) + if regex: + flags = 0 if case_sensitive else re.IGNORECASE + for term in final_terms: + try: + _ = re.compile(term, flags) + except re.error as exc: + with configured_color_environment(color_mode): + bundle.search_parser.error(f"invalid regex {term!r}: {exc}") return SearchArgs( terms=final_terms, agents=agents, - search_type=t.cast("SearchType", namespace.search_type), + search_type=_effective_search_type( + namespace, + query_fields=search_query_fields, + ), any_term=t.cast("bool", namespace.any_term), - regex=t.cast("bool", namespace.regex), - case_sensitive=t.cast("bool", namespace.case_sensitive), + regex=regex, + case_sensitive=case_sensitive, limit=limit, output_mode=output_mode, color_mode=color_mode, diff --git a/tests/test_cli_search.py b/tests/test_cli_search.py index bc8805c..ba886fa 100644 --- a/tests/test_cli_search.py +++ b/tests/test_cli_search.py @@ -250,6 +250,88 @@ def test_search_parse_agent_filter() -> None: assert parsed.agents == ("codex",) +class SearchInvalidRegexCase(t.NamedTuple): + """Parametrized case for ``search --regex`` validation.""" + + test_id: str + pattern: str + 
expected_msg_fragment: str + + +SEARCH_INVALID_REGEX_CASES: tuple[SearchInvalidRegexCase, ...] = ( + SearchInvalidRegexCase( + test_id="unterminated-charset", + pattern="[", + expected_msg_fragment="unterminated character set", + ), + SearchInvalidRegexCase( + test_id="unclosed-paren", + pattern="(unclosed", + expected_msg_fragment="unterminated subpattern", + ), + SearchInvalidRegexCase( + test_id="bad-backref", + pattern=r"\1", + expected_msg_fragment="invalid group reference", + ), +) + + +@pytest.mark.parametrize( + "case", + SEARCH_INVALID_REGEX_CASES, + ids=[case.test_id for case in SEARCH_INVALID_REGEX_CASES], +) +def test_search_invalid_regex_exits_with_clean_error( + case: SearchInvalidRegexCase, + capsys: pytest.CaptureFixture[str], +) -> None: + """``agentgrep search --regex <pattern>`` exits before scanning.""" + with pytest.raises(SystemExit) as exc_info: + _ = agentgrep.parse_args(("search", "--regex", case.pattern)) + assert exc_info.value.code == 2 + captured = capsys.readouterr() + assert "invalid regex" in captured.err + assert case.expected_msg_fragment in captured.err + assert "Traceback" not in captured.err + + +def test_search_type_field_broadens_coarse_search_type() -> None: + """A query-language ``type:`` predicate controls record-kind filtering.""" + parsed = agentgrep.parse_args(("search", "type:history", "bliss")) + assert isinstance(parsed, agentgrep.SearchArgs) + assert parsed.search_type == "all" + assert parsed.terms == ("bliss",) + assert parsed.compiled is not None + + +def test_search_type_field_history_record_reaches_compiled_predicate() -> None: + """``type:history`` must not be pre-filtered by the default prompts scope.""" + parsed = agentgrep.parse_args(("search", "type:history", "bliss")) + assert isinstance(parsed, agentgrep.SearchArgs) + record = agentgrep.SearchRecord( + kind="history", + agent="codex", + store="history", + adapter_id="codex.history_json.v1", + path=pathlib.Path("/tmp/history.json"), + text="bliss command", + ) +
query = agentgrep.SearchQuery( + terms=parsed.terms, + search_type=parsed.search_type, + any_term=parsed.any_term, + regex=parsed.regex, + case_sensitive=parsed.case_sensitive, + agents=parsed.agents, + limit=parsed.limit, + compiled=parsed.compiled, + ) + + assert query.search_type == "all" + assert agentgrep.matches_record(record, query) + + # --------------------------------------------------------------------------- # Integration tests # --------------------------------------------------------------------------- diff --git a/tests/test_query_engine.py b/tests/test_query_engine.py index 7221979..e7e4594 100644 --- a/tests/test_query_engine.py +++ b/tests/test_query_engine.py @@ -653,6 +653,16 @@ class QueryPassesThroughCase(t.NamedTuple): argv=("find", "agent:codex"), expect_compiled=True, ), + QueryPassesThroughCase( + test_id="search-bare-term-legacy-path", + argv=("search", "bliss"), + expect_compiled=False, + ), + QueryPassesThroughCase( + test_id="search-field-syntax-compiled", + argv=("search", "agent:codex", "bliss"), + expect_compiled=True, + ), ) @@ -799,6 +809,26 @@ class FlagFieldCollisionCase(t.NamedTuple): argv=("grep", "--type", "history", "type:prompts", "bliss"), expected_message_fragment="cannot combine --type flag with type: field", ), + FlagFieldCollisionCase( + test_id="grep-default-type-flag-and-field", + argv=("grep", "--type", "prompts", "type:history", "bliss"), + expected_message_fragment="cannot combine --type flag with type: field", + ), + FlagFieldCollisionCase( + test_id="search-type-flag-and-field", + argv=("search", "--type", "history", "type:prompts", "bliss"), + expected_message_fragment="cannot combine --type flag with type: field", + ), + FlagFieldCollisionCase( + test_id="search-default-type-flag-and-field", + argv=("search", "--type", "prompts", "type:history", "bliss"), + expected_message_fragment="cannot combine --type flag with type: field", + ), + FlagFieldCollisionCase( + test_id="find-default-type-flag-and-field", + 
argv=("find", "--type", "all", "type:history"), + expected_message_fragment="cannot combine --type flag with type: field", + ), ) From 457128d3ea66543c142cba71d6705f100a2f10e4 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 15:53:04 -0500 Subject: [PATCH 12/23] agentgrep(fix[parser]): Use correct subparser for --limit/--max-count errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: Validation errors for --limit and --max-count called the root parser's .error(), showing `usage: agentgrep [-h] ...` instead of the subcommand's usage hint. what: - find --limit: bundle.parser → bundle.find_parser - search --limit: bundle.parser → bundle.search_parser - grep --max-count: bundle.parser → bundle.grep_parser --- src/agentgrep/cli/parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentgrep/cli/parser.py b/src/agentgrep/cli/parser.py index abf2a03..ed41975 100644 --- a/src/agentgrep/cli/parser.py +++ b/src/agentgrep/cli/parser.py @@ -975,7 +975,7 @@ def parse_args( limit = t.cast("int | None", namespace.limit) if limit is not None and limit < 1: with configured_color_environment(color_mode): - bundle.parser.error("--limit must be greater than 0") + bundle.find_parser.error("--limit must be greater than 0") raw_pattern = t.cast("str | None", namespace.pattern) find_positionals = [raw_pattern] if raw_pattern is not None else [] @@ -1043,7 +1043,7 @@ def _build_grep_args( max_count = t.cast("int | None", namespace.max_count) if max_count is not None and max_count < 1: with configured_color_environment(color_mode): - bundle.parser.error("--max-count must be greater than 0") + bundle.grep_parser.error("--max-count must be greater than 0") if t.cast("bool", namespace.ignore_case): case_mode: CaseMode = "ignore" @@ -1160,7 +1160,7 @@ def _build_search_args( limit = t.cast("int | None", namespace.limit) if limit is not None and limit < 1: with 
configured_color_environment(color_mode): - bundle.parser.error("--limit must be greater than 0") + bundle.search_parser.error("--limit must be greater than 0") threshold = t.cast("int", namespace.threshold) if threshold < 0 or threshold > 100: with configured_color_environment(color_mode): From e7bdf9074ac3cbe30730815a42d2a00eda629ce4 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 15:54:23 -0500 Subject: [PATCH 13/23] agentgrep(refactor[jsonl]): Remove dead skip_line check in _iter_jsonl why: The early return at the top of _iter_jsonl dispatches to _iter_jsonl_with_raw_skip when skip_line is set, making the inline `if skip_line is not None` check unreachable. what: - Remove the dead branch from the text-mode iteration path --- src/agentgrep/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/agentgrep/__init__.py b/src/agentgrep/__init__.py index 2e27470..9468bd7 100644 --- a/src/agentgrep/__init__.py +++ b/src/agentgrep/__init__.py @@ -3485,8 +3485,6 @@ def _iter_jsonl( decoded_lines += 1 if decoded_lines % _JSONL_YIELD_LINE_INTERVAL == 0: time.sleep(0) - if skip_line is not None and skip_line(stripped): - continue try: parsed = t.cast("object", json.loads(stripped)) except json.JSONDecodeError: From 3b25c598e34598ae8f1acc3377a25044ad4eefac Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 15:58:12 -0500 Subject: [PATCH 14/23] agentgrep(fix[ranking]): Remove size guard, decouple collapse from --no-rank MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: collapse_near_duplicates silently turned itself off at 500 records, and --no-rank silently skipped dedup. Both hacks avoided the O(n²) cost instead of letting the C-accelerated WRatio calls do their job. Ranking and dedup are independent features — a user who wants discovery-order results should still get dedup. 
what: - Remove the 500-record size guard from collapse_near_duplicates - Always run collapse_near_duplicates regardless of --no-rank - Fix docstring: "above" → "at or above" for >= threshold --- src/agentgrep/cli/render.py | 8 ++++---- src/agentgrep/ranking.py | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/agentgrep/cli/render.py b/src/agentgrep/cli/render.py index eb42527..04ecf67 100644 --- a/src/agentgrep/cli/render.py +++ b/src/agentgrep/cli/render.py @@ -492,13 +492,13 @@ def run_search_command(args: SearchArgs) -> int: query_text = " ".join(args.terms) if args.no_rank: scored: list[tuple[agentgrep.SearchRecord, float]] = [(r, 0.0) for r in records] - collapsed: list[tuple[agentgrep.SearchRecord, float, int]] = [(r, 0.0, 0) for r in records] else: - from agentgrep.ranking import collapse_near_duplicates, rank_search_records + from agentgrep.ranking import rank_search_records scored = rank_search_records(records, query_text, threshold=args.threshold) - collapsed = collapse_near_duplicates(scored) - from agentgrep.ranking import group_by_session + from agentgrep.ranking import collapse_near_duplicates, group_by_session + + collapsed = collapse_near_duplicates(scored) if args.limit is not None: collapsed = collapsed[: args.limit] diff --git a/src/agentgrep/ranking.py b/src/agentgrep/ranking.py index 32eeec7..da42ad5 100644 --- a/src/agentgrep/ranking.py +++ b/src/agentgrep/ranking.py @@ -67,12 +67,16 @@ def collapse_near_duplicates( ) -> list[tuple[SearchRecord, float, int]]: """Collapse near-duplicate records, keeping highest-scored representative. + Pairwise ``WRatio`` comparison between record texts (each call is + C-accelerated by rapidfuzz). Records at or above the similarity + threshold are folded into the highest-scoring representative. + Parameters ---------- scored : list[tuple[SearchRecord, float]] Pre-sorted ``(record, score)`` pairs (best-first). 
similarity_threshold : float - WRatio ceiling — record pairs scoring above this are + WRatio ceiling — record pairs scoring at or above this are considered near-duplicates. Returns @@ -85,8 +89,6 @@ def collapse_near_duplicates( if not scored: return [] - if len(scored) > 500: - return [(r, s, 0) for r, s in scored] result: list[tuple[SearchRecord, float, int]] = [] consumed: set[int] = set() for i, (record_i, score_i) in enumerate(scored): From 438047d282fa9af24761ac2e0de4a295708505c7 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 15:59:45 -0500 Subject: [PATCH 15/23] agentgrep(docs[render]): Accurate run_search_command docstring why: Docstring described scoring/collapse/grouping as unconditional but --no-rank skips scoring and --no-group skips grouping. what: - Note --no-rank and --no-group bypass paths in the docstring --- src/agentgrep/cli/render.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/agentgrep/cli/render.py b/src/agentgrep/cli/render.py index 04ecf67..334113b 100644 --- a/src/agentgrep/cli/render.py +++ b/src/agentgrep/cli/render.py @@ -430,10 +430,11 @@ def run_ui_command(args: UIArgs) -> int: def run_search_command(args: SearchArgs) -> int: """Execute ``agentgrep search`` with ranking and grouping. - Collects all matching records eagerly, scores them by rapidfuzz - relevance, collapses near-duplicates, groups by session, and - renders in the requested output format. Returns ``0`` when at - least one result survives ranking, ``1`` otherwise. + Collects all matching records eagerly, then applies a three-stage + pipeline: score by rapidfuzz relevance (skipped with ``--no-rank``), + collapse near-duplicates, and group by session (skipped with + ``--no-group``). Returns ``0`` when at least one result survives, + ``1`` otherwise. 
""" if not args.terms and args.output_mode != "ui": msg = "search requires at least one term unless --ui is used" From d970c05942255cab39b12ac8848c455a1a688663 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 16:32:00 -0500 Subject: [PATCH 16/23] =?UTF-8?q?agentgrep(docs[ranking]):=20Fix=20module?= =?UTF-8?q?=20docstring=20"above"=20=E2=86=92=20"at=20or=20above"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: Function-level docstring was fixed to match >= semantics but module docstring still said "above" (implying >). what: - Change "records above" to "records at or above" in module docstring to match the >= comparison in the implementation --- src/agentgrep/ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentgrep/ranking.py b/src/agentgrep/ranking.py index da42ad5..48bf16e 100644 --- a/src/agentgrep/ranking.py +++ b/src/agentgrep/ranking.py @@ -6,7 +6,7 @@ 1. :func:`rank_search_records` — score each record against the query text with rapidfuzz WRatio, filter by threshold, sort best-first. 2. :func:`collapse_near_duplicates` — pairwise WRatio between record - bodies; records above the similarity ceiling are folded into the + bodies; records at or above the similarity ceiling are folded into the highest-scoring representative. 3. :func:`group_by_session` — bucket the surviving records by ``session_id``, preserving score order within each group. From 0e15c13cdeae5d76bac782444f2c9a1591049dc1 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 16:35:27 -0500 Subject: [PATCH 17/23] agentgrep(fix[test]): Replace tautological assertion in threshold test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: `assert code in (0, 1)` is always true. The canned records score 90 against "bliss" so threshold=99 always filters all of them — code is deterministically 1. 
what: - Assert code == 1 and empty stdout directly - Remove narration comments --- tests/test_cli_search.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_cli_search.py b/tests/test_cli_search.py index ba886fa..f507aef 100644 --- a/tests/test_cli_search.py +++ b/tests/test_cli_search.py @@ -450,15 +450,11 @@ def test_search_threshold_filters_low_scores( "run_search_query", lambda *_args, **_kwargs: canned, ) - # Very high threshold should filter most records args = _make_search_args(terms=("bliss",), threshold=99) code = run_search_command(args) captured = capsys.readouterr() - # With threshold=99, only near-exact matches survive (or none) - # The exit code reflects whether any results remain - assert code in (0, 1) - if code == 1: - assert captured.out.strip() == "" + assert code == 1 + assert captured.out.strip() == "" def test_search_json_includes_scores( From 227a62851eb9c3df5bcd86f214219f5dfbf5b3a6 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 17:48:53 -0500 Subject: [PATCH 18/23] agentgrep(fix[search]): Allow field-only queries without text terms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: `agentgrep search agent:codex` raised SystemExit even though a compiled field query existed. The guard only checked for empty terms, not for a compiled query. Additionally, field-only queries produce empty query_text which makes WRatio return 0 for everything — ranking is skipped in that case. 
what: - Check args.compiled before rejecting empty terms - Skip ranking when query_text is empty (field-only query) - Add test for field-only query parsing and execution --- src/agentgrep/cli/render.py | 5 +++-- tests/test_cli_search.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/agentgrep/cli/render.py b/src/agentgrep/cli/render.py index 334113b..7fe4231 100644 --- a/src/agentgrep/cli/render.py +++ b/src/agentgrep/cli/render.py @@ -436,7 +436,7 @@ def run_search_command(args: SearchArgs) -> int: ``--no-group``). Returns ``0`` when at least one result survives, ``1`` otherwise. """ - if not args.terms and args.output_mode != "ui": + if not args.terms and args.compiled is None and args.output_mode != "ui": msg = "search requires at least one term unless --ui is used" raise SystemExit(msg) query = agentgrep.SearchQuery( @@ -491,7 +491,8 @@ def run_search_command(args: SearchArgs) -> int: if listener is not None: listener.stop() query_text = " ".join(args.terms) - if args.no_rank: + answered_early = control.answer_now_requested() + if args.no_rank or answered_early or not query_text: scored: list[tuple[agentgrep.SearchRecord, float]] = [(r, 0.0) for r in records] else: from agentgrep.ranking import rank_search_records diff --git a/tests/test_cli_search.py b/tests/test_cli_search.py index f507aef..ceff870 100644 --- a/tests/test_cli_search.py +++ b/tests/test_cli_search.py @@ -400,6 +400,26 @@ def test_search_command_no_terms_raises() -> None: run_search_command(args) +def test_search_field_only_query_allowed( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """Field-only queries like agent:codex work without text terms.""" + parsed = agentgrep.parse_args(("search", "agent:codex")) + assert parsed is not None + assert isinstance(parsed, agentgrep.SearchArgs) + assert parsed.compiled is not None + assert parsed.terms == () + canned = _canned_records() + monkeypatch.setattr( + 
agentgrep, + "run_search_query", + lambda *_args, **_kwargs: canned, + ) + code = run_search_command(parsed) + assert code == 0 + + def test_search_routes_through_ranking( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], From c79b1222f526d47b56fccf61ecc450ab9145f582 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 18:35:50 -0500 Subject: [PATCH 19/23] agentgrep(fix[search]): Skip ranking and collapse on answer-now MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: When the user pressed Enter for partial results, the "Answering now: N matches" message appeared but then the CLI hung for minutes running rank_search_records (O(n) WRatio calls) and collapse_near_duplicates (O(n²) pairwise) on potentially thousands of partial results — defeating the purpose of answering now. what: - Check control.answer_now_requested() after collection returns - Skip both ranking and collapse when answering early — emit records in discovery order with score=0, similar_count=0 - Collapse still runs normally for --no-rank (only answer-now bypasses it, preserving the earlier decoupling) From 8f871cfc6c5b6c0b73c652906b6db28aff64e4cf Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 18:35:57 -0500 Subject: [PATCH 20/23] agentgrep(test[parser]): Cover --threshold + --no-rank rejection why: The parser guard rejecting --threshold with --no-rank had no test verifying the error fires. 
what: - Add test_search_threshold_with_no_rank_rejected asserting SystemExit code 2 and error message mentioning both flags --- tests/test_cli_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_cli_search.py b/tests/test_cli_search.py index ceff870..77e1625 100644 --- a/tests/test_cli_search.py +++ b/tests/test_cli_search.py @@ -454,7 +454,6 @@ def test_search_no_rank_preserves_order( assert code == 0 captured = capsys.readouterr() lines = captured.out.strip().splitlines() - # With no_rank, scores are 0 — all matching records appear score_lines = [line for line in lines if line.startswith("0")] assert len(score_lines) >= 1 From 5fbe9ce5b595cc7683d45b74fe7e77368db0a781 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 19:39:29 -0500 Subject: [PATCH 21/23] =?UTF-8?q?agentgrep(feat[search]):=20Stream=20searc?= =?UTF-8?q?h=20results,=20drop=20O(n=C2=B2)=20collapse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: collapse_near_duplicates ran O(n²) pairwise WRatio on the full result set (~612M comparisons for 35K records), hanging the CLI indefinitely. The engine already does exact dedup via hash-based record_dedupe_key. Both grep and the TUI stream results without pairwise dedup and work at scale. 
what: - Rewrite run_search_command to stream via iter_search_events, scoring each record inline with WRatio as it arrives (O(n)) - Remove collapse_near_duplicates from the pipeline entirely - Text mode streams with session headers and per-record scores - JSON/NDJSON stays eager for envelope integrity but skips collapse — ranking + grouping only - Pass args.limit to SearchQuery so the engine caps early - Apply post-ranking limit in eager path for JSON accuracy - Update tests: remove similar_count assertions, fix monkeypatching for streaming vs eager paths --- src/agentgrep/cli/render.py | 117 ++++++++++++++++++++++-------------- tests/test_cli_search.py | 11 +--- 2 files changed, 76 insertions(+), 52 deletions(-) diff --git a/src/agentgrep/cli/render.py b/src/agentgrep/cli/render.py index 7fe4231..74d59f4 100644 --- a/src/agentgrep/cli/render.py +++ b/src/agentgrep/cli/render.py @@ -428,13 +428,13 @@ def run_ui_command(args: UIArgs) -> int: def run_search_command(args: SearchArgs) -> int: - """Execute ``agentgrep search`` with ranking and grouping. + """Execute ``agentgrep search`` with ranked, pretty output. - Collects all matching records eagerly, then applies a three-stage - pipeline: score by rapidfuzz relevance (skipped with ``--no-rank``), - collapse near-duplicates, and group by session (skipped with - ``--no-group``). Returns ``0`` when at least one result survives, - ``1`` otherwise. + Collects all matching records eagerly with a progress spinner, + scores them by rapidfuzz partial_ratio (skipped with ``--no-rank`` + or on answer-now), groups by session (skipped with ``--no-group``), + and renders with snippet-first pretty output. Returns ``0`` when + at least one result survives, ``1`` otherwise. 
""" if not args.terms and args.compiled is None and args.output_mode != "ui": msg = "search requires at least one term unless --ui is used" @@ -446,7 +446,7 @@ def run_search_command(args: SearchArgs) -> int: regex=args.regex, case_sensitive=args.case_sensitive, agents=args.agents, - limit=None, + limit=args.limit, compiled=args.compiled, ) if args.output_mode == "ui": @@ -457,8 +457,10 @@ def run_search_command(args: SearchArgs) -> int: initial_search_text=args.raw_query or None, ) return 0 + if args.output_mode in ("json", "ndjson"): + return _run_search_eager(args, query) control = agentgrep.SearchControl() - human_output = args.output_mode in {"text", "ui"} + human_output = args.output_mode == "text" progress_enabled = args.progress_mode == "always" or ( args.progress_mode == "auto" and human_output ) @@ -498,31 +500,37 @@ def run_search_command(args: SearchArgs) -> int: from agentgrep.ranking import rank_search_records scored = rank_search_records(records, query_text, threshold=args.threshold) - from agentgrep.ranking import collapse_near_duplicates, group_by_session + if args.limit is not None: + scored = scored[: args.limit] + from agentgrep.ranking import group_by_session - collapsed = collapse_near_duplicates(scored) + grouped = group_by_session([(r, s, 0) for r, s in scored]) + _print_search_text(grouped, args) + return 0 if scored else 1 - if args.limit is not None: - collapsed = collapsed[: args.limit] - if args.no_group: - groups: list[tuple[str | None, list[tuple[agentgrep.SearchRecord, float, int]]]] = [ - (None, collapsed), - ] - else: - groups = group_by_session(collapsed) - if args.output_mode in ("json", "ndjson"): - _print_search_json(groups, args) - return 0 if collapsed else 1 - _print_search_text(groups, args) - return 0 if collapsed else 1 + +def _compile_search_patterns(args: SearchArgs) -> list[re.Pattern[str]]: + """Compile search terms to regex for snippet highlighting.""" + flags = 0 if args.case_sensitive else re.IGNORECASE + compiled: 
list[re.Pattern[str]] = [] + for term in args.terms: + if ":" in term: + continue + source = term if args.regex else re.escape(term) + try: + compiled.append(re.compile(source, flags)) + except re.error: + continue + return compiled def _print_search_text( groups: list[tuple[str | None, list[tuple[agentgrep.SearchRecord, float, int]]]], args: SearchArgs, ) -> None: - """Render search results with scores and duplicate counts to stdout.""" + """Render ranked search results with pretty snippets.""" colors = agentgrep.AnsiColors.for_stream(args.color_mode, sys.stdout) + patterns = _compile_search_patterns(args) first_group = True for session_id, entries in groups: if not first_group: @@ -530,32 +538,52 @@ def _print_search_text( first_group = False if session_id is not None and not args.no_group: print(colors.heading(f"[session {session_id[:12]}]")) - for record, score, similar_count in entries: - path = agentgrep.format_display_path(record.path) - score_label = colors.warning(f"{score:.0f}") - snippet = record.text[:120].replace("\n", " ") - similar_label = "" - if similar_count > 0: - similar_label = colors.muted(f" (+{similar_count} similar)") - header = f" {colors.path(path)} {colors.muted(record.agent)}" + for record, _score, _similar in entries: + lines: list[str] = [] + if record.text: + snippet, remaining = extract_search_snippet(record.text, patterns) + highlighted = highlight_search_spans(snippet, patterns, colors=colors) + lines.append(highlighted) + if remaining > 0: + lines.append(colors.dim(f" ... 
{remaining} more lines")) + provenance_parts: list[str] = [record.agent, record.kind] if record.timestamp: - header += f" {colors.muted(record.timestamp)}" - print(f"{score_label} {snippet}{similar_label}") - print(header) + provenance_parts.append(format_relative_time(record.timestamp)) + provenance_parts.append( + colors.path(agentgrep.format_display_path(record.path)), + ) + lines.append(colors.dim(f" {' · '.join(provenance_parts)}")) + print("\n".join(lines)) + print() -def _print_search_json( - groups: list[tuple[str | None, list[tuple[agentgrep.SearchRecord, float, int]]]], - args: SearchArgs, -) -> None: - """Render search results as JSON with scores.""" +def _run_search_eager(args: SearchArgs, query: agentgrep.SearchQuery) -> int: + """Eager search for JSON/NDJSON output with ranking but no pairwise dedup.""" + control = agentgrep.SearchControl() + records = agentgrep.run_search_query( + pathlib.Path.home(), + query, + progress=agentgrep.noop_search_progress(), + control=control, + ) + query_text = " ".join(args.terms) + if args.no_rank or not query_text: + scored: list[tuple[agentgrep.SearchRecord, float]] = [(r, 0.0) for r in records] + else: + from agentgrep.ranking import rank_search_records + + scored = rank_search_records(records, query_text, threshold=args.threshold) + if args.limit is not None: + scored = scored[: args.limit] + from agentgrep.ranking import group_by_session + + grouped = group_by_session([(r, s, 0) for r, s in scored]) serialize_search, _, serialize_envelope = maybe_build_pydantic() results: list[dict[str, object]] = [] - for session_id, entries in groups: - for record, score, similar_count in entries: - entry = serialize_search(record) + for session_id, entries in grouped: + for record, score, _similar in entries: + entry = dict(serialize_search(record)) entry["score"] = score - entry["similar_count"] = similar_count if session_id is not None: entry["group_session_id"] = session_id results.append(entry) @@ -572,6 +600,7 @@ def 
_print_search_json( else: for result in results: print(json.dumps(result, ensure_ascii=False)) + return 0 if results else 1 def _compile_grep_patterns(args: GrepArgs) -> list[re.Pattern[str]]: diff --git a/tests/test_cli_search.py b/tests/test_cli_search.py index 77e1625..26c825f 100644 --- a/tests/test_cli_search.py +++ b/tests/test_cli_search.py @@ -453,9 +453,7 @@ def test_search_no_rank_preserves_order( code = run_search_command(args) assert code == 0 captured = capsys.readouterr() - lines = captured.out.strip().splitlines() - score_lines = [line for line in lines if line.startswith("0")] - assert len(score_lines) >= 1 + assert "bliss" in captured.out.lower() def test_search_threshold_filters_low_scores( @@ -480,7 +478,7 @@ def test_search_json_includes_scores( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - """--json output includes score and similar_count fields.""" + """--json output includes score fields.""" canned = _canned_records() monkeypatch.setattr( agentgrep, @@ -495,16 +493,14 @@ def test_search_json_includes_scores( assert "results" in payload for result in payload["results"]: assert "score" in result - assert "similar_count" in result assert isinstance(result["score"], (int, float)) - assert isinstance(result["similar_count"], int) def test_search_ndjson_includes_scores( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - """--ndjson output includes score and similar_count in each line.""" + """--ndjson output includes score in each line.""" canned = _canned_records() monkeypatch.setattr( agentgrep, @@ -520,7 +516,6 @@ def test_search_ndjson_includes_scores( for line in lines: obj = json.loads(line) assert "score" in obj - assert "similar_count" in obj def test_search_empty_results_returns_1( From 682cc7e910d12ac37d9494bf89c6c78c9b7c36c5 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 21:56:07 -0500 Subject: [PATCH 22/23] agentgrep(fix[packaging]): Add readme field 
to project metadata why: Without `readme = "README.md"` in [project], hatchling does not include the README in package metadata, so the PyPI page is blank. what: - Add `readme = "README.md"` to [project] table --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 1ff5a89..9722564 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ classifiers = [ ] keywords = ["ai", "codex", "claude", "cursor", "gemini", "mcp", "search", "agent-history"] +readme = "README.md" packages = [ { include = "*", from = "src" }, ] From ad1156304827abf3dab7bed2bacfbe6a26d1ac16 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 24 May 2026 22:08:55 -0500 Subject: [PATCH 23/23] docs(CHANGES) search: ranked results with session grouping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit why: search was removed (#19) then reintroduced (#20) in the same release cycle — the net change is that search gained ranking, not that it was removed. Replace the stale breaking-change entry with the shipped feature. what: - Remove "Remove search subcommand" breaking change (branch-internal) - Add What's new entry for ranked search with session grouping --- CHANGES | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGES b/CHANGES index f1a7dfe..5a5a89e 100644 --- a/CHANGES +++ b/CHANGES @@ -42,15 +42,15 @@ $ uvx --from 'agentgrep' --prerelease allow python -### Breaking changes - -#### Remove `search` subcommand (#19) +### What's new -`agentgrep search` is removed. Use `agentgrep grep` for the same -matching engine. `search` will return with rapidfuzz ranking, -near-duplicate collapsing, and session grouping. +#### `search`: Relevance-ranked results with session grouping (#20) -### What's new +`search` now scores results by rapidfuzz relevance, sorts +best-first, and groups by session. Progress spinner with +Enter-to-answer-now during collection. 
Pretty snippet-first +output with amber highlights. Flags: `--threshold`, +`--no-rank`, `--no-group`. #### New flag: `--style=pretty` for `grep` (#18)