diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..b420d9c31 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,14 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + # Enable version updates for npm + - package-ecosystem: "npm" + # Look for `package.json` and `lock` files in the `root` directory + directory: "/" + # Check the npm registry for updates every day (weekdays) + schedule: + interval: "daily" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 47b1a2040..8fb063350 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,8 +26,28 @@ jobs: - name: Lint run: npm run lint - - name: Verify - run: npm run verify + - name: Short tests (no bench) + run: npm run test-all-no-bench - - name: Fixture smoke - run: npm run fixture-smoke + windows: + runs-on: windows-latest + env: + PAIROFCLEATS_EMBEDDINGS: stub + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: npm + + - name: Install deps + run: npm ci + + - name: Windows regression lane + run: | + node tests/worker-pool-windows.js + node tests/search-windows-path-filter.js + node tests/fixture-parity.js --fixtures sample diff --git a/.gitignore b/.gitignore index 7a18797b7..86d38b692 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,37 @@ index-code/ index-prose/ ci-artifacts/ tests/.cache/ +tests/.logs/ +benchmarks/repos/ +benchmarks/cache/ +benchmarks/results/ docs/benchmarks.json docs/phase3-parity-report.json *.db *.db-shm *.db-wal +__pycache__/ +*.py[cod] +*.pyo +*$py.class +.Python +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.pytype/ +.coverage +coverage.xml +htmlcov/ +.tox/ +.nox/ +.venv/ +venv/ +ENV/ +env/ +env.bak/ +venv.bak/ +*.egg +*.egg-info/ +.eggs/ +pip-wheel-metadata/ +.pairofcleats/ diff --git a/.pairofcleats.json b/.pairofcleats.json index 9e8dec64c..68af92315 100644 --- a/.pairofcleats.json +++ b/.pairofcleats.json @@ -1,101 +1,109 @@ { - "dictionary": { - "languages": [ - "en" - ], - "includeSlang": true, - "enableRepoDictionary": false, - "dir": "", - "files": [], - "slangDirs": [], - "slangFiles": [] - }, - "cache": { - "root": "" - }, + // Enable sqlite index artifacts for search backends. + // Speed impact: adds sqlite build time when stage4 runs. "sqlite": { - "use": true, - "dbDir": "", - "annMode": "extension", - "compactOnIncremental": false, - "vectorExtension": { - "provider": "sqlite-vec", - "dir": "", - "path": "", - "table": "dense_vectors_ann", - "column": "embedding", - "encoding": "float32", - "options": "" - } + // Toggle sqlite index usage/artifact generation. + // Speed impact: enabling adds some indexing time and disk usage. + "use": true }, + // Search defaults for query-time behavior. + // Speed impact: no direct impact on indexing speed. "search": { + // Prefer ANN search by default when multiple backends exist. + // Speed impact: no impact on indexing; affects query latency/recall. 
"annDefault": true, - "sqliteFtsNormalize": false, - "queryCache": { - "enabled": false, - "maxEntries": 200, - "ttlMs": 0 - } - }, - "triage": { - "recordsDir": "", - "storeRawPayload": false, - "promoteFields": [ - "recordType", - "source", - "recordId", - "service", - "env", - "team", - "owner", - "vulnId", - "cve", - "packageName", - "packageEcosystem", - "severity", - "status", - "assetId" - ], - "contextPack": { - "maxHistory": 5, - "maxEvidencePerQuery": 5 - } + // Dense vector combination strategy for search. + // Speed impact: minor impact on embedding/storage cost during indexing. + "denseVectorMode": "merged" }, + // Index build pipeline options. + // Speed impact: many flags here change CPU/IO per file. "indexing": { - "concurrency": 4, - "importConcurrency": 4, - "astDataflow": true, - "controlFlow": true, - "riskAnalysis": true, - "riskAnalysisCrossFile": true, - "typeInference": false, - "typeInferenceCrossFile": false, + "workerPool": { + "enabled": true, + "maxWorkers": 8 + }, + // Sparse postings generation settings. + // Speed impact: heavier postings settings increase indexing time/size. "postings": { + // Build phrase n-gram postings. + // Speed impact: increases indexing time and index size. "enablePhraseNgrams": true, + // Smallest phrase n-gram length. + // Speed impact: lower values add more n-grams and cost. "phraseMinN": 2, + // Largest phrase n-gram length. + // Speed impact: higher values increase indexing time and size. "phraseMaxN": 4, + // Build chargram postings for fuzzy matching. + // Speed impact: noticeable extra CPU and disk usage. "enableChargrams": true, + // Smallest chargram length. + // Speed impact: lower values increase chargram volume and cost. "chargramMinN": 3, - "chargramMaxN": 5 - } - }, - "sql": { - "dialect": "", - "dialectByExt": { - ".psql": "postgres", - ".pgsql": "postgres", - ".mysql": "mysql", - ".sqlite": "sqlite" + // Largest chargram length. + // Speed impact: higher values increase chargram volume and cost. + "chargramMaxN": 5, + // Choose which fields contribute chargrams. + // Speed impact: more fields increase indexing work. + "chargramSource": "fields", + // Cap token length eligible for chargrams. + // Speed impact: higher caps increase CPU on long identifiers. + "chargramMaxTokenLength": 48, + // Track postings per field (name, path, body, etc). + // Speed impact: slight overhead for richer scoring. + "fielded": true + }, + // When to scan imports ("pre" or "post" indexing). + // Speed impact: small; "post" avoids extra upfront work. + "importScan": "post", + // Enable AST dataflow analysis. + // Speed impact: moderate CPU cost on large codebases. + "astDataflow": true, + // Enable control-flow analysis. + // Speed impact: moderate CPU cost on large codebases. + "controlFlow": true, + // Enable risk analysis rules. + // Speed impact: moderate CPU cost; can be heavy on huge repos. + "riskAnalysis": true, + // Enable cross-file risk correlation. + // Speed impact: heavy extra work on large repos. + "riskAnalysisCrossFile": true, + // Enable type inference. + // Speed impact: moderate to heavy CPU cost. + "typeInference": true, + // Enable cross-file type inference. + // Speed impact: heavy extra work on large repos. + "typeInferenceCrossFile": true, + // Collect git blame/churn metadata per file. + // Speed impact: heavy IO/CPU; can dominate indexing time. + "gitBlame": false, + // Run linting pass for diagnostics. + // Speed impact: extra CPU per file. + "lint": false, + // Compute complexity metrics. 
+ // Speed impact: extra CPU per file. + "complexity": true, + // Python AST parsing options. + // Speed impact: small to moderate CPU on Python files. + "pythonAst": { + // Enable Python AST parsing. + // Speed impact: small to moderate on Python-heavy repos. + "enabled": true + }, + // Tree-sitter parsing options. + // Speed impact: moderate CPU, improved chunking accuracy. + "treeSitter": { + // Enable tree-sitter parsing. + // Speed impact: moderate CPU on supported languages. + "enabled": true } }, - "models": { - "id": "Xenova/all-MiniLM-L12-v2", - "dir": "" - }, - "tooling": { - "autoInstallOnDetect": false, - "installScope": "cache", - "allowGlobalFallback": true, - "dir": "" + // Runtime process limits for the indexer. + // Speed impact: higher heap reduces GC stalls on big repos. + "runtime": { + // Max Node heap size in MB for the indexer process. + // Speed impact: too low slows indexing; higher reduces GC overhead. + "maxOldSpaceMb": 98048 } } diff --git a/.rgignore b/.rgignore new file mode 100644 index 000000000..430ec74d9 --- /dev/null +++ b/.rgignore @@ -0,0 +1 @@ +benchmarks/repos/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..b97b954f9 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,16 @@ +# Changelog + +All notable changes to PairOfCleats are documented in this file. + +## Unreleased +### Breaking +- None. + +### Added +- None. + +### Fixed +- None. + +## v0.2.0 - 2026-01-11 +- Initial internal release. diff --git a/COMPLETED_PHASES.md b/COMPLETED_PHASES.md new file mode 100644 index 000000000..2452f7915 --- /dev/null +++ b/COMPLETED_PHASES.md @@ -0,0 +1,685 @@ +# Completed Phases + +Phases 1-4 were completed during the initial Sublime Text plugin and map rollout. Phases 11-12 and 14-15 were completed as the cache/perf and optional-deps groundwork. + +## Phase 1 — Sublime Text 3 Plugin Foundation (Parity + Plumbing) + +### 1.1 Plugin repo structure + packaging + +* [x] Create `sublime/PairOfCleats/` package skeleton: + + * [x] `PairOfCleats.py` (entrypoint) + * [x] `commands/` (command modules) + * [x] `lib/` (helpers: config, subprocess, parsing, caching) + * [x] `messages/` (install/upgrade notes) + * [x] `Default.sublime-commands` + * [x] `Main.sublime-menu` (optional) + * [x] `Default.sublime-keymap` (optional) +* [x] Add `README.md` for ST3 plugin installation + prerequisites +* [x] Add “Package Control” compatibility notes (no external deps beyond Node runtime + repo binaries) + +### 1.2 Node/CLI discovery + execution contract + +* [x] Implement robust “pairofcleats binary discovery”: + + * [x] Prefer project-local `node_modules/.bin/pairofcleats` when available + * [x] Fallback to global `pairofcleats` on PATH + * [x] Allow explicit override in ST settings: `pairofcleats_path` +* [x] Implement repo-root detection: + + * [x] Prefer `.pairofcleats.json` location + * [x] Fallback to `.git` root + * [x] Fallback to folder of active file +* [x] Implement subprocess wrapper: + + * [x] Streams output to Sublime panel + * [x] Captures JSON payloads when `--json` is used + * [x] Supports cancellation (best-effort) + * [x] Adds stable environment injection (cache root, embeddings mode, etc.) 
+ +### 1.3 Settings + per-project overrides + +* [x] Add `PairOfCleats.sublime-settings` defaults: + + * [x] `pairofcleats_path`, `node_path` + * [x] `index_mode_default` (code/prose/both) + * [x] `search_backend_default` (memory/sqlite-fts/etc) + * [x] `open_results_in` (quick_panel / new_tab / output_panel) +* [x] Support `.sublime-project` settings overrides +* [x] Validate config and surface actionable error messages + +### 1.4 Smoke tests (plugin-side) + +* [x] Add Python unit tests that: + + * [x] Import plugin modules without Sublime runtime (mock `sublime`, `sublime_plugin`) + * [x] Validate binary discovery behavior + * [x] Validate repo-root resolution on fixtures + * [x] Validate settings overlay precedence + +--- + + +## Phase 2 — Sublime Search UX (Queries, Results, Navigation) + +### 2.1 Search command(s) + +* [x] `PairOfCleats: Search` command: + + * [x] Prompt input panel for query + * [x] Optional toggles: code/prose/both, backend, limit + * [x] Execute `pairofcleats search ... --json` +* [x] `PairOfCleats: Search Selection` command: + + * [x] Uses selected text as query +* [x] `PairOfCleats: Search Symbol Under Cursor` command + +### 2.2 Results presentation + +* [x] Quick panel results: + + * [x] Show `file:line-range`, symbol name, snippet/headline, score + * [x] Preserve stable ordering for repeatability +* [x] On selection: + + * [x] Open file at best-effort location (line/column) + * [x] Highlight match range (if available) +* [x] Add optional “results buffer” view (for large result sets) + +### 2.3 Quality-of-life UX + +* [x] Query history (per project) +* [x] “Repeat last search” command +* [x] “Explain search” (if supported by CLI flags / internal explain output) + +### 2.4 Tests + +* [x] Add Node-level “search contract” tests: + + * [x] Ensure `--json` output parseability and required fields +* [x] Add plugin tests: + + * [x] Search command dispatches correct subprocess args + * [x] Results parsing tolerates partial/missing optional fields + +--- + + +## Phase 3 — Index Lifecycle in Sublime (Build/Watch/Validate + Status) + +### 3.1 Build index commands + +* [x] `PairOfCleats: Index Build (Code)` +* [x] `PairOfCleats: Index Build (Prose)` +* [x] `PairOfCleats: Index Build (All)` +* [x] Stream progress to an output panel +* [x] Persist “last index time” + “last index mode” in project cache + +### 3.2 Watch mode integration + +* [x] `PairOfCleats: Index Watch Start` +* [x] `PairOfCleats: Index Watch Stop` +* [x] Prevent duplicate watchers per window/project +* [x] Robust shutdown on Sublime exit / project close + +### 3.3 Validate + repair affordances + +* [x] `PairOfCleats: Index Validate` +* [x] Surface actionable failures (missing artifacts, invalid JSON, stale manifests) +* [x] Provide “Open index directory” convenience command + +### 3.4 Tests + +* [x] Node tests for index build/validate on fixtures +* [x] Plugin tests for lifecycle commands and watcher gating + +--- + + +## Phase 4 — Codebase Semantic Map (Imports/Exports/Calls/Dataflow/Control Flow → Visual Map) + +### What this phase delivers + +A **real codebase map** that uses existing and enriched semantic metadata to generate a **diagram-ready model** and one or more **rendered artifacts**. 
It must explicitly incorporate and visualize: + +* **Imports / Exports / ImportLinks** +* **Calls / CallLinks / CallSummaries** +* **Usages / UsageLinks** +* **Signature / Modifiers / Params / Returns** +* **Reads / Writes / Mutates / Aliases** +* **Control flow** (branches, loops, throws, awaits, yields, returns) +* **AST-derived semantics** (using what the indexer already extracts) + +#### Visual grammar (required characteristics) + +* **File = outer shape** + + * Shape varies by file type/category (source/test/config/doc/generated/etc.) +* **Functions/classes = content inside the file shape** + + * The “fill” of the file node is structurally subdivided to represent contained functions/classes +* **Function details = nested sub-shapes inside function area** + + * Small badges/segments represent modifiers/returns/dataflow/control-flow +* **Multiple line styles = multiple edge semantics** + + * Imports (file→file), control flow calls (fn→fn), usage deps (fn→fn), dataflow (arg/return/state) + +--- + +### 4.1 Inventory + normalize available semantics from existing artifacts + +Leverage what is already produced today, and formalize how it’s consumed: + +* [x] **Inputs** (expected present after `index build`): + + * [x] `file_relations.json` (imports, exports, usages, importLinks, functionMeta/classMeta) + * [x] `repo_map.json` (chunk-level symbol map, exported flag, signatures) + * [x] `chunk_meta.json` (docmeta/metaV2: signature/modifiers/returns/controlFlow/dataflow + relations) + * [x] `graph_relations.json` (importGraph/callGraph/usageGraph) +* [x] Define “canonical IDs” used across the map: + + * [x] `fileId = <relPath>` + * [x] `symbolId = <relPath>::<symbolName>` (already used in relation graphs) + * [x] Stable IDs for anonymous/lambda cases (fallback: chunkId when name is `(anonymous)`) + +--- + +### 4.2 Define a versioned “Map Model” schema (diagram-ready) + +This is the core contract the plugin will consume. + +* [x] Create `docs/map-schema.json` (or similar) with: + + * [x] `version` + * [x] `generatedAt` + * [x] `root` (repo root logical id) + * [x] `legend`: + + * [x] `nodeTypes` (file/function/class/symbol) + * [x] `fileShapes` mapping (category → shape) + * [x] `functionBadges` mapping (modifier/returns/dataflow/control-flow → badge glyph) + * [x] `edgeTypes` mapping (imports/calls/usages/dataflow/aliases/mutations) + * [x] `edgeStyles` mapping (solid/dashed/dotted/double, arrowheads, labels) + * [x] `nodes`: + + * [x] file nodes with nested “members” (functions/classes) + * [x] function nodes with structured “semantic facets” + * [x] `edges` (typed, labeled, optionally “port-addressable”) +* [x] Schema must support **hierarchical nesting**: + + * [x] File node has `members[]` with per-member ports + * [x] Member nodes (functions) include `signature`, `modifiers`, `returns`, `controlFlow`, `dataflow` +* [x] Determinism requirements: + + * [x] Stable ordering (sort keys/ids) + * [x] Explicit timestamp field allowed, but everything else must be deterministic + +--- + +### 4.3 Build the semantic “map extractor” (core engine tool) + +Implement a Node tool that reads index artifacts and produces the map model.
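For orientation before the checklist below, a minimal sketch of the extractor’s overall shape — the artifact filenames are the 4.1 inputs, while `buildMapModel`, `loadArtifact`, and everything inside them are hypothetical names, not the shipped implementation:

```js
// Sketch only: load the 4.1 artifacts and merge them into a diagram-ready model.
import fs from 'node:fs/promises';
import path from 'node:path';

async function loadArtifact(indexDir, name) {
  return JSON.parse(await fs.readFile(path.join(indexDir, name), 'utf8'));
}

export async function buildMapModel(indexDir) {
  const [fileRelations, repoMap, graphRelations] = await Promise.all([
    loadArtifact(indexDir, 'file_relations.json'),
    loadArtifact(indexDir, 'repo_map.json'),
    loadArtifact(indexDir, 'graph_relations.json'),
  ]);
  const nodes = []; // file nodes, each carrying nested members[]
  const edges = []; // typed edges: imports/calls/usages/dataflow/aliases
  // ...derive file/member nodes from repoMap + fileRelations,
  //    and typed edges from graphRelations...
  // Determinism: sort by stable ids so repeated runs emit identical output.
  nodes.sort((a, b) => a.id.localeCompare(b.id));
  edges.sort((a, b) => `${a.from}>${a.to}`.localeCompare(`${b.from}>${b.to}`));
  return { version: 1, generatedAt: new Date().toISOString(), nodes, edges };
}
```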
+ +* [x] Add `tools/code-map.js` (or `tools/report-code-map.js`) that: + + * [x] Locates repo + index dirs using existing `tools/dict-utils.js` + * [x] Loads: + + * [x] `file_relations.json` + * [x] `repo_map.json` + * [x] `chunk_meta.json` (or minimal subset) + * [x] `graph_relations.json` + * [x] Merges into a single “map model”: + + * [x] **Files** classified into categories (drives file shape) + * [x] **Members** extracted per file: + + * [x] functions/methods/classes (from `repo_map` and/or chunk meta) + * [x] include line ranges + * [x] include `signature`, `modifiers`, `params`, `returns` + * [x] **Function semantics**: + + * [x] `dataflow.reads`, `dataflow.writes`, `dataflow.mutations`, `dataflow.aliases` + * [x] `controlFlow.branches/loops/returns/throws/awaits/yields/breaks/continues` + * [x] `throws`, `awaits`, `yields`, `returnsValue` facets surfaced explicitly + * [x] **Edges**: + + * [x] Import edges (file→file) from `importLinks` + raw `imports` + * [x] Export edges (file→symbol) from `exports` + repo_map `exported` + * [x] Call edges (fn→fn) from `callLinks` or `graph_relations.callGraph` + * [x] Usage edges (fn→fn) from `usageLinks` or `graph_relations.usageGraph` + * [x] Dataflow edges: + + * [x] Argument flow edges from `callSummaries.argMap` (caller→callee param ports) + * [x] Return flow edges using inferred return metadata where available + * [x] Optional: “state flow” edges when reads/writes/mutations overlap (guardrailed; see 4.6) + * [x] Alias edges: + + * [x] derived from `dataflow.aliases` (function-local or cross-function via calls when resolvable) +* [x] Add CLI entrypoint: + + * [x] `pairofcleats report map` (preferred, consistent with existing `report` group), or + * [x] `pairofcleats map` (top-level) +* [x] Support scope + size controls: + + * [x] `--scope repo|dir|file|symbol` + * [x] `--focus <id>` + * [x] `--include imports,calls,usages,dataflow,exports` + * [x] `--only-exported` + * [x] `--max-files N`, `--max-members-per-file N`, `--max-edges N` + * [x] `--collapse file|dir` (aggregate mode) + * [x] `--format json|dot|svg|html` (see 4.4) + +--- + +### 4.4 Generate “shape-based” diagrams (DOT-first, with nested function fills) + +To match your “shape with fill containing functions” requirement cleanly, DOT/Graphviz is the most direct representation.
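To make the encoding concrete, a minimal sketch of how one file node could be written — `fileNodeToDot`, the `file`/`member` fields, and the port names are hypothetical, and HTML escaping is omitted for brevity:

```js
// Sketch: a file rendered as a Graphviz HTML-like label, one row (with a PORT)
// per member so edges can target individual functions rather than whole files.
function fileNodeToDot(file) {
  const rows = file.members
    .map((m) => `<TR><TD PORT="${m.port}">${m.name}</TD></TR>`)
    .join('');
  return `"${file.id}" [shape=plaintext, label=<
    <TABLE BORDER="1" CELLBORDER="0" CELLSPACING="2">
      <TR><TD><B>${file.relPath}</B></TD></TR>${rows}
    </TABLE>>];`;
}

// Edges then land on member ports, with line style carrying the semantics:
//   "a.js" -> "b.js"         [style=dashed]; // import (file -> file)
//   "a.js":f1 -> "b.js":f2   [style=solid];  // call (fn -> fn)
//   "a.js":f1 -> "b.js":f2   [style=dotted]; // argument dataflow
```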
+ +* [x] Implement a DOT generator `src/map/dot-writer.js`: + + * [x] **File nodes as outer shapes** with file-type-dependent shapes: + + * [x] Source code: `box` or `component` + * [x] Tests: `box` with distinct border style + * [x] Config/data: `cylinder` or `hexagon` + * [x] Docs/prose: `note` + * [x] Generated/build artifacts: `folder` or `box3d` + * [x] **Fill represents members** using HTML-like labels: + + * [x] Outer `<table>` represents the file “container” + * [x] Each function/class is a row with a `PORT` so edges can land on that member specifically + * [x] **Nested shapes inside the function row** (HTML sub-tables/cells) to represent: + + * [x] modifiers: async/static/generator/visibility + * [x] signature/params summary + * [x] returns/returnType/returnsValue indicator + * [x] dataflow mini-badges: reads/writes/mutates/aliases counts (and/or top N symbols) + * [x] controlFlow mini-badges: branches/loops/throws/awaits/yields +* [x] **Edge encoding** (multiple edge “line types”): + + * [x] Import edges: dashed file→file + * [x] Call edges: solid function→function (primary control flow) + * [x] Usage edges: thin/secondary style function→function + * [x] Dataflow edges: + + * [x] dotted caller→callee(param) edges (argument flow) + * [x] dotted callee→caller edges for return flow (if inferred) + * [x] Mutation/state edges (optional, guardrailed): double-line or distinct style + * [x] Alias edges: dashed-dotted, labeled `alias: a=b` +* [x] Output modes: + + * [x] `--format dot` always available + * [x] `--format svg` if Graphviz present (shell out to `dot -Tsvg`) + * [x] `--format html` wraps SVG + legend into a standalone HTML viewer +* [x] Implement legend rendering: + + * [x] Either embed as a DOT subgraph or in HTML wrapper + * [x] Must document shape/edge meaning for users + +--- + +### 4.5 Sublime Text 3 plugin commands for map generation + viewing + +Provide first-class UX inside Sublime, even if rendering happens externally. + +* [x] Add commands: + + * [x] `PairOfCleats: Map (Repo)` + * [x] `PairOfCleats: Map (Current Folder)` + * [x] `PairOfCleats: Map (Current File)` + * [x] `PairOfCleats: Map (Symbol Under Cursor)` + * [x] `PairOfCleats: Map (Selection)` +* [x] Add a “Map Type” chooser: + + * [x] Import Map + * [x] Call Map + * [x] Usage/Dependency Map + * [x] Dataflow Map (args/returns/state) + * [x] Combined Map (guardrailed by size limits) +* [x] Implement output handling: + + * [x] Write outputs to `.pairofcleats/maps/` (repo-local) or cache dir + * [x] Open `.dot` in Sublime for inspection + * [x] If `.svg`/`.html` produced: + + * [x] Provide “Open in Browser” command (best-effort) +* [x] Navigation affordances: + + * [x] When a map is generated, also produce an indexable “node list” JSON: + + * [x] allows Sublime quick panel “Jump to node” (file/function) + * [x] opens file at recorded `startLine` +* [x] Graceful degradation: + + * [x] If `astDataflow` / `controlFlow` metadata is unavailable in the index: + + * [x] show “limited map” warning + * [x] offer action: “Rebuild index with dataflow/control-flow enabled” (invokes `index build` with the project’s config expectations) + +--- + +### 4.6 Performance guardrails + scaling strategy (mandatory for real repos) + +This phase will generate *very large graphs* unless explicitly constrained.
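The determinism requirement below is the important part: if truncation ranks by degree and breaks ties on stable ids, the same inputs always keep the same nodes. A minimal sketch (`degree`, `maxFiles`, and the shape of `files` are hypothetical):

```js
// Sketch: deterministic "top-K by degree" truncation with a drop summary.
function selectTopFiles(files, maxFiles) {
  const ranked = [...files].sort(
    (a, b) => (b.degree - a.degree) || a.id.localeCompare(b.id) // stable tie-break
  );
  const kept = ranked.slice(0, maxFiles);
  const dropped = ranked.length - kept.length;
  return {
    kept,
    summary: dropped > 0 ? `truncated map: ${dropped} files dropped by degree rank` : null,
  };
}
```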
+ +* [x] Hard limits with user-overrides: + + * [x] `maxFiles`, `maxMembersPerFile`, `maxEdges` + * [x] edge sampling policies per edge type +* [x] Aggregation modes: + + * [x] Directory-level aggregation (folder nodes contain files) + * [x] File-only map (no nested functions) + * [x] Export-only functions view + * [x] “Top-K by degree” (highest call/import fan-in/out) +* [x] Deterministic sampling: + + * [x] same inputs → same output (stable selection) +* [x] Cache map builds keyed by: + + * [x] index signature + generator options +* [x] Failure mode policy: + + * [x] If size exceeds limits, output a “truncated map” plus a summary explaining what was dropped + +--- + +### 4.7 Tests (core + integration + determinism) + +Add explicit automated coverage for the map feature. + +#### Node tool tests (authoritative) + +* [x] `tests/code-map-basic.js` + + * [x] Build a tiny fixture repo with: + + * [x] imports/exports + * [x] functions calling other functions + * [x] a function with reads/writes/mutations/aliases + * [x] a function with branches/loops/throws/awaits + * [x] Run `build_index.js --stub-embeddings` + * [x] Run `pairofcleats report map --format json` + * [x] Assert: + + * [x] file nodes exist + * [x] member nodes include `signature/modifiers/returns/dataflow/controlFlow` + * [x] edge sets include imports + calls +* [x] `tests/code-map-dot.js` + + * [x] Generate DOT output + * [x] Assert: + + * [x] file “container” nodes exist + * [x] function rows/ports exist + * [x] edges connect to ports (caller fn → callee fn) + * [x] distinct edge styles appear for import vs call vs dataflow +* [x] `tests/code-map-determinism.js` + + * [x] Run map generation twice and compare outputs (ignore `generatedAt`) +* [x] `tests/code-map-guardrails.js` + + * [x] Generate a repo with many dummy functions + * [x] Ensure truncation behavior is correct and stable + +#### Plugin-side tests + +* [x] Python unit tests: + + * [x] command registration exists + * [x] subprocess args are correct for each map command + * [x] output paths computed correctly + * [x] “Graphviz missing” fallback behavior (DOT-only) works + + + +### 4.8 Isometric map viewer (three.js) + +* [x] Generate an isometric HTML viewer from the map model (three.js module import) +* [x] Support zoom with configurable sensitivity +* [x] Support WASD movement with configurable sensitivity/acceleration/drag +* [x] Highlight selections and show file/line metadata +* [x] Double-click opens the selected file/line via a URI template +* [x] Add layout styles (clustered/radial/flat) with adjustable spacing +* [x] Add flow-connected highlighting (edges + related nodes) and hover highlights from the selection panel +* [x] Add grid line rendering + glow, fog, and wireframe tuning (panel configurable) +* [x] Modularize the isometric viewer client into <500-line modules +--- + +## Phase 11 — Resource Lifecycle Management (Caches, Long-Lived Servers, Builds) + +**Objective:** Prevent memory and resource leaks in long-running processes (API server, service workers), especially across repeated builds and multi-repo usage. + +1. **Add eviction/TTL for API router repo-level caches** + + * [x] **Implement eviction for `repoCaches` map in `tools/api/router.js`.** + + * **Why:** `repoCaches` can grow unbounded if clients query multiple repos or if repo roots vary. Each entry can hold heavy caches (index cache + sqlite connections). 
+ * **Fix:** + + * Add: + + * `maxRepos` (e.g., 3–10) + * `repoTtlMs` (e.g., 10–30 minutes) + * Track `lastUsed` and evict least-recently-used / expired. + * On eviction: close sqlite cache handles (`sqliteCache.close()`), clear index cache. + * [x] Add metrics for cache size and evictions. + + * **Where:** `tools/api/router.js` and metrics registry. + +2. **Add eviction for per-repo index cache and sqlite DB cache** + + * [x] **Index cache eviction** + + * **Why:** `src/retrieval/index-cache.js` caches by `dir` (which can change per build). On repeated re-indexing, old build directories can accumulate. + * **Fix:** Convert to LRU with max entries, or TTL purge on access. + * [x] **SQLite DB cache eviction** + + * **Where:** `src/retrieval/sqlite-cache.js` + * **Why:** Same “dir-per-build” key pattern; can leak connections/handles. + * **Fix:** LRU/TTL + ensure `close()` called on eviction. + +3. **Add explicit cache invalidation when “current build” pointer changes** + + * [x] Detect when the effective index directory changes (new build) and prune caches for previous builds. + + * **Why:** Keeps hot caches relevant and bounds memory footprint. + +**Exit criteria** + +* [x] API server memory does not grow unbounded when indexing/searching multiple repos/builds. +* [x] Old build caches are evicted/pruned automatically. +* [x] SQLite handles are closed on eviction (verified via tests or instrumentation). + +--- + +## Phase 12 — Performance and Operational Hardening + +**Objective:** Improve throughput and robustness under load without changing core behavior. + +1. **Reduce event-loop blocking sync filesystem calls on API request paths** + + * [x] Replace `fsSync.*` in API request hot paths with async equivalents where practical. + + * **Why:** Sync I/O can stall concurrent requests in the API server process. + * **Where (examples):** + + * `tools/api/router.js` `resolveRepo()` uses `existsSync/statSync`. + * **Fix:** Use `fs.promises.stat` with try/catch; cache results briefly if needed. + +2. **Prevent decompression “zip bomb” style memory spikes in artifact reading** + + * [x] Add output size limiting to gzip decompression. + + * **Why:** `src/shared/artifact-io.js` uses `gunzipSync(buffer)` and only checks decompressed size *after* decompression. A small compressed file could expand massively and spike memory. + * **Fix:** + + * Use `zlib.gunzipSync(buffer, { maxOutputLength: maxBytes + slack })` (if supported in your Node target), or switch to streaming gunzip with explicit byte limits. + * **Where:** `src/shared/artifact-io.js` `parseBuffer` / gzip handling. + +3. **Add download size limits for tools that fetch large remote assets** + + * [x] Enforce maximum download size (or require hash) for dictionary downloads. + + * **Why:** `tools/download-dicts.js` buffers the entire response in memory (`Buffer.concat`) without a hard cap. + * **Fix:** Stream to disk with a cap; abort if exceeded; strongly prefer requiring hashes for non-default URLs. + +**Exit criteria** + +* [x] API request path avoids avoidable sync I/O. +* [x] Artifact gzip parsing cannot explode memory beyond configured limits. +* [x] Large downloads are bounded and/or verified. 
+ +--- + +## Phase 14 — Optional-dependency framework + capability registry (foundation for all phases) + +### 14.1 Introduce a consistent “optional dependency” loader + +* [x] Add `src/shared/optional-deps.js` with a single, opinionated API: + + * [x] `tryRequire(name)` / `tryImport(name)` helpers (use `createRequire(import.meta.url)` where needed) + * [x] Standardized return shape: `{ ok: true, mod } | { ok: false, error, reason }` + * [x] Standardized logging hook (only when `PAIROFCLEATS_VERBOSE` or a dedicated flag is enabled) +* [x] Add `src/shared/capabilities.js` that reports runtime availability: + + * [x] `watcher: { chokidar: true, parcel: boolean }` + * [x] `regex: { re2: boolean, re2js: true }` + * [x] `hash: { nodeRsXxhash: boolean, wasmXxhash: true }` + * [x] `compression: { gzip: true, zstd: boolean }` + * [x] `extractors: { pdf: boolean, docx: boolean }` + * [x] `mcp: { sdk: boolean, legacy: true }` + * [x] `externalBackends: { tantivy: boolean, lancedb: boolean }` (even if “boolean” means “reachable” rather than “installed”) +* [x] Wire capabilities into existing “status” surfaces: + + * [x] Extend `tools/mcp/repo.js` → `configStatus()` to include capability info and warnings for requested-but-unavailable features + * [x] Extend `tools/config-dump.js` (or equivalent) to print capabilities in JSON output mode + +### 14.2 Add config + env “backend selectors” (uniform UX) + +* [x] Extend `src/shared/env.js` to parse new selectors (string + allowlist): + + * [x] `PAIROFCLEATS_WATCHER_BACKEND` = `auto|chokidar|parcel` + * [x] `PAIROFCLEATS_REGEX_ENGINE` = `auto|re2|re2js` + * [x] `PAIROFCLEATS_XXHASH_BACKEND` = `auto|native|wasm` + * [x] `PAIROFCLEATS_COMPRESSION` = `auto|gzip|zstd|none` + * [x] `PAIROFCLEATS_DOC_EXTRACT` = `auto|on|off` + * [x] `PAIROFCLEATS_MCP_TRANSPORT` = `auto|sdk|legacy` +* [x] Add parallel config keys in `.pairofcleats.json` (keep them near existing related config blocks): + + * [x] `indexing.watch.backend` + * [x] `search.regex.engine` + * [x] `indexing.hash.backend` + * [x] `indexing.artifactCompression.mode` enum expansion + `auto` + * [x] `indexing.documentExtraction.enabled` + * [x] `mcp.transport` +* [x] Update `docs/config-schema.json`: + + * [x] Add/expand enums (avoid “free string” for anything that’s meant to be policy-controlled) + * [x] Add descriptions that clarify fallback rules (`auto` behavior) +* [x] Update any config validation code paths if they enforce known keys (`src/config/validate.js` is schema-driven; keep schema authoritative) + +### 14.3 Add dependency-bundle reference stubs (keeps repo documentation consistent) + +For each new dependency introduced in later phases, add a minimal doc file under: +`docs/references/dependency-bundle/deps/<dep-name>.md` + +* [x] `parcel-watcher.md` +* [x] `re2.md` +* [x] `node-rs-xxhash.md` +* [x] `mongodb-js-zstd.md` +* [x] `pdfjs-dist.md` +* [x] `mammoth.md` +* [x] `modelcontextprotocol-sdk.md` +* [x] `lancedb.md` (if used) +* [x] `tantivy.md` (if used) +* [x] Update `docs/references/dependency-bundle/README.md` if it has an index + +### 14.4 Tests (framework-level) + +* [x] Add `tests/capabilities-report.js`: + + * [x] Asserts `capabilities` object shape is stable + * [x] Asserts `auto` selectors never throw when optional deps are missing +* [x] Add a script-coverage action to run it: + + * [x] `tests/script-coverage/actions.js`: add action entry that calls `runNode(...)` + * [x] (Optional) Add an npm script alias if you want parity with the rest of the repo scripts + +**Exit criteria** + +* [x] All
“capability” calls are side-effect-free and safe when optional deps are absent +* [x] `config_status` (MCP) can surface “you requested X but it’s not available” warnings without crashing +* [x] CI passes on Node 18 (Ubuntu + Windows lanes) + +--- + +## Phase 15 — File watching performance: add `@parcel/watcher` backend (keep chokidar fallback) + +### 15.1 Add the dependency (prefer optional unless you want it guaranteed everywhere) + +* [x] Add `@parcel/watcher` to `package.json` + + * [x] Prefer `optionalDependencies` if you want installs to succeed even when native builds fail + * [x] If you add it as a hard dependency, ensure Windows CI remains green + +### 15.2 Create a watcher-backend abstraction + +* [x] Create `src/index/build/watch/backends/types.js` (or inline JSDoc contract) describing: + + * [x] `start({ root, ignored, onEvent, onError, pollMs? }) -> { close(): Promise }` + * [x] Normalized event shape: `{ type: 'add'|'change'|'unlink', absPath }` +* [x] Extract chokidar wiring out of `src/index/build/watch.js`: + + * [x] Move into `src/index/build/watch/backends/chokidar.js` + * [x] Preserve existing semantics (`awaitWriteFinish`, ignored matcher, poll support) +* [x] Implement parcel watcher backend: + + * [x] New file: `src/index/build/watch/backends/parcel.js` + * [x] Map parcel events to the normalized `{type, absPath}` model + * [x] Decide how to handle rename/move (often appears as unlink+add): + + * [x] If parcel reports rename, still emit unlink+add for compatibility with current scheduling + * [x] Implement “poll” behavior: + + * [x] If poll mode is requested, either: + + * [x] force chokidar with polling, **or** + * [x] implement a cheap stat-based poller wrapper (only if needed) + * [x] Implement “write stability” guard: + + * [x] Chokidar has `awaitWriteFinish`; parcel does not in the same way + * [x] Add a “stabilize file” check in the pipeline: before processing a file, optionally confirm `mtime/size` stable across N ms + * [x] Place this in `createDebouncedScheduler()` or immediately before `enqueueOrUpdate()` in `file-processor.js` (prefer a single shared guard) + +### 15.3 Wire selection into `watchIndex()` + +* [x] Update `src/index/build/watch.js`: + + * [x] Choose backend via (in order): CLI/config → env → `auto` capability + * [x] Log selected backend once at startup (only if verbose or `--watch`) + * [x] Ensure `pollMs` is still honored (either by backend or by selection logic) + +### 15.4 Tests + +* [x] Add `tests/watch-backend-selection.js`: + + * [x] Forces `PAIROFCLEATS_WATCHER_BACKEND=chokidar` and asserts no parcel import occurs + * [x] Forces `...=parcel` and asserts fallback behavior if module unavailable (no crash, warning path) +* [x] Add `tests/watch-stability-guard.js`: + + * [x] Simulate “partial write” (write file in two chunks with delay) and assert processor waits/defers correctly + * [x] Keep the test deterministic: use explicit timeouts and a temp directory under `tests/.cache` +* [x] Add corresponding script-coverage actions in `tests/script-coverage/actions.js` + +**Exit criteria** + +* [x] `pairofcleats index watch` remains correct on Windows and Linux +* [x] No regressions in ignore behavior (still uses `buildIgnoredMatcher`) +* [x] Event storms do not cause repeated redundant rebuilds (existing debounce logic preserved) + +--- diff --git a/COMPLETE_PLAN.md b/COMPLETE_PLAN.md deleted file mode 100644 index c08839a4b..000000000 --- a/COMPLETE_PLAN.md +++ /dev/null @@ -1,466 +0,0 @@ -# Complete Plan - -This document consolidates all 
phase docs and tracks implementation status. Phase markdown files are removed after merge; this is the single source of truth. - -## Status key -- done: implemented and validated -- partial: implemented with known gaps or follow-ups -- todo: not implemented -- in-progress: actively being implemented - -## Baseline goals (status: done) -- [x] Per-repo indexing with a central cache outside the repo. -- [x] On-demand indexing with incremental caching and optional CI artifacts. -- [x] MCP server interface for status/build/search/model download. -- [x] Non-git repos supported with a strong recommendation to use git. - -## Cache layout (status: done) -- /repos//index-code/ -- /repos//index-prose/ -- /repos//incremental/ -- /repos//repometrics/ -- /repos//index-sqlite/index-code.db -- /repos//index-sqlite/index-prose.db -- /models/ -- /extensions/ - -Repo identity: -- Hash the absolute repo path (run from repo root for stable IDs). -- Git metadata is captured separately for status/reporting. - -SQLite location: -- Override with `sqlite.dbDir` or `codeDbPath`/`proseDbPath`. -- Point `sqlite.dbDir` at `index-sqlite` to keep DBs in the repo. - -## Model download and bootstrap (status: done) -- [x] Detect model availability in MCP status and provide a download_models hint. -- [x] Provide download helper (node) and bootstrap path. - -## Git handling (status: done) -- [x] Warn when git is missing and continue without git metadata. -- [x] Store commit hash and dirty flag when git is present. - -## MCP surface (status: done) -- [x] index_status(repoPath) -- [x] build_index(repoPath, mode=all, incremental=true) -- [x] search(repoPath, query, filters...) -- [x] download_models() -- [x] report_artifacts() - -## Phase 2: SQLite Candidate Generation (status: done) -Goal: Use SQLite to generate candidate sets while keeping scoring/rendering in JS. -Work items: -- [x] Candidate set creation via token, phrase, and chargram tables. -- [x] BM25 stats sourced from SQLite (doc_lengths + token_stats). -- [x] Fallback to file-backed artifacts when SQLite is missing or incomplete. -- [x] Docs updated to describe SQLite candidate generation. -Notes: -- Query tokenization remains in search.js; SQLite provides candidates only. -- Dense vectors and minhash are still JS-side. - -## Phase 3: Parity + Performance Validation (status: done) -Goal: Validate SQLite vs file-backed parity and capture baseline metrics. -Work items: -- [x] Parity harness (tests/parity.js) with overlap and score deltas. -- [x] Query set in tests/parity-queries.txt. -- [x] Report output (docs/phase3-parity-report.json). -- [x] Benchmark harness (tests/bench.js) for latency and artifact sizes. - -## Phase 4: Incremental Indexing (status: done) -Goal: Reuse per-file bundles to avoid re-embedding unchanged files. -Work items: -- [x] Per-file cache manifest and bundles outside the repo. -- [x] Incremental build path in build_index.js. -- [x] SQLite incremental updates in tools/build-sqlite-index.js. -- [x] Incremental tests (tests/sqlite-incremental.js). -Notes: -- Global postings are rebuilt from cached bundles (not in-place deltas for file-backed JSON). - -## Phase 5: CI Artifact Generation + Detection (status: done) -Goal: Build and restore index artifacts in CI. -Work items: -- [x] Build script (tools/ci-build-artifacts.js) with manifest output. -- [x] Restore script (tools/ci-restore-artifacts.js) with commit checks. -- [x] Bootstrap restore when ci-artifacts/manifest.json exists. -- [x] Docs for GitHub and GitLab usage. 
- -## Phase 6: Tests + Benchmarks (status: done) -Goal: Expand deterministic tests and perf harnesses. -Work items: -- [x] Fixture repos under tests/fixtures (sample, mixed). -- [x] Fixture smoke, parity, eval harnesses. -- [x] Bench harness (tests/bench.js) + bench-ann script. -- [x] Query cache, cleanup, uninstall, sqlite incremental/compact, mcp server tests. -- [x] Add CI workflow to run smoke + parity in GitHub Actions. - -## Phase 7: Language Expansion (status: done) -Goal: Provide stable chunking + metadata for prioritized languages. - -Python (status: done) -- [x] Python AST enrichment when python is available; heuristic fallback. -- [x] Class/function/method chunking with docstrings and signatures. -- [x] Improve call graph accuracy for nested functions. -- [x] Add type-aware docs for dataclasses/attrs. - -Swift (status: done) -- [x] Brace-aware chunking for declarations. -- [x] Doc comment extraction and signature metadata. -- [x] Improve parsing of generics and extensions. - -ObjC/C/C++ (status: done) -- [x] Regex-driven chunking for C-family and ObjC blocks. -- [x] Selector extraction for ObjC methods. -- [x] Improve call graph and include resolution heuristics. - -Rust (status: done) -- [x] Heuristic chunking for structs/enums/traits/mods/impls/fns. -- [x] Basic metadata extraction and imports/exports. -- [x] Improve macro-heavy parsing and impl block method grouping. - -## Phase 7b: AST Completion Passes (status: done) -Goal: Extend AST-backed languages to a "complete" metadata and dataflow feature set. -Work items: -- [x] Define and document the AST feature list and per-language coverage. -- [x] JS AST: signatures/params/modifiers/inheritance + dataflow (reads/writes/mutations/throws/awaits/yields). -- [x] Python AST: signatures/params/types/bases/modifiers + dataflow (reads/writes/mutations/throws/awaits/yields/globals). -- [x] Configurable AST dataflow extraction (default on). -- [x] Add fixtures + language-fidelity assertions for AST metadata. - -## Phase 8: SQLite Scoring (FTS5) + ANN Extension (status: done) -Goal: Optional SQLite-only sparse ranking plus optional vector extension for ANN. -Work items: -- [x] FTS5 ranking path (sqlite-fts backend) with shared renderer. -- [x] Configurable FTS5 weighting and optional normalization. -- [x] ANN extension support (sqlite-vec) with loadable binary. -- [x] Archive download support for extension binaries (zip/tar/tgz). -- [x] ANN extension test harness (tests/sqlite-ann-extension.js). - -## Phase 9: Scoring Calibration (status: done) -Goal: Deterministic ranking and tunable BM25 parameters. -Work items: -- [x] Deterministic tie-breakers in ranking and merging. -- [x] Configurable BM25 parameters (search.bm25.k1/b). -- [x] Documentation for tuning and parity expectations. - -## Phase 10: SQLite Split (status: done) -Goal: Split code/prose DBs to reduce lock contention. -Work items: -- [x] index-code.db and index-prose.db layout. -- [x] Build/search use split DBs. -- [x] CI artifacts handle split DBs. -- [x] Legacy index.db cleanup. - -## Phase 11: Parallel Indexing (status: done) -Goal: Parallel file processing with deterministic ordering. -Work items: -- [x] File worker pool with deterministic output ordering. -- [x] Separate concurrency for import scanning. -- [x] Configurable concurrency via .pairofcleats.json and CLI. - -## Phase 12: MCP Server Packaging (status: done) -Goal: MCP stdio server for index lifecycle and search. -Work items: -- [x] JSON-RPC 2.0 server with content-length framing. 
-- [x] Tools: index_status/build_index/search/download_models/report_artifacts. -- [x] Git-optional behavior with warnings. - -## Phase 13: Language Fidelity Review + Enhancements (status: done) -Goal: Evaluate current fidelity of each supported language and enhance parsing. -Work items: -- [x] Build a per-language evaluation checklist (chunking, metadata, relations). -- [x] Expand fixtures per language and add targeted regression tests. -- [x] Implement improvements per language and update docs. - -## Phase 14: CI Coverage and Full Script Coverage (status: done) -Goal: Ensure every npm script is exercised and documented. -Work items: -- [x] Add CI workflow for smoke + parity + core harnesses. -- [x] Add a meta-test runner that exercises all scripts (with stub embeddings). -- [x] Record expected runtime and platform constraints. - -## Phase 15: New Languages and Features (status: done) -Goal: Add new languages and new indexing/search features after baseline completion. -Work items: --- [x] Add Go support (chunking + metadata + relations + fixtures + tests). --- [x] Add Java support (chunking + metadata + relations + fixtures + tests). --- [x] Add Perl (lite) support for comedy coverage (chunking + minimal metadata). --- [x] Add Shell (lite) support (chunking + minimal metadata + fixtures + tests). --- [x] Add AST-based dataflow metadata (reads/writes/mutations/throws/awaits/yields). --- [x] Add search filters for AST metadata (decorators/modifiers/returns/throws/reads/writes/mutations/extends/visibility). --- [x] Render AST metadata in human output. --- [x] Update docs and tests for each addition. - -## Phase 16: Unified Parsing + Tooling Bootstrap (status: done) -Goal: Centralize parsing where possible while keeping native parsers for stable languages, and add tooling detection/install support. -Work items: -- [x] Choose and document a unified parser backbone (tree-sitter) plus native parser mapping for JS/Python. -- [x] Add tooling detection + install scripts with cache-local default installs and optional normal installs. -- [x] Add config: tooling.autoInstallOnDetect, tooling.installScope, tooling.allowGlobalFallback. -- [x] Update bootstrap to detect languages and auto-install tooling when configured. -- [x] Add tests for tooling detection/install logic (stubbed where needed). - -## Phase 17: Format Coverage Expansion (status: done) -Goal: Add rich chunking/metadata for common config and docs formats. -Work items: -- [x] Add JSON/TOML/INI/XML parsers and chunking rules. -- [x] Add Dockerfile/Makefile parsing and chunking rules. -- [x] Add GitHub Actions YAML parsing (workflow/job/step chunks). -- [x] Add RST and AsciiDoc heading/section chunking. -- [x] Update fixtures, language-fidelity checklist, and docs for formats. - -## Phase 18: Language Expansion (status: done) -Goal: Add baseline parsing/chunking/relations for new languages with the unified backbone. -Work items: -- [x] TypeScript baseline heuristic chunking + metadata (native TS parser integration deferred). -- [x] C# baseline heuristic chunking + metadata (tree-sitter/LSP enrichment deferred). -- [x] Kotlin baseline heuristic chunking + metadata (tree-sitter/LSP enrichment deferred). -- [x] Ruby baseline heuristic chunking + metadata (tree-sitter/LSP enrichment deferred). -- [x] PHP baseline heuristic chunking + metadata (tree-sitter/LSP enrichment deferred). -- [x] Lua baseline heuristic chunking + metadata (tree-sitter/LSP enrichment deferred). 
-- [x] SQL baseline statement chunking + metadata (dialect parsing in Phase 19). -- [x] Add fixtures and language-fidelity assertions for each. -Notes: -- Tree-sitter/native parser enrichment remains planned alongside Phase 19-22 work. - -## Phase 19: SQL Dialect Parsing (status: done) -Goal: Provide dialect-aware SQL parsing and metadata. -Work items: -- [x] Add PostgreSQL/MySQL/SQLite dialect selection rules (extension + override). -- [x] Add per-dialect fixtures and tests. -- [x] Add config for sql.dialect and dialect-by-extension mapping. - -## Phase 20: CFG + Dataflow Everywhere (status: done) -Goal: Add control-flow graphs and dataflow metadata across supported languages. -Work items: -- [x] Define shared CFG/dataflow schema in docs/ast-feature-list.md. -- [x] Implement CFG/dataflow for C/C++/ObjC, Rust, Go, Java, Shell. -- [x] Reuse shared engine for JS/Python where applicable. -- [x] Add filters and output rendering for CFG/dataflow metadata. -- [x] Expand fixtures/tests to validate control-flow and dataflow fields. -- [x] Evaluate dynamic language handler imports (pros/cons, perf, DX). - -## Phase 21: Type Inference (Intra-file) (status: done) -Goal: Add local type inference for each supported language. -Work items: -- [x] Implement intra-file inference for literals, annotations, and symbol tables. -- [x] Merge inferred types into docmeta and render/filter paths. -- [x] Validate with fixtures and language-fidelity tests. - -## Phase 22: Type Inference (Cross-file) (status: done) -Goal: Resolve types across files after intra-file stability is confirmed. -Work items: -- [x] Add cross-file symbol resolution and import/usage linking. -- [x] Use detected tooling when present for richer type info. -- [x] Validate with tests; provide parity/perf summary after completion. -Notes: -- Cross-file inference is covered by `tests/type-inference-crossfile.js`; large-repo perf runs are still pending. - -## Phase 23: Unified Setup Command (status: done) -Goal: Provide a single guided command that bundles optional setup steps. -Work items: -- [x] Add a guided setup script that can install deps, dictionaries, models, extensions, tooling, and build indexes. -- [x] Support prompts when defaults fail or when optional tooling is detected. -- [x] Provide non-interactive flags for CI usage. -- [x] Document and add tests for the unified setup flow. - -## Maintenance / Refactor Guardrails (status: done) -- [x] Break `build_index.js` into focused modules (discovery/import scan/file processing/posting builders/artifact writers/metrics) to keep growth in check. - -## Deferred / Do Not Surface (status: deferred) -- [ ] Evaluate FTS5 vs BM25 parity on larger benchmarks and retune weights. - - Do not prioritize or bring this up unless explicitly requested. - -## Phase 24: Indexing Core Reliability (status: done) -- [x] Fix chunk weight wiring (`weightt` typo) and add a regression test for weight effects. -- [x] Use precomputed token frequencies in BM25 row building; remove unused `wordFreq`/`sparse` artifacts if they remain unused. -- [x] Add a config option to disable per-chunk `git blame` (or downgrade to file-level) for large repos. -- [x] Add empty-repo/zero-chunk coverage to ensure postings/metrics stay stable. - -## Phase 25: Language Parsing Hardening (status: done) -- [x] Improve TypeScript import parsing for multi-line imports/exports and dynamic `import()` calls. -- [x] Add JSX/Stage-3 parsing support (espree or tree-sitter) to avoid fallback chunking in `.jsx/.tsx`. 
-- [x] Extend cross-file inference beyond TS (Go/Rust/Java via tooling hooks). -- [x] Add fixtures/tests for `.tsx/.mts/.cts` and Python AST fallback. - -## Phase 26: Search + Scoring Consistency (status: done) -- [x] Unify MinHash implementation between indexing and search; add a compatibility test. -- [x] Decide on `sparse_postings_varint.bin`: consume it or remove it from outputs. -- [x] Add caching for search summaries and unify shared CLI/output code with sqlite search. -- [x] Expand filter coverage tests (return types, inferred types, returns/async flags). - -## Phase 27: SQLite Incremental Safety (status: done) -- [x] Validate schema version before incremental updates and force rebuild when mismatched. -- [x] Detect embedding model changes (id/dims) and rebuild or re-ingest dense vectors. -- [x] Add optional vocab pruning/compaction for long-lived incremental DBs. -- [x] Add tests for schema mismatch and vector-ann table sync after deletions. - -## Phase 28: Tooling + Cache UX (status: done) -- [x] Make `clean-artifacts --all` preserve models/dicts or add keep flags aligned with uninstall behavior. -- [x] Add `setup --json` summary output for CI automation. -- [x] Add Node-based archive extraction fallback for extension downloads. -- [x] Deduplicate shared helper logic across setup/bootstrap/clean/uninstall scripts. - -## Phase 29: MCP + Docs Quality (status: done) -- [x] Refresh `ROADMAP.md` or mark it as historical to avoid contradicting `COMPLETE_PLAN.md`. -- [x] Add async MCP build support (stream output vs `spawnSync`) and document error payloads. -- [x] Add MCP error-path tests (invalid repo path, missing indexes). -- [x] Add a docs consistency test to catch stale plan/roadmap references. - -## Phase 30: Scoring + JSON Consolidation (status: done) -Goal: Standardize scoring outputs across backends and make JSON payloads consistent and inspectable. -Work items: -- [x] Align score labels and semantics across memory/sqlite/sqlite-fts paths (including ANN fallback). -- [x] Add score breakdowns (BM25/FTS/ANN components, normalization flags, weights). -- [x] Ensure `--json-compact` preserves the same fields across backends and filters. -- [x] Update compare/parity harnesses to consume the unified score schema. -- [x] Add targeted tests for score breakdown parity. -Notes: -- Enhancement thread 1 (scoring transparency) is implemented here. - -## Phase 31: Index Pipeline Pluginization (status: done) -Goal: Replace large conditional flows with a registry-based indexing pipeline. -Work items: -- [x] Build a per-language/format registry for scanners, parsers, and enrichers. -- [x] Centralize shared helpers (tokenize, metadata normalization, relations). -- [x] Reduce build_index control flow into steps with explicit inputs/outputs. -- [x] Add fixtures/tests for registry ordering and missing-handler fallbacks. -Notes: -- Enhancement thread 3 (parser SDK) is implemented here. - -## Phase 32: Language Semantics Depth (status: done) -Goal: Improve type inference, control flow, and dataflow richness with interprocedural context. -Work items: -- [x] Expand intra-file type inference precision (literal unions, generics, propagation). -- [x] Add interprocedural summaries (callsite argument/return linking). -- [x] Extend dataflow with alias tracking for supported languages. -- [x] Add fidelity fixtures covering new semantic edges. -Notes: -- Enhancement thread 2 (language semantics) is implemented here. 
- -## Phase 33: Continuous Indexing (status: done) -Goal: Support live updates via watchers and git hooks with safe concurrency. -Work items: -- [x] Add a watch mode to trigger incremental indexing on file changes. -- [x] Add optional git hook installers (post-commit / post-merge). -- [x] Add lock/health checks to avoid concurrent writes. -- [x] Document workflows for CI and local dev. -Notes: -- Enhancement thread 4 (continuous update loop) is implemented here. - -## Phase 34: Artifact Lifecycle + Cache Hygiene (status: done) -Goal: Manage cache size, retention, and shared artifacts safely. -Work items: -- [x] Add cache quota and GC policy (age/size-based eviction). -- [x] Add artifact health checks and cold-cache rebuild hints. -- [x] Expand report-artifacts with per-repo and global rollups. -- [x] Add tests for GC and quota handling. -Notes: -- Enhancement thread 5 (cache/artifact hygiene) is implemented here. - -## Phase 35: MCP UX Enhancements (status: done) -Goal: Make MCP interactions richer, safer, and more transparent. -Work items: -- [x] Stream progress for long-running MCP tasks (index build, download). -- [x] Add remediation hints on common errors (missing models/dicts/sqlite). -- [x] Add MCP tool to inspect config + cache status with warnings. -- [x] Add MCP-focused tests for error and progress payloads. -Notes: -- Enhancement thread 6 (MCP UX) is implemented here. - -## Phase 36: Agent-Focused SAST Features (status: done) -Goal: Provide lightweight risk signals and flows for agent workflows. -Work items: -- [x] Add taint-like flow summaries for sources/sinks (configurable). -- [x] Add risky API usage detectors with metadata tags. -- [x] Add search filters for risk categories and flows. -- [x] Add fixtures/tests for sample flows. -Notes: -- Enhancement thread 7 (SAST-adjacent) is implemented here. - -## Phase 37: Triage Records + Context Packs (Phase 0: spec review + plan) (status: done) -Goal: Review the v1 triage spec, map touched systems, and capture assumptions for a safe rollout. -Work items: -- [x] Review newfeature.md and current build/search/config flows to map integration points. -- [x] Confirm cache-only storage for triage artifacts (no repo writes). -- [x] Document assumptions and guardrails before implementation. -Assumptions/guardrails: -- Keep `build_index --mode all` semantics as code+prose only; records are opt-in via `--mode records`. -- Triage records live under the repo cache by default; no triage data written to the repo tree. -- Promote only selected fields into `docmeta.record` to avoid bloating chunk metadata. -- Record indexing can be a full rebuild in v1 (expected low volume); incremental support is optional. -- Meta filtering uses case-insensitive matching and ignores missing fields rather than erroring. -- Context packs can invoke `search.js` via a child process in v1 (no core search refactor required). - -## Phase 38: Triage Records + Context Packs (Phase 1: config + paths + schema) (status: done) -Goal: Add triage config and path resolution, plus shared helpers for stable record IDs. -Work items: -- [x] Add `triage` config defaults to `.pairofcleats.json` and config loaders. -- [x] Extend `tools/dict-utils.js` with `getTriageRecordsDir()` and allow `getIndexDir(..., 'records')`. -- [x] Define shared helpers for recordId generation and promoted field extraction. 
- -## Phase 39: Triage Records + Context Packs (Phase 2: ingest + normalize + render + decisions) (status: done) -Goal: Ingest findings into normalized records and render human/indexable views. -Work items: -- [x] Implement `tools/triage/ingest.js` with Dependabot, AWS Inspector, and generic adapters. -- [x] Add normalization modules in `src/triage/normalize/` with parse warnings and metadata routing. -- [x] Add `src/triage/render.js` to render canonical markdown views. -- [x] Implement `tools/triage/decision.js` to create decision records linked to findings. - -## Phase 40: Triage Records + Context Packs (Phase 3: records indexing) (status: done) -Goal: Build a dedicated records index with prose-style tokenization and optional incremental caching. -Work items: -- [x] Allow `--mode records` in build args and route to a new records indexer. -- [x] Add `src/triage/index-records.js` to build `index-records` from record markdown + JSON. -- [x] Store promoted fields in `docmeta.record` and keep artifacts small. - -## Phase 41: Triage Records + Context Packs (Phase 4: records search + meta filters) (status: done) -Goal: Enable records search with metadata-first filtering and JSON output support. -Work items: -- [x] Extend `search.js` to include `--mode records` and optional `--meta`/`--meta-json`. -- [x] Add record output section and JSON `records` payloads in `src/search/output.js`. -- [x] Add generic file/ext filters if not already present and apply them to records. - -## Phase 42: Triage Records + Context Packs (Phase 5: context packs + MCP + tests + docs) (status: done) -Goal: Produce LLM-ready context packs, expose MCP tools, and add tests/fixtures/docs. -Work items: -- [x] Implement `tools/triage/context-pack.js` (history + repo evidence). -- [x] Add MCP tool wrappers for ingest/decision/context packs and allow `records` mode in MCP build/search. -- [x] Add triage fixtures + `tests/triage-records.js` and script wiring in `package.json`. -- [x] Update README + docs to describe triage workflows and new CLI/MCP tools. - -## Phase 43: Prioritized Issues - P0 Correctness (status: done) -Goal: Fix correctness issues and broken/unused CLI behavior. -Work items: -- [x] Fix `--churn` CLI parsing, numeric thresholds, cache keys, and docs. -- [x] Replace churn metric with git numstat-based churn; add tests. -- [x] Fix Unicode offset drift between indexing and rendering; add fixture test. -- [x] Remove or implement build `--chunk` option; update docs/tests. -- [x] Enable GitHub Actions workflows under `.github/workflows` with CI. - -## Phase 44: Prioritized Issues - P1 High ROI (status: done) -Goal: Bring MCP/CLI parity and improve indexing robustness. -Work items: -- [x] Expand MCP `search` filters to CLI parity and default to `--json-compact`. -- [x] Add MCP ops tools for download/build/maintain workflows. -- [x] Add `--path` alias filter and ensure CLI/MCP path/ext filters are consistent. -- [x] Auto-detect repo root for CLI/tools; add `--repo` overrides. -- [x] Add file-size guardrails with skip/partial index reporting. -- [x] Graceful shutdown for watch mode with lock cleanup. - -## Phase 45: Prioritized Issues - P2 Enhancements (status: done) -Goal: Improve search UX and reduce index footprint. -Work items: -- [x] Add negative terms and quoted phrases to query parsing. -- [x] Add modified-since/after filters (git-aware recency). -- [x] Add chunk-author filter and output rendering. -- [x] Make chargram/phrase-ngrams configurable and handle missing artifacts. 
-- [x] Clarify score fields (`score`, `annScore`, `scoreBreakdown`) in JSON + docs. -- [x] Remove redundant `call` vs `calls` filtering path. - -## Phase 46: Prioritized Issues - P3 Maintainability (status: done) -Goal: Improve packaging, configuration safety, and testability. -Work items: -- [x] Add `pairofcleats` CLI entrypoint with subcommands. -- [x] Add config schema + validation command. -- [x] Pin dependency versions (remove `*`) and document policy. -- [x] Refactor `search.js` into modules for testability. diff --git a/GIGAROAD/ROADMAP.md b/GIGAROAD/ROADMAP.md new file mode 100644 index 000000000..d920d561d --- /dev/null +++ b/GIGAROAD/ROADMAP.md @@ -0,0 +1,5430 @@ +## Phase 1 — Sublime Text 3 Plugin Foundation (Parity + Plumbing) + +### 1.1 Plugin repo structure + packaging + +* [ ] Create `sublime/PairOfCleats/` package skeleton: + + * [ ] `PairOfCleats.py` (entrypoint) + * [ ] `commands/` (command modules) + * [ ] `lib/` (helpers: config, subprocess, parsing, caching) + * [ ] `messages/` (install/upgrade notes) + * [ ] `Default.sublime-commands` + * [ ] `Main.sublime-menu` (optional) + * [ ] `Default.sublime-keymap` (optional) +* [ ] Add `README.md` for ST3 plugin installation + prerequisites +* [ ] Add “Package Control” compatibility notes (no external deps beyond Node runtime + repo binaries) + +### 1.2 Node/CLI discovery + execution contract + +* [ ] Implement robust “pairofcleats binary discovery”: + + * [ ] Prefer project-local `node_modules/.bin/pairofcleats` when available + * [ ] Fallback to global `pairofcleats` on PATH + * [ ] Allow explicit override in ST settings: `pairofcleats_path` +* [ ] Implement repo-root detection: + + * [ ] Prefer `.pairofcleats.json` location + * [ ] Fallback to `.git` root + * [ ] Fallback to folder of active file +* [ ] Implement subprocess wrapper: + + * [ ] Streams output to Sublime panel + * [ ] Captures JSON payloads when `--json` is used + * [ ] Supports cancellation (best-effort) + * [ ] Adds stable environment injection (cache root, embeddings mode, etc.) + +### 1.3 Settings + per-project overrides + +* [ ] Add `PairOfCleats.sublime-settings` defaults: + + * [ ] `pairofcleats_path`, `node_path` + * [ ] `index_mode_default` (code/prose/both) + * [ ] `search_backend_default` (memory/sqlite-fts/etc) + * [ ] `open_results_in` (quick_panel / new_tab / output_panel) +* [ ] Support `.sublime-project` settings overrides +* [ ] Validate config and surface actionable error messages + +### 1.4 Smoke tests (plugin-side) + +* [ ] Add Python unit tests that: + + * [ ] Import plugin modules without Sublime runtime (mock `sublime`, `sublime_plugin`) + * [ ] Validate binary discovery behavior + * [ ] Validate repo-root resolution on fixtures + * [ ] Validate settings overlay precedence + +--- + + +## Phase 2 — Sublime Search UX (Queries, Results, Navigation) + +### 2.1 Search command(s) + +* [ ] `PairOfCleats: Search` command: + + * [ ] Prompt input panel for query + * [ ] Optional toggles: code/prose/both, backend, limit + * [ ] Execute `pairofcleats search ... 
--json` +* [ ] `PairOfCleats: Search Selection` command: + + * [ ] Uses selected text as query +* [ ] `PairOfCleats: Search Symbol Under Cursor` command + +### 2.2 Results presentation + +* [ ] Quick panel results: + + * [ ] Show `file:line-range`, symbol name, snippet/headline, score + * [ ] Preserve stable ordering for repeatability +* [ ] On selection: + + * [ ] Open file at best-effort location (line/column) + * [ ] Highlight match range (if available) +* [ ] Add optional “results buffer” view (for large result sets) + +### 2.3 Quality-of-life UX + +* [ ] Query history (per project) +* [ ] “Repeat last search” command +* [ ] “Explain search” (if supported by CLI flags / internal explain output) + +### 2.4 Tests + +* [ ] Add Node-level “search contract” tests: + + * [ ] Ensure `--json` output parseability and required fields +* [ ] Add plugin tests: + + * [ ] Search command dispatches correct subprocess args + * [ ] Results parsing tolerates partial/missing optional fields + +--- + + +## Phase 3 — Index Lifecycle in Sublime (Build/Watch/Validate + Status) + +### 3.1 Build index commands + +* [ ] `PairOfCleats: Index Build (Code)` +* [ ] `PairOfCleats: Index Build (Prose)` +* [ ] `PairOfCleats: Index Build (All)` +* [ ] Stream progress to an output panel +* [ ] Persist “last index time” + “last index mode” in project cache + +### 3.2 Watch mode integration + +* [ ] `PairOfCleats: Index Watch Start` +* [ ] `PairOfCleats: Index Watch Stop` +* [ ] Prevent duplicate watchers per window/project +* [ ] Robust shutdown on Sublime exit / project close + +### 3.3 Validate + repair affordances + +* [ ] `PairOfCleats: Index Validate` +* [ ] Surface actionable failures (missing artifacts, invalid JSON, stale manifests) +* [ ] Provide “Open index directory” convenience command + +### 3.4 Tests + +* [ ] Node tests for index build/validate on fixtures +* [ ] Plugin tests for lifecycle commands and watcher gating + +--- + + +## Phase 4 — Codebase Semantic Map (Imports/Exports/Calls/Dataflow/Control Flow → Visual Map) + +### What this phase delivers + +A **real codebase map** that uses existing and enriched semantic metadata to generate a **diagram-ready model** and one or more **rendered artifacts**. + +It must explicitly incorporate and visualize: + +* **Imports / Exports / ImportLinks** +* **Calls / CallLinks / CallSummaries** +* **Usages / UsageLinks** +* **Signature / Modifiers / Params / Returns** +* **Reads / Writes / Mutates / Aliases** +* **Control flow** (branches, loops, throws, awaits, yields, returns) +* **AST-derived semantics** (using what the indexer already extracts) + +#### Visual grammar (required characteristics) + +* **File = outer shape** + + * Shape varies by file type/category (source/test/config/doc/generated/etc.) 
+* **Functions/classes = content inside the file shape**
+
+  * The “fill” of the file node is structurally subdivided to represent contained functions/classes
+* **Function details = nested sub-shapes inside function area**
+
+  * Small badges/segments represent modifiers/returns/dataflow/control-flow
+* **Multiple line styles = multiple edge semantics**
+
+  * Imports (file→file), control flow calls (fn→fn), usage deps (fn→fn), dataflow (arg/return/state)
+
+---
+
+### 4.1 Inventory + normalize available semantics from existing artifacts
+
+Leverage what is already produced today, and formalize how it’s consumed:
+
+* [ ] **Inputs** (expected present after `index build`):
+
+  * [ ] `file_relations.json` (imports, exports, usages, importLinks, functionMeta/classMeta)
+  * [ ] `repo_map.json` (chunk-level symbol map, exported flag, signatures)
+  * [ ] `chunk_meta.json` (docmeta/metaV2: signature/modifiers/returns/controlFlow/dataflow + relations)
+  * [ ] `graph_relations.json` (importGraph/callGraph/usageGraph)
+* [ ] Define “canonical IDs” used across the map:
+
+  * [ ] `fileId = <filePath>`
+  * [ ] `symbolId = <filePath>::<symbolName>` (already used in relation graphs)
+  * [ ] Stable IDs for anonymous/lambda cases (fallback: chunkId when name is `(anonymous)`)
+
+---
+
+### 4.2 Define a versioned “Map Model” schema (diagram-ready)
+
+This is the core contract the plugin will consume.
+
+* [ ] Create `docs/map-schema.json` (or similar) with:
+
+  * [ ] `version`
+  * [ ] `generatedAt`
+  * [ ] `root` (repo root logical id)
+  * [ ] `legend`:
+
+    * [ ] `nodeTypes` (file/function/class/symbol)
+    * [ ] `fileShapes` mapping (category → shape)
+    * [ ] `functionBadges` mapping (modifier/returns/dataflow/control-flow → badge glyph)
+    * [ ] `edgeTypes` mapping (imports/calls/usages/dataflow/aliases/mutations)
+    * [ ] `edgeStyles` mapping (solid/dashed/dotted/double, arrowheads, labels)
+  * [ ] `nodes`:
+
+    * [ ] file nodes with nested “members” (functions/classes)
+    * [ ] function nodes with structured “semantic facets”
+  * [ ] `edges` (typed, labeled, optionally “port-addressable”)
+* [ ] Schema must support **hierarchical nesting**:
+
+  * [ ] File node has `members[]` with per-member ports
+  * [ ] Member nodes (functions) include `signature`, `modifiers`, `returns`, `controlFlow`, `dataflow`
+* [ ] Determinism requirements:
+
+  * [ ] Stable ordering (sort keys/ids)
+  * [ ] Explicit timestamp field allowed, but everything else must be deterministic
+
+---
+
+### 4.3 Build the semantic “map extractor” (core engine tool)
+
+Implement a Node tool that reads index artifacts and produces the map model.
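+
+As a rough sketch of the intended skeleton (artifact names come from 4.1; the artifact shapes and model fields below are assumptions, not a finished contract):
+
+```js
+// Sketch only: the shapes of repo_map.json and graph_relations.json are
+// assumed here for illustration and must be aligned with the real artifacts.
+import { readFileSync } from 'node:fs';
+import path from 'node:path';
+
+function readArtifact(indexDir, name) {
+  return JSON.parse(readFileSync(path.join(indexDir, name), 'utf8'));
+}
+
+export function buildMapModel(indexDir) {
+  const repoMap = readArtifact(indexDir, 'repo_map.json');
+  const graphs = readArtifact(indexDir, 'graph_relations.json');
+
+  // File nodes with nested members (functions/classes).
+  const nodes = (repoMap.files ?? []).map((file) => ({
+    id: file.path, // fileId = <filePath>
+    kind: 'file',
+    members: (file.symbols ?? []).map((sym) => ({
+      id: `${file.path}::${sym.name}`, // symbolId = <filePath>::<symbolName>
+      signature: sym.signature ?? null,
+      modifiers: sym.modifiers ?? [],
+      returns: sym.returns ?? null,
+    })),
+  }));
+
+  // Typed edges; callGraph entries assumed to be { from, to } symbol ids.
+  const edges = (graphs.callGraph ?? []).map((e) => ({ type: 'call', from: e.from, to: e.to }));
+
+  // Determinism requirement from 4.2: stable ordering; only generatedAt varies.
+  nodes.sort((a, b) => a.id.localeCompare(b.id));
+  edges.sort((a, b) => `${a.from}->${a.to}`.localeCompare(`${b.from}->${b.to}`));
+
+  return { version: 1, generatedAt: new Date().toISOString(), nodes, edges };
+}
+```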
+
+* [ ] Add `tools/code-map.js` (or `tools/report-code-map.js`) that:
+
+  * [ ] Locates repo + index dirs using existing `tools/dict-utils.js`
+  * [ ] Loads:
+
+    * [ ] `file_relations.json`
+    * [ ] `repo_map.json`
+    * [ ] `chunk_meta.json` (or minimal subset)
+    * [ ] `graph_relations.json`
+  * [ ] Merges into a single “map model”:
+
+    * [ ] **Files** classified into categories (drives file shape)
+    * [ ] **Members** extracted per file:
+
+      * [ ] functions/methods/classes (from `repo_map` and/or chunk meta)
+      * [ ] include line ranges
+      * [ ] include `signature`, `modifiers`, `params`, `returns`
+    * [ ] **Function semantics**:
+
+      * [ ] `dataflow.reads`, `dataflow.writes`, `dataflow.mutations`, `dataflow.aliases`
+      * [ ] `controlFlow.branches/loops/returns/throws/awaits/yields/breaks/continues`
+      * [ ] `throws`, `awaits`, `yields`, `returnsValue` facets surfaced explicitly
+    * [ ] **Edges**:
+
+      * [ ] Import edges (file→file) from `importLinks` + raw `imports`
+      * [ ] Export edges (file→symbol) from `exports` + repo_map `exported`
+      * [ ] Call edges (fn→fn) from `callLinks` or `graph_relations.callGraph`
+      * [ ] Usage edges (fn→fn) from `usageLinks` or `graph_relations.usageGraph`
+      * [ ] Dataflow edges:
+
+        * [ ] Argument flow edges from `callSummaries.argMap` (caller→callee param ports)
+        * [ ] Return flow edges using inferred return metadata where available
+        * [ ] Optional: “state flow” edges when reads/writes/mutations overlap (guardrailed; see 4.6)
+      * [ ] Alias edges:
+
+        * [ ] derived from `dataflow.aliases` (function-local or cross-function via calls when resolvable)
+* [ ] Add CLI entrypoint:
+
+  * [ ] `pairofcleats report map` (preferred, consistent with existing `report` group), or
+  * [ ] `pairofcleats map` (top-level)
+* [ ] Support scope + size controls:
+
+  * [ ] `--scope repo|dir|file|symbol`
+  * [ ] `--focus <id>`
+  * [ ] `--include imports,calls,usages,dataflow,exports`
+  * [ ] `--only-exported`
+  * [ ] `--max-files N`, `--max-members-per-file N`, `--max-edges N`
+  * [ ] `--collapse file|dir` (aggregate mode)
+  * [ ] `--format json|dot|svg|html` (see 4.4)
+
+---
+
+### 4.4 Generate “shape-based” diagrams (DOT-first, with nested function fills)
+
+To match your “shape with fill containing functions” requirement cleanly, DOT/Graphviz is the most direct representation.
+
+* [ ] Implement a DOT generator `src/map/dot-writer.js`:
+
+  * [ ] **File nodes as outer shapes** with file-type-dependent shapes:
+
+    * [ ] Source code: `box` or `component`
+    * [ ] Tests: `box` with distinct border style
+    * [ ] Config/data: `cylinder` or `hexagon`
+    * [ ] Docs/prose: `note`
+    * [ ] Generated/build artifacts: `folder` or `box3d`
+  * [ ] **Fill represents members** using HTML-like labels:
+
+    * [ ] Outer `<TABLE>` represents the file “container”
+    * [ ] Each function/class is a row with a `PORT` so edges can land on that member specifically
+    * [ ] **Nested shapes inside the function row** (HTML sub-tables/cells) to represent:
+
+      * [ ] modifiers: async/static/generator/visibility
+      * [ ] signature/params summary
+      * [ ] returns/returnType/returnsValue indicator
+      * [ ] dataflow mini-badges: reads/writes/mutates/aliases counts (and/or top N symbols)
+      * [ ] controlFlow mini-badges: branches/loops/throws/awaits/yields
+* [ ] **Edge encoding** (multiple edge “line types”):
+
+  * [ ] Import edges: dashed file→file
+  * [ ] Call edges: solid function→function (primary control flow)
+  * [ ] Usage edges: thin/secondary style function→function
+  * [ ] Dataflow edges:
+
+    * [ ] dotted caller→callee(param) edges (argument flow)
+    * [ ] dotted callee→caller edges for return flow (if inferred)
+  * [ ] Mutation/state edges (optional, guardrailed): double-line or distinct style
+  * [ ] Alias edges: dashed-dotted, labeled `alias: a=b`
+* [ ] Output modes:
+
+  * [ ] `--format dot` always available
+  * [ ] `--format svg` if Graphviz present (shell out to `dot -Tsvg`)
+  * [ ] `--format html` wraps SVG + legend into a standalone HTML viewer
+* [ ] Implement legend rendering:
+
+  * [ ] Either embed as a DOT subgraph or in HTML wrapper
+  * [ ] Must document shape/edge meaning for users
+
+---
+
+### 4.5 Sublime Text 3 plugin commands for map generation + viewing
+
+Provide first-class UX inside Sublime, even if rendering happens externally.
+
+* [ ] Add commands:
+
+  * [ ] `PairOfCleats: Map (Repo)`
+  * [ ] `PairOfCleats: Map (Current Folder)`
+  * [ ] `PairOfCleats: Map (Current File)`
+  * [ ] `PairOfCleats: Map (Symbol Under Cursor)`
+  * [ ] `PairOfCleats: Map (Selection)`
+* [ ] Add a “Map Type” chooser:
+
+  * [ ] Import Map
+  * [ ] Call Map
+  * [ ] Usage/Dependency Map
+  * [ ] Dataflow Map (args/returns/state)
+  * [ ] Combined Map (guardrailed by size limits)
+* [ ] Implement output handling:
+
+  * [ ] Write outputs to `.pairofcleats/maps/` (repo-local) or cache dir
+  * [ ] Open `.dot` in Sublime for inspection
+  * [ ] If `.svg`/`.html` produced:
+
+    * [ ] Provide “Open in Browser” command (best-effort)
+* [ ] Navigation affordances:
+
+  * [ ] When a map is generated, also produce an indexable “node list” JSON:
+
+    * [ ] allows Sublime quick panel “Jump to node” (file/function)
+    * [ ] opens file at recorded `startLine`
+* [ ] Graceful degradation:
+
+  * [ ] If `astDataflow` / `controlFlow` metadata is unavailable in the index:
+
+    * [ ] show “limited map” warning
+    * [ ] offer action: “Rebuild index with dataflow/control-flow enabled” (invokes `index build` with the project’s config expectations)
+
+---
+
+### 4.6 Performance guardrails + scaling strategy (mandatory for real repos)
+
+This phase will generate *very large graphs* unless explicitly constrained.
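+
+As one illustration of the policy the checklist below describes (the model shape and limit names are assumptions), a deterministic truncation pass might look like:
+
+```js
+// Sketch of a guardrail pass over a file-level map model; assumes edges
+// connect node ids directly (member-level edges would aggregate to files first).
+export function applyGuardrails(model, { maxFiles = 200, maxEdges = 2000 } = {}) {
+  // "Top-K by degree": rank nodes by call/import fan-in/out.
+  const degree = new Map();
+  for (const e of model.edges) {
+    degree.set(e.from, (degree.get(e.from) ?? 0) + 1);
+    degree.set(e.to, (degree.get(e.to) ?? 0) + 1);
+  }
+  const nodes = [...model.nodes]
+    .sort((a, b) =>
+      (degree.get(b.id) ?? 0) - (degree.get(a.id) ?? 0) ||
+      a.id.localeCompare(b.id)) // stable tie-break: same inputs, same output
+    .slice(0, maxFiles);
+
+  const kept = new Set(nodes.map((n) => n.id));
+  const edges = model.edges
+    .filter((e) => kept.has(e.from) && kept.has(e.to))
+    .slice(0, maxEdges);
+
+  // Failure-mode policy: emit a truncation summary instead of failing.
+  const truncation =
+    nodes.length < model.nodes.length || edges.length < model.edges.length
+      ? { droppedNodes: model.nodes.length - nodes.length,
+          droppedEdges: model.edges.length - edges.length }
+      : null;
+  return { ...model, nodes, edges, truncation };
+}
+```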
+ +* [ ] Hard limits with user-overrides: + + * [ ] `maxFiles`, `maxMembersPerFile`, `maxEdges` + * [ ] edge sampling policies per edge type +* [ ] Aggregation modes: + + * [ ] Directory-level aggregation (folder nodes contain files) + * [ ] File-only map (no nested functions) + * [ ] Export-only functions view + * [ ] “Top-K by degree” (highest call/import fan-in/out) +* [ ] Deterministic sampling: + + * [ ] same inputs → same output (stable selection) +* [ ] Cache map builds keyed by: + + * [ ] index signature + generator options +* [ ] Failure mode policy: + + * [ ] If size exceeds limits, output a “truncated map” plus a summary explaining what was dropped + +--- + +### 4.7 Tests (core + integration + determinism) + +Add explicit automated coverage for the map feature. + +#### Node tool tests (authoritative) + +* [ ] `tests/code-map-basic.js` + + * [ ] Build a tiny fixture repo with: + + * [ ] imports/exports + * [ ] functions calling other functions + * [ ] a function with reads/writes/mutations/aliases + * [ ] a function with branches/loops/throws/awaits + * [ ] Run `build_index.js --stub-embeddings` + * [ ] Run `pairofcleats report map --format json` + * [ ] Assert: + + * [ ] file nodes exist + * [ ] member nodes include `signature/modifiers/returns/dataflow/controlFlow` + * [ ] edge sets include imports + calls +* [ ] `tests/code-map-dot.js` + + * [ ] Generate DOT output + * [ ] Assert: + + * [ ] file “container” nodes exist + * [ ] function rows/ports exist + * [ ] edges connect to ports (caller fn → callee fn) + * [ ] distinct edge styles appear for import vs call vs dataflow +* [ ] `tests/code-map-determinism.js` + + * [ ] Run map generation twice and compare outputs (ignore `generatedAt`) +* [ ] `tests/code-map-guardrails.js` + + * [ ] Generate a repo with many dummy functions + * [ ] Ensure truncation behavior is correct and stable + +#### Plugin-side tests + +* [ ] Python unit tests: + + * [ ] command registration exists + * [ ] subprocess args are correct for each map command + * [ ] output paths computed correctly + * [ ] “Graphviz missing” fallback behavior (DOT-only) works + +--- + + +## Phase 5 — Optional: Service-Mode Integration for Sublime (API-backed Workflows) + +*(Renumbered from prior Phase 28; content largely unchanged, but consider adding map endpoints.)* + +### 5.1 Map endpoints (if service mode is adopted) + +* [ ] Extend `api-server` to support: + + * [ ] `GET /map?scope=...&format=...` + * [ ] `GET /map/nodes?filter=...` for quick panels +* [ ] Sublime plugin optionally consumes the API for faster iteration + +### 5.2 Tests + +* [ ] API contract tests for map endpoints +* [ ] Sublime plugin integration tests (mock HTTP server) + +--- + + +## Phase 6 — Distribution Readiness (Package Control + Cross-Platform) + +*(Renumbered from prior Phase 29.)* + +* [ ] Packaging rules for ST3 (no compiled Python deps) +* [ ] Windows/macOS/Linux path + quoting correctness +* [ ] Document Graphviz optional dependency (for SVG/HTML rendering) +* [ ] Provide minimal “DOT-only mode” documentation + +Tests: + +* [ ] `python -m py_compile` over plugin package +* [ ] Cross-platform subprocess quoting tests (Node) + +--- + + +## Phase 7 — Verification Gates (Regression + Parity + UX Acceptance) + +*(Renumbered from prior Phase 30.)* + +* [ ] Parity checklist vs existing extension behaviors (where applicable) +* [ ] Deterministic outputs for map/search commands +* [ ] Performance acceptance criteria (map generation with guardrails) +* [ ] End-to-end smoke suite including: + + * [ ] 
index build + * [ ] search + * [ ] map generation (json + dot) + * [ ] optional svg rendering when Graphviz available + +--- + +### Notes on dependency leverage (aligned to the map phase) + +This map phase is intentionally designed to **maximize reuse** of what the repo already has: + +* Existing semantics extraction already provides the key fields you listed: + + * `imports/exports/usages/importLinks` via relations + * `calls/callDetails` + cross-file `callLinks/usageLinks/callSummaries` + * `signature/modifiers/returns` via docmeta/functionMeta + * `reads/writes/mutations/aliases` via AST dataflow (when enabled) + * `controlFlow` counts already present in docmeta/functionMeta +* Existing graph tooling: + + * `graphology`-backed `graph_relations.json` provides a strong base graph layer +* The missing piece is the **visual model + rendering/export** and **Sublime UX** around it, which Phase 28 supplies. + + +## Phase 8 — Test Gate Stabilization and Determinism + +**Objective:** Make the current test suite reliable (non-flaky) and green, so subsequent refactors (security, caching, RPC hardening) have a trustworthy safety net. + +1. **Fix failing Phase 22 gate: `type-inference-lsp-enrichment` (Python tooling return type missing)** + + * [ ] **Broaden hover fallback conditions in LSP tooling providers so missing return types are recovered even when parameter types are present.** + + * **Why:** All three LSP tooling providers currently only fetch hover when *both* `returnType` is missing *and* `paramTypes` is empty. If a provider can parse param types from `documentSymbol.detail` but that string omits return type (a plausible LSP behavior), it will never attempt hover and will miss return types (exact symptom reported by the failing test). + * **Where:** + + * `src/index/tooling/pyright-provider.js` + + * Current gating (too strict): + `if (!info || (!info.returnType && !Object.keys(info.paramTypes || {}).length)) { ... hover ... }` + * `src/index/tooling/clangd-provider.js` (same pattern) + * `src/index/tooling/sourcekit-provider.js` (same pattern) + * **Fix:** + + * Change hover fallback gating to trigger when **either** return type is missing **or** param types are missing, e.g.: + + * `if (!info || !info.returnType || !Object.keys(info.paramTypes || {}).length) { ... }` + * Keep a small timeout override (already present) and consider a per-file/per-symbol hover cap if you want to prevent worst-case hover storms. + * **Tests:** + + * Keep `tests/type-inference-lsp-enrichment.js` as the regression gate. + * Add/adjust a focused unit/integration test fixture path where `documentSymbol.detail` omits return type but hover includes it (this directly validates the new behavior rather than relying on chance). + * [ ] **Validate stored tooling return types match exact expectations for Python (`str`)** + + * **Why:** The test asserts `entry.type === 'str'` (exact string match). Any normalization differences (e.g., `builtins.str`, `str:`) will fail. + * **Where:** Return type extraction path: + + * `src/index/tooling/signature-parse/python.js` (`parsePythonSignature`) + * `src/index/tooling/pyright-provider.js` (populating `entry.returns`) + * `src/index/type-inference-crossfile/apply.js` (`addInferredReturn`) + * **Fix:** Ensure the Python return type passed into `addInferredReturn()` is the normalized “plain” name the project expects (currently looks intended to already be `str`, but explicitly confirm by tests). + +2. 
**Fix failing Phase 22 gate: `embeddings-dims-mismatch` (test is flaky due to cache file selection)** + + * [ ] **Make the test select a cache entry that matches the identity it intends to mutate.** + + * **Why:** The cache directory can contain *multiple* caches for the same file hash/signature but different identity keys (e.g., stub embeddings default dims 384 from `build_index` stage vs. a subsequent `build-embeddings --dims 8`). The test currently mutates an arbitrary first file returned by `readdir`, which is OS/filesystem-order dependent, causing nondeterministic behavior (observed in `tests/phase22-logs/embeddings-dims-mismatch.js.log`). + * **Where:** `tests/embeddings-dims-mismatch.js` + + * Current behavior: `const targetFile = cacheFiles[0];` (no filtering) + * **Fix (recommended):** + + * Read all cache files, parse JSON, and select one whose `cacheMeta.identity.dims === 8` **and** `cacheMeta.identity.stub === true` (or match `cacheMeta.identityKey` computed from `buildCacheIdentity`). + * Sort `cacheFiles` for determinism even after filtering. + * **Tests:** The test itself is the gate; ensure it passes consistently on Windows/macOS/Linux. + +3. **De-flake related embeddings cache test to prevent future intermittent failures** + + * [ ] Apply the same deterministic cache selection strategy to `tests/embeddings-cache-identity.js`. + + * **Why:** It uses the same “first file” selection pattern and can fail depending on directory enumeration order and presence of other identity caches. + * **Where:** `tests/embeddings-cache-identity.js` + * **Fix:** Filter for identity matching the run’s intended dims/provider/stub flags (same as above), and sort before selecting. + +4. **Add a “Phase 22 gate” smoke runner (optional but strongly recommended)** + + * [ ] Create a single script to run only the gate tests and report failures clearly. + + * **Why:** Reduces time-to-signal and encourages frequent local verification during refactors. + * **Where:** e.g., `tools/run-phase22-gates.js` or `npm run test:phase22` + * **Exit expectation:** One command that deterministically reproduces CI gate results. + +**Exit criteria** + +* [ ] `tests/type-inference-lsp-enrichment.js` passes. +* [ ] `tests/embeddings-dims-mismatch.js` passes deterministically (no filesystem-order dependence). +* [ ] `tests/embeddings-cache-identity.js` passes deterministically. +* [ ] No new flaky tests introduced (verified via at least 5 repeated local runs on one platform, and ideally at least one Windows run). + +--- + + +## Phase 9 — Security and Input-Hardening (Local Servers + Indexing) + +**Objective:** Close high-impact vulnerabilities and unsafe defaults that could be exploited when indexing untrusted repositories or exposing the local API server beyond localhost. + +1. **Prevent symlink-based repo escape during discovery/indexing** + + * [ ] **Stop following symlinks when discovering and stat’ing files.** + + * **Why:** If a repository contains a tracked symlink pointing outside the repo (e.g., to `/etc/passwd`), the current logic can follow it and read/index external files. This is a classic “repo escape / data exfiltration” risk when indexing untrusted repos. + * **Where:** `src/index/build/discover.js` + + * Uses `fs.stat()` (follows symlinks) on each path. + * **Fix:** + + * Use `lstat` first; if it is a symlink: + + * Default behavior: **skip** the entry. + * Optional (configurable) behavior: allow symlinks only if resolved target remains within `rootDir` (realpath boundary check). 
+       * Ensure both “git ls-files” path discovery and fallback `fdir` scanning apply the same symlink policy.
+     * **Tests:**
+
+       * Add a fixture repo containing a symlink file pointing outside repo root.
+       * Assert indexing does not read it (and ideally logs a warning or records a skip reason).
+   * [ ] **Ensure downstream file reads cannot accidentally follow symlinks even if discovery misses one.**
+
+     * **Why:** Defense-in-depth; discovery should prevent it, but a second gate at file-read time reduces risk.
+     * **Where:** `src/index/build/file-processor.js` and any shared read helpers (e.g., `src/shared/encoding.js` `readTextFileWithHash`)
+     * **Fix:** If feasible, check `lstat` before read in the pre-read stage (or pass `lstat` results from discovery and enforce “no symlink reads”).
+
+2. **Lock down API server defaults (CORS, repo selection, and exposure)**
+
+   * [ ] **Remove unconditional permissive CORS (`Access-Control-Allow-Origin: *`) or make it explicitly opt-in.**
+
+     * **Why:** If the server is started with `--host 0.0.0.0` (supported), permissive CORS plus no auth makes it trivial for any web page on the same network to call the API from a browser (cross-site request from an untrusted origin).
+     * **Where (currently sets `*`):**
+
+       * `tools/api/router.js` (sets headers broadly, including metrics endpoint)
+       * `tools/api/response.js`
+       * `tools/api/sse.js`
+     * **Fix (recommended safe default):**
+
+       * Default allowlist: `http://127.0.0.1:*` and `http://localhost:*` only (or no CORS headers at all unless configured).
+       * Add config flags:
+
+         * `api.cors.allowedOrigins` (array)
+         * `api.cors.allowAnyOrigin` (explicit opt-in, default false)
+   * [ ] **Add authentication for non-localhost bindings (or always, with a “dev disable” escape hatch).**
+
+     * **Why:** The API allows expensive operations (search) and can access the filesystem via repo selection (see next item). This should not be anonymous if reachable from other machines.
+     * **Fix:**
+
+       * Support a bearer token header, e.g. `Authorization: Bearer <token>`, with a `PAIROFCLEATS_API_TOKEN` env var.
+       * If `host` is not `127.0.0.1/localhost`, require token by default.
+   * [ ] **Restrict `repoPath` override in API requests (prevent arbitrary filesystem indexing/search).**
+
+     * **Why:** Current API accepts a request body that can set `repoPath`, and then resolves and operates on that directory. Without an allowlist, this is arbitrary directory read/search capability.
+     * **Where:** `tools/api/router.js` `resolveRepo(value)` and usage in `/search`, `/status`, `/stream/search`.
+     * **Fix options:**
+
+       * Option A (strict): disallow `repoPath` in request; only use the server’s configured repo.
+       * Option B (allowlist): allow only if within a configured set of allowed roots (`api.allowedRepoRoots`), enforced by realpath boundary checks.
+     * **Tests:**
+
+       * Confirm requests with disallowed repoPath return 400/403.
+       * Confirm allowed repo paths still work.
+
+3. **Harden API request body parsing and limits**
+
+   * [ ] **Replace string concatenation body parsing with byte-safe buffering and strict size enforcement.**
+
+     * **Why:** Current `parseBody` in `tools/api/router.js` does `data += chunk` and uses `data.length` (characters, not bytes). This is less reliable and can be slower for large payloads due to repeated string reallocations.
+     * **Fix:**
+
+       * Accumulate Buffers in an array; track `byteLength`.
+       * Enforce a hard cap in bytes (e.g., 1 MiB configurable).
+       * Only decode once at the end.
+ * [ ] **Validate `Content-Type` for JSON endpoints.** + + * **Why:** Avoid ambiguous parsing and reduce attack surface. + * **Fix:** Require `application/json` for POST bodies on `/search` and stream endpoints (except where intentionally flexible). + +**Exit criteria** + +* [ ] Indexing does not follow symlinks by default (tested with a symlink fixture). +* [ ] API no longer emits permissive CORS headers by default. +* [ ] API requests cannot arbitrarily set `repoPath` unless explicitly allowed/configured. +* [ ] API body parsing is byte-safe and enforces a clear, tested size limit. + +--- + + +## Phase 10 — RPC Robustness and Memory-Safety (LSP + MCP + JSON-RPC) + +**Objective:** Prevent unbounded memory growth and improve resilience when communicating with external processes (LSP servers, MCP transport), including malformed or oversized JSON-RPC frames. + +1. **Implement `maxBufferBytes` enforcement in framed JSON-RPC parser** + + * [ ] **Enforce `maxBufferBytes` in `createFramedJsonRpcParser`.** + + * **Why:** The function accepts `maxBufferBytes` but does not enforce it, leaving an unbounded buffer growth path if a peer sends large frames or never terminates headers. + * **Where:** `src/shared/jsonrpc.js` (`createFramedJsonRpcParser`) + * **Fix:** + + * Track buffer size after concatenation. + * If buffer exceeds limit: + + * Clear internal buffer. + * Call `onError(new Error(...))`. + * Optionally enter a “failed/closed” state to reject further data. + * Consider separate thresholds: + + * `maxHeaderBytes` (protect header scan) + * `maxMessageBytes` (protect content-length payload) + * [ ] **Add explicit tests for oversized frames.** + + * **Where:** Add a new unit test under `tests/` that pushes > limit into parser and asserts: + + * `onError` called + * parser does not continue to grow memory + +2. **Apply bounded JSON-RPC parsing in LSP client** + + * [ ] Replace `StreamMessageReader` usage with the bounded framed parser (or wrap it with size checks). + + * **Why:** `StreamMessageReader` will buffer messages; without explicit size enforcement at your integration boundary, a misbehaving server can cause OOM. + * **Where:** `src/integrations/tooling/lsp/client.js` + * **Fix:** + + * Wire `proc.stdout` `data` into `createFramedJsonRpcParser`. + * Feed parsed messages into the existing dispatch/response correlation logic. + * Ensure shutdown/kill closes parser cleanly. + +3. **Apply bounded JSON-RPC parsing in MCP transport** + + * [ ] Replace `StreamMessageReader` usage similarly. + + * **Where:** `tools/mcp/transport.js` + * **Fix:** Same pattern as LSP client; enforce message size limits and fail gracefully. + +**Exit criteria** + +* [ ] `createFramedJsonRpcParser` enforces max buffer/message sizes with tests. +* [ ] LSP client no longer relies on unbounded message buffering. +* [ ] MCP transport no longer relies on unbounded message buffering. + +--- + + +## Phase 11 — Resource Lifecycle Management (Caches, Long-Lived Servers, Builds) + +**Objective:** Prevent memory and resource leaks in long-running processes (API server, service workers), especially across repeated builds and multi-repo usage. + +1. **Add eviction/TTL for API router repo-level caches** + + * [ ] **Implement eviction for `repoCaches` map in `tools/api/router.js`.** + + * **Why:** `repoCaches` can grow unbounded if clients query multiple repos or if repo roots vary. Each entry can hold heavy caches (index cache + sqlite connections). 
+ * **Fix:** + + * Add: + + * `maxRepos` (e.g., 3–10) + * `repoTtlMs` (e.g., 10–30 minutes) + * Track `lastUsed` and evict least-recently-used / expired. + * On eviction: close sqlite cache handles (`sqliteCache.close()`), clear index cache. + * [ ] Add metrics for cache size and evictions. + + * **Where:** `tools/api/router.js` and metrics registry. + +2. **Add eviction for per-repo index cache and sqlite DB cache** + + * [ ] **Index cache eviction** + + * **Why:** `src/retrieval/index-cache.js` caches by `dir` (which can change per build). On repeated re-indexing, old build directories can accumulate. + * **Fix:** Convert to LRU with max entries, or TTL purge on access. + * [ ] **SQLite DB cache eviction** + + * **Where:** `src/retrieval/sqlite-cache.js` + * **Why:** Same “dir-per-build” key pattern; can leak connections/handles. + * **Fix:** LRU/TTL + ensure `close()` called on eviction. + +3. **Add explicit cache invalidation when “current build” pointer changes** + + * [ ] Detect when the effective index directory changes (new build) and prune caches for previous builds. + + * **Why:** Keeps hot caches relevant and bounds memory footprint. + +**Exit criteria** + +* [ ] API server memory does not grow unbounded when indexing/searching multiple repos/builds. +* [ ] Old build caches are evicted/pruned automatically. +* [ ] SQLite handles are closed on eviction (verified via tests or instrumentation). + +--- + + +## Phase 12 — Performance and Operational Hardening + +**Objective:** Improve throughput and robustness under load without changing core behavior. + +1. **Reduce event-loop blocking sync filesystem calls on API request paths** + + * [ ] Replace `fsSync.*` in API request hot paths with async equivalents where practical. + + * **Why:** Sync I/O can stall concurrent requests in the API server process. + * **Where (examples):** + + * `tools/api/router.js` `resolveRepo()` uses `existsSync/statSync`. + * **Fix:** Use `fs.promises.stat` with try/catch; cache results briefly if needed. + +2. **Prevent decompression “zip bomb” style memory spikes in artifact reading** + + * [ ] Add output size limiting to gzip decompression. + + * **Why:** `src/shared/artifact-io.js` uses `gunzipSync(buffer)` and only checks decompressed size *after* decompression. A small compressed file could expand massively and spike memory. + * **Fix:** + + * Use `zlib.gunzipSync(buffer, { maxOutputLength: maxBytes + slack })` (if supported in your Node target), or switch to streaming gunzip with explicit byte limits. + * **Where:** `src/shared/artifact-io.js` `parseBuffer` / gzip handling. + +3. **Add download size limits for tools that fetch large remote assets** + + * [ ] Enforce maximum download size (or require hash) for dictionary downloads. + + * **Why:** `tools/download-dicts.js` buffers the entire response in memory (`Buffer.concat`) without a hard cap. + * **Fix:** Stream to disk with a cap; abort if exceeded; strongly prefer requiring hashes for non-default URLs. + +**Exit criteria** + +* [ ] API request path avoids avoidable sync I/O. +* [ ] Artifact gzip parsing cannot explode memory beyond configured limits. +* [ ] Large downloads are bounded and/or verified. + +--- + + +## Phase 13 — Documentation and Configuration Hardening + +**Objective:** Ensure the fixed behavior is discoverable, configurable, and hard to misconfigure into an unsafe state. + +1. 
**Document security posture and safe defaults** + + * [ ] Document: + + * API server host binding risks (`--host 0.0.0.0`) + * CORS policy and how to configure allowed origins + * Auth token configuration (if implemented) + * RepoPath allowlist behavior + * [ ] Add a prominent note: indexing untrusted repos and symlinks policy. + +2. **Add configuration schema coverage for new settings** + + * [ ] If adding config keys (CORS/auth/cache TTL), ensure they are: + + * Reflected in whatever config docs you maintain + * Validated consistently (even if validation is lightweight) + +**Exit criteria** + +* [ ] README/docs reflect new defaults and how to safely expose services. +* [ ] New options are documented and validated enough to prevent silent misconfiguration. + +--- + +--- + + +## Phase 14 — Optional-dependency framework + capability registry (foundation for all phases) + +### 14.1 Introduce a consistent “optional dependency” loader + +* [ ] Add `src/shared/optional-deps.js` with a single, opinionated API: + + * [ ] `tryRequire(name)` / `tryImport(name)` helpers (use `createRequire(import.meta.url)` where needed) + * [ ] Standardized return shape: `{ ok: true, mod } | { ok: false, error, reason }` + * [ ] Standardized logging hook (only when `PAIROFCLEATS_VERBOSE` or a dedicated flag is enabled) +* [ ] Add `src/shared/capabilities.js` that reports runtime availability: + + * [ ] `watcher: { chokidar: true, parcel: boolean }` + * [ ] `regex: { re2: boolean, re2js: true }` + * [ ] `hash: { nodeRsXxhash: boolean, wasmXxhash: true }` + * [ ] `compression: { gzip: true, zstd: boolean }` + * [ ] `extractors: { pdf: boolean, docx: boolean }` + * [ ] `mcp: { sdk: boolean, legacy: true }` + * [ ] `externalBackends: { tantivy: boolean, lancedb: boolean }` (even if “boolean” means “reachable” rather than “installed”) +* [ ] Wire capabilities into existing “status” surfaces: + + * [ ] Extend `tools/mcp/repo.js` → `configStatus()` to include capability info and warnings for requested-but-unavailable features + * [ ] Extend `tools/config-dump.js` (or equivalent) to print capabilities in JSON output mode + +### 14.2 Add config + env “backend selectors” (uniform UX) + +* [ ] Extend `src/shared/env.js` to parse new selectors (string + allowlist): + + * [ ] `PAIROFCLEATS_WATCHER_BACKEND` = `auto|chokidar|parcel` + * [ ] `PAIROFCLEATS_REGEX_ENGINE` = `auto|re2|re2js` + * [ ] `PAIROFCLEATS_XXHASH_BACKEND` = `auto|native|wasm` + * [ ] `PAIROFCLEATS_COMPRESSION` = `auto|gzip|zstd|none` + * [ ] `PAIROFCLEATS_DOC_EXTRACT` = `auto|on|off` + * [ ] `PAIROFCLEATS_MCP_TRANSPORT` = `auto|sdk|legacy` +* [ ] Add parallel config keys in `.pairofcleats.json` (keep them near existing related config blocks): + + * [ ] `indexing.watch.backend` + * [ ] `search.regex.engine` + * [ ] `indexing.hash.backend` + * [ ] `indexing.artifactCompression.mode` enum expansion + `auto` + * [ ] `indexing.documentExtraction.enabled` + * [ ] `mcp.transport` +* [ ] Update `docs/config-schema.json`: + + * [ ] Add/expand enums (avoid “free string” for anything that’s meant to be policy-controlled) + * [ ] Add descriptions that clarify fallback rules (`auto` behavior) +* [ ] Update any config validation code paths if they enforce known keys (`src/config/validate.js` is schema-driven; keep schema authoritative) + +### 14.3 Add dependency-bundle reference stubs (keeps repo documentation consistent) + +For each new dependency introduced in later phases, add a minimal doc file under: +`docs/references/dependency-bundle/deps/.md` + +* [ ] 
`parcel-watcher.md` +* [ ] `re2.md` +* [ ] `node-rs-xxhash.md` +* [ ] `mongodb-js-zstd.md` +* [ ] `pdfjs-dist.md` +* [ ] `mammoth.md` +* [ ] `modelcontextprotocol-sdk.md` +* [ ] `lancedb.md` (if used) +* [ ] `tantivy.md` (if used) +* [ ] Update `docs/references/dependency-bundle/README.md` if it has an index + +### 14.4 Tests (framework-level) + +* [ ] Add `tests/capabilities-report.js`: + + * [ ] Asserts `capabilities` object shape is stable + * [ ] Asserts `auto` selectors never throw when optional deps are missing +* [ ] Add a script-coverage action to run it: + + * [ ] `tests/script-coverage/actions.js`: add action entry that calls `runNode(...)` + * [ ] (Optional) Add an npm script alias if you want parity with the rest of the repo scripts + +**Exit criteria** + +* [ ] All “capability” calls are side-effect-free and safe when optional deps are absent +* [ ] `config_status` (MCP) can surface “you requested X but it’s not available” warnings without crashing +* [ ] CI passes on Node 18 (Ubuntu + Windows lanes) + +--- + + +## Phase 15 — File watching performance: add `@parcel/watcher` backend (keep chokidar fallback) + +### 15.1 Add the dependency (prefer optional unless you want it guaranteed everywhere) + +* [ ] Add `@parcel/watcher` to `package.json` + + * [ ] Prefer `optionalDependencies` if you want installs to succeed even when native builds fail + * [ ] If you add it as a hard dependency, ensure Windows CI remains green + +### 15.2 Create a watcher-backend abstraction + +* [ ] Create `src/index/build/watch/backends/types.js` (or inline JSDoc contract) describing: + + * [ ] `start({ root, ignored, onEvent, onError, pollMs? }) -> { close(): Promise }` + * [ ] Normalized event shape: `{ type: 'add'|'change'|'unlink', absPath }` +* [ ] Extract chokidar wiring out of `src/index/build/watch.js`: + + * [ ] Move into `src/index/build/watch/backends/chokidar.js` + * [ ] Preserve existing semantics (`awaitWriteFinish`, ignored matcher, poll support) +* [ ] Implement parcel watcher backend: + + * [ ] New file: `src/index/build/watch/backends/parcel.js` + * [ ] Map parcel events to the normalized `{type, absPath}` model + * [ ] Decide how to handle rename/move (often appears as unlink+add): + + * [ ] If parcel reports rename, still emit unlink+add for compatibility with current scheduling + * [ ] Implement “poll” behavior: + + * [ ] If poll mode is requested, either: + + * [ ] force chokidar with polling, **or** + * [ ] implement a cheap stat-based poller wrapper (only if needed) + * [ ] Implement “write stability” guard: + + * [ ] Chokidar has `awaitWriteFinish`; parcel does not in the same way + * [ ] Add a “stabilize file” check in the pipeline: before processing a file, optionally confirm `mtime/size` stable across N ms + * [ ] Place this in `createDebouncedScheduler()` or immediately before `enqueueOrUpdate()` in `file-processor.js` (prefer a single shared guard) + +### 15.3 Wire selection into `watchIndex()` + +* [ ] Update `src/index/build/watch.js`: + + * [ ] Choose backend via (in order): CLI/config → env → `auto` capability + * [ ] Log selected backend once at startup (only if verbose or `--watch`) + * [ ] Ensure `pollMs` is still honored (either by backend or by selection logic) + +### 15.4 Tests + +* [ ] Add `tests/watch-backend-selection.js`: + + * [ ] Forces `PAIROFCLEATS_WATCHER_BACKEND=chokidar` and asserts no parcel import occurs + * [ ] Forces `...=parcel` and asserts fallback behavior if module unavailable (no crash, warning path) +* [ ] Add 
`tests/watch-stability-guard.js`: + + * [ ] Simulate “partial write” (write file in two chunks with delay) and assert processor waits/defers correctly + * [ ] Keep the test deterministic: use explicit timeouts and a temp directory under `tests/.cache` +* [ ] Add corresponding script-coverage actions in `tests/script-coverage/actions.js` + +**Exit criteria** + +* [ ] `pairofcleats index watch` remains correct on Windows and Linux +* [ ] No regressions in ignore behavior (still uses `buildIgnoredMatcher`) +* [ ] Event storms do not cause repeated redundant rebuilds (existing debounce logic preserved) + +--- + + +## Phase 16 — Safe regex acceleration: optional native RE2 (`re2`) with `re2js` fallback + +### 16.1 Add dependency + backend wrapper + +* [ ] Add `re2` (native) as an optional dependency (recommended) +* [ ] Refactor `src/shared/safe-regex.js` into a backend-based module: + + * [ ] Keep current behavior as the fallback backend (`re2js`) + * [ ] Add `src/shared/safe-regex/backends/re2.js` + * [ ] Add `src/shared/safe-regex/backends/re2js.js` (wrap existing usage cleanly) +* [ ] Preserve existing safety constraints: + + * [ ] `maxPatternLength` + * [ ] `maxInputLength` + * [ ] Guard flags normalization (only `gimsyu` supported as today) + +### 16.2 Integrate selector + compatibility contract + +* [ ] Add `createSafeRegex({ engine, ...limits })` selection: + + * [ ] `engine=auto` uses `re2` if available else `re2js` + * [ ] `engine=re2` hard-requires native; if missing, returns a clear error (or a warning + fallback if you prefer) +* [ ] Validate behavioral parity: + + * [ ] Ensure `.exec()` and `.test()` match expectations for `g` and non-`g` + * [ ] Ensure `.lastIndex` semantics are either compatible or explicitly *not supported* (and documented) + +### 16.3 Update call sites + +* [ ] Verify these flows still behave correctly: + + * [ ] `src/retrieval/output/filters.js` (file/path filters) + * [ ] `src/retrieval/output/risk-tags.js` (risk tagging) + * [ ] Any structural search / rulepack path using regex constraints + +### 16.4 Tests + +* [ ] Add `tests/safe-regex-engine.js`: + + * [ ] Conformance tests (flags, match groups, global behavior) + * [ ] Safety limit tests (pattern length, input length) + * [ ] Engine-selection tests (`auto`, forced `re2js`) +* [ ] Add script-coverage action(s) + +**Exit criteria** + +* [ ] No user-visible semantic regressions in filtering/risk-tagging +* [ ] “Engine auto” is safe and silent (no noisy logs) unless verbose + +--- + + +## Phase 17 — Hashing performance: optional native xxhash (`@node-rs/xxhash`) with `xxhash-wasm` fallback + +### 17.1 Add dependency + unify backend contract + +* [ ] Add `@node-rs/xxhash` as optional dependency (or hard dep if you accept platform constraints) +* [ ] Create `src/shared/hash/xxhash-backend.js`: + + * [ ] `hash64(buffer|string) -> hex16` (exact output format must match existing `checksumString()` + `checksumFile()`) + * [ ] `hash64Stream(readable) -> hex16` (if supported; otherwise implement chunking in JS) +* [ ] Update `src/shared/hash.js`: + + * [ ] Keep `sha1()` unchanged + * [ ] Route `checksumString()` / `checksumFile()` through the backend contract + * [ ] Preserve deterministic formatting (`formatXxhashHex`) + +### 17.2 Introduce selector + telemetry + +* [ ] Add `PAIROFCLEATS_XXHASH_BACKEND=auto|native|wasm` +* [ ] Emit backend choice in verbose logs (once) + +### 17.3 Tests + +* [ ] Add `tests/xxhash-backends.js`: + + * [ ] Assert `checksumString('abc')` matches a known baseline (record from current 
implementation) + * [ ] Assert `checksumFile()` matches `checksumString()` on same content (via temp file) + * [ ] If native backend is available, assert native and wasm match exactly + * [ ] If native is missing, ensure test still passes (skips “native parity” block) +* [ ] Add script-coverage action(s) + +**Exit criteria** + +* [ ] No change to bundle identity semantics (incremental cache stability) +* [ ] `checksumFile()` remains bounded-memory for large files (streaming or chunked reads) + +--- + + +## Phase 18 — Artifact compression upgrade: add Zstandard (`zstd`) alongside gzip + +### 18.1 Add compression dependency + +* [ ] Add `@mongodb-js/zstd` (recommended as optional dependency due to native bindings) +* [ ] Decide “streaming vs buffer-only” support: + + * [ ] If streaming is supported: implement streaming JSONL writers/readers + * [ ] If buffer-only: restrict zstd to JSON object/array artifacts, keep JSONL as gzip (document clearly) + +### 18.2 Introduce compression abstraction (avoid sprinkling `if (mode===...)` everywhere) + +* [ ] Add `src/shared/compression.js`: + + * [ ] `compressBuffer(mode, buffer, level?)` + * [ ] `decompressBuffer(mode, buffer)` + * [ ] Optional stream helpers if supported +* [ ] Update `src/index/build/artifacts/compression.js`: + + * [ ] Expand `mode` validation: `gzip|zstd|none` + * [ ] Keep current defaults unchanged (`gzip` or `null` based on existing config) +* [ ] Update `src/index/build/artifacts.js`: + + * [ ] Replace hard-coded `.json.gz` with extension derived from compression mode + + * [ ] gzip: `.json.gz` + * [ ] zstd: `.json.zst` (or `.json.zstd`; pick one and standardize) + * [ ] Ensure `compressionKeepRaw` behavior remains correct + +### 18.3 Update readers/writers for new extensions + +* [ ] Update `src/shared/artifact-io.js`: + + * [ ] Extend `resolveArtifactPath()` to check: + + * [ ] `.json` then `.json.gz` then `.json.zst` + * [ ] Also handle `.bak` variants for each + * [ ] Extend `readJsonFile()` to decode zstd when applicable +* [ ] Update `src/shared/json-stream.js`: + + * [ ] Add zstd path for `writeJsonArrayFile()` / `writeJsonObjectFile()` when compression is requested + * [ ] If JSONL is to support zstd: update `writeJsonLinesFile()` and `readJsonLinesArraySync()` + +### 18.4 Update artifact contract + metrics + +* [ ] Update `docs/artifact-contract.md`: + + * [ ] New allowed compression modes + * [ ] New filename extensions + * [ ] Backward compatibility statement (gzip still readable) +* [ ] Update `src/index/build/artifacts/metrics.js` to report `compression.mode=zstd` +* [ ] Update `docs/config-schema.json` to restrict/describe valid modes + +### 18.5 Tests + +* [ ] Add `tests/artifact-zstd-readwrite.js`: + + * [ ] Write a compressed artifact (zstd) using production writer + * [ ] Read it with `readJsonFile()` and assert payload matches +* [ ] Extend `tests/artifact-bak-recovery.js` with a zstd variant: + + * [ ] `.json.zst` + `.bak` fallback behavior +* [ ] Add script-coverage action(s) + +**Exit criteria** + +* [ ] `loadIndex()` can transparently read `.json`, `.json.gz`, and `.json.zst` artifacts +* [ ] Existing gzip artifacts remain fully compatible +* [ ] Failure-mode behavior (`.bak` recovery) remains correct for new extensions + +--- + + +## Phase 19 — Massive functionality boost: PDF + DOCX ingestion (prose mode) + +### 19.1 Add document extraction dependencies + +* [ ] Add `pdfjs-dist` (PDF text extraction) +* [ ] Add `mammoth` (DOCX → text/HTML extraction) + +### 19.2 Introduce “extractor” layer in indexing 
pipeline + +* [ ] Create `src/index/build/extractors/`: + + * [ ] `text.js` (wrap existing `readTextFileWithHash` path) + * [ ] `pdf.js` (buffer → extracted text; include page separators if possible) + * [ ] `docx.js` (buffer → extracted text; preserve headings if possible) + * [ ] `index.js` (select extractor by extension + config) +* [ ] Add a new constant set in `src/index/constants.js`: + + * [ ] `EXTS_EXTRACTABLE_BINARY = new Set(['.pdf', '.docx'])` +* [ ] Add `.pdf` and `.docx` to `EXTS_PROSE` **only if** extraction is enabled (or add them unconditionally but ensure they don’t get skipped) + +### 19.3 Fix binary-skip logic to allow extractable docs + +You must handle both “pre-read” scanning and “post-read” binary checks: + +* [ ] Update `src/index/build/file-scan.js` / `createFileScanner()`: + + * [ ] If `ext` ∈ `EXTS_EXTRACTABLE_BINARY` and extraction enabled: + + * [ ] Do **not** mark as `{ reason: 'binary' }` + * [ ] Still allow minified checks to run when relevant (likely irrelevant for pdf/docx) +* [ ] Update `src/index/build/file-processor/skip.js`: + + * [ ] If `ext` extractable and extraction enabled, do not return `binarySkip` +* [ ] Update `src/index/build/file-processor.js`: + + * [ ] Branch early on `ext`: + + * [ ] For `.pdf`/`.docx`: read buffer → extractor → `text` + * [ ] For all else: existing text decoding path + * [ ] Ensure `hash` still derives from raw bytes (current `sha1(buffer)` behavior is good) + * [ ] Ensure `stats.bytes` is still the raw size for guardrails + +### 19.4 Chunking strategy for extracted docs + +* [ ] Decide on an initial, deterministic chunking approach: + + * [ ] Minimal viable: treat extracted output as prose and let default prose chunking apply + * [ ] Better: add dedicated chunkers: + + * [ ] Add `src/index/chunking/prose/pdf.js` to split by page markers + * [ ] Add `src/index/chunking/prose/docx.js` to split by headings / paragraph blocks +* [ ] Update `src/index/chunking/dispatch.js`: + + * [ ] Map `.pdf` and `.docx` to their chunkers (or prose fallback) + +### 19.5 Search + metadata integration + +* [ ] Ensure extracted docs appear in: + + * [ ] `file_meta.json` (file path + ext) + * [ ] `chunk_meta.*` (chunks with correct file associations) +* [ ] Consider adding a metadata flag for UI filters: + + * [ ] `fileMeta[i].isExtractedDoc = true` (or reuse existing `externalDocs` pattern if appropriate) +* [ ] Verify retrieval filters treat these files correctly (extension/path filters) + +### 19.6 Tests (must include “end-to-end search finds doc content”) + +* [ ] Add fixture files under `tests/fixtures/docs/`: + + * [ ] `sample.pdf` with a known unique phrase + * [ ] `sample.docx` with a known unique phrase +* [ ] Add `tests/pdf-docx-extraction.js`: + + * [ ] Unit-level extraction returns expected text +* [ ] Add `tests/pdf-docx-index-search.js`: + + * [ ] Build prose index for a temp repo that includes the docs + * [ ] Run `search.js --mode prose` and assert the phrases match chunks +* [ ] Add script-coverage action(s) + +**Exit criteria** + +* [ ] PDF/DOCX are no longer silently dropped as “binary” (when enabled) +* [ ] Prose search can retrieve content from these formats reliably +* [ ] No regression to binary detection for non-extractable files + +--- + + +## Phase 20 — MCP server: migrate from custom JSON-RPC plumbing to official MCP SDK (reduce maintenance) + +### 20.1 Add MCP SDK and plan transport layering + +* [ ] Add `@modelcontextprotocol/sdk` dependency +* [ ] Decide migration strategy: + + * [ ] **Option A (recommended):** 
keep `tools/mcp-server.js` as the entrypoint, but implement server via SDK and keep legacy behind a flag + * [ ] Option B: replace legacy entirely (higher risk) + +### 20.2 Implement SDK-based server + +* [ ] Add `src/integrations/mcp/sdk-server.js` (or similar): + + * [ ] Register tools from `src/integrations/mcp/defs.js` + * [ ] Dispatch calls to existing handlers in `tools/mcp/tools.js` (or migrate handlers into `src/` cleanly) + * [ ] Preserve progress notifications semantics expected by `tests/mcp-server.js`: + + * [ ] `notifications/progress` + * [ ] Include `{ tool: 'build_index', phase, message }` fields (match current tests) +* [ ] Update `tools/mcp-server.js`: + + * [ ] If `mcp.transport=legacy` or env forces legacy → use current transport + * [ ] Else → use SDK transport + +### 20.3 Remove or isolate legacy transport surface area + +* [ ] Keep `tools/mcp/transport.js` for now, but: + + * [ ] Move to `tools/mcp/legacy/transport.js` + * [ ] Update imports accordingly + * [ ] Reduce churn risk while you validate parity + +### 20.4 Tests + +* [ ] Ensure these existing tests continue to pass without rewriting expectations unless protocol mandates it: + + * [ ] `tests/mcp-server.js` + * [ ] `tests/mcp-robustness.js` + * [ ] `tests/mcp-schema.js` +* [ ] Add `tests/mcp-transport-selector.js`: + + * [ ] Force `PAIROFCLEATS_MCP_TRANSPORT=legacy` and assert legacy path still works + * [ ] Force `...=sdk` and assert SDK path works +* [ ] Add script-coverage action(s) + +**Exit criteria** + +* [ ] MCP server behavior is unchanged from the client perspective (tool list, outputs, progress events) +* [ ] Maintenance burden reduced: eliminate custom framing/parsing where SDK provides it + +--- + + +## Phase 21 — Tantivy sparse backend (optional, high impact on large repos) + +> This phase is intentionally split into “abstraction first” and “backend integration” to keep risk controlled. 
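+
+For orientation before the checklists below, the provider seam might reduce to something like this (the `search({ query, topN, filters, mode })` contract comes from 21.1; module paths and the selection logic are assumptions):
+
+```js
+// Sketch of sparse-backend selection; the provider modules are hypothetical.
+import { createRequire } from 'node:module';
+const require = createRequire(import.meta.url);
+
+const PROVIDERS = {
+  'sqlite-fts': () => require('./providers/sqlite-fts.js'),
+  'js-bm25': () => require('./providers/js-bm25.js'),
+  tantivy: () => require('./providers/tantivy.js'), // optional, may be absent
+};
+
+export function createSparseProvider(name = 'auto') {
+  // "auto" keeps today's defaults; tantivy stays opt-in via config/--backend.
+  const order = name === 'auto' ? ['sqlite-fts', 'js-bm25'] : [name];
+  for (const key of order) {
+    try {
+      return PROVIDERS[key](); // exports search({ query, topN, filters, mode })
+    } catch {
+      if (name !== 'auto') throw new Error(`sparse backend unavailable: ${key}`);
+    }
+  }
+  throw new Error('no sparse backend available');
+}
+
+// Determinism: every provider must tie-break stably, e.g. score desc, idx asc.
+export const stableOrder = (hits) =>
+  [...hits].sort((a, b) => b.score - a.score || a.idx - b.idx);
+```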
+ +### 21.1 Extract a sparse-retrieval interface + +* [ ] Create `src/retrieval/sparse/`: + + * [ ] `types.js` contract: `search({ query, topN, filters, mode }) -> hits[]` + * [ ] `providers/sqlite-fts.js` wrapper around existing SQLite FTS ranking + * [ ] `providers/js-bm25.js` wrapper around the in-memory BM25 path +* [ ] Update `src/retrieval/pipeline.js` to call the provider rather than direct sqlite/JS branching: + + * [ ] Keep behavior identical as baseline + * [ ] Preserve determinism (stable tie-breaking) + +### 21.2 Implement Tantivy integration (choose one operational model) + +* [ ] Choose packaging model: + + * [ ] **Sidecar model:** `tools/tantivy-server` (Rust) + Node client + * [ ] **Embedded binding:** Node N-API module +* [ ] Add `src/retrieval/sparse/providers/tantivy.js`: + + * [ ] Build query → execute → map results to `{ idx, score }` + * [ ] Support candidate-set filtering if feasible (or document it as a limitation and handle via post-filtering) +* [ ] Add `tools/build-tantivy-index.js`: + + * [ ] Consume existing artifacts (`chunk_meta`, token streams) and build tantivy index on disk + * [ ] Store alongside other indexes (e.g., under repo cache root) + * [ ] Consider incremental updates later; start with full rebuild + +### 21.3 Config + CLI integration + +* [ ] Add config: + + * [ ] `tantivy.enabled` + * [ ] `tantivy.path` (optional override) + * [ ] `tantivy.autoBuild` (optional) +* [ ] Extend backend policy logic (see `src/retrieval/cli/backend-context.js` and backend-policy tests): + + * [ ] Allow `--backend tantivy` (or `--sparse-backend tantivy`) + * [ ] Ensure `auto` fallback behavior remains predictable + +### 21.4 Tests (gated if tantivy isn’t always available in CI) + +* [ ] Add `tests/tantivy-smoke.js`: + + * [ ] Builds tantivy index for `tests/fixtures/sample` + * [ ] Executes a basic query and asserts hits are non-empty +* [ ] Gate it behind env: + + * [ ] `PAIROFCLEATS_TEST_TANTIVY=1` to run + * [ ] Otherwise test exits 0 with “skipped” message (match existing patterns in repo) +* [ ] Add script-coverage action(s) that run it only when env flag is set (or mark as skipped in coverage if you keep strictness) + +**Exit criteria** + +* [ ] Tantivy backend can be enabled without changing default behavior +* [ ] For large repos, sparse retrieval latency is materially improved (benchmarks added in Phase 15) + +--- + + +## Phase 22 — LanceDB vector backend (optional, high impact on ANN scaling) + +### 22.1 Extract a vector-ANN provider interface + +* [ ] Create `src/retrieval/ann/`: + + * [ ] `types.js`: `query({ embedding, topN, candidateSet, mode }) -> hits[]` + * [ ] `providers/sqlite-vec.js` wrapper around `rankVectorAnnSqlite` + * [ ] `providers/hnsw.js` wrapper around `rankHnswIndex` +* [ ] Update `src/retrieval/pipeline.js` to use the provider interface + +### 22.2 Implement LanceDB integration (choose operational model) + +* [ ] Choose packaging model: + + * [ ] Node library integration, **or** + * [ ] Sidecar service (Python) + HTTP +* [ ] Add `src/retrieval/ann/providers/lancedb.js`: + + * [ ] Query by vector and return `{ idx, sim }` + * [ ] Handle filtering: + + * [ ] If LanceDB supports “where id IN (…)” efficiently → push down + * [ ] Otherwise → post-filter and overfetch + +### 22.3 Build tooling for vector index creation + +* [ ] Add `tools/build-lancedb-index.js`: + + * [ ] Ingest `dense_vectors_*` artifacts + * [ ] Store LanceDB table in cache (mode-specific) + * [ ] Validate dims/model compatibility using existing `index_state.json` 
semantics + +### 22.4 Tests (gated) + +* [ ] Add `tests/lancedb-ann-smoke.js`: + + * [ ] Build embeddings (stub) → build lancedb table → run a nearest-neighbor query → assert stable result ordering +* [ ] Gate behind `PAIROFCLEATS_TEST_LANCEDB=1` +* [ ] Add script-coverage action(s) gated similarly + +**Exit criteria** + +* [ ] LanceDB ANN can be enabled without breaking sqlite/hnsw fallbacks +* [ ] Demonstrable memory and/or latency win for ANN retrieval at scale + +--- + + +## Phase 23 — Benchmarks, regression gates, and release hardening (prove the ROI) + +### 23.1 Extend microbench suite (`tools/bench/micro/`) + +* [ ] Add `tools/bench/micro/watch.js`: + + * [ ] Event storm simulation (if feasible) or synthetic scheduler load +* [ ] Add `tools/bench/micro/regex.js`: + + * [ ] Compare `re2js` vs `re2` on representative patterns/inputs +* [ ] Add `tools/bench/micro/hash.js`: + + * [ ] Compare wasm vs native checksum throughput +* [ ] Add `tools/bench/micro/compression.js`: + + * [ ] gzip vs zstd compress/decompress for representative artifact payload sizes +* [ ] Add `tools/bench/micro/extractors.js`: + + * [ ] PDF/DOCX extraction throughput and memory ceiling + +### 23.2 Add “no-regression” assertions where it matters + +* [ ] Add deterministic snapshot tests (lightweight, not full golden files): + + * [ ] Ensure chunk IDs stable across backends + * [ ] Ensure ordering stable under ties +* [ ] Add metrics validation: + + * [ ] `index-*.json` metrics reflect new compression/extractor options correctly + +### 23.3 Documentation + UX polish + +* [ ] Update `README.md`: + + * [ ] Mention PDF/DOCX support and how to enable/disable + * [ ] Mention optional performance backends and how `auto` works +* [ ] Update `docs/external-backends.md` for Tantivy/LanceDB reality (what’s implemented vs planned) +* [ ] Update `docs/mcp-server.md` for SDK migration + +**Exit criteria** + +* [ ] Benchmarks show measurable improvement (and are reproducible) +* [ ] CI remains green on Node 18 + Windows lane +* [ ] New features are discoverable via config docs + `config_status` + +--- + + +## Phase 24 — LibUV threadpool utilization (explicit control + docs + tests) + +**Objective:** Make libuv threadpool sizing an explicit, validated, and observable runtime control so PairOfCleats I/O concurrency scales predictably across platforms and workloads. + +### 24.1 Audit: identify libuv-threadpool-bound hot paths and mismatch points + +* [ ] Audit all high-volume async filesystem call sites (these ultimately depend on libuv threadpool behavior): + + * [ ] `src/index/build/file-processor.js` (notably `runIo(() => fs.stat(...))`, `runIo(() => fs.readFile(...))`) + * [ ] `src/index/build/file-scan.js` (`fs.open`, `handle.read`) + * [ ] `src/index/build/preprocess.js` (file sampling + `countLinesForEntries`) + * [ ] `src/shared/file-stats.js` (stream-based reads for line counting) +* [ ] Audit concurrency derivation points where PairOfCleats may exceed practical libuv parallelism: + + * [ ] `src/shared/threads.js` (`ioConcurrency = ioBase * 4`, cap 32/64) + * [ ] `src/index/build/runtime/workers.js` (`createRuntimeQueues` pending limits) +* [ ] Decide and record the intended precedence rules for threadpool sizing: + + * [ ] Whether PairOfCleats should **respect an already-set `UV_THREADPOOL_SIZE`** (recommended, matching existing `NODE_OPTIONS` behavior where flags aren’t overridden if already present). 
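+
+Why this audit matters, in one standalone demo (not part of the codebase): work that lands on the libuv threadpool serializes beyond the pool size, no matter how many calls are in flight. `pbkdf2` is used here only because it is a well-known threadpool-bound call; async `fs` operations behave the same way.
+
+```js
+// Run twice and compare wall time:
+//   UV_THREADPOOL_SIZE=4  node threadpool-demo.js
+//   UV_THREADPOOL_SIZE=16 node threadpool-demo.js
+// Wall time scales with ceil(TASKS / poolSize), not with TASKS alone.
+const crypto = require('node:crypto');
+
+const TASKS = 32;
+const start = process.hrtime.bigint();
+let pending = TASKS;
+for (let i = 0; i < TASKS; i++) {
+  crypto.pbkdf2('password', 'salt', 200000, 64, 'sha512', (err) => {
+    if (err) throw err;
+    if (--pending === 0) {
+      const ms = Number(process.hrtime.bigint() - start) / 1e6;
+      console.log(`completed ${TASKS} tasks in ${ms.toFixed(0)} ms`);
+    }
+  });
+}
+```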
+ +### 24.2 Add a first-class runtime setting + env override + +* [ ] Add config key (new): + + * [ ] `runtime.uvThreadpoolSize` (number; if unset/invalid => no override) +* [ ] Add env override (new): + + * [ ] `PAIROFCLEATS_UV_THREADPOOL_SIZE` (number; same parsing rules as other numeric env overrides) +* [ ] Implement parsing + precedence: + + * [ ] Update `src/shared/env.js` + + * [ ] Add `uvThreadpoolSize: parseNumber(env.PAIROFCLEATS_UV_THREADPOOL_SIZE)` + * [ ] Update `tools/dict-utils.js` + + * [ ] Extend `getRuntimeConfig(repoRoot, userConfig)` to resolve `uvThreadpoolSize` with precedence: + + * `userConfig.runtime.uvThreadpoolSize` → else `envConfig.uvThreadpoolSize` → else `null` + * [ ] Clamp/normalize: floor to integer; require `> 0`; else `null` + * [ ] Update the function’s return shape and JSDoc: + + * from `{ maxOldSpaceMb, nodeOptions }` + * to `{ maxOldSpaceMb, nodeOptions, uvThreadpoolSize }` + +### 24.3 Propagate `UV_THREADPOOL_SIZE` early enough (launcher + spawned scripts) + +* [ ] Update `bin/pairofcleats.js` (critical path) + + * [ ] In `runScript()`: + + * [ ] Resolve `runtimeConfig` as today. + * [ ] Build child env as an object (don’t pass `process.env` by reference when you need to conditionally add keys). + * [ ] If `runtimeConfig.uvThreadpoolSize` is set and `process.env.UV_THREADPOOL_SIZE` is not set, add: + + * [ ] `UV_THREADPOOL_SIZE = String(runtimeConfig.uvThreadpoolSize)` + * [ ] (Optional) If `--verbose` or `PAIROFCLEATS_VERBOSE`, log a one-liner showing the chosen `UV_THREADPOOL_SIZE` for the child process. +* [ ] Update other scripts that spawn Node subcommands and already apply runtime Node options, so they also carry the threadpool sizing consistently: + + * [ ] `tools/setup.js` (`buildRuntimeEnv()`) + * [ ] `tools/bootstrap.js` (`baseEnv`) + * [ ] `tools/ci-build-artifacts.js` (`baseEnv`) + * [ ] `tools/bench-language-repos.js` (repo child env) + * [ ] `tests/bench.js` (bench child env when spawning search/build steps) + * [ ] `tools/triage/context-pack.js`, `tools/triage/ingest.js` (where `resolveNodeOptions` is used) + * Implementation pattern: wherever you currently do `{ ...process.env, NODE_OPTIONS: resolvedNodeOptions }`, also conditionally set `UV_THREADPOOL_SIZE` from `runtimeConfig.uvThreadpoolSize` if not already present. + +> (Optional refactor, if you want to reduce repetition): add a helper in `tools/dict-utils.js` like `resolveRuntimeEnv(runtimeConfig, baseEnv)` and migrate the call sites above to use it. + +### 24.4 Observability: surface “configured vs effective” values + +* [ ] Update `tools/config-dump.js` + + * [ ] Include in `payload.derived.runtime`: + + * [ ] `uvThreadpoolSize` (configured value from `getRuntimeConfig`) + * [ ] `effectiveUvThreadpoolSize` (from `process.env.UV_THREADPOOL_SIZE` or null/undefined if absent) +* [ ] Add runtime warnings in indexing startup when mismatch is likely: + + * [ ] Update `src/index/build/runtime/workers.js` (in `resolveThreadLimitsConfig`, verbose mode is already supported) + + * [ ] Compute `effectiveUv = Number(process.env.UV_THREADPOOL_SIZE) || null` + * [ ] If `effectiveUv` is set and `ioConcurrency` is materially larger, emit a single warning suggesting alignment. + * [ ] If `effectiveUv` is not set, consider a *non-fatal* hint when `ioConcurrency` is high (e.g., `>= 16`) and `--verbose` is enabled. 
+* [ ] (Services) Emit one-time startup info in long-running modes: + + * [ ] `tools/api-server.js` + * [ ] `tools/indexer-service.js` + * [ ] `tools/mcp-server.js` + * Log: effective `UV_THREADPOOL_SIZE`, and whether it was set by PairOfCleats runtime config or inherited from the environment. + +### 24.5 Documentation updates + +* [ ] Update env overrides doc: + + * [ ] `docs/env-overrides.md` + + * [ ] Add `PAIROFCLEATS_UV_THREADPOOL_SIZE` + * [ ] Explicitly note: libuv threadpool size must be set **before the Node process starts**; PairOfCleats applies it by setting `UV_THREADPOOL_SIZE` in spawned child processes (via `bin/pairofcleats.js` and other tool launchers). +* [ ] Update config docs: + + * [ ] `docs/config-schema.json` add `runtime.uvThreadpoolSize` + * [ ] `docs/config-inventory.md` add `runtime.uvThreadpoolSize (number)` + * [ ] `docs/config-inventory.json` add entry for `runtime.uvThreadpoolSize` +* [ ] Update setup documentation: + + * [ ] `docs/setup.md` add a short “Performance tuning” note: + + * [ ] When indexing large repos or using higher `--threads`, consider setting `runtime.uvThreadpoolSize` (or `PAIROFCLEATS_UV_THREADPOOL_SIZE`) to avoid libuv threadpool becoming the limiting factor. +* [ ] (Optional) Add a benchmark note: + + * [ ] `docs/benchmarks.md` mention that benchmarking runs should control `UV_THREADPOOL_SIZE` for reproducibility. + +### 24.6 Tests: schema validation + env propagation + +* [ ] Update config validation tests: + + * [ ] `tests/config-validate.js` ensure `runtime.uvThreadpoolSize` is accepted by schema validation. +* [ ] Add a focused propagation test: + + * [ ] New: `tests/uv-threadpool-env.js` + + * [ ] Create a temp repo dir with a `.pairofcleats.json` that sets `runtime.uvThreadpoolSize`. + * [ ] Run: `node bin/pairofcleats.js config dump --json --repo ` + * [ ] Assert: + + * `payload.derived.runtime.uvThreadpoolSize` matches the config + * `payload.derived.runtime.effectiveUvThreadpoolSize` matches the propagated env (or check `process.env.UV_THREADPOOL_SIZE` if you expose it directly in the dump) +* [ ] Add a non-override semantics test (if that’s the decided rule): + + * [ ] New: `tests/uv-threadpool-no-override.js` + + * [ ] Set parent env `UV_THREADPOOL_SIZE=…` + * [ ] Also set config `runtime.uvThreadpoolSize` to a different value + * [ ] Assert child sees the parent value (i.e., wrapper respects existing env) + +**Exit criteria** + +* [ ] `runtime.uvThreadpoolSize` is in schema + inventory and validated by `tools/validate-config.js`. +* [ ] `pairofcleats …` launches propagate `UV_THREADPOOL_SIZE` to child processes when configured. +* [ ] Users can confirm configured/effective behavior via `pairofcleats config dump --json`. +* [ ] Docs clearly explain when and how the setting applies. + +--- + + +## Phase 25 — Threadpool-aware I/O scheduling guardrails + +**Objective:** Reduce misconfiguration risk by aligning PairOfCleats internal I/O scheduling with the effective libuv threadpool size and preventing runaway pending I/O buildup. + +### 25.1 Add a “threadpool-aware” cap option for I/O queue sizing + +* [ ] Add config (optional, but recommended if you want safer defaults): + + * [ ] `indexing.ioConcurrencyCap` (number) **or** `runtime.ioConcurrencyCap` (number) + * Choose the namespace based on your ownership map (`docs/config-inventory-notes.md` suggests runtime is `tools/dict-utils.js`, indexing is build runtime). 
+* [ ] Implement in: + + * [ ] `src/shared/threads.js` (preferred, because it’s the canonical concurrency resolver) + + * [ ] After computing `ioConcurrency`, apply: + + * `ioConcurrency = min(ioConcurrency, ioConcurrencyCap)` when configured + * (Optional) `ioConcurrency = min(ioConcurrency, effectiveUvThreadpoolSize)` when a new boolean is enabled, e.g. `runtime.threadpoolAwareIo === true` + * [ ] `src/index/build/runtime/workers.js` + + * [ ] Adjust `maxIoPending` to scale from the *final* `ioConcurrency`, not the pre-cap value. + +### 25.2 Split “filesystem I/O” from “process I/O” (optional, higher impact) + +If profiling shows git/tool subprocess work is being unnecessarily throttled by a threadpool-aware cap: + +* [ ] Update `src/shared/concurrency.js` to support two queues: + + * [ ] `fs` queue (bounded by threadpool sizing) + * [ ] `proc` queue (bounded separately) +* [ ] Update call sites: + + * [ ] `src/index/build/file-processor.js` + + * [ ] Use `fsQueue` for `fs.stat`, `fs.readFile`, `fs.open` + * [ ] Use `procQueue` for `getGitMetaForFile` (and any other spawn-heavy steps) + * [ ] `src/index/build/runtime/workers.js` and `src/index/build/indexer/steps/process-files.js` + + * [ ] Wire new queues into runtime and shard runtime creation. + +### 25.3 Tests + benchmarks + +* [ ] Add tests that validate: + + * [ ] Caps are applied deterministically + * [ ] Pending limits remain bounded + * [ ] No deadlocks when both queues exist +* [ ] Update or add a micro-benchmark to show: + + * [ ] Throughput difference when `UV_THREADPOOL_SIZE` and internal `ioConcurrency` are aligned vs misaligned. + +**Exit criteria** + +* [ ] Internal I/O concurrency cannot silently exceed intended caps. +* [ ] No regression in incremental/watch mode stability. +* [ ] Benchmarks show either improved throughput or reduced memory/queue pressure (ideally both). + +--- + + +## Phase 26 — (Conditional) Native LibUV work: only if profiling proves a real gap + +**Objective:** Only pursue *direct* libuv usage (via a native addon) if profiling demonstrates a material bottleneck that cannot be addressed through configuration and queue hygiene. + +### 26.1 Profiling gate and decision record + +* [ ] Add a short profiling harness / guidance doc: + + * [ ] `docs/perf-profiling.md` (new) describing how to profile indexing (CPU + I/O wait) and what thresholds justify native work. +* [ ] Establish decision criteria (example): + + * [ ] If ≥20–30% wall time is spent in JS-level file scanning/reading overhead beyond disk throughput limits, consider native. + * [ ] Otherwise, stay in JS + threadpool tuning. + +### 26.2 Prototype native module (N-API) using libuv for a specific hot path + +* [ ] Only target one narrow, measurable function (examples): + + * [ ] Fast “sample read + binary/minified detection” replacing parts of `src/index/build/file-scan.js` + * [ ] Batched `stat + read` pipeline for small files +* [ ] Provide a clean fallback path to existing JS implementation. +* [ ] Add CI coverage for: + + * [ ] Linux/macOS/Windows builds (or prebuilds) + * [ ] ABI compatibility across supported Node versions + +### 26.3 Packaging and docs + +* [ ] Update: + + * [ ] `package.json` optionalDependencies/build tooling (node-gyp/prebuildify/etc.) + * [ ] `docs/setup.md` to explain native build requirements/fallback behavior + +**Exit criteria** + +* [ ] Prototype demonstrates measurable improvement on representative repos. 
+* [ ] Install friction and cross-platform maintenance cost are explicitly accepted (or the work is abandoned).
+
+#### Bottom line
+
+* **Do not add libuv directly** to this Node codebase.
+* **Do add explicit support for libuv threadpool sizing** (via `UV_THREADPOOL_SIZE`): the current concurrency model (notably `ioConcurrency` up to 64) strongly suggests you will otherwise hit an invisible throughput ceiling.
+
+---
+
+
+## Phase 27 — File processing & artifact assembly (chunk payloads/writers/shards)
+
+**Reviewed snapshot:** `PairOfCleats-main` (zip import)
+**Scope driver:** `pairofcleats_review_section_3_files_and_checklist.md` (Section 3)
+**Review date:** 2026-01-12
+
+### Severity / priority scale
+
+- **P0** — correctness, broken reads, data loss/corruption, or contract violations that can invalidate an index
+- **P1** — determinism/stability, significant performance regressions, security/CI risks, or high-maintenance debt
+- **P2** — cleanup, minor performance wins, refactors, and documentation improvements
+
+---
+
+## Executive summary
+
+### P0 (must address)
+
+- **Chunk-meta sharding cleanup bug can cause the loader to read stale shard data** when a build switches from sharded chunk-meta to non-sharded JSONL: `loadChunkMeta()` prefers `chunk_meta.meta.json` / `chunk_meta.parts` over `chunk_meta.jsonl`, and the current cleanup logic does not remove the sharded artifacts on the “jsonl, not sharded” path.
+  - Impact: **incorrect chunks, incorrect file mapping, confusing debug output, and potentially broken search** for any repo where a previous build produced `chunk_meta.meta.json` / `chunk_meta.parts`.
+  - Primary locus: `src/index/build/artifacts/writers/chunk-meta.js`.
+
+- **Fast import scanning likely mis-parses `es-module-lexer` records** by treating `entry.d` as a module specifier string. In `es-module-lexer`, `d` is not a specifier (it is typically a numeric “dynamic import” marker). This can yield non-string imports (numbers), downstream crashes in normalization, and/or incorrect `fileRelations.imports` / `externalDocs`.
+  - Primary locus: `src/index/build/imports.js`.
+
+- **Piece assembly can silently accept structurally invalid inputs** because `validateLengths()` treats an empty list as “valid” even when the expected length is non-zero. This can produce assembled indexes with mismatched arrays (e.g., `docLengths`, embedding vectors) without an early, actionable error.
+  - Primary locus: `src/index/build/piece-assembly.js`.
+
+- **Piece assembly appears to drop the `comment` field in field postings/docLengths** (field tokens include `comment`, but assembly only merges `name/signature/doc/body`). If `comment` is enabled in fielded search, this can corrupt or disable that feature in assembled outputs.
+  - Primary locus: `src/index/build/piece-assembly.js` (and, secondarily, `src/index/build/postings.js` conventions).
+
+### P1 (high-value next)
+
+- **Determinism risks** (import link ordering; vocab ordering derived from `Map` insertion order; shard batch sorting ties; repo-map ordering) can cause noisy diffs and unstable IDs across builds even when inputs are unchanged.
+- **Artifact manifest robustness**: `pieces/manifest.json` generation can silently record `null` checksums/bytes on error; this weakens contract guarantees and can hide partial artifact failures.
+- **CI metadata hygiene**: `tools/ci-build-artifacts.js` records remote URLs; sanitize them to avoid leaking credentials in CI logs/artifacts.
+ +### P2 (cleanup / maintainability) + +- Documentation drift (notably the claim that compressed payloads embed a `compression` field) and contract documentation gaps (assembled stage semantics, meta schema examples) should be corrected. +- Several low-risk performance wins are available (avoid `split('\n')` in hot paths; reduce repeated per-chunk work; minimize transient array concat). + +--- + +## 27.1 Per-file processing correctness (Checklist A) + +**Audit** + +Reviewed the per-file pipeline as implemented in: + +- `src/index/build/file-processor.js` +- `src/index/build/file-processor/*` (assemble/cached-bundle/chunk/incremental/meta/read/relations/skip/timings) +- Supporting callsites and artifacts emitted downstream: `src/index/build/artifacts.js`, `src/index/build/artifacts/file-meta.js`, and chunk-meta serialization (`src/index/build/artifacts/writers/chunk-meta.js`) +- Relevant tests in scope: `tests/file-processor/skip.test.js`, `tests/file-processor/cached-bundle.test.js` + +Key pipeline stages observed: + +1. Resolve file identity (`abs`, `relKey`) and caps → early skip checks +2. Load cached bundle (incremental) when enabled +3. Read + decode file; hash +4. Language context (registry), segment discovery, chunking +5. Comments extraction (optional) → comment-to-chunk assignment +6. Relations, docmeta, flow/meta enrichment (code mode) +7. Tokenization (main thread or worker), minhash, phrase/chargram sources +8. Embeddings attach (optional) +9. Assemble final chunk payloads + per-file relations → persist incremental bundle + +**Gaps / issues** + +#### Offsets: define and test offset units (byte vs. UTF-16 index) + +- `start` / `end` offsets are produced and consumed as **JavaScript string indices** (UTF‑16 code units) throughout the file pipeline (`text.slice(c.start, c.end)` etc.). +- The checklist explicitly calls out **byte offsets**. Current docs/contracts do not define the unit for `start`/`end`, which leaves room for misinterpretation and subtle bugs for non‑ASCII content. + +**Why it matters** +- If any consumer assumes byte offsets (e.g., a non-JS reader, a tool that indexes into raw file bytes), chunks will be mis-sliced for multi-byte UTF‑8 sequences. + +**Where to address** +- Primary: `src/index/build/file-processor.js` and `src/index/build/artifacts/writers/chunk-meta.js` (and docs under `docs/`). + +#### Chunk boundary invariants are not asserted at the file-processor boundary + +- `file-processor.js` assumes `chunkSegments()` returns non-overlapping, in-range chunks. It does not assert invariants such as: + - `0 <= start <= end <= text.length` + - monotonically increasing chunk ranges (or “overlap only when configured”) + - “no accidental overlap” beyond configured overlap window +- This makes debugging chunking regressions harder: errors will surface downstream (postings build, artifact read) rather than at the boundary. + +#### Skip reasons: observable coverage is incomplete + +Covered / explicit: +- `oversize` (max bytes / max lines), `minified`, `binary`, `read-failure` (and `unreadable` via scan results) + +Missing or ambiguous: +- **unsupported language** (no explicit skip reason visible in `file-processor.js` / `skip.js`) +- **parse / relation extraction failures**: most errors will currently throw and likely fail the build rather than record a per-file skip reason (no “parse-error” skip). + +#### Provenance: per-file outputs are missing stable “content identity” fields + +- Chunk payloads contain `file` (rel path), `ext`, and `lang`, which is good. 
+- `file_meta.json` contains `id`, `file`, `ext`, git metadata, etc. +- **Neither chunk meta nor file meta currently records a stable file content hash** (even though the pipeline already computes `fileHash` for incremental caching). + +This makes post-hoc debugging harder: +- You cannot quickly tell whether a chunk came from a particular file revision without recomputing hashes from source. + +#### Minor correctness nits + +- Comment assignment edge: comments starting exactly at `chunk.end` can be assigned to the previous chunk due to a strict `<` comparison in `assignCommentsToChunks()` (`src/index/build/file-processor/chunk.js`). +- Timing accounting: `addParseDuration()` is invoked multiple times per file (parseStart and relationStart paths), which risks double-counting in aggregated metrics. + +**Remaining work** + +- [ ] **Document offset units** for `start`/`end` (recommendation: define as UTF‑16 code-unit offsets, because that is what JS uses), and add at least one non‑ASCII regression test that validates: + - [ ] `text.slice(start, end)` reproduces the chunk text + - [ ] `offsetToLine()` aligns with `startLine/endLine` for multi-byte characters + (Files: `src/index/build/file-processor.js`, `docs/artifact-contract.md`, `docs/contracts/indexing.md`, plus a new/extended test) + +- [ ] Add **boundary asserts** (behind a dev/test flag if needed) after chunking: + - [ ] in-range checks (`0..text.length`) + - [ ] monotonic chunk ordering + - [ ] overlap detection (only allow configured overlap) + (File: `src/index/build/file-processor.js`) + +- [ ] Make **unsupported-language** behavior explicit and test-covered: + - [ ] decide: skip with reason `unsupported-language` vs. treat as `unknown` with generic chunking + - [ ] add test coverage for the chosen behavior + (Files: `src/index/build/file-processor.js`, `src/index/build/file-processor/skip.js`, tests under `tests/file-processor/`) + +- [ ] Add **parse-error** (and relation-error) per-file skip handling: + - [ ] catch and record failures from `lang.chunk`, `lang.buildRelations`, `lang.extractDocMeta`, `flow()`, etc. 
+ - [ ] ensure the build can proceed when a single file fails (configurable) + (File: `src/index/build/file-processor.js`) + +- [ ] Add **file-level content hash** to `file_meta.json` (and optionally, to each chunk’s `metaV2`): + - [ ] store `hash` and `hashAlgo` + - [ ] ensure incremental and non-incremental builds agree + (Files: `src/index/build/file-processor.js`, `src/index/build/artifacts/file-meta.js`, `docs/artifact-contract.md`) + +- [ ] Fix the comment boundary condition in `assignCommentsToChunks()`: + - [ ] consider `<=` for boundary tests, or implement overlap-based assignment using comment `(start,end)` + (File: `src/index/build/file-processor/chunk.js`) + +- [ ] Audit and correct **timing double-counting** in `createTimingsTracker()` usage: + - [ ] ensure parseMs reflects one pass, and relation/flow have separate counters if desired + (Files: `src/index/build/file-processor.js`, `src/index/build/file-processor/timings.js`) + +--- + +## 27.2 Artifact contract correctness (Checklist B) + +**Audit** + +Reviewed artifact write orchestration and contract touchpoints: + +- Orchestration: `src/index/build/artifacts.js` +- Contract-level helpers: `src/index/build/artifacts/checksums.js`, `src/index/build/artifacts/compression.js` +- Writers: `src/index/build/artifacts/writers/chunk-meta.js`, `.../file-relations.js`, `.../repo-map.js` +- Schema docs: `docs/artifact-contract.md`, `docs/contracts/indexing.md` +- Guardrail tests: `tests/artifact-size-guardrails.js`, `tests/artifact-formats.js`, `tests/artifact-bak-recovery.js` + +Confirmed: +- JSON and JSONL writers use `atomic: true` (temp + rename + `.bak` semantics) via shared JSON stream helpers. +- `pieces/manifest.json` is generated and includes checksums for files that can be read at generation time. +- Readers are designed to be backward compatible with older shapes (e.g., token shard files and meta shapes in `tests/artifact-formats.js`). + +**Gaps / issues** + +#### P0: Chunk-meta sharding cleanup is incomplete (stale shards override new JSONL) + +- In `enqueueChunkMetaArtifacts()` (`src/index/build/artifacts/writers/chunk-meta.js`): + - When `chunkMetaUseJsonl === true` and `chunkMetaUseShards === false`, the writer removes `chunk_meta.json` and `chunk_meta.json.gz`, but **does not remove**: + - `chunk_meta.meta.json` + - `chunk_meta.parts/` +- `loadChunkMeta()` prefers meta/parts if they exist, even if `chunk_meta.jsonl` exists. Therefore, stale shards can override a newly-written JSONL file. + +#### Sharded directory atomicity remains “best effort” only + +- Token postings shards: `artifacts.js` deletes and recreates `token_postings.shards/` and writes part files atomically, but the directory as a whole can still be left in a partial state if the process crashes mid-write (no staging directory + atomic rename). +- Chunk meta shards: similar; additionally, the parts directory is not cleared before writing, which can leave orphan part files. + +This is not always fatal if readers rely solely on `meta.parts`, but it violates the “no partially-written states” intent of the checklist. + +#### Manifest robustness: checksum/stat errors are swallowed + +- `writePiecesManifest()` catches errors from `fs.stat` and `checksumFile` and records `bytes: null` / `checksum: null`, without failing the build or preserving error details. +- That makes it easy to produce an apparently “valid” manifest that cannot be validated later. 
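+
+A sketch of the stricter shape proposed in the remaining-work list below (helper names assumed; the real code path lives in `src/index/build/artifacts/checksums.js`): required pieces fail loudly, optional pieces carry an explicit `error` field instead of silent nulls.
+
+```js
+// Sketch, not the current implementation. readFile keeps it short; a real
+// version would stream large artifacts through the hash.
+const fs = require('node:fs/promises');
+const crypto = require('node:crypto');
+
+async function checksumFile(path) {
+  const buf = await fs.readFile(path);
+  return crypto.createHash('sha256').update(buf).digest('hex');
+}
+
+async function manifestEntry(path, { required = false } = {}) {
+  try {
+    const stat = await fs.stat(path);
+    return { path, bytes: stat.size, checksum: await checksumFile(path) };
+  } catch (err) {
+    if (required) {
+      throw new Error(`manifest: required piece unreadable: ${path}: ${err.message}`);
+    }
+    // Validators should treat an entry with `error` as failed,
+    // never as "unknown but probably fine".
+    return { path, bytes: null, checksum: null, error: String((err && err.code) || err) };
+  }
+}
+```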
+ +#### Documentation drift: compression description is inaccurate + +- `docs/artifact-contract.md` claims the JSON payload contains a `compression` field when `.json.gz` is written. Current writers compress the raw JSON stream; they do not inject a `compression` field into the JSON object. + +#### Contract clarity gaps + +- The docs do not clearly document: + - precedence rules when multiple formats are present (meta/parts vs jsonl vs json) + - the on-disk schema for `token_postings.meta.json` and `chunk_meta.meta.json` (fields vs arrays vs legacy) + - whether `.json.gz` is a sidecar (both present) or a replacement (only gz present) + +**Remaining work** + +- [ ] **Fix chunk-meta cleanup** when `chunkMetaUseJsonl && !chunkMetaUseShards`: + - [ ] remove `chunk_meta.meta.json` if present + - [ ] remove `chunk_meta.parts/` if present + (File: `src/index/build/artifacts/writers/chunk-meta.js`) + +- [ ] Ensure shard writes do not accumulate orphan files: + - [ ] delete `chunk_meta.parts/` before writing new sharded parts (or write to staging dir + rename) + - [ ] confirm `token_postings.shards/` cleanup is complete on all branches + (Files: `src/index/build/artifacts/writers/chunk-meta.js`, `src/index/build/artifacts.js`) + +- [ ] Implement **directory-level atomicity** for sharded artifacts: + - [ ] write shards to `*.tmp/` directory + - [ ] atomically swap into place via rename (and optionally keep a directory-level `.bak`) + (Files: `src/index/build/artifacts/writers/chunk-meta.js`, `src/index/build/artifacts.js`) + +- [ ] Make manifest generation strict for required artifacts: + - [ ] either (a) fail the build on checksum/stat failure, or (b) record an `error` field and ensure validation tooling treats it as failure + (File: `src/index/build/artifacts/checksums.js`) + +- [ ] Update docs to match implementation: + - [ ] remove/adjust claim about `compression` field + - [ ] add schema examples for meta files (fields/arrays/legacy) + - [ ] document precedence rules for readers + (Files: `docs/artifact-contract.md`, `docs/contracts/indexing.md`) + +- [ ] Add a regression test that explicitly covers the stale chunk-meta shard override: + - [ ] build A: sharded chunk meta written + - [ ] build B: non-sharded jsonl written, ensure shards removed or ignored + - [ ] loader reads build B’s jsonl, not build A’s shards + (New test; or extend `tests/artifact-formats.js` / `tests/artifact-size-guardrails.js`) + +--- + +## 27.3 Sharding / pieces / postings (Checklist C) + +**Audit** + +Reviewed: + +- Shard planning: `src/index/build/shards.js` + tests (`tests/shard-plan.js`) +- Postings build: `src/index/build/postings.js` +- Tokenization primitives: `src/index/build/tokenization.js` + buffering tests (`tests/tokenization-buffering.js`) +- Piece assembly/merge: `src/index/build/piece-assembly.js` + test (`tests/piece-assembly.js`) +- Piece compaction tool: `tools/compact-pieces.js` + +**Gaps / issues** + +#### Determinism: import links and vocab ordering are under-specified + +- **Imports / importLinks**: + - `scanImports()` runs with concurrency and stores per-module Sets of importing files. The final arrays are not sorted. + - `buildImportLinksFromRelations()` builds `importLinks` lists that may include the current file and are not explicitly sorted/deduped. + - Result: output can vary based on processing order, which can vary with concurrency and scheduling. + +- **Vocab ordering**: + - `buildPostings()` converts multiple Maps to vocab arrays via `Array.from(map.keys())`. 
+ - This relies on Map insertion order being stable across builds. It often is, but it is not a strong contract and can be perturbed by changes in traversal order or parallelism. + - Risk: **token IDs may shift across builds** even when inputs are unchanged, creating noisy diffs and complicating caching. + +#### Postings canonicalization: sorted/canonical postings are assumed but not asserted + +- Many consumers assume postings are in docId order and token vocab order is stable. +- There is no explicit “canonicalize and validate” step before writing postings, and few tests assert canonical ordering. + +#### Piece assembly: field postings coverage mismatch + weak validation + +- **Field postings merge omits the `comment` field** (see P0 summary). +- **validateLengths()** can silently allow missing arrays when expected > 0 (see P0 summary). +- Vocab arrays in assembly are also derived from Map insertion order; if input order differs, assembled token IDs can differ. + +#### Shard planning: tie-break determinism should be explicit + +- Some sorts are deterministic (by label, by relPath), but shard batching uses weight-based partitioning without explicit tie-breakers when weights are equal. This is likely stable in current Node versions, but should be explicitly stable to avoid cross-version drift. + +**Remaining work** + +#### Shard planning + +- [ ] Add explicit tie-breakers in shard batching and balancing when weights are equal: + - [ ] include `label` or `id` in comparator + - [ ] document determinism guarantees + (File: `src/index/build/shards.js`) + +- [ ] Add a “very large repo” synthetic shard-plan test: + - [ ] verifies bounded memory and time + - [ ] verifies stable shard labels/IDs across runs + (New test; extend `tests/shard-plan.js`) + +#### Postings / tokenization + +- [ ] Canonicalize vocab ordering for stability: + - [ ] define canonical sort order (lexicographic; or localeCompare with explicit locale; or bytewise) + - [ ] apply consistently to token vocab, phrase vocab, chargram vocab, and field vocabs + (File: `src/index/build/postings.js` and any upstream postings-map builders) + +- [ ] Canonicalize and/or validate postings ordering: + - [ ] assert postings doc IDs are strictly increasing per token (or stable canonical order) + - [ ] assert vocab/postings arrays align and lengths match + (File: `src/index/build/postings.js`; plus tests) + +- [ ] Expand quantization tests to include: + - [ ] scale correctness + - [ ] dims mismatch handling + - [ ] doc/code embeddings “fallback to main embedding” behavior + (File: `tests/postings-quantize.js`) + +#### Piece assembly + +- [ ] Fix `validateLengths()` to fail when expected > 0 and list is empty or mismatched: + - [ ] treat `[]` as invalid when `expected > 0` + - [ ] include artifact name + input dir in error message for fast triage + (File: `src/index/build/piece-assembly.js`) + +- [ ] Merge **all field postings present in inputs**, including `comment` (and any future fields): + - [ ] do not hardcode `name/signature/doc/body` + - [ ] merge based on keys present in `field_postings.json` / `field_tokens.json` or config + (File: `src/index/build/piece-assembly.js`) + +- [ ] Determinize assembly: + - [ ] sort `inputs` deterministically by path (or require stable input ordering and document it) + - [ ] sort merged vocabs (or guarantee stable order via canonicalization) + - [ ] ensure assembled output is byte-for-byte stable for same inputs + (Files: `tools/assemble-pieces.js`, `src/index/build/piece-assembly.js`) + +- [ ] Add a 
regression test: **assembled output equals monolithic output** for the same fixture: + - [ ] build monolithic index + - [ ] build two partial indexes (or reuse shards) and assemble + - [ ] compare chunk_meta + token_postings + manifest semantics + (New test; extend `tests/piece-assembly.js`) + +- [ ] Verify manifests list all required parts: + - [ ] ensure meta files are included and checksummed + - [ ] ensure shard part counts match meta.parts and manifest counts match meta totals + (Files: `src/index/build/artifacts/checksums.js`, tests) + +--- + +## 27.4 Performance improvements to prioritize (Checklist D) + +**Audit** + +The current implementation is functional and reasonably structured, but several areas will become dominant costs on large repos: + +- Per-file pipeline does multiple passes over the same data (chunking, tokenization, docmeta, lint/complexity). +- Artifact writing constructs full in-memory arrays for potentially huge artifacts and then serializes them. +- Some hot paths allocate transient arrays aggressively. + +### High-impact improvements (prioritized) + +#### Avoid “build huge arrays then serialize” + +- `buildPostings()` currently materializes large `vocab` and `postings` arrays in memory. + - [ ] Add a streaming/sharded writer path that writes postings shards incrementally as postings are built (or at least allows releasing intermediate Maps earlier). +- `chunk_meta` estimation uses JSON.stringify samples, which is OK, but writing sharded JSONL still relies on iterators that materialize per-entry objects. + - [ ] Consider a “lightweight entry view” or direct JSONL streaming that avoids building large intermediate objects for fields not needed. + +#### Reduce repeated parsing/enrichment passes + +- Complexity + lint are computed in the per-chunk loop but cached per file; move the computation to a single per-file pre-pass to remove repeated cache checks. +- Where feasible, consider combining: + - chunking + tokenization (tokenize the chunk as soon as you slice it, but avoid repeated slice work) + - relations/docmeta extraction caching to avoid per-chunk repeated derived work + +#### Minimize transient allocations + +- Avoid `text.split('\n')` for context windows in `file-processor.js`. Use a line-scan utility that slices the relevant ranges without splitting the entire file. +- Replace repeated `array.concat()` in loops (e.g., `commentFieldTokens = commentFieldTokens.concat(tokens)`) with `push(...tokens)` or manual push for large arrays. +- In tokenization, buffer reuse is good, but `buildTokenSequence()` still clones arrays (`slice()`) each call. Confirm this is intentional and consider: + - pre-sizing output arrays when token counts are known/estimable + - returning typed arrays for `seq` where possible (if consumers permit) + +**Remaining work** + +- [ ] Replace `split('\n')` usage in `src/index/build/file-processor.js` with a targeted line-scan helper. +- [ ] Move complexity/lint computation outside the per-chunk loop in `file-processor.js`. +- [ ] Reduce transient array concatenations in comment token aggregation. +- [ ] Explore a streaming postings writer for very large repos (phase-level refactor). 
+- [ ] Add at least one micro-benchmark or perf regression test covering: + - piece assembly (`src/index/build/piece-assembly.js`) + - piece compaction (`tools/compact-pieces.js`) + +--- + +## 27.5 Refactoring goals (Checklist E) + +**Audit** + +Current state: +- Artifact writing is orchestrated from `artifacts.js` via `enqueueJsonObject/Array/Lines` + special-case writers (chunk meta writer). +- Schema definitions are implicit in “writer payload construction” and spread across multiple modules. +- Multiple identifiers exist (`chunk.id`, `metaV2.chunkId`, graph keys `file::name`), which increases the chance of accidental drift. + +**Remaining work** + +- [ ] Introduce a single “artifact writer” abstraction with a consistent interface: + - [ ] `write(name, payload | iterator, { format, sharded, compression, pieceType })` + - [ ] built-in cleanup rules and directory-level atomic swaps + - [ ] standard metadata (version, generatedAt, schemaVersion) + (Impacts: `src/index/build/artifacts.js`, `src/index/build/artifacts/writers/*`) + +- [ ] Separate schema definitions from I/O: + - [ ] define schemas for artifacts in a central module (even if only via JS object contracts + comments) + - [ ] ensure docs mirror those schema definitions + (Impacts: `docs/artifact-contract.md`, `docs/contracts/indexing.md`) + +- [ ] Create a single canonical chunk-id generator and use it everywhere: + - [ ] prefer `metaV2.chunkId` (content-based) for graphs/relations keys instead of ad-hoc `file::name` + - [ ] ensure assembled and non-assembled builds produce identical chunkIds + (Impacts: `src/index/build/graphs.js`, and any code producing chunk identifiers) + +--- + +## 27.6 Tests (Checklist F) + +**Audit** + +In-scope tests are generally helpful and cover: +- `.bak` recovery semantics (`tests/artifact-bak-recovery.js`) +- artifact precedence formats (`tests/artifact-formats.js`) +- size guardrails forcing sharding (`tests/artifact-size-guardrails.js`) +- shard planning (`tests/shard-plan.js`) +- shard vs non-shard equivalence (`tests/shard-merge.js`) +- quantization correctness (`tests/postings-quantize.js`) +- incremental tokenization caching (`tests/incremental-tokenization-cache.js`) + +However, multiple tests are still existence/shape-heavy and do not verify semantic meaning deeply, especially around assembled outputs and import scanning. + +**Gaps / issues** + +- `tests/file-processor/cached-bundle.test.js` uses shapes for `allImports` and `codeRelations.calls` that do not match the likely real shapes; it can pass while not meaningfully validating correctness. +- No tests cover: + - chunk-meta cleanup when switching formats (P0 issue) + - compressed sidecar `.json.gz` artifacts and their `.bak` semantics + - partial shard write behavior (meta missing, orphan parts, etc.) 
+ - import scanning correctness for dynamic imports / es-module-lexer record handling + - deterministic `importLinks` ordering + - perf regression for `compact-pieces` / `assembleIndexPieces` + +**Remaining work** + +- [ ] Strengthen artifact format tests to assert semantic meaning: + - [ ] verify loader precedence (meta/parts vs jsonl vs json) in more combinations + - [ ] verify meta.parts path normalization and correctness + +- [ ] Add regression tests for atomic write failures: + - [ ] simulate rename failures (via dependency injection or controlled FS behavior) + - [ ] assert `.bak` fallback and cleanup behavior + +- [ ] Add regression tests for partial shard writes: + - [ ] parts written, meta missing + - [ ] meta references missing parts + - [ ] stale orphan parts do not affect reads + +- [ ] Add stress fixtures for large token/postings sets: + - [ ] ensure bounded memory / time + - [ ] ensure canonical ordering remains correct under stress + +- [ ] Add at least one perf regression test: + - [ ] compaction: `tools/compact-pieces.js` + - [ ] assembly: `src/index/build/piece-assembly.js` + +- [ ] Fix `tests/file-processor/cached-bundle.test.js` to use realistic shapes: + - [ ] `allImports` should be `{ [moduleName: string]: string[] }` + - [ ] `codeRelations.calls/usages` should match the real structure used by `buildRelationGraphs()` / `buildCallIndex()` + (File: `tests/file-processor/cached-bundle.test.js`) + +--- + +## Appendix A: File-by-file findings + +This section enumerates each in-scope file and lists file-specific items to address (beyond cross-cutting tasks already listed above). + +### src/index/build/artifacts.js +- [ ] (P1) Consider directory-level atomic swap for `token_postings.shards/` (staging dir + rename). +- [ ] (P1) Normalize shard part paths to POSIX in any meta/manifest structures (avoid OS-separator leakage). +- [ ] (P2) Consider sorting `pieceEntries` by `path` before writing the manifest to reduce diff noise. + +### src/index/build/artifacts/checksums.js +- [ ] (P1) Do not silently accept checksum/stat failures for required pieces; fail or record errors explicitly. + +### src/index/build/artifacts/compression.js +- [ ] (P2) Update docs to clarify that gzip is a sidecar (`.json` and `.json.gz` both exist). +- [ ] (P2) Consider extending compression to sharded artifacts (optional future work). + +### src/index/build/artifacts/file-meta.js +- [ ] (P1) Make file ID assignment stable by sorting unique file paths before assigning IDs. +- [ ] (P1) Add file content hash (and algo) and file size to `file_meta.json`. +- [ ] (P2) Remove or rename `chunk_authors` in file meta (currently derived from the first chunk and not file-level). + +### src/index/build/artifacts/filter-index.js +- [ ] (P2) Consider persisting schema version/config hash in the filter index artifact for easier debugging. + +### src/index/build/artifacts/metrics.js +- [ ] (P2) Do not swallow metrics write errors silently (log or propagate based on severity). + +### src/index/build/artifacts/token-mode.js +- [ ] (P2) Make parsing more robust (case-insensitive modes; integer parsing + clamping). + +### src/index/build/artifacts/writers/chunk-meta.js +- [ ] (P0) Remove stale `chunk_meta.meta.json` and `chunk_meta.parts/` when writing non-sharded JSONL. +- [ ] (P1) Clear or stage-swap `chunk_meta.parts/` when writing sharded output. +- [ ] (P1) Normalize `meta.parts` entries to POSIX paths. +- [ ] (P2) Consider normalizing field naming conventions (`chunk_authors` vs `startLine/endLine`). 
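+
+The P0 cleanup item above, as a sketch (artifact names are from the findings; the helper and its wiring into the writer are assumptions):
+
+```js
+// Cleanup rule for the "jsonl, not sharded" branch: remove every artifact
+// that loadChunkMeta() would prefer over the freshly written JSONL.
+const fsp = require('node:fs/promises');
+const path = require('node:path');
+
+async function removeStaleChunkMeta(outDir) {
+  for (const name of ['chunk_meta.json', 'chunk_meta.json.gz', 'chunk_meta.meta.json']) {
+    await fsp.rm(path.join(outDir, name), { force: true });
+  }
+  await fsp.rm(path.join(outDir, 'chunk_meta.parts'), { recursive: true, force: true });
+}
+```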
+ +### src/index/build/artifacts/writers/file-relations.js +- [ ] (P2) Consider JSONL/sharding for very large `file_relations` outputs; add versioning metadata. + +### src/index/build/artifacts/writers/repo-map.js +- [ ] (P1) Ensure `exported` detection handles default exports correctly (depends on relations schema). +- [ ] (P2) Consider sorting output by `{file, name}` for stability. + +### src/index/build/file-processor.js +- [ ] (P1) Add explicit boundary asserts for chunks after chunking. +- [ ] (P1) Replace `split('\n')` with line-scan utility for context extraction. +- [ ] (P2) Move complexity/lint to per-file scope; avoid repeated per-chunk cache checks. +- [ ] (P2) Fix possible timing double-counting across parse/relation durations. +- [ ] (P1) Add explicit unsupported-language and parse-error skip reasons (configurable). + +### src/index/build/file-processor/assemble.js +- [ ] (P1) Ensure field token fields written here (including `comment`) are consistently supported by postings and piece assembly. + +### src/index/build/file-processor/cached-bundle.js +- [ ] (P2) Validate cached bundle shapes more strictly; ensure importLinks shape is consistent. + +### src/index/build/file-processor/chunk.js +- [ ] (P2) Adjust comment-to-chunk assignment at boundary (`chunk.end === comment.start`) and consider overlap-based assignment. + +### src/index/build/file-processor/incremental.js +- [ ] (P2) Ensure cache invalidation includes schema/version changes for any artifact-impacting changes. + +### src/index/build/file-processor/meta.js +- [ ] (P2) Deduplicate `externalDocs` outputs; consider ordering for determinism. + +### src/index/build/file-processor/read.js +- [ ] (P2) Consider UTF-8 safe truncation (avoid splitting multi-byte sequences mid-codepoint). + +### src/index/build/file-processor/relations.js +- [ ] (P2) Consider sorting/deduping relation arrays (imports/exports/usages) for determinism. + +### src/index/build/file-processor/skip.js +- [ ] (P1) Add explicit unsupported-language skip reason (or document that unknown languages are processed). +- [ ] (P2) Add coverage for `unreadable` and `read-failure` skip paths. + +### src/index/build/file-processor/timings.js +- [ ] (P2) Validate that parse/token/embed durations are not double-counted; document semantics. + +### src/index/build/graphs.js +- [ ] (P2) Prefer canonical `chunkId` keys where possible instead of `file::name` to avoid collisions. +- [ ] (P2) Sort serialized node lists for full determinism (neighbors are already sorted). + +### src/index/build/imports.js +- [ ] (P0) Fix `es-module-lexer` import record handling (`entry.d` is not a specifier string). +- [ ] (P1) Sort and dedupe `importLinks` deterministically; exclude self-links unless explicitly desired. +- [ ] (P1) Ensure concurrency does not affect output ordering (sort module keys and file arrays before serialization). + +### src/index/build/piece-assembly.js +- [ ] (P0) Make `validateLengths()` strict when `expected > 0`. +- [ ] (P0) Merge all field postings (including `comment`) and docLengths based on actual input keys. +- [ ] (P1) Canonicalize vocab ordering in assembled outputs. +- [ ] (P2) Remove redundant filterIndex construction (avoid double work; rely on writeIndexArtifacts). + +### src/index/build/postings.js +- [ ] (P1) Canonicalize vocab ordering (token/phrase/chargram/field) explicitly. +- [ ] (P2) Validate docLengths are finite and consistent; avoid NaN avgDocLen. 
+- [ ] (P2) Sort Object.entries() iteration for field postings and weights for deterministic output. + +### src/index/build/shards.js +- [ ] (P1) Add explicit tie-breakers in weight-based sorts/batching for determinism across runtimes. +- [ ] (P2) Document heuristic thresholds (minFilesForSubdir, hugeThreshold, tenth-largest targets). + +### src/index/build/tokenization.js +- [ ] (P2) Review buffer reuse effectiveness (arrays are still cloned); consider pre-sizing and reducing transient allocations further. + +### tools/assemble-pieces.js +- [ ] (P1) Sort `inputDirs` by default (or add `--sort`) to ensure deterministic assembled output. +- [ ] (P2) When `--force` is used, consider cleaning the output dir first to avoid stale artifacts. + +### tools/ci-build-artifacts.js +- [ ] (P1) Sanitize remote URLs before writing them to `manifest.json` to avoid leaking credentials. + +### tools/ci-restore-artifacts.js +- [ ] (P2) Optionally validate `pieces/manifest.json` checksums after restore (fast fail on corrupt artifacts). + +### tools/compact-pieces.js +- [ ] (P1) Consider directory-level atomic swap semantics (avoid rm+rename window). +- [ ] (P2) Add perf regression harness and validate output equivalence post-compaction. + +### tests/artifact-bak-recovery.js +- [ ] (P2) Expand coverage to include: both primary and backup corrupt; json.gz sidecars; and cleanup expectations. + +### tests/artifact-formats.js +- [ ] (P1) Add explicit precedence test: sharded meta/parts must not override fresh jsonl when shards are stale (post-fix). + +### tests/artifact-size-guardrails.js +- [ ] (P2) Extend to cover: chunkMetaFormat=jsonl with switching shard/no-shard, and cleanup behavior. + +### tests/artifacts/file-meta.test.js +- [ ] (P1) Update test if file ID assignment is changed to sorted-by-path; assert stability across different chunk orders. + +### tests/artifacts/token-mode.test.js +- [ ] (P2) Add coverage for invalid modes, case-insensitive parsing, and maxTokens/maxFiles parsing edge cases. + +### tests/clean-artifacts.js +- [ ] (P2) Consider adding a check that `.bak` files are handled correctly (optional). + +### tests/file-processor/cached-bundle.test.js +- [ ] (P1) Fix test fixtures to use realistic `allImports` and `codeRelations` shapes, and assert semantic correctness (not only presence). + +### tests/file-processor/skip.test.js +- [ ] (P2) Add coverage for `unreadable` and `read-failure` paths (permissions, ENOENT races). + +### tests/filter-index-artifact.js +- [ ] (P2) Add a schema assertion for filter_index fields/versioning to prevent drift. + +### tests/filter-index.js +- [ ] (P2) Consider adding a determinism check for serialized filter index (same inputs => same output). + +### tests/graph-chunk-id.js +- [ ] (P2) Add a collision regression test for graph keys, or migrate to chunkId-based keys. + +### tests/incremental-tokenization-cache.js +- [ ] (P2) Add a second invalidation scenario (e.g., tokenization config changes that affect stemming/synonyms). + +### tests/piece-assembly.js +- [ ] (P1) Add semantic equivalence test vs monolithic build and add a determinism test (same inputs => identical assembled output). + +### tests/postings-quantize.js +- [ ] (P2) Extend to test scale and dims, and doc/code embedding behavior. + +### tests/shard-merge.js +- [ ] (P2) Consider adding checksum and manifest equivalence checks as well. + +### tests/shard-plan.js +- [ ] (P2) Add stress case coverage (many files, equal weights, perfProfile enabled). 
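+
+For the determinism items above (`shards.js` tie-breakers, `postings.js` ordering), a minimal sketch of a locale-independent comparator with an explicit tie-break; the field names are assumptions:
+
+```js
+// Never use localeCompare in ordering-critical paths: it can vary with ICU
+// data and locale. Compare weights first, then break ties by code-unit
+// order on a stable identifier.
+function compareShards(a, b) {
+  if (b.weight !== a.weight) return b.weight - a.weight; // heaviest first
+  return a.label < b.label ? -1 : a.label > b.label ? 1 : 0;
+}
+```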
+ +### tests/tokenization-buffering.js +- [ ] (P2) Consider adding a non-ASCII tokenization regression case. + +### docs/artifact-contract.md +- [ ] (P1) Fix compression description (no embedded `compression` field) and clarify `.json.gz` sidecar semantics. +- [ ] (P1) Add explicit precedence rules (meta/parts vs jsonl vs json). +- [ ] (P2) Add schema examples for meta files and `pieces/manifest.json`. + +### docs/contracts/coverage-ledger.md +- [ ] (P2) Add entries for new/critical tooling: `tools/assemble-pieces.js`, `tools/compact-pieces.js`, and CI artifact scripts. + +### docs/contracts/indexing.md +- [ ] (P1) Clarify which artifacts are “required” vs “optional/configurable” (e.g., minhash signatures). +- [ ] (P1) Document sharded meta schema and loader precedence. + + +## Phase 28 — Section 2 — Index build orchestration review (findings + required fixes) + +### Executive summary: highest-priority issues (fix first) + +#### Correctness / functional + +- [ ] **Sharding path creates fresh worker pools + queues per shard work item, with no explicit teardown.** + This is very likely to cause thread/resource leaks, excessive pool creation overhead, and/or a build process that does not exit cleanly. + _Primary file:_ `src/index/build/indexer/steps/process-files.js` + _Related:_ `src/index/build/runtime/workers.js`, `src/index/build/worker-pool.js` + +- [ ] **`--mode all` behavior is inconsistent with “extracted-prose” expectations (tests + CLI surface).** + `tests/build-index-all.js` expects an `extracted-prose` index to be produced for `--mode all`, and `parseBuildArgs(...)` already resolves `modes` to include it; however the CLI entry (`build_index.js`) discards the computed `modes` and delegates to the core build entry, which (in the current tree) resolves “all” differently. + _Primary file(s) in scope:_ `build_index.js`, `src/index/build/args.js`, `tests/build-index-all.js` + _Note:_ the root cause may live outside this section’s file list, but the mismatch is observable from the files in scope and should be corrected at the boundary. + +- [ ] **Watch debounce scheduler does not safely handle async `onRun` errors (risk of unhandled promise rejection).** + `createDebouncedScheduler(...)` calls `onRun()` without `await`/`.catch(...)`. In `watchIndex(...)`, `onRun` is async. Any unexpected throw/rejection (e.g., from lock release, filesystem exceptions) can become an unhandled rejection. + _Primary file:_ `src/index/build/watch.js` + +#### Determinism / reproducibility + +- [ ] **Locale-dependent sorts in ordering-critical paths (`localeCompare`) should be replaced with deterministic lexicographic compares.** + Ordering drives chunk IDs, manifest key ordering, and shard planning stability; `localeCompare` can vary by ICU/locale. + _Primary files:_ + - `src/index/build/indexer/steps/discover.js` + - `src/index/build/indexer/steps/process-files.js` + - `tools/shard-census.js` + +#### Incremental correctness across versions + +- [ ] **Incremental cache signature likely needs a “tool/build schema version” component.** + Today, signature invalidation is strongly config-based. If tokenization/chunk schema/postings semantics change across releases without config changes, the cache can be reused incorrectly. + _Primary file:_ `src/index/build/indexer/signatures.js` + _Related:_ `src/index/build/incremental.js`, `tests/incremental-*.js` + +--- + +### A. 
Pipeline mapping and boundaries + +#### A.1 Current pipeline map (as implemented) + +**Audit** + +The index build pipeline, as observable from the files in scope, is structured as: + +1. **CLI entry** + - `build_index.js` → parses args and calls the core build entry with `argv` + `rawArgv`. + +2. **Runtime construction** + - `src/index/build/runtime.js` → `createBuildRuntime(...)` + - `src/index/build/runtime/runtime.js` → loads config(s), applies stage overrides (`runtime/stage.js`), resolves caps/guardrails (`runtime/caps.js`), ignore rules (`ignore.js`), concurrency and queues/pools (`runtime/workers.js`, `worker-pool.js`), crash logging (`crash-log.js`), and creates a build output root. + +3. **Mode build orchestration** + - `src/index/build/indexer.js` → `buildIndexForMode(...)` for each mode. + - `src/index/build/indexer/pipeline.js` coordinates the build steps per mode. + +4. **Per-mode pipeline stages** + - **Discover**: `indexer/steps/discover.js` (uses `discover.js` + optional preprocessed discovery) + - **Incremental plan + whole-index reuse**: `indexer/steps/incremental.js` (wraps `incremental.js`) + - **Relations pre-scan**: `indexer/steps/relations.js` (`preScanImports`) + - **Estimate context window**: `estimateContextWindow(...)` (not in scope; used by pipeline) + - **Process files**: `indexer/steps/process-files.js` + - optional sharding plan execution + - per-file chunking + postings accumulation + incremental bundle read/write + - **Relations post-scan + cross-file inference**: `indexer/steps/relations.js` (`postScanImports`, `runCrossFileInference`) + - **Incremental manifest pruning**: `incremental.js` (`pruneIncrementalManifest(...)`) + - **Postings build**: `indexer/steps/postings.js` + - **Write artifacts**: `indexer/steps/write.js` + - **Optional**: enqueue embeddings job when using an external embeddings service (called from pipeline) + +5. **Promotion** + - `src/index/build/promotion.js` writes/updates a `current.json` pointer to a successful build root (promotion is performed outside the per-mode pipeline). + +**Contract boundaries (recommended)** + +- The pipeline currently “spans layers” in a few places: + - CLI args parsing (“mode all”) and computed mode lists are not consistently treated as an API contract boundary. + - Sharding logic (planning + execution) creates runtime sub-instances rather than remaining a pure scheduling layer. + - Incremental state is mutated from multiple steps (process-files + relations cross-file inference updates). + +These are workable, but they heighten the importance of clear contracts/invariants per stage. + +--- + +#### A.2 Stage-by-stage contracts (inputs/outputs/invariants/errors/determinism) + +> This section captures what the code *currently* does, plus what should be made explicit (and tested). 
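+
+One way to make these contracts checkable rather than merely documented (a sketch; the entry shape is taken from the Discover stage below, and the assert helper is an assumption):
+
+```js
+// Dev/test-only invariant check for the Discover stage output.
+// entries: [{ abs, rel, stat: { size, mtimeMs }, orderIndex }, ...]
+function assertDiscoverInvariants(entries) {
+  let prev = '';
+  for (const e of entries) {
+    if (!e.rel || e.rel.includes('..')) throw new Error(`bad rel key: ${e.rel}`);
+    // Code-unit comparison: deterministic and locale-independent.
+    if (e.rel < prev) throw new Error(`entries not sorted at ${e.rel}`);
+    prev = e.rel;
+  }
+}
+```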
+ +##### Stage: Discover + +**Primary implementation** +- `src/index/build/indexer/steps/discover.js` +- `src/index/build/discover.js` + +**Inputs** +- `runtime.root`, `runtime.ignoreMatcher`, `runtime.maxFileBytes`, `runtime.fileCaps`, `runtime.guardrails` (maxDepth/maxFiles), mode (`code`/`prose`/`extracted-prose`) +- Optional precomputed discovery bundle `{ entries, skippedFiles, lineCounts }` from preprocessing (if provided by orchestration layer) + +**Outputs** +- `state.entries`: ordered list of discovered file entries +- `state.skippedFiles`: per-mode skips (plus common skips) +- Entries are annotated with `orderIndex` for deterministic downstream ordering + +**Invariants** +- Entries must have: + - `abs` absolute path + - `rel` repo-relative path (POSIX form) with no `..` + - `stat` with at least `size`, `mtimeMs` +- Deterministic ordering: sorting by `rel` must be stable and locale-independent. +- `skippedFiles` should preserve a stable ordering for reproducibility (currently sorted in discover.js). + +**Error behavior** +- Per-file stat errors or size cap failures are recorded as skips, not fatal errors. +- Discover-level failures (e.g., inability to crawl filesystem) should throw and abort build. + +**Determinism requirements** +- Must not use locale-sensitive comparisons (`localeCompare`) or OS-dependent casing assumptions. +- Normalize paths consistently (POSIX rel keys). + +**Remaining work** +- [ ] Replace locale-dependent sorting in `indexer/steps/discover.js` with deterministic compare (and document determinism requirement). +- [ ] Consider adding `stat.isFile()` checks (defensive) before admitting entries (especially for non-git discovery paths). +- [ ] Consider making “tracked-only” behavior explicit at the API boundary (discover uses `git ls-files` when root is a git repo root) and ensure watch mode semantics align (see Watch section). + +--- + +##### Stage: Incremental plan / reuse + +**Primary implementation** +- `src/index/build/indexer/steps/incremental.js` +- `src/index/build/incremental.js` +- `src/index/build/indexer/signatures.js` + +**Inputs** +- `outDir` (mode-specific index output dir) +- `tokenizationKey` (derived from dict signature + tokenization/postings config) +- `cacheSignature` (derived from broader runtime feature/config surface) +- current discovered entries list + their `stat` for whole-index reuse decision + +**Outputs** +- `incrementalState` with: + - `manifest` (files, signature, tokenizationKey, bundleFormat, shards metadata) + - `bundleDir` + bundle format +- `reused` boolean indicating full-index reuse (early exit) +- For per-file reuse, `readCachedBundle(...)` is used by file processor layer. + +**Invariants** +- `manifest.files` keys represent the exact set of indexed files, keyed by deterministic relKey. +- Whole-index reuse must only return true if: + - stage coverage is sufficient for requested stage + - manifest key set matches current entries key set (including deletions) + - size + mtime checks match for all files (or an approved hash fallback mechanism is used) + - signature + tokenizationKey match + +**Error behavior** +- Corrupt/missing manifest should fall back to “rebuild” (not crash). +- Bundle read failures should fall back to “recompute file” (not crash), unless explicitly configured otherwise. + +**Determinism requirements** +- Signature computation must be stable (`stableStringify` is used). +- Manifest writing should be stable in structure and ordering (even if JSON object key order is mostly stable in practice). 
+ +**Remaining work** +- [ ] Add an explicit “cache schema / tool version” component to `cacheSignature` (or a separate `cacheSchemaVersion` field checked alongside it). +- [ ] Treat `manifest.version` as a compatibility gate (migrate or reset when unsupported); ensure `manifest.files` is validated as a *plain object* (not an array). +- [ ] Decide whether whole-index reuse should allow hash fallback (currently it is strict on mtime/size) — if yes, add an opt-in and tests. + +--- + +##### Stage: Process files (chunking + postings accumulation) + +**Primary implementation** +- `src/index/build/indexer/steps/process-files.js` +- `src/index/build/state.js` +- `src/index/build/file-scan.js` (via file processor layer) +- `src/index/build/workers/indexer-worker.js` (worker pool tokenization) +- `src/index/build/worker-pool.js`, `src/index/build/runtime/workers.js` (pool + queue orchestration) + +**Inputs** +- Ordered entries list with `orderIndex` +- Runtime config: tokenization config, postings config, feature flags, caps/guardrails, worker pool config, concurrency limits, sharding config +- Incremental state with manifest + bundle directory +- Optional import map from pre-scan stage + +**Outputs** +- Mutated `state`: + - `chunks` (+ `chunkMeta`) + - `tokenPost`, `phrasePost`, `trigramPost`, `chargramPost` + - `df`, `docLengths`, `fileRelations`, `importLinks` + - `fileMeta` and `fileChunkMap` + - `totalTokens`, `totalChunks` + - `skippedFiles` additions for per-file failures +- `tokenizationStats` + `shardSummary` + `shardPlan` (for reporting and later artifact writing) +- Incremental manifest updates + bundle writes for non-cached files + +**Invariants** +- Chunk IDs must be assigned deterministically and match the ordering derived from discovered entries (not processing completion order). + - Current mechanism: `orderedAppender` ensures deterministic append order even with concurrency/sharding. +- Postings and DF must reflect the same token stream used to produce chunk meta. +- For cached files: + - The cached bundle contents must be compatible with the current tokenizationKey/signature. + - Cached chunks must be appended in the same deterministic order. + +**Error behavior** +- Per-file failures: retry per `indexingConfig.fileRetries` (via `runWithQueue` retry handling); if ultimately failing, abort build (current behavior). +- Crash logging is best-effort (debug mode only). + +**Determinism requirements** +- Ordering must not depend on concurrency, sharding, or locale settings. +- Any feature that modifies existing chunks (token retention “auto”, cross-file inference update) must be deterministic given the same inputs. + +**Remaining work** +- [ ] Fix sharding runtime lifecycle (see Section C/D): avoid creating worker pools per shard item; ensure explicit teardown; ensure sharding does not leak threads/handles. +- [ ] Replace localeCompare usage in shard plan sorting with deterministic ordering. +- [ ] Consider exposing and testing a “deterministic build mode” in which timestamps/build IDs do not affect artifact contents (at least for core artifacts). 
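+
+For context, the ordered-append mechanism referenced in the invariants above can be sketched as a small buffer keyed by `orderIndex` (illustrative; the real `orderedAppender` may differ):
+
+```js
+// Buffers out-of-order completions and flushes strictly in orderIndex
+// order, so concurrency and sharding cannot change append order.
+function createOrderedAppender(append) {
+  const pending = new Map();
+  let next = 0;
+  return (orderIndex, item) => {
+    pending.set(orderIndex, item);
+    while (pending.has(next)) {
+      append(pending.get(next));
+      pending.delete(next);
+      next += 1;
+    }
+  };
+}
+
+// Usage: results may arrive in any order; output order is deterministic.
+const out = [];
+const appendInOrder = createOrderedAppender((item) => out.push(item));
+appendInOrder(1, 'b');
+appendInOrder(0, 'a'); // flushes 'a' then 'b'
+```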
+ +--- + +##### Stage: Relations (import scan + cross-file inference) + +**Primary implementation** +- `src/index/build/indexer/steps/relations.js` +- `src/index/build/feature-metrics.js` (for reporting) + +**Inputs** +- `state.fileRelations` from per-file processing (and/or pre-scan) +- runtime feature flags: + - `indexingConfig.importScan` + - `typeInferenceEnabled`, `riskAnalysisEnabled` + - `*CrossFileEnabled` flags +- incremental state (to update cached bundles after cross-file inference) + +**Outputs** +- `state.importLinks` from `postScanImports` +- Optionally updated `state.chunks` and file metadata from `applyCrossFileInference` +- `graphRelations` structure for index artifacts +- Optional incremental bundle updates via `updateIncrementalBundlesWithChunks(...)` + +**Invariants** +- importLinks should be stable given stable fileRelations + scan plan. +- If cross-file inference updates are applied: + - updates must be reflected in persisted incremental bundles (or explicitly excluded) + - index artifacts written later must correspond to the updated state. + +**Error behavior** +- Import scan failures should degrade gracefully (ideally mark relations as unavailable and continue) unless configured otherwise. +- Cross-file inference failures should not leave state partially mutated; either apply atomically or abort. + +**Determinism requirements** +- Import scan output ordering should be stable. +- Graph construction should be stable (avoid hash/map iteration nondeterminism in serialization). + +**Remaining work** +- [ ] Add tests ensuring cross-file inference updates are persisted into incremental bundles when enabled. +- [ ] Clarify the artifact contract for `graphRelations` in `index_state.json` and ensure it is versioned. + +--- + +##### Stage: Postings build + +**Primary implementation** +- `src/index/build/indexer/steps/postings.js` + +**Inputs** +- `state` with postings sets + DF + doc lengths + chunks +- `runtime.postingsConfig`, token retention configuration + +**Outputs** +- A postings artifact structure ready for serialization (plus metrics like context window) +- Optional token retention adjustments applied to chunks (auto) + +**Invariants** +- Postings must refer to valid chunk IDs. +- DF counts must align with unique tokens per doc. +- Token retention must not change postings/DF (only the retained token/gram arrays stored in chunks for downstream consumers). + +**Error behavior** +- Failures should abort (postings are core artifact). + +**Determinism requirements** +- Postings list ordering must be stable (e.g., chunk IDs sorted ascending). +- DF computation must not depend on processing order (it currently does not, provided chunk order is deterministic). + +**Remaining work** +- [ ] Add/verify tests around token retention “auto” switching (sample vs none) to ensure artifact stability and correctness. + +--- + +##### Stage: Write artifacts + promotion + +**Primary implementation** +- `src/index/build/indexer/steps/write.js` +- `src/index/build/promotion.js` +- `src/index/build/build-state.js` (build_state.json) + +**Inputs** +- runtime + mode +- `state`, `postings`, `timing`, `entries`, `shardSummary`, `graphRelations` +- (promotion) build root + mode list + +**Outputs** +- Mode-specific index directory: + - `index_state.json` + - chunk meta, file meta, postings, perf profile, feature metrics, relations graph +- Promotion pointer file: + - `current.json` mapping mode → build root + +**Invariants** +- Artifact writes should be atomic where practical. 
+- `index_state.json` must contain: + - tool version + config hash + - stage + - tokenizationKey + cacheSignature (if incremental is enabled) + - feature flags summary (for transparency) + +**Error behavior** +- Any write failure should abort promotion; promotion must only occur after successful writes. + +**Determinism requirements** +- Artifact contents (excluding timestamps) should be stable given stable inputs. +- Promotion pointer must not “flip” to a partial build. + +**Remaining work** +- [ ] Validate that `promotion.js` cannot write a `current.json` pointer that escapes the intended cache root (path traversal hardening). +- [ ] Consider making build_state updates resilient to concurrent writes (or explicitly “best effort” with documentation). + +--- + +### B. Incremental builds: deeper review + +#### B.1 What is already solid + +**Audit** + +- Clear separation between: + - tokenizationKey (tokenization + dictionary + postings surface) + - cacheSignature (broader runtime feature surface) +- Per-file bundle read has a hash fallback mechanism to handle mtime/size mismatch scenarios (when a cached hash exists). +- Manifest pruning deletes bundles for deleted files (`pruneIncrementalManifest`). +- Whole-index reuse checks stage coverage and verifies manifest key set matches entries key set (including deletions) and validates per-file stat checks (`shouldReuseIncrementalIndex`). +- A dedicated test suite exists for: + - signature invalidation (`tests/incremental-cache-signature.js`) + - manifest updates (`tests/incremental-manifest.js`) + - reuse semantics including deletions (`tests/incremental-reuse.js`) + - incremental plan behavior (`tests/indexer/incremental-plan.test.js`) + +#### B.2 Gaps / risks + +**Remaining work (correctness + durability)** + +- [ ] **Cache invalidation across tool updates:** include a “tool version / schema version / algorithm version” in the incremental signature. + Suggested approach: + - Add a `runtime.cacheSchemaVersion` constant (bumped on any semantic change), and include it in `buildIncrementalSignature(...)`. + - Or include `runtime.toolInfo.version` (and document that caches are invalidated across versions). +- [ ] **Manifest version compatibility:** enforce `manifest.version` compatibility explicitly; if unsupported, reset (and optionally delete bundles). + Also validate `manifest.files` is a plain object: `loaded.files && typeof loaded.files === 'object' && !Array.isArray(loaded.files)`. +- [ ] **Bundle cleanup on invalidation:** when signature/tokenizationKey mismatches, consider deleting the bundles directory (or moving aside) to avoid disk bloat. +- [ ] **Whole-index reuse strictness:** decide if whole-index reuse should support content-hash fallback for stat mismatch (opt-in). + If not, document that mtime/size must match exactly, and why (performance vs safety). +- [ ] **Stage interactions:** confirm and test that: + - stage1 builds do not reuse stage2 caches (signature should differ, but confirm) + - stage2 builds do not reuse stage1 caches + - stage4 behaviors are consistent (if stage4 writes different artifact sets) +- [ ] **RelKey normalization:** ensure relKey generation is consistently POSIX and case-handled on Windows for both discovery and watch paths. + +--- + +### C. 
Concurrency and robustness + +#### C.1 Locking + +**Audit** + +- `src/index/build/lock.js` implements: + - atomic lock acquisition via `fs.open(lockPath, 'wx')` + - stale lock detection via pid + timestamp (and mtime fallback) + - optional wait/poll to acquire lock + +**Remaining work** +- [ ] Ensure the lock file handle is closed even if `writeFile(...)` fails (use try/finally around the acquired `handle`). +- [ ] Consider including `buildId` and `mode(s)` in the lock file payload to improve observability/debugging. +- [ ] Add a test that simulates write failure during lock acquisition (can be done by injecting a stubbed fs layer, or by creating a read-only directory). + +#### C.2 Sharding + queues + worker pools + +**Audit** + +- The pipeline uses a queue abstraction (`createTaskQueues`, `runWithQueue`) and worker pools (`Piscina`) to parallelize CPU-heavy tasks. +- Sharding aims to distribute work based on line counts / cost predictions, while preserving deterministic output ordering via an ordered appender. + +**Remaining work (critical)** +- [ ] **Do not create worker pools per shard item.** + Options (choose one): + 1) **Preferred:** share the parent runtime’s worker pools across all shards; only shard the scheduling/queueing. + 2) If per-shard pools are required: create **one** shard runtime per shard worker (batch), reuse it for all work items in that batch, and **always** `destroy()` pools and tear down queues in a `finally`. +- [ ] Add a regression test / harness that runs a sharded build and asserts the process exits promptly (no lingering worker threads). + Practical approach: spawn `node build_index.js ...` with `--shards.enabled` and ensure it exits within a timeout; also enable `--verbose` to detect repeated pool creation. +- [ ] Audit `maxPending` sizing on queues in shard runtime creation; ensure it cannot exceed a safe bound when shard concurrency is high. + +#### C.3 Watch mode robustness + +**Audit** + +- Watch mode uses chokidar and a debounce scheduler to coalesce changes. +- It maintains a tracked file set to decide whether removals/oversize transitions should trigger rebuilds. +- It always enables incremental to avoid full reindexing on every change. + +**Remaining work** +- [ ] Make `createDebouncedScheduler(...)` safe for async `onRun`: + - wrap `onRun()` in `Promise.resolve(...).catch(...)` + - optionally provide an `onError` callback +- [ ] Ensure “extracted-prose only” watch mode is supported: + - update `isIndexablePath(...)` to treat `extracted-prose` as both `code` and `prose` for extension filtering + - add coverage in `tests/watch-filter.js` +- [ ] Decide how to handle untracked file changes in git repos (discover is tracked-only): + - either document that watch will trigger rebuilds but new untracked files will not be indexed + - or add an optional “include untracked” mode for watch builds (with tests) + +--- + +### D. Performance and scalability + +#### D.1 Discovery and preprocessing overhead + +**Audit** + +- Discovery uses `git ls-files -z` when root is the git repo root, otherwise fdir crawl. +- It performs a per-file `fs.stat` in a sequential loop (async, but awaited one-by-one). +- Preprocess stage can scan file headers to detect binary/minified, and optionally count lines. + +**Remaining work** +- [ ] Parallelize `fs.stat` in discovery with a concurrency limit (e.g., 32) to reduce wall-clock time on large repos. +- [ ] Consider using fdir’s `withStats()` to avoid a separate stat syscall for non-git discovery paths. 
+- [ ] Ensure file-type detection does not misclassify common text types as binary (treat certain `application/*` mimes as text if needed). + +#### D.2 Sharding overhead + +**Audit** + +- Sharding may require a full line-count pass (expensive) unless line counts are provided. +- Shard planning uses predicted cost from perf profiles when available. + +**Remaining work** +- [ ] Add an option to avoid full line counting when perf profile is available and sufficiently fresh (approximate weights). +- [ ] Revisit per-shard file concurrency hard cap (`min(2, ...)`) — it can underutilize configured `runtime.fileConcurrency` on larger machines. +- [ ] Avoid per-shard runtime creation (performance + correctness; see Section C). + +#### D.3 Worker pool overhead + +**Audit** + +- Worker tasks validate cloneability of inputs/outputs for each task (deep scan with limits). +- Worker pool supports restart/backoff, and permanent disable on repeated opaque failures. + +**Remaining work** +- [ ] Gate cloneability validation behind a debug flag or environment variable; keep it on by default in CI/tests, off in production, or vice versa (choose explicitly). +- [ ] Consider using transfer lists for large typed arrays in quantize tasks to reduce cloning overhead. +- [ ] Add metrics to quantify: + - pool restart frequency + - clone-check overhead + - task latency distribution + +--- + +### E. Refactoring / code quality / test gaps + +#### E.1 Duplication and clarity + +**Audit** + +- Multiple modules duplicate “max bytes per extension” logic and cap normalization: + - `discover.js` has `resolveMaxBytesForExt` + - `watch.js` has `maxBytesForExt` + - `tools/shard-census.js` has its own normalization helpers +- Ordering uses both explicit `<` comparisons and `localeCompare` in different places. + +**Remaining work** +- [ ] Centralize “max bytes per extension” and “cap normalization” logic into a single helper module (likely `runtime/caps.js` or a shared `file-caps.js`) and reuse across discover/watch/tools. +- [ ] Standardize ordering comparisons: provide a shared `compareRelPaths(a, b)` helper that is locale-independent and (optionally) Windows-case-aware. +- [ ] Run formatter / lint pass on files with inconsistent indentation (not functionally wrong, but increases diff noise and review friction). + +#### E.2 Tests to add or strengthen + +**Remaining work** +- [ ] **Build all modes:** Ensure `tests/build-index-all.js` reliably enforces that `--mode all` produces `code`, `prose`, and `extracted-prose` artifacts (and fix the orchestration boundary if currently inconsistent). +- [ ] **Watch extracted-prose:** add a case to `tests/watch-filter.js` where `modes=['extracted-prose']` and confirm indexable file changes trigger scheduling. +- [ ] **Watch async error safety:** add a test that uses an async `onRun` that rejects once, and assert no `unhandledRejection` occurs (attach a listener in the test). +- [ ] **Sharding teardown:** add a harness test that enables sharding and asserts no lingering worker threads prevent exit. +- [ ] **Incremental schema version:** add a test that simulates a tool version/schema version change and confirms caches are invalidated. + +--- + +### File-by-file findings (actionable) + +> Items below are intentionally concrete and file-scoped to minimize ambiguity. + +#### `build_index.js` + +- [ ] Pass the resolved `modes` from `parseBuildArgs(...)` through to the build orchestrator (or otherwise guarantee that “mode all” resolves identically at every boundary). 
+ _Why:_ prevents drift between CLI arg parsing and internal orchestration; aligns with `tests/build-index-all.js`. + +#### `src/index/build/args.js` + +- [ ] Consider adding `argv.modes` (or similar) so downstream layers do not need to re-derive the “all → modes” mapping (and so the CLI entry can pass a single object). + +#### `src/index/build/build-state.js` + +- [ ] Document that `build_state.json` is best-effort and may lose updates under concurrent writers; or introduce an append-only/event model to prevent lost updates. +- [ ] Consider `timer.unref()` on heartbeat interval for cases where build-state heartbeat should not keep the process alive (optional). + +#### `src/index/build/crash-log.js` + +- [ ] Consider throttling `updateFile(...)` writes when debug crash logging is enabled (currently potentially writes state on every file). + +#### `src/index/build/discover.js` + +- [ ] Add concurrency-limited parallel statting for large repos. +- [ ] Add defensive `stat.isFile()` gating for non-git crawls. + +#### `src/index/build/failure-taxonomy.js` + +- No blocking issues found in scope; consider expanding taxonomy categories over time as needed. + +#### `src/index/build/feature-metrics.js` + +- No blocking issues found; consider adding an explicit schema version to metrics output to support future evolution. + +#### `src/index/build/file-scan.js` + +- [ ] Treat certain `file-type` “application/*” results (e.g., json/xml) as potentially text, or ensure `file-type` is only advisory and always confirm with istextorbinary when in doubt. +#### `src/index/build/ignore.js` + +- [ ] Consider supporting nested `.gitignore` semantics for non-git discovery paths (optional, but improves parity with developer expectations). + +#### `src/index/build/incremental.js` + +- [ ] Validate `manifest.files` is a plain object; reset if array/invalid. +- [ ] Enforce manifest version compatibility; reset or migrate. +- [ ] Consider deleting stale bundles on signature/tokenizationKey mismatch to avoid disk bloat. + +#### `src/index/build/indexer.js` + +- No major issues; ensure per-mode runtime mutations are intentional and documented. + +#### `src/index/build/indexer/pipeline.js` + +- [ ] Ensure any ordering-critical sorts remain locale-independent (primary issue is in discover step; pipeline relies on it). +- [ ] Consider explicitly documenting the per-mode stage graph and how it maps to artifacts and cache signature components. + +#### `src/index/build/indexer/signatures.js` + +- [ ] Add cache schema / tool version component to `buildIncrementalSignature(...)`. +- [ ] Consider adding explicit versions for: + - chunk schema + - postings schema + - relations graph schema + +#### `src/index/build/indexer/steps/discover.js` + +- [ ] Replace `localeCompare` sort with deterministic compare. +- [ ] Avoid mutating shared entry objects if discovery is reused across modes (optional; low risk today, but cleaner). + +#### `src/index/build/indexer/steps/incremental.js` + +- [ ] Add more granular status reporting (e.g., why reuse rejected) for observability; currently logs are decent but could be structured. + +#### `src/index/build/indexer/steps/postings.js` + +- [ ] Add tests for token retention “auto” switching correctness and stability. + +#### `src/index/build/indexer/steps/process-files.js` + +- [ ] Fix sharding runtime lifecycle (do not create per-work-item pools; ensure teardown). +- [ ] Replace localeCompare in shard plan sorting with deterministic compare. +- [ ] Revisit per-shard concurrency cap (min(2, ...)). 
+- [ ] Consider hoisting shard runtime creation outside the inner work-item loop if per-shard runtime instances remain desired. + +#### `src/index/build/indexer/steps/relations.js` + +- [ ] Add tests ensuring cross-file inference updates are persisted into incremental bundles when enabled. +- [ ] Clarify error strategy for import scan failures (degrade vs abort) and encode it in tests/config. + +#### `src/index/build/indexer/steps/write.js` + +- [ ] Ensure `index_state.json` always includes the correct cache signature / tokenizationKey values used for the build (especially when any runtime config is adapted per mode). + +#### `src/index/build/lock.js` + +- [ ] Close file handle in a `finally` if write fails during lock acquisition. + +#### `src/index/build/perf-profile.js` + +- No major correctness issues; consider exporting a schema version. + +#### `src/index/build/preprocess.js` + +- [ ] Document that preprocess is currently for `code` + `prose` only (or extend support to `extracted-prose` explicitly if desired). + +#### `src/index/build/promotion.js` + +- [ ] Harden path handling so `current.json` cannot point outside `repoCacheRoot` even if inputs are malformed. + +#### `src/index/build/runtime.js` + +- No blocking issues found in scope. + +#### `src/index/build/runtime/caps.js` + +- No blocking issues found; consider consolidating cap normalization usage across tools. + +#### `src/index/build/runtime/hash.js` + +- No blocking issues found. + +#### `src/index/build/runtime/logging.js` + +- No blocking issues found; consider documenting the distinction between structured logs and progress logs. + +#### `src/index/build/runtime/runtime.js` + +- [ ] Consider making the “tracked-only discovery” behavior visible in logs when git is used (helps users understand why new files may not be indexed). +- [ ] Consider ensuring any per-mode adaptive config does not bleed across modes (currently low risk, but worth documenting). + +#### `src/index/build/runtime/stage.js` + +- No blocking issues found; stage overrides appear coherent and tested (`tests/build-runtime/stage-overrides.test.js`). + +#### `src/index/build/runtime/tree-sitter.js` + +- No blocking issues found in scope. + +#### `src/index/build/runtime/workers.js` + +- [ ] Review queue pending-limit sizing with sharding enabled; ensure worst-case bounds are safe. + +#### `src/index/build/state.js` + +- No blocking issues found; consider adding explicit assertions/guards in merge functions to prevent mismatched id offsets if used elsewhere. + +#### `src/index/build/watch.js` + +- [ ] Make debounce scheduler safe for async `onRun` (catch rejections). +- [ ] Support `extracted-prose` as a mode for indexable path filtering. +- [ ] Consider reducing rebuild churn from untracked files (optional). + +#### `src/index/build/worker-pool.js` + +- [ ] Consider exposing a “debug clone checks” toggle (ties into worker validation overhead discussion). +- [ ] Add optional transferList support for quantize tasks. + +#### `src/index/build/workers/indexer-worker.js` + +- [ ] Gate cloneability validation behind a debug/config toggle if performance becomes an issue. + +#### `tools/shard-census.js` + +- [ ] Replace `localeCompare` with deterministic compare for stable reporting. +- [ ] Consider reusing shared cap/normalization utilities rather than duplicating. + +#### Tests + +##### `tests/build-index-all.js` + +- [ ] Ensure the build orchestration actually builds `extracted-prose` for `--mode all` (fix boundary mismatch if needed). 
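+
+Related to the `src/index/build/watch.js` items above (and the async-safety test suggested for `tests/watch-filter.js` below), a sketch of an async-safe debounce wrapper; the real `createDebouncedScheduler` signature may differ:
+
+```js
+// Debounce scheduler that never lets an async onRun rejection escape as
+// an unhandled rejection; errors are routed to an optional onError hook.
+function createDebouncedScheduler(onRun, { delayMs = 200, onError } = {}) {
+  let timer = null;
+  return function schedule() {
+    if (timer) clearTimeout(timer);
+    timer = setTimeout(() => {
+      timer = null;
+      Promise.resolve()
+        .then(() => onRun())
+        .catch((err) => {
+          if (onError) onError(err);
+          else console.error('debounced run failed:', err);
+        });
+    }, delayMs);
+  };
+}
+```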
+ +##### `tests/watch-filter.js` + +- [ ] Add an `extracted-prose`-only mode coverage case. +- [ ] Add an async debounce safety test (unhandled rejection prevention). + +##### `tests/worker-pool*.js` + +- No immediate gaps; consider adding a perf regression test if clone checks are made optional. + +--- + +### Deliverables + +- [ ] Fix sharding runtime lifecycle and add regression coverage. +- [ ] Resolve “mode all” / extracted-prose mismatch and ensure `tests/build-index-all.js` passes reliably. +- [ ] Harden watch debounce scheduling against async rejection. +- [ ] Replace localeCompare sorts in ordering-critical paths. +- [ ] Add a cache schema/tool version component to incremental signature and add a test for invalidation. + +### Exit criteria + +- [ ] Sharded builds do not leak worker threads/handles and the process exits cleanly. +- [ ] `--mode all` produces `code`, `prose`, and `extracted-prose` indices; validated by test. +- [ ] Watch mode does not emit unhandled promise rejections under forced error paths. +- [ ] Deterministic ordering is documented and enforced (no locale-dependent sorts in critical ordering paths). +- [ ] Incremental cache reuse is safe across code releases (explicit schema/version invalidation). + + +## Phase 29 — Embeddings & ANN (onnx/HNSW/batching/candidate sets) + +**Objective:** harden the embeddings + ANN stack for correctness, determinism (where required), performance, and resilient fallbacks across **index build**, **build-embeddings tooling**, and **retrieval-time ANN execution**. + +### 29.1 Correctness + +#### 29.1.1 Model identity (cache keys, preprocessing, normalization, dims) + +##### Current state (verified) +- [x] Tooling cache keys include **file hash** + **chunk signature** + **embedding identity** (`tools/build-embeddings/cache.js`, `tools/build-embeddings/run.js`). +- [x] Tooling includes **dims mismatch guardrails** with explicit hard-fail paths and tests (`tools/build-embeddings/embed.js`, `tests/embeddings-dims-mismatch.js`, `tests/embeddings-dims-validation.js`). + +##### Remaining gaps / action items +- [ ] **Expand embedding identity to include preprocessing + provider-specific knobs**, not just `{modelId, provider, mode, stub, dims, scale}`: + - Why: changing `onnx` tokenizer/model path or execution provider can change embeddings without changing `modelId`/`provider`, allowing silent cache reuse. + - Files: + - `tools/build-embeddings/cache.js` (identity schema) + - `tools/build-embeddings/run.js` (identity inputs) + - Add fields (at minimum): + - ONNX: `onnx.modelPath` (resolved), `onnx.tokenizerId`, `onnx.executionProviders`, `onnx.threads`, `onnx.graphOptimizationLevel` + - Common: pooling strategy (mean), `normalize=true`, truncation/max_length policy + - Quantization: `minVal/maxVal` (currently fixed -1..1), quantization “version” +- [ ] **Include a tooling/version fingerprint in cache identity** (or bumpable `identity.version`) so cache invalidates when embedding algorithm changes: + - Why: changes to doc extraction, pooling logic, quantization, or merging should invalidate caches even if file hashes are unchanged. + - Files: `tools/build-embeddings/cache.js`, optionally `tools/build-embeddings/chunks.js` +- [ ] **Add strict provider validation**: unknown `indexing.embeddings.provider` should not silently map to `xenova`. + - Why: silent fallback can produce “correct-looking” but unintended embeddings and cache identity mismatch. 
+ - Files: `src/shared/onnx-embeddings.js` (normalizeEmbeddingProvider), `src/index/embedding.js`, `tools/build-embeddings/cli.js`, `src/retrieval/embedding.js` +- [ ] **Unify default stub embedding dimensions across build + retrieval + tooling** (currently inconsistent defaults: 384 vs 512). + - Why: any code path that calls stub embeddings without an explicit `dims` risks producing query embeddings that cannot match the index dims. + - Files: `src/shared/embedding.js` (defaults to 512), `src/index/embedding.js` (defaults to 384), `tools/build-embeddings/run.js` (defaults to 384), `src/retrieval/embedding.js` (passes `dims`, but can pass null in some ANN-only paths). + - Recommendation: pick **384** as the single default everywhere OR require dims explicitly in stub mode and fail loudly if missing. +- [ ] **Index-build (inline) path lacks explicit dims mismatch failure** comparable to build-embeddings tool: + - `src/index/build/file-processor/embeddings.js` currently coerces unexpected shapes to empty arrays and proceeds. + - Add an explicit “dims contract” check and fail fast (or disable embeddings) if: + - vectors are not arrays/typed arrays, + - dims are inconsistent across chunks, + - batch output length mismatches input length. +- [ ] **Make per-file embedding cache writes atomic** (cache files are written with `fs.writeFile`): + - Why: partial/corrupt cache JSON can cause repeated recompute; while not “poisoning,” it degrades throughput and can mask real failures. + - Files: `tools/build-embeddings/run.js` (cache writes), optionally reuse `tools/build-embeddings/atomic.js` or shared atomic writer. + +**Exit criteria** +- [ ] Changing any embedding-relevant knob (model path/tokenizer/provider/normalization/pooling/quantization) forces cache miss. +- [ ] Dims mismatch fails loudly (or deterministically disables embeddings) in **both** build-embeddings and inline index-build paths. +- [ ] Stub-mode dims are consistent across indexing + retrieval. + +--- + +#### 29.1.2 Determinism (float handling, batching order) + +##### Current state (verified) +- [x] Quantization uses deterministic rounding (`src/index/embedding.js`). +- [x] Batched embedding retains input ordering in both tooling and index build (`tools/build-embeddings/embed.js`, `src/index/build/file-processor/embeddings.js`). + +##### Remaining gaps / action items +- [ ] **Document and/or enforce determinism requirements for HNSW build**: + - HNSW graph structure can vary with insertion order; current insertion order is “file processing order,” which depends on `Map` insertion order derived from chunk meta traversal. + - Files: `tools/build-embeddings/run.js`, `tools/build-embeddings/hnsw.js` + - Recommendation: ensure vectors are added to HNSW in a stable order (e.g., ascending `chunkIndex`). +- [ ] **Avoid nondeterministic file sampling in context window estimation**: + - `src/index/build/context-window.js` uses the first N files in `files[]`; if upstream file enumeration order is OS-dependent, context window results can change. + - Recommendation: sort file paths before sampling (or explicitly document nondeterminism). +- [ ] **Normalize float types across providers**: + - Many paths convert typed arrays into JS arrays; this is deterministic but increases the surface for subtle differences and performance regressions. + - Recommendation: standardize on `Float32Array` where feasible and only convert at serialization boundaries. 
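+
+Of the gaps above, the HNSW insertion-order fix is mechanical. A minimal sketch, assuming an hnswlib-style `addPoint(vector, label)` API (method name illustrative):
+
+```js
+// Insert vectors in ascending chunkIndex order so the HNSW graph layout
+// does not depend on file-processing completion order.
+function addVectorsDeterministically(index, entries) {
+  const ordered = [...entries].sort((a, b) => a.chunkIndex - b.chunkIndex);
+  for (const { chunkIndex, vector } of ordered) {
+    index.addPoint(vector, chunkIndex); // label = chunkIndex
+  }
+}
+```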
+ +**Exit criteria** +- [ ] HNSW build is reproducible across runs given identical artifacts/config (or nondeterminism is clearly documented and accepted). +- [ ] Context window selection is stable given identical repo state. + +--- + +#### 29.1.3 Robust fallback behavior (missing models/extensions/unsupported configs) + +##### Current state (verified) +- [x] Retrieval embedding errors are caught and return `null` (`src/retrieval/embedding.js`), which allows the search pipeline to continue in sparse-only mode. +- [x] SQLite vector extension usage is guarded and can be disabled via sanitization (`tests/vector-extension-sanitize.js`). + +##### Remaining gaps / action items +- [ ] **ONNX embedder config validation is partially ineffective**: + - `src/shared/onnx-embeddings.js:createOnnxEmbedder()` checks `normalizeEmbeddingProvider('onnx') !== 'onnx'` which is a no-op (constant input). + - Replace with validation of the *actual* requested provider (or remove the dead check). +- [ ] **Improve “missing model” errors with clear remediation** (especially for offline envs): + - Recommend: explicitly mention `tools/download-models.js` and where the model path is expected. + - Files: `src/shared/onnx-embeddings.js`, `src/index/embedding.js` +- [ ] **HNSW load path should fall back to `.bak` on corrupt primary**, not only when primary is missing: + - Today: `src/shared/hnsw.js` only chooses `.bak` if primary missing; it does not retry `.bak` if `readIndexSync()` throws. +- [ ] **Use HNSW meta for safety checks**: + - Retrieval load does not read `dense_vectors_hnsw.meta.json`, so it cannot validate `dims`, `space`, or `model` before querying. + - Files: `src/shared/hnsw.js` +- [ ] **Add explicit tests for “extension missing” fallback**: + - Currently there is sanitization coverage, but not “load failure / missing shared library” behavior. + - Files/tests: `tools/build-embeddings/sqlite-dense.js` + new test. + +**Exit criteria** +- [ ] Missing/corrupt HNSW artifacts do not crash retrieval; the system degrades gracefully to another ANN backend or sparse-only. +- [ ] Missing ONNX model artifacts fail with actionable errors (or clean fallback in non-strict modes). + +--- + +### 29.2 Batching & scheduling + +#### 29.2.1 Batch auto-tuning (memory/CPU/repo size) + +##### Current state (verified) +- [x] Both index-build and build-embeddings tooling implement “auto batch” based on `os.totalmem()` (`src/index/build/runtime/embeddings.js`, `tools/build-embeddings/cli.js`). +- [x] Language-specific multipliers exist and are tested (`src/index/build/embedding-batch.js`, `tests/embedding-batch-multipliers.js`). + +##### Remaining gaps / action items +- [ ] **Unify and justify auto-batch heuristics**: + - Index-build uses `totalGb * 16` with min 16. + - build-embeddings tool uses `totalGb * 32` with min 32. + - Decide a single policy OR clearly document why they intentionally differ. +- [ ] **Incorporate CPU oversubscription controls**: + - ONNX runtime can be multi-threaded (`threads` option), while the embedding queue can also be concurrent. + - Add a policy: e.g., `embeddingConcurrency * onnxThreads <= cpuCount` (or document exceptions). + - Files: `src/index/build/runtime/embeddings.js`, `src/shared/onnx-embeddings.js` +- [ ] **Adapt batch sizing to repo characteristics**: + - For tiny repos/files, large batch sizes increase latency without improving throughput. + - For huge repos, file-by-file batching underutilizes the accelerator (many small batches). 
+ - Recommendation: introduce a global “embedding batcher” that batches across files with: + - max batch size, + - max tokens/estimated memory per batch, + - stable ordering. + - Files impacted: `src/index/build/file-processor/embeddings.js`, `tools/build-embeddings/run.js` + +**Exit criteria** +- [ ] Batch sizing + concurrency are predictable and safe across low-memory hosts, multi-core hosts, and both small and large repos. +- [ ] Default settings do not oversubscribe CPU when ONNX threads are enabled. + +--- + +#### 29.2.2 Embedding queues (backpressure, bounded memory) + +##### Current state (verified) +- [x] Service-mode job enqueue provides a `maxQueued` hook (`src/index/build/indexer/embedding-queue.js`). + +##### Remaining gaps / action items +- [ ] **Define and enforce backpressure defaults**: + - If `maxQueued` is unset/null, behavior depends on `enqueueJob()` (not in scope here); ensure a safe default exists. + - Add explicit documentation + a test that verifies queue growth is bounded. +- [ ] **Ensure service jobs include enough identity to be safe**: + - Job payload includes `{repo, mode}`, but not an embedding identity fingerprint. + - Include `embeddingProvider`, model id, and/or a hash of embedding config to prevent mismatched worker configuration from producing incompatible embeddings. + +**Exit criteria** +- [ ] Queue growth is bounded by default; overload produces clear errors and does not OOM the process. + +--- + +#### 29.2.3 Session/model reuse + +##### Current state (verified) +- [x] ONNX sessions are cached per normalized config (`src/shared/onnx-embeddings.js`). +- [x] Retrieval embedder instances are cached in-process (`src/retrieval/embedding.js`). + +##### Remaining gaps / action items +- [ ] **Guard concurrent use of shared ONNX sessions if required**: + - If `onnxruntime-node` sessions are not safe for concurrent `run()` calls, add a per-session mutex/queue. + - At minimum: document thread-safety assumptions and add a stress test. +- [ ] **Avoid duplicate pipeline/session loads in index-build**: + - `src/index/embedding.js` does not maintain a global cache similar to retrieval; if multiple embedder instances are constructed in one process, models may be loaded multiple times. + +**Exit criteria** +- [ ] A single model/session is loaded once per process per config, and safely shared across all embedding calls. + +--- + +### 29.3 ANN correctness + +#### 29.3.1 Distance metric correctness (HNSW scoring) + +##### Current state (verified) +- [x] HNSW ranker applies a stable tie-break (`idx`) after converting distances to similarity (`src/shared/hnsw.js`). + +##### Remaining gaps / action items +- [ ] **Confirm and test distance-to-similarity conversion for each HNSW space** (`l2`, `cosine`, `ip`): + - Current code treats `ip` the same as `cosine` (`sim = 1 - distance`). + - This may be correct or incorrect depending on hnswlib’s distance definition for `ip`. + - Required: add unit tests with known vectors and expected distances/similarities and adjust conversion if needed. + - Files: `src/shared/hnsw.js`, new test (e.g., `tests/hnsw-distance-metrics.js`). + +**Exit criteria** +- [ ] For each supported space, returned `sim` is monotonic with the true similarity notion used elsewhere in scoring. + +--- + +#### 29.3.2 Atomic safety (no torn reads/writes) + +##### Current state (verified) +- [x] Build writes HNSW `.bin` and `.meta.json` via atomic replace with `.bak` retention (`tools/build-embeddings/atomic.js`, `tools/build-embeddings/hnsw.js`). 
+- [x] There is a test that asserts `.bak` is created on replace (`tests/hnsw-atomic.js`). + +##### Remaining gaps / action items +- [ ] **HNSW reader should support “corrupt primary” fallback**: + - Implement: try primary, and if read fails, try `.bak` before giving up. + - Files: `src/shared/hnsw.js` +- [ ] **Validate `.bin` / `.meta.json` pairing**: + - Ensure meta file exists, parseable, and matches expected dims/space/model before using the index. + - If mismatch, treat index as unavailable and fall back. + +**Exit criteria** +- [ ] Retrieval never crashes due to a torn/corrupt HNSW file; fallback paths are exercised by tests. + +--- + +#### 29.3.3 Candidate set semantics (HNSW + sqlite-vec) + +##### Current state (verified) +- [x] SQLite candidate pushdown behavior is tested for small vs large candidate sets (`tests/sqlite-vec-candidate-set.js`). + +##### Remaining gaps / action items +- [ ] **Handle empty candidate sets explicitly in HNSW path**: + - `rankHnswIndex()` currently treats an empty set as “no filter” (because `candidateSet.size` is falsy), which can return results when none are desired. + - Files: `src/shared/hnsw.js` +- [ ] **Document and test candidate-set cap behavior**: + - HNSW uses a `candidateSetCap` default of 1000; ensure callers understand whether this can truncate results. + - Add tests for: + - empty set → empty hits, + - small set → only those labels, + - very large set → filter still applied and returned hits are subset, with stable ordering. +- [ ] **Align candidate-set tie-break behavior across backends**: + - SQLite ANN tests require deterministic tie-break by `rowid`. + - HNSW already tie-breaks by `idx`. Ensure both are consistent with retrieval expectations. + +**Exit criteria** +- [ ] Candidate sets behave identically (semantically) across ANN backends: never return items outside the set, deterministic ordering for ties, predictable truncation rules. + +--- + +### 29.4 Performance improvements to prioritize + +#### 29.4.1 Float32Array end-to-end (avoid JS arrays of floats) +- [ ] **Standardize the embedding contract to return `Float32Array`**: + - Files: `src/index/embedding.js`, `src/retrieval/embedding.js`, `src/shared/onnx-embeddings.js`, `src/shared/embedding.js` +- [ ] **Update downstream code to accept typed arrays** (don’t gate on `Array.isArray`): + - Files: `src/index/build/file-processor/embeddings.js`, `tools/build-embeddings/embed.js`, `tools/build-embeddings/run.js`, `tools/build-embeddings/hnsw.js` +- [ ] **Defer conversion to JS arrays only at serialization boundaries** (JSON writing). + +#### 29.4.2 Minimize serialization between threads/processes (transferable buffers) +- [ ] Where embeddings are computed in worker threads/processes (service mode), prefer: + - transferring `ArrayBuffer`/`SharedArrayBuffer` instead of JSON arrays, + - or using binary packed formats for vectors. +- [ ] Add an explicit “embedding payload format” version in job payloads so workers and callers stay compatible. + - File touchpoints: `src/index/build/indexer/embedding-queue.js` (job payload) + +#### 29.4.3 Pre-allocate and reuse buffers +- [ ] **ONNX embedding path**: + - Avoid per-call allocations: + - re-use `BigInt64Array` buffers for token ids/masks where shapes are stable, + - avoid `Array.from()` conversions for slices. + - Files: `src/shared/onnx-embeddings.js` +- [ ] **Index-build merge path**: + - Avoid allocating a new zero vector per chunk in `attachEmbeddings()`. 
+ - File: `src/index/build/file-processor/embeddings.js` + +#### 29.4.4 Candidate generation tuning +- [ ] Push sparse filters earlier and reduce dense scoring work: + - prefer ANN-restricted candidate sets before dense dot products, + - prefer pushing candidate constraints into sqlite-vec queries when small enough (already partially implemented). + - (Some of this lives outside the reviewed file list; track as cross-cutting work.) + +**Exit criteria** +- [ ] Embedding pipelines avoid unnecessary conversions/allocations; measurable CPU and memory reductions on large repos. +- [ ] ANN candidate generation demonstrably reduces dense scoring load for common queries. + +--- + +### 29.5 Refactoring goals + +#### 29.5.1 Single embedding interface shared by build + retrieval +- [ ] Create a single shared adapter interface, e.g.: + - `embed(texts: string[], opts) => Float32Array[]` + - `embedOne(text: string, opts) => Float32Array` +- [ ] Move provider selection + error handling behind adapters: + - `xenova`, `onnx`, `stub`. +- [ ] Ensure both index-build and retrieval use the same adapter and the same preprocessing defaults. + +#### 29.5.2 Centralize normalization & preprocessing +- [ ] Eliminate duplicated `normalizeVec()` implementations: + - `src/index/embedding.js` + - `src/shared/onnx-embeddings.js` + - `tools/build-embeddings/embed.js` (indirectly uses index/embedding normalization) +- [ ] Centralize: + - pooling strategy, + - normalization strategy, + - truncation/max_length policy, + - doc/code merge policy. + +#### 29.5.3 Clear ANN backend adapters +- [ ] Wrap sqlite-vec and HNSW behind a single “ANN adapter” contract with: + - candidate set semantics, + - deterministic tie-break contract, + - consistent error handling and stats reporting. + - (Some of this lives outside the reviewed file list.) + +**Exit criteria** +- [ ] Build + retrieval cannot diverge in embedding shape/normalization/pooling without a deliberate, versioned change. +- [ ] ANN behavior is consistent regardless of backend. + +--- + +### 29.6 Tests + +#### 29.6.1 Coverage checklist + +##### Already covered (verified) +- [x] Cache identity/invalidation (baseline) — `tests/embeddings-cache-identity.js`, `tests/embeddings-cache-invalidation.js` +- [x] Dims mismatch (tooling) — `tests/embeddings-dims-mismatch.js`, `tests/embeddings-dims-validation.js` +- [x] ANN candidate set correctness (sqlite-vec) — `tests/sqlite-vec-candidate-set.js` +- [x] HNSW artifacts existence + atomic replace — `tests/hnsw-ann.js`, `tests/hnsw-atomic.js` + +##### Missing / needs additions +- [ ] **Cache identity tests must cover provider-specific knobs**, especially ONNX config: + - Add tests proving that changing `onnx.tokenizerId` or `onnx.modelPath` changes identityKey and forces cache miss. +- [ ] **Add extension missing/fallback tests**: + - Simulate vector extension load failure and ensure build/search does not crash and disables vector ANN. +- [ ] **Add HNSW candidate set tests**: + - empty set returns empty hits, + - filter does not leak labels, + - tie-break stability. +- [ ] **Add HNSW `.bak` fallback tests**: + - corrupt primary index/meta triggers `.bak` load and does not crash. +- [ ] **Add performance regression test for embedding batching throughput** (required by checklist): + - Recommended approach (stable in CI): + - Use a synthetic embedder function with a fixed per-call overhead + per-item cost. + - Assert that `runBatched()` with batchSize>1 achieves >= X% speedup vs batchSize=1 on a fixed input size. 
+ - Use generous thresholds to avoid flakiness; focus on catching *major* regressions (e.g., accidental O(n²) behavior or disabling batching). + - Candidate target: `tools/build-embeddings/embed.js:runBatched()` and/or `src/index/build/file-processor/embeddings.js` batching path. + +**Exit criteria** +- [ ] Tests fail if embedding identity changes are not reflected in cache keys. +- [ ] Tests cover ANN candidate set semantics for both sqlite-vec and HNSW. +- [ ] At least one performance regression test exists for batching throughput. + +--- + +### Appendix A — File-by-file review notes (actionable items) + +> The checklist items above are the canonical “what to fix.” This appendix maps concrete file-level changes back to those items. + +#### src + +##### `src/index/build/context-window.js` +- [ ] Sort/sanitize file list before sampling to reduce OS-dependent nondeterminism. +- [ ] Consider documenting that context-window estimation is heuristic and may vary with sampling strategy. + +##### `src/index/build/embedding-batch.js` +- [ ] Consider parsing `baseSize` if it may come from config as a numeric string. +- [ ] Add explicit documentation for multiplier precedence (fallback vs user config). + +##### `src/index/build/file-processor/embeddings.js` +- [ ] Add dims contract validation (non-empty vectors must share dims; fail fast otherwise). +- [ ] Support `Float32Array` outputs (don’t rely on `Array.isArray`). +- [ ] Avoid allocating `new Array(dims).fill(0)` per chunk; reuse a single `zeroVec`. +- [ ] Validate that `getChunkEmbeddings(texts).length === texts.length`; if not, log + fail or retry with a clear warning. +- [ ] Ensure doc embedding results are length-aligned with `docPayloads` (currently assumes perfect alignment). + +##### `src/index/build/indexer/embedding-queue.js` +- [ ] Include embedding identity/config hash in job payload to prevent mismatched worker behavior. +- [ ] Consider switching job IDs to `crypto.randomUUID()` for collision resistance. +- [ ] Ensure `maxQueued` has a safe default; document backpressure behavior. + +##### `src/index/build/runtime/embeddings.js` +- [ ] Reconcile auto-batch policy with tooling (`tools/build-embeddings/cli.js`). +- [ ] Consider incorporating ONNX thread settings into concurrency auto-tune to avoid oversubscription. + +##### `src/index/embedding.js` +- [ ] Centralize `normalizeVec`/`quantizeVec` into shared utilities; remove duplication. +- [ ] Add strict provider validation (unknown provider should error/warn). +- [ ] Harden `normalizeBatchOutput()` to: + - guarantee output length equals input count, + - handle unexpected tensor dims more defensively, + - avoid returning a single huge vector when output is 3D. +- [ ] Prefer returning `Float32Array` (or at least accept typed arrays downstream). + +##### `src/retrieval/embedding.js` +- [ ] Use a normalized/fingerprinted ONNX config in the embedder cache key (avoid JSON-order sensitivity). +- [ ] If retrieval can request embeddings without known dims (ANN-only paths), require dims or ensure consistent default dims. +- [ ] Consider logging embedder load failures once (rate-limited) to aid debugging. + +##### `src/shared/embedding.js` +- [ ] Unify stub default dims with the rest of the system (recommend 384). +- [ ] Optionally return `Float32Array` to match the desired end-to-end contract. + +##### `src/shared/hnsw.js` +- [ ] Implement `.bak` fallback when the primary index exists but is corrupt/unreadable. 
+- [ ] Read/validate `dense_vectors_hnsw.meta.json` to confirm `dims/space/model` before using the index. +- [ ] Handle empty candidate sets explicitly by returning `[]`. +- [ ] Add unit tests for distance conversion across spaces (l2/cosine/ip) and adjust similarity conversion if required. + +##### `src/shared/onnx-embeddings.js` +- [ ] Remove/fix dead provider check (`normalizeEmbeddingProvider('onnx')`). +- [ ] Add clearer error messaging for missing model artifacts + remediation steps. +- [ ] Improve performance by avoiding heavy array conversions and by reusing buffers/tensors. +- [ ] Consider concurrency guards around `session.run()` if onnxruntime sessions are not safe concurrently. + +--- + +#### tools + +##### `tools/build-embeddings.js` +- No issues observed beyond those in underlying implementation modules. + +##### `tools/build-embeddings/atomic.js` +- [ ] Consider consolidating atomic replace logic with `src/shared/json-stream.js` to avoid divergence (optional refactor). + +##### `tools/build-embeddings/cache.js` +- [ ] Expand identity schema to include preprocessing and provider-specific config (especially ONNX knobs). +- [ ] Add a bumpable “identity version” or build-tool version fingerprint. + +##### `tools/build-embeddings/chunks.js` +- [ ] Consider incorporating doc-related signals into the chunk signature (or into identity versioning) so doc embedding caches invalidate when doc extraction logic changes. +- [ ] Consider normalizing `start/end` to finite numbers before signature generation (avoid stringifying `undefined`). + +##### `tools/build-embeddings/cli.js` +- [ ] Document (or change) the behavior where `mode=service` is coerced to `inline` for this tool. +- [ ] Unify auto-batch defaults with index-build runtime (or document why they differ). + +##### `tools/build-embeddings/embed.js` +- [ ] Update to accept and return typed arrays (`Float32Array`) instead of insisting on JS arrays. +- [ ] Consider failing fast on non-vector outputs instead of silently returning `[]` entries (to avoid quietly producing all-zero embeddings). + +##### `tools/build-embeddings/hnsw.js` +- [ ] Ensure stable vector insertion order into HNSW (ascending chunkIndex). +- [ ] When adding vectors reconstructed from cache (dequantized), consider re-normalizing for cosine space to reduce drift. + +##### `tools/build-embeddings/manifest.js` +- [ ] Consider reading HNSW meta to report accurate `count`/`dims` for ANN piece files, rather than relying on `totalChunks` (defensive correctness). + +##### `tools/build-embeddings/run.js` +- [ ] Make cache writes atomic (optional but recommended). +- [ ] Use `Number.isFinite()` for chunk start/end to avoid 0/NaN edge cases from `||` coercion. +- [ ] Apply `ensureVectorArrays()` to embedded doc batches just like code batches. +- [ ] Make HNSW build deterministic (stable insertion order). +- [ ] Consider adding a global cross-file batcher for throughput. + +##### `tools/build-embeddings/sqlite-dense.js` +- [ ] Add tests for “vector extension missing/failed to load” fallback behavior. +- [ ] Consider batching inserts in larger chunks or using prepared statements more aggressively for performance on large vector sets. + +##### `tools/compare-models.js` +- [ ] If comparing ONNX vs xenova providers, ensure the script can capture and report provider config differences (identity) to interpret deltas correctly (minor enhancement). 
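+
+For the `tools/build-embeddings/run.js` atomic-write item above, the standard temp-file-plus-rename pattern suffices. A sketch (not the existing `tools/build-embeddings/atomic.js` helper; file naming is illustrative):
+
+```js
+const fs = require('node:fs/promises');
+const path = require('node:path');
+
+// Write to a temp file in the same directory, then rename over the target.
+// rename() is atomic on POSIX filesystems, so readers never observe a
+// partially written cache entry.
+async function writeCacheAtomic(filePath, payload) {
+  const tmp = path.join(
+    path.dirname(filePath),
+    `.${path.basename(filePath)}.${process.pid}.tmp`
+  );
+  await fs.writeFile(tmp, JSON.stringify(payload));
+  await fs.rename(tmp, filePath);
+}
+```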
+ +##### `tools/download-models.js` +- [ ] Consider supporting explicit download of ONNX model artifacts when users rely on `indexing.embeddings.provider=onnx` and custom `onnx.modelPath`. +- [ ] Improve output to show where models were cached and what to set in config if needed. + +--- + +#### tests + +##### `tests/build-embeddings-cache.js` +- [ ] Extend to assert cache identity changes for ONNX config changes (once identity schema is expanded). + +##### `tests/embedding-batch-autotune.js` +- [ ] Consider loosening or documenting assumptions about minimum batch size on low-memory systems (or adjust runtime min to match test expectations). + +##### `tests/embedding-batch-multipliers.js` +- No issues; good coverage of multiplier normalization. + +##### `tests/embeddings-cache-identity.js` +- [ ] Extend to cover ONNX-specific identity fields (tokenizerId/modelPath/etc). + +##### `tests/embeddings-cache-invalidation.js` +- [ ] Add invalidation scenarios tied to preprocessing knobs (pooling/normalize/max_length) once surfaced in identity. + +##### `tests/embeddings-dims-mismatch.js` +- Good. + +##### `tests/embeddings-dims-validation.js` +- Good. + +##### `tests/embeddings-sqlite-dense.js` +- [ ] Add coverage for vector extension load failure paths (extension missing), not only baseline dense sqlite insertions. + +##### `tests/embeddings-validate.js` +- Good baseline index-state + artifact validation coverage. + +##### `tests/hnsw-ann.js` +- [ ] Add correctness assertions beyond “backend selected”: + - candidate set filtering (once exposed), + - tie-break determinism, + - sanity check of returned ordering for a known query on fixture corpus. + +##### `tests/hnsw-atomic.js` +- [ ] Add test for `.bak` fallback on corrupt primary index/meta (reader-side). + +##### `tests/smoke-embeddings.js` +- Good smoke harness; consider adding new tests to this suite after implementing performance regression and fallback tests. + +##### `tests/sqlite-vec-candidate-set.js` +- [ ] Add a column-name sanitization test (table is covered; column is not). + +##### `tests/vector-extension-sanitize.js` +- Good table sanitization coverage; extend for column sanitization as above. + +--- + + +## Phase 30 — Index analysis features (metadata/risk/git/type-inference) — Review findings & remediation checklist + +**Objective:** Review the Section 4 file set (56 files) and produce a concrete, exhaustive remediation checklist that (1) satisfies the provided Phase 4 checklist (A–G) and (2) captures additional defects, inconsistencies, and improvements found during review. + +**Scope:** All files enumerated in `pairofcleats_review_section_4_files_and_checklist.md` (src/tests/docs). +**Out of scope:** Implementing fixes in-code (this document is a work plan / punch list). + +--- + +### Summary (priority ordered) + +#### P0 — Must fix (correctness / crash / schema integrity) + +- [ ] **Risk rules regex compilation is currently mis-wired.** `src/index/risk-rules.js` calls `createSafeRegex()` with an incorrect argument signature, so rule regex configuration (flags, limits) is not applied, and invalid patterns can throw and abort normalization. + - Fix in: `src/index/risk-rules.js` (see §B.1). +- [ ] **Risk analysis can crash indexing on long lines.** `src/index/risk.js` calls SafeRegex `test()` / `exec()` without guarding against SafeRegex input-length exceptions. One long line can throw and fail the whole analysis pass. + - Fix in: `src/index/risk.js` (see §B.2). 
+- [ ] **Metadata v2 drops inferred/tooling parameter types (schema data loss).** `src/index/metadata-v2.js` normalizes type maps assuming values are arrays; nested maps (e.g., `inferredTypes.params.<param>[]`) are silently discarded.
+  - Fix in: `src/index/metadata-v2.js` + tests + schema/docs (see §A.1–A.4).
+
+#### P1 — Should fix (determinism, performance, docs, validation gaps)
+
+- [ ] **`metaV2` validation is far too shallow and does not reflect the actual schema shape.** `src/index/validate.js` only validates a tiny subset of fields and does not traverse nested type maps.
+- [ ] **Docs drift:** `docs/metadata-schema-v2.md` and `docs/risk-rules.md` do not fully match current code (field names, structures, and configuration).
+- [ ] **Performance risks:** risk scanning does redundant passes and does not short-circuit meaningfully when capped; markdown parsing is duplicated (inline + fenced); tooling providers re-read files rather than reusing already-loaded text.
+
+#### P2 — Nice to have (quality, maintainability, test depth)
+
+- [ ] Improve signature parsing robustness for complex types (C-like, Python, Swift).
+- [ ] Clarify and standardize naming conventions (chunk naming vs provider symbol naming, “generatedBy”, “embedded” semantics).
+- [ ] Expand tests to cover surrogate pairs (emoji), CRLF offsets, and risk rules/config edge cases.
+
+---
+
+### A) Metadata v2: correctness, determinism, and validation
+
+#### Dependency guidance (best choices)
+- `ajv` — encode **metadata-schema-v2** as JSON Schema and validate `metaV2` as a hard gate in `tools/index-validate` (or equivalent).
+- `semver` — version `metaV2.schemaVersion` independently and gate readers/writers.
+
+#### A.1 `metaV2.types` loses nested inferred/tooling param types (P0)
+
+##### Affected files
+- `src/index/metadata-v2.js`
+- `docs/metadata-schema-v2.md`
+- `src/index/validate.js`
+- `tests/metadata-v2.js`
+
+##### Findings
+- [ ] **Data loss bug:** `normalizeTypeMap()` assumes `raw[key]` is an array of entries. If `raw[key]` is an object map (e.g., `raw.params` where `raw.params.<param>` is an array), it is treated as non-array and dropped.
+  - Evidence: `normalizeTypeMap()` (lines ~78–91) only normalizes `Array.isArray(entries)` shapes.
+- [ ] **Downstream effect:** `splitToolingTypes()` is applied to `docmeta.inferredTypes`; because nested shapes are not handled, **tooling-derived param types will not appear in `metaV2.types.tooling.params`**, and inferred param types will be absent from `metaV2.types.inferred.params`.
+
+##### Required remediation
+- [ ] Update `normalizeTypeMap()` to support nested “param maps” (and any similar nested structures) rather than dropping them. A pragmatic approach:
+  - [ ] If `entries` is an array → normalize as today.
+  - [ ] If `entries` is an object → treat it as a nested map and normalize each subkey:
+    - preserve the nested object shape in output (preferred), or
+    - flatten with a predictable prefix strategy (only if schema explicitly adopts that).
+- [ ] Update `splitToolingTypes()` so it correctly separates tooling vs non-tooling entries **inside nested maps** (e.g., `params.<param>[]`, `locals.<local>[]`).
+- [ ] Update `tests/metadata-v2.js` to assert:
+  - [ ] inferred param types survive into `metaV2.types.inferred.params.<name>[]`
+  - [ ] tooling param types survive into `metaV2.types.tooling.params.<name>[]`
+  - [ ] non-tooling inferred types do not leak into the tooling bucket (and vice versa)
+
+#### A.2 Declared types coverage is incomplete (P1)
+
+##### Findings
+- [ ] `buildDeclaredTypes()` currently only materializes:
+  - param annotations via `docmeta.paramTypes`
+  - return annotation via `docmeta.returnType`
+  It does **not** cover:
+  - [ ] parameter defaults (`docmeta.paramDefaults`)
+  - [ ] local types (`docmeta.localTypes`)
+  - [ ] any other declared type sources the codebase may already emit
+
+##### Required remediation
+- [ ] Decide which “declared” facets are part of the Metadata v2 contract and implement them consistently (and document them):
+  - [ ] `declared.defaults` (if desired)
+  - [ ] `declared.locals` (if desired)
+- [ ] Update `docs/metadata-schema-v2.md` accordingly.
+- [ ] Add tests in `tests/metadata-v2.js` for any newly included declared facets.
+
+#### A.3 Determinism and stable ordering in `metaV2` (P1)
+
+##### Findings
+- [ ] Several arrays are produced via Set insertion order (e.g., `annotations`, `params`, `risk.tags`, `risk.categories`). While *often* stable, they can drift if upstream traversal order changes.
+- [ ] `metaV2` mixes optional `null` vs empty collections inconsistently across fields (some fields null, others empty arrays). This matters for artifact diffs and schema validation.
+
+##### Required remediation
+- [ ] Standardize ordering rules for arrays that are semantically sets:
+  - [ ] Sort `annotations` (lexicographic) before emitting.
+  - [ ] Sort `params` (lexicographic) before emitting.
+  - [ ] Sort risk `tags`/`categories` (lexicographic) before emitting.
+- [ ] Establish a consistent “empty means null” vs “empty means []” policy for v2 and enforce it in `buildMetaV2()` and schema/docs.
+
+#### A.4 `generatedBy` and `embedded` semantics are unclear (P2)
+
+##### Findings
+- [ ] `generatedBy` currently uses `toolInfo?.version` only; if `tooling` already contains `tool` and `version`, this can be redundant and underspecified.
+- [ ] `embedded` is emitted whenever `chunk.segment` exists, even when the segment is not embedded (parentSegmentId may be null). This makes the field name misleading.
+
+##### Required remediation
+- [ ] Decide and document the intended meaning:
+  - [ ] Option A: `generatedBy = "<tool>@<version>"` and keep `tooling` for structured detail.
+  - [ ] Option B: remove `generatedBy` and rely solely on `tooling`.
+- [ ] Restrict the `embedded` field to truly-embedded segments only **or** rename the field to something like `segmentContext` / `embedding`.
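+
+As a concrete reference for the §A.1 fix, here is a minimal sketch of nested-aware normalization; the entry shapes and the `normalizeEntry` helper are assumptions for illustration, not the project’s actual API:
+
+```js
+// Sketch: arrays normalize as today; plain objects are treated as nested maps
+// (e.g. raw.params.<name> -> entry[]) and normalized per subkey instead of
+// being silently dropped.
+function normalizeTypeMap(raw, normalizeEntry) {
+  const out = {};
+  for (const [key, entries] of Object.entries(raw ?? {})) {
+    if (Array.isArray(entries)) {
+      out[key] = entries.map(normalizeEntry).filter(Boolean);
+    } else if (entries && typeof entries === 'object') {
+      const nested = normalizeTypeMap(entries, normalizeEntry); // preserve shape
+      if (Object.keys(nested).length > 0) out[key] = nested;
+    }
+    // Anything else (null, strings, numbers) stays dropped, as before.
+  }
+  return out;
+}
+```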
+
+#### A.5 Validation gaps for Metadata v2 (P1)
+
+##### Findings (in `src/index/validate.js`)
+- [ ] `validateMetaV2()` (lines ~162–206) validates only:
+  - `chunkId` presence
+  - `file` presence
+  - `risk.flows` has `source` and `sink`
+  - type entries have `.type` (checked via a shallow, array-only traversal)
+  It does **not** validate:
+  - [ ] `segment` object shape
+  - [ ] range/start/end types and ordering invariants
+  - [ ] `lang`, `ext`, `kind`, `name` constraints
+  - [ ] nested types map shapes (params/locals)
+  - [ ] `generatedBy`/`tooling` shape and required fields
+  - [ ] cross-field invariants (e.g., range within segment, embedded context consistency)
+
+##### Required remediation
+- [ ] Establish **one canonical validator** for `metaV2` (preferably schema-based):
+  - [ ] Add an explicit JSON Schema for v2 (in docs or tooling directory).
+  - [ ] Validate `metaV2` against the schema in `validateIndexArtifacts()`.
+- [ ] If schema-based validation is not yet possible, expand `validateMetaV2()` to:
+  - [ ] traverse nested `params`/`locals` maps for type entries
+  - [ ] validate `range` numbers, monotonicity, and non-negativity
+  - [ ] validate the presence/type of stable core fields as defined in `docs/metadata-schema-v2.md`
+- [ ] Add tests (or fixtures) that exercise validation failures for each major failure class.
+
+#### A.6 Docs drift: `docs/metadata-schema-v2.md` vs implementation (P1)
+
+##### Findings
+- [ ] The schema doc should be reviewed line-by-line against current `buildMetaV2()` output:
+  - field names
+  - optionality
+  - nesting of `types.*`
+  - risk shapes and analysisStatus shape
+  - relations link formats
+
+##### Required remediation
+- [ ] Update `docs/metadata-schema-v2.md` to reflect the actual emitted shape **or** update `buildMetaV2()` to match the doc (pick one, do not leave them divergent).
+- [ ] Add a “schema change log” section so future modifications don’t silently drift.
+
+---
+
+### B) Risk rules and risk analysis
+
+#### Dependency guidance (best choices)
+- `re2`/RE2-based engine (already present via `re2js`) — keep for ReDoS safety, but ensure wrapper behavior cannot crash indexing.
+- `ajv` — validate the rule bundle format (ids, patterns, severities, categories, etc.) before compiling.
+
+#### B.1 Risk regex compilation is broken (P0)
+
+##### Affected file
+- `src/index/risk-rules.js`
+
+##### Findings
+- [ ] **Incorrect call signature:** `compilePattern()` calls `createSafeRegex(pattern, flags, regexConfig)` but `createSafeRegex()` accepts `(pattern, config)` (per `src/shared/safe-regex.js`).
+  Consequences:
+  - `regexConfig` is ignored entirely
+  - the intended default flags (`i`) are not applied
+  - any user-configured safe-regex limits are not applied
+- [ ] **No error shielding:** `compilePattern()` does not catch regex compilation errors. An invalid pattern can throw and abort normalization.
+
+##### Required remediation
+- [ ] Fix `compilePattern()` to call `createSafeRegex(pattern, safeRegexConfig)` (or a merged config object).
+- [ ] Wrap compilation in `try/catch` and return `null` on failure (or record a validation error) so rule bundles cannot crash indexing.
+- [ ] Add tests that verify:
+  - [ ] configured flags (e.g., `i`) actually take effect
+  - [ ] invalid patterns do not crash normalization and are surfaced as actionable diagnostics
+  - [ ] configured `maxInputLength` and other safety controls are honored
+
+#### B.2 Risk analysis can crash on long inputs (P0)
+
+##### Affected file
+- `src/index/risk.js`
+
+##### Findings
+- [ ] `matchRuleOnLine()` calls SafeRegex `test()` and `exec()` without guarding against exceptions thrown by SafeRegex input validation (e.g., when line length exceeds `maxInputLength`).
+  - This is a hard failure mode: one long line can abort analysis for the entire file (or build, depending on call site error handling).
+
+##### Required remediation
+- [ ] Ensure **risk analysis never throws** due to regex evaluation. Options:
+  - [ ] Add `try/catch` around `rule.requires.test(...)`, `rule.excludes.test(...)`, and `pattern.exec(...)` to treat failures as “no match”.
+  - [ ] Alternatively (or additionally), change the SafeRegex wrapper to return `false/null` instead of throwing for overlong input.
+  - [ ] Add a deterministic “line too long” cap behavior:
+    - skip risk evaluation for that line
+    - optionally record `maxLineLength` (or similar) in `analysisStatus.exceeded`
+
+#### B.3 `scope` and cap semantics need tightening (P1)
+
+##### Findings
+- [ ] `scope === 'file'` currently evaluates only `lineIdx === 0` (first line). This is likely not the intended meaning of “file scope”.
+- [ ] `maxMatchesPerFile` currently caps the **number of matching lines**, not the number of matches (the variable name implies a match-count cap).
+
+##### Required remediation
+- [ ] Define (in docs + code) what `scope: "file"` means:
+  - [ ] “pattern evaluated against entire file text” (recommended), or
+  - [ ] “pattern evaluated once per file via a representative subset”
+- [ ] Implement `maxMatchesPerFile` as an actual match-count cap (or rename it to `maxMatchingLines`).
+- [ ] Add tests for both behaviors.
+
+#### B.4 Performance: redundant scanning and weak short-circuiting (P1)
+
+##### Findings
+- [ ] Risk analysis scans the same text repeatedly (sources, sinks, sanitizers are scanned in separate loops).
+- [ ] When caps are exceeded (bytes/lines), flows are skipped, but line scanning for matches still proceeds across the entire file, which defeats the purpose of caps for large/minified files.
+
+##### Required remediation
+- [ ] Add an early-exit path when `maxBytes`/`maxLines` caps are exceeded:
+  - either skip all analysis and return `analysisStatus: capped`
+  - or scan only a bounded prefix/suffix and clearly mark that results are partial
+- [ ] Consider a single-pass scanner per line that evaluates all rule categories in one traversal.
+- [ ] Add a prefilter stage for candidate files/lines (cheap substring checks) before SafeRegex evaluation.
+
+#### B.5 Actionability and determinism of outputs (P1)
+
+##### Findings
+- [ ] `dedupeMatches()` collapses evidence to one match per rule id (may not be sufficient for remediation).
+- [ ] Time-based caps (`maxMs`) can introduce nondeterminism across machines/runs (what gets included depends on wall clock).
+
+##### Required remediation
+- [ ] Preserve up to N distinct match locations per rule (configurable) rather than only the first hit.
+- [ ] Prefer deterministic caps (maxBytes/maxLines/maxNodes/maxEdges) over time caps; if `maxMs` remains, ensure it cannot cause nondeterministic partial outputs without clearly indicating partiality.
+- [ ] Sort emitted matches/flows deterministically (by line/col, rule id) before output. + +#### B.6 Docs drift: `docs/risk-rules.md` vs implementation (P1) + +##### Findings +- [ ] `docs/risk-rules.md` should be updated to reflect: + - actual rule bundle fields supported (`requires`, `excludes`, `scope`, `maxMatchesPerLine`, `maxMatchesPerFile`, etc.) + - actual emitted `risk.analysisStatus` shape (object vs string) + - actual matching semantics (line-based vs file-based) + +##### Required remediation +- [ ] Update the doc to match current behavior (or update code to match doc), then add tests that lock it in. + +--- + +### C) Git signals (metadata + blame-derived authorship) + +#### Dependency guidance (best choices) +- `simple-git` (already used) — ensure it’s called in a way that scales: batching where feasible, caching aggressively, and defaulting expensive paths off unless explicitly enabled. + +#### C.1 Default blame behavior and cost control (P1) + +##### Affected file +- `src/index/git.js` + +##### Findings +- [ ] `blameEnabled` defaults to **true** (`options.blame !== false`). If a caller forgets to pass `blame:false`, indexing will run `git blame` per file (very expensive). +- [ ] `git log` + `git log --numstat` are executed per file; caching helps within a run but does not avoid the O(files) subprocess cost. + +##### Required remediation +- [ ] Make blame opt-in by default: + - [ ] change default to `options.blame === true`, **or** + - [ ] ensure all call sites pass `blame:false` unless explicitly requested via config +- [ ] Consider adding a global “gitSignalsPolicy” (or reuse existing policy object) that centrally controls: + - blame on/off + - churn computation on/off + - commit log depth +- [ ] Performance optimization options (choose based on ROI): + - [ ] batch `git log` queries when indexing many files (e.g., per repo, not per file) + - [ ] compute churn only when needed for ranking/filtering + - [ ] support “recent churn only” explicitly in docs (currently it’s “last 10 commits”) + +#### C.2 Minor correctness and maintainability issues (P2) + +##### Findings +- [ ] Misleading JSDoc: `parseLineAuthors()` is documented as “Compute churn from git numstat output” (it parses blame authors, not churn). This can mislead future maintenance. + +##### Required remediation +- [ ] Fix the JSDoc to match the function purpose and parameter type. + +#### C.3 Tests improvements (P1) + +##### Affected tests +- `tests/git-blame-range.js` +- `tests/git-meta.js` +- `tests/churn-filter.js` +- `tests/git-hooks.js` + +##### Findings +- [ ] No tests assert “blame is off by default” (or the intended default policy). +- [ ] No tests cover rename-following semantics (`--follow`) or untracked files. +- [ ] Caching behavior is not validated (e.g., “git blame called once per file even if many chunks”). + +##### Required remediation +- [ ] Add tests that explicitly validate the intended default blame policy. +- [ ] Add a caching-focused test that ensures repeated `getGitMeta()` calls for the same file do not spawn repeated git commands (can be validated via mocking or by instrumenting wrapper counts). +- [ ] Decide whether rename-following is required and add tests if so. + +--- + +### D) Type inference (local + cross-file + tooling providers) + +#### Dependency guidance (best choices) +- LSP-based providers (clangd/sourcekit/pyright) — keep optional and guarded; correctness should degrade gracefully. 
+- TypeScript compiler API — keep optional and isolated; add caching/incremental compilation for large repos. + +#### D.1 Provider lifecycle and resilience (P1) + +##### Affected files +- `src/index/type-inference-crossfile/tooling.js` +- `src/index/tooling/*.js` +- `src/integrations/tooling/lsp/client.js` +- `src/integrations/tooling/providers/lsp.js` +- `src/integrations/tooling/providers/shared.js` + +##### Findings +- [ ] `createLspClient().request()` can leave pending requests forever if a caller forgets to supply `timeoutMs` (pending map leak). Current provider code *usually* supplies a timeout, but this is not enforced. +- [ ] Diagnostics timing: providers request symbols immediately after `didOpen` and then `didClose` quickly; some servers publish diagnostics asynchronously and may not emit before close, leading to inconsistent diagnostic capture. + +##### Required remediation +- [ ] Enforce a default request timeout in `createLspClient.request()` if none is provided. +- [ ] For diagnostics collection, consider: + - [ ] waiting a bounded time for initial diagnostics after `didOpen`, or + - [ ] explicitly requesting diagnostics if server supports it (varies), or + - [ ] documenting that diagnostics are “best effort” and may be incomplete + +#### D.2 Unicode/offset correctness: add stronger guarantees (P1) + +##### Affected files +- `src/integrations/tooling/lsp/positions.js` +- `src/shared/lines.js` (supporting) +- `tests/type-inference-lsp-enrichment.js` +- `tests/segment-pipeline.js` + fixtures + +##### Findings +- [ ] `positions.js` JSDoc claims “1-based line/column”; column is actually treated as 0-based (correct for LSP), but the doc comment is misleading. +- [ ] Test coverage does not explicitly include surrogate pairs (emoji), which are the common failure mode when mixing code-point vs UTF-16 offsets. + +##### Required remediation +- [ ] Fix the JSDoc to reflect actual behavior (LSP: 0-based character offsets; line converted to 1-based for internal helpers). +- [ ] Add tests with: + - [ ] emoji in identifiers and/or strings before symbol definitions + - [ ] CRLF line endings fixtures (if Windows compatibility is required) + +#### D.3 Generic LSP provider chunk matching is weaker than clangd provider (P2) + +##### Affected file +- `src/integrations/tooling/providers/lsp.js` + +##### Findings +- [ ] `findChunkForOffsets()` requires strict containment (symbol range must be within chunk range). clangd-provider uses overlap scoring, which is more robust. + +##### Required remediation +- [ ] Update generic provider to use overlap scoring like clangd-provider to reduce missed matches. + +#### D.4 TypeScript provider issues (P2/P1 depending on usage) + +##### Affected file +- `src/index/tooling/typescript-provider.js` + +##### Findings +- [ ] `loadTypeScript()` resolve order includes keys that are not implemented (`global`) and duplicates (`cache` vs `tooling`). +- [ ] Parameter name extraction uses `getText()` which can produce non-identifiers for destructuring params (bad keys for `params` map). +- [ ] Naming convention risk: provider writes keys like `Class.method` which may not match chunk naming conventions; if mismatched, types will not attach. + +##### Required remediation +- [ ] Fix the resolution order logic and document each lookup path purpose. +- [ ] Only record parameter names for identifiers; skip or normalize destructuring params. 
+- [ ] Validate chunk naming alignment (structural chunk naming vs provider symbol naming) and add a test for a class method mapping end-to-end. + +#### D.5 Cross-file inference merge determinism and evidence (P2) + +##### Affected files +- `src/index/type-inference-crossfile/apply.js` +- `src/index/type-inference-crossfile/pipeline.js` + +##### Findings +- [ ] `mergeTypeList()` dedupes by `type|source` but drops evidence differences; confidence merging strategy is simplistic. +- [ ] Output ordering is not explicitly sorted after merges. + +##### Required remediation +- [ ] Decide how to treat evidence in merges (keep first, merge arrays, keep highest confidence). +- [ ] Sort merged type lists deterministically (confidence desc, type asc, source asc). + +#### D.6 Signature parsing robustness (P2) + +##### Affected files +- `src/index/tooling/signature-parse/clike.js` +- `src/index/tooling/signature-parse/python.js` +- `src/index/tooling/signature-parse/swift.js` + +##### Findings +- [ ] Parsers are intentionally lightweight, but they will fail on common real-world signatures: + - C++ templates, function pointers, references + - Python `*args/**kwargs`, keyword-only params, nested generics + - Swift closures and attributes + +##### Required remediation +- [ ] Add test fixtures covering at least one “hard” signature per language. +- [ ] Consider using tooling hover text more consistently (already used as fallback in clangd-provider) or integrate a minimal parser that handles nested generics and defaults. + +--- + +### E) Performance improvements to prioritize (cross-cutting) + +#### E.1 Risk analysis hot path (P1) +- [ ] Single-pass line scan for sources/sinks/sanitizers. +- [ ] Early return on caps (maxBytes/maxLines) rather than scanning the whole file anyway. +- [ ] Cheap prefilter before SafeRegex evaluation. +- [ ] Avoid per-line SafeRegex exceptions (see §B.2). + +#### E.2 Markdown segmentation duplication (P2) +- [ ] `segments.js` parses markdown twice (inline code spans + fenced blocks). Consider extracting both from one micromark event stream. + +#### E.3 Tooling providers I/O duplication (P2) +- [ ] Providers re-read file text from disk; if indexing already has the content in memory, pass it through (where feasible) to reduce I/O. + +--- + +### F) Refactoring goals (maintainability / policy centralization) + +- [ ] Consolidate analysis feature toggles into a single `analysisPolicy` object that is passed to: + - metadata v2 builder + - risk analysis + - git analysis + - type inference (local + cross-file + tooling) +- [ ] Centralize schema versioning and validation: + - one metadata v2 schema + - one risk rule bundle schema + - one place that validates both as part of artifact validation + +--- + +### G) Tests: required additions and upgrades + +#### Existing tests reviewed (from the provided list) +- `tests/metadata-v2.js` +- `tests/churn-filter.js` +- `tests/git-blame-range.js` +- `tests/git-hooks.js` +- `tests/git-meta.js` +- `tests/minhash-parity.js` +- `tests/segment-pipeline.js` (+ fixtures) +- `tests/type-inference-crossfile*.js` +- `tests/type-inference-lsp-enrichment.js` +- `tests/type-inference-*-provider-no-*.js` (clangd/sourcekit) + +#### Required test upgrades (P1/P0 where noted) +- [ ] **P0:** Add tests for metadata v2 nested inferred/tooling param types (see §A.1). +- [ ] **P0:** Add tests for risk rule compilation config correctness (flags honored, invalid patterns handled) (see §B.1). +- [ ] **P0:** Add risk analysis “long line” test to ensure no crashes (see §B.2). 
+- [ ] **P1:** Add unicode offset tests that include surrogate pairs (emoji) for:
+  - LSP position mapping
+  - chunk start offsets around unicode
+- [ ] **P1:** Add git caching/policy tests (default blame policy + no repeated subprocess calls where caching is intended).
+
+---
+
+**Deliverables**
+- This remediation checklist (this document)
+- Updated `docs/metadata-schema-v2.md` and `docs/risk-rules.md` that match the implementation
+- Expanded test suite that locks in:
+  - metaV2 types correctness (including nested)
+  - risk rule compilation correctness and non-crashing evaluation
+  - unicode offset correctness (including surrogate pairs)
+  - intended git blame policy and caching
+
+**Exit criteria**
+- All P0 items are fixed and covered by tests.
+- Metadata v2 output matches the schema doc, and `validateIndexArtifacts()` validates it meaningfully.
+- Risk analysis and tooling passes are “best-effort”: they may skip work or return partial results, but they never crash indexing.
+
+
+## Phase 31 — Language handlers & chunking review (Section 5)
+
+**Objective:** Make language detection, per-language chunking, tree-sitter integration, and ingestion tooling *deterministic, robust on real-world code*, and *well-tested* — with clear fallback behavior, predictable chunk boundaries, and guardrails against performance/pathological inputs.
+
+**Scope reference:** Review Section 5 file list + checklist (see the attached “review section 5 files and checklist” markdown).
+
+### Note
+While producing this document I noticed that Markdown code spans make it easy to conflate `'\\t'` (the literal two-character sequence backslash + t) with `'\t'` (a string containing an actual tab). The YAML findings below therefore state in prose which form is meant. The underlying bug and the remediation tasks are unaffected.
+
+---
+
+### 31.0 Priority findings summary (what must be fixed first)
+
+#### P0 — Breaks correctness, tests, or core workflows
+- [ ] **Fix YAML tab handling + Windows path normalization bugs** in `src/index/chunking/formats/yaml.js` (tabs are currently checked against the two-character literal `'\\t'` rather than an actual tab character; Windows paths are normalized with a regex that only matches doubled backslashes).
+  - Affects: skipping list items / indentation detection; GitHub Actions workflow detection on Windows-style paths.
+- [ ] **Fix C-like docstring/attribute extraction off-by-one** in `src/lang/clike.js` (doc comment extraction currently skips the line immediately above declarations).
+  - Affects: docstring/attributes in C/C++/ObjC chunks (and downstream docmeta / fidelity).
+- [ ] **Fix broken test syntax** in `tests/language-registry/collectors.test.js` (invalid escaped quotes).
+  - Affects: test suite execution.
+- [ ] **Fix ingestion tools writing output before ensuring directory exists** in:
+  - `tools/ctags-ingest.js`
+  - `tools/gtags-ingest.js`
+  - `tools/lsif-ingest.js`
+  - `tools/scip-ingest.js`
+  Creating the write stream before `ensureOutputDir()` can fail when the output directory does not exist.
+- [ ] **Fix SQL statement splitting for standard SQL escaping (`''` / `""`)** in `src/lang/sql.js`.
+  Current quote toggling assumes backslash-escaping and will mis-split statements containing doubled quotes (a splitter sketch follows this list).
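+
+For the SQL item above, a minimal sketch of quote-aware statement splitting that honors doubled-quote escaping (dollar-quoting and MySQL `DELIMITER` handling are deliberately omitted; this is illustrative, not the project’s implementation):
+
+```js
+// Sketch: split on ';' only outside quoted regions, treating '' and "" as
+// escapes per standard SQL instead of assuming backslash escaping.
+function splitSqlStatements(sql) {
+  const statements = [];
+  let start = 0;
+  let quote = null; // "'" or '"' while inside a quoted region
+  for (let i = 0; i < sql.length; i++) {
+    const ch = sql[i];
+    if (quote) {
+      if (ch === quote) {
+        if (sql[i + 1] === quote) i++; // doubled quote: escaped, stay inside
+        else quote = null;             // region closed
+      }
+    } else if (ch === "'" || ch === '"') {
+      quote = ch;
+    } else if (ch === ';') {
+      statements.push(sql.slice(start, i + 1));
+      start = i + 1;
+    }
+  }
+  const tail = sql.slice(start).trim();
+  if (tail) statements.push(tail);
+  return statements;
+}
+```
+
+With this scanner, `SELECT 'It''s; fine';` stays a single statement because the doubled quote keeps the scanner inside the string.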
+
+#### P1 — Tree-sitter quality/perf gaps that will surface at scale
+- [ ] **Fix `findNameNode` traversal depth bug** in `src/lang/tree-sitter/chunking.js` (depth increments per node instead of per level; the search stops after ~4 iterations).
+  - Affects: chunk naming quality and method/class qualification.
+- [ ] **Make tree-sitter worker path functional and deterministic** (`src/lang/workers/tree-sitter-worker.js` + `src/lang/tree-sitter/chunking.js`).
+  - Worker currently does not preload/init grammars; `buildTreeSitterChunksAsync()` treats a `null` worker result as “success” and does not fall back.
+
+#### P2 — Cleanup, clarity, and long-term maintainability
+- [ ] **Remove or use unused imports** (e.g., `parseTypeScriptSignature` in `src/lang/typescript/chunks-babel.js`).
+- [ ] **Add missing/edge-case tests** (Windows paths, tabs, unicode identifiers, SQL quoting, tree-sitter worker behavior, etc.).
+- [ ] **Document chunk metadata semantics** (particularly `meta.endLine` inclusivity and byte vs. code-unit offsets) in `docs/contracts/chunking.md` (and/or a new contract doc).
+
+---
+
+### 31.1 Chunking pipeline: mapping, fallback, limits, determinism
+
+#### 31.1.1 Fallback behavior and deterministic output
+- [ ] **Audit & document** the full fallback chain in `src/index/chunking/dispatch.js`:
+  - code chunker → code-format chunker → prose chunker → root chunk (prose extensions) → fixed-size blob fallback.
+- [ ] **Add regression tests** that verify:
+  - A failed code chunker returns `null` and the dispatcher properly falls back.
+  - “Prose mode” behavior for `.md/.rst/.adoc/.txt/.mdx` is stable (chunk headings when possible; otherwise single chunk).
+  - “Code mode” for prose files intentionally uses blob fallback (or adjust if that’s not desired).
+
+#### 31.1.2 Limits: correctness + performance under large inputs
+- [ ] **Add tests for multi-byte UTF-8 boundaries** in `applyChunkingLimits()` (`src/index/chunking/limits.js`):
+  - Ensure splits never create invalid surrogate pairs.
+  - Ensure byte limits are enforced correctly with emoji / non-ASCII identifiers.
+- [ ] **Performance review:** `resolveByteBoundary()` currently calls `Buffer.byteLength(text.slice(0, mid))` repeatedly.
+  - [ ] Consider a faster strategy (e.g., pre-encoding once to a `Buffer`, or maintaining cumulative byte counts per line) to avoid repeated substring allocations.
+- [ ] **Clarify contract semantics** for:
+  - Whether `chunk.end` is exclusive (it is treated as exclusive almost everywhere).
+  - Whether `meta.endLine` is “line containing end offset” vs “last included line”.
+  (Many language chunkers use `offsetToLine(end)` vs `offsetToLine(end - 1)`; this should be intentional and documented.)
+  - Update `docs/contracts/chunking.md` accordingly and add examples.
+
+---
+
+### 31.2 Format chunkers: YAML, JSON, XML, INI/TOML, Markdown, RST/Asciidoc
+
+#### 31.2.1 YAML (`src/index/chunking/formats/yaml.js`)
+**Bugs**
+- [ ] **Fix tab detection** in `chunkYamlTopLevel()` and list-item skipping:
+  - Current code checks `line.startsWith('\\t')` (the two-character sequence backslash + t) instead of `line.startsWith('\t')` with an actual tab character.
+  - Locations:
+    - line ~60: `line.startsWith('\\t')` in the list-item skip condition
+    - line ~92: `line.startsWith('\\t')` in the indentation calculation
+- [ ] **Fix Windows path normalization** in `chunkYaml()`:
+  - Current: `normalizedPath = relPath.replace(/\\\\/g, '/')`
+    This matches *double* backslashes; typical Windows paths contain single backslashes.
+  - Should be: `relPath.replace(/\\/g, '/')` (a single-backslash regex; both YAML fixes are sketched at the end of §31.3 below)
+
+**Hardening / improvements**
+- [ ] **Add YAML tests** covering:
+  - Tab-indented YAML (even if discouraged, tools may produce it).
+  - Workflow path detection for both `".github/workflows/foo.yml"` and `".github\\workflows\\foo.yml"`.
+  - A workflow file with `jobs:` where indentation is not 2 spaces (ensure graceful behavior).
+- [ ] **Document YAML chunker limitations** (top-level-only + heuristics for GH Actions) in the chunking contract or a dedicated “format chunkers” doc section.
+
+#### 31.2.2 JSON (`src/index/chunking/formats/json.js`)
+- [ ] **Test hygiene:** Fix test calls that pass arguments in the wrong positions (e.g., `chunkJson(jsonText, {})` in `tests/chunking/json.test.js` currently passes `{}` as `relPath`).
+  Update to `chunkJson(jsonText, null, {})` for clarity and future-proofing.
+- [ ] **Optional robustness improvement:** consider using `jsonc-parser` for tolerant parsing (trailing commas/comments) *if desired*.
+  - If adopted, ensure invalid JSON still cleanly falls back (i.e., return `null`).
+
+#### 31.2.3 XML (`src/index/chunking/formats/xml.js`)
+- [ ] Add tests for:
+  - Nested tags with attributes + self-closing tags.
+  - CDATA blocks and processing instructions.
+  - Malformed tag recovery (should return `null`, triggering fallback, rather than producing broken chunks).
+
+#### 31.2.4 Markdown (`src/index/chunking/formats/markdown.js`)
+- [ ] Add tests for:
+  - Headings inside fenced blocks (should not create chunks; current `inFence` logic covers ``` and ~~~).
+  - Setext headings vs horizontal rules (ensure `---` under a paragraph is treated correctly).
+
+#### 31.2.5 RST/Asciidoc (`src/index/chunking/formats/rst-asciidoc.js`)
+- [ ] Add tests for:
+  - RST overline+underline headings and nested sectioning.
+  - Asciidoc `==` headings inside code/list blocks to avoid false positives.
+
+#### 31.2.6 INI/TOML (`src/index/chunking/formats/ini-toml.js`)
+- [ ] Add tests for:
+  - TOML array-of-tables (`[[table]]`).
+  - INI sections with unusual whitespace and comments.
+
+---
+
+### 31.3 Language registry: selection, options, and collector mapping
+
+#### 31.3.1 Registry correctness (`src/index/language-registry/registry.js`)
+- [ ] **Confirm and document intentional grouping** of C/C++/ObjC into `id: 'clike'`:
+  - Ensure docs and tests consistently reflect that `.c/.h/.cpp/.hpp/.m/.mm` map to the same language id.
+  - Update language-fidelity expectations and/or docs if users expect separate ids.
+
+- [ ] Expand `tests/language-registry/selection.test.js` to cover:
+  - C/C++/ObjC extensions: `.c`, `.h`, `.cpp`, `.hpp`, `.m`, `.mm`
+  - Ambiguous extensions and “special names”:
+    - `Dockerfile`, `dockerfile`, `*.Dockerfile`
+    - `Makefile`, `makefile`
+    - `CMakeLists.txt`
+    - `.gitignore`-style config names (if supported elsewhere)
+
+#### 31.3.2 Import collectors map (`tests/language-registry/collectors.test.js`)
+- [ ] **Fix syntax error** at the Dart fixture entry:
+  - Replace the invalid escaped-quote literal `text: \"import 'package:foo/bar.dart';\",` with a valid JS string literal:
+  - `text: "import 'package:foo/bar.dart';",`
+
+- [ ] Add edge-case import collector tests for:
+  - Multiline imports (where applicable).
+  - Imports inside comments (should be ignored where the collector claims to ignore comments).
+  - Duplicate imports / whitespace variants (ensure normalization works).
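+
+Recapping the two §31.2.1 YAML fixes in unambiguous code form (the variable setup is illustrative):
+
+```js
+const line = '\tname: ci';                    // a tab-indented YAML line
+const relPath = '.github\\workflows\\ci.yml'; // a Windows-style relative path
+
+// Buggy: '\\t' is the two-character sequence backslash + t, never a real tab.
+const skippedBug = line.startsWith('\\t');    // false, even for tab indents
+// Fixed: match an actual tab character.
+const skipped = line.startsWith('\t');        // true
+
+// Buggy: /\\\\/ matches two literal backslashes; Windows paths have one.
+const normalizedBug = relPath.replace(/\\\\/g, '/'); // leaves the path unchanged
+// Fixed: a single-backslash regex normalizes the separators.
+const normalized = relPath.replace(/\\/g, '/');      // '.github/workflows/ci.yml'
+```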
+
+---
+
+### 31.4 Tree-sitter backbone: wasm init, language loading, chunk extraction, workers
+
+#### 31.4.1 Name extraction (`src/lang/tree-sitter/chunking.js`)
+- [ ] **Fix `findNameNode()` depth logic**:
+  - Current implementation increments `depth` per dequeued node, not per BFS level.
+  - Result: the search stops after ~4 processed nodes and often fails to find a name.
+  - Expected: traverse up to N levels or up to a node-count budget (explicitly), and return the first plausible identifier (see the sketch after §31.5.1 below).
+
+- [ ] Add tests that assert:
+  - Function and class chunk names are extracted correctly across multiple language grammars.
+  - Member/method names are found for nested AST shapes where the `name` field is not a direct child.
+
+#### 31.4.2 Worker-mode tree-sitter chunking (`src/lang/workers/tree-sitter-worker.js`, `src/lang/tree-sitter/chunking.js`)
+- [ ] **Initialize and preload grammars inside the worker** (or add a per-worker lazy-init path):
+  - Today, the worker calls `buildTreeSitterChunks()` without ensuring the tree-sitter wasm + language grammar are loaded in that worker thread.
+  - Proposed fix:
+    - In the worker, resolve the language id from `ext`/`languageId`, then `await preloadTreeSitterLanguages([resolvedId], treeSitterOptions)` before parsing.
+- [ ] **Make `buildTreeSitterChunksAsync()` treat `null` results as a failure signal** and fall back to in-thread parsing (or to non-tree-sitter chunking), at least when worker-mode is enabled.
+- [ ] Add tests that explicitly enable worker-mode and assert that:
+  - Chunks are returned (not `null`) for a known fixture.
+  - The result matches non-worker behavior (same chunk boundaries, or documented acceptable differences).
+  - If a grammar is missing/unavailable, it falls back cleanly and deterministically.
+
+#### 31.4.3 Configuration normalization (`src/lang/tree-sitter/options.js`)
+- [ ] Improve boolean normalization:
+  - Current `normalizeEnabled()` only recognizes `false` and the literal string `'off'`.
+  - Expand to treat `'false'`, `'0'`, `'no'` (case-insensitive) as disabled, and `'true'`, `'1'`, `'yes'`, `'on'` as enabled.
+- [ ] Add tests for config parsing from environment/JSON where booleans may be strings.
+
+#### 31.4.4 Offsets: bytes vs JS string indices
+- [ ] Add an explicit contract note and tests around the offset units used by:
+  - tree-sitter (`node.startIndex/endIndex`)
+  - parse5 and other JS parsers
+  - Python AST (line/col from the Python runtime)
+  Ensure all chunk `start/end` offsets are consistent with JS string slicing expectations, particularly with non-BMP unicode characters.
+
+---
+
+### 31.5 Language handlers: correctness fixes & hardening
+
+#### 31.5.1 C-like (`src/lang/clike.js`)
+- [ ] **Fix docstring extraction index** for functions and ObjC methods:
+  - Current:
+    - ObjC method chunk meta: `extractDocComment(lines, i - 1, ...)` and `collectAttributes(lines, i - 1, ...)`
+    - C-like functions: `extractDocComment(lines, i - 1)`
+    - This skips the immediately preceding line.
+  - Fix: pass `i` (the 0-based declaration start line) instead of `i - 1`.
+  - Locations:
+    - ~417–418, ~463 in `src/lang/clike.js`
+
+- [ ] Add tests for C-like doc comment capture:
+  - A `/** ... */` or `// ...` directly above a `struct`, `class`, `enum`, and `function`.
+  - An ObjC method with a `///` doc comment above it.
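+
+Returning to §31.4.1, a minimal sketch of level-bounded BFS for name extraction (the node shape and the identifier heuristic are assumptions, not the project’s actual helpers):
+
+```js
+// Sketch: bound the search by BFS *levels*, not by a count of dequeued nodes,
+// so wide ASTs are not cut off after the first few siblings.
+function findNameNode(root, maxDepth = 4) {
+  let level = root.children ?? [];
+  for (let depth = 0; depth < maxDepth && level.length > 0; depth++) {
+    const next = [];
+    for (const node of level) {
+      if (node.type === 'identifier' || node.type === 'name') return node;
+      next.push(...(node.children ?? []));
+    }
+    level = next; // descend exactly one level per iteration
+  }
+  return null;
+}
+```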
+ +#### 31.5.2 SQL (`src/lang/sql.js`) +- [ ] **Fix quote handling** in both `stripSqlComments()` and `splitSqlStatements()`: + - SQL escaping commonly uses doubled quotes: + - `'It''s fine'` + - `"a ""quoted"" identifier"` + - Current logic toggles on every `'`/`"` not preceded by backslash, which breaks on doubled quotes. + +- [ ] Add tests that include: + - Semicolons inside strings with doubled quotes. + - PostgreSQL dollar-quoted strings combined with single-quoted strings. + - MySQL delimiter blocks that contain semicolons. + +#### 31.5.3 CSS (`src/lang/css.js`) +- [ ] Add guardrails to prevent pathological chunk explosion when using the CSS tree-sitter parser: + - Options: + - Enforce a max node/chunk count (consistent with tree-sitter default maxChunkNodes behavior). + - Or switch to `buildTreeSitterChunks()` and its existing limits. +- [ ] Add tests for: + - Nested `@media` with many rules (ensure performance and deterministic chunk output). + - Files exceeding the max node threshold (ensure fallback to heuristic). + +#### 31.5.4 TypeScript (`src/lang/typescript/chunks-babel.js`) +- [ ] Remove or use unused import `parseTypeScriptSignature` (currently imported but not referenced). +- [ ] Add/extend tests ensuring: + - Babel-based TS chunker produces signatures and types consistently where expected. + - Worker/non-worker tree-sitter paths do not regress TS chunking (when enabled). + +--- + +### 31.6 Imports, relations, and control-flow metrics + +#### 31.6.1 Import collectors +- [ ] Add test coverage for: + - Normalization rules (`normalizeImportToken()` behavior). + - Edge cases per language (e.g., JS `import type`, TS `import("x")`, Python relative imports). +- [ ] Validate that collectors return stable, sorted output (dedupe + order determinism), or document if order is intentionally non-deterministic. + +#### 31.6.2 Relations builders (`src/index/language-registry/simple-relations.js`, per-language `relations.js`) +- [ ] Add a small integration test that: + - Runs `collectLanguageImports()` and `buildLanguageRelations()` for a multi-language fixture set. + - Verifies the resulting `imports`, `exports`, `calls`, and `usages` sets match expectations. + +--- + +### 31.7 Ingestion tools: ctags / gtags / lsif / scip + +#### 31.7.1 Output directory creation order +- [ ] Move `await ensureOutputDir()` to occur *before* `fs.createWriteStream(outputPath, ...)` in: + - `tools/ctags-ingest.js` (write stream is created before the dir is ensured) + - `tools/gtags-ingest.js` + - `tools/lsif-ingest.js` + - `tools/scip-ingest.js` + +#### 31.7.2 Robustness improvements +- [ ] Add tests / smoke scripts that verify: + - Tools succeed when output directory doesn’t exist. + - Tools correctly handle empty input streams. + - Tools fail with actionable errors on malformed JSON lines. + +- [ ] Add optional flags/docs for: + - Strict vs tolerant ingest behavior (skip malformed lines vs fail-fast). + - Path normalization expectations (repo-root relative vs absolute). + +--- + +### 31.8 Docs and test suite alignment + +#### 31.8.1 Fix broken / missing documentation references +- [ ] The Section 5 checklist references docs that are *not present* in this repo snapshot (e.g., `docs/contracts/language-registry.md`, `docs/contracts/ast.md`, and `docs/optional/*`). + Decide whether to: + - Create these docs, or + - Update the checklist to point to existing docs (`docs/language-handler-imports.md`, `docs/language-fidelity.md`, etc.). 
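+
+For §31.7.1 the fix is purely an ordering change; a sketch using plain `fs` (the real tools use `ensureOutputDir()`; the path here is illustrative):
+
+```js
+import fs from 'node:fs';
+import path from 'node:path';
+
+// Sketch: create the output directory *before* opening the write stream,
+// so ingest succeeds on a clean checkout.
+const outputPath = path.join('index-code', 'ctags.ndjson');
+fs.mkdirSync(path.dirname(outputPath), { recursive: true });
+const out = fs.createWriteStream(outputPath, { encoding: 'utf8' });
+```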
+
+#### 31.8.2 Update existing docs for discovered behavior
+- [ ] Update `docs/contracts/chunking.md` to include:
+  - Chunk offset semantics (exclusive `end`, unicode considerations).
+  - `meta.startLine/endLine` semantics and examples.
+  - Expected behavior for overlapping chunks (if allowed) vs non-overlapping (if required).
+- [ ] Update `docs/language-fidelity.md` if docstring expectations for C-like currently fail due to the off-by-one bug.
+
+#### 31.8.3 Add a “known limitations” section (recommended)
+- [ ] Document known heuristic limitations for:
+  - SQL parsing (heuristic statement splitting vs full parser).
+  - YAML parsing (line-based, top-level heuristics).
+  - Language relations (regex-based calls/usages for some languages).
+
+---
+
+### Deliverables
+- [ ] All P0/P1 fixes implemented with unit tests.
+- [ ] Updated docs reflecting chunk semantics and configuration.
+- [ ] A focused regression test pack covering:
+  - YAML tabs + Windows workflow paths
+  - C-like doc comments
+  - SQL doubled-quote handling
+  - Tree-sitter worker-mode functionality
+  - Chunking limits with unicode/multi-byte text
+
+---
+
+### Exit criteria
+- [ ] `npm test` (or the project’s test runner) executes without syntax errors (including `collectors.test.js`).
+- [ ] Format chunkers are robust against malformed inputs and fall back deterministically.
+- [ ] Tree-sitter worker-mode returns real chunks for supported languages and falls back when grammars are missing.
+- [ ] Chunk metadata semantics are documented and consistent across chunkers (or differences are explicitly justified).
+- [ ] Ingestion tools succeed when output directories are missing and produce valid NDJSON outputs.
+
+
+## Phase 32 — (Review) — Retrieval, Services & Benchmarking/Eval (Latency End-to-End)
+
+### Objective
+
+Validate and improve the **retrieval pipeline**, **services surfaces (API + MCP)**, and **benchmark/eval tooling** so that:
+
+* Search semantics are correct and contract-aligned (query parsing, filters, ranking, explain output, context expansion).
+* Backends behave consistently (memory / sqlite / sqlite-fts / lmdb) and performance paths are not accidentally disabled.
+* Services are robust (streaming behavior, cancellation, backpressure, security posture).
+* Benchmarks and eval harnesses are actionable, reproducible, and can enforce latency/quality budgets.
+
+### Scope
+
+Reviewed the complete Section 8 list from the attached markdown checklist document, including:
+
+* Retrieval CLI + pipeline + filters + output formatting
+* SQLite/LMDB helpers and cache layers
+* Core integrations used by tools/services
+* API server (router + SSE) and MCP transport/tools
+* Benchmark harnesses (micro + language) and query tooling
+* Eval harness
+* Related docs + tests + fixtures
+
+(Where files referenced other modules not in the Section 8 list, I noted mismatches and dependency risks, but the primary focus remains the Section 8 scope.)
+
+---
+
+### Exit Criteria (What “Done” Looks Like)
+
+#### Correctness & Contracts
+
+* [ ] Query parsing supports required constructs (operators/quoting/negation/precedence) or docs/contracts explicitly define the simplified grammar.
+* [ ] Filters are correctly detected as “active” and do not disable backend fast-paths accidentally.
+* [ ] Explain output matches actual scoring math and is emitted only when requested (or contracts updated to reflect always-present fields).
+ +#### Performance & Latency + +* [ ] SQLite FTS fast-path is not disabled by default (especially for large indexes). +* [ ] Context expansion avoids repeated O(N) scans per query (or is cached/optimized). +* [ ] Benchmarks can write baselines reliably and optionally enforce budgets. + +#### Services Robustness + +* [ ] API streaming handles backpressure and connection close without hanging. +* [ ] API/MCP support cancellation/timeout propagation to stop expensive work. +* [ ] CORS/security posture is explicitly intentional and documented. + +#### Tests & Tooling + +* [ ] Tests cover discovered regressions and add missing edge cases (FTS eligibility, extracted-prose query caching, MCP id=0, etc.). +* [ ] Bench/eval docs match actual behavior and command usage. + +--- + +## Findings & Required Work + +### 8.A — Retrieval Semantics, Explain, Context Expansion + +#### A1 — **Critical: Filter “active” detection is wrong (breaks performance paths)** + +**Files:** + +* `src/retrieval/filters.js` +* `src/retrieval/cli.js` +* `src/retrieval/pipeline.js` +* `src/retrieval/sqlite-helpers.js` (indirect impact via CLI choices) + +**What I found:** +`hasActiveFilters()` treats *any non-empty object* as “active,” which causes `filtersActive` to be true even when no user filters are set, because the CLI always includes internal objects like `filePrefilter`. + +**Impact:** + +* Forces filter pass on every query. +* Can disable SQLite FTS eligibility for large indexes because allowed-id pushdown cannot be used when the “allowed set” becomes huge. +* Prevents “lazy chunk loading” decisions that should apply when there are no real filters. +* Creates major, silent performance regressions at scale. + +**Action items:** + +* [ ] Fix `hasActiveFilters()` to ignore internal/config-only keys (e.g., `filePrefilter`) and only count user-constraining filters. +* [ ] Add unit tests for `hasActiveFilters()` default filter object and typical combinations. +* [ ] Add an integration test ensuring sqlite-fts remains eligible on a large index when no filters are set (or at least verify the path selection in stats/debug output). + +--- + +#### A2 — **Context expansion does repeated O(N) indexing work per query** + +**Files:** + +* `src/retrieval/context-expansion.js` +* `src/retrieval/cli.js` (enables context expansion) +* `src/retrieval/pipeline.js` + +**What I found:** +`buildContextIndex()` rebuilds `byName` and `byFile` maps every query. + +**Impact:** + +* For large repos, this adds noticeable latency per query. +* Violates checklist intent: “avoids repeated file reads / expensive rebuilds.” + +**Action items:** + +* [ ] Cache context index per loaded index signature (store on the loaded index object or in `index-cache.js`). +* [ ] Add tests to ensure expansions are stable and do not cross branch/filters (if applicable). +* [ ] Document the intended semantic boundaries of context expansion (same file vs cross-file, name matching rules, etc.). + +--- + +#### A3 — Explain output / scoring contract alignment is ambiguous + +**Files:** + +* `src/retrieval/pipeline.js` +* `src/retrieval/output/explain.js` +* `src/retrieval/cli/render-output.js` +* Docs: `docs/contracts/retrieval-ranking.md` (very high-level) + +**What I found:** +The pipeline always builds `scoreBreakdown` objects, even if explain is not requested; compact JSON hides it, but full JSON may expose it unintentionally. + +**Action items:** + +* [ ] Decide contract behavior: + + * Option 1: Only compute/attach `scoreBreakdown` when explain requested. 
+ * Option 2: Always include but document it (and remove `--explain` implication of optionality). +* [ ] Add snapshot tests asserting the presence/absence of explain fields by mode/output format. +* [ ] Ensure explain’s boost attribution matches scoring math (phrase + symbol boosts currently depend on the already-boosted score; document or adjust). + +--- + +### 8.B — Query Parsing & Filtering + +#### B1 — Query parsing does not satisfy checklist requirements + +**Files:** + +* `src/retrieval/query.js` +* `src/retrieval/query-parse.js` +* Tests/docs indirectly + +**What I found:** +Parsing supports: + +* quoted phrases (`"..."`) +* negation via `-token` and `-"phrase"` + +It does **not** support: + +* boolean operators (AND/OR/NOT) semantics +* precedence / parentheses +* actionable errors for malformed queries (unbalanced quotes become literal tokens) + +**Action items:** + +* [ ] Either implement full operator parsing & precedence or explicitly constrain and document the query grammar. +* [ ] Add detection + actionable error messages for unbalanced quotes and invalid constructs. +* [ ] Add tests for negated phrases, nested quotes, malformed input, and operator tokens. + +--- + +#### B2 — Filtering: performance and correctness concerns + +**Files:** + +* `src/retrieval/output/filters.js` +* `src/retrieval/filter-index.js` + +**Key improvements:** + +* [ ] Ensure case-sensitive file filters don’t lose correctness through normalization shortcuts (currently used for prefiltering; confirm final checks are strict). +* [ ] Consider memory growth of filter index structures; document expected footprint and add soft limits/metrics. + +--- + +### 8.C — Ranking Determinism & Tie-Breaking + +#### C1 — Dense ranking should defensively validate embedding dimensionality + +**Files:** + +* `src/retrieval/rankers.js` +* `src/retrieval/embedding.js` +* `src/retrieval/sqlite-helpers.js` + +**What I found:** +`rankDenseVectors()` assumes query embedding length matches index vector dimension. If not, dot-products can become NaN and ranking becomes unstable. + +**Action items:** + +* [ ] Validate query embedding length vs index dims; if mismatch, either truncate safely or skip dense scoring with a clear warning. +* [ ] Add tests for dims mismatch (stub embeddings + configured dims is a good harness). + +--- + +#### C2 — SQLite dense vector scale fallback looks unsafe + +**Files:** + +* `src/retrieval/sqlite-helpers.js` +* Related: `src/storage/sqlite/vector.js` (quantization uses 2/255) + +**What I found:** +If `dense_meta.scale` is missing for any reason, sqlite helper defaults scale to **1.0**, which would break score normalization badly for uint8 quantized vectors. + +**Action items:** + +* [ ] Change fallback scale default to `2/255` (and minVal to `-1` consistent with vector quantization). +* [ ] Add a regression test ensuring dense scoring remains bounded even when meta is missing/corrupt (or fail loudly). + +--- + +### 8.D — Services: API Server & MCP + +#### D1 — SSE backpressure “drain wait” can hang indefinitely on closed connections + +**Files:** + +* `tools/api/sse.js` + +**What I found:** +If `res.write()` returns false, the code awaits `'drain'` only. If the client disconnects before drain fires, that promise may never resolve. + +**Action items:** + +* [ ] Replace `await once('drain')` with `Promise.race([drain, close, error])`. +* [ ] Add tests simulating backpressure + early disconnect (larger payload / forced write buffering). 
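+
+A minimal sketch of the §D1 fix (event names per Node’s `http.ServerResponse`; `writeEvent` is an illustrative helper, not the module’s actual export):
+
+```js
+// Sketch: never await 'drain' alone; resolve on close/error too, so a
+// disconnected client cannot strand the handler forever.
+function waitForDrainOrClose(res) {
+  return new Promise((resolve) => {
+    const settle = () => {
+      res.off('drain', settle);
+      res.off('close', settle);
+      res.off('error', settle);
+      resolve();
+    };
+    res.once('drain', settle);
+    res.once('close', settle);
+    res.once('error', settle);
+  });
+}
+
+async function writeEvent(res, payload) {
+  if (res.destroyed) return false;
+  if (!res.write(`data: ${JSON.stringify(payload)}\n\n`)) {
+    await waitForDrainOrClose(res); // backpressure: wait, but never hang
+  }
+  return !res.destroyed;
+}
+```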
+ +--- + +#### D2 — Streaming contracts/docs do not match actual /search/stream behavior + +**Files:** + +* `tools/api/router.js` +* Docs: `docs/api-server.md`, `docs/contracts/api-mcp.md` + +**What I found:** +`/search/stream` only emits: + +* `start` +* `result` OR `error` +* `done` + +Docs/contracts claim progress streaming and/or richer semantics. + +**Action items:** + +* [ ] Decide: implement progress events (pipeline milestones) OR revise docs/contracts to match current behavior. +* [ ] If implementing progress: add hooks from retrieval CLI/pipeline → core API → router SSE. + +--- + +#### D3 — Cancellation/timeout propagation is missing end-to-end + +**Files:** + +* `tools/api/router.js` +* `tools/mcp/transport.js` +* `tools/mcp/tools.js` +* `src/integrations/core/index.js` +* `src/retrieval/cli.js` (currently no signal handling) + +**What I found:** +Timeouts exist in MCP wrapper, but they do not abort underlying work. API does not abort search on client disconnect. Retrieval does not consume `AbortSignal`. + +**Action items:** + +* [ ] Introduce `AbortController` per request/tool call. +* [ ] Wire close events (`req.on('close')`) and timeout timers to `abort()`. +* [ ] Teach retrieval pipeline / embedding fetch to check `signal.aborted` and throw a consistent cancellation error. +* [ ] Add tests: + + * API stream abort stops work early (not just stops writing). + * MCP tool timeout aborts the underlying work, not just returns an error. + +--- + +#### D4 — Security posture: permissive CORS is risky + +**Files:** + +* `tools/api/router.js` +* Docs: `docs/api-server.md` + +**What I found:** +CORS is `*` by default. Even though server defaults to localhost, permissive CORS enables untrusted sites to read responses from a local service in a browser context. + +**Action items:** + +* [ ] Default CORS to disabled or restricted (require explicit `--cors` enablement). +* [ ] Document threat model: local-only, trusted environment, or add token-based auth. +* [ ] Add tests for CORS behavior (preflight, allowed origins). + +--- + +### 8.E — Benchmarks & Latency Budgets + +#### E1 — Microbench “dense” vs “hybrid” distinction is not actually implemented + +**Files:** + +* `tools/bench/micro/run.js` +* `tools/bench/micro/search.js` +* `tools/bench/micro/tinybench.js` +* Docs: `docs/benchmarks.md` + +**What I found:** +Bench tasks labeled “dense” and “hybrid” do not reliably enforce different scoring regimes. Some of the logic implies profiles/env-driven behavior that isn’t applied. + +**Action items:** + +* [ ] Implement explicit scoring strategy selection (via args/env/profile) for sparse vs dense vs hybrid. +* [ ] Confirm the benchmark measures what it claims (esp. hybrid weighting). +* [ ] Add “sanity asserts” in benchmark output to record which strategy actually ran. + +--- + +#### E2 — Baseline writing can fail because directories don’t exist + +**Files:** + +* `tools/bench/micro/tinybench.js` +* Docs: `docs/benchmarks.md` + +**What I found:** +`--write-baseline` writes to `benchmarks/baselines/...` but does not create the directory first. + +**Action items:** + +* [ ] Ensure baseline directory exists via `fs.mkdirSync(..., { recursive:true })`. +* [ ] Add a test for `--write-baseline` success on a clean repo checkout. +* [ ] Update docs to clarify how baselines are created and stored. 
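+
+The §E2 fix, sketched (the baseline path and payload are illustrative):
+
+```js
+import fs from 'node:fs';
+import path from 'node:path';
+
+// Sketch: create the baseline directory before writing, so --write-baseline
+// succeeds on a clean checkout.
+const baselinePath = path.join('benchmarks', 'baselines', 'micro.json');
+fs.mkdirSync(path.dirname(baselinePath), { recursive: true });
+fs.writeFileSync(baselinePath, JSON.stringify({ results: [] }, null, 2));
+```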
+ +--- + +#### E3 — SQLite cache reuse is missing in benchmark harnesses + +**Files:** + +* `tools/bench/micro/run.js` +* `tools/bench/micro/tinybench.js` + +**What I found:** +Bench harnesses often pass `sqliteCache = null`, which may force repeated DB opens and distort warm-run measurements. + +**Action items:** + +* [ ] Instantiate and reuse `createSqliteDbCache()` across runs for warm scenarios. +* [ ] Record cache reuse status in benchmark output for transparency. + +--- + +#### E4 — Latency “budgets” are described but not enforceable + +**Files:** + +* `docs/benchmarks.md` +* Tests: existing bench tests do not enforce budgets + +**Action items:** + +* [ ] Define target budgets (p50/p95) for representative queries and backends. +* [ ] Add CI-friendly “perf smoke” tests that fail if budgets regress beyond thresholds (with generous margins and stable fixtures). +* [ ] Document environment assumptions for benchmarks (CPU, disk, warmup, etc.). + +--- + +### 8.F — Eval Harness + +#### F1 — Matching logic is permissive and may inflate scores + +**Files:** + +* `tools/eval/run.js` +* Docs: `docs/eval.md` + +**What I found:** +Expected match uses `hit.name.includes(expected.name)`; that may treat `foo` as matching `foobar`. + +**Action items:** + +* [ ] Decide strictness: exact name match vs substring vs regex. +* [ ] Add dataset option `matchMode` or per-expected matcher configuration. +* [ ] Add tests for false-positive matching cases. + +--- + +## Additional Concrete Bugs Found (Non-Checklist) + +### G1 — Retrieval output summary “word count” logic uses character length + +**Files:** + +* `src/retrieval/output/format.js` + +**What I found:** +The summary logic compares `.length` of the string (characters) to a “maxWords” variable and uses it to adjust `maxWords`. This is unit-inconsistent and likely incorrect behavior. + +**Action items:** + +* [ ] Fix to track word count, not character length. +* [ ] Avoid calling `getBodySummary()` twice. +* [ ] Add tests for summary length behavior. + +--- + +### G2 — Parity test references missing benchmark query file path + +**Files:** + +* `tests/parity.js` +* Existing file: `tests/parity-queries.txt` + +**What I found:** +`tests/parity.js` reads from `benchmarks/queries/parity-queries.txt`, but the queries file exists under `tests/parity-queries.txt`. + +**Action items:** + +* [ ] Update parity test to load from `tests/parity-queries.txt` (or move file to benchmarks). +* [ ] Add a guard assertion that query file exists with a clear message. + +--- + +### G3 — Language benchmark progress renderer imports wrong relative paths + +**Files:** + +* `tools/bench/language/progress/render.js` + +**What I found:** +Imports reference `../../../src/shared/...` but need one more `../` to reach repo root. As written, this resolves to `tools/src/shared/...` which doesn’t exist. + +**Action items:** + +* [ ] Fix import paths to `../../../../src/shared/...`. +* [ ] Add a smoke test that loads the module (ensures no runtime import failures). + +--- + +### G4 — MCP transport drops valid JSON-RPC ids when id = 0 + +**Files:** + +* `tools/mcp/transport.js` + +**What I found:** +`if (!id) return;` treats `0` as falsy and drops responses/notifications. JSON-RPC allows `id: 0`. + +**Action items:** + +* [ ] Change checks to `(id === null || id === undefined)`. +* [ ] Add MCP tests sending `id: 0`. 
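+
+The §G4 guard, sketched:
+
+```js
+// Sketch: JSON-RPC 2.0 permits id: 0, so check for null/undefined explicitly
+// instead of relying on truthiness.
+function hasId(message) {
+  return message.id !== null && message.id !== undefined;
+}
+
+// Before: if (!id) return;          // silently drops responses with id 0
+// After:  if (!hasId(message)) return;
+```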
+
+---
+
+### G5 — Bench query generator emits invalid CLI fragments (and lacks quoting)
+
+**Files:**
+
+* `tools/bench-query-generator.js`
+
+**What I found:**
+At least one strategy emits `--signature` without a value. Additionally, values with spaces (authors, types) are not quoted, which will break shell parsing.
+
+**Action items:**
+
+* [ ] Fix the signature strategy to emit the flag with a value (e.g., `--signature "<value>"`), never a bare `--signature`.
+* [ ] Quote/escape all flag values safely.
+* [ ] Clarify the intended consumer (CLI vs internal harness) and ensure the output format matches it.
+
+---
+
+## Test Coverage Additions (Highly Recommended)
+
+### New/Expanded Tests
+
+* [ ] `hasActiveFilters()` default object returns false; internal config-only objects don’t activate filters.
+* [ ] sqlite-fts eligibility remains enabled for unfiltered queries on large (>900 chunks) indexes.
+* [ ] Query cache includes extracted-prose payloads and validates required fields when that mode is enabled.
+* [ ] SSE backpressure + client disconnect doesn’t hang.
+* [ ] API abort cancels search work (requires AbortSignal support).
+* [ ] MCP id=0 support.
+* [ ] `--write-baseline` creates directories and succeeds.
+
+---
+
+## Documentation Corrections Required
+
+* [ ] `docs/api-server.md`: align stream behavior (progress vs start/result/done), update security/CORS discussion.
+* [ ] `docs/contracts/api-mcp.md`: align `/search/stream` contract to actual behavior or update implementation.
+* [ ] `docs/benchmarks.md`: document baseline creation and ensure code supports it (mkdir); clarify dense/hybrid distinctions.
+* [ ] `docs/mcp-server.md`: appears outdated vs the actual transport implementation; update to match current code.
+
+## Phase 33 — Review Section 7 — Storage backends (SQLite + LMDB)
+
+**Objective:** Perform an audit of the storage backends (SQLite + LMDB) and their supporting tooling (build, validation, compaction, incremental updates, ANN extension management, and backend selection). Identify *all* correctness bugs, edge cases, documentation drift, missing tests, and performance/refactoring opportunities, aligned to the provided checklist.
+
+#### Out-of-scope (not deeply reviewed, but referenced when necessary)
+
+- Non-listed call-sites (e.g. retrieval query code) were spot-checked only when needed to validate schema/index/query alignment.
+
+---
+
+### Executive summary
+
+#### Top P0 / correctness items
+
+- [ ] **(P0) SQLite ANN table is not updated when it already exists** in:
+  - `src/storage/sqlite/build/from-bundles.js` (vector table existence sets `vectorAnnReady = true` but **does not** prepare `insertVectorAnn`) — see around L120.
+  - `src/storage/sqlite/build/incremental-update.js` (same pattern) — see around L240.
+
+  **Impact:** when the ANN virtual table already exists (most importantly during incremental updates), deleted rows *can* be removed (because deletes run via `deleteDocIds(...)`), but replacement vectors for changed chunks are **not reinserted**, leaving the ANN table sparse/out-of-sync with `dense_vectors`. This can silently degrade or break ANN-based retrieval depending on how the extension is queried.
+
+- [ ] **(P0) Retrieval-side fail-closed is incomplete for SQLite schema versions.**
+
+  `src/retrieval/cli-sqlite.js` validates required table *names* but does **not** enforce `PRAGMA user_version == SCHEMA_VERSION` (or otherwise fail closed on schema mismatch). This violates the checklist requirement (“readers fail closed on unknown versions”) for the SQLite reader path.
+ +- [ ] **(P0) Bundle-build path does not hard-fail on embedding dimension mismatches** (`src/storage/sqlite/build/from-bundles.js`). + + The code currently *warns once* on a dims mismatch but continues (and may still insert inconsistent vectors). This risks producing an index with an internally inconsistent dense-vector corpus (which can cause downstream errors or silent relevance regressions). + +#### High-signal P1 / robustness items + +- [ ] **WAL / sidecar handling is inconsistent across build vs incremental update paths.** + Full rebuild paths use `replaceSqliteDatabase(...)` which removes sidecars, but incremental updates modify the DB in-place under WAL mode and do not explicitly checkpoint/truncate. If later tooling removes sidecars without a checkpoint, this can create “single-file DB” assumptions that do not hold. + +- [ ] **Indexing for hot maintenance queries can be improved**: `chunks(mode, file)` exists, but multiple maintenance queries order by `id` and would benefit from `(mode, file, id)`. + +- [ ] **Docs drift:** `docs/sqlite-incremental-updates.md` (and a few related docs) describe doc-id behavior and operational details that do not match current implementation (doc-id reuse/free-list behavior; ratio guard details; and operational caveats). + +#### “Good news” / items that look solid already + +- Most bulk write paths are transactional (build ingest, compaction copy, incremental applyChanges). +- The extension download hardening in `tools/download-extensions.js` has multiple safety layers (hash verification support, archive path traversal protection, size/entry limits). +- LMDB corruption handling has targeted tests (`tests/lmdb-corruption.js`) and tooling integration (`tests/lmdb-report-artifacts.js`). + +--- + +## Checklist coverage and required follow-ups + +### A) Schema & migrations + +**Audit** + +- SQLite schema is versioned via `PRAGMA user_version` with `SCHEMA_VERSION = 7` (`src/storage/sqlite/schema.js`). +- Incremental update explicitly checks schema version and required tables before mutating (`src/storage/sqlite/build/incremental-update.js`). +- Table-level constraints are generally well-defined (primary keys per (mode, …), plus supporting indexes for vocab/postings). + +**Gaps / issues** + +- [ ] **Fail-closed at read time:** Add a `user_version` gate to the SQLite reader path (at minimum in `src/retrieval/cli-sqlite.js` / sqlite backend creation). + - Desired behavior: + - If backend is *forced* to SQLite: throw a clear error (“SQLite schema mismatch: expected X, found Y”). + - If backend is not forced (auto): treat SQLite as unavailable and fall back to the file-backed backend, with a warning. +- [ ] **Index alignment with hot predicates:** Consider adding `CREATE INDEX idx_chunks_file_id ON chunks(mode, file, id)` to support: + - `SELECT id FROM chunks WHERE mode=? AND file=? ORDER BY id` + - `SELECT file, id FROM chunks WHERE mode=? ORDER BY file, id` (incremental update id reuse scan) +- [ ] **Document upgrade path explicitly:** The system is effectively “rebuild on schema bump”. Ensure docs and user-facing error messaging make that explicit (and fail closed rather than attempting to limp on). +- [ ] **Consider column-level schema validation for critical tables** (optional but recommended): required-table-name checks do not catch incompatible column changes if a user provides an arbitrary SQLite file containing tables with the right names. 
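+A minimal sketch of the fail-closed gate plus the optional column check (the import path, option names, and the `chunks` column list are assumptions):
+
+```js
+import Database from 'better-sqlite3';
+// Import path and SCHEMA_VERSION symbol assumed from src/storage/sqlite/schema.js.
+import { SCHEMA_VERSION } from '../storage/sqlite/schema.js';
+
+export function openSqliteIndex(dbPath, { forced = false } = {}) {
+  const db = new Database(dbPath, { readonly: true, fileMustExist: true });
+  const found = db.pragma('user_version', { simple: true });
+  if (found !== SCHEMA_VERSION) {
+    db.close();
+    if (forced) {
+      throw new Error(`SQLite schema mismatch: expected ${SCHEMA_VERSION}, found ${found}`);
+    }
+    return null; // auto mode: caller falls back to the file-backed backend
+  }
+  // Optional column-level check for a critical table (column names illustrative).
+  const columns = new Set(db.pragma('table_info(chunks)').map((col) => col.name));
+  for (const required of ['id', 'mode', 'file']) {
+    if (!columns.has(required)) {
+      db.close();
+      throw new Error(`SQLite table 'chunks' is missing required column '${required}'`);
+    }
+  }
+  return db;
+}
+```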
+ +--- + +### B) SQLite build pipeline + +**Audit** + +- Build-from-artifacts path uses bulk inserts and creates secondary indexes after ingest (`src/storage/sqlite/build/from-artifacts.js`). +- Build-from-bundles supports a fast-path using bundle workers (`src/storage/sqlite/build/from-bundles.js` + `bundle-loader.js`). +- Validation includes `PRAGMA integrity_check` (full) and cross-table count consistency checks (`src/storage/sqlite/build/validate.js`). + +**Gaps / issues** + +- [ ] **(P0) Fix ANN insert statement preparation when the ANN table already exists:** + - In `src/storage/sqlite/build/from-bundles.js`: + - When `hasVectorTable` is true (L120), prepare `insertVectorAnn` immediately (same SQL as the “created table” path near L209). + - In `src/storage/sqlite/build/incremental-update.js`: + - When `vectorAnnReady` is set based on `hasVectorTable` (L240), prepare `insertVectorAnn` as well. + - Add a CI-friendly unit test that does not require a real sqlite-vec binary (see “Tests” section below). +- [ ] **(P0) Enforce embedding dims consistency in bundle builds.** + - Recommendation: pre-scan each bundle (or the whole manifest) to ensure all embeddings are either absent or have a single consistent dimension; then hard-fail the build if mismatched. + - Current behavior: warns once around L197 and continues; this should be tightened to match the artifacts build path which throws on mismatch. +- [ ] **Failure cleanup should include SQLite sidecars** (`.db-wal`, `.db-shm`) in: + - `src/storage/sqlite/build/from-artifacts.js` + - `src/storage/sqlite/build/from-bundles.js` + + Today they remove only `outPath` on failure. If WAL/SHM exist, they can be left behind as confusing debris and can interfere with subsequent runs. +- [ ] **Consider ensuring the produced DB is “single-file”** after build by checkpointing/truncating WAL (or switching journal mode back), rather than relying on implicit behavior. +- [ ] **Prepared statement churn:** `deleteDocIds(...)` dynamically prepares multiple statements per chunk; consider statement caching keyed by chunk size to reduce overhead during large deletes. + +--- + +### C) LMDB backend + +**Audit** + +- LMDB has a clear key-space separation (`meta:*`, `artifact:*`) and an explicit schema version (`src/storage/lmdb/schema.js`). +- LMDB build tool stores artifacts plus metadata into LMDB (`tools/build-lmdb-index.js`). +- Corruption handling is at least partially validated via tests (`tests/lmdb-corruption.js`, `tests/lmdb-report-artifacts.js`). + +**Gaps / issues** + +- [ ] Ensure the LMDB *reader* path (not in this checklist set) fails closed on schema mismatch the same way SQLite incremental update does (explicit schema version check; clear error messaging). +- [ ] Consider adding a lightweight “LMDB quick check” command in tooling (or enhancing `tools/index-validate.js`) that validates the presence of all required keys (schema version, chunk meta, vocab, postings, etc.) and reports missing keys explicitly. +- [ ] Document LMDB key invariants and expected artifact presence (which artifacts are mandatory vs optional). + +--- + +### D) Incremental updates + +**Audit** + +- Incremental update gating exists (requires incremental manifest, rejects schema mismatch, rejects high change ratios) (`src/storage/sqlite/build/incremental-update.js`). +- It preserves doc-id stability per-file by reusing IDs for changed files and reusing free IDs from deletions. +- Deletes are applied across all relevant tables using `deleteDocIds(...)` with consistent table lists. 
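+For reference, the delete pattern above, combined with the statement caching that several items below recommend, can be sketched as follows (the table list and chunk size are illustrative):
+
+```js
+// Sketch only: real table names come from the shared table lists in delete.js.
+const DOC_TABLES = ['chunks', 'dense_vectors']; // illustrative subset
+const CHUNK_SIZE = 500; // stay well under SQLite's bound-parameter limit
+
+const stmtCache = new Map();
+const deleteStmt = (db, table, size) => {
+  const key = `${table}:${size}`;
+  if (!stmtCache.has(key)) {
+    const placeholders = Array(size).fill('?').join(', ');
+    stmtCache.set(key, db.prepare(`DELETE FROM ${table} WHERE id IN (${placeholders})`));
+  }
+  return stmtCache.get(key);
+};
+
+export function deleteDocIdsSketch(db, docIds) {
+  const run = db.transaction((ids) => {
+    for (let i = 0; i < ids.length; i += CHUNK_SIZE) {
+      const slice = ids.slice(i, i + CHUNK_SIZE);
+      for (const table of DOC_TABLES) {
+        deleteStmt(db, table, slice.length).run(...slice);
+      }
+    }
+  });
+  run(docIds);
+}
+```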
+ +**Gaps / issues** + +- [ ] **(P0) ANN table insertion bug** (same as in section B) must be fixed for incremental updates. +- [ ] **WAL lifecycle:** after an in-place incremental update, run: + - `PRAGMA wal_checkpoint(TRUNCATE);` + - optionally `PRAGMA journal_mode = DELETE;` (if the project prefers single-file DBs) + + This ensures the on-disk DB is not “dependent on sidecars” after the update and reduces the likelihood of later tooling accidentally discarding uncheckpointed state. +- [ ] **Manifest match logic:** `isManifestMatch(...)` falls back to mtime/size when one side has a hash and the other does not. + - Consider tightening: if an incremental manifest provides a hash but the DB manifest row does not, treat as “changed” and update the DB row hash (this gradually converges the DB to the stronger invariant). +- [ ] **Performance of doc-id reuse scan:** the “scan all chunks ordered by file,id” approach is correct but can be expensive; if it becomes a bottleneck, consider either: + - adding `(mode,file,id)` index, and/or + - materializing file→docId list in a side table (only if necessary). + +--- + +### E) Performance + +**Audit** + +- Build pragmas in `src/storage/sqlite/build/pragmas.js` are set to favor build throughput (WAL + relaxed synchronous) and are restored (partially). +- Compaction tool is designed to reduce doc-id sparsity and reclaim file size (`tools/compact-sqlite-index.js`). + +**Gaps / issues** + +- [ ] **Avoid repeated `COUNT(*)` scans** for backend auto-selection where possible (`src/storage/backend-policy.js`). + - Options: use `file_manifest` sum, maintain a meta counter, or store chunk count in `index_state.json`. +- [ ] **Improve maintenance query performance** via `(mode,file,id)` index as noted above. +- [ ] **Reduce query-time statement re-preparation** in `src/retrieval/sqlite-helpers.js` (`chunkArray(...)` creates fresh SQL each time); consider caching by chunk size. +- [ ] **Add at least one p95 query latency regression test** using a stable fixture DB (details below). + +--- + +### F) Refactoring goals + +**Audit** + +- The codebase already separates schema SQL, prepared statements, and build/validate logic into dedicated modules. + +**Gaps / issues** + +- [ ] **De-duplicate shared helpers:** + - `updateIndexStateManifest(...)` exists in both `tools/build-lmdb-index.js` and `tools/build-sqlite-index/index-state.js`. + - `chunkArray(...)` exists in both build and retrieval code (or adjacent helpers). +- [ ] **Centralize ANN table setup logic** so that “table exists” vs “table created” paths always prepare the insert statement (avoid the current drift between `prepareVectorAnnTable(...)` and the bundle/incremental paths). +- [ ] **Clarify naming:** `toVectorId(...)` is currently a “coerce to BigInt” helper; consider renaming to reflect that it does not encode/transform the id. + +--- + +## Tests and benchmarks — required additions + +### Must-add tests (CI-friendly) + +- [ ] **Unit test: ANN insertion when the ANN table already exists** (no real extension binary required). + - Approach: + - Create a temporary SQLite DB with all required tables plus a *plain* `dense_vectors_ann` table (not virtual) matching the schema used by insert/delete (`rowid` + `embedding` BLOB column). 
+ - Pass a mocked `vectorConfig` into `incrementalUpdateDatabase(...)` with: + - `loadVectorExtension: () => ({ ok: true })` + - `hasVectorTable: () => true` + - `encodeVector: () => Buffer.from([0])` (or similar stable stub) + - Run an incremental update that modifies at least one file and assert that: + - rows are deleted for removed docIds + - rows are inserted/replaced for changed docIds +- [ ] **Unit test: bundle-build dims mismatch hard failure** + - Create two bundle files in the incremental bundle dir: one with embedding length N, one with embedding length N+1. + - Assert build fails (or returns count 0 with a clear reason) rather than “warn and continue”. + +### Additional recommended tests + +- [ ] **Reader fail-closed test:** Provide a DB with `user_version != SCHEMA_VERSION` and confirm: + - forced SQLite backend errors clearly + - auto backend falls back without using SQLite. +- [ ] **Incremental WAL checkpoint test** (if WAL checkpointing is implemented): verify that after incremental update: + - no `*.db-wal` / `*.db-shm` remain (or WAL is truncated to a small size, depending on desired policy). + +### Benchmark / regression testing + +- [ ] **p95 query latency regression guard (fixture-based)** + - Add a small but non-trivial fixture SQLite DB (or build it deterministically during test setup) and run a representative query workload: + - candidate generation (ngrams) + - FTS ranking (if enabled) + - dense vector scoring (if enabled) + - Measure per-query durations and assert p95 stays under a budget (or does not regress beyond a tolerance vs a baseline). + - Keep it deterministic: single-threaded, warm cache (or explicit warm-up iterations), fixed query set, fixed limits. + +--- + +## File-by-file findings and action items + +> This section lists concrete issues and improvement opportunities per reviewed file. +> Items are written as actionable checkboxes; severity tags (P0/P1/P2) are included where appropriate. + +### `src/storage/backend-policy.js` + +- [ ] Clarify threshold semantics for `autoSqliteThresholdChunks` / `autoSqliteThresholdBytes` when set to `0` (current code uses `> 0`, so `0` behaves like “disabled” rather than “always use SQLite”). +- [ ] Consider avoiding expensive `COUNT(*)` scans for auto-selection; store chunk count in a meta table or `index_state.json` and read that instead (or sum `file_manifest.chunk_count`). +- [ ] Consider logging/telemetry: when auto-select declines SQLite due to missing/invalid thresholds, surface that decision (currently it is silent except for return fields). + +### `src/storage/lmdb/schema.js` + +- [ ] Add brief inline documentation describing key-space expectations (which keys must exist for a usable LMDB index). +- [ ] Consider adding a helper to enumerate expected artifact keys for validation tooling (to avoid drift). + +### `src/storage/sqlite/build-helpers.js` + +- [ ] Ensure `vectorConfig.extension.table` / `.column` are always sanitized before being interpolated into SQL (call-site currently depends on the caller to sanitize). +- [ ] Consider making `buildChunkRow(...)` treat empty strings/arrays consistently (e.g., avoid turning `''` into `null` unintentionally for fields where empty-string is meaningful). +- [ ] Consider reducing confusion: `buildChunkRow(...)` returns fields (`signature`, `doc`) that are not inserted into `chunks` but only into `chunks_fts`. + +### `src/storage/sqlite/build/bundle-loader.js` + +- [ ] Ensure loader failures return actionable error messages (bundle path, reason). 
(Current errors are decent; confirm `readBundleFile(...)` includes enough context.) +- [ ] Consider exposing a small “max in-flight bundles” safeguard if worker threads are enabled (to avoid memory spikes on extremely large bundles). + +### `src/storage/sqlite/build/delete.js` + +- [ ] Cache delete statements by chunk size to reduce repeated `db.prepare(...)` overhead when deleting many docIds. +- [ ] Consider supporting a temp table approach (`CREATE TEMP TABLE ids(...)`) if deletion performance becomes a bottleneck for large deletes. +- [ ] Verify that the `vectorDeleteTargets` contract remains consistent across callers (column name `rowid` vs explicit id columns). + +### `src/storage/sqlite/build/from-artifacts.js` + +- [ ] Tighten shard discovery: `listShardFiles(...)` includes `.jsonl` but ingestion reads shards via `readJson(...)`; either: + - restrict token-postings shards to `.json`, or + - add JSONL support for token-postings shards (if they can be JSONL in practice). +- [ ] Consider inserting `dense_meta` inside the same transaction as the first dense-vector batch (atomicity / consistency). +- [ ] For `chunkMeta` ingestion (non-piece path), avoid building a single giant `rows` array in memory if the artifact can be large; use chunked batching as done in `ingestChunkMetaPieces(...)`. +- [ ] Failure cleanup: remove sidecars (`outPath-wal`, `outPath-shm`) as well as `outPath` on failure. + +### `src/storage/sqlite/build/from-bundles.js` + +- [ ] **(P0) Prepare `insertVectorAnn` even when the ANN table already exists** (see around L120). + The “table exists” branch sets `vectorAnnReady = true` but does not prepare the insert statement, so embeddings are not inserted into ANN. +- [ ] **(P0) Make embedding dims mismatch a hard failure.** + Current warning-only behavior (around L197) can produce inconsistent dense vectors. +- [ ] Guard against malformed bundles: `count += result.bundle.chunks.length` should handle missing/invalid `chunks` gracefully (use `?.length || 0`). +- [ ] Remove unused import (`path` is currently imported but not used). +- [ ] Failure cleanup should remove SQLite sidecars, not just the DB file. + +### `src/storage/sqlite/build/incremental-update.js` + +- [ ] **(P0) Prepare `insertVectorAnn` when the ANN table already exists** (see around L240). + Without this, incremental updates delete ANN rows but do not reinsert replacement vectors. +- [ ] Add explicit WAL checkpointing/truncation at the end of a successful update (to keep the DB self-contained and avoid large WAL growth). +- [ ] Consider tightening `isManifestMatch(...)` semantics when hashes are available on only one side (to converge DB manifest quality). +- [ ] Performance: consider `(mode,file,id)` index or other optimization for `getDocIdsForFile(...)` scanning and per-file id lists. +- [ ] Remove (or convert to assertion) the redundant “dims mismatch warn” path inside applyChanges; dims mismatch should already be rejected earlier. + +### `src/storage/sqlite/build/manifest.js` + +- [ ] De-duplicate `conflicts` output (currently can include repeated normalized paths). +- [ ] Consider strict hash preference: if `entry.hash` is present but `dbEntry.hash` is null, treat as mismatch and update DB hash (do not silently match on mtime/size). + +### `src/storage/sqlite/build/pragmas.js` + +- [ ] Consider restoring `journal_mode` (or explicitly checkpointing) after build to ensure “single-file DB” invariants if the project expects that. +- [ ] Consider surfacing pragma failures (currently swallowed silently). 
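+The checkpoint step itself is small; a sketch, assuming better-sqlite3:
+
+```js
+import Database from 'better-sqlite3';
+
+const db = new Database('index.db'); // path illustrative
+// ... build or incremental update work happens here ...
+db.pragma('wal_checkpoint(TRUNCATE)'); // fold WAL contents back into the main file
+// Optional, only if the project standardizes on single-file DBs:
+// db.pragma('journal_mode = DELETE');
+db.close();
+```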
+ +### `src/storage/sqlite/build/statements.js` + +- [ ] Consider adding `idx_chunks_file_id` (see schema/index alignment notes). +- [ ] Reduce confusion: `buildChunkRowWithMeta(...)` populates fields not present in the schema (e.g., `churn_added`, `churn_deleted`, `churn_commits`). Either: + - add these columns to the schema if they are intended, or + - stop emitting them to avoid “looks supported but isn’t”. + +### `src/storage/sqlite/build/validate.js` + +- [ ] Consider validating ANN invariants when ANN is enabled: + - `dense_vectors_ann` row count should match `dense_vectors` row count for the mode (or at least have no orphans). +- [ ] Consider making full `integrity_check` optional for very large DBs (it can be expensive); provide a quick-check mode and/or configurable validation levels. + +### `src/storage/sqlite/build/vocab.js` + +- [ ] Consider caching prepared statements by chunk size (similar to delete/vocab fetch) to reduce repeated SQL compilation overhead. +- [ ] Error messaging: if `missing.length` is huge, cap printed missing values in the thrown error and include only a sample plus counts (to avoid megabyte-scale exception strings). + +### `src/storage/sqlite/incremental.js` + +- [ ] Document the on-disk incremental manifest contract and failure modes (missing manifest, conflicts, ratio guard). +- [ ] Consider adding a small helper to validate the incremental manifest shape early, with clearer error output. + +### `src/storage/sqlite/schema.js` + +- [ ] Consider adding `(mode,file,id)` index for maintenance queries. +- [ ] Ensure docs (`docs/sqlite-index-schema.md`) stay in sync when schema changes. + +### `src/storage/sqlite/utils.js` + +- [ ] `normalizeFilePath(...)` returns the input unchanged when it is not a string; consider returning `null` instead to reduce accidental “undefined as key” behavior. +- [ ] `replaceSqliteDatabase(...)`: consider logging when fallback rename/remove paths are taken (debuggability of replacement failures). + +### `src/storage/sqlite/vector.js` + +- [ ] `toVectorId(...)` is effectively “coerce to BigInt”; consider renaming to reflect that (e.g., `toSqliteRowidInt64(...)`) to avoid implying a non-trivial mapping. +- [ ] Consider making quantization parameters (`minVal`, `maxVal`) configurable or derived from embedding model metadata (avoid silent saturation if embeddings are out of range). + +--- + +### Tooling files + +#### `tools/build-lmdb-index.js` + +- [ ] Consider a `--validate` option that checks required artifacts exist before writing LMDB (fail early, clearer errors). +- [ ] Consider writing a small LMDB “manifest” key listing which artifacts were written (enables tool-side validation and reduces drift). + +#### `tools/build-sqlite-index.js` + +- [ ] Consider exit codes and messaging consistency across build modes (full rebuild vs incremental vs skipped). + +#### `tools/build-sqlite-index/cli.js` + +- [ ] Consider validating incompatible flag combinations early (e.g., `--bundle-workers` without a bundle dir). +- [ ] Consider adding `--no-compact` / `--compact` clarity in CLI help (if not already covered elsewhere). + +#### `tools/build-sqlite-index/index-state.js` + +- [ ] De-duplicate `updateIndexStateManifest(...)` with the LMDB equivalent; extract to a shared helper module. +- [ ] Consider including schema version and build mode (full vs incremental) in `index_state.json` for observability. 
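+Stepping back to the `validate.js` item above, the ANN invariant can be checked cheaply. A sketch, assuming the plain-table `dense_vectors_ann` layout used by the CI-friendly mock (a real virtual table may need per-mode or orphan queries instead):
+
+```js
+export function checkAnnConsistency(db) {
+  const dense = db.prepare('SELECT COUNT(*) AS n FROM dense_vectors').get().n;
+  const ann = db.prepare('SELECT COUNT(*) AS n FROM dense_vectors_ann').get().n;
+  if (dense !== ann) {
+    throw new Error(`ANN table out of sync: dense_vectors=${dense}, dense_vectors_ann=${ann}`);
+  }
+}
+```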
+ +#### `tools/build-sqlite-index/run.js` + +- [ ] Ensure `stopHeartbeat()` is always invoked via `try/finally` (avoid leaking an interval on error when `exitOnError=false`). +- [ ] After incremental updates, consider forcing WAL checkpoint/truncate (see incremental update section). +- [ ] Consider making the “incremental fallback to rebuild” reason more explicit in output (currently logged, but could include key stats: changedFiles, deletedFiles, ratio). + +#### `tools/build-sqlite-index/temp-path.js` + +- [ ] Consider a “same filesystem guarantee” note: temp DB path must be on same filesystem for atomic rename (current implementation uses same directory, which is good; document this). + +#### `tools/clean-artifacts.js` + +- [ ] Consider adding a `--dry-run` option that prints what would be deleted without deleting it (safety for new users). + +#### `tools/compact-sqlite-index.js` + +- [ ] If vector extension is enabled but cannot be loaded, consider warning that compaction may drop ANN acceleration (and suggest remediation, e.g. rerun embeddings rebuild once extension is available). +- [ ] Consider recording pre/post compaction stats into `index_state.json` (bytes, row counts) for observability. + +#### `tools/download-extensions.js` + +- [ ] Consider streaming zip extraction rather than buffering each entry into memory (`adm-zip` forces buffer extraction; if large binaries become common, consider a streaming zip library). +- [ ] Consider setting file permissions for extracted binaries explicitly per-platform conventions (e.g., preserve exec bit if needed, although shared libraries typically do not require it). + +#### `tools/index-validate.js` + +- [ ] Consider including actionable remediation hints per failure mode (e.g., “run build-index”, “run build-sqlite-index”, “run download-extensions”). + +#### `tools/report-artifacts.js` + +- [ ] Consider clarifying the units in output when printing both formatted size and raw bytes (currently raw bytes are printed in parentheses without a label). + +#### `tools/vector-extension.js` + +- [ ] Consider keying `loadCache` by (db, config) rather than only db (avoids surprising behavior if config changes during a long-lived process). +- [ ] Consider restoring prior `trusted_schema` value after `ensureVectorTable(...)` (minimize global DB setting changes). + +#### `tools/verify-extensions.js` + +- [ ] Consider adding a quick “smoke query” that verifies the ANN table can be created and queried (optional). + +--- + +### Test files + +#### `tests/backend-policy.js` + +- [ ] Add coverage for threshold edge cases (e.g., `autoSqliteThresholdChunks=0` semantics). +- [ ] Add a test case where SQLite exists but artifact metadata cannot be read (ensure fallback behavior is correct and reason is surfaced). + +#### `tests/compact-pieces.js` + +- [ ] No issues noted (acts as a compaction functional check for artifact pieces). + +#### `tests/lmdb-backend.js` + +- [ ] Consider adding schema version mismatch coverage (fail closed when schema version differs). + +#### `tests/lmdb-corruption.js` + +- [ ] Consider asserting on error message content to ensure corruption reporting remains actionable. + +#### `tests/lmdb-report-artifacts.js` + +- [ ] Consider adding a test for “missing required key” vs “corruption” differentiation (if validation tooling can distinguish). + +#### `tests/retrieval-backend-policy.js` + +- [ ] Add coverage for schema version mismatch fallback (once reader-side user_version check exists). 
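+A sketch of that fallback test (the `openSqliteIndex` gate and its import path are hypothetical until the reader-side check lands):
+
+```js
+import test from 'node:test';
+import assert from 'node:assert/strict';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import Database from 'better-sqlite3';
+// Hypothetical fail-closed gate sketched earlier in this review.
+import { openSqliteIndex } from '../src/retrieval/open-sqlite-index.js';
+
+test('reader falls back when user_version does not match', () => {
+  const dbPath = join(tmpdir(), `mismatch-${Date.now()}.db`);
+  const db = new Database(dbPath);
+  db.pragma('user_version = 9999'); // deliberately wrong schema version
+  db.close();
+
+  assert.equal(openSqliteIndex(dbPath, { forced: false }), null);
+  assert.throws(() => openSqliteIndex(dbPath, { forced: true }), /schema mismatch/i);
+});
+```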
+ +#### `tests/smoke-sqlite.js` + +- [ ] Add coverage for `user_version` mismatch behavior once implemented. + +#### `tests/sqlite-ann-extension.js` + +- [ ] Add a CI-friendly companion test that does not require the real extension binary (mock vectorConfig approach described above) to ensure ANN insert/delete invariants are enforced in CI. + +#### `tests/sqlite-ann-fallback.js` + +- [ ] Consider adding explicit coverage that fallback ANN search never returns out-of-range docIds (robustness guard). + +#### `tests/sqlite-auto-backend.js` + +- [ ] Add a test that covers the “SQLite present but too small” path + verifies reason reporting is stable. + +#### `tests/sqlite-build-delete.js` + +- [ ] Add coverage for deleting from an ANN table using `rowid` column and BigInt inputs (ensures `toVectorId(...)` conversion remains correct). + +#### `tests/sqlite-build-indexes.js` + +- [ ] Add coverage for any new maintenance index (e.g., `(mode,file,id)`), if introduced. + +#### `tests/sqlite-build-manifest.js` + +- [ ] Add a test for “manifest has hash but DB does not” semantics (once tightened). + +#### `tests/sqlite-build-vocab.js` + +- [ ] Add stress coverage for token sets larger than SQLite’s `IN` limit (ensuring chunking logic remains correct). + +#### `tests/sqlite-bundle-missing.js` + +- [ ] Add bundle-shape validation coverage (missing `chunks` field should not crash build loop). + +#### `tests/sqlite-cache.js` + +- [ ] No issues noted (validates cache path behavior / read path). + +#### `tests/sqlite-chunk-id.js` + +- [ ] No issues noted (docId/chunkId behavior). + +#### `tests/sqlite-compact.js` + +- [ ] Consider adding coverage for compaction with ANN enabled but extension mocked (ensures dense_vectors_ann remains consistent after compaction). + +#### `tests/sqlite-incremental-no-change.js` + +- [ ] Consider verifying `index_state.json` is unchanged (or only updated timestamp changes), depending on desired policy. + +#### `tests/sqlite-incremental.js` + +- [ ] Add coverage for doc-id reuse behavior (free-list) to prevent accidental regression to “always append”. + +#### `tests/sqlite-index-state-fail-closed.js` + +- [ ] Consider adding coverage that “pending” flips back to false on successful build (already implied but could be explicit). + +#### `tests/sqlite-missing-dep.js` + +- [ ] No issues noted (validates better-sqlite3 missing behavior). + +#### `tests/sqlite-sidecar-cleanup.js` + +- [ ] Add incremental-update sidecar cleanup coverage if WAL checkpointing/truncation is implemented. + +--- + +### Documentation files + +#### `docs/contracts/sqlite.md` + +- [ ] Explicitly document the `user_version` contract and the “fail closed / rebuild on mismatch” behavior. +- [ ] Ensure the list of required tables aligns with the actual reader/build code paths (and clearly separate “core” vs “optional” tables). + +#### `docs/external-backends.md` + +- [ ] Consider updating to reflect current backend-policy behavior (auto selection thresholds, forced backend semantics). + +#### `docs/model-compare-sqlite.json`, `docs/parity-sqlite-ann.json`, `docs/parity-sqlite-fts-ann.json` + +- [ ] Ensure these reports are either generated artifacts (and documented as such) or kept in sync with the current schema/tooling versions (otherwise they can mislead). + +#### `docs/references/dependency-bundle/deps/better-sqlite3.md` + +- [ ] Confirm documented behavior matches current runtime expectations (particularly around extension loading, platform binaries, and supported SQLite features). 
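+A smoke check covering both the extension-loading behavior above and the `verify-extensions` “smoke query” idea (the binary path is illustrative; `vec_version()` is sqlite-vec's version function):
+
+```js
+import Database from 'better-sqlite3';
+
+const db = new Database(':memory:');
+try {
+  // Path is the platform-specific output of tools/download-extensions.js (illustrative).
+  db.loadExtension('./extensions/vec0');
+  // Any cheap query proves the load worked.
+  const row = db.prepare('SELECT vec_version() AS version').get();
+  console.log('sqlite-vec loaded:', row.version);
+} catch (err) {
+  console.warn('ANN extension unavailable; falling back to non-ANN search:', err.message);
+} finally {
+  db.close();
+}
+```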
+ +#### `docs/sqlite-ann-extension.md` + +- [ ] Document the invariant that `dense_vectors_ann` must remain consistent with `dense_vectors` (no orphans; same cardinality per mode when enabled). +- [ ] Document how incremental updates maintain the ANN table (and note limitations when extension is not available). + +#### `docs/sqlite-compaction.md` + +- [ ] Clarify how compaction interacts with the ANN extension table (and the remediation path if ANN is temporarily unavailable during compaction). + +#### `docs/sqlite-incremental-updates.md` + +- [ ] Update doc-id behavior description to match implementation (per-file id reuse + free-list reuse rather than always appending). +- [ ] Document the ratio guard behavior and fallback to full rebuild more explicitly. +- [ ] Document WAL/sidecar expectations for incremental updates (single-file vs WAL sidecars). + +#### `docs/sqlite-index-schema.md` + +- [ ] Reconfirm schema matches `SCHEMA_VERSION = 7` (columns, indexes, optional extension table). +- [ ] If `(mode,file,id)` index is added, document it as a maintenance/performance index. + +--- + +## Exit criteria for this review section + +The following items should be completed to consider “Review Section 7” fully addressed: + +- [ ] ANN insert-preparation bug fixed in both bundle-build and incremental-update code paths. +- [ ] Reader-side schema version fail-closed behavior implemented and tested. +- [ ] Bundle-build embedding dims mismatch becomes a hard failure (with tests). +- [ ] WAL/sidecar policy is explicitly decided, implemented consistently, and documented (at minimum for incremental updates). +- [ ] At least one CI-friendly test covers ANN table sync invariants without requiring a real extension binary. +- [ ] At least one fixture-based p95 latency regression test is added (or an equivalent deterministic perf guard). + +--- + +--- + +# Phase 34 — Phase 2/3/4/5/6 verification gates + +**Objective:** run and gate the regression tests that confirm Phase 2 contract alignment, Phase 3 chunking invariants, Phase 4 retrieval semantics, Phase 5 durability, and Phase 6 embeddings correctness. + +## 34.1 CLI flag removal and error handling +- [ ] `tests/search-removed-flags.js` + - [ ] Failure: Expected actionable error for --human. + - [ ] Log: `logs/phase-22/search-removed-flags.log:1` +- [ ] `tests/search-missing-flag-values.js` + - [ ] Failure: Expected missing value message for --type. + - [ ] Log: `logs/phase-22/search-missing-flag-values.log:1` + +## 34.10 Phase 9 CI gating + flaky test recovery +- [ ] `tests/script-coverage.js` + - [ ] Failure: Error: unsafe tar entry: C:/Users/sneak/Development/PairOfCleats_CODEX/tests/.cache/download-extensions/tar/.tmp/extract-1768204937568/vec0.dll + - [ ] Log: `tests/.logs/2026-01-12T08-02-14-028Z/download-extensions-test.attempt-3.log:15` + +## 34.11 Phase 10 modularization regression sweep +- [ ] `tests/search-help.js` + - [ ] Failure: Help output missing flag: --calls. + - [ ] Log: `logs/phase-22/search-help.log:1` + +## 34.12 Phase 11 docs/help parity checks +- [ ] `tests/search-help.js` + - [ ] Failure: Help output missing flag: --calls. + - [ ] Log: `logs/phase-22/search-help.log:1` +- [ ] `tests/search-removed-flags.js` + - [ ] Failure: Expected actionable error for --human. + - [ ] Log: `logs/phase-22/search-removed-flags.log:1` + +## 34.29 file processor skip +- [ ] `tests/file-processor/skip.test.js` + - [ ] Failure: Expected binary buffer to skip with reason=binary. 
+ - [ ] Log: `logs/phase-22/file-processor-skip.log:1` + +## 34.32 lang js chunking +- [ ] `tests/lang/js-chunking.test.js` + - [ ] Failure: Missing exported function chunk (alpha). + - [ ] Log: `logs/phase-22/lang-js-chunking.log:1` + +## 34.34 lang js relations +- [ ] `tests/lang/js-relations.test.js` + - [ ] Failure: Missing exports for run/default: []. + - [ ] Log: `logs/phase-22/lang-js-relations.log:1` + +## 34.38 language registry collectors +- [ ] `tests/language-registry/collectors.test.js` + - [ ] Failure: dockerfile mismatch: ["node:18"] !== ["base","node:18"]. + - [ ] Log: `logs/phase-22/language-registry-collectors.log:1` + +**Exit criteria** +- [ ] All verification tests pass. + +--- diff --git a/HISTORIC_COMPLETED_PHASES.md b/HISTORIC_COMPLETED_PHASES.md new file mode 100644 index 000000000..0b7c4be95 --- /dev/null +++ b/HISTORIC_COMPLETED_PHASES.md @@ -0,0 +1,2918 @@ +# Completed phases + +# Phase 1 — Truth Alignment, Spec Freeze, and Correctness Harness + +**Objective:** Establish the authoritative definition of “what the tool does,” then encode it into tests, validations, and reproducible fixtures so every subsequent phase is measurable. + +## 1.1 Feature truth table (claims → evidence → tests → limitations) + +### Dependency guidance (best choices) +- `ajv` — model the truth table itself as a JSON Schema and **validate it in CI** so the “claims → evidence → tests → limits” ledger can’t silently drift. + - Compile schemas once at startup (`new Ajv({ strict: true, allErrors: true })`), not per file/run. +- `jsonc-parser` — if feature flags or config files are JSONC, use offset-aware parsing (`getLocation`, `parseTree`) so you can attach *precise* diagnostics to a feature claim. +- `semver` — version every claim bundle and feature gate using semver ranges rather than ad-hoc strings. + +- [x] Build `docs/truth-table.md` that covers: + - [x] Build modes: code / prose / records / mixed + - [x] Chunking rules (by language and file type) + - [x] Tokenization semantics (code vs prose) + - [x] Index artifact outputs (memory + sqlite + shard formats) + - [x] Search semantics (filters, scoring, explain) + - [x] Enrichment outputs (risk, types, relations, git) + - [x] Service/API/MCP behavior (contracts, stability expectations) +- [x] For each claim: + - [x] link to implementation module(s) + - [x] list configuration toggles + - [x] list known limitations / failure modes + - [x] identify a fixture-based test that demonstrates it + +## 1.2 Acceptance-test fixtures and golden expectations + +### Dependency guidance (best choices) +- `seedrandom` — make all randomized fixture generation deterministic (seed = repo hash + test name), so flaky “random repos” never block correctness gates. +- `xxhash-wasm` — use fast, stable hashing to derive fixture IDs and to detect unintended fixture drift (hash raw inputs + normalized outputs). 
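+A sketch of how the two combine (inputs are placeholders; `h64ToString` comes from an initialized `xxhash-wasm` instance):
+
+```js
+import seedrandom from 'seedrandom';
+import xxhash from 'xxhash-wasm';
+
+// Initialize the WASM hasher once and reuse it (cheap to call, costly to create).
+const { h64ToString } = await xxhash();
+
+const repoHash = 'abc123'; // placeholder: hash of the fixture repo inputs
+const testName = 'small-mixed-repo'; // placeholder: the test deriving the fixture
+
+const fixtureId = h64ToString(`${repoHash}:${testName}`);
+const rng = seedrandom(fixtureId);
+
+// Every "random" choice in fixture generation flows from the seeded rng.
+const pick = (items) => items[Math.floor(rng() * items.length)];
+```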
+ +- [x] Add fixture repos representing: + - [x] small: <1k files mixed code/prose + - [x] medium: 5k–50k files with mixed languages + - [x] multi-language mixed-file repo (HTML+JS+CSS, markdown code fences, etc) +- [x] Define “must-hit” retrieval assertions: + - [x] symbol lookup (name/kind) + - [x] structural filters (e.g., `--kind`, `--signature`, `--decorator`) + - [x] risk filter behavior (even if basic initially) + - [x] type inference visibility (even if minimal initially) + +## 1.3 Tool invocation correctness (install-root vs repo-root) + +### Dependency guidance (best choices) +- `execa` — standardize all subprocess calls (git, node, pnpm) with robust quoting, streaming output capture, timeouts, and non-throwing exit handling. + - Prefer `reject: false` and check `exitCode` explicitly; capture `stdout`, `stderr`, and combined `all` output. +- `semver` — validate runtime/tool versions (Node, npm/pnpm, optional native deps) and emit actionable errors early. + +- [x] Implement and require a single resolver: + - [x] `resolveToolRoot()` (ESM-safe, based on `import.meta.url`) + - [x] `resolveRepoRoot()` (explicit > inferred; deterministic) +- [x] Convert *all* scripts that spawn other scripts/tools to use toolRoot resolution. +- [x] Add tests that run commands from a directory **outside** repoRoot. + +## 1.4 Determinism and reproducibility baseline + +### Dependency guidance (best choices) +- `seedrandom` — seed any randomized ordering (file traversal, shard selection, benchmark query generation). +- `xxhash-wasm` — deterministic hashing for chunk IDs and segment IDs; avoid crypto hashes unless explicitly required. +- `msgpackr` — if you snapshot intermediate artifacts for determinism tests, prefer MsgPack for speed and stable binary outputs. + +- [x] Ensure build artifacts include: + - [x] tool version, node version, OS, effective config hash + - [x] repo provenance (git commit + dirty flag when available) +- [x] Establish a deterministic test mode: + - [x] deterministic embedding stub (by default in tests) + - [x] deterministic ordering everywhere (files, shards, chunk IDs) + +**Deliverables** +- `docs/truth-table.md` +- fixture repos + goldens +- installed-package E2E test suite + +**Exit criteria** +- Tier-1 E2E tests pass reliably (Linux) and are reproducible locally. +- “Truth table” coverage: every user-visible feature claim has a test or explicit limitation. + +--- + +# Phase 2 — Artifact Contracts, Metadata Schema v2, and Atomic Build Durability + +**Objective:** Make artifacts and metadata self-describing, versioned, validated, and crash-safe. + +## 2.1 Artifact contract (schema + invariants) + +### Dependency guidance (best choices) +- `ajv` — enforce artifact schema invariants (index file, shard manifests, metadata v2, benchmark outputs) as a hard gate. + - Consider Ajv standalone validation for hot-path validation during large builds (generate validators once). +- `msgpackr` — use for compact, fast serialization of intermediate shard artifacts (especially metadata-rich chunks). + - Prefer a versioned envelope (magic bytes + schema version + codec version) so upgrades are safe. +- `fflate` — compress large artifacts (shards, posting lists) with streaming APIs to avoid event-loop stalls. +- `xxhash-wasm` — compute stable content hashes and IDs efficiently; cache initialized WASM instance and reuse. +- `roaring-wasm` (optional but high ROI) — represent posting lists and large ID sets as compressed bitmaps for fast intersection/union. 
+ - Explicitly call `dispose()` on bitmaps to avoid WASM memory growth. +- `better-sqlite3` — if SQLite is a backend, standardize on prepared statements + WAL mode + transactional writes for durability. +- `lmdb` (optional) — consider as an alternative backend for very high write throughput; gate behind optional dependency/feature flag (install friction). + +- [x] Define/refresh `docs/artifact-contract.md`: + - [x] every artifact file + format + version + - [x] required fields + optional fields + - [x] invariants (cross-artifact) and validation rules +- [x] Strengthen `tools/index-validate`: + - [x] schema validation per artifact + - [x] cross checks: chunk IDs, file references, postings references, embedding references + - [x] human remediation hints for each failure class + +## 2.2 **Metadata schema v2** (rich per-chunk metadata contract) + +### Dependency guidance (best choices) +- `ajv` — treat **Metadata Schema v2** as the canonical contract. + - Encode “required when …” rules as schema + additional runtime checks (Ajv can’t express every cross-field invariant cleanly). +- `semver` — version metadata schema independently from the index container version; negotiate reader compatibility. + +This is the foundation for advanced rich metadata, risk flows, and type inference. + +- [x] Create `docs/metadata-schema-v2.md` defining: + - [x] stable core: `chunkId`, `file`, `segment`, `range`, `lang`, `ext`, `kind`, `name` + - [x] provenance: `generatedBy`, `tooling`, `parser`, versions + - [x] doc metadata: signature, docstring/doc-comments, annotations, decorators/attributes + - [x] control-flow summary: branches/loops/returns/throws/awaits/async/generator + - [x] dataflow summary: reads/writes/mutates/aliases (local first; later cross-file) + - [x] dependencies: imports, referenced modules, includes + - [x] risk metadata: sources/sinks/sanitizers/flows (+ confidence) + - [x] type metadata: declared/inferred/tooling (+ confidence) + - [x] embedded metadata: segment parent, embedded language, embedding context +- [x] Define compatibility rules with existing `docmeta`: + - [x] migration mapping from current fields to v2 fields + - [x] deprecation schedule for legacy keys + +## 2.3 Atomic build and “current” pointer + +### Dependency guidance (best choices) +- `better-sqlite3` — implement “current pointer” and multi-stage build state updates as **atomic transactions**. + - Use WAL journaling; keep write transactions short and bounded. +- `fflate` — if “current pointer” points at compressed shard bundles, stream compress/decompress rather than buffering whole bundles. + +- [x] Build to staging directory `builds//...` (default format: `YYYYMMDDTHHMMSSZ__`) +- [x] Validate staging artifacts before promoting to “current” +- [x] Ensure readers never see partial outputs: + - [x] atomic rename/swap semantics + - [x] sqlite temp file + rename + - [x] shard manifest atomicity + +## 2.4 Durable state machine for multi-stage builds + +### Dependency guidance (best choices) +- `better-sqlite3` / `lmdb` — persist the build state machine (stage, shard progress, error ledger, tool versions, input manifest hashes) in a durable store. + - Prefer append-only event logs + periodic snapshots rather than in-place mutation only. +- `pino` — log state transitions as structured events (runId, shardId, stage, timings, error category). +- `prom-client` — expose state machine counters/histograms for throughput and failure rates (per stage, per language). 
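+A sketch wiring the three together (table and metric names are illustrative):
+
+```js
+import Database from 'better-sqlite3';
+import pino from 'pino';
+import client from 'prom-client';
+
+const log = pino();
+const db = new Database('build-state.db'); // path illustrative
+db.pragma('journal_mode = WAL');
+db.exec(`CREATE TABLE IF NOT EXISTS build_events (
+  ts INTEGER NOT NULL, run_id TEXT, shard_id TEXT, stage TEXT, event TEXT
+)`);
+
+const transitions = new client.Counter({
+  name: 'build_stage_transitions_total',
+  help: 'Build state machine transitions',
+  labelNames: ['stage', 'event']
+});
+const insertEvent = db.prepare('INSERT INTO build_events VALUES (?, ?, ?, ?, ?)');
+
+// Append-only event log + structured log line + metric, all from one call site.
+export function recordTransition({ runId, shardId, stage, event }) {
+  insertEvent.run(Date.now(), runId, shardId, stage, event);
+  transitions.inc({ stage, event });
+  log.info({ runId, shardId, stage, event }, 'build state transition');
+}
+```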
+ +- [x] Create a build state model with explicit phases: + - [x] discovery → preprocessing → stage1 → stage2 → stage3 → validation → promote +- [x] Ensure stage2/stage3 jobs cannot remain “running forever”: + - [x] heartbeat timestamps: persist `lastHeartbeatAt` every **30s** while a job is `running` + - [x] stale job detection: consider a job stale if `now - lastHeartbeatAt` exceeds: + - [x] **10 minutes** for stage2 (enrichment; mostly CPU + local IO) + - [x] **15 minutes** for stage3 (embeddings; can be longer-running, but heartbeat is independent of work duration) + - [x] recovery policy: mark stale jobs as `failed` and re-queue up to **2 retries** (default) with exponential backoff (**2m**, **10m**) + - [x] resumable checkpoints: persist progress at least every **1,000 files** or **120 seconds** (whichever comes first) + +**Deliverables** +- `docs/artifact-contract.md` +- `docs/metadata-schema-v2.md` +- hardened `index-validate` +- atomic build/promotion implementation + tests + +**Exit criteria** +- Killing the process mid-build never corrupts last-known-good index. +- Any index can be validated deterministically; schema v2 is published and enforced. + +--- + +# Phase 3 — Generalized Hybrid Chunking and Prose Extraction (Correctness) + +**Objective:** Make file segmentation and chunking correct for real-world mixed files (embedded languages) and ensure comments are consistently extracted and searchable as prose when desired. + +## 3.1 Introduce a **SegmentedDocument** pipeline + +### Dependency guidance (best choices) +- `file-type` + `istextorbinary` — aggressively avoid parsing binaries; detect via magic bytes first, then fallback heuristics. +- `chardet` + `iconv-lite` — only attempt encoding detection/decoding when UTF-8 decoding fails; preserve byte offsets by tracking decoding strategy. +- `fdir` — fast directory traversal (significantly faster than naive `fs.readdir` recursion). +- `ignore` — implement `.gitignore` semantics correctly (and cache per-directory ignore matchers). +- `picomatch` — precompile include/exclude globs for the segment discovery pre-pass (don’t recompile per file). +- `linguist-languages` — unify extension → languageId mapping, but keep project overrides (repo-local config) higher priority. + +- [x] Define a new internal representation: + - [x] `FileDocument { file, bytes, text, ext, langHint }` + - [x] `Segment { segmentId, type: code|prose|config|comment|embedded, languageId, start, end, parentSegmentId?, meta }` + - [x] `Chunk { chunkId, segmentId, start, end, name, kind, metaV2 }` +- [x] Replace “single chunker per file” with: + 1) segment discovery + 2) per-segment chunking + 3) chunk merging + stable ordering + overlap rules + +## 3.2 Mixed-file support coverage (beyond HTML) + +### Dependency guidance (best choices) +- Markdown / MDX / prose containers: + - `micromark` — extract **exact byte ranges** of headings, paragraphs, and fenced code blocks (language from info string). + - `yaml` + `smol-toml` + `jsonc-parser` — parse frontmatter blocks into config segments with node/range provenance. + - `@mdx-js/mdx` — for MDX, compile with plugins disabled by default; enable remark/rehype plugins only when requested (performance). +- Web component containers: + - `@vue/compiler-sfc` — use `parse()` to get descriptor blocks and their `loc`/range; treat template/script/style as segments and preserve ordering. 
+  - `svelte` — use compiler `parse()`; extract `<script>`/`<style>` blocks and template markup as segments, preserving exact ranges.
diff --git a/src/map/isometric/client/controls.js b/src/map/isometric/client/controls.js
new file mode 100644
index 000000000..4e785c50f
--- /dev/null
+++ b/src/map/isometric/client/controls.js
@@ -0,0 +1,322 @@
+import { state } from './state.js';
+import { clamp } from './utils.js';
+import { applyHighlights, setSelection, openSelection } from './selection.js';
+
+export const initControls = () => {
+  const {
+    THREE,
+    dom,
+    renderer,
+    camera,
+    lockIsometric,
+    getViewport,
+    groundPlane,
+    lineResolution,
+    controlDefaults,
+    controls,
+    flowWaveLayers,
+    flowWaveTotal,
+    visuals,
+    visualDefaults
+  } = state;
+
+  const pointer = new THREE.Vector2();
+  const raycaster = new THREE.Raycaster();
+  const zoomRaycaster = new THREE.Raycaster();
+
+  const getPointerNdc = (event) => {
+    const rect = renderer.domElement.getBoundingClientRect();
+    const x = ((event.clientX - rect.left) / rect.width) * 2 - 1;
+    const y = -((event.clientY - rect.top) / rect.height) * 2 + 1;
+    return { x, y, rect };
+  };
+
+  const getPlanePointFromNdc = (ndc) => {
+    if (!ndc) return null;
+    zoomRaycaster.setFromCamera({ x: ndc.x, y: ndc.y }, camera);
+    const point = new THREE.Vector3();
+    if (zoomRaycaster.ray.intersectPlane(groundPlane, point)) return point;
+    return null;
+  };
+
+  const onPointer = (event) => {
+    const ndc = getPointerNdc(event);
+    pointer.x = ndc.x;
+    pointer.y = ndc.y;
+    raycaster.setFromCamera(pointer, camera);
+    const hits = raycaster.intersectObjects([...state.memberMeshes, ...state.fileMeshes]);
+    const target = hits.length ? hits[0].object : null;
+    setSelection(target);
+  };
+
+  let dragging = false;
+  let dragMoved = false;
+  let lastPointer = { x: 0, y: 0 };
+
+  const startDrag = (event) => {
+    dragging = true;
+    dragMoved = false;
+    lastPointer = { x: event.clientX, y: event.clientY };
+  };
+
+  const moveDrag = (event) => {
+    if (!dragging) return;
+    const dx = event.clientX - lastPointer.x;
+    const dy = event.clientY - lastPointer.y;
+    if (Math.abs(dx) + Math.abs(dy) > 1) dragMoved = true;
+    lastPointer = { x: event.clientX, y: event.clientY };
+    const ndc = getPointerNdc(event);
+    const rect = ndc.rect;
+    if (!rect.width || !rect.height) return;
+    const viewWidth = (camera.right - camera.left) / camera.zoom;
+    const viewHeight = (camera.top - camera.bottom) / camera.zoom;
+    const unitsX = viewWidth / rect.width;
+    const unitsZ = viewHeight / rect.height;
+    const panSensitivity = controls.panSensitivity || controlDefaults.panSensitivity;
+    const rot = Math.PI / 4;
+    const cos = Math.cos(rot);
+    const sin = Math.sin(rot);
+    const dragForward = -dy;
+    const dragSide = dx;
+    const moveX = (dragForward * cos - dragSide * sin) * unitsX * panSensitivity;
+    const moveZ = (dragForward * sin + dragSide * cos) * unitsZ * panSensitivity;
+    camera.position.x += moveX;
+    camera.position.z += moveZ;
+    lockIsometric();
+  };
+
+  const updateHover = (event) => {
+    if (dragging) return;
+    const ndc = getPointerNdc(event);
+    pointer.x = ndc.x;
+    pointer.y = ndc.y;
+    raycaster.setFromCamera(pointer, camera);
+    const hits = raycaster.intersectObjects([...state.memberMeshes, ...state.fileMeshes]);
+    const nextHover = hits.length ?
hits[0].object : null; + if (nextHover !== state.hoveredMesh) { + state.hoveredMesh = nextHover; + applyHighlights(); + } + }; + + const endDrag = () => { + dragging = false; + }; + + renderer.domElement.addEventListener('pointerdown', startDrag); + window.addEventListener('pointermove', moveDrag); + window.addEventListener('pointerup', endDrag); + renderer.domElement.addEventListener('pointerleave', endDrag); + renderer.domElement.addEventListener('pointermove', updateHover); + renderer.domElement.addEventListener('pointerleave', () => { + state.hoveredMesh = null; + applyHighlights(); + }); + + renderer.domElement.addEventListener('click', (event) => { + if (dragMoved) { + dragMoved = false; + return; + } + onPointer(event); + }); + renderer.domElement.addEventListener('dblclick', (event) => { + if (dragMoved) { + dragMoved = false; + return; + } + onPointer(event); + openSelection(); + }); + + let focused = false; + dom.app.addEventListener('pointerdown', () => { + focused = true; + dom.app.focus(); + }); + window.addEventListener('blur', () => { focused = false; }); + + const keys = {}; + window.addEventListener('keydown', (event) => { + if (!focused) return; + keys[event.code] = true; + }); + window.addEventListener('keyup', (event) => { + if (!focused) return; + keys[event.code] = false; + }); + + const velocity = new THREE.Vector2(0, 0); + + const updateCamera = (dt) => { + const wasd = controls.wasd || controlDefaults.wasd; + const accel = wasd.acceleration || controlDefaults.wasd.acceleration; + const maxSpeed = wasd.maxSpeed || controlDefaults.wasd.maxSpeed; + const drag = wasd.drag || controlDefaults.wasd.drag; + const sensitivity = wasd.sensitivity || controlDefaults.wasd.sensitivity; + + if (keys.KeyW) velocity.y -= accel * dt; + if (keys.KeyS) velocity.y += accel * dt; + if (keys.KeyA) velocity.x += accel * dt; + if (keys.KeyD) velocity.x -= accel * dt; + + velocity.x -= velocity.x * drag * dt; + velocity.y -= velocity.y * drag * dt; + velocity.x = Math.max(-maxSpeed, Math.min(maxSpeed, velocity.x)); + velocity.y = Math.max(-maxSpeed, Math.min(maxSpeed, velocity.y)); + + const rot = Math.PI / 4; + const cos = Math.cos(rot); + const sin = Math.sin(rot); + const moveX = (velocity.y * cos - velocity.x * sin) * dt * sensitivity * 0.005; + const moveZ = (velocity.y * sin + velocity.x * cos) * dt * sensitivity * 0.005; + camera.position.x += moveX; + camera.position.z += moveZ; + lockIsometric(); + }; + + let zoomVelocity = 0; + let zoomPointer = { x: 0, y: 0 }; + const onWheel = (event) => { + event.preventDefault(); + const zoomSensitivity = Number.isFinite(controls.zoomSensitivity) + ? controls.zoomSensitivity + : controlDefaults.zoomSensitivity; + const rawDelta = Number.isFinite(event.deltaY) ? event.deltaY : 0; + const deltaModeScale = event.deltaMode === 1 ? 18 : (event.deltaMode === 2 ? 360 : 1); + const delta = -rawDelta * deltaModeScale * 0.05; + const ndc = getPointerNdc(event); + zoomPointer = { x: ndc.x, y: ndc.y }; + const direction = Math.sign(delta); + const velocityDir = Math.sign(zoomVelocity); + const momentumBoost = Math.min(6, Math.abs(zoomVelocity) * 0.6); + const repeatBoost = direction !== 0 && direction === velocityDir ? 
1 + momentumBoost : 1; + zoomVelocity += delta * zoomSensitivity * (2 + repeatBoost); + }; + renderer.domElement.addEventListener('wheel', onWheel, { passive: false }); + + let lastTime = performance.now(); + let lastPulseUpdate = 0; + const animate = () => { + requestAnimationFrame(animate); + const now = performance.now(); + const dt = Math.min(0.05, (now - lastTime) / 1000); + lastTime = now; + updateCamera(dt); + if (Math.abs(zoomVelocity) > 0.0001) { + const zoomMin = Number.isFinite(controls.zoomMin) + ? controls.zoomMin + : controlDefaults.zoomMin; + const zoomMax = Number.isFinite(controls.zoomMax) ? controls.zoomMax : controlDefaults.zoomMax; + const before = getPlanePointFromNdc(zoomPointer); + camera.zoom = Math.max(zoomMin, Math.min(zoomMax, camera.zoom + zoomVelocity * dt)); + camera.updateProjectionMatrix(); + const after = getPlanePointFromNdc(zoomPointer); + if (before && after) { + camera.position.add(before.sub(after)); + lockIsometric(); + } + const damping = Number.isFinite(controls.zoomDamping) ? controls.zoomDamping : controlDefaults.zoomDamping; + zoomVelocity *= Math.pow(damping, dt * 60); + if (Math.abs(zoomVelocity) < 0.0001) zoomVelocity = 0; + } + if (now - lastPulseUpdate > 33) { + lastPulseUpdate = now; + for (const material of state.glowMaterials) { + const base = material.userData?.glowBase ?? 0; + const range = material.userData?.glowRange ?? 0.05; + const glowSpeed = material.userData?.glowSpeed ?? 1; + const glowPhase = material.userData?.glowPhase ?? 0; + const pulse = 0.5 + 0.5 * Math.sin(now * 0.002 * glowSpeed + glowPhase); + material.emissiveIntensity = base + range * pulse; + } + const flowSpeed = visuals.glowPulseSpeed || visualDefaults.glowPulseSpeed; + for (const material of state.flowMaterials) { + const base = material.userData?.glowBase ?? 0; + const range = material.userData?.glowRange ?? 0.05; + const phase = material.userData?.flowPhase ?? 0; + const dir = material.userData?.flowDir ?? 1; + const typeSpeed = material.userData?.flowSpeed ?? 1; + const offset = material.userData?.flowOffset ?? 0; + let waveSum = 0; + for (const layer of flowWaveLayers) { + const waveTime = + now * 0.002 * flowSpeed * layer.speed * typeSpeed + offset - phase * dir; + waveSum += layer.amplitude * (0.5 + 0.5 * Math.sin(waveTime)); + } + const waveValue = waveSum / flowWaveTotal; + material.emissiveIntensity = base + range * waveValue; + } + for (const material of state.wireMaterials) { + const base = material.userData?.glowBase ?? 0.3; + const range = material.userData?.glowRange ?? 0.4; + const phase = material.userData?.flowPhase ?? 0; + const wireSpeed = + material.userData?.flowSpeed ?? + visuals.wirePulseSpeed ?? + visualDefaults.wirePulseSpeed; + const wirePulse = 0.5 + 0.5 * Math.sin(now * 0.002 * wireSpeed - phase); + material.opacity = clamp(base + range * wirePulse, 0.02, 0.6); + } + for (const material of state.gridLineMaterials) { + const base = material.userData?.glowBase ?? 0.1; + const range = material.userData?.glowRange ?? 0.2; + const phase = material.userData?.flowPhase ?? 0; + const gridSpeed = + material.userData?.flowSpeed ?? + visuals.gridPulseSpeed ?? + visualDefaults.gridPulseSpeed; + const gridPulse = 0.5 + 0.5 * Math.sin(now * 0.002 * gridSpeed + phase); + material.opacity = clamp(base + range * gridPulse, 0.02, 0.6); + } + for (const light of state.flowLights) { + const base = light.userData?.base ?? 0.8; + const phase = light.userData?.flowPhase ?? 0; + const dir = light.userData?.flowDir ?? 
1; + const typeSpeed = light.userData?.flowSpeed ?? 1; + const offset = light.userData?.flowOffset ?? 0; + let waveSum = 0; + for (const layer of flowWaveLayers) { + const waveTime = + now * 0.002 * flowSpeed * layer.speed * typeSpeed + offset - phase * dir; + waveSum += layer.amplitude * (0.5 + 0.5 * Math.sin(waveTime)); + } + const waveValue = waveSum / flowWaveTotal; + light.intensity = base * (0.4 + 0.6 * waveValue); + } + } + lockIsometric(); + renderer.render(state.scene, camera); + }; + animate(); + + const onResize = () => { + const viewport = getViewport(); + const aspect = viewport.width / viewport.height; + const base = state.cameraBase; + camera.left = -base * aspect; + camera.right = base * aspect; + camera.top = base; + camera.bottom = -base; + camera.near = state.nearPlane; + camera.far = state.farPlane; + camera.updateProjectionMatrix(); + lineResolution.width = viewport.width; + lineResolution.height = viewport.height; + for (const material of state.wireMaterials) { + if (material.resolution && typeof material.resolution.set === 'function') { + material.resolution.set(lineResolution.width, lineResolution.height); + } + } + for (const material of state.gridLineMaterials) { + if (material.resolution && typeof material.resolution.set === 'function') { + material.resolution.set(lineResolution.width, lineResolution.height); + } + } + renderer.setPixelRatio(Math.min(2, window.devicePixelRatio || 1)); + renderer.setSize(viewport.width, viewport.height); + lockIsometric(); + }; + window.addEventListener('resize', onResize); + onResize(); +}; diff --git a/src/map/isometric/client/defaults.js b/src/map/isometric/client/defaults.js new file mode 100644 index 000000000..cb657cf6f --- /dev/null +++ b/src/map/isometric/client/defaults.js @@ -0,0 +1,120 @@ +export const layoutDefaults = { + style: 'flow', + groupDepth: 1, + groupSpacing: 3.2, + fileSpacing: 2, + compactness: 1, + baseSize: 3.2, + fileHeight: 1.2, + fileShape: 'category', + memberShape: 'category', + memberCell: 0.9, + memberGap: 0.2, + memberInset: 0.35, + memberHeightBase: 0.8, + memberHeightScale: 0.55, + memberHeightMax: 7, + edgePlane: -1, + routingPadding: 0.9, + routingStep: 1.3, + labelScale: 0.018, + labelOffset: 0.08 +}; + +export const scoringDefaults = { + dataflow: 0.9, + controlFlow: 0.9, + params: 0.4, + signature: 0.03, + exported: 1.4, + modifiers: 0.4, + type: 1.2, + returns: 0.8 +}; + +export const colorDefaults = { + mode: 'score', + hueStart: 0.72, + hueEnd: 0.08, + saturation: 0.75, + lightnessMin: 0.42, + lightnessMax: 0.72, + distinctSaturation: 0.66, + distinctLightness: 0.58, + distinctHueOffset: 0.08 +}; + +export const assetDefaults = { + normalMapUrl: '/assets/isomap/normal.jpg', + hdrEnvUrl: '/assets/isomap/moonless_golf_2k.hdr', + rgbeLoaderUrl: '/three/examples/jsm/loaders/RGBELoader.js' +}; + +export const visualDefaults = { + fileOpacity: 1, + memberOpacity: 1, + flowGlowBase: 0.9, + flowGlowRange: 0.75, + glowPulseSpeed: 1.4, + wireframeThickness: 0.08, + wireframeGlow: 0.18, + wirePulseSpeed: 0.18, + gridLineThickness: 0.5, + gridGlowBase: 0.2, + gridGlowRange: 0.38, + gridPulseSpeed: 0.2, + enableFlowLights: true, + curveEdges: false, + enableFog: false, + enableHeightFog: false, + fogDistance: 2.8, + fogColor: '#0f1115', + fogHeight: 4, + fogHeightRange: 14, + enableExtraLights: true, + glass: { + metalness: 0.15, + roughness: 0.03, + transmission: 1, + ior: 1.6, + reflectivity: 1, + thickness: 3.6, + envMapIntensity: 5.2, + clearcoat: 1, + clearcoatRoughness: 0.03, + normalScale: 
0.22, + clearcoatNormalScale: 0.16, + normalRepeat: 2.8 + } +}; + +export const controlDefaults = { + panSensitivity: 1.5, + zoomSensitivity: 6, + zoomDamping: 0.9, + zoomMin: 1, + zoomMax: 80, + wasd: { + sensitivity: 40000, + acceleration: 16000, + maxSpeed: 120000, + drag: 6 + } +}; + +export const flowWaveLayers = [ + { speed: 0.9, amplitude: 0.6 }, + { speed: 1.6, amplitude: 0.35 }, + { speed: 2.4, amplitude: 0.25 }, + { speed: 3.4, amplitude: 0.18 } +]; + +export const flowTypeProfiles = { + dataflow: { speed: 1.2, phase: 0.0 }, + export: { speed: 1.5, phase: 1.4 }, + call: { speed: 1.8, phase: 2.1 }, + import: { speed: 1.0, phase: 2.8 }, + usage: { speed: 0.9, phase: 3.6 }, + alias: { speed: 1.3, phase: 4.3 }, + other: { speed: 1.0, phase: 0.8 } +}; diff --git a/src/map/isometric/client/dom.js b/src/map/isometric/client/dom.js new file mode 100644 index 000000000..250c80dd0 --- /dev/null +++ b/src/map/isometric/client/dom.js @@ -0,0 +1,44 @@ +export const storageKey = 'pairofcleats.isometric.config'; + +export const mergeConfig = (base, override) => { + if (!override || typeof override !== 'object') return base; + const merged = { ...base, ...override }; + merged.layout = { ...(base.layout || {}), ...(override.layout || {}) }; + merged.controls = { ...(base.controls || {}), ...(override.controls || {}) }; + merged.colors = { ...(base.colors || {}), ...(override.colors || {}) }; + merged.scoring = { ...(base.scoring || {}), ...(override.scoring || {}) }; + merged.visuals = { ...(base.visuals || {}), ...(override.visuals || {}) }; + merged.assets = { ...(base.assets || {}), ...(override.assets || {}) }; + return merged; +}; + +export const loadDomConfig = () => { + const map = JSON.parse(document.getElementById('map-data').textContent || '{}'); + let config = JSON.parse(document.getElementById('viewer-config').textContent || '{}'); + const dom = { + app: document.getElementById('app'), + selectionBody: document.getElementById('selection-body'), + summary: document.getElementById('summary'), + menuView: document.getElementById('menu-view'), + menuEdges: document.getElementById('menu-edges'), + menuControls: document.getElementById('menu-controls'), + menuLayout: document.getElementById('menu-layout'), + menuScore: document.getElementById('menu-score'), + menuColors: document.getElementById('menu-colors'), + menuColorMode: document.getElementById('menu-color-mode'), + menuVisuals: document.getElementById('menu-visuals'), + menuEffects: document.getElementById('menu-effects'), + menuActions: document.getElementById('menu-actions') + }; + + try { + const stored = window.localStorage.getItem(storageKey); + if (stored) { + config = mergeConfig(config, JSON.parse(stored)); + } + } catch (err) { + // ignore storage failures + } + + return { map, config, dom }; +}; diff --git a/src/map/isometric/client/edges.js b/src/map/isometric/client/edges.js new file mode 100644 index 000000000..b2ffa8558 --- /dev/null +++ b/src/map/isometric/client/edges.js @@ -0,0 +1,579 @@ +import { state } from './state.js'; +import { applyHeightFog, updateFlowLights } from './materials.js'; + +const quantize = (value) => Number(value.toFixed(3)); + +export const buildEdges = () => { + const { + THREE, + edges, + allFiles, + layoutMetrics, + edgeWeights, + edgeGroup, + edgeVisibility, + flowTypeProfiles, + fileAnchors, + memberAnchors, + fileByMember, + memberColorById, + fileColorByPath, + visuals, + layoutStyle + } = state; + + const edgePlane = layoutMetrics.edgePlane; + const routingPadding = 
layoutMetrics.routingPadding; + const routingStep = layoutMetrics.routingStep; + + const resolveEdgeFile = (endpoint) => { + if (!endpoint) return null; + if (endpoint.file) return endpoint.file; + if (endpoint.member) return fileByMember.get(endpoint.member) || null; + return null; + }; + + const resolveEdgeColor = (endpoint) => { + if (!endpoint) return null; + if (endpoint.member && memberColorById.has(endpoint.member)) { + return memberColorById.get(endpoint.member); + } + if (endpoint.file && fileColorByPath.has(endpoint.file)) { + return fileColorByPath.get(endpoint.file); + } + const fileKey = resolveEdgeFile(endpoint); + if (fileKey && fileColorByPath.has(fileKey)) { + return fileColorByPath.get(fileKey); + } + return null; + }; + + const obstacles = []; + let minX = Infinity; + let maxX = -Infinity; + let minZ = Infinity; + let maxZ = -Infinity; + for (const fileLayout of allFiles) { + const fileId = fileLayout.node.path || fileLayout.node.name || null; + if (!fileId) continue; + const bounds = { + file: fileId, + minX: fileLayout.x - fileLayout.width / 2 - routingPadding, + maxX: fileLayout.x + fileLayout.width / 2 + routingPadding, + minZ: fileLayout.z - fileLayout.depth / 2 - routingPadding, + maxZ: fileLayout.z + fileLayout.depth / 2 + routingPadding + }; + obstacles.push(bounds); + minX = Math.min(minX, bounds.minX); + maxX = Math.max(maxX, bounds.maxX); + minZ = Math.min(minZ, bounds.minZ); + maxZ = Math.max(maxZ, bounds.maxZ); + } + + const resolveAnchor = (endpoint) => { + if (!endpoint) return null; + if (endpoint.member && memberAnchors.has(endpoint.member)) return memberAnchors.get(endpoint.member); + if (endpoint.file && fileAnchors.has(endpoint.file)) return fileAnchors.get(endpoint.file); + return null; + }; + + const segmentHitsObstacle = (x1, z1, x2, z2, ignoreFiles) => { + const dx = x2 - x1; + const dz = z2 - z1; + for (const obstacle of obstacles) { + if (ignoreFiles && ignoreFiles.has(obstacle.file)) continue; + const minX = obstacle.minX; + const maxX = obstacle.maxX; + const minZ = obstacle.minZ; + const maxZ = obstacle.maxZ; + const insideStart = x1 >= minX && x1 <= maxX && z1 >= minZ && z1 <= maxZ; + const insideEnd = x2 >= minX && x2 <= maxX && z2 >= minZ && z2 <= maxZ; + if (insideStart || insideEnd) return true; + let t0 = 0; + let t1 = 1; + const clip = (p, q) => { + if (p === 0) return q >= 0; + const r = q / p; + if (p < 0) { + if (r > t1) return false; + if (r > t0) t0 = r; + } else { + if (r < t0) return false; + if (r < t1) t1 = r; + } + return true; + }; + if ( + clip(-dx, x1 - minX) + && clip(dx, maxX - x1) + && clip(-dz, z1 - minZ) + && clip(dz, maxZ - z1) + ) { + return true; + } + } + return false; + }; + + const buildLaneValues = (min, max, step) => { + const values = []; + if (!step || step <= 0) return values; + const start = Math.floor(min / step) * step; + const end = Math.ceil(max / step) * step; + for (let value = start; value <= end; value += step) { + values.push(Number(value.toFixed(3))); + } + return values; + }; + + const sqrt3 = Math.sqrt(3); + const toAxial = (point, size) => { + const q = (sqrt3 / 3 * point.x - 1 / 3 * point.z) / size; + const r = (2 / 3 * point.z) / size; + return { q, r }; + }; + const axialToPoint = (axial, size) => ({ + x: size * sqrt3 * (axial.q + axial.r / 2), + z: size * 1.5 * axial.r + }); + const cubeRound = (cube) => { + let rx = Math.round(cube.x); + let ry = Math.round(cube.y); + let rz = Math.round(cube.z); + const dx = Math.abs(rx - cube.x); + const dy = Math.abs(ry - cube.y); + const dz = 
Math.abs(rz - cube.z); + if (dx > dy && dx > dz) { + rx = -ry - rz; + } else if (dy > dz) { + ry = -rx - rz; + } else { + rz = -rx - ry; + } + return { x: rx, y: ry, z: rz }; + }; + const axialToCube = (axial) => ({ x: axial.q, z: axial.r, y: -axial.q - axial.r }); + const cubeToAxial = (cube) => ({ q: cube.x, r: cube.z }); + const cubeLerp = (a, b, t) => ({ + x: a.x + (b.x - a.x) * t, + y: a.y + (b.y - a.y) * t, + z: a.z + (b.z - a.z) * t + }); + const cubeDistance = (a, b) => Math.max(Math.abs(a.x - b.x), Math.abs(a.y - b.y), Math.abs(a.z - b.z)); + const buildHexPath = (start, end, size) => { + if (!size || size <= 0) return [start, end]; + const a = axialToCube(toAxial(start, size)); + const b = axialToCube(toAxial(end, size)); + const steps = Math.max(1, cubeDistance(a, b)); + const points = []; + for (let i = 0; i <= steps; i += 1) { + const t = steps === 0 ? 0 : i / steps; + const cube = cubeRound(cubeLerp(a, b, t)); + points.push(axialToPoint(cubeToAxial(cube), size)); + } + return points; + }; + + const useHexRouting = layoutStyle === 'hex'; + const hexSize = Math.max(routingStep, (layoutMetrics.baseSize || 1) * 0.6); + + const findRoute = (start, end, ignoreFiles) => { + let bestPoints = null; + let bestDistance = Infinity; + const tryPath = (points) => { + for (let i = 0; i < points.length - 1; i += 1) { + const a = points[i]; + const b = points[i + 1]; + if (segmentHitsObstacle(a.x, a.z, b.x, b.z, ignoreFiles)) return false; + } + let distance = 0; + for (let i = 0; i < points.length - 1; i += 1) { + const a = points[i]; + const b = points[i + 1]; + distance += Math.abs(a.x - b.x) + Math.abs(a.z - b.z); + } + if (distance < bestDistance) { + bestDistance = distance; + bestPoints = points; + } + return true; + }; + + if (useHexRouting) { + const hexPoints = buildHexPath(start, end, hexSize); + let hits = false; + for (let i = 0; i < hexPoints.length - 1; i += 1) { + if (segmentHitsObstacle(hexPoints[i].x, hexPoints[i].z, hexPoints[i + 1].x, hexPoints[i + 1].z, ignoreFiles)) { + hits = true; + break; + } + } + if (!hits) return hexPoints; + } + + const directA = [start, { x: end.x, z: start.z }, end]; + const directB = [start, { x: start.x, z: end.z }, end]; + const directAOk = tryPath(directA); + const directBOk = tryPath(directB); + if (directAOk || directBOk) { + return bestPoints || directA; + } + + const laneZ = buildLaneValues(minZ - routingPadding, maxZ + routingPadding, routingStep); + for (const z of laneZ) { + tryPath([start, { x: start.x, z }, { x: end.x, z }, end]); + } + const laneX = buildLaneValues(minX - routingPadding, maxX + routingPadding, routingStep); + for (const x of laneX) { + tryPath([start, { x, z: start.z }, { x, z: end.z }, end]); + } + + return bestPoints || directA; + }; + + const flowSegmentsByType = new Map(); + const flowLightCandidates = []; + const edgeStyles = state.map.legend?.edgeStyles || {}; + const edgeTypeAliases = state.map.legend?.edgeTypes || {}; + const resolveEdgeType = (type) => (edgeStyles[type] ? 
type : (edgeTypeAliases[type] || type));
+  const resolveEdgeStyle = (type) => edgeStyles[resolveEdgeType(type)] || edgeStyles[type] || {};
+  const addEndpoint = (entry, endpoint) => {
+    if (!endpoint) return;
+    if (endpoint.member) {
+      entry.endpoints.add(`member:${endpoint.member}`);
+      const memberFile = fileByMember.get(endpoint.member);
+      if (memberFile) entry.endpoints.add(`file:${memberFile}`);
+    }
+    if (endpoint.file) {
+      entry.endpoints.add(`file:${endpoint.file}`);
+    }
+  };
+
+  // Accumulate overlapping polyline pieces into one record per unique segment;
+  // endpoints are ordered canonically so A->B and B->A merge into the same key.
+  const addFlowSegment = (type, x1, y1, z1, x2, y2, z2, weight, color, dir, edge) => {
+    if (Math.abs(x1 - x2) < 0.0001 && Math.abs(y1 - y2) < 0.0001 && Math.abs(z1 - z2) < 0.0001) return;
+    const nx1 = quantize(x1);
+    const ny1 = quantize(y1);
+    const nz1 = quantize(z1);
+    const nx2 = quantize(x2);
+    const ny2 = quantize(y2);
+    const nz2 = quantize(z2);
+    const swap = nx1 > nx2 || (nx1 === nx2 && (ny1 > ny2 || (ny1 === ny2 && nz1 > nz2)));
+    const ax1 = swap ? nx2 : nx1;
+    const ay1 = swap ? ny2 : ny1;
+    const az1 = swap ? nz2 : nz1;
+    const ax2 = swap ? nx1 : nx2;
+    const ay2 = swap ? ny1 : ny2;
+    const az2 = swap ? nz1 : nz2;
+    const key = `${ax1},${ay1},${az1}->${ax2},${ay2},${az2}`;
+    const bucket = flowSegmentsByType.get(type) || new Map();
+    const entry = bucket.get(key) || {
+      x1: ax1,
+      y1: ay1,
+      z1: az1,
+      x2: ax2,
+      y2: ay2,
+      z2: az2,
+      weight: 0,
+      dirSum: 0,
+      rSum: 0,
+      gSum: 0,
+      bSum: 0,
+      colorWeight: 0,
+      endpoints: new Set()
+    };
+    const direction = Number.isFinite(dir) && dir !== 0 ? dir : 1;
+    const normalizedDir = swap ? -direction : direction;
+    entry.weight += weight;
+    entry.dirSum += normalizedDir * weight;
+    if (edge) {
+      addEndpoint(entry, edge.from);
+      addEndpoint(entry, edge.to);
+    }
+    if (color) {
+      entry.rSum += color.r * weight;
+      entry.gSum += color.g * weight;
+      entry.bSum += color.b * weight;
+      entry.colorWeight += weight;
+    }
+    bucket.set(key, entry);
+    flowSegmentsByType.set(type, bucket);
+  };
+
+  const edgeHighlight = new THREE.Color('#ffffff');
+  const endpointDots = new Map();
+  const planeY = edgePlane + Math.max(0.08, (layoutMetrics.memberGap || 0) * 0.3);
+  const curveEdges = visuals.curveEdges === true;
+  const addEndpointDot = (key, anchor, color) => {
+    if (!key || !anchor) return;
+    const entry = endpointDots.get(key) || {
+      x: anchor.x,
+      y: anchor.y,
+      z: anchor.z,
+      color: new THREE.Color(0, 0, 0),
+      weight: 0
+    };
+    if (color) {
+      entry.color.add(color);
+      entry.weight += 1;
+    }
+    endpointDots.set(key, entry);
+  };
+  const addPathPoints = (points, startAnchor, endAnchor, routePoints) => {
+    const startPlane = { x: startAnchor.x, y: planeY, z: startAnchor.z };
+    const endPlane = { x: endAnchor.x, y: planeY, z: endAnchor.z };
+    if (curveEdges) {
+      const startLift = Math.max(0.4, Math.abs(startAnchor.y - planeY) * 0.5);
+      points.push(startAnchor);
+      points.push({ x: startAnchor.x, y: Math.max(startAnchor.y, planeY) + startLift, z: startAnchor.z });
+      points.push(startPlane);
+    } else {
+      points.push(startAnchor);
+      points.push(startPlane);
+    }
+    routePoints.forEach((point, index) => {
+      if (index === 0 || index === routePoints.length - 1) return;
+      points.push({ x: point.x, y: planeY, z: point.z });
+    });
+    if (curveEdges) {
+      const endLift = Math.max(0.4, Math.abs(endAnchor.y - planeY) * 0.5);
+      points.push(endPlane);
+      points.push({ x: endAnchor.x, y: Math.max(endAnchor.y, planeY) + endLift, z: endAnchor.z });
+      points.push(endAnchor);
+    } else {
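+      // Without curves, edges simply drop straight from the anchor to the
+      // shared routing plane and travel flat until the far endpoint.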
+      points.push(endPlane);
+      points.push(endAnchor);
+    }
+  };
+
+  for (const edge of edges) {
+    const startAnchor = resolveAnchor(edge.from);
+    const endAnchor = resolveAnchor(edge.to);
+    if (!startAnchor || !endAnchor) continue;
+    const fromFile = resolveEdgeFile(edge.from);
+    const toFile = resolveEdgeFile(edge.to);
+    const ignoreFiles = new Set([fromFile, toFile].filter(Boolean));
+    const start = { x: startAnchor.x, z: startAnchor.z };
+    const end = { x: endAnchor.x, z: endAnchor.z };
+    const routePoints = findRoute(start, end, ignoreFiles);
+    const rawType = edge.type || 'other';
+    const type = resolveEdgeType(rawType);
+    const style = resolveEdgeStyle(type);
+    const weight = edgeWeights[type] || edgeWeights[rawType] || 1;
+    const fromColor = resolveEdgeColor(edge.from);
+    const toColor = resolveEdgeColor(edge.to);
+    let edgeColor = null;
+    if (fromColor && toColor) {
+      edgeColor = fromColor.clone().lerp(toColor, 0.5);
+    } else {
+      edgeColor = fromColor || toColor || new THREE.Color(style.color || '#9aa0a6');
+    }
+    const pathPoints = [];
+    addPathPoints(pathPoints, startAnchor, endAnchor, routePoints);
+    const path = curveEdges
+      ? new THREE.CatmullRomCurve3(pathPoints.map((p) => new THREE.Vector3(p.x, p.y, p.z)), false, 'centripetal', 0.4)
+      : null;
+    const resolvedPoints = path
+      ? path.getPoints(Math.min(40, Math.max(12, pathPoints.length * 3)))
+      : pathPoints.map((p) => new THREE.Vector3(p.x, p.y, p.z));
+    for (let i = 0; i < resolvedPoints.length - 1; i += 1) {
+      const a = resolvedPoints[i];
+      const b = resolvedPoints[i + 1];
+      const dx = b.x - a.x;
+      const dy = b.y - a.y;
+      const dz = b.z - a.z;
+      const dominant = Math.max(Math.abs(dx), Math.abs(dy), Math.abs(dz));
+      const dir = dominant === Math.abs(dx)
+        ? Math.sign(dx)
+        : (dominant === Math.abs(dy) ? Math.sign(dy) : Math.sign(dz));
+      addFlowSegment(type, a.x, a.y, a.z, b.x, b.y, b.z, weight, edgeColor, dir, edge);
+    }
+    if (edgeColor) {
+      if (edge.from?.member) addEndpointDot(`member:${edge.from.member}`, startAnchor, edgeColor);
+      if (edge.from?.file) addEndpointDot(`file:${edge.from.file}`, startAnchor, edgeColor);
+      if (edge.to?.member) addEndpointDot(`member:${edge.to.member}`, endAnchor, edgeColor);
+      if (edge.to?.file) addEndpointDot(`file:${edge.to.file}`, endAnchor, edgeColor);
+    }
+  }
+
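+  // Each edge type becomes one InstancedMesh: a shared unit box, stretched per
+  // segment along its direction and tinted through per-instance colors, keeps
+  // draw calls low even with thousands of merged segments.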
+  const localEdgeTypeGroups = new Map();
+  for (const [type, segments] of flowSegmentsByType.entries()) {
+    if (!segments.size) continue;
+    const group = new THREE.Group();
+    edgeGroup.add(group);
+    localEdgeTypeGroups.set(type, group);
+    if (edgeVisibility.has(type)) {
+      group.visible = edgeVisibility.get(type);
+    }
+    const style = resolveEdgeStyle(type);
+    const typeProfile = flowTypeProfiles[type] || flowTypeProfiles.other;
+    const fallbackColor = new THREE.Color(style.color || '#9aa0a6');
+    const entries = Array.from(segments.values());
+    if (!entries.length) continue;
+    const geometry = state.edgeUnitBoxGeometry || (state.edgeUnitBoxGeometry = (() => {
+      const unit = new THREE.BoxGeometry(1, 1, 1);
+      unit.userData = { ...(unit.userData || {}), shared: true };
+      return unit;
+    })());
+    const material = new THREE.MeshStandardMaterial({
+      color: 0xffffff,
+      roughness: 0.2,
+      metalness: 0.8,
+      envMapIntensity: visuals.glass.envMapIntensity,
+      transparent: true,
+      opacity: 0.85,
+      depthWrite: false,
+      depthTest: true,
+      vertexColors: true
+    });
+    if ('toneMapped' in material) material.toneMapped = false;
+    material.emissive = new THREE.Color(0xffffff);
+    material.emissiveIntensity = visuals.flowGlowBase;
+    material.userData = {
+      glowBase: visuals.flowGlowBase,
+      glowRange: visuals.flowGlowRange,
+      baseEmissiveIntensity: visuals.flowGlowBase,
+      baseOpacity: 0.85
+    };
+    const prevCompile = material.onBeforeCompile;
+    material.onBeforeCompile = (shader) => {
+      if (typeof prevCompile === 'function') prevCompile(shader);
+      // Multiply the emissive term by the per-instance color so glow matches tint.
+      if (shader.fragmentShader.includes('vColor')) {
+        shader.fragmentShader = shader.fragmentShader.replace(
+          '#include <emissivemap_fragment>',
+          '#include <emissivemap_fragment>\n totalEmissiveRadiance *= vColor;'
+        );
+      }
+    };
+    applyHeightFog(material);
+    state.flowMaterials.push(material);
+
+    const mesh = new THREE.InstancedMesh(geometry, material, entries.length);
+    mesh.renderOrder = 7;
+    const dummy = new THREE.Object3D();
+    const axis = new THREE.Vector3(1, 0, 0);
+    const direction = new THREE.Vector3();
+    const baseColors = new Array(entries.length);
+    const highlightColors = new Array(entries.length);
+    entries.forEach((entry, index) => {
+      const dx = entry.x2 - entry.x1;
+      const dy = entry.y2 - entry.y1;
+      const dz = entry.z2 - entry.z1;
+      const length = Math.sqrt(dx * dx + dy * dy + dz * dz);
+      if (!length) return;
+      const thickness = 0.08 + Math.log1p(entry.weight) * 0.04;
+      const colorWeight = entry.colorWeight || 0;
+      const averaged = colorWeight
+        ? new THREE.Color(entry.rSum / colorWeight, entry.gSum / colorWeight, entry.bSum / colorWeight)
+        : fallbackColor.clone();
+      const edgeBase = style.color ? new THREE.Color(style.color) : averaged;
+      const brightColor = edgeBase.clone().lerp(edgeHighlight, 0.65);
+      const highlightColor = brightColor.clone().lerp(edgeHighlight, 0.35);
+      const flowDirection = entry.dirSum >= 0 ?
1 : -1; + dummy.position.set((entry.x1 + entry.x2) / 2, (entry.y1 + entry.y2) / 2, (entry.z1 + entry.z2) / 2); + direction.set(dx, dy, dz).normalize(); + dummy.quaternion.setFromUnitVectors(axis, direction); + dummy.scale.set(length, thickness, thickness); + dummy.updateMatrix(); + mesh.setMatrixAt(index, dummy.matrix); + mesh.setColorAt(index, brightColor); + baseColors[index] = brightColor; + highlightColors[index] = highlightColor; + state.edgeSegments.push({ + mesh, + index, + endpoints: entry.endpoints, + edgeColor: brightColor, + highlightColor + }); + flowLightCandidates.push({ + x: dummy.position.x, + y: dummy.position.y, + z: dummy.position.z, + color: brightColor, + weight: entry.weight, + phase: (entry.x1 + entry.x2 + entry.z1 + entry.z2) * 0.18, + speed: typeProfile.speed || 1, + offset: typeProfile.phase || 0, + dir: flowDirection + }); + }); + mesh.instanceMatrix.needsUpdate = true; + if (mesh.instanceColor) mesh.instanceColor.needsUpdate = true; + mesh.userData = { + instanceBaseColors: baseColors, + instanceHighlightColors: highlightColors + }; + group.add(mesh); + state.edgeMeshes.push(mesh); + } + + if (flowLightCandidates.length) { + flowLightCandidates.sort((a, b) => b.weight - a.weight); + const maxLights = Math.min(32, flowLightCandidates.length); + for (let i = 0; i < maxLights; i += 1) { + const entry = flowLightCandidates[i]; + const light = new THREE.PointLight(entry.color, 2.2, 40, 2); + light.position.set(entry.x, (entry.y ?? edgePlane) + 0.6, entry.z); + light.userData = { + flowPhase: entry.phase, + base: 1.6, + flowSpeed: entry.speed || 1, + flowOffset: entry.offset || 0, + flowDir: entry.dir || 1 + }; + state.flowLights.push(light); + state.scene.add(light); + } + } + + if (endpointDots.size) { + const dotGeometry = state.edgeDotGeometry || (state.edgeDotGeometry = (() => { + const geom = new THREE.SphereGeometry(0.08, 10, 10); + geom.userData = { ...(geom.userData || {}), shared: true }; + return geom; + })()); + const dotMaterial = new THREE.MeshStandardMaterial({ + color: 0xffffff, + emissive: new THREE.Color(0xffffff), + emissiveIntensity: visuals.flowGlowBase, + metalness: 0.7, + roughness: 0.25, + envMapIntensity: visuals.glass.envMapIntensity, + transparent: true, + opacity: 0.95, + depthWrite: false, + depthTest: true, + vertexColors: true + }); + dotMaterial.userData = { + glowBase: visuals.flowGlowBase, + glowRange: visuals.flowGlowRange, + glowSpeed: 1.1, + glowPhase: 0.4 + }; + applyHeightFog(dotMaterial); + state.edgeDotMaterial = dotMaterial; + state.glowMaterials.push(dotMaterial); + const dotMesh = new THREE.InstancedMesh(dotGeometry, dotMaterial, endpointDots.size); + const dummy = new THREE.Object3D(); + let index = 0; + endpointDots.forEach((entry) => { + const color = entry.weight ? 
entry.color.multiplyScalar(1 / entry.weight) : new THREE.Color(0xffffff); + dummy.position.set(entry.x, entry.y, entry.z); + dummy.updateMatrix(); + dotMesh.setMatrixAt(index, dummy.matrix); + dotMesh.setColorAt(index, color); + index += 1; + }); + dotMesh.instanceMatrix.needsUpdate = true; + if (dotMesh.instanceColor) dotMesh.instanceColor.needsUpdate = true; + dotMesh.renderOrder = 8; + edgeGroup.add(dotMesh); + state.edgeDotMesh = dotMesh; + } + updateFlowLights(); + + state.edgeTypeGroups = localEdgeTypeGroups; + state.edgeTypes = Array.from(flowSegmentsByType.keys()).sort((a, b) => a.localeCompare(b)); +}; diff --git a/src/map/isometric/client/layout-utils.js b/src/map/isometric/client/layout-utils.js new file mode 100644 index 000000000..3e2b5be66 --- /dev/null +++ b/src/map/isometric/client/layout-utils.js @@ -0,0 +1,371 @@ +import { clamp, hashString } from './utils.js'; + +const shapeForCategory = { + source: 'hexagon', + test: 'pentagon-pyramid', + config: 'octagon', + docs: 'heptagon', + generated: 'square', + dir: 'pentagon', + other: 'square' +}; + +const shapeForMemberType = { + class: 'pyramid', + function: 'hexagon-pyramid', + symbol: 'square' +}; + +const knownShapes = new Set([ + 'square', + 'circle', + 'pyramid', + 'pentagon', + 'hexagon', + 'heptagon', + 'octagon', + 'pentagon-pyramid', + 'hexagon-pyramid', + 'heptagon-pyramid', + 'octagon-pyramid', + 'pentagon-frustum', + 'hexagon-frustum', + 'heptagon-frustum', + 'octagon-frustum' +]); + +export const resolveShape = (mode, { key, category, type } = {}) => { + const normalized = String(mode || 'square').toLowerCase(); + if (normalized === 'category') { + if (category && shapeForCategory[category]) return shapeForCategory[category]; + if (type && shapeForMemberType[type]) return shapeForMemberType[type]; + return 'square'; + } + if (normalized === 'mix') { + const mixSeed = hashString(key || category || type || ''); + if (mixSeed < 0.2) return 'square'; + if (mixSeed < 0.4) return 'circle'; + if (mixSeed < 0.6) return 'pyramid'; + if (mixSeed < 0.75) return 'hexagon'; + if (mixSeed < 0.9) return 'pentagon'; + return 'octagon'; + } + if (knownShapes.has(normalized)) { + return normalized; + } + return 'square'; +}; + +export const sizeFactor = (value, base, scale, min, max) => { + const normalized = base + Math.log1p(Math.max(0, value)) * scale; + return clamp(normalized, min, max); +}; + +export const memberSizeFromRange = (range) => { + if (!range || !Number.isFinite(range.startLine)) return 1; + const start = range.startLine; + const end = Number.isFinite(range.endLine) ? 
range.endLine : start; + return Math.max(1, end - start + 1); +}; + +const splitPath = (value) => String(value || '').split('/').filter(Boolean); + +export const groupKeyForPath = (filePath, groupDepth) => { + const segments = splitPath(filePath); + if (!segments.length || groupDepth === 0) return '(root)'; + return segments.slice(0, groupDepth).join('/'); +}; + +export const scoreMember = (member, scoring) => { + let score = 0; + const dataflow = member?.dataflow || {}; + const flowLists = [dataflow.reads, dataflow.writes, dataflow.mutations, dataflow.aliases]; + for (const list of flowLists) { + if (Array.isArray(list)) score += list.length * scoring.dataflow; + } + const control = member?.controlFlow || {}; + for (const value of Object.values(control)) { + if (Array.isArray(value)) score += value.length * scoring.controlFlow; + else if (typeof value === 'number') score += value * scoring.controlFlow; + else if (value) score += 1 * scoring.controlFlow; + } + if (Array.isArray(member?.params)) score += member.params.length * scoring.params; + if (member?.signature) score += Math.min(10, String(member.signature).length / 20) * scoring.signature; + if (member?.returns) score += 1 * scoring.returns; + if (member?.exported) score += 1 * scoring.exported; + if (member?.modifiers && typeof member.modifiers === 'object') { + score += Object.keys(member.modifiers).length * scoring.modifiers; + } + const kind = String(member?.kind || member?.type || '').toLowerCase(); + if (kind.includes('class') || kind.includes('interface') || kind.includes('struct')) score += scoring.type; + return score; +}; + +export const scoreToColor = (score, maxScore, colors, THREE, key) => { + const mode = String(colors.mode || 'score').toLowerCase(); + const color = new THREE.Color(); + if (mode === 'distinct') { + const seed = hashString(key || score || ''); + const normalized = seed / 0xffffffff; + const hue = (normalized + (colors.distinctHueOffset || 0)) % 1; + const saturation = colors.distinctSaturation ?? colors.saturation ?? 0.7; + const lightness = colors.distinctLightness ?? colors.lightnessMax ?? 0.6; + color.setHSL(hue, saturation, lightness); + return color; + } + const ratio = maxScore > 0 + ? Math.log10(score + 1) / Math.log10(maxScore + 1) + : 0; + const hue = colors.hueStart + (colors.hueEnd - colors.hueStart) * ratio; + const lightness = colors.lightnessMin + (colors.lightnessMax - colors.lightnessMin) * ratio; + color.setHSL(hue, colors.saturation, lightness); + return color; +}; + +export const computeGrid = (count) => { + if (!count) return { columns: 0, rows: 0 }; + const columns = Math.ceil(Math.sqrt(count)); + const rows = Math.ceil(count / columns); + return { columns, rows }; +}; + +export const buildSlots = (width, depth, columns, rows, cellSize, gap, memberInset, memberCell, memberGap) => { + if (!columns || !rows) return []; + const slots = []; + const resolvedCell = cellSize || memberCell; + const resolvedGap = Number.isFinite(gap) ? 
gap : memberGap; + const startX = -width / 2 + memberInset + resolvedCell / 2; + const startZ = -depth / 2 + memberInset + resolvedCell / 2; + for (let row = 0; row < rows; row += 1) { + for (let col = 0; col < columns; col += 1) { + const x = startX + col * (resolvedCell + resolvedGap); + const z = startZ + row * (resolvedCell + resolvedGap); + slots.push({ x, z, sort: x + z }); + } + } + return slots.sort((a, b) => (a.sort - b.sort) || (a.x - b.x) || (a.z - b.z)); +}; + +export const orderByAdjacency = (items, getKey, adjacency) => { + if (!items.length) return []; + if (items.length === 1) return items.slice(); + const keys = items.map(getKey); + const totalWeight = new Map(); + keys.forEach((key) => { + const neighbors = adjacency.get(key) || new Map(); + let total = 0; + for (const value of neighbors.values()) total += value; + totalWeight.set(key, total); + }); + const remaining = new Set(keys); + const orderedKeys = []; + let current = keys.slice().sort((a, b) => { + const diff = (totalWeight.get(b) || 0) - (totalWeight.get(a) || 0); + return diff || a.localeCompare(b); + })[0]; + orderedKeys.push(current); + remaining.delete(current); + while (remaining.size) { + let best = null; + let bestScore = -1; + for (const key of remaining) { + const neighbors = adjacency.get(key) || new Map(); + let score = 0; + for (const placed of orderedKeys) { + score += neighbors.get(placed) || 0; + } + score += (totalWeight.get(key) || 0) * 0.1; + if (score > bestScore) { + bestScore = score; + best = key; + } else if (score === bestScore && best && key.localeCompare(best) < 0) { + best = key; + } + } + orderedKeys.push(best); + remaining.delete(best); + } + const itemByKey = new Map(items.map((item) => [getKey(item), item])); + return orderedKeys.map((key) => itemByKey.get(key)).filter(Boolean); +}; + +export const layoutGridItems = (items, columns, spacing) => { + const count = items.length; + if (!count) return { width: 0, depth: 0, columns: 0, rows: 0 }; + const cols = Math.max(1, columns || 1); + const rows = Math.max(1, Math.ceil(count / cols)); + const colWidths = Array.from({ length: cols }, () => 0); + const rowDepths = Array.from({ length: rows }, () => 0); + items.forEach((item, index) => { + const col = index % cols; + const row = Math.floor(index / cols); + colWidths[col] = Math.max(colWidths[col], item.width || 0); + rowDepths[row] = Math.max(rowDepths[row], item.depth || 0); + }); + const colOffsets = []; + const rowOffsets = []; + let offsetX = 0; + for (let col = 0; col < cols; col += 1) { + colOffsets[col] = offsetX; + offsetX += colWidths[col] + spacing; + } + let offsetZ = 0; + for (let row = 0; row < rows; row += 1) { + rowOffsets[row] = offsetZ; + offsetZ += rowDepths[row] + spacing; + } + items.forEach((item, index) => { + const col = index % cols; + const row = Math.floor(index / cols); + const xPad = (colWidths[col] - item.width) / 2; + const zPad = (rowDepths[row] - item.depth) / 2; + item.x = colOffsets[col] + xPad; + item.z = rowOffsets[row] + zPad; + }); + const totalWidth = colWidths.reduce((acc, value) => acc + value, 0) + spacing * (cols - 1); + const totalDepth = rowDepths.reduce((acc, value) => acc + value, 0) + spacing * (rows - 1); + return { width: totalWidth, depth: totalDepth, columns: cols, rows }; +}; + +export const layoutRadialItems = (items, spacing) => { + const count = items.length; + if (!count) return { width: 0, depth: 0 }; + if (count === 1) { + items[0].x = 0; + items[0].z = 0; + return { width: items[0].width || 0, depth: items[0].depth || 0 }; + } 
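+  // Ring radius: treat the summed item diameters (plus spacing) as the needed
+  // circumference, and never shrink below 1.5x the largest item radius.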
+  const radii = items.map((item) => Math.max(item.width || 0, item.depth || 0) / 2);
+  const maxRadius = radii.reduce((acc, value) => Math.max(acc, value), 0);
+  const circumference = radii.reduce((acc, value) => acc + (value * 2 + spacing), 0);
+  const baseRadius = Math.max(maxRadius * 1.5, circumference / (2 * Math.PI));
+  let angle = 0;
+  items.forEach((item, index) => {
+    const arc = (radii[index] * 2 + spacing) / baseRadius;
+    angle += arc / 2;
+    item.x = Math.cos(angle) * baseRadius;
+    item.z = Math.sin(angle) * baseRadius;
+    angle += arc / 2;
+  });
+  const extent = baseRadius + maxRadius;
+  return { width: extent * 2, depth: extent * 2 };
+};
+
+export const layoutHexItems = (items, spacing) => {
+  const count = items.length;
+  if (!count) return { width: 0, depth: 0, columns: 0, rows: 0 };
+  const cols = Math.max(1, Math.ceil(Math.sqrt(count)));
+  const rows = Math.max(1, Math.ceil(count / cols));
+  const maxWidth = items.reduce((acc, item) => Math.max(acc, item.width || 0), 0);
+  const maxDepth = items.reduce((acc, item) => Math.max(acc, item.depth || 0), 0);
+  const cellWidth = maxWidth + spacing;
+  const cellDepth = maxDepth + spacing;
+  const rowStep = cellDepth * 0.86;
+  items.forEach((item, index) => {
+    const row = Math.floor(index / cols);
+    const col = index % cols;
+    const offset = (row % 2) * cellWidth * 0.5;
+    item.x = col * cellWidth + offset;
+    item.z = row * rowStep;
+  });
+  const totalWidth = cellWidth * cols + cellWidth * 0.5;
+  const totalDepth = rowStep * Math.max(1, rows - 1) + maxDepth;
+  return { width: totalWidth, depth: totalDepth, columns: cols, rows };
+};
+
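+// layoutFlowItems seeds positions on a grid, then runs a small force pass:
+// overlapping footprints repel, adjacency-linked items attract toward a
+// target distance, and damped, clamped velocities keep the motion stable.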
+export const layoutFlowItems = (items, spacing, adjacency, getKey) => {
+  const count = items.length;
+  if (!count) return { width: 0, depth: 0 };
+  const columns = Math.max(1, Math.ceil(Math.sqrt(count)));
+  layoutGridItems(items, columns, spacing);
+
+  const indexByKey = new Map(items.map((item, index) => [getKey(item), index]));
+  const neighbors = items.map(() => []);
+  items.forEach((item, index) => {
+    const key = getKey(item);
+    const adjacent = adjacency.get(key) || new Map();
+    for (const [targetKey, weight] of adjacent.entries()) {
+      const targetIndex = indexByKey.get(targetKey);
+      if (targetIndex === undefined) continue;
+      neighbors[index].push({ index: targetIndex, weight: weight || 1 });
+    }
+  });
+
+  const positions = items.map((item) => ({ x: item.x || 0, z: item.z || 0 }));
+  const velocities = items.map(() => ({ x: 0, z: 0 }));
+  const iterations = Math.min(80, 20 + count);
+  const repulse = 0.35;
+  const attract = 0.04;
+  const damping = 0.75;
+  const minSpacing = Math.max(0.6, spacing * 0.8);
+  const maxVelocity = Math.max(minSpacing, spacing * 1.2);
+
+  for (let iter = 0; iter < iterations; iter += 1) {
+    for (let i = 0; i < count; i += 1) {
+      let fx = 0;
+      let fz = 0;
+      const a = items[i];
+      const posA = positions[i];
+      for (let j = i + 1; j < count; j += 1) {
+        const b = items[j];
+        const posB = positions[j];
+        const dx = posB.x - posA.x;
+        const dz = posB.z - posA.z;
+        const dist = Math.sqrt(dx * dx + dz * dz) || 0.0001;
+        const target = (a.width + b.width) * 0.5 + minSpacing;
+        const overlap = target - dist;
+        if (overlap > 0) {
+          const push = overlap * repulse;
+          const rx = (dx / dist) * push;
+          const rz = (dz / dist) * push;
+          fx -= rx;
+          fz -= rz;
+          velocities[j].x += rx;
+          velocities[j].z += rz;
+        }
+      }
+      for (const neighbor of neighbors[i]) {
+        const b = items[neighbor.index];
+        const posB = positions[neighbor.index];
+        const dx = posB.x - posA.x;
+        const dz = posB.z - posA.z;
+        const dist = Math.sqrt(dx * dx + dz * dz) || 0.0001;
+        const target = (a.width + b.width) * 0.4 + spacing * 0.6;
+        // Keep attraction pulling toward the target distance to avoid runaway layouts.
+        const pull = (target - dist) * attract * Math.min(3, neighbor.weight || 1);
+        fx += (dx / dist) * pull;
+        fz += (dz / dist) * pull;
+      }
+      velocities[i].x = (velocities[i].x + fx) * damping;
+      velocities[i].z = (velocities[i].z + fz) * damping;
+      const speed = Math.hypot(velocities[i].x, velocities[i].z);
+      if (speed > maxVelocity) {
+        // Clamp velocity to avoid unstable layouts that can explode the bounds.
+        const scale = maxVelocity / speed;
+        velocities[i].x *= scale;
+        velocities[i].z *= scale;
+      }
+    }
+    for (let i = 0; i < count; i += 1) {
+      positions[i].x += velocities[i].x;
+      positions[i].z += velocities[i].z;
+    }
+  }
+
+  items.forEach((item, index) => {
+    item.x = positions[index].x;
+    item.z = positions[index].z;
+  });
+
+  let minX = Infinity;
+  let maxX = -Infinity;
+  let minZ = Infinity;
+  let maxZ = -Infinity;
+  items.forEach((item) => {
+    minX = Math.min(minX, item.x - item.width / 2);
+    maxX = Math.max(maxX, item.x + item.width / 2);
+    minZ = Math.min(minZ, item.z - item.depth / 2);
+    maxZ = Math.max(maxZ, item.z + item.depth / 2);
+  });
+  return { width: Math.max(0, maxX - minX), depth: Math.max(0, maxZ - minZ) };
+};
diff --git a/src/map/isometric/client/layout.js b/src/map/isometric/client/layout.js
new file mode 100644
index 000000000..abdd68891
--- /dev/null
+++ b/src/map/isometric/client/layout.js
@@ -0,0 +1,430 @@
+import { state } from './state.js';
+import { clamp, numberValue } from './utils.js';
+import {
+  resolveShape,
+  sizeFactor,
+  memberSizeFromRange,
+  groupKeyForPath,
+  scoreMember,
+  scoreToColor,
+  computeGrid,
+  buildSlots,
+  orderByAdjacency,
+  layoutGridItems,
+  layoutRadialItems,
+  layoutFlowItems,
+  layoutHexItems
+} from './layout-utils.js';
+
+export const createShapeGeometry = (shape) => {
+  const { THREE } = state;
+  const resolved = String(shape || 'square').toLowerCase();
+  state.shapeGeometryCache = state.shapeGeometryCache || new Map();
+  if (state.shapeGeometryCache.has(resolved)) {
+    return state.shapeGeometryCache.get(resolved);
+  }
+  const polygonMatch = resolved.match(/^(pentagon|hexagon|heptagon|octagon)(?:-(pyramid|frustum))?$/);
+  const polygonSides = {
+    pentagon: 5,
+    hexagon: 6,
+    heptagon: 7,
+    octagon: 8
+  };
+
+  let geometry;
+  if (resolved === 'circle') {
+    geometry = new THREE.CylinderGeometry(0.5, 0.5, 1, 32, 1, false);
+  } else if (resolved === 'pyramid') {
+    geometry = new THREE.CylinderGeometry(0, 0.55, 1, 4, 1, false);
+  } else if (polygonMatch) {
+    const [, polygon, variant] = polygonMatch;
+    const sides = polygonSides[polygon] || 6;
+    if (variant === 'pyramid') {
+      geometry = new THREE.CylinderGeometry(0, 0.55, 1, sides, 1, false);
+    } else if (variant === 'frustum') {
+      geometry = new THREE.CylinderGeometry(0.25, 0.55, 1, sides, 1, false);
+    } else {
+      geometry = new THREE.CylinderGeometry(0.5, 0.5, 1, sides, 1, false);
+    }
+  } else {
+    geometry = new THREE.BoxGeometry(1, 1, 1);
+  }
+  geometry.userData = { ...(geometry.userData || {}), shared: true };
+  state.shapeGeometryCache.set(resolved, geometry);
+  return geometry;
+};
+
+export const computeLayout = () => {
+  const {
+    THREE,
+    files,
+    edges,
+    layout,
+    layoutDefaults,
+    scoring,
+    colors,
+    scaleFactor,
+    fileByMember
+  } = state;
+
+  const groupDepth = Math.max(0, Math.floor(numberValue(layout.groupDepth, layoutDefaults.groupDepth)));
+  const baseSize =
numberValue(layout.baseSize, layoutDefaults.baseSize) * scaleFactor; + const fileHeight = numberValue(layout.fileHeight, layoutDefaults.fileHeight) * scaleFactor * 2; + const memberCell = numberValue(layout.memberCell, layoutDefaults.memberCell) * scaleFactor; + const memberGap = numberValue(layout.memberGap, layoutDefaults.memberGap) * scaleFactor; + const memberInset = numberValue(layout.memberInset, layoutDefaults.memberInset) * scaleFactor; + const fileSpacing = numberValue(layout.fileSpacing ?? layout.spacing, layoutDefaults.fileSpacing) * scaleFactor; + const groupSpacing = numberValue(layout.groupSpacing, layoutDefaults.groupSpacing) * scaleFactor; + const compactness = numberValue(layout.compactness, layoutDefaults.compactness); + const routingPadding = numberValue(layout.routingPadding, layoutDefaults.routingPadding) * scaleFactor; + const routingStep = numberValue(layout.routingStep, layoutDefaults.routingStep) * scaleFactor; + const labelScale = numberValue(layout.labelScale, layoutDefaults.labelScale) * scaleFactor; + const labelOffset = numberValue(layout.labelOffset, layoutDefaults.labelOffset) * scaleFactor; + const edgePlane = numberValue(layout.edgePlane, layoutDefaults.edgePlane) * scaleFactor; + const memberHeightBase = numberValue(layout.memberHeightBase, layoutDefaults.memberHeightBase) * scaleFactor; + const memberHeightScale = numberValue(layout.memberHeightScale, layoutDefaults.memberHeightScale) * scaleFactor; + const memberHeightMax = numberValue(layout.memberHeightMax, layoutDefaults.memberHeightMax) * scaleFactor; + + const edgeWeights = { + import: 3, + export: 3, + call: 2.5, + usage: 2, + dataflow: 2, + alias: 1.5 + }; + + const resolveEdgeFile = (endpoint) => { + if (!endpoint) return null; + if (endpoint.file) return endpoint.file; + if (endpoint.member) return fileByMember.get(endpoint.member) || null; + return null; + }; + + const fileAdjacency = new Map(); + const groupAdjacency = new Map(); + const touchAdjacency = (mapRef, from, to, weight) => { + if (!from || !to || from === to) return; + const bucket = mapRef.get(from) || new Map(); + bucket.set(to, (bucket.get(to) || 0) + weight); + mapRef.set(from, bucket); + }; + + const groupKeyByFile = new Map(); + const surfaceScaleForShape = (shape) => { + if (shape === 'pyramid') return 0.72; + if (shape?.endsWith('-pyramid')) return 0.72; + if (shape?.endsWith('-frustum')) return 0.82; + if (shape === 'circle') return 0.9; + return 1; + }; + + for (const node of files) { + const key = groupKeyForPath(node.path || node.name || '', groupDepth); + groupKeyByFile.set(node.path, key); + } + for (const edge of edges) { + const fromFile = resolveEdgeFile(edge.from); + const toFile = resolveEdgeFile(edge.to); + if (!fromFile || !toFile) continue; + const weight = edgeWeights[edge.type] || 1; + touchAdjacency(fileAdjacency, fromFile, toFile, weight); + touchAdjacency(fileAdjacency, toFile, fromFile, weight); + const fromGroup = groupKeyByFile.get(fromFile); + const toGroup = groupKeyByFile.get(toFile); + if (fromGroup && toGroup) { + touchAdjacency(groupAdjacency, fromGroup, toGroup, weight); + touchAdjacency(groupAdjacency, toGroup, fromGroup, weight); + } + } + + const groupsByKey = new Map(); + let maxMemberScore = 0; + let maxFileScore = 0; + + for (const node of files) { + const members = Array.isArray(node.members) ? 
node.members : []; + const membersWithMetrics = members.map((member) => { + const score = scoreMember(member, scoring); + const size = memberSizeFromRange(member.range); + maxMemberScore = Math.max(maxMemberScore, score); + const sizeScale = sizeFactor(size, 0.75, 0.12, 0.7, 1.35); + const scoreScale = sizeFactor(score, 0.65, 0.08, 0.8, 1.8); + return { + member, + score, + size, + footprintScale: clamp(sizeScale * scoreScale, 0.8, 3.2), + heightScale: clamp( + sizeFactor(size, 0.85, 0.18, 0.75, 1.6) * sizeFactor(score, 0.7, 0.08, 0.85, 2), + 0.8, + 2.6 + ) + }; + }); + const fileSize = membersWithMetrics.reduce((acc, entry) => acc + entry.size, 0) + || members.length + || 1; + const fileSizeScale = sizeFactor(fileSize, 0.8, 0.12, 0.75, 2.3); + const fileShape = resolveShape(layout.fileShape || layoutDefaults.fileShape, { + key: node.path || node.name, + category: node.category + }); + const grid = computeGrid(members.length); + const maxFootprintScale = membersWithMetrics.reduce((acc, entry) => Math.max(acc, entry.footprintScale), 1); + const cellSize = memberCell * maxFootprintScale; + const cellGap = memberGap * maxFootprintScale; + let width = baseSize; + let depth = baseSize; + if (members.length) { + width = Math.max(baseSize, grid.columns * cellSize + (grid.columns - 1) * cellGap + memberInset * 2); + depth = Math.max(baseSize, grid.rows * cellSize + (grid.rows - 1) * cellGap + memberInset * 2); + } + const fileScore = membersWithMetrics.reduce((acc, entry) => acc + entry.score, 0); + maxFileScore = Math.max(maxFileScore, fileScore); + const fileScoreScale = sizeFactor(fileScore, 0.85, 0.08, 0.85, 1.9); + width *= fileSizeScale * fileScoreScale; + depth *= fileSizeScale * fileScoreScale; + const fileHeightBoost = Math.min(6, Math.log1p(fileScore) * 0.35) * scaleFactor; + const fileHeightScale = sizeFactor(fileSize, 0.9, 0.08, 0.85, 1.5); + const fileComplexityScale = sizeFactor(fileScore, 0.85, 0.06, 0.9, 1.7); + const surfaceScale = surfaceScaleForShape(fileShape); + const surfaceWidth = width * surfaceScale; + const surfaceDepth = depth * surfaceScale; + const surfaceInset = memberInset * surfaceScale; + const fileHeightValue = (fileHeight + fileHeightBoost) * fileHeightScale * fileComplexityScale; + const fileLayout = { + node, + width, + depth, + height: fileHeightValue, + topY: fileHeightValue, + surfaceScale, + surfaceWidth, + surfaceDepth, + score: fileScore, + shape: fileShape, + columns: grid.columns, + rows: grid.rows, + cellSize, + cellGap, + slotStep: cellSize + cellGap, + memberSlots: buildSlots( + surfaceWidth, + surfaceDepth, + grid.columns, + grid.rows, + cellSize, + cellGap, + surfaceInset, + memberCell, + memberGap + ), + members: membersWithMetrics.map((entry) => { + const rawHeight = memberHeightBase + scoreMember(entry.member, scoring) * memberHeightScale; + const clampedHeight = Math.max(memberHeightBase, Math.min(memberHeightMax, rawHeight)); + return { + member: entry.member, + score: entry.score, + size: entry.size, + shape: resolveShape(layout.memberShape || layoutDefaults.memberShape, { + key: entry.member.id || entry.member.name, + type: entry.member.type + }), + footprint: memberCell * entry.footprintScale, + height: clampedHeight * entry.heightScale + }; + }) + }; + const key = groupKeyForPath(node.path || node.name || '', groupDepth); + const group = groupsByKey.get(key) || { key, files: [] }; + group.files.push(fileLayout); + groupsByKey.set(key, group); + } + + const layoutStyle = String(layout.style || layoutDefaults.style || 
'clustered').toLowerCase(); + const groups = orderByAdjacency( + Array.from(groupsByKey.values()), + (group) => group.key, + groupAdjacency + ); + + for (const group of groups) { + group.files = orderByAdjacency( + group.files, + (file) => file.node.path || file.node.name || '', + fileAdjacency + ); + if (layoutStyle === 'radial') { + const metrics = layoutRadialItems(group.files, fileSpacing); + group.width = Math.max(baseSize, metrics.width); + group.depth = Math.max(baseSize, metrics.depth); + } else if (layoutStyle === 'hex') { + const metrics = layoutHexItems(group.files, fileSpacing); + group.width = Math.max(baseSize, metrics.width); + group.depth = Math.max(baseSize, metrics.depth); + } else { + const columns = Math.max(1, Math.ceil(Math.sqrt(group.files.length || 1))); + const metrics = layoutGridItems(group.files, columns, fileSpacing); + group.width = Math.max(baseSize, metrics.width); + group.depth = Math.max(baseSize, metrics.depth); + } + } + + const allFiles = groups.flatMap((group) => group.files); + + if (layoutStyle === 'stream') { + const orderedFiles = orderByAdjacency( + allFiles, + (file) => file.node.path || file.node.name || '', + fileAdjacency + ); + let cursorX = 0; + let cursorZ = 0; + orderedFiles.forEach((fileLayout) => { + fileLayout.x = cursorX; + fileLayout.z = cursorZ; + cursorX += fileLayout.width + fileSpacing; + cursorZ += fileLayout.depth * 0.6 + fileSpacing * 0.6; + }); + } else if (layoutStyle === 'flat' || layoutStyle === 'grid') { + const orderedFiles = orderByAdjacency( + allFiles, + (file) => file.node.path || file.node.name || '', + fileAdjacency + ); + const columns = Math.max(1, Math.ceil(Math.sqrt(orderedFiles.length || 1))); + layoutGridItems(orderedFiles, columns, fileSpacing); + } else if (layoutStyle === 'radial') { + const groupRadii = groups.map((group) => Math.max(group.width || 0, group.depth || 0) / 2); + const maxGroupRadius = groupRadii.reduce((acc, value) => Math.max(acc, value), baseSize / 2); + const circumference = groupRadii.reduce((acc, value) => acc + (value * 2 + groupSpacing), 0); + const baseRadius = Math.max(maxGroupRadius * 2.2, circumference / (2 * Math.PI)); + let angle = 0; + groups.forEach((group, index) => { + const arc = (groupRadii[index] * 2 + groupSpacing) / baseRadius; + angle += arc / 2; + const offsetX = Math.cos(angle) * baseRadius; + const offsetZ = Math.sin(angle) * baseRadius; + for (const fileLayout of group.files) { + fileLayout.x += offsetX; + fileLayout.z += offsetZ; + } + angle += arc / 2; + }); + } else if (layoutStyle === 'flow') { + const orderedFiles = orderByAdjacency( + allFiles, + (file) => file.node.path || file.node.name || '', + fileAdjacency + ); + layoutFlowItems( + orderedFiles, + fileSpacing, + fileAdjacency, + (file) => file.node.path || file.node.name || '' + ); + } else if (layoutStyle === 'hex') { + const orderedFiles = orderByAdjacency( + allFiles, + (file) => file.node.path || file.node.name || '', + fileAdjacency + ); + layoutHexItems(orderedFiles, fileSpacing); + } else { + const groupCount = Math.max(1, groups.length); + const groupColumns = Math.ceil(Math.sqrt(groupCount)); + const groupLayouts = groups.map((group) => ({ + width: group.width || baseSize, + depth: group.depth || baseSize, + x: 0, + z: 0 + })); + layoutGridItems(groupLayouts, groupColumns, groupSpacing); + groups.forEach((group, index) => { + const offsetX = groupLayouts[index].x; + const offsetZ = groupLayouts[index].z; + for (const fileLayout of group.files) { + fileLayout.x += offsetX; + fileLayout.z += 
offsetZ; + } + }); + } + + let minX = 0; + let maxX = 0; + let minZ = 0; + let maxZ = 0; + if (allFiles.length) { + minX = Infinity; + maxX = -Infinity; + minZ = Infinity; + maxZ = -Infinity; + for (const fileLayout of allFiles) { + const left = fileLayout.x - fileLayout.width / 2; + const right = fileLayout.x + fileLayout.width / 2; + const back = fileLayout.z - fileLayout.depth / 2; + const front = fileLayout.z + fileLayout.depth / 2; + minX = Math.min(minX, left); + maxX = Math.max(maxX, right); + minZ = Math.min(minZ, back); + maxZ = Math.max(maxZ, front); + } + const centerX = (minX + maxX) / 2; + const centerZ = (minZ + maxZ) / 2; + for (const fileLayout of allFiles) { + fileLayout.x -= centerX; + fileLayout.z -= centerZ; + } + minX -= centerX; + maxX -= centerX; + minZ -= centerZ; + maxZ -= centerZ; + } + + if (Number.isFinite(compactness) && compactness > 0 && compactness !== 1) { + for (const fileLayout of allFiles) { + fileLayout.x *= compactness; + fileLayout.z *= compactness; + } + minX *= compactness; + maxX *= compactness; + minZ *= compactness; + maxZ *= compactness; + } + + const spanX = Math.max(40, maxX - minX); + const spanZ = Math.max(40, maxZ - minZ); + const maxSpan = Math.max(spanX, spanZ); + + Object.assign(state, { + layoutStyle, + layoutMetrics: { + groupDepth, + baseSize, + fileHeight, + memberCell, + memberGap, + memberInset, + fileSpacing, + groupSpacing, + compactness, + routingPadding, + routingStep, + labelScale, + labelOffset, + edgePlane + }, + edgeWeights, + groupKeyByFile, + fileAdjacency, + groupAdjacency, + groups, + allFiles, + maxMemberScore, + maxFileScore, + bounds: { minX, maxX, minZ, maxZ, spanX, spanZ, maxSpan }, + resolveShape, + scoreToColor: (score, key) => scoreToColor(score, maxMemberScore, colors, THREE, key) + }); +}; diff --git a/src/map/isometric/client/map-data.js b/src/map/isometric/client/map-data.js new file mode 100644 index 000000000..246dcaef1 --- /dev/null +++ b/src/map/isometric/client/map-data.js @@ -0,0 +1,47 @@ +import { state } from './state.js'; + +const buildMemberKey = (filePath, name, range) => { + const start = Number.isFinite(range?.startLine) ? range.startLine : ''; + const end = Number.isFinite(range?.endLine) ? range.endLine : ''; + return `${filePath}::${name || ''}:${start}-${end}`; +}; + +const buildMemberNameKey = (filePath, name) => `${filePath}::${name || ''}`; + +export const initMapData = () => { + const files = Array.isArray(state.map?.nodes) ? state.map.nodes : []; + const edges = Array.isArray(state.map?.edges) ? state.map.edges : []; + const nodeByPath = new Map(); + const nodeById = new Map(); + const memberById = new Map(); + const memberByKey = new Map(); + const fileByMember = new Map(); + + for (const node of files) { + if (node.path) nodeByPath.set(node.path, node); + if (node.name && !nodeByPath.has(node.name)) nodeByPath.set(node.name, node); + if (node.id) nodeById.set(node.id, node); + const members = Array.isArray(node.members) ? 
node.members : []; + for (const member of members) { + if (member?.id) memberById.set(member.id, member); + const filePath = member?.file || node.path || node.name || ''; + if (member?.id) fileByMember.set(member.id, filePath); + const rangeKey = buildMemberKey(filePath, member?.name || '', member?.range || {}); + memberByKey.set(rangeKey, member); + const nameKey = buildMemberNameKey(filePath, member?.name || ''); + if (!memberByKey.has(nameKey)) memberByKey.set(nameKey, member); + } + } + + Object.assign(state, { + files, + edges, + nodeByPath, + nodeById, + memberById, + memberByKey, + fileByMember, + buildMemberKey, + buildMemberNameKey + }); +}; diff --git a/src/map/isometric/client/materials.js b/src/map/isometric/client/materials.js new file mode 100644 index 000000000..736362d34 --- /dev/null +++ b/src/map/isometric/client/materials.js @@ -0,0 +1,507 @@ +import { state } from './state.js'; +import { clamp, numberValue } from './utils.js'; + +const getWireGeometry = (geometry, THREE) => { + const cache = state.wireGeometryCache || (state.wireGeometryCache = new Map()); + const key = geometry?.uuid || geometry; + if (cache.has(key)) return cache.get(key); + const wireGeom = new THREE.EdgesGeometry(geometry); + wireGeom.userData = { ...(wireGeom.userData || {}), shared: true }; + cache.set(key, wireGeom); + return wireGeom; +}; + +export const initMaterials = () => { + const { THREE, assets, visuals } = state; + state.glowMaterials = []; + state.flowMaterials = []; + state.glassMaterials = []; + state.labelMaterials = []; + state.glassShells = []; + state.wireMaterials = []; + state.gridLineMaterials = []; + state.normalMapState = { texture: null }; + + if (assets.normalMapUrl) { + const loader = new THREE.TextureLoader(); + loader.load(assets.normalMapUrl, (texture) => { + texture.wrapS = THREE.RepeatWrapping; + texture.wrapT = THREE.RepeatWrapping; + texture.repeat.set(visuals.glass.normalRepeat, visuals.glass.normalRepeat); + state.normalMapState.texture = texture; + applyGlassSettings(); + }); + } +}; + +export const applyHeightFog = (material) => { + const { visuals } = state; + if (!material || material.userData?.heightFogApplied) return; + material.userData.heightFogApplied = true; + const fogVarying = 'vIsoWorldPosition'; + const previousCompile = material.onBeforeCompile; + material.onBeforeCompile = (shader) => { + if (typeof previousCompile === 'function') { + previousCompile(shader); + } + shader.uniforms.fogHeight = { value: visuals.fogHeight }; + shader.uniforms.fogHeightRange = { value: visuals.fogHeightRange }; + shader.uniforms.fogHeightEnabled = { value: visuals.enableHeightFog ? 
1 : 0 };
+    const fogUniformsSnippet = [
+      '#include <fog_pars_fragment>',
+      ' uniform float fogHeight;',
+      ' uniform float fogHeightRange;',
+      ' uniform float fogHeightEnabled;'
+    ].join('\n');
+    const heightExpr =
+      ` float heightFactor = fogHeightEnabled * clamp((fogHeight - ${fogVarying}.y) / `
+      + 'max(0.001, fogHeightRange), 0.0, 1.0);';
+    const fogFragmentSnippet = [
+      '#ifdef USE_FOG',
+      ' float fogFactor = smoothstep(fogNear, fogFar, vFogDepth);',
+      heightExpr,
+      ' float combinedFog = max(fogFactor, heightFactor);',
+      ' gl_FragColor.rgb = mix(gl_FragColor.rgb, fogColor, combinedFog);',
+      '#endif'
+    ].join('\n');
+    // Patch the stock shader chunks: declare the world-position varying, add the
+    // height uniforms, then swap the fog mix for one that includes heightFactor.
+    if (!shader.vertexShader.includes(`varying vec3 ${fogVarying}`)) {
+      if (shader.vertexShader.includes('#include <common>')) {
+        shader.vertexShader = shader.vertexShader.replace(
+          '#include <common>',
+          `#include <common>\n varying vec3 ${fogVarying};`
+        );
+      }
+    }
+    if (shader.vertexShader.includes('#include <begin_vertex>')) {
+      shader.vertexShader = shader.vertexShader.replace(
+        '#include <begin_vertex>',
+        `#include <begin_vertex>\n ${fogVarying} = (modelMatrix * vec4(position, 1.0)).xyz;`
+      );
+    }
+    if (!shader.fragmentShader.includes(`varying vec3 ${fogVarying}`)) {
+      if (shader.fragmentShader.includes('#include <common>')) {
+        shader.fragmentShader = shader.fragmentShader.replace(
+          '#include <common>',
+          `#include <common>\n varying vec3 ${fogVarying};`
+        );
+      }
+    }
+    if (!shader.fragmentShader.includes('uniform float fogHeight')) {
+      if (shader.fragmentShader.includes('#include <fog_pars_fragment>')) {
+        shader.fragmentShader = shader.fragmentShader.replace(
+          '#include <fog_pars_fragment>',
+          fogUniformsSnippet
+        );
+      }
+    }
+    if (shader.fragmentShader.includes('#include <fog_fragment>')) {
+      shader.fragmentShader = shader.fragmentShader.replace(
+        '#include <fog_fragment>',
+        fogFragmentSnippet
+      );
+    }
+    material.userData.fogUniforms = shader.uniforms;
+  };
+  material.needsUpdate = true;
+};
+
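+// Glass look: MeshPhysicalMaterial with transmission for the outer surface,
+// paired with a slightly shrunken back-side shell (createGlassShell) so walls
+// read as having thickness.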
+export const createGlassMaterial = (color, opacity) => {
+  const { THREE, visuals, normalMapState, glassMaterials, glowMaterials } = state;
+  const glass = visuals.glass || state.visualDefaults.glass;
+  const transmission = clamp(glass.transmission ?? 0, 0, 1);
+  const material = new THREE.MeshPhysicalMaterial({
+    color,
+    metalness: glass.metalness,
+    roughness: glass.roughness,
+    transmission,
+    ior: glass.ior,
+    reflectivity: glass.reflectivity,
+    thickness: glass.thickness,
+    envMapIntensity: glass.envMapIntensity,
+    clearcoat: glass.clearcoat,
+    clearcoatRoughness: glass.clearcoatRoughness,
+    transparent: true,
+    opacity,
+    depthWrite: false,
+    side: THREE.DoubleSide
+  });
+  material.attenuationDistance = transmission > 0 ? 9999 : 0;
+  material.emissive = color.clone().multiplyScalar(0.25);
+  material.emissiveIntensity = 0.4;
+  material.userData = {
+    glowBase: 0.4,
+    glowRange: 0.3,
+    baseColor: color.clone(),
+    baseEmissive: material.emissive.clone(),
+    baseEmissiveIntensity: material.emissiveIntensity,
+    baseOpacity: opacity
+  };
+  if (normalMapState.texture) {
+    material.normalMap = normalMapState.texture;
+    material.clearcoatNormalMap = normalMapState.texture;
+    material.normalScale = new THREE.Vector2(glass.normalScale, glass.normalScale);
+    material.clearcoatNormalScale = new THREE.Vector2(glass.clearcoatNormalScale, glass.clearcoatNormalScale);
+  }
+  glassMaterials.push(material);
+  glowMaterials.push(material);
+  applyHeightFog(material);
+  return material;
+};
+
+export const createGlassShell = (geometry, material) => {
+  const { THREE, visuals, glassMaterials, glowMaterials, glassShells } = state;
+  const outer = new THREE.Mesh(geometry, material);
+  const innerMaterial = material.clone();
+  innerMaterial.side = THREE.BackSide;
+  innerMaterial.opacity = clamp(material.opacity * 0.9, 0.05, 1);
+  innerMaterial.userData = {
+    ...(material.userData || {}),
+    baseEmissive: material.emissive.clone(),
+    baseEmissiveIntensity: material.emissiveIntensity,
+    baseOpacity: innerMaterial.opacity
+  };
+  glassMaterials.push(innerMaterial);
+  glowMaterials.push(innerMaterial);
+  applyHeightFog(innerMaterial);
+  const inner = new THREE.Mesh(geometry, innerMaterial);
+  const thicknessScale = clamp(1 - visuals.glass.thickness * 0.03, 0.75, 0.98);
+  inner.scale.set(thicknessScale, thicknessScale, thicknessScale);
+  const group = new THREE.Group();
+  group.add(outer);
+  group.add(inner);
+  glassShells.push({ inner, outer });
+  return { group, outer, inner };
+};
+
+export const configureWireMaterial = (wireMat) => {
+  const { visuals, visualDefaults, scaleFactor } = state;
+  const thickness = numberValue(visuals.wireframeThickness, visualDefaults.wireframeThickness) * (scaleFactor || 1);
+  const glow = numberValue(visuals.wireframeGlow, visualDefaults.wireframeGlow);
+  const baseColor = wireMat.userData?.baseColor || wireMat.color;
+  const emissiveColor = wireMat.userData?.emissiveColor || baseColor;
+  wireMat.opacity = clamp(0.02 + glow * 0.22, 0.02, 0.8);
+  if ('linewidth' in wireMat) {
+    wireMat.linewidth = clamp(thickness, 0.01, 12);
+    wireMat.userData.baseLinewidth = wireMat.linewidth;
+  }
+  wireMat.color.copy(emissiveColor);
+  wireMat.userData.glowBase = 0.03 + glow * 0.2;
+  wireMat.userData.glowRange = 0.05 + glow * 0.35;
+  wireMat.userData.flowSpeed = numberValue(visuals.wirePulseSpeed, visualDefaults.wirePulseSpeed);
+  if ('toneMapped' in wireMat) wireMat.toneMapped = false;
+};
+
+export const createWireframe = (geometry, color, phase) => {
+  const {
+    THREE,
+    LineMaterial,
+    LineSegments2,
+    LineSegmentsGeometry,
+    lineResolution,
+    wireMaterials
+  } = state;
+  const wireGeom = getWireGeometry(geometry, THREE);
+  let wireMat;
+  if (LineMaterial && LineSegments2 && LineSegmentsGeometry) {
+    wireMat = new LineMaterial({
+      color,
+      transparent: true,
+      opacity: 0.2,
+      linewidth: 1,
+      blending: THREE.AdditiveBlending,
+      depthWrite: false,
+      depthTest: false
+    });
+    wireMat.worldUnits = true;
+    wireMat.resolution.set(lineResolution.width, lineResolution.height);
+  } else {
+    wireMat = new THREE.LineBasicMaterial({
+      color,
+      transparent: true,
+      opacity: 0.2,
+      linewidth: 1,
+      blending: THREE.AdditiveBlending,
+      depthWrite: false,
+      depthTest: false
+    });
+  }
+  const emissiveColor = color.clone().lerp(new
THREE.Color(0xffffff), 0.55); + wireMat.userData = { + glowBase: 0.18, + glowRange: 0.25, + flowPhase: phase || 0, + baseColor: color.clone(), + emissiveColor: emissiveColor.clone() + }; + configureWireMaterial(wireMat); + wireMaterials.push(wireMat); + if (LineSegments2 && LineSegmentsGeometry && wireMat instanceof LineMaterial) { + const lineGeom = new LineSegmentsGeometry(); + lineGeom.setPositions(wireGeom.attributes.position.array); + const line = new LineSegments2(lineGeom, wireMat); + line.computeLineDistances(); + return line; + } + return new THREE.LineSegments(wireGeom, wireMat); +}; + +export const createTextPlane = (text, options = {}) => { + const { THREE, labelMaterials } = state; + const size = Number.isFinite(options.size) ? options.size : 0; + const maxTextureSize = 1024; + const baseFontSize = Math.max(20, Math.round(220 * (size || 1))); + const canvas = document.createElement('canvas'); + const context = canvas.getContext('2d'); + const measure = (fontPx) => { + context.font = `600 ${fontPx}px "Segoe UI", sans-serif`; + const paddingPx = Math.round(fontPx * 0.2); + const metrics = context.measureText(text); + const widthPx = Math.ceil(metrics.width + paddingPx * 2); + const heightPx = Math.ceil(fontPx + paddingPx * 2); + return { fontPx, paddingPx, widthPx, heightPx }; + }; + let { fontPx, paddingPx, widthPx, heightPx } = measure(baseFontSize); + const scaleDown = Math.min(1, maxTextureSize / Math.max(widthPx, heightPx)); + if (scaleDown < 1) { + ({ fontPx, paddingPx, widthPx, heightPx } = measure(Math.max(10, Math.floor(baseFontSize * scaleDown)))); + } + canvas.width = Math.min(maxTextureSize, widthPx); + canvas.height = Math.min(maxTextureSize, heightPx); + context.clearRect(0, 0, canvas.width, canvas.height); + context.font = `600 ${fontPx}px "Segoe UI", sans-serif`; + context.fillStyle = options.color || '#e7eef8'; + context.textBaseline = 'middle'; + context.textAlign = 'left'; + context.fillText(text, paddingPx, canvas.height / 2); + const texture = new THREE.CanvasTexture(canvas); + texture.needsUpdate = true; + const material = new THREE.MeshBasicMaterial({ + color: 0xffffff, + transparent: true, + opacity: options.opacity ?? 0.9, + side: THREE.DoubleSide, + depthWrite: false, + map: texture + }); + if ('toneMapped' in material) material.toneMapped = false; + material.userData = { baseOpacity: material.opacity }; + applyHeightFog(material); + labelMaterials.push(material); + const plane = new THREE.Mesh(new THREE.PlaneGeometry(canvas.width / 100, canvas.height / 100), material); + plane.userData = { labelTexture: texture }; + return plane; +}; + +export const applyGlassSettings = () => { + const { + THREE, + visuals, + visualDefaults, + glassMaterials, + glassShells, + normalMapState, + flowMaterials, + grid, + edgeDotMaterial + } = state; + const glass = visuals.glass || visualDefaults.glass; + const transmission = clamp(glass.transmission ?? 0, 0, 1); + for (const material of glassMaterials) { + material.metalness = glass.metalness; + material.roughness = glass.roughness; + material.transmission = transmission; + material.ior = glass.ior; + material.reflectivity = glass.reflectivity; + material.thickness = glass.thickness; + material.attenuationDistance = transmission > 0 ? 
9999 : 0; + material.envMapIntensity = glass.envMapIntensity; + material.clearcoat = glass.clearcoat; + material.clearcoatRoughness = glass.clearcoatRoughness; + if (normalMapState.texture) { + material.normalScale = new THREE.Vector2(glass.normalScale, glass.normalScale); + material.clearcoatNormalScale = new THREE.Vector2(glass.clearcoatNormalScale, glass.clearcoatNormalScale); + } + if (material.userData?.fogUniforms) { + material.userData.fogUniforms.fogHeight.value = visuals.fogHeight; + material.userData.fogUniforms.fogHeightRange.value = visuals.fogHeightRange; + if ('fogHeightEnabled' in material.userData.fogUniforms) { + material.userData.fogUniforms.fogHeightEnabled.value = visuals.enableHeightFog ? 1 : 0; + } + } + material.needsUpdate = true; + } + for (const material of flowMaterials) { + if ('envMapIntensity' in material) { + material.envMapIntensity = glass.envMapIntensity; + } + material.needsUpdate = true; + } + if (edgeDotMaterial && 'envMapIntensity' in edgeDotMaterial) { + edgeDotMaterial.envMapIntensity = glass.envMapIntensity; + edgeDotMaterial.needsUpdate = true; + } + if (grid?.material && 'envMapIntensity' in grid.material) { + grid.material.envMapIntensity = glass.envMapIntensity; + grid.material.needsUpdate = true; + } + const thicknessScale = clamp(1 - glass.thickness * 0.03, 0.75, 0.98); + for (const shell of glassShells) { + if (shell?.inner) shell.inner.scale.set(thicknessScale, thicknessScale, thicknessScale); + } +}; + +export const updateFileOpacity = () => { + const { visuals, visualDefaults, fileMeshes, fileChunkMeshes } = state; + const baseOpacity = clamp(numberValue(visuals.fileOpacity, visualDefaults.fileOpacity), 0.1, 1); + for (const mesh of [...fileMeshes, ...fileChunkMeshes]) { + const offset = mesh.userData?.opacityOffset ?? 0; + const opacity = clamp(baseOpacity + offset, 0.1, 1); + if (mesh.material) { + mesh.material.opacity = opacity; + if (mesh.material.userData) mesh.material.userData.baseOpacity = opacity; + } + const inner = mesh.userData?.shellInner; + if (inner?.material) { + const innerOpacity = clamp(opacity * 0.9, 0.05, 1); + inner.material.opacity = innerOpacity; + if (inner.material.userData) inner.material.userData.baseOpacity = innerOpacity; + } + } +}; + +export const updateMemberOpacity = () => { + const { visuals, visualDefaults, memberMeshes, chunkMeshes } = state; + const baseOpacity = clamp(numberValue(visuals.memberOpacity, visualDefaults.memberOpacity), 0.1, 1); + for (const mesh of [...memberMeshes, ...chunkMeshes]) { + const offset = mesh.userData?.opacityOffset ?? 
0; + const opacity = clamp(baseOpacity + offset, 0.1, 1); + if (mesh.material) { + mesh.material.opacity = opacity; + if (mesh.material.userData) mesh.material.userData.baseOpacity = opacity; + } + const inner = mesh.userData?.shellInner; + if (inner?.material) { + const innerOpacity = clamp(opacity * 0.9, 0.05, 1); + inner.material.opacity = innerOpacity; + if (inner.material.userData) inner.material.userData.baseOpacity = innerOpacity; + } + } +}; + +export const updateWireframes = () => { + const { wireMaterials, lineResolution } = state; + for (const material of wireMaterials) { + configureWireMaterial(material); + if (material.resolution && typeof material.resolution.set === 'function') { + material.resolution.set(lineResolution.width, lineResolution.height); + } + material.needsUpdate = true; + } +}; + +export const updateFlowGlow = () => { + const { flowMaterials, visuals } = state; + for (const material of flowMaterials) { + material.emissiveIntensity = visuals.flowGlowBase; + material.userData.glowBase = visuals.flowGlowBase; + material.userData.glowRange = visuals.flowGlowRange; + material.userData.baseEmissiveIntensity = visuals.flowGlowBase; + } +}; + +export const updateGridGlow = () => { + const { visuals, visualDefaults, gridLineMaterials, lineResolution } = state; + const base = numberValue(visuals.gridGlowBase, visualDefaults.gridGlowBase); + const range = numberValue(visuals.gridGlowRange, visualDefaults.gridGlowRange); + const thickness = numberValue(visuals.gridLineThickness, visualDefaults.gridLineThickness); + for (const material of gridLineMaterials) { + material.opacity = clamp(base + range * 0.5, 0.05, 0.9); + material.userData.glowBase = base; + material.userData.glowRange = range; + material.userData.flowSpeed = numberValue(visuals.gridPulseSpeed, visualDefaults.gridPulseSpeed); + if ('linewidth' in material) { + material.linewidth = clamp(thickness, 0.02, 6); + } + if (material.resolution && typeof material.resolution.set === 'function') { + material.resolution.set(lineResolution.width, lineResolution.height); + } + } +}; + +export const updateFog = (maxSpanOverride) => { + const { + fogBounds, + visuals, + visualDefaults, + scene, + THREE, + glassMaterials, + labelMaterials, + flowMaterials, + wireMaterials, + gridLineMaterials, + grid, + edgeDotMaterial + } = state; + if (Number.isFinite(maxSpanOverride)) { + fogBounds.maxSpan = maxSpanOverride; + } + const maxSpan = fogBounds.maxSpan || 120; + const enableFog = visuals.enableFog === true; + const fogMaterials = [ + ...glassMaterials, + ...labelMaterials, + ...flowMaterials, + ...wireMaterials, + ...gridLineMaterials, + ...(edgeDotMaterial ? 
[edgeDotMaterial] : []) + ]; + if (!enableFog) { + scene.fog = null; + if (state.fogEnabled !== enableFog) { + state.fogEnabled = enableFog; + fogMaterials.forEach((material) => { + if (material) material.needsUpdate = true; + }); + } + return; + } + const colorValue = visuals.fogColor || visualDefaults.fogColor; + const fogColor = new THREE.Color(colorValue); + const distance = numberValue(visuals.fogDistance, visualDefaults.fogDistance); + const fogNear = maxSpan * 0.9; + const fogFar = maxSpan * Math.max(1.1, distance); + scene.fog = new THREE.Fog(fogColor.getHex(), fogNear, fogFar); + if (state.fogEnabled !== enableFog) { + state.fogEnabled = enableFog; + fogMaterials.forEach((material) => { + if (material) material.needsUpdate = true; + }); + } + const updateFogUniforms = (material) => { + if (!material?.userData?.fogUniforms) return; + material.userData.fogUniforms.fogHeight.value = visuals.fogHeight; + material.userData.fogUniforms.fogHeightRange.value = visuals.fogHeightRange; + if ('fogHeightEnabled' in material.userData.fogUniforms) { + material.userData.fogUniforms.fogHeightEnabled.value = visuals.enableHeightFog ? 1 : 0; + } + }; + fogMaterials.forEach(updateFogUniforms); + if (grid?.material) updateFogUniforms(grid.material); +}; + +export const updateFlowLights = () => { + const { visuals, flowLights } = state; + const enabled = visuals.enableFlowLights !== false; + for (const light of flowLights) { + light.visible = enabled; + } +}; + +export const updateExtraLights = () => { + const { visuals, extraLights } = state; + const enabled = visuals.enableExtraLights !== false; + for (const light of extraLights) { + light.visible = enabled; + } +}; diff --git a/src/map/isometric/client/meshes.js b/src/map/isometric/client/meshes.js new file mode 100644 index 000000000..0d95858f3 --- /dev/null +++ b/src/map/isometric/client/meshes.js @@ -0,0 +1,329 @@ +import { state } from './state.js'; +import { clamp, hashString } from './utils.js'; +import { createGlassMaterial, createGlassShell, createTextPlane, createWireframe } from './materials.js'; +import { createShapeGeometry } from './layout.js'; + +const colorPalette = { + source: 0x2980b9, + test: 0x8e44ad, + config: 0x16a085, + docs: 0xd35400, + generated: 0x7f8c8d, + dir: 0x34495e, + other: 0x2c3e50 +}; + +export const buildMeshes = () => { + const { + THREE, + visuals, + allFiles, + layoutMetrics, + fileGroup, + memberGroup, + labelGroup, + wireGroup + } = state; + + const { labelOffset, memberCell } = layoutMetrics; + const labelsEnabled = Boolean(labelGroup?.visible); + const chunkInstances = []; + const fileChunkInstances = []; + + const colorFromKey = (value, saturation = 0.65, lightness = 0.55) => { + if (!value) return null; + const seed = hashString(value); + const hue = (seed % 360) / 360; + return new THREE.Color().setHSL(hue, saturation, lightness); + }; + + for (const fileLayout of allFiles) { + const node = fileLayout.node; + const geometry = createShapeGeometry(fileLayout.shape); + const languageKey = node.language || node.type || node.ext || node.category || node.name; + const fileColor = colorFromKey(languageKey, 0.68, 0.52) + || new THREE.Color(colorPalette[node.category] || colorPalette.other); + const fileOpacity = Math.max(0.1, Math.min(1, visuals.fileOpacity)); + const material = createGlassMaterial(fileColor, fileOpacity); + const shell = createGlassShell(geometry, material); + const mesh = shell.outer; + shell.group.position.set(fileLayout.x, fileLayout.height / 2, fileLayout.z); + 
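// The shape geometry is unit-sized, so the group scale below stretches it to this file's width/height/depth. +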
shell.group.scale.set(fileLayout.width, fileLayout.height, fileLayout.depth); + const fileTopY = Number.isFinite(fileLayout.topY) ? fileLayout.topY : fileLayout.height; + mesh.castShadow = true; + mesh.receiveShadow = true; + mesh.userData = { + type: 'file', + file: node.path || node.name, + name: node.name, + id: node.id || null, + range: null, + baseColor: fileColor.clone(), + shellInner: shell.inner, + shellGroup: shell.group + }; + fileGroup.add(shell.group); + state.fileMeshes.push(mesh); + state.glowMaterials.push(material); + const fileWireColor = fileColor.clone(); + const fileWire = createWireframe(geometry, fileWireColor, shell.group.position.x + shell.group.position.z); + fileWire.position.copy(shell.group.position); + fileWire.rotation.copy(shell.group.rotation); + fileWire.scale.copy(shell.group.scale); + wireGroup.add(fileWire); + state.wireByMesh.set(mesh, fileWire); + const fileKey = node.path || node.name; + if (fileKey) { + state.fileAnchors.set(fileKey, { x: shell.group.position.x, y: fileTopY, z: shell.group.position.z }); + state.fileColorByPath.set(fileKey, fileColor.clone()); + state.fileMeshByKey.set(fileKey, mesh); + } + + const fileChunkCount = clamp(Math.ceil(Math.sqrt(fileLayout.members.length || 1)), 1, 8); + if (fileChunkCount > 0) { + const innerWidth = fileLayout.width * 0.6; + const innerDepth = fileLayout.depth * 0.6; + const cols = Math.max(1, Math.ceil(Math.sqrt(fileChunkCount))); + const rows = Math.max(1, Math.ceil(fileChunkCount / cols)); + const stepX = innerWidth / cols; + const stepZ = innerDepth / rows; + const startX = fileLayout.x - innerWidth / 2 + stepX / 2; + const startZ = fileLayout.z - innerDepth / 2 + stepZ / 2; + for (let i = 0; i < fileChunkCount; i += 1) { + const row = Math.floor(i / cols); + const col = i % cols; + const seed = hashString(`${fileKey || node.name || 'file'}:${i}`); + const t = (seed % 1000) / 1000; + const heightScale = 0.6 + t * 0.6; + const footprintScale = 0.45 + ((seed >> 6) % 100) / 250; + const chunkHeight = Math.max(0.12, fileLayout.height * 0.08 * heightScale); + const chunkFootprint = Math.min(stepX, stepZ) * footprintScale; + const innerBottom = fileLayout.height * 0.12; + const innerTop = fileLayout.height * 0.65; + const centerY = Math.min( + innerTop - chunkHeight / 2, + innerBottom + (innerTop - innerBottom) * t + ); + const chunkColor = fileColor.clone().offsetHSL(0.05 * t, 0.08, 0.08); + fileChunkInstances.push({ + x: startX + col * stepX, + y: centerY, + z: startZ + row * stepZ, + scaleX: chunkFootprint, + scaleY: chunkHeight, + scaleZ: chunkFootprint, + color: chunkColor + }); + } + } + + const fileLabelText = String(node.name || node.path || '').split('/').filter(Boolean).pop(); + if (labelsEnabled && fileLabelText) { + const fileLabelSize = Math.min(fileLayout.width, fileLayout.depth); + const fileLabel = createTextPlane(fileLabelText, { size: fileLabelSize }); + if (fileLabel.material) fileLabel.material.depthTest = true; + fileLabel.position.set( + shell.group.position.x + fileLayout.width * 0.5 + labelOffset, + Math.max(0.3, fileTopY * 0.6), + shell.group.position.z + fileLayout.depth * 0.5 + labelOffset + ); + fileLabel.rotation.y = -Math.PI / 4; + fileLabel.renderOrder = 4; + labelGroup.add(fileLabel); + } + + const slots = fileLayout.memberSlots; + if (!slots.length) continue; + const members = fileLayout.members.slice().sort((a, b) => { + const footprintDiff = (b.footprint || 0) - (a.footprint || 0); + if (footprintDiff !== 0) return footprintDiff; + return (b.height || 0) - (a.height || 
0); + }); + const slotStep = fileLayout.slotStep || (layoutMetrics.memberCell + layoutMetrics.memberGap); + const slotLimit = Math.max(0.2, slotStep - layoutMetrics.memberGap * 0.6); + const maxFootprint = Math.min( + (fileLayout.surfaceWidth || fileLayout.width) / Math.max(1, fileLayout.columns || 1), + (fileLayout.surfaceDepth || fileLayout.depth) / Math.max(1, fileLayout.rows || 1), + slotLimit + ) - layoutMetrics.memberGap; + members.forEach((entry, index) => { + const slot = slots[index]; + if (!slot) return; + const height = entry.height; + const rawFootprint = entry.footprint || memberCell; + const footprint = Math.max(0.2, Math.min(rawFootprint, maxFootprint || rawFootprint)); + const mGeom = createShapeGeometry(entry.shape); + const memberKey = entry.member.id || entry.member.name || entry.member.file || ''; + const mColor = state.scoreToColor(entry.score, memberKey); + if (entry.member.id) state.memberColorById.set(entry.member.id, mColor.clone()); + const memberOpacity = Math.max(0.1, Math.min(1, visuals.memberOpacity)); + const mMat = createGlassMaterial(mColor, memberOpacity); + const shellMember = createGlassShell(mGeom, mMat); + const mMesh = shellMember.outer; + const anchorY = Number.isFinite(fileLayout.topY) ? fileLayout.topY : fileLayout.height; + const lift = Math.max(0.06, layoutMetrics.memberGap * 0.35); + shellMember.group.position.set(fileLayout.x + slot.x, anchorY + height / 2 + lift, fileLayout.z + slot.z); + shellMember.group.scale.set(footprint, height, footprint); + mMesh.castShadow = false; + mMesh.receiveShadow = false; + mMesh.userData = { + type: 'member', + file: node.path || node.name, + id: entry.member.id || null, + name: entry.member.name, + range: entry.member.range || null, + baseColor: mColor.clone(), + shellInner: shellMember.inner, + shellGroup: shellMember.group + }; + memberGroup.add(shellMember.group); + state.memberMeshes.push(mMesh); + state.glowMaterials.push(mMat); + const memberWireColor = mColor.clone(); + const memberWire = createWireframe( + mGeom, + memberWireColor, + shellMember.group.position.x + shellMember.group.position.z + ); + memberWire.position.copy(shellMember.group.position); + memberWire.rotation.copy(shellMember.group.rotation); + memberWire.scale.copy(shellMember.group.scale); + wireGroup.add(memberWire); + state.wireByMesh.set(mMesh, memberWire); + if (entry.member.id) { + state.memberAnchors.set(entry.member.id, { + x: shellMember.group.position.x, + y: shellMember.group.position.y + height / 2, + z: shellMember.group.position.z + }); + state.memberMeshById.set(entry.member.id, mMesh); + } + if (labelsEnabled && entry.member.name) { + const memberLabelSize = Math.min(footprint, height); + const memberLabel = createTextPlane(entry.member.name, { size: memberLabelSize }); + if (memberLabel.material) memberLabel.material.depthTest = true; + memberLabel.position.set( + shellMember.group.position.x + footprint * 0.5 + labelOffset, + shellMember.group.position.y, + shellMember.group.position.z + footprint * 0.5 + labelOffset + ); + memberLabel.rotation.y = -Math.PI / 4; + memberLabel.renderOrder = 4; + labelGroup.add(memberLabel); + } + + const dataflow = entry.member.dataflow || {}; + const controlFlow = entry.member.controlFlow || {}; + const flowCount = [ + dataflow.reads, + dataflow.writes, + dataflow.mutations, + dataflow.aliases + ].reduce((acc, value) => acc + (Array.isArray(value) ? 
value.length : 0), 0); + const controlCount = Object.values(controlFlow).reduce((acc, value) => { + if (Array.isArray(value)) return acc + value.length; + if (typeof value === 'number') return acc + value; + if (value) return acc + 1; + return acc; + }, 0); + const chunkCount = clamp(Math.ceil(Math.sqrt(flowCount + controlCount + 1)), 1, 6); + const footprintScale = footprint / memberCell; + let chunkHeight = Math.max(0.08, height * clamp(0.1 + footprintScale * 0.02, 0.1, 0.18)); + const chunkFootprintScale = clamp(0.55 + footprintScale * 0.15, 0.6, 0.95); + const chunkFootprint = Math.min(footprint, footprint * chunkFootprintScale); + let chunkGap = Math.max(0.02, chunkHeight * 0.12); + const maxStackHeight = height * 0.55; + const stackHeight = chunkCount * chunkHeight + (chunkCount - 1) * chunkGap; + if (stackHeight > maxStackHeight && stackHeight > 0) { + const scale = maxStackHeight / stackHeight; + chunkHeight *= scale; + chunkGap *= scale; + } + const topY = shellMember.group.position.y + height / 2 - 0.04; + const chunkStart = topY - (chunkCount * chunkHeight + (chunkCount - 1) * chunkGap) + chunkHeight / 2; + for (let i = 0; i < chunkCount; i += 1) { + const chunkY = chunkStart + i * (chunkHeight + chunkGap); + const chunkColor = mColor.clone().offsetHSL(0.02 * i, 0.08, 0.08); + chunkInstances.push({ + x: shellMember.group.position.x, + y: chunkY, + z: shellMember.group.position.z, + scaleX: chunkFootprint, + scaleY: chunkHeight, + scaleZ: chunkFootprint, + color: chunkColor + }); + } + }); + } + + if (chunkInstances.length) { + const chunkGeometry = createShapeGeometry('square'); + const chunkOpacity = Math.min(1, Math.max(0.1, visuals.memberOpacity) + 0.1); + const chunkMaterial = createGlassMaterial(new THREE.Color(0xffffff), chunkOpacity); + chunkMaterial.vertexColors = true; + chunkMaterial.userData.glowSpeed = 1.4; + chunkMaterial.userData.glowPhase = -0.6; + const prevCompile = chunkMaterial.onBeforeCompile; + chunkMaterial.onBeforeCompile = (shader) => { + if (typeof prevCompile === 'function') prevCompile(shader); + // Tint the emissive term with the per-instance color so each stack glows in its member's hue. + if (shader.fragmentShader.includes('vColor')) { + shader.fragmentShader = shader.fragmentShader.replace( + '#include <emissivemap_fragment>', + '#include <emissivemap_fragment>\n totalEmissiveRadiance *= vColor;' + ); + } + }; + chunkMaterial.needsUpdate = true; + const chunkMesh = new THREE.InstancedMesh(chunkGeometry, chunkMaterial, chunkInstances.length); + chunkMesh.castShadow = false; + chunkMesh.receiveShadow = false; + const dummy = new THREE.Object3D(); + chunkInstances.forEach((entry, index) => { + dummy.position.set(entry.x, entry.y, entry.z); + dummy.scale.set(entry.scaleX, entry.scaleY, entry.scaleZ); + dummy.updateMatrix(); + chunkMesh.setMatrixAt(index, dummy.matrix); + chunkMesh.setColorAt(index, entry.color); + }); + chunkMesh.instanceMatrix.needsUpdate = true; + if (chunkMesh.instanceColor) chunkMesh.instanceColor.needsUpdate = true; + chunkMesh.userData = { type: 'chunk', opacityOffset: 0.1 }; + memberGroup.add(chunkMesh); + state.chunkMeshes.push(chunkMesh); + } + + if (fileChunkInstances.length) { + const chunkGeometry = createShapeGeometry('square'); + const chunkOpacity = Math.min(1, Math.max(0.1, visuals.fileOpacity) + 0.05); + const chunkMaterial = createGlassMaterial(new THREE.Color(0xffffff), chunkOpacity); + chunkMaterial.vertexColors = true; + chunkMaterial.userData.glowSpeed = 0.6; + chunkMaterial.userData.glowPhase = Math.PI * 0.3; + const prevCompile = chunkMaterial.onBeforeCompile; + chunkMaterial.onBeforeCompile = (shader) => { + if (typeof prevCompile === 'function')
prevCompile(shader); + if (shader.fragmentShader.includes('vColor')) { + shader.fragmentShader = shader.fragmentShader.replace( + '#include <emissivemap_fragment>', + '#include <emissivemap_fragment>\n totalEmissiveRadiance *= vColor;' + ); + } + }; + chunkMaterial.needsUpdate = true; + const chunkMesh = new THREE.InstancedMesh(chunkGeometry, chunkMaterial, fileChunkInstances.length); + const dummy = new THREE.Object3D(); + fileChunkInstances.forEach((entry, index) => { + dummy.position.set(entry.x, entry.y, entry.z); + dummy.scale.set(entry.scaleX, entry.scaleY, entry.scaleZ); + dummy.updateMatrix(); + chunkMesh.setMatrixAt(index, dummy.matrix); + chunkMesh.setColorAt(index, entry.color); + }); + chunkMesh.instanceMatrix.needsUpdate = true; + if (chunkMesh.instanceColor) chunkMesh.instanceColor.needsUpdate = true; + chunkMesh.userData = { type: 'file-chunk', opacityOffset: 0.05 }; + fileGroup.add(chunkMesh); + state.fileChunkMeshes.push(chunkMesh); + } +}; diff --git a/src/map/isometric/client/rebuild.js b/src/map/isometric/client/rebuild.js new file mode 100644 index 000000000..ef7aeca50 --- /dev/null +++ b/src/map/isometric/client/rebuild.js @@ -0,0 +1,224 @@ +import { state } from './state.js'; +import { clearGroup, disposeObject } from './scene-utils.js'; +import { applyHeightFog, updateFog, updateGridGlow, updateFlowLights } from './materials.js'; +import { computeLayout } from './layout.js'; +import { buildMeshes } from './meshes.js'; +import { buildEdges } from './edges.js'; +import { applyHighlights } from './selection.js'; + +const resetScene = () => { + clearGroup(state.fileGroup); + clearGroup(state.memberGroup); + clearGroup(state.labelGroup); + clearGroup(state.edgeGroup); + clearGroup(state.wireGroup); + state.fileMeshes = []; + state.memberMeshes = []; + state.chunkMeshes = []; + state.fileChunkMeshes = []; + state.glowMaterials = []; + state.flowMaterials = []; + state.glassMaterials = []; + state.labelMaterials = []; + state.glassShells = []; + state.wireMaterials = []; + state.gridLineMaterials = []; + state.edgeMeshes = []; + state.edgeSegments = []; + state.edgeDotMesh = null; + state.edgeDotMaterial = null; + state.fileMeshByKey = new Map(); + state.memberMeshById = new Map(); + state.wireByMesh = new Map(); + state.fileAnchors = new Map(); + state.memberAnchors = new Map(); + state.fileColorByPath = new Map(); + state.memberColorById = new Map(); + state.edgeTypeGroups = new Map(); + state.edgeTypes = []; + if (state.flowLights) { + state.flowLights.forEach((light) => state.scene.remove(light)); + } + state.flowLights = []; + if (state.grid) { + state.scene.remove(state.grid); + disposeObject(state.grid); + state.grid = null; + } + if (state.gridLines) { + clearGroup(state.gridLines); + state.scene.remove(state.gridLines); + state.gridLines = null; + } +}; + +export const scheduleRebuild = (delay = 180) => { + if (state.rebuildTimer) { + clearTimeout(state.rebuildTimer); + } + state.rebuildTimer = setTimeout(() => { + state.rebuildTimer = null; + rebuildScene(); + }, delay); +}; + +export const rebuildScene = () => { + if (typeof state.syncStateFromPanel === 'function') { + state.syncStateFromPanel(); + } + const preservedCamera = { + position: state.camera.position.clone(), + zoom: state.camera.zoom + }; + resetScene(); + computeLayout(); + + const { + THREE, + visuals, + LineMaterial, + LineSegments2, + LineSegmentsGeometry, + layoutMetrics, + bounds, + scene, + lineResolution, + lockIsometric, + camera, + controlDefaults, + controls, + renderer + } = state; + + const edgePlane = layoutMetrics.edgePlane; +
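// Size the ground to the layout span plus a 40% margin, rounded up to a multiple of 10 so grid lines land on even steps. +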
const gridSize = Math.max(80, Math.ceil(bounds.maxSpan * 1.4 / 10) * 10); + const groundGeometry = new THREE.PlaneGeometry(gridSize, gridSize); + const groundMaterial = new THREE.MeshStandardMaterial({ + color: 0x151a20, + metalness: 1, + roughness: 0.25, + envMapIntensity: visuals.glass.envMapIntensity * 0.6 + }); + applyHeightFog(groundMaterial); + state.grid = new THREE.Mesh(groundGeometry, groundMaterial); + state.grid.rotation.x = -Math.PI / 2; + state.grid.position.y = edgePlane - 0.05 * state.scaleFactor; + state.grid.receiveShadow = true; + scene.add(state.grid); + state.grid.visible = state.gridVisible; + state.groundPlane.constant = -state.grid.position.y; + + const gridLineStep = Math.max(2, Math.round(layoutMetrics.baseSize)); + const gridHalf = gridSize / 2; + const gridY = state.grid.position.y + 0.02 * state.scaleFactor; + const gridBuckets = [ + { positions: [], phase: 0 }, + { positions: [], phase: 1.8 }, + { positions: [], phase: 3.6 } + ]; + let lineIndex = 0; + for (let x = -gridHalf; x <= gridHalf; x += gridLineStep) { + const bucket = gridBuckets[lineIndex % gridBuckets.length]; + bucket.positions.push(x, gridY, -gridHalf, x, gridY, gridHalf); + lineIndex += 1; + } + for (let z = -gridHalf; z <= gridHalf; z += gridLineStep) { + const bucket = gridBuckets[lineIndex % gridBuckets.length]; + bucket.positions.push(-gridHalf, gridY, z, gridHalf, gridY, z); + lineIndex += 1; + } + const gridLineColor = new THREE.Color('#3b4350'); + state.gridLines = new THREE.Group(); + gridBuckets.forEach((bucket) => { + if (!bucket.positions.length) return; + let gridLineMaterial; + if (LineMaterial && LineSegments2 && LineSegmentsGeometry) { + gridLineMaterial = new LineMaterial({ + color: gridLineColor, + transparent: true, + opacity: visuals.gridGlowBase, + linewidth: visuals.gridLineThickness, + blending: THREE.AdditiveBlending, + depthWrite: false, + depthTest: false + }); + gridLineMaterial.resolution.set(lineResolution.width, lineResolution.height); + } else { + gridLineMaterial = new THREE.LineBasicMaterial({ + color: gridLineColor, + transparent: true, + opacity: visuals.gridGlowBase, + blending: THREE.AdditiveBlending, + depthWrite: false, + depthTest: false + }); + } + gridLineMaterial.userData = { + glowBase: visuals.gridGlowBase, + glowRange: visuals.gridGlowRange, + flowSpeed: visuals.gridPulseSpeed, + flowPhase: bucket.phase + }; + if ('toneMapped' in gridLineMaterial) gridLineMaterial.toneMapped = false; + applyHeightFog(gridLineMaterial); + state.gridLineMaterials.push(gridLineMaterial); + if (LineSegments2 && LineSegmentsGeometry && gridLineMaterial instanceof LineMaterial) { + const gridGeom = new LineSegmentsGeometry(); + gridGeom.setPositions(bucket.positions); + const lineMesh = new LineSegments2(gridGeom, gridLineMaterial); + lineMesh.computeLineDistances(); + state.gridLines.add(lineMesh); + } else { + const gridGeom = new THREE.BufferGeometry(); + gridGeom.setAttribute('position', new THREE.Float32BufferAttribute(bucket.positions, 3)); + state.gridLines.add(new THREE.LineSegments(gridGeom, gridLineMaterial)); + } + }); + state.gridLines.renderOrder = 1; + state.gridLines.visible = state.gridVisible; + scene.add(state.gridLines); + updateGridGlow(); + updateFog(bounds.maxSpan); + + const targetCameraBase = Math.max(40, bounds.maxSpan * 0.6); + const cameraDistance = Math.max(60, bounds.maxSpan * 1.2); + if (!state.cameraInitialized) { + state.cameraBase = targetCameraBase; + } + state.farPlane = Math.max(5000, bounds.maxSpan * 10); + state.nearPlane = Math.max(0.1, 
state.farPlane / 100000); + const viewport = typeof state.getViewport === 'function' + ? state.getViewport() + : { width: 1, height: 1 }; + const aspect = viewport.height ? viewport.width / viewport.height : 1; + camera.left = -state.cameraBase * aspect; + camera.right = state.cameraBase * aspect; + camera.top = state.cameraBase; + camera.bottom = -state.cameraBase; + camera.near = state.nearPlane; + camera.far = state.farPlane; + const zoomMin = Number.isFinite(controls.zoomMin) ? controls.zoomMin : controlDefaults.zoomMin; + const zoomMax = Number.isFinite(controls.zoomMax) ? controls.zoomMax : controlDefaults.zoomMax; + if (!state.cameraInitialized) { + camera.position.set(cameraDistance, cameraDistance * 0.9, cameraDistance); + lockIsometric(); + state.cameraInitialized = true; + } else { + camera.position.copy(preservedCamera.position); + lockIsometric(); + } + camera.zoom = Math.max(zoomMin, Math.min(zoomMax, preservedCamera.zoom || camera.zoom)); + camera.updateProjectionMatrix(); + lockIsometric(); + + buildMeshes(); + buildEdges(); + updateFlowLights(); + if (typeof state.renderEdgeMenu === 'function') { + state.renderEdgeMenu(); + } + applyHighlights(); + if (renderer?.shadowMap) { + renderer.shadowMap.needsUpdate = true; + } +}; diff --git a/src/map/isometric/client/scene-utils.js b/src/map/isometric/client/scene-utils.js new file mode 100644 index 000000000..316ddee15 --- /dev/null +++ b/src/map/isometric/client/scene-utils.js @@ -0,0 +1,44 @@ +export const disposeMaterial = (material) => { + if (!material) return; + if (Array.isArray(material)) { + material.forEach((entry) => disposeMaterial(entry)); + return; + } + if (material.map) material.map.dispose?.(); + if (material.normalMap) material.normalMap.dispose?.(); + if (material.clearcoatNormalMap) material.clearcoatNormalMap.dispose?.(); + material.dispose?.(); +}; + +export const disposeObject = (object) => { + if (!object) return; + if (object.geometry && !object.geometry.userData?.shared) object.geometry.dispose(); + if (object.material) disposeMaterial(object.material); +}; + +export const clearGroup = (group) => { + if (!group) return; + const disposedGeometries = new Set(); + const disposedMaterials = new Set(); + group.traverse((child) => { + if (child === group) return; + const geometry = child.geometry; + if (geometry && !geometry.userData?.shared && !disposedGeometries.has(geometry)) { + disposedGeometries.add(geometry); + geometry.dispose?.(); + } + const material = child.material; + if (material) { + const materials = Array.isArray(material) ? 
material : [material]; + for (const entry of materials) { + if (entry && !disposedMaterials.has(entry)) { + disposedMaterials.add(entry); + disposeMaterial(entry); + } + } + } + }); + while (group.children.length) { + group.remove(group.children[0]); + } +}; diff --git a/src/map/isometric/client/scene.js b/src/map/isometric/client/scene.js new file mode 100644 index 000000000..429fbd85f --- /dev/null +++ b/src/map/isometric/client/scene.js @@ -0,0 +1,146 @@ +import { state } from './state.js'; + +export const initScene = async () => { + const { THREE, dom, RGBELoader, assets, visuals } = state; + const { app } = dom; + + const getViewport = () => { + const rect = app.getBoundingClientRect(); + const width = rect.width || window.innerWidth; + const height = rect.height || window.innerHeight; + return { width, height }; + }; + + const renderer = new THREE.WebGLRenderer({ antialias: true, alpha: true }); + renderer.setPixelRatio(Math.min(2, window.devicePixelRatio || 1)); + const initialViewport = getViewport(); + const lineResolution = { width: initialViewport.width, height: initialViewport.height }; + renderer.setSize(initialViewport.width, initialViewport.height); + renderer.domElement.style.width = '100%'; + renderer.domElement.style.height = '100%'; + renderer.physicallyCorrectLights = true; + renderer.toneMapping = THREE.ACESFilmicToneMapping; + renderer.toneMappingExposure = 1.9; + renderer.shadowMap.enabled = true; + renderer.shadowMap.type = THREE.PCFSoftShadowMap; + renderer.shadowMap.autoUpdate = false; + if (renderer.outputColorSpace !== undefined) { + renderer.outputColorSpace = THREE.SRGBColorSpace; + } + app.appendChild(renderer.domElement); + + const scene = new THREE.Scene(); + scene.background = new THREE.Color('#0f1115'); + + const ambient = new THREE.AmbientLight(0xffffff, 0.9); + scene.add(ambient); + const dirLight = new THREE.DirectionalLight(0xffffff, 1.2); + dirLight.position.set(50, 80, 30); + dirLight.castShadow = true; + scene.add(dirLight); + const hemiLight = new THREE.HemisphereLight(0x6fb1ff, 0x2b2f3a, 0.8); + scene.add(hemiLight); + const fillLight = new THREE.PointLight(0x9fd3ff, 1.0, 260); + fillLight.position.set(-40, 35, -20); + scene.add(fillLight); + const rimLight = new THREE.DirectionalLight(0x6fb1ff, 1.4); + rimLight.position.set(-80, 60, 80); + const accentLight = new THREE.PointLight(0xffe6b5, 1.2, 220); + accentLight.position.set(40, 50, -70); + const extraLights = [rimLight, accentLight]; + extraLights.forEach((light) => scene.add(light)); + + const fileGroup = new THREE.Group(); + const memberGroup = new THREE.Group(); + const labelGroup = new THREE.Group(); + const wireGroup = new THREE.Group(); + const edgeGroup = new THREE.Group(); + scene.add(fileGroup); + scene.add(memberGroup); + scene.add(labelGroup); + scene.add(wireGroup); + scene.add(edgeGroup); + edgeGroup.renderOrder = 1; + wireGroup.renderOrder = 5; + labelGroup.renderOrder = 4; + fileGroup.renderOrder = 2; + memberGroup.renderOrder = 3; + labelGroup.visible = false; + + let cameraBase = 40; + let nearPlane = 0.1; + let farPlane = 2000; + const camera = new THREE.OrthographicCamera(-cameraBase, cameraBase, cameraBase, -cameraBase, nearPlane, farPlane); + camera.matrixAutoUpdate = true; + const isoYaw = Math.PI / 4; + const isoPitch = -Math.atan(1 / Math.sqrt(2)); + const isoEuler = new THREE.Euler(isoPitch, isoYaw, 0, 'YXZ'); + const isoQuaternion = new THREE.Quaternion().setFromEuler(isoEuler); + const isoUp = new THREE.Vector3(0, 1, 0); + camera.position.set(60, 54, 60); + 
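// True isometric orientation: yaw PI/4, pitch -atan(1/sqrt(2)); lockIsometric() below restores it after the camera is repositioned. +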
camera.quaternion.copy(isoQuaternion); + camera.up.copy(isoUp); + const lockIsometric = () => { + camera.up.copy(isoUp); + camera.quaternion.copy(isoQuaternion); + camera.updateMatrixWorld(); + }; + + const applyEnvironment = (texture) => { + if (!texture) return; + texture.mapping = THREE.EquirectangularReflectionMapping; + const pmrem = new THREE.PMREMGenerator(renderer); + scene.environment = pmrem.fromEquirectangular(texture).texture; + pmrem.dispose(); + }; + + const envCanvas = document.createElement('canvas'); + envCanvas.width = 32; + envCanvas.height = 16; + const envCtx = envCanvas.getContext('2d'); + const gradient = envCtx.createLinearGradient(0, 0, envCanvas.width, envCanvas.height); + gradient.addColorStop(0, '#1b2230'); + gradient.addColorStop(0.5, '#6fb1ff'); + gradient.addColorStop(1, '#0f1115'); + envCtx.fillStyle = gradient; + envCtx.fillRect(0, 0, envCanvas.width, envCanvas.height); + const fallbackEnv = new THREE.CanvasTexture(envCanvas); + applyEnvironment(fallbackEnv); + fallbackEnv.dispose(); + + if (RGBELoader && assets.hdrEnvUrl) { + const rgbe = new RGBELoader(); + rgbe.load(assets.hdrEnvUrl, (hdrTexture) => { + applyEnvironment(hdrTexture); + hdrTexture.dispose(); + }); + } + + Object.assign(state, { + renderer, + scene, + camera, + lineResolution, + getViewport, + lockIsometric, + cameraBase, + nearPlane, + farPlane, + cameraInitialized: false, + extraLights, + fileGroup, + memberGroup, + labelGroup, + wireGroup, + edgeGroup, + grid: null, + gridLines: null, + groundPlane: new THREE.Plane(new THREE.Vector3(0, 1, 0), 0), + fogBounds: { maxSpan: 120 }, + scaleFactor: 2 + }); + + if (visuals?.enableExtraLights === false) { + extraLights.forEach((light) => { light.visible = false; }); + } +}; diff --git a/src/map/isometric/client/selection.js b/src/map/isometric/client/selection.js new file mode 100644 index 000000000..07d447987 --- /dev/null +++ b/src/map/isometric/client/selection.js @@ -0,0 +1,492 @@ +import { state } from './state.js'; +import { clamp } from './utils.js'; +import { configureWireMaterial } from './materials.js'; + +const formatPrimitive = (value) => { + if (value === null || value === undefined || value === '') return 'None'; + if (typeof value === 'boolean') return value ? 'true' : 'false'; + if (typeof value === 'number') return Number.isFinite(value) ? 
value.toString() : 'None'; + return String(value); +}; + +const isRefItem = (value) => value && typeof value === 'object' && value.__ref; + +export const setHoverRef = (ref) => { + state.hoveredRef = ref; + applyHighlights(); +}; + +const renderValueNode = (value) => { + if (value === null || value === undefined || value === '') { + const empty = document.createElement('span'); + empty.className = 'sel-empty'; + empty.textContent = 'None'; + return empty; + } + if (Array.isArray(value)) { + if (!value.length) { + const empty = document.createElement('span'); + empty.className = 'sel-empty'; + empty.textContent = 'None'; + return empty; + } + const list = document.createElement('div'); + list.className = 'sel-list'; + value.forEach((entry) => { + const pill = document.createElement('span'); + pill.className = 'sel-pill'; + if (isRefItem(entry)) { + pill.textContent = entry.label; + pill.dataset.refType = entry.refType; + pill.dataset.refId = entry.refId; + pill.addEventListener('mouseenter', () => setHoverRef(entry)); + pill.addEventListener('mouseleave', () => setHoverRef(null)); + } else { + pill.textContent = formatPrimitive(entry); + } + list.appendChild(pill); + }); + return list; + } + if (typeof value === 'object') { + if (isRefItem(value)) { + const pill = document.createElement('span'); + pill.className = 'sel-pill'; + pill.textContent = value.label; + pill.dataset.refType = value.refType; + pill.dataset.refId = value.refId; + pill.addEventListener('mouseenter', () => setHoverRef(value)); + pill.addEventListener('mouseleave', () => setHoverRef(null)); + return pill; + } + const entries = Object.entries(value); + if (!entries.length) { + const empty = document.createElement('span'); + empty.className = 'sel-empty'; + empty.textContent = 'None'; + return empty; + } + const list = document.createElement('div'); + list.className = 'sel-list'; + entries.forEach(([key, entry]) => { + const pill = document.createElement('span'); + pill.className = 'sel-pill'; + pill.textContent = `${key}: ${formatPrimitive(entry)}`; + list.appendChild(pill); + }); + return list; + } + const text = document.createElement('span'); + text.textContent = formatPrimitive(value); + return text; +}; + +const createSelectionSection = (title) => { + const { dom } = state; + const section = document.createElement('div'); + section.className = 'sel-section'; + const heading = document.createElement('div'); + heading.className = 'sel-title'; + heading.textContent = title; + section.appendChild(heading); + dom.selectionBody.appendChild(section); + return section; +}; + +const addSelectionRow = (section, label, value) => { + const row = document.createElement('div'); + row.className = 'sel-row'; + const labelNode = document.createElement('div'); + labelNode.className = 'sel-label'; + labelNode.textContent = label; + const valueNode = document.createElement('div'); + valueNode.className = 'sel-value'; + valueNode.appendChild(renderValueNode(value)); + row.appendChild(labelNode); + row.appendChild(valueNode); + section.appendChild(row); +}; + +const formatRange = (range) => { + if (!range || !Number.isFinite(range.startLine)) return 'None'; + const start = range.startLine; + const end = Number.isFinite(range.endLine) ? 
range.endLine : start; + const span = Math.max(1, end - start + 1); + return `${start}-${end} (${span} lines)`; +}; + +const formatEdgeCounts = (edgeList) => { + if (!edgeList.length) return []; + const counts = new Map(); + edgeList.forEach((edge) => { + const type = edge.type || 'other'; + counts.set(type, (counts.get(type) || 0) + 1); + }); + return Array.from(counts.entries()) + .sort((a, b) => (b[1] - a[1]) || a[0].localeCompare(b[0])) + .map(([type, count]) => `${type}: ${count}`); +}; + +const formatEdgeTargets = (edgeList, direction, limit = 8) => { + if (!edgeList.length) return []; + const seen = new Set(); + const targets = []; + edgeList.forEach((edge) => { + const endpoint = direction === 'incoming' ? edge.from : edge.to; + if (!endpoint) return; + let label = ''; + let refType = ''; + let refId = ''; + if (endpoint.member) { + const member = state.memberById.get(endpoint.member); + if (member) { + label = `${member.name || endpoint.member} - ${member.file || ''}`.trim(); + refType = 'member'; + refId = member.id || endpoint.member; + } else { + label = endpoint.member; + refType = 'member'; + refId = endpoint.member; + } + } else if (endpoint.file) { + label = endpoint.file; + refType = 'file'; + refId = endpoint.file; + } + if (!label || seen.has(label)) return; + seen.add(label); + targets.push({ __ref: true, label, refType, refId }); + }); + if (targets.length > limit) { + const trimmed = targets.slice(0, limit); + trimmed.push(`+${targets.length - limit} more`); + return trimmed; + } + return targets; +}; + +const formatListWithLimit = (values, limit = 10) => { + if (!Array.isArray(values) || !values.length) return []; + if (values.length > limit) { + return values.slice(0, limit).concat(`+${values.length - limit} more`); + } + return values; +}; + +const collectEdgesForSelection = (selectionInfo, member, node) => { + if (!selectionInfo) return { incoming: [], outgoing: [] }; + const memberId = member?.id || selectionInfo.id || null; + const fileKey = selectionInfo.file || node?.path || node?.name || ''; + const incoming = []; + const outgoing = []; + state.edges.forEach((edge) => { + const from = edge.from || {}; + const to = edge.to || {}; + const fromMatch = memberId + ? from.member === memberId + : (from.file === fileKey || state.fileByMember.get(from.member) === fileKey); + const toMatch = memberId + ? to.member === memberId + : (to.file === fileKey || state.fileByMember.get(to.member) === fileKey); + if (fromMatch) outgoing.push(edge); + if (toMatch) incoming.push(edge); + }); + return { incoming, outgoing }; +}; + +export const renderSelectionDetails = (info) => { + const { dom, nodeByPath, nodeById, memberById, memberByKey, buildMemberKey, buildMemberNameKey } = state; + dom.selectionBody.textContent = ''; + if (!info) { + dom.selectionBody.textContent = 'None'; + return; + } + const fileKey = info.file || info.name || ''; + const node = nodeByPath.get(fileKey) || nodeById.get(info.id) || null; + const rangeKey = buildMemberKey(fileKey, info.name || '', info.range || {}); + const nameKey = buildMemberNameKey(fileKey, info.name || ''); + const member = info.id + ? 
memberById.get(info.id) + : (memberByKey.get(rangeKey) || memberByKey.get(nameKey) || null); + + if (info.type === 'file' || (!info.type && node)) { + const section = createSelectionSection('File'); + addSelectionRow(section, 'Name', node?.name || info.name || fileKey); + addSelectionRow(section, 'Path', node?.path || fileKey); + addSelectionRow(section, 'Category', node?.category || 'None'); + addSelectionRow(section, 'Type', node?.type || 'file'); + addSelectionRow(section, 'Ext', node?.ext || 'None'); + addSelectionRow(section, 'Id', node?.id || 'None'); + const members = Array.isArray(node?.members) ? node.members : []; + addSelectionRow(section, 'Members', members.length); + if (members.length) { + const memberNames = members.map((entry) => entry.name).filter(Boolean); + addSelectionRow(section, 'Member names', formatListWithLimit(memberNames, 10)); + } + } + + if (info.type === 'member' || member) { + const section = createSelectionSection('Member'); + addSelectionRow(section, 'Name', member?.name || info.name || 'None'); + addSelectionRow(section, 'File', member?.file || fileKey || 'None'); + addSelectionRow(section, 'Type', member?.type || info.type || 'None'); + addSelectionRow(section, 'Kind', member?.kind || 'None'); + addSelectionRow(section, 'Signature', member?.signature || 'None'); + addSelectionRow(section, 'Params', member?.params || null); + addSelectionRow(section, 'Returns', member?.returns || 'None'); + addSelectionRow(section, 'Modifiers', member?.modifiers || 'None'); + addSelectionRow(section, 'Exported', member?.exported ?? false); + addSelectionRow(section, 'Range', formatRange(member?.range || info.range)); + addSelectionRow(section, 'Id', member?.id || info.id || 'None'); + addSelectionRow(section, 'Port', member?.port || 'None'); + + const dataflow = member?.dataflow || {}; + const dataSection = createSelectionSection('Dataflow'); + addSelectionRow(dataSection, 'Reads', dataflow.reads || null); + addSelectionRow(dataSection, 'Writes', dataflow.writes || null); + addSelectionRow(dataSection, 'Mutations', dataflow.mutations || null); + addSelectionRow(dataSection, 'Aliases', dataflow.aliases || null); + + const controlFlow = member?.controlFlow || {}; + const controlSection = createSelectionSection('Control flow'); + addSelectionRow(controlSection, 'Branches', controlFlow.branches ?? 0); + addSelectionRow(controlSection, 'Loops', controlFlow.loops ?? 0); + addSelectionRow(controlSection, 'Returns', controlFlow.returns ?? 0); + addSelectionRow(controlSection, 'Breaks', controlFlow.breaks ?? 0); + addSelectionRow(controlSection, 'Continues', controlFlow.continues ?? 0); + addSelectionRow(controlSection, 'Throws', controlFlow.throws ?? 0); + addSelectionRow(controlSection, 'Awaits', controlFlow.awaits ?? 0); + addSelectionRow(controlSection, 'Yields', controlFlow.yields ?? 
0); + } + + const edgeSection = createSelectionSection('Edges'); + const edgeData = collectEdgesForSelection(info, member, node); + addSelectionRow(edgeSection, 'Incoming', formatEdgeCounts(edgeData.incoming)); + addSelectionRow(edgeSection, 'Outgoing', formatEdgeCounts(edgeData.outgoing)); + addSelectionRow(edgeSection, 'From', formatEdgeTargets(edgeData.incoming, 'incoming')); + addSelectionRow(edgeSection, 'To', formatEdgeTargets(edgeData.outgoing, 'outgoing')); +}; + +const resetMaterialHighlight = (material) => { + if (!material || !material.userData?.baseEmissive) return; + material.emissive.copy(material.userData.baseEmissive); + material.emissiveIntensity = material.userData.baseEmissiveIntensity ?? material.emissiveIntensity; + if (material.userData.baseOpacity != null) material.opacity = material.userData.baseOpacity; + material.needsUpdate = true; +}; + +const resetObjectHighlights = () => { + for (const mesh of [...state.fileMeshes, ...state.memberMeshes, ...state.chunkMeshes]) { + resetMaterialHighlight(mesh.material); + const inner = mesh.userData?.shellInner; + if (inner?.material) resetMaterialHighlight(inner.material); + } +}; + +const resetEdgeHighlights = () => { + for (const mesh of state.edgeMeshes) { + const material = mesh.material; + if (!material) continue; + if (mesh.isInstancedMesh) { + const baseColors = mesh.userData?.instanceBaseColors; + if (Array.isArray(baseColors)) { + baseColors.forEach((color, index) => { + if (color) mesh.setColorAt(index, color); + }); + if (mesh.instanceColor) mesh.instanceColor.needsUpdate = true; + } + if (material.userData?.baseEmissiveIntensity != null) { + material.emissiveIntensity = material.userData.baseEmissiveIntensity; + } + if (material.userData?.baseOpacity != null) { + material.opacity = material.userData.baseOpacity; + } + material.needsUpdate = true; + continue; + } + if (!material.userData?.baseColor) continue; + material.color.copy(material.userData.baseColor); + material.emissive.copy(material.userData.baseEmissive); + material.emissiveIntensity = material.userData.baseEmissiveIntensity ?? material.emissiveIntensity; + material.opacity = material.userData.baseOpacity ?? 
material.opacity; + material.needsUpdate = true; + } +}; + +const resetWireHighlights = () => { + for (const material of state.wireMaterials) { + configureWireMaterial(material); + material.needsUpdate = true; + } +}; + +const boostWireframe = (mesh, color, strength) => { + if (!mesh) return; + const wire = state.wireByMesh.get(mesh); + if (!wire || !wire.material) return; + const material = wire.material; + const baseWidth = material.userData?.baseLinewidth || material.linewidth || 1; + if ('linewidth' in material) { + material.linewidth = baseWidth * (1 + strength); + } + if (color) material.color.copy(color); + material.opacity = clamp(material.opacity + strength * 0.2, 0.02, 0.9); + material.needsUpdate = true; +}; + +const highlightMesh = (mesh, color, intensity, wireBoost = 0) => { + if (!mesh || !mesh.material) return; + mesh.material.emissive.copy(color); + mesh.material.emissiveIntensity = intensity; + mesh.material.needsUpdate = true; + const inner = mesh.userData?.shellInner; + if (inner?.material) { + inner.material.emissive.copy(color); + inner.material.emissiveIntensity = intensity * 0.75; + inner.material.needsUpdate = true; + } + if (wireBoost > 0) boostWireframe(mesh, color, wireBoost); +}; + +const highlightEdgeMesh = (mesh, color) => { + if (!mesh || !mesh.material) return; + mesh.material.color.copy(color); + mesh.material.emissive.copy(color); + mesh.material.emissiveIntensity = Math.max(0.6, mesh.material.userData?.baseEmissiveIntensity || 0.6); + mesh.material.opacity = Math.max(0.7, mesh.material.opacity); + mesh.material.needsUpdate = true; +}; + +const highlightEdgeInstance = (mesh, index, color) => { + if (!mesh || !mesh.isInstancedMesh) return; + if (typeof mesh.setColorAt === 'function') { + mesh.setColorAt(index, color); + if (mesh.instanceColor) mesh.instanceColor.needsUpdate = true; + } +}; + +const buildSelectionKeys = (info) => { + const keys = new Set(); + if (!info) return keys; + const fileKey = info.file || info.name || ''; + if (fileKey) keys.add(`file:${fileKey}`); + const memberId = info.id || info.memberId || null; + if (memberId) { + keys.add(`member:${memberId}`); + const memberFile = state.fileByMember.get(memberId); + if (memberFile) keys.add(`file:${memberFile}`); + } + return keys; +}; + +const applyHighlightsForKeys = (selectionKeys, intensity = 1) => { + if (!selectionKeys || !selectionKeys.size) return; + const connected = new Map(); + const edgeSegments = state.edgeSegments || []; + edgeSegments.forEach((segment) => { + const endpoints = segment.endpoints; + if (!endpoints || !endpoints.size) return; + let matches = false; + for (const key of selectionKeys) { + if (endpoints.has(key)) { + matches = true; + break; + } + } + if (!matches) return; + const edgeColor = segment.edgeColor || new state.THREE.Color(0xffffff); + const highlightColor = segment.highlightColor || edgeColor; + highlightEdgeInstance(segment.mesh, segment.index, highlightColor); + endpoints.forEach((endpointKey) => { + if (selectionKeys.has(endpointKey)) return; + const entry = connected.get(endpointKey) || { color: new state.THREE.Color(0, 0, 0), weight: 0 }; + entry.color.add(edgeColor.clone().multiplyScalar(1)); + entry.weight += 1; + connected.set(endpointKey, entry); + }); + }); + + connected.forEach((entry, endpointKey) => { + if (!entry.weight) return; + const color = entry.color.multiplyScalar(1 / entry.weight); + const [type, id] = endpointKey.split(':'); + if (type === 'file' && state.fileMeshByKey.has(id)) { + highlightMesh(state.fileMeshByKey.get(id), color, 
0.35 * intensity + 0.15, 0.25 * intensity); + } + if (type === 'member' && state.memberMeshById.has(id)) { + highlightMesh(state.memberMeshById.get(id), color, 0.35 * intensity + 0.15, 0.25 * intensity); + } + }); +}; + +export const applyHighlights = () => { + resetObjectHighlights(); + resetEdgeHighlights(); + resetWireHighlights(); + const selectionKeys = buildSelectionKeys(state.selected?.userData || null); + if (state.selected) { + const baseColor = state.selected.userData?.baseColor + ? state.selected.userData.baseColor + : (state.selected.material?.color ? state.selected.material.color : new state.THREE.Color(0xffffff)); + highlightMesh(state.selected, baseColor.clone().lerp(new state.THREE.Color(0xffffff), 0.35), 0.7, 0.85); + applyHighlightsForKeys(selectionKeys, 1); + } + if (state.hoveredRef) { + const hoverInfo = state.hoveredRef.refType === 'member' + ? { id: state.hoveredRef.refId, memberId: state.hoveredRef.refId } + : { file: state.hoveredRef.refId, name: state.hoveredRef.refId }; + const hoverKeys = buildSelectionKeys(hoverInfo); + applyHighlightsForKeys(hoverKeys, 0.6); + if (state.hoveredRef.refType === 'file' && state.fileMeshByKey.has(state.hoveredRef.refId)) { + highlightMesh(state.fileMeshByKey.get(state.hoveredRef.refId), new state.THREE.Color(0xffffff), 0.35, 0.35); + } + if (state.hoveredRef.refType === 'member' && state.memberMeshById.has(state.hoveredRef.refId)) { + highlightMesh(state.memberMeshById.get(state.hoveredRef.refId), new state.THREE.Color(0xffffff), 0.35, 0.35); + } + } + if (state.hoveredMesh && !state.selected) { + const baseColor = state.hoveredMesh.userData?.baseColor + ? state.hoveredMesh.userData.baseColor.clone().lerp(new state.THREE.Color(0xffffff), 0.25) + : new state.THREE.Color(0xffffff); + highlightMesh(state.hoveredMesh, baseColor, 0.35, 0.4); + } +}; + +export const setSelection = (object) => { + state.selected = object; + const info = state.selected ? 
(state.selected.userData || {}) : null; + renderSelectionDetails(info); + applyHighlights(); +}; + +const resolveFilePath = (file) => { + if (!file) return ''; + if (file.includes(':\\') || file.startsWith('\\') || file.startsWith('/')) return file; + const root = state.map.root?.path || ''; + if (!root) return file; + if (root.endsWith('/') || root.endsWith('\\')) return root + file; + return root + '/' + file; +}; + +const buildOpenUri = (info) => { + if (!state.config.openUriTemplate) return null; + const range = info.range || {}; + const filePath = resolveFilePath(info.file || ''); + const replacements = { + file: encodeURIComponent(filePath), + fileRaw: filePath, + line: encodeURIComponent(range.startLine || 1), + column: encodeURIComponent(1), + startLine: encodeURIComponent(range.startLine || 1), + endLine: encodeURIComponent(range.endLine || range.startLine || 1), + symbol: encodeURIComponent(info.name || '') + }; + return state.config.openUriTemplate.replace(/{(\w+)}/g, (match, key) => replacements[key] || match); +}; + +export const openSelection = () => { + if (!state.selected) return; + const uri = buildOpenUri(state.selected.userData || {}); + if (uri) window.location.href = uri; +}; + diff --git a/src/map/isometric/client/state.js b/src/map/isometric/client/state.js new file mode 100644 index 000000000..1979e5457 --- /dev/null +++ b/src/map/isometric/client/state.js @@ -0,0 +1 @@ +export const state = {}; diff --git a/src/map/isometric/client/three-loader.js b/src/map/isometric/client/three-loader.js new file mode 100644 index 000000000..29569cf18 --- /dev/null +++ b/src/map/isometric/client/three-loader.js @@ -0,0 +1,25 @@ +export const loadThreeModules = async (threeUrl) => { + const THREE = await import(threeUrl); + let LineSegments2 = null; + let LineSegmentsGeometry = null; + let LineMaterial = null; + try { + ({ LineSegments2 } = await import('/three/examples/jsm/lines/LineSegments2.js')); + ({ LineSegmentsGeometry } = await import('/three/examples/jsm/lines/LineSegmentsGeometry.js')); + ({ LineMaterial } = await import('/three/examples/jsm/lines/LineMaterial.js')); + } catch (err) { + LineSegments2 = null; + LineSegmentsGeometry = null; + LineMaterial = null; + } + return { THREE, LineSegments2, LineSegmentsGeometry, LineMaterial }; +}; + +export const loadRgbeLoader = async (url) => { + try { + const module = await import(url || '/three/examples/jsm/loaders/RGBELoader.js'); + return module.RGBELoader || null; + } catch (err) { + return null; + } +}; diff --git a/src/map/isometric/client/ui.js b/src/map/isometric/client/ui.js new file mode 100644 index 000000000..6caa8c5ed --- /dev/null +++ b/src/map/isometric/client/ui.js @@ -0,0 +1,988 @@ +import { state } from './state.js'; +import { storageKey } from './dom.js'; +import { + applyGlassSettings, + updateExtraLights, + updateFileOpacity, + updateFlowGlow, + updateFlowLights, + updateFog, + updateGridGlow, + updateMemberOpacity, + updateWireframes +} from './materials.js'; +import { scheduleRebuild } from './rebuild.js'; +import { renderSelectionDetails } from './selection.js'; +import { clearGroup } from './scene-utils.js'; + +const getNested = (obj, path) => { + const parts = path.split('.'); + let current = obj; + for (const part of parts) { + if (!current || typeof current !== 'object') return undefined; + current = current[part]; + } + return current; +}; + +const setNested = (obj, path, value) => { + const parts = path.split('.'); + let current = obj; + while (parts.length > 1) { + const part = parts.shift(); + 
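// Walk the dotted path, creating intermediate objects on demand (e.g. 'visuals.curveEdges' touches panelState.visuals first). +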
current[part] = current[part] || {}; + current = current[part]; + } + current[parts[0]] = value; +}; + +const createToggle = (container, options) => { + const wrapper = document.createElement('label'); + const input = document.createElement('input'); + input.type = 'checkbox'; + input.checked = options.checked !== false; + input.addEventListener('change', () => options.onChange(input.checked)); + wrapper.appendChild(input); + if (options.swatch) wrapper.appendChild(options.swatch); + const text = document.createElement('span'); + text.textContent = options.label; + wrapper.appendChild(text); + container.appendChild(wrapper); +}; + +const createSelect = (container, options) => { + const wrapper = document.createElement('label'); + const text = document.createElement('span'); + text.textContent = options.label; + const select = document.createElement('select'); + select.style.flex = '1'; + options.options.forEach((entry) => { + const option = document.createElement('option'); + option.value = entry.value; + option.textContent = entry.label; + select.appendChild(option); + }); + select.value = options.value ?? options.defaultValue; + select.addEventListener('change', () => { + options.onChange(select.value); + }); + wrapper.appendChild(text); + wrapper.appendChild(select); + container.appendChild(wrapper); +}; + +const createSlider = (container, options) => { + const label = document.createElement('div'); + label.textContent = options.label; + const row = document.createElement('div'); + row.className = 'slider-row'; + const input = document.createElement('input'); + input.type = 'range'; + input.min = String(options.min); + input.max = String(options.max); + input.step = String(options.step || 0.1); + const currentValue = getNested(state.panelState, options.path); + input.value = String(Number.isFinite(currentValue) ? currentValue : options.defaultValue); + const valueLabel = document.createElement('div'); + valueLabel.className = 'value'; + const updateValue = () => { + const raw = Number(input.value); + const nextValue = Number.isFinite(raw) ? raw : options.defaultValue; + setNested(state.panelState, options.path, nextValue); + valueLabel.textContent = options.format ? 
options.format(nextValue) : String(nextValue); + syncStateFromPanel(); + if (typeof options.onInput === 'function') { + options.onInput(nextValue); + } + if (options.rebuild !== false) { + scheduleRebuild(options.debounceMs); + } + persistPanelState(); + }; + input.addEventListener('input', updateValue); + updateValue(); + row.appendChild(input); + row.appendChild(valueLabel); + container.appendChild(label); + container.appendChild(row); +}; + +const createButton = (container, label, onClick) => { + const button = document.createElement('button'); + button.type = 'button'; + button.textContent = label; + button.addEventListener('click', onClick); + container.appendChild(button); +}; + +const persistPanelState = (() => { + let timer = null; + return () => { + if (timer) clearTimeout(timer); + timer = setTimeout(() => { + const payload = { + layout: state.panelState.layout, + scoring: state.panelState.scoring, + colors: state.panelState.colors, + controls: state.panelState.controls, + visuals: state.panelState.visuals + }; + try { + window.localStorage.setItem(storageKey, JSON.stringify(payload)); + } catch (err) { + // ignore storage failures + } + }, 200); + }; +})(); + +export const syncStateFromPanel = () => { + Object.assign(state.layout, state.panelState.layout || {}); + Object.assign(state.scoring, state.panelState.scoring || {}); + Object.assign(state.colors, state.panelState.colors || {}); + Object.assign(state.controls, state.panelState.controls || {}); + state.controls.wasd = { ...state.controls.wasd, ...(state.panelState.controls?.wasd || {}) }; + Object.assign(state.visuals, state.panelState.visuals || {}); + state.visuals.glass = { ...state.visuals.glass, ...(state.panelState.visuals?.glass || {}) }; + if (state.normalMapState?.texture) { + state.normalMapState.texture.repeat.set(state.visuals.glass.normalRepeat, state.visuals.glass.normalRepeat); + } + updateExtraLights(); +}; + +export const renderEdgeMenu = () => { + const { dom, edgeTypes, edgeVisibility, edgeTypeGroups, visuals, visualDefaults } = state; + dom.menuEdges.textContent = ''; + createToggle(dom.menuEdges, { + label: 'Curve edges', + checked: visuals.curveEdges ?? visualDefaults.curveEdges, + onChange: (value) => { + setNested(state.panelState, 'visuals.curveEdges', value); + syncStateFromPanel(); + scheduleRebuild(); + persistPanelState(); + } + }); + if (!edgeTypes.length) { + const empty = document.createElement('div'); + empty.textContent = 'No edges available'; + dom.menuEdges.appendChild(empty); + return; + } + edgeTypes.forEach((type) => { + const style = state.map.legend?.edgeStyles?.[type] || {}; + const swatch = document.createElement('span'); + swatch.className = 'swatch'; + swatch.style.background = style.color || '#9aa0a6'; + createToggle(dom.menuEdges, { + label: type, + swatch, + checked: edgeVisibility.has(type) + ? 
edgeVisibility.get(type) + : edgeTypeGroups.get(type)?.visible !== false, + onChange: (value) => { + const group = edgeTypeGroups.get(type); + if (group) group.visible = value; + edgeVisibility.set(type, value); + } + }); + }); +}; + +export const initUi = () => { + const { + dom, + layout, + scoring, + controls, + visuals, + controlDefaults, + layoutDefaults, + scoringDefaults, + colorDefaults, + visualDefaults, + colors, + fileGroup, + memberGroup, + labelGroup, + wireGroup, + edgeGroup + } = state; + + state.panelState = { + layout: { ...layout }, + scoring: { ...scoring }, + colors: { ...colors }, + controls: { ...controls, wasd: { ...(controls.wasd || {}) } }, + visuals: { ...visuals, glass: { ...visuals.glass } } + }; + + state.edgeVisibility = state.edgeVisibility || new Map(); + state.gridVisible = state.gridVisible ?? true; + + createToggle(dom.menuView, { + label: 'Grid', + onChange: (value) => { + state.gridVisible = value; + if (state.grid) state.grid.visible = value; + if (state.gridLines) state.gridLines.visible = value; + } + }); + createToggle(dom.menuView, { + label: 'Files', + onChange: (value) => { + fileGroup.visible = value; + } + }); + createToggle(dom.menuView, { + label: 'Members', + onChange: (value) => { + memberGroup.visible = value; + } + }); + createToggle(dom.menuView, { + label: 'Labels', + checked: false, + onChange: (value) => { + labelGroup.visible = value; + if (value) { + scheduleRebuild(0); + } else { + clearGroup(labelGroup); + state.labelMaterials = []; + } + } + }); + createToggle(dom.menuView, { + label: 'Wireframes', + onChange: (value) => { + wireGroup.visible = value; + } + }); + createToggle(dom.menuView, { + label: 'Edges', + onChange: (value) => { + edgeGroup.visible = value; + } + }); + + createSlider(dom.menuControls, { + label: 'Pan sensitivity', + path: 'controls.panSensitivity', + min: 0.2, + max: 4, + step: 0.1, + defaultValue: controlDefaults.panSensitivity, + rebuild: false + }); + createSlider(dom.menuControls, { + label: 'Zoom damping', + path: 'controls.zoomDamping', + min: 0.6, + max: 0.98, + step: 0.01, + defaultValue: controlDefaults.zoomDamping, + format: (value) => value.toFixed(2), + rebuild: false + }); + createSlider(dom.menuControls, { + label: 'Zoom max', + path: 'controls.zoomMax', + min: 4, + max: 120, + step: 1, + defaultValue: controlDefaults.zoomMax, + rebuild: false + }); + createSlider(dom.menuControls, { + label: 'WASD sensitivity', + path: 'controls.wasd.sensitivity', + min: 100, + max: 50000, + step: 100, + defaultValue: controlDefaults.wasd.sensitivity, + rebuild: false + }); + createSlider(dom.menuControls, { + label: 'WASD accel', + path: 'controls.wasd.acceleration', + min: 100, + max: 20000, + step: 100, + defaultValue: controlDefaults.wasd.acceleration, + rebuild: false + }); + createSlider(dom.menuControls, { + label: 'WASD max', + path: 'controls.wasd.maxSpeed', + min: 100, + max: 60000, + step: 500, + defaultValue: controlDefaults.wasd.maxSpeed, + rebuild: false + }); + createSlider(dom.menuControls, { + label: 'WASD drag', + path: 'controls.wasd.drag', + min: 1, + max: 20, + step: 0.5, + defaultValue: controlDefaults.wasd.drag, + rebuild: false + }); + + createSelect(dom.menuLayout, { + label: 'Layout style', + value: getNested(state.panelState, 'layout.style'), + defaultValue: layoutDefaults.style, + options: [ + { label: 'Clustered', value: 'clustered' }, + { label: 'Flow', value: 'flow' }, + { label: 'Hex grid', value: 'hex' }, + { label: 'Radial', value: 'radial' }, + { label: 'Flat grid', value: 
'flat' }, + { label: 'Stream', value: 'stream' } + ], + onChange: (value) => { + setNested(state.panelState, 'layout.style', value); + syncStateFromPanel(); + scheduleRebuild(); + persistPanelState(); + } + }); + + createSelect(dom.menuLayout, { + label: 'File shapes', + value: getNested(state.panelState, 'layout.fileShape'), + defaultValue: layoutDefaults.fileShape, + options: [ + { label: 'Category', value: 'category' }, + { label: 'Mixed', value: 'mix' }, + { label: 'Square', value: 'square' }, + { label: 'Circle', value: 'circle' }, + { label: 'Pyramid', value: 'pyramid' }, + { label: 'Pentagon', value: 'pentagon' }, + { label: 'Hexagon', value: 'hexagon' }, + { label: 'Heptagon', value: 'heptagon' }, + { label: 'Octagon', value: 'octagon' }, + { label: 'Pentagon pyramid', value: 'pentagon-pyramid' }, + { label: 'Hexagon pyramid', value: 'hexagon-pyramid' }, + { label: 'Heptagon pyramid', value: 'heptagon-pyramid' }, + { label: 'Octagon pyramid', value: 'octagon-pyramid' }, + { label: 'Pentagon frustum', value: 'pentagon-frustum' }, + { label: 'Hexagon frustum', value: 'hexagon-frustum' }, + { label: 'Heptagon frustum', value: 'heptagon-frustum' }, + { label: 'Octagon frustum', value: 'octagon-frustum' } + ], + onChange: (value) => { + setNested(state.panelState, 'layout.fileShape', value); + syncStateFromPanel(); + scheduleRebuild(); + persistPanelState(); + } + }); + + createSelect(dom.menuLayout, { + label: 'Member shapes', + value: getNested(state.panelState, 'layout.memberShape'), + defaultValue: layoutDefaults.memberShape, + options: [ + { label: 'Category', value: 'category' }, + { label: 'Mixed', value: 'mix' }, + { label: 'Square', value: 'square' }, + { label: 'Circle', value: 'circle' }, + { label: 'Pyramid', value: 'pyramid' }, + { label: 'Pentagon', value: 'pentagon' }, + { label: 'Hexagon', value: 'hexagon' }, + { label: 'Heptagon', value: 'heptagon' }, + { label: 'Octagon', value: 'octagon' }, + { label: 'Pentagon pyramid', value: 'pentagon-pyramid' }, + { label: 'Hexagon pyramid', value: 'hexagon-pyramid' }, + { label: 'Heptagon pyramid', value: 'heptagon-pyramid' }, + { label: 'Octagon pyramid', value: 'octagon-pyramid' }, + { label: 'Pentagon frustum', value: 'pentagon-frustum' }, + { label: 'Hexagon frustum', value: 'hexagon-frustum' }, + { label: 'Heptagon frustum', value: 'heptagon-frustum' }, + { label: 'Octagon frustum', value: 'octagon-frustum' } + ], + onChange: (value) => { + setNested(state.panelState, 'layout.memberShape', value); + syncStateFromPanel(); + scheduleRebuild(); + persistPanelState(); + } + }); + + createSlider(dom.menuLayout, { + label: 'Group spacing', + path: 'layout.groupSpacing', + min: 0, + max: 16, + step: 0.5, + defaultValue: layoutDefaults.groupSpacing + }); + createSlider(dom.menuLayout, { + label: 'File spacing', + path: 'layout.fileSpacing', + min: 0, + max: 12, + step: 0.5, + defaultValue: layoutDefaults.fileSpacing + }); + createSlider(dom.menuLayout, { + label: 'Compactness', + path: 'layout.compactness', + min: 0.5, + max: 1.4, + step: 0.05, + defaultValue: layoutDefaults.compactness + }); + createSlider(dom.menuLayout, { + label: 'Routing padding', + path: 'layout.routingPadding', + min: 0, + max: 3, + step: 0.1, + defaultValue: layoutDefaults.routingPadding + }); + createSlider(dom.menuLayout, { + label: 'Routing step', + path: 'layout.routingStep', + min: 0.5, + max: 5, + step: 0.1, + defaultValue: layoutDefaults.routingStep + }); + createSlider(dom.menuLayout, { + label: 'Edge plane', + path: 'layout.edgePlane', + min: -4, + 
max: 0.5, + step: 0.05, + defaultValue: layoutDefaults.edgePlane + }); + createSlider(dom.menuLayout, { + label: 'Label size', + path: 'layout.labelScale', + min: 0.01, + max: 0.04, + step: 0.002, + defaultValue: layoutDefaults.labelScale, + format: (value) => value.toFixed(3) + }); + createSlider(dom.menuLayout, { + label: 'Label offset', + path: 'layout.labelOffset', + min: 0, + max: 1.5, + step: 0.05, + defaultValue: layoutDefaults.labelOffset + }); + + createSlider(dom.menuScore, { + label: 'Dataflow weight', + path: 'scoring.dataflow', + min: 0, + max: 2, + step: 0.05, + defaultValue: scoringDefaults.dataflow + }); + createSlider(dom.menuScore, { + label: 'Controlflow weight', + path: 'scoring.controlFlow', + min: 0, + max: 2, + step: 0.05, + defaultValue: scoringDefaults.controlFlow + }); + createSlider(dom.menuScore, { + label: 'Params weight', + path: 'scoring.params', + min: 0, + max: 1.5, + step: 0.05, + defaultValue: scoringDefaults.params + }); + createSlider(dom.menuScore, { + label: 'Signature weight', + path: 'scoring.signature', + min: 0, + max: 0.15, + step: 0.01, + defaultValue: scoringDefaults.signature, + format: (value) => value.toFixed(2) + }); + createSlider(dom.menuScore, { + label: 'Exported boost', + path: 'scoring.exported', + min: 0, + max: 3, + step: 0.1, + defaultValue: scoringDefaults.exported + }); + createSlider(dom.menuScore, { + label: 'Modifiers weight', + path: 'scoring.modifiers', + min: 0, + max: 1.5, + step: 0.05, + defaultValue: scoringDefaults.modifiers + }); + createSlider(dom.menuScore, { + label: 'Type weight', + path: 'scoring.type', + min: 0, + max: 2, + step: 0.05, + defaultValue: scoringDefaults.type + }); + createSlider(dom.menuScore, { + label: 'Returns weight', + path: 'scoring.returns', + min: 0, + max: 2, + step: 0.05, + defaultValue: scoringDefaults.returns + }); + + createSlider(dom.menuColors, { + label: 'Hue start', + path: 'colors.hueStart', + min: 0, + max: 1, + step: 0.01, + defaultValue: colorDefaults.hueStart, + format: (value) => value.toFixed(2) + }); + createSlider(dom.menuColors, { + label: 'Hue end', + path: 'colors.hueEnd', + min: 0, + max: 1, + step: 0.01, + defaultValue: colorDefaults.hueEnd, + format: (value) => value.toFixed(2) + }); + createSlider(dom.menuColors, { + label: 'Saturation', + path: 'colors.saturation', + min: 0.2, + max: 1, + step: 0.02, + defaultValue: colorDefaults.saturation, + format: (value) => value.toFixed(2) + }); + createSlider(dom.menuColors, { + label: 'Light min', + path: 'colors.lightnessMin', + min: 0.2, + max: 0.8, + step: 0.02, + defaultValue: colorDefaults.lightnessMin, + format: (value) => value.toFixed(2) + }); + createSlider(dom.menuColors, { + label: 'Light max', + path: 'colors.lightnessMax', + min: 0.3, + max: 0.95, + step: 0.02, + defaultValue: colorDefaults.lightnessMax, + format: (value) => value.toFixed(2) + }); + + createSelect(dom.menuColorMode, { + label: 'Color mode', + value: getNested(state.panelState, 'colors.mode'), + defaultValue: colorDefaults.mode || 'score', + options: [ + { label: 'Score gradient', value: 'score' }, + { label: 'Distinct (hash)', value: 'distinct' } + ], + onChange: (value) => { + setNested(state.panelState, 'colors.mode', value); + syncStateFromPanel(); + scheduleRebuild(); + persistPanelState(); + } + }); + createSlider(dom.menuColorMode, { + label: 'Distinct saturation', + path: 'colors.distinctSaturation', + min: 0.2, + max: 1, + step: 0.02, + defaultValue: colorDefaults.distinctSaturation, + format: (value) => value.toFixed(2) + }); + 
createSlider(dom.menuColorMode, { + label: 'Distinct lightness', + path: 'colors.distinctLightness', + min: 0.2, + max: 0.85, + step: 0.02, + defaultValue: colorDefaults.distinctLightness, + format: (value) => value.toFixed(2) + }); + createSlider(dom.menuColorMode, { + label: 'Distinct hue offset', + path: 'colors.distinctHueOffset', + min: 0, + max: 1, + step: 0.01, + defaultValue: colorDefaults.distinctHueOffset, + format: (value) => value.toFixed(2) + }); + + createSlider(dom.menuVisuals, { + label: 'File opacity', + path: 'visuals.fileOpacity', + min: 0.1, + max: 1, + step: 0.05, + defaultValue: visualDefaults.fileOpacity, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateFileOpacity + }); + createSlider(dom.menuVisuals, { + label: 'Member opacity', + path: 'visuals.memberOpacity', + min: 0.1, + max: 1, + step: 0.05, + defaultValue: visualDefaults.memberOpacity, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateMemberOpacity + }); + createSlider(dom.menuVisuals, { + label: 'Wireframe thickness', + path: 'visuals.wireframeThickness', + min: 0.01, + max: 10, + step: 0.02, + defaultValue: visualDefaults.wireframeThickness, + rebuild: false, + onInput: updateWireframes + }); + createSlider(dom.menuVisuals, { + label: 'Wireframe glow', + path: 'visuals.wireframeGlow', + min: 0, + max: 2.5, + step: 0.05, + defaultValue: visualDefaults.wireframeGlow, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateWireframes + }); + createSlider(dom.menuVisuals, { + label: 'Wire pulse speed', + path: 'visuals.wirePulseSpeed', + min: 0.02, + max: 1, + step: 0.02, + defaultValue: visualDefaults.wirePulseSpeed, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateWireframes + }); + createSlider(dom.menuVisuals, { + label: 'Flow glow base', + path: 'visuals.flowGlowBase', + min: 0, + max: 2, + step: 0.05, + defaultValue: visualDefaults.flowGlowBase, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateFlowGlow + }); + createSlider(dom.menuVisuals, { + label: 'Flow glow pulse', + path: 'visuals.flowGlowRange', + min: 0, + max: 2, + step: 0.05, + defaultValue: visualDefaults.flowGlowRange, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateFlowGlow + }); + createSlider(dom.menuVisuals, { + label: 'Glow speed', + path: 'visuals.glowPulseSpeed', + min: 0.4, + max: 4, + step: 0.1, + defaultValue: visualDefaults.glowPulseSpeed, + format: (value) => value.toFixed(1), + rebuild: false + }); + createSlider(dom.menuVisuals, { + label: 'Glass roughness', + path: 'visuals.glass.roughness', + min: 0, + max: 1, + step: 0.02, + defaultValue: visualDefaults.glass.roughness, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Glass metalness', + path: 'visuals.glass.metalness', + min: 0, + max: 1, + step: 0.02, + defaultValue: visualDefaults.glass.metalness, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Glass transmission', + path: 'visuals.glass.transmission', + min: 0, + max: 1, + step: 0.02, + defaultValue: visualDefaults.glass.transmission, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Glass IOR', + path: 'visuals.glass.ior', + min: 1, + max: 2.4, + step: 0.02, + defaultValue: visualDefaults.glass.ior, + format: (value) => 
value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Glass reflectivity', + path: 'visuals.glass.reflectivity', + min: 0, + max: 1, + step: 0.02, + defaultValue: visualDefaults.glass.reflectivity, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Glass thickness', + path: 'visuals.glass.thickness', + min: 0.1, + max: 10, + step: 0.1, + defaultValue: visualDefaults.glass.thickness, + format: (value) => value.toFixed(1), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Env intensity', + path: 'visuals.glass.envMapIntensity', + min: 0, + max: 8, + step: 0.1, + defaultValue: visualDefaults.glass.envMapIntensity, + format: (value) => value.toFixed(1), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Clearcoat', + path: 'visuals.glass.clearcoat', + min: 0, + max: 1, + step: 0.02, + defaultValue: visualDefaults.glass.clearcoat, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Clearcoat rough', + path: 'visuals.glass.clearcoatRoughness', + min: 0, + max: 1, + step: 0.02, + defaultValue: visualDefaults.glass.clearcoatRoughness, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Normal scale', + path: 'visuals.glass.normalScale', + min: 0, + max: 2, + step: 0.05, + defaultValue: visualDefaults.glass.normalScale, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Clearcoat normal', + path: 'visuals.glass.clearcoatNormalScale', + min: 0, + max: 2, + step: 0.05, + defaultValue: visualDefaults.glass.clearcoatNormalScale, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: applyGlassSettings + }); + createSlider(dom.menuVisuals, { + label: 'Normal repeat', + path: 'visuals.glass.normalRepeat', + min: 1, + max: 6, + step: 1, + defaultValue: visualDefaults.glass.normalRepeat, + rebuild: false, + onInput: applyGlassSettings + }); + + createToggle(dom.menuEffects, { + label: 'Fog', + checked: visuals.enableFog !== false, + onChange: (value) => { + setNested(state.panelState, 'visuals.enableFog', value); + syncStateFromPanel(); + updateFog(); + persistPanelState(); + } + }); + createToggle(dom.menuEffects, { + label: 'Height fog', + checked: visuals.enableHeightFog === true, + onChange: (value) => { + setNested(state.panelState, 'visuals.enableHeightFog', value); + syncStateFromPanel(); + updateFog(); + persistPanelState(); + } + }); + createSlider(dom.menuEffects, { + label: 'Fog distance', + path: 'visuals.fogDistance', + min: 1.2, + max: 4, + step: 0.1, + defaultValue: visualDefaults.fogDistance, + format: (value) => value.toFixed(1), + rebuild: false, + onInput: () => updateFog() + }); + createSlider(dom.menuEffects, { + label: 'Fog height', + path: 'visuals.fogHeight', + min: 0, + max: 40, + step: 0.5, + defaultValue: visualDefaults.fogHeight, + format: (value) => value.toFixed(1), + rebuild: false, + onInput: () => updateFog() + }); + createSlider(dom.menuEffects, { + label: 'Fog height range', + path: 'visuals.fogHeightRange', + min: 4, + max: 40, + step: 0.5, + defaultValue: visualDefaults.fogHeightRange, + format: (value) => value.toFixed(1), + rebuild: false, + onInput: () => 
updateFog() + }); + createToggle(dom.menuEffects, { + label: 'Flow lights', + checked: visuals.enableFlowLights !== false, + onChange: (value) => { + setNested(state.panelState, 'visuals.enableFlowLights', value); + syncStateFromPanel(); + updateFlowLights(); + persistPanelState(); + } + }); + createToggle(dom.menuEffects, { + label: 'Extra lights', + checked: visuals.enableExtraLights !== false, + onChange: (value) => { + setNested(state.panelState, 'visuals.enableExtraLights', value); + syncStateFromPanel(); + updateExtraLights(); + persistPanelState(); + } + }); + createSlider(dom.menuEffects, { + label: 'Grid glow base', + path: 'visuals.gridGlowBase', + min: 0, + max: 0.6, + step: 0.02, + defaultValue: visualDefaults.gridGlowBase, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateGridGlow + }); + createSlider(dom.menuEffects, { + label: 'Grid glow pulse', + path: 'visuals.gridGlowRange', + min: 0, + max: 1, + step: 0.02, + defaultValue: visualDefaults.gridGlowRange, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateGridGlow + }); + createSlider(dom.menuEffects, { + label: 'Grid glow speed', + path: 'visuals.gridPulseSpeed', + min: 0.1, + max: 1, + step: 0.05, + defaultValue: visualDefaults.gridPulseSpeed, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateGridGlow + }); + createSlider(dom.menuEffects, { + label: 'Grid line thickness', + path: 'visuals.gridLineThickness', + min: 0.02, + max: 6, + step: 0.05, + defaultValue: visualDefaults.gridLineThickness, + format: (value) => value.toFixed(2), + rebuild: false, + onInput: updateGridGlow + }); + + createButton(dom.menuActions, 'Save settings', () => { + persistPanelState(); + }); + createButton(dom.menuActions, 'Reset to defaults', () => { + try { + window.localStorage.removeItem(storageKey); + } catch (err) { + // ignore storage failures + } + window.location.reload(); + }); + + state.syncStateFromPanel = syncStateFromPanel; + state.renderEdgeMenu = renderEdgeMenu; + renderSelectionDetails(state.selected?.userData || null); +}; diff --git a/src/map/isometric/client/utils.js b/src/map/isometric/client/utils.js new file mode 100644 index 000000000..faf2f1df6 --- /dev/null +++ b/src/map/isometric/client/utils.js @@ -0,0 +1,15 @@ +export const numberValue = (value, fallback) => { + const parsed = Number(value); + return Number.isFinite(parsed) ? 
parsed : fallback; +}; + +export const clamp = (value, min, max) => Math.max(min, Math.min(max, value)); + +export const hashString = (value) => { + const text = String(value || ''); + let hash = 0; + for (let i = 0; i < text.length; i += 1) { + hash = (hash * 31 + text.charCodeAt(i)) | 0; + } + return hash >>> 0; +}; diff --git a/src/map/isometric/client/viewer-app.js b/src/map/isometric/client/viewer-app.js new file mode 100644 index 000000000..f5f3c338b --- /dev/null +++ b/src/map/isometric/client/viewer-app.js @@ -0,0 +1,120 @@ +import { state } from './state.js'; +import { loadDomConfig } from './dom.js'; +import { loadThreeModules, loadRgbeLoader } from './three-loader.js'; +import { + assetDefaults, + colorDefaults, + controlDefaults, + flowTypeProfiles, + flowWaveLayers, + layoutDefaults, + scoringDefaults, + visualDefaults +} from './defaults.js'; +import { initScene } from './scene.js'; +import { initMapData } from './map-data.js'; +import { initMaterials } from './materials.js'; +import { initUi } from './ui.js'; +import { rebuildScene, scheduleRebuild } from './rebuild.js'; +import { initControls } from './controls.js'; + +const initViewer = async () => { + const { map, config, dom } = loadDomConfig(); + + if (!config.threeUrl) { + dom.selectionBody.textContent = 'Missing three.js module reference.'; + throw new Error('threeUrl missing'); + } + + const { THREE, LineSegments2, LineSegmentsGeometry, LineMaterial } = await loadThreeModules(config.threeUrl); + + const layout = { ...layoutDefaults, ...(config.layout || {}) }; + const scoring = { ...scoringDefaults, ...(config.scoring || {}) }; + const colors = { ...colorDefaults, ...(config.colors || {}) }; + const visuals = { ...visualDefaults, ...(config.visuals || {}) }; + visuals.glass = { ...visualDefaults.glass, ...(config.visuals?.glass || {}) }; + const assets = { ...assetDefaults, ...(config.assets || {}) }; + const controls = { + ...controlDefaults, + ...(config.controls || {}), + wasd: { + ...controlDefaults.wasd, + ...(config.controls?.wasd || {}) + } + }; + + const flowWaveTotal = + flowWaveLayers.reduce((acc, layer) => acc + layer.amplitude, 0) || 1; + const RGBELoader = await loadRgbeLoader(assets.rgbeLoaderUrl); + + Object.assign(state, { + map, + config, + dom, + THREE, + LineSegments2, + LineSegmentsGeometry, + LineMaterial, + RGBELoader, + layout, + scoring, + colors, + visuals, + assets, + controls, + layoutDefaults, + scoringDefaults, + colorDefaults, + visualDefaults, + controlDefaults, + flowWaveLayers, + flowWaveTotal, + flowTypeProfiles, + edgeVisibility: new Map(), + gridVisible: true, + hoveredRef: null, + hoveredMesh: null, + selected: null, + fileMeshes: [], + memberMeshes: [], + chunkMeshes: [], + fileChunkMeshes: [], + fileAnchors: new Map(), + memberAnchors: new Map(), + fileMeshByKey: new Map(), + memberMeshById: new Map(), + fileColorByPath: new Map(), + memberColorById: new Map(), + wireByMesh: new Map(), + edgeMeshes: [], + edgeSegments: [], + edgeDotMesh: null, + edgeDotMaterial: null, + edgeTypeGroups: new Map(), + edgeTypes: [], + flowLights: [], + wireMaterials: [], + gridLineMaterials: [], + labelMaterials: [], + glassMaterials: [], + glassShells: [], + glowMaterials: [], + flowMaterials: [], + normalMapState: { texture: null } + }); + + const counts = map.summary?.counts || { files: 0, members: 0, edges: 0 }; + dom.summary.textContent = + `files: ${counts.files || 0} | members: ${counts.members || 0}` + + ` | edges: ${counts.edges || 0}`; + + await initScene(); + initMapData(); + 
initMaterials(); + initUi(); + rebuildScene(); + initControls(); + state.scheduleRebuild = scheduleRebuild; +}; + +initViewer(); diff --git a/src/map/isometric/client/viewer.js b/src/map/isometric/client/viewer.js new file mode 100644 index 000000000..f519c4f93 --- /dev/null +++ b/src/map/isometric/client/viewer.js @@ -0,0 +1 @@ +import './viewer-app.js'; diff --git a/src/map/utils.js b/src/map/utils.js new file mode 100644 index 000000000..b568f02a9 --- /dev/null +++ b/src/map/utils.js @@ -0,0 +1,51 @@ +import path from 'node:path'; +import { FILE_CATEGORY_RULES } from './constants.js'; + +export const normalizePath = (value) => String(value || '').replace(/\\/g, '/'); + +export const basename = (value) => { + if (!value) return ''; + return normalizePath(path.basename(value)); +}; + +export const extension = (value) => { + if (!value) return ''; + const ext = path.extname(value); + return ext || ''; +}; + +export const classifyFilePath = (filePath) => { + const normalized = normalizePath(filePath || ''); + if (!normalized) return 'other'; + + const lower = normalized.toLowerCase(); + const ext = extension(lower); + + const isMatch = (rule) => { + if (!rule) return false; + if (rule.extensions && rule.extensions.some((entry) => lower.includes(entry + '.'))) + return true; + if (rule.extensions && rule.extensions.includes(ext)) return true; + if (rule.names && rule.names.some((name) => lower.includes('/' + name + '/'))) return true; + if (rule.patterns && rule.patterns.some((pattern) => pattern.test(lower))) return true; + return false; + }; + + if (isMatch(FILE_CATEGORY_RULES.generated)) return 'generated'; + if (isMatch(FILE_CATEGORY_RULES.test)) return 'test'; + if (isMatch(FILE_CATEGORY_RULES.docs)) return 'docs'; + if (isMatch(FILE_CATEGORY_RULES.config)) return 'config'; + return 'source'; +}; + +export const sortBy = (list, keyFn) => { + return list.slice().sort((a, b) => { + const left = keyFn(a); + const right = keyFn(b); + return String(left).localeCompare(String(right)); + }); +}; + +export const unique = (values) => Array.from(new Set((values || []).filter(Boolean))); + +export const clamp = (value, min, max) => Math.min(max, Math.max(min, value)); diff --git a/src/retrieval/bitmap.js b/src/retrieval/bitmap.js new file mode 100644 index 000000000..3cf15c599 --- /dev/null +++ b/src/retrieval/bitmap.js @@ -0,0 +1,207 @@ +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const DEFAULT_MIN_SIZE = 256; +let roaringLib = null; +let roaringChecked = false; + +const resolveRoaring = () => { + if (roaringChecked) return roaringLib; + roaringChecked = true; + try { + roaringLib = require('roaring-wasm'); + } catch { + roaringLib = null; + } + return roaringLib; +}; + +const resolveBitmapClass = () => { + const lib = resolveRoaring(); + if (!lib) return null; + return lib.RoaringBitmap32 + || lib.RoaringBitmap + || lib.default?.RoaringBitmap32 + || lib.default?.RoaringBitmap + || lib.default + || null; +}; + +const normalizeIds = (values) => { + if (!values) return []; + const list = Array.isArray(values) ? 
values : Array.from(values); + const ids = []; + for (const value of list) { + const parsed = Number(value); + if (!Number.isFinite(parsed)) continue; + const id = Math.floor(parsed); + if (id < 0) continue; + ids.push(id); + } + ids.sort((a, b) => a - b); + const deduped = []; + let last = null; + for (const id of ids) { + if (id === last) continue; + deduped.push(id); + last = id; + } + return deduped; +}; + +const cloneBitmap = (bitmap) => { + if (!bitmap) return null; + if (typeof bitmap.clone === 'function') return bitmap.clone(); + const ids = bitmapToArray(bitmap); + return createBitmapFromIds(ids, { force: true }); +}; + +const bitmapHas = (bitmap, value) => { + if (!bitmap) return false; + if (typeof bitmap.has === 'function') return bitmap.has(value); + if (typeof bitmap.contains === 'function') return bitmap.contains(value); + if (typeof bitmap.includes === 'function') return bitmap.includes(value); + return false; +}; + +const getBitmapSize = (bitmap) => { + if (!bitmap) return 0; + if (Number.isFinite(bitmap.size)) return bitmap.size; + if (typeof bitmap.size === 'function') return bitmap.size(); + if (typeof bitmap.getSize === 'function') return bitmap.getSize(); + return bitmapToArray(bitmap).length; +}; + +export const isBitmapEmpty = (bitmap) => getBitmapSize(bitmap) === 0; + +export const isRoaringAvailable = () => Boolean(resolveBitmapClass()); + +export const shouldUseBitmap = (size, minSize = DEFAULT_MIN_SIZE) => ( + Number.isFinite(size) && size >= minSize +); + +export const bitmapToArray = (bitmap) => { + if (!bitmap) return []; + if (typeof bitmap.toArray === 'function') return bitmap.toArray(); + if (typeof bitmap.toArraySync === 'function') return bitmap.toArraySync(); + if (typeof bitmap.values === 'function') return Array.from(bitmap.values()); + return Array.from(bitmap || []); +}; + +export const bitmapToSet = (bitmap) => new Set(bitmapToArray(bitmap)); + +export const createBitmapFromIds = (values, options = {}) => { + const Bitmap = resolveBitmapClass(); + if (!Bitmap) return null; + const minSize = Number.isFinite(Number(options.minSize)) + ? 
Math.max(1, Math.floor(Number(options.minSize))) + : DEFAULT_MIN_SIZE; + const force = options.force === true; + const ids = normalizeIds(values); + if (!ids.length) return null; + if (!force && !shouldUseBitmap(ids.length, minSize)) return null; + let bitmap = null; + if (typeof Bitmap.from === 'function') { + bitmap = Bitmap.from(ids); + } else { + bitmap = new Bitmap(); + if (typeof bitmap.addMany === 'function') { + bitmap.addMany(ids); + } else { + for (const id of ids) bitmap.add(id); + } + } + return bitmap; +}; + +export const unionBitmaps = (bitmaps) => { + if (!Array.isArray(bitmaps) || !bitmaps.length) return null; + let acc = cloneBitmap(bitmaps[0]); + for (let i = 1; i < bitmaps.length; i += 1) { + const next = bitmaps[i]; + if (!next || !acc) continue; + if (typeof acc.orInPlace === 'function') { + acc.orInPlace(next); + } else if (typeof acc.or === 'function') { + acc = acc.or(next); + } else if (typeof acc.union === 'function') { + acc = acc.union(next); + } else { + const merged = [...bitmapToArray(acc), ...bitmapToArray(next)]; + acc = createBitmapFromIds(merged, { force: true }); + } + } + return acc; +}; + +export const intersectBitmaps = (bitmaps) => { + if (!Array.isArray(bitmaps) || !bitmaps.length) return null; + let acc = cloneBitmap(bitmaps[0]); + for (let i = 1; i < bitmaps.length; i += 1) { + const next = bitmaps[i]; + if (!next || !acc) continue; + if (typeof acc.andInPlace === 'function') { + acc.andInPlace(next); + } else if (typeof acc.and === 'function') { + acc = acc.and(next); + } else if (typeof acc.intersect === 'function') { + acc = acc.intersect(next); + } else { + const left = bitmapToArray(acc); + const right = new Set(bitmapToArray(next)); + const merged = []; + for (const id of left) { + if (right.has(id)) merged.push(id); + } + acc = createBitmapFromIds(merged, { force: true }); + } + if (!acc || isBitmapEmpty(acc)) return acc; + } + return acc; +}; + +export const intersectSetWithBitmap = (set, bitmap) => { + const out = new Set(); + if (!set || !bitmap) return out; + const hasMethod = typeof bitmap.has === 'function' + || typeof bitmap.contains === 'function' + || typeof bitmap.includes === 'function'; + if (!hasMethod) { + const bitmapSet = bitmapToSet(bitmap); + for (const id of set) { + if (bitmapSet.has(id)) out.add(id); + } + return out; + } + for (const id of set) { + if (bitmapHas(bitmap, id)) out.add(id); + } + return out; +}; + +export const buildBitmapIndex = (index, options = {}) => { + const Bitmap = resolveBitmapClass(); + if (!Bitmap || !index) return null; + const minSize = Number.isFinite(Number(options.minSize)) + ? 
Math.max(1, Math.floor(Number(options.minSize))) + : DEFAULT_MIN_SIZE; + const buildMap = (source) => { + const out = new Map(); + if (!source || typeof source.entries !== 'function') return out; + for (const [key, set] of source.entries()) { + if (!set || !shouldUseBitmap(set.size, minSize)) continue; + const bitmap = createBitmapFromIds(set, { force: true, minSize }); + if (bitmap) out.set(key, bitmap); + } + return out; + }; + return { + enabled: true, + minSize, + byExt: buildMap(index.byExt), + byKind: buildMap(index.byKind), + byAuthor: buildMap(index.byAuthor), + byChunkAuthor: buildMap(index.byChunkAuthor), + byVisibility: buildMap(index.byVisibility) + }; +}; diff --git a/src/retrieval/cli-args.js b/src/retrieval/cli-args.js new file mode 100644 index 000000000..acaba1859 --- /dev/null +++ b/src/retrieval/cli-args.js @@ -0,0 +1,177 @@ +import yargs from 'yargs/yargs'; + +const BOOLEAN_FLAGS = [ + 'json', + 'json-compact', + 'stats', + 'ann', + 'lint', + 'matched', + 'async', + 'generator', + 'returns', + 'explain', + 'why', + 'case', + 'case-file', + 'case-tokens' +]; + +const STRING_FLAGS = [ + 'type', + 'author', + 'import', + 'calls', + 'uses', + 'signature', + 'param', + 'decorator', + 'inferred-type', + 'return-type', + 'throws', + 'reads', + 'writes', + 'mutates', + 'churn', + 'alias', + 'awaits', + 'branches', + 'loops', + 'breaks', + 'continues', + 'risk', + 'risk-tag', + 'risk-source', + 'risk-sink', + 'risk-category', + 'risk-flow', + 'struct-pack', + 'struct-rule', + 'struct-tag', + 'meta', + 'meta-json', + 'file', + 'ext', + 'lang', + 'chunk-author', + 'modified-after', + 'modified-since', + 'visibility', + 'extends', + 'mode', + 'backend', + 'path', + 'model', + 'repo', + 'branch', + 'fts-profile', + 'fts-weights', + 'bm25-k1', + 'bm25-b', + 'profile' +]; + +const ALIASES = { n: 'top', c: 'context', t: 'type', why: 'explain' }; +const DEFAULTS = { n: 5, context: 3 }; + +/** + * Parse CLI arguments for search. + * @param {string[]} rawArgs + * @returns {object} + */ +export function parseSearchArgs(rawArgs) { + const removedFlags = [ + { flag: '--human', replacement: '--json | --json-compact' }, + { flag: '--headline', replacement: '--matched' } + ]; + const removed = removedFlags.filter((entry) => + rawArgs.some((arg) => arg === entry.flag || arg.startsWith(`${entry.flag}=`)) + ); + if (removed.length) { + const details = removed + .map((entry) => `${entry.flag} was removed (use ${entry.replacement}).`) + .join(' '); + const error = new Error(details); + error.code = 'REMOVED_FLAG'; + throw error; + } + const options = { + n: { type: 'number', default: DEFAULTS.n }, + context: { type: 'number', default: DEFAULTS.context } + }; + for (const flag of BOOLEAN_FLAGS) { + options[flag] = { type: 'boolean' }; + } + for (const flag of STRING_FLAGS) { + options[flag] = { type: 'string' }; + } + const argv = yargs(rawArgs) + .parserConfiguration({ + 'camel-case-expansion': false, + 'dot-notation': false + }) + .options(options) + .alias(ALIASES) + .help() + .alias('h', 'help') + .parse(); + if (argv.profile) { + process.env.PAIROFCLEATS_PROFILE = String(argv.profile).trim(); + } + return argv; +} + +/** + * Build a usage string for search CLI. 
+ * @returns {string} + */ +export function getSearchUsage() { + return [ + 'usage: search "query" [options]', + '', + 'Options:', + ' --repo ', + ' --mode code|prose|both|records|all|extracted-prose', + ' --backend auto|memory|sqlite|sqlite-fts|lmdb', + ' --top N, --context N', + ' --json | --json-compact | --stats', + ' --ann | --no-ann', + ' --model ', + ' --fts-profile | --fts-weights ', + ' --bm25-k1 | --bm25-b ', + ' --profile ', + ' --matched | --explain | --why', + ' Filters:', + ' --type --author --import --calls --uses ', + ' --signature --param --decorator --inferred-type --return-type ', + ' --throws --reads --writes --mutates --alias --awaits ', + ' --branches --loops --breaks --continues ', + ' --risk --risk-tag --risk-source --risk-sink --risk-category --risk-flow ', + ' --struct-pack --struct-rule --struct-tag ', + ' --visibility --extends --async --generator --returns --lint', + ' --churn [min] --modified-after --modified-since --chunk-author ', + ' --path --file --ext <.ext> --lang --branch ', + ' --case --case-file --case-tokens', + ' --meta --meta-json ' + ].join('\n'); +} + +/** + * Resolve the requested search mode and derived flags. + * @param {string|undefined} modeRaw + * @returns {{searchMode:string,runCode:boolean,runProse:boolean,runRecords:boolean,runExtractedProse:boolean}} + */ +export function resolveSearchMode(modeRaw) { + const searchMode = String(modeRaw || 'both').toLowerCase(); + const allowedModes = new Set(['code', 'prose', 'both', 'records', 'all', 'extracted-prose']); + if (!allowedModes.has(searchMode)) { + const error = new Error(`Invalid --mode ${searchMode}. Use code|prose|both|records|all|extracted-prose.`); + error.code = 'INVALID_MODE'; + throw error; + } + const runCode = searchMode === 'code' || searchMode === 'both' || searchMode === 'all'; + const runProse = searchMode === 'prose' || searchMode === 'both' || searchMode === 'all'; + const runRecords = searchMode === 'records' || searchMode === 'all'; + const runExtractedProse = searchMode === 'extracted-prose' || searchMode === 'all'; + return { searchMode, runCode, runProse, runRecords, runExtractedProse }; +} diff --git a/src/retrieval/cli-dictionary.js b/src/retrieval/cli-dictionary.js new file mode 100644 index 000000000..6eec9cefd --- /dev/null +++ b/src/retrieval/cli-dictionary.js @@ -0,0 +1,24 @@ +import fsSync from 'node:fs'; +import { getDictionaryPaths } from '../../tools/dict-utils.js'; + +/** + * Load dictionary files into a normalized Set. 
+ * @param {string} root + * @param {object} dictConfig + * @returns {Promise<{dict:Set, dictionaryPaths:string[]}>} + */ +export async function loadDictionary(root, dictConfig) { + const dictionaryPaths = await getDictionaryPaths(root, dictConfig); + const dict = new Set(); + for (const dictFile of dictionaryPaths) { + try { + const contents = fsSync.readFileSync(dictFile, 'utf8'); + contents + .split(/\r?\n/) + .map((word) => word.trim().toLowerCase()) + .filter(Boolean) + .forEach((word) => dict.add(word)); + } catch {} + } + return { dict, dictionaryPaths }; +} diff --git a/src/retrieval/cli-index.js b/src/retrieval/cli-index.js new file mode 100644 index 000000000..24ab8ae58 --- /dev/null +++ b/src/retrieval/cli-index.js @@ -0,0 +1,334 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import crypto from 'node:crypto'; +import { getIndexDir } from '../../tools/dict-utils.js'; +import { buildFilterIndex, hydrateFilterIndex } from './filter-index.js'; +import { createError, ERROR_CODES } from '../shared/error-codes.js'; +import { + MAX_JSON_BYTES, + loadChunkMeta, + loadTokenPostings, + readJsonFile +} from '../shared/artifact-io.js'; +import { loadHnswIndex, normalizeHnswConfig, resolveHnswPaths, validateHnswMetaCompatibility } from '../shared/hnsw.js'; + +/** + * Load file-backed index artifacts from a directory. + * @param {string} dir + * @param {{modelIdDefault:string}} options + * @returns {object} + */ +export function loadIndex(dir, options) { + const { + modelIdDefault, + fileChargramN, + includeHnsw = true, + hnswConfig: rawHnswConfig + } = options || {}; + const hnswConfig = normalizeHnswConfig(rawHnswConfig || {}); + const readJson = (name) => { + const filePath = path.join(dir, name); + return readJsonFile(filePath, { maxBytes: MAX_JSON_BYTES }); + }; + const loadOptional = (name) => { + try { + return readJson(name); + } catch (err) { + if (err?.code === 'ERR_JSON_TOO_LARGE') { + console.warn( + `[search] Skipping ${name}: ${err.message} Use sqlite backend for large repos.` + ); + } + return null; + } + }; + const chunkMeta = loadChunkMeta(dir, { maxBytes: MAX_JSON_BYTES }); + const fileMetaRaw = loadOptional('file_meta.json'); + let fileMetaById = null; + if (Array.isArray(fileMetaRaw)) { + fileMetaById = new Map(); + for (const entry of fileMetaRaw) { + if (!entry || entry.id == null) continue; + fileMetaById.set(entry.id, entry); + } + } + if (!fileMetaById) { + const missingMeta = chunkMeta.some((chunk) => chunk && chunk.fileId != null && !chunk.file); + if (missingMeta) { + throw new Error('file_meta.json is required for fileId-based chunk metadata.'); + } + } else { + for (const chunk of chunkMeta) { + if (!chunk || (chunk.file && chunk.ext)) continue; + const meta = fileMetaById.get(chunk.fileId); + if (!meta) continue; + if (!chunk.file) chunk.file = meta.file; + if (!chunk.ext) chunk.ext = meta.ext; + if (!chunk.externalDocs) chunk.externalDocs = meta.externalDocs; + if (!chunk.last_modified) chunk.last_modified = meta.last_modified; + if (!chunk.last_author) chunk.last_author = meta.last_author; + if (!chunk.churn) chunk.churn = meta.churn; + if (!chunk.churn_added) chunk.churn_added = meta.churn_added; + if (!chunk.churn_deleted) chunk.churn_deleted = meta.churn_deleted; + if (!chunk.churn_commits) chunk.churn_commits = meta.churn_commits; + } + } + const fileRelationsRaw = loadOptional('file_relations.json'); + const repoMap = loadOptional('repo_map.json'); + let fileRelations = null; + if (Array.isArray(fileRelationsRaw)) { + const map = new 
Map(); + for (const entry of fileRelationsRaw) { + if (!entry || !entry.file) continue; + map.set(entry.file, entry.relations || null); + } + fileRelations = map; + } + const indexState = loadOptional('index_state.json'); + const embeddingsState = indexState?.embeddings || null; + const embeddingsReady = embeddingsState?.ready !== false && embeddingsState?.pending !== true; + const denseVec = embeddingsReady ? loadOptional('dense_vectors_uint8.json') : null; + const denseVecDoc = embeddingsReady ? loadOptional('dense_vectors_doc_uint8.json') : null; + const denseVecCode = embeddingsReady ? loadOptional('dense_vectors_code_uint8.json') : null; + if (denseVec && !denseVec.model && modelIdDefault) denseVec.model = modelIdDefault; + if (denseVecDoc && !denseVecDoc.model && modelIdDefault) denseVecDoc.model = modelIdDefault; + if (denseVecCode && !denseVecCode.model && modelIdDefault) denseVecCode.model = modelIdDefault; + const hnswMeta = embeddingsReady && includeHnsw && hnswConfig.enabled + ? loadOptional('dense_vectors_hnsw.meta.json') + : null; + let hnswIndex = null; + let hnswAvailable = false; + if (hnswMeta && includeHnsw && hnswConfig.enabled) { + const compatibility = validateHnswMetaCompatibility({ denseVectors: denseVec, hnswMeta }); + if (!compatibility.ok) { + console.warn(`[ann] Skipping HNSW index load due to incompatible metadata: ${compatibility.warnings.join('; ')}`); + } else { + const { indexPath } = resolveHnswPaths(dir); + const mergedConfig = { + ...hnswConfig, + space: hnswMeta.space || hnswConfig.space, + efSearch: hnswMeta.efSearch || hnswConfig.efSearch + }; + hnswIndex = loadHnswIndex({ indexPath, dims: hnswMeta.dims, config: mergedConfig }); + hnswAvailable = Boolean(hnswIndex); + } + } + const fieldPostings = loadOptional('field_postings.json'); + const fieldTokens = loadOptional('field_tokens.json'); + const filterIndexRaw = loadOptional('filter_index.json'); + const idx = { + chunkMeta, + fileRelations, + repoMap, + denseVec, + denseVecDoc, + denseVecCode, + hnsw: hnswMeta ? { + available: hnswAvailable, + index: hnswIndex, + meta: hnswMeta, + space: hnswMeta.space || hnswConfig.space + } : { available: false, index: null, meta: null, space: hnswConfig.space }, + state: indexState, + fieldPostings, + fieldTokens, + minhash: loadOptional('minhash_signatures.json'), + phraseNgrams: loadOptional('phrase_ngrams.json'), + chargrams: loadOptional('chargram_postings.json') + }; + if (idx.phraseNgrams?.vocab && !idx.phraseNgrams.vocabIndex) { + idx.phraseNgrams.vocabIndex = new Map(idx.phraseNgrams.vocab.map((term, i) => [term, i])); + } + if (idx.chargrams?.vocab && !idx.chargrams.vocabIndex) { + idx.chargrams.vocabIndex = new Map(idx.chargrams.vocab.map((term, i) => [term, i])); + } + if (idx.fieldPostings?.fields) { + for (const field of Object.keys(idx.fieldPostings.fields)) { + const entry = idx.fieldPostings.fields[field]; + if (!entry?.vocab || entry.vocabIndex) continue; + entry.vocabIndex = new Map(entry.vocab.map((term, i) => [term, i])); + } + } + idx.filterIndex = filterIndexRaw + ? (hydrateFilterIndex(filterIndexRaw) || buildFilterIndex(chunkMeta, { fileChargramN })) + : buildFilterIndex(chunkMeta, { fileChargramN }); + try { + idx.tokenIndex = loadTokenPostings(dir, { maxBytes: MAX_JSON_BYTES }); + } catch {} + return idx; +} + +/** + * Resolve the index directory (cache-first, local fallback). 
+ * @param {string} root + * @param {'code'|'prose'|'records'|'extracted-prose'} mode + * @param {object} userConfig + * @returns {string} + */ +export function resolveIndexDir(root, mode, userConfig) { + const cached = getIndexDir(root, mode, userConfig); + const cachedMeta = path.join(cached, 'chunk_meta.json'); + const cachedMetaJsonl = path.join(cached, 'chunk_meta.jsonl'); + const cachedMetaParts = path.join(cached, 'chunk_meta.meta.json'); + const cachedPartsDir = path.join(cached, 'chunk_meta.parts'); + if (fsSync.existsSync(cachedMeta) + || fsSync.existsSync(cachedMetaJsonl) + || fsSync.existsSync(cachedMetaParts) + || fsSync.existsSync(cachedPartsDir)) { + return cached; + } + const local = path.join(root, `index-${mode}`); + const localMeta = path.join(local, 'chunk_meta.json'); + const localMetaJsonl = path.join(local, 'chunk_meta.jsonl'); + const localMetaParts = path.join(local, 'chunk_meta.meta.json'); + const localPartsDir = path.join(local, 'chunk_meta.parts'); + if (fsSync.existsSync(localMeta) + || fsSync.existsSync(localMetaJsonl) + || fsSync.existsSync(localMetaParts) + || fsSync.existsSync(localPartsDir)) { + return local; + } + return cached; +} + +/** + * Ensure a file-backed index exists for a mode. + * @param {string} root + * @param {'code'|'prose'|'records'|'extracted-prose'} mode + * @param {object} userConfig + * @returns {string} + */ +export function requireIndexDir(root, mode, userConfig, options = {}) { + const dir = resolveIndexDir(root, mode, userConfig); + const metaPath = path.join(dir, 'chunk_meta.json'); + const metaJsonlPath = path.join(dir, 'chunk_meta.jsonl'); + const metaPartsPath = path.join(dir, 'chunk_meta.meta.json'); + const metaPartsDir = path.join(dir, 'chunk_meta.parts'); + if (!fsSync.existsSync(metaPath) + && !fsSync.existsSync(metaJsonlPath) + && !fsSync.existsSync(metaPartsPath) + && !fsSync.existsSync(metaPartsDir)) { + const suffix = (mode === 'records' || mode === 'extracted-prose') + ? ` --mode ${mode}` + : ''; + const message = `[search] ${mode} index not found at ${dir}. Run "pairofcleats index build${suffix}" or "npm run build-index${suffix}".`; + const emitOutput = options.emitOutput !== false; + const exitOnError = options.exitOnError !== false; + if (emitOutput) console.error(message); + if (exitOnError) process.exit(1); + throw createError(ERROR_CODES.NO_INDEX, message); + } + return dir; +} + +/** + * Build a deterministic cache key for the current query + settings. + * @param {object} payload + * @returns {{key:string,payload:object}} + */ +export function buildQueryCacheKey(payload) { + const raw = JSON.stringify(payload); + const key = crypto.createHash('sha1').update(raw).digest('hex'); + return { key, payload }; +} + +/** + * Build a signature payload for cache invalidation. + * @param {object} options + * @returns {object} + */ +export function getIndexSignature(options) { + const { + useSqlite, + backendLabel, + sqliteCodePath, + sqliteProsePath, + runRecords, + runExtractedProse, + root, + userConfig + } = options; + const fileSignature = (filePath) => { + try { + let statPath = filePath; + if (!fsSync.existsSync(statPath) && filePath.endsWith('.json')) { + const gzPath = `${filePath}.gz`; + if (fsSync.existsSync(gzPath)) statPath = gzPath; + } + const stat = fsSync.statSync(statPath); + return `${stat.size}:${stat.mtimeMs}`; + } catch { + return null; + } + }; + + const extractedProseDir = runExtractedProse + ? 
resolveIndexDir(root, 'extracted-prose', userConfig) + : null; + const extractedProseMeta = extractedProseDir ? path.join(extractedProseDir, 'chunk_meta.json') : null; + const extractedProseDense = extractedProseDir ? path.join(extractedProseDir, 'dense_vectors_uint8.json') : null; + const extractedProseHnswMeta = extractedProseDir ? path.join(extractedProseDir, 'dense_vectors_hnsw.meta.json') : null; + const extractedProseHnswIndex = extractedProseDir ? path.join(extractedProseDir, 'dense_vectors_hnsw.bin') : null; + + if (useSqlite) { + const codeDir = resolveIndexDir(root, 'code', userConfig); + const proseDir = resolveIndexDir(root, 'prose', userConfig); + const codeRelations = path.join(codeDir, 'file_relations.json'); + const proseRelations = path.join(proseDir, 'file_relations.json'); + const recordDir = runRecords ? resolveIndexDir(root, 'records', userConfig) : null; + const recordMeta = recordDir ? path.join(recordDir, 'chunk_meta.json') : null; + const recordDense = recordDir ? path.join(recordDir, 'dense_vectors_uint8.json') : null; + return { + backend: backendLabel, + code: fileSignature(sqliteCodePath), + prose: fileSignature(sqliteProsePath), + codeRelations: fileSignature(codeRelations), + proseRelations: fileSignature(proseRelations), + extractedProse: extractedProseMeta ? fileSignature(extractedProseMeta) : null, + extractedProseDense: extractedProseDense ? fileSignature(extractedProseDense) : null, + extractedProseHnswMeta: extractedProseHnswMeta ? fileSignature(extractedProseHnswMeta) : null, + extractedProseHnswIndex: extractedProseHnswIndex ? fileSignature(extractedProseHnswIndex) : null, + records: recordMeta ? fileSignature(recordMeta) : null, + recordsDense: recordDense ? fileSignature(recordDense) : null + }; + } + + const codeDir = resolveIndexDir(root, 'code', userConfig); + const proseDir = resolveIndexDir(root, 'prose', userConfig); + const codeMeta = path.join(codeDir, 'chunk_meta.json'); + const proseMeta = path.join(proseDir, 'chunk_meta.json'); + const codeDense = path.join(codeDir, 'dense_vectors_uint8.json'); + const proseDense = path.join(proseDir, 'dense_vectors_uint8.json'); + const codeHnswMeta = path.join(codeDir, 'dense_vectors_hnsw.meta.json'); + const codeHnswIndex = path.join(codeDir, 'dense_vectors_hnsw.bin'); + const proseHnswMeta = path.join(proseDir, 'dense_vectors_hnsw.meta.json'); + const proseHnswIndex = path.join(proseDir, 'dense_vectors_hnsw.bin'); + const codeRelations = path.join(codeDir, 'file_relations.json'); + const proseRelations = path.join(proseDir, 'file_relations.json'); + const recordDir = runRecords ? resolveIndexDir(root, 'records', userConfig) : null; + const recordMeta = recordDir ? path.join(recordDir, 'chunk_meta.json') : null; + const recordDense = recordDir ? path.join(recordDir, 'dense_vectors_uint8.json') : null; + const recordHnswMeta = recordDir ? path.join(recordDir, 'dense_vectors_hnsw.meta.json') : null; + const recordHnswIndex = recordDir ? path.join(recordDir, 'dense_vectors_hnsw.bin') : null; + return { + backend: backendLabel, + code: fileSignature(codeMeta), + prose: fileSignature(proseMeta), + codeDense: fileSignature(codeDense), + proseDense: fileSignature(proseDense), + codeHnswMeta: fileSignature(codeHnswMeta), + codeHnswIndex: fileSignature(codeHnswIndex), + proseHnswMeta: fileSignature(proseHnswMeta), + proseHnswIndex: fileSignature(proseHnswIndex), + codeRelations: fileSignature(codeRelations), + proseRelations: fileSignature(proseRelations), + extractedProse: extractedProseMeta ? 
fileSignature(extractedProseMeta) : null, + extractedProseDense: extractedProseDense ? fileSignature(extractedProseDense) : null, + extractedProseHnswMeta: extractedProseHnswMeta ? fileSignature(extractedProseHnswMeta) : null, + extractedProseHnswIndex: extractedProseHnswIndex ? fileSignature(extractedProseHnswIndex) : null, + records: recordMeta ? fileSignature(recordMeta) : null, + recordsDense: recordDense ? fileSignature(recordDense) : null, + recordsHnswMeta: recordHnswMeta ? fileSignature(recordHnswMeta) : null, + recordsHnswIndex: recordHnswIndex ? fileSignature(recordHnswIndex) : null + }; +} diff --git a/src/retrieval/cli-lmdb.js b/src/retrieval/cli-lmdb.js new file mode 100644 index 000000000..010e9f89d --- /dev/null +++ b/src/retrieval/cli-lmdb.js @@ -0,0 +1,104 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import { Unpackr } from 'msgpackr'; +import { LMDB_META_KEYS, LMDB_SCHEMA_VERSION } from '../storage/lmdb/schema.js'; + +let open = null; +try { + ({ open } = await import('lmdb')); +} catch {} + +const unpackr = new Unpackr(); +const decode = (value) => (value == null ? null : unpackr.unpack(value)); + +const isStorePresent = (storePath) => { + if (!storePath || !fsSync.existsSync(storePath)) return false; + return fsSync.existsSync(path.join(storePath, 'data.mdb')); +}; + +const validateStore = (db, label) => { + const version = decode(db.get(LMDB_META_KEYS.schemaVersion)); + if (version !== LMDB_SCHEMA_VERSION) { + return { ok: false, reason: `lmdb schema mismatch (expected ${LMDB_SCHEMA_VERSION}, got ${version ?? 'missing'})` }; + } + const mode = decode(db.get(LMDB_META_KEYS.mode)); + if (mode && mode !== label) { + return { ok: false, reason: `lmdb mode mismatch (expected ${label}, got ${mode})` }; + } + return { ok: true }; +}; + +export async function createLmdbBackend(options) { + const { + useLmdb: useLmdbInput, + needsCode, + needsProse, + lmdbCodePath, + lmdbProsePath, + backendForcedLmdb, + lmdbStates + } = options; + let useLmdb = useLmdbInput; + let dbCode = null; + let dbProse = null; + + if (!useLmdb) { + return { useLmdb, dbCode, dbProse, isAvailable: false }; + } + + if (!open) { + const message = 'lmdb is required for the LMDB backend. 
+export async function createLmdbBackend(options) {
+  const {
+    useLmdb: useLmdbInput,
+    needsCode,
+    needsProse,
+    lmdbCodePath,
+    lmdbProsePath,
+    backendForcedLmdb,
+    lmdbStates
+  } = options;
+  let useLmdb = useLmdbInput;
+  let dbCode = null;
+  let dbProse = null;
+
+  if (!useLmdb) {
+    return { useLmdb, dbCode, dbProse, isAvailable: false };
+  }
+
+  if (!open) {
+    const message = 'lmdb is required for the LMDB backend. Run npm install first.';
+    if (backendForcedLmdb) {
+      throw new Error(message);
+    }
+    console.warn(message);
+    useLmdb = false;
+    return { useLmdb, dbCode, dbProse, isAvailable: false };
+  }
+
+  const isLmdbReady = (mode) => {
+    const state = lmdbStates?.[mode] || null;
+    const lmdbState = state?.lmdb || null;
+    if (!lmdbState) return true;
+    return lmdbState.ready !== false && lmdbState.pending !== true;
+  };
+  const pendingModes = [];
+  if (needsCode && !isLmdbReady('code')) pendingModes.push('code');
+  if (needsProse && !isLmdbReady('prose')) pendingModes.push('prose');
+  if (pendingModes.length) {
+    const message = `LMDB ${pendingModes.join(', ')} index marked pending; falling back to file-backed indexes.`;
+    if (backendForcedLmdb) {
+      throw new Error(message);
+    }
+    console.warn(message);
+    useLmdb = false;
+    return { useLmdb, dbCode, dbProse, isAvailable: false };
+  }
+
+  const openStore = (storePath, label) => {
+    if (!isStorePresent(storePath)) return null;
+    const db = open({ path: storePath, readOnly: true });
+    const validation = validateStore(db, label);
+    if (!validation.ok) {
+      db.close();
+      if (backendForcedLmdb) {
+        throw new Error(`LMDB ${label} invalid: ${validation.reason}`);
+      }
+      console.warn(`LMDB ${label} invalid: ${validation.reason}`);
+      return null;
+    }
+    return db;
+  };
+
+  if (needsCode) dbCode = openStore(lmdbCodePath, 'code');
+  if (needsProse) dbProse = openStore(lmdbProsePath, 'prose');
+  if ((needsCode && !dbCode) || (needsProse && !dbProse)) {
+    if (dbCode) dbCode.close();
+    if (dbProse) dbProse.close();
+    dbCode = null;
+    dbProse = null;
+    useLmdb = false;
+  }
+
+  return { useLmdb, dbCode, dbProse, isAvailable: Boolean(dbCode || dbProse) };
+}
diff --git a/src/retrieval/cli-sqlite.js b/src/retrieval/cli-sqlite.js
new file mode 100644
index 000000000..2806e43ef
--- /dev/null
+++ b/src/retrieval/cli-sqlite.js
@@ -0,0 +1,193 @@
+import { hasVectorTable, loadVectorExtension, resolveVectorExtensionPath } from '../../tools/vector-extension.js';
+
+import { parseEnvBool } from '../shared/env.js';
+
+/**
+ * Initialize SQLite connections for search.
+ * @param {object} options
+ * @returns {Promise<{useSqlite:boolean,dbCode:(object|null),dbProse:(object|null),vectorAnnState:object,vectorAnnUsed:object}>}
+ */
+export async function createSqliteBackend(options) {
+  const {
+    useSqlite: useSqliteInput,
+    needsCode,
+    needsProse,
+    sqliteCodePath,
+    sqliteProsePath,
+    sqliteFtsRequested,
+    backendForcedSqlite,
+    vectorExtension,
+    vectorAnnEnabled,
+    dbCache,
+    sqliteStates
+  } = options;
+
+  let useSqlite = useSqliteInput;
+  let dbCode = null;
+  let dbProse = null;
+  const vectorAnnState = {
+    code: { available: false },
+    prose: { available: false },
+    records: { available: false }
+  };
+  const vectorAnnUsed = { code: false, prose: false, records: false };
+
+  if (!useSqlite) {
+    return { useSqlite, dbCode, dbProse, vectorAnnState, vectorAnnUsed };
+  }
+
+  const isSqliteReady = (mode) => {
+    const state = sqliteStates?.[mode] || null;
+    const sqliteState = state?.sqlite || null;
+    if (!sqliteState) return true;
+    return sqliteState.ready !== false && sqliteState.pending !== true;
+  };
+  const pendingModes = [];
+  if (needsCode && !isSqliteReady('code')) pendingModes.push('code');
+  if (needsProse && !isSqliteReady('prose')) pendingModes.push('prose');
+  if (pendingModes.length) {
+    const message = `SQLite ${pendingModes.join(', ')} index marked pending; falling back to file-backed indexes.`;
+    if (backendForcedSqlite) {
+      throw new Error(message);
+    }
+    console.warn(message);
+    useSqlite = false;
+    return { useSqlite, dbCode, dbProse, vectorAnnState, vectorAnnUsed };
+  }
+
+  const sqliteDisabled = parseEnvBool(process.env.PAIROFCLEATS_SQLITE_DISABLED) === true;
+  if (sqliteDisabled) {
+    const message = 'better-sqlite3 is required for the SQLite backend. Run npm install first.';
+    if (backendForcedSqlite) {
+      throw new Error(message);
+    }
+    console.warn(message);
+    useSqlite = false;
+    return { useSqlite, dbCode, dbProse, vectorAnnState, vectorAnnUsed };
+  }
+
+  let Database;
+  try {
+    ({ default: Database } = await import('better-sqlite3'));
+  } catch {
+    const message = 'better-sqlite3 is required for the SQLite backend. Run npm install first.';
+    if (backendForcedSqlite) {
+      throw new Error(message);
+    }
+    console.warn(message);
+    useSqlite = false;
+    return { useSqlite, dbCode, dbProse, vectorAnnState, vectorAnnUsed };
+  }
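+
+  // FTS mode needs only the chunk/FTS/dense tables; sparse mode additionally
+  // requires the token, phrase, and chargram posting tables produced by
+  // `npm run build-sqlite-index`.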
+  const requiredTables = sqliteFtsRequested
+    ? [
+      'chunks',
+      'chunks_fts',
+      'minhash_signatures',
+      'dense_vectors',
+      'dense_meta'
+    ]
+    : [
+      'chunks',
+      'token_vocab',
+      'token_postings',
+      'doc_lengths',
+      'token_stats',
+      'phrase_vocab',
+      'phrase_postings',
+      'chargram_vocab',
+      'chargram_postings',
+      'minhash_signatures',
+      'dense_vectors',
+      'dense_meta'
+    ];
+
+  const openSqlite = (dbPath, label) => {
+    const cached = dbCache?.get?.(dbPath);
+    if (cached) return cached;
+    const db = new Database(dbPath, { readonly: true });
+    const tableRows = db.prepare("SELECT name FROM sqlite_master WHERE type='table'").all();
+    const tableNames = new Set(tableRows.map((row) => row.name));
+    const missing = requiredTables.filter((name) => !tableNames.has(name));
+    if (missing.length) {
+      const message = `SQLite index ${label} is missing required tables (${missing.join(', ')}). Rebuild with npm run build-sqlite-index.`;
+      if (backendForcedSqlite) {
+        throw new Error(message);
+      }
+      console.warn(`${message} Falling back to file-backed indexes.`);
+      db.close();
+      return null;
+    }
+    if (dbCache?.set) dbCache.set(dbPath, db);
+    return db;
+  };
+
+  let vectorAnnWarned = false;
+  const initVectorAnn = (db, mode) => {
+    if (!vectorAnnEnabled || !db) return;
+    const loadResult = loadVectorExtension(db, vectorExtension, `sqlite ${mode}`);
+    if (!loadResult.ok) {
+      if (!vectorAnnWarned) {
+        const extPath = resolveVectorExtensionPath(vectorExtension);
+        console.warn(`[ann] SQLite vector extension unavailable (${loadResult.reason}).`);
+        console.warn(`[ann] Expected extension at ${extPath || 'unset'}; falling back to JS ANN.`);
+        vectorAnnWarned = true;
+      }
+      return;
+    }
+    if (!hasVectorTable(db, vectorExtension.table)) {
+      if (!vectorAnnWarned) {
+        console.warn(`[ann] SQLite vector table missing (${vectorExtension.table}). Rebuild with npm run build-sqlite-index.`);
+        vectorAnnWarned = true;
+      }
+      return;
+    }
+    vectorAnnState[mode].available = true;
+  };
+
+  if (needsCode) dbCode = openSqlite(sqliteCodePath, 'code');
+  if (needsProse) dbProse = openSqlite(sqliteProsePath, 'prose');
+  if (needsCode) initVectorAnn(dbCode, 'code');
+  if (needsProse) initVectorAnn(dbProse, 'prose');
+  if ((needsCode && !dbCode) || (needsProse && !dbProse)) {
+    if (dbCode) dbCache?.close ? dbCache.close(sqliteCodePath) : dbCode.close();
+    if (dbProse) dbCache?.close ? dbCache.close(sqliteProsePath) : dbProse.close();
+    dbCode = null;
+    dbProse = null;
+    useSqlite = false;
+  }
+
+  return { useSqlite, dbCode, dbProse, vectorAnnState, vectorAnnUsed };
+}
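+
+// Usage sketch (path is illustrative): the auto-backend policy compares this
+// count against search.sqliteAutoChunkThreshold when picking a backend.
+//   const chunks = await getSqliteChunkCount('/repo/.cache/index-code.db', 'code');
+//   // null means better-sqlite3 is disabled/unavailable or the query failed.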
+
+/**
+ * Probe SQLite chunk counts for auto-backend selection.
+ * @param {string} dbPath
+ * @param {'code'|'prose'} mode
+ * @returns {Promise<number|null>}
+ */
+export async function getSqliteChunkCount(dbPath, mode) {
+  if (parseEnvBool(process.env.PAIROFCLEATS_SQLITE_DISABLED) === true) {
+    return null;
+  }
+  let Database;
+  try {
+    ({ default: Database } = await import('better-sqlite3'));
+  } catch {
+    return null;
+  }
+  let db;
+  try {
+    db = new Database(dbPath, { readonly: true });
+    const row = db.prepare('SELECT COUNT(*) as count FROM chunks WHERE mode = ?').get(mode);
+    return typeof row?.count === 'number' ? row.count : null;
+  } catch {
+    return null;
+  } finally {
+    if (db) {
+      try {
+        db.close();
+      } catch {}
+    }
+  }
+}
diff --git a/src/retrieval/cli.js b/src/retrieval/cli.js
new file mode 100644
index 000000000..c2738c58d
--- /dev/null
+++ b/src/retrieval/cli.js
@@ -0,0 +1,617 @@
+import fsSync from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import {
+  applyAdaptiveDictConfig,
+  DEFAULT_MODEL_ID,
+  getCacheRuntimeConfig,
+  getDictConfig,
+  getMetricsDir,
+  getModelConfig,
+  loadUserConfig,
+  resolveRepoRoot,
+  resolveLmdbPaths,
+  resolveSqlitePaths
+} from '../../tools/dict-utils.js';
+import { queryVectorAnn } from '../../tools/vector-extension.js';
+import { getEnvConfig } from '../shared/env.js';
+import { createError, ERROR_CODES, isErrorCode } from '../shared/error-codes.js';
+import { getSearchUsage, parseSearchArgs } from './cli-args.js';
+import { loadDictionary } from './cli-dictionary.js';
+import { resolveIndexDir } from './cli-index.js';
+import { configureOutputCaches } from './output.js';
+import { createSearchTelemetry } from './cli/telemetry.js';
+import { getMissingFlagMessages, resolveIndexedFileCount } from './cli/options.js';
+import { hasLmdbStore } from './cli/index-loader.js';
+import { applyBranchFilter } from './cli/branch-filter.js';
+import { createBackendContext } from './cli/backend-context.js';
+import { color } from './cli/ansi.js';
+import { resolveBackendSelection } from './cli/policy.js';
+import { normalizeSearchOptions } from './cli/normalize-options.js';
+import { buildQueryPlan } from './cli/query-plan.js';
+import { loadSearchIndexes } from './cli/load-indexes.js';
+import { runSearchSession } from './cli/run-search-session.js';
+import { renderSearchOutput } from './cli/render.js';
+import { recordSearchArtifacts } from './cli/persist.js';
+
+const loadIndexState = (rootDir, userConfig, mode) => {
+  try {
+    const dir = resolveIndexDir(rootDir, mode, userConfig);
+    const statePath = path.join(dir, 'index_state.json');
+    if (!fsSync.existsSync(statePath)) return null;
+    return JSON.parse(fsSync.readFileSync(statePath, 'utf8'));
+  } catch {
+    return null;
+  }
+};
+
+const isSqliteReady = (state) => {
+  if (!state?.sqlite) return true;
+  return state.sqlite.ready !== false && state.sqlite.pending !== true;
+};
+
+const isLmdbReady = (state) => {
+  if (!state?.lmdb) return true;
+  return state.lmdb.ready !== false && state.lmdb.pending !== true;
+};
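+
+// Programmatic usage sketch (the options shown are inferred from this
+// function's signature, not a documented stable API):
+//   const payload = await runSearchCli(['auth middleware', '--json'], {
+//     emitOutput: false, exitOnError: false, root: '/path/to/repo'
+//   });
+// With emitOutput/exitOnError disabled, failures surface as thrown
+// error-code objects instead of console output and process.exit.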
+export async function runSearchCli(rawArgs = process.argv.slice(2), options = {}) {
+  const telemetry = createSearchTelemetry();
+  const recordSearchMetrics = (status) => telemetry.record(status);
+  const emitOutput = options.emitOutput !== false;
+  const exitOnError = options.exitOnError !== false;
+  const indexCache = options.indexCache || null;
+  const sqliteCache = options.sqliteCache || null;
+  const t0 = Date.now();
+
+  const inferJsonOutputFromArgs = () => {
+    if (!Array.isArray(rawArgs)) return { jsonOutput: false, jsonCompact: false };
+    const hasFlag = (name) =>
+      rawArgs.some((arg) => typeof arg === 'string' && (arg === name || arg.startsWith(`${name}=`)));
+    const jsonCompact = hasFlag('--json-compact');
+    const jsonOutput = hasFlag('--json') || jsonCompact;
+    return { jsonOutput, jsonCompact };
+  };
+
+  let argv;
+  try {
+    argv = parseSearchArgs(rawArgs);
+  } catch (err) {
+    recordSearchMetrics('error');
+    const { jsonOutput } = inferJsonOutputFromArgs();
+    const message = err && typeof err.message === 'string' && err.message.trim()
+      ? err.message
+      : 'Invalid arguments.';
+
+    if (emitOutput) {
+      if (jsonOutput) {
+        console.log(JSON.stringify({ ok: false, code: ERROR_CODES.INVALID_REQUEST, message }, null, 2));
+      } else {
+        console.error(message);
+      }
+    }
+
+    if (exitOnError) process.exit(1);
+
+    const error = createError(ERROR_CODES.INVALID_REQUEST, message);
+    error.emitted = true;
+    error.cause = err;
+    throw error;
+  }
+
+  const jsonCompact = argv['json-compact'] === true;
+  const jsonOutput = argv.json || jsonCompact;
+  const rootOverride = options.root ? path.resolve(options.root) : null;
+  const rootArg = rootOverride || (argv.repo ? path.resolve(argv.repo) : null);
+  const rootDir = rootArg || resolveRepoRoot(process.cwd());
+  const userConfig = loadUserConfig(rootDir);
+  const cacheConfig = getCacheRuntimeConfig(rootDir, userConfig);
+  const envConfig = getEnvConfig();
+  const verboseCache = envConfig.verbose === true;
+  const cacheLog = verboseCache ? (msg) => process.stderr.write(`\n${msg}\n`) : null;
+
+  configureOutputCaches({ cacheConfig, verbose: verboseCache, log: cacheLog });
+
+  const emitError = (message, errorCode) => {
+    if (!emitOutput || !message) return;
+    if (jsonOutput) {
+      console.log(JSON.stringify({ ok: false, code: errorCode, message }, null, 2));
+    } else {
+      console.error(message);
+    }
+  };
+  const bail = (message, code = 1, errorCode = ERROR_CODES.INTERNAL) => {
+    const resolvedCode = isErrorCode(errorCode) ? errorCode : ERROR_CODES.INTERNAL;
+    emitError(message, resolvedCode);
+    if (exitOnError) process.exit(code);
+    recordSearchMetrics('error');
+    const error = createError(resolvedCode, message || 'Search failed.');
+    error.emitted = true;
+    throw error;
+  };
+
+  try {
+    const missingValueMessages = getMissingFlagMessages(argv, rawArgs);
+    if (missingValueMessages.length) {
+      return bail(missingValueMessages.join('\n'), 1, ERROR_CODES.INVALID_REQUEST);
+    }
+
+    const metricsDir = getMetricsDir(rootDir, userConfig);
+    let normalized;
+    try {
+      normalized = normalizeSearchOptions({
+        argv,
+        rawArgs,
+        rootDir,
+        userConfig,
+        envConfig,
+        metricsDir
+      });
+    } catch (err) {
+      return bail(err.message, 1, ERROR_CODES.INVALID_REQUEST);
+    }
+
+    if (normalized.missingValueMessages.length) {
+      return bail(normalized.missingValueMessages.join('\n'), 1, ERROR_CODES.INVALID_REQUEST);
+    }
+
+    const {
+      query,
+      searchType,
+      searchAuthor,
+      searchImport,
+      chunkAuthorFilter,
+      searchMode,
+      runCode,
+      runProse,
+      runRecords,
+      runExtractedProse: runExtractedProseRaw,
+      embeddingProvider,
+      embeddingOnnx,
+      hnswConfig,
+      sqliteConfig,
+      sqliteAutoChunkThreshold,
+      sqliteAutoArtifactBytes,
+      postingsConfig,
+      filePrefilterEnabled,
+      searchRegexConfig,
+      fileChargramN,
+      vectorExtension,
+      bm25K1,
+      bm25B,
+      branchesMin,
+      loopsMin,
+      breaksMin,
+      continuesMin,
+      churnMin,
+      modifiedAfter,
+      modifiedSinceDays,
+      fileFilter,
+      caseFile,
+      caseTokens,
+      branchFilter,
+      extFilter,
+      metaFilters,
+      annEnabled,
+      scoreBlendEnabled,
+      scoreBlendSparseWeight,
+      scoreBlendAnnWeight,
+      symbolBoostEnabled,
+      symbolBoostDefinitionWeight,
+      symbolBoostExportWeight,
+      minhashMaxDocs,
+      queryCacheEnabled,
+      queryCacheMaxEntries,
+      queryCacheTtlMs,
+      rrfEnabled,
+      rrfK,
+      contextExpansionEnabled,
+      contextExpansionOptions,
+      contextExpansionRespectFilters,
+      sqliteFtsNormalize,
+      sqliteFtsProfile,
+      sqliteFtsWeights,
+      fieldWeightsConfig,
+      explain,
+      denseVectorMode,
+      backendArg
+    } = normalized;
+
+    if (!query) {
+      return bail(getSearchUsage(), 1, ERROR_CODES.INVALID_REQUEST);
+    }
+
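+    // Tag telemetry with mode and ANN state before any backend work starts.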
+    telemetry.setMode(searchMode);
+    telemetry.setAnn(annEnabled ? 'on' : 'off');
+
+    const modelConfig = getModelConfig(rootDir, userConfig);
+    const modelIdDefault = argv.model || modelConfig.id || DEFAULT_MODEL_ID;
+    const useStubEmbeddings = envConfig.embeddings === 'stub';
+    const topN = argv.n;
+    const showStats = argv.stats === true;
+    const showMatched = argv.matched === true;
+
+    const needsCode = runCode;
+    const needsProse = runProse;
+    const needsSqlite = runCode || runProse;
+    const vectorAnnEnabled = annEnabled && vectorExtension.enabled;
+    const sqliteScoreModeConfig = sqliteConfig.scoreMode === 'fts';
+    const sqliteConfigured = sqliteConfig.use !== false;
+    const lmdbConfigured = userConfig.lmdb?.use !== false;
+
+    const lmdbPaths = resolveLmdbPaths(rootDir, userConfig);
+    const lmdbCodePath = lmdbPaths.codePath;
+    const lmdbProsePath = lmdbPaths.prosePath;
+    const sqlitePaths = resolveSqlitePaths(rootDir, userConfig);
+    const sqliteCodePath = sqlitePaths.codePath;
+    const sqliteProsePath = sqlitePaths.prosePath;
+
+    const sqliteStateCode = needsCode ? loadIndexState(rootDir, userConfig, 'code') : null;
+    const sqliteStateProse = needsProse ? loadIndexState(rootDir, userConfig, 'prose') : null;
+    const sqliteCodeAvailable = fsSync.existsSync(sqliteCodePath) && isSqliteReady(sqliteStateCode);
+    const sqliteProseAvailable = fsSync.existsSync(sqliteProsePath) && isSqliteReady(sqliteStateProse);
+    const sqliteAvailable = (!needsCode || sqliteCodeAvailable) && (!needsProse || sqliteProseAvailable);
+    const lmdbStateCode = sqliteStateCode;
+    const lmdbStateProse = sqliteStateProse;
+    const lmdbCodeAvailable = hasLmdbStore(lmdbCodePath) && isLmdbReady(lmdbStateCode);
+    const lmdbProseAvailable = hasLmdbStore(lmdbProsePath) && isLmdbReady(lmdbStateProse);
+    const lmdbAvailable = (!needsCode || lmdbCodeAvailable) && (!needsProse || lmdbProseAvailable);
+
+    const backendSelection = await resolveBackendSelection({
+      backendArg,
+      sqliteScoreModeConfig,
+      sqliteConfigured,
+      sqliteAvailable,
+      sqliteCodeAvailable,
+      sqliteProseAvailable,
+      sqliteCodePath,
+      sqliteProsePath,
+      lmdbConfigured,
+      lmdbAvailable,
+      lmdbCodeAvailable,
+      lmdbProseAvailable,
+      lmdbCodePath,
+      lmdbProsePath,
+      sqliteAutoChunkThreshold,
+      sqliteAutoArtifactBytes,
+      needsSqlite,
+      needsCode,
+      needsProse,
+      root: rootDir,
+      userConfig,
+      onWarn: console.warn
+    });
+    if (backendSelection.error) {
+      return bail(backendSelection.error.message);
+    }
+
+    const {
+      backendPolicy,
+      useSqlite: useSqliteSelection,
+      useLmdb: useLmdbSelection,
+      sqliteFtsRequested,
+      backendForcedSqlite,
+      backendForcedLmdb
+    } = backendSelection;
+
+    const backendContext = await createBackendContext({
+      backendPolicy,
+      useSqlite: useSqliteSelection,
+      useLmdb: useLmdbSelection,
+      needsCode,
+      needsProse,
+      sqliteCodePath,
+      sqliteProsePath,
+      sqliteFtsRequested,
+      backendForcedSqlite,
+      backendForcedLmdb,
+      vectorExtension,
+      vectorAnnEnabled,
+      dbCache: sqliteCache,
+      sqliteStates: {
+        code: sqliteStateCode,
+        prose: sqliteStateProse
+      },
+      lmdbCodePath,
+      lmdbProsePath,
+      lmdbStates: {
+        code: lmdbStateCode,
+        prose: lmdbStateProse
+      },
+      postingsConfig,
+      sqliteFtsWeights,
+      queryVectorAnn,
+      modelIdDefault,
+      fileChargramN,
+      hnswConfig,
+      root: rootDir,
+      userConfig
+    });
+
+    const {
+      useSqlite,
+      useLmdb,
+      backendLabel,
+      backendPolicyInfo,
+      vectorAnnState,
+      vectorAnnUsed,
+      sqliteHelpers,
+      lmdbHelpers
+    } = backendContext;
+    telemetry.setBackend(backendLabel);
+
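+    // A --branch filter that misses the current branch short-circuits the
+    // search with an empty payload (see branch-filter.js below).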
+    const branchResult = await applyBranchFilter({
+      branchFilter,
+      caseSensitive: caseFile,
+      root: rootDir,
+      metricsDir,
+      runCode,
+      runProse,
+      backendLabel,
+      backendPolicy: backendPolicyInfo,
+      emitOutput,
+      jsonOutput,
+      recordSearchMetrics,
+      warn: console.warn
+    });
+    if (branchResult?.payload) {
+      return branchResult.payload;
+    }
+
+    const dictConfigBase = getDictConfig(rootDir, userConfig);
+    const dictConfig = applyAdaptiveDictConfig(
+      dictConfigBase,
+      resolveIndexedFileCount(metricsDir, { runCode, runProse, runExtractedProse: runExtractedProseRaw })
+    );
+    const { dict } = await loadDictionary(rootDir, dictConfig);
+
+    const queryPlan = buildQueryPlan({
+      query,
+      argv,
+      dict,
+      dictConfig,
+      postingsConfig,
+      caseTokens,
+      fileFilter,
+      caseFile,
+      searchRegexConfig,
+      filePrefilterEnabled,
+      fileChargramN,
+      searchType,
+      searchAuthor,
+      searchImport,
+      chunkAuthorFilter,
+      branchesMin,
+      loopsMin,
+      breaksMin,
+      continuesMin,
+      churnMin,
+      extFilter,
+      metaFilters,
+      modifiedAfter,
+      modifiedSinceDays,
+      fieldWeightsConfig,
+      denseVectorMode,
+      branchFilter
+    });
+
+    const annActive = annEnabled && queryPlan.queryTokens.length > 0;
+
+    const {
+      loadIndexFromSqlite,
+      buildCandidateSetSqlite,
+      getTokenIndexForQuery,
+      rankSqliteFts,
+      rankVectorAnnSqlite
+    } = sqliteHelpers;
+    const { loadIndexFromLmdb } = lmdbHelpers;
+
+    const {
+      idxProse,
+      idxExtractedProse,
+      idxCode,
+      idxRecords,
+      runExtractedProse,
+      hnswAnnState,
+      hnswAnnUsed,
+      modelIdForCode,
+      modelIdForProse,
+      modelIdForExtractedProse,
+      modelIdForRecords
+    } = loadSearchIndexes({
+      rootDir,
+      userConfig,
+      searchMode,
+      runProse,
+      runExtractedProse: runExtractedProseRaw,
+      runCode,
+      runRecords,
+      useSqlite,
+      useLmdb,
+      emitOutput,
+      exitOnError,
+      annActive,
+      filtersActive: queryPlan.filtersActive,
+      contextExpansionEnabled,
+      sqliteFtsRequested,
+      indexCache,
+      modelIdDefault,
+      fileChargramN,
+      hnswConfig,
+      loadIndexFromSqlite,
+      loadIndexFromLmdb,
+      resolvedDenseVectorMode: queryPlan.resolvedDenseVectorMode
+    });
+
+    const modelIds = {
+      code: modelIdForCode,
+      prose: modelIdForProse,
+      extractedProse: modelIdForExtractedProse,
+      records: modelIdForRecords
+    };
+
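+    // The session runs sparse and (when active) ANN ranking per enabled mode
+    // and returns per-mode hits plus cache info.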
+    const searchResult = await runSearchSession({
+      rootDir,
+      userConfig,
+      metricsDir,
+      query,
+      searchMode,
+      runCode,
+      runProse,
+      runExtractedProse,
+      runRecords,
+      topN,
+      useSqlite,
+      annEnabled,
+      annActive,
+      vectorExtension,
+      vectorAnnEnabled,
+      vectorAnnState,
+      vectorAnnUsed,
+      hnswConfig,
+      hnswAnnState,
+      hnswAnnUsed,
+      sqliteFtsRequested,
+      sqliteFtsNormalize,
+      sqliteFtsProfile,
+      sqliteFtsWeights,
+      sqliteCodePath,
+      sqliteProsePath,
+      bm25K1,
+      bm25B,
+      fieldWeights: queryPlan.fieldWeights,
+      postingsConfig,
+      queryTokens: queryPlan.queryTokens,
+      phraseNgramSet: queryPlan.phraseNgramSet,
+      phraseRange: queryPlan.phraseRange,
+      symbolBoost: {
+        enabled: symbolBoostEnabled,
+        definitionWeight: symbolBoostDefinitionWeight,
+        exportWeight: symbolBoostExportWeight
+      },
+      filters: queryPlan.filters,
+      filtersActive: queryPlan.filtersActive,
+      scoreBlend: {
+        enabled: scoreBlendEnabled,
+        sparseWeight: scoreBlendSparseWeight,
+        annWeight: scoreBlendAnnWeight
+      },
+      rrf: {
+        enabled: rrfEnabled,
+        k: rrfK
+      },
+      minhashMaxDocs,
+      buildCandidateSetSqlite,
+      getTokenIndexForQuery,
+      rankSqliteFts,
+      rankVectorAnnSqlite,
+      idxProse,
+      idxExtractedProse,
+      idxCode,
+      idxRecords,
+      modelConfig,
+      modelIds,
+      embeddingProvider,
+      embeddingOnnx,
+      embeddingQueryText: queryPlan.embeddingQueryText,
+      useStubEmbeddings,
+      contextExpansionEnabled,
+      contextExpansionOptions,
+      contextExpansionRespectFilters,
+      cacheFilters: queryPlan.cacheFilters,
+      queryCacheEnabled,
+      queryCacheMaxEntries,
+      queryCacheTtlMs,
+      backendLabel,
+      resolvedDenseVectorMode: queryPlan.resolvedDenseVectorMode,
+      intentInfo: queryPlan.intentInfo
+    });
+
+    const elapsedMs = Date.now() - t0;
+
+    const payload = renderSearchOutput({
+      emitOutput,
+      jsonOutput,
+      jsonCompact,
+      explain,
+      color,
+      rootDir,
+      backendLabel,
+      backendPolicyInfo,
+      runCode,
+      runProse,
+      runExtractedProse,
+      runRecords,
+      topN,
+      queryTokens: queryPlan.queryTokens,
+      highlightRegex: queryPlan.highlightRegex,
+      contextExpansionEnabled,
+      expandedHits: {
+        prose: searchResult.proseExpanded,
+        extractedProse: searchResult.extractedProseExpanded,
+        code: searchResult.codeExpanded,
+        records: searchResult.recordExpanded
+      },
+      baseHits: {
+        proseHits: searchResult.proseHits,
+        extractedProseHits: searchResult.extractedProseHits,
+        codeHits: searchResult.codeHits,
+        recordHits: searchResult.recordHits
+      },
+      annEnabled,
+      annActive,
+      annBackend: searchResult.annBackend,
+      vectorExtension,
+      vectorAnnEnabled,
+      vectorAnnState,
+      vectorAnnUsed,
+      hnswConfig,
+      hnswAnnState,
+      modelIds,
+      embeddingProvider,
+      embeddingOnnx,
+      cacheInfo: searchResult.cache,
+      intentInfo: queryPlan.intentInfo,
+      resolvedDenseVectorMode: queryPlan.resolvedDenseVectorMode,
+      fieldWeights: queryPlan.fieldWeights,
+      contextExpansionStats: searchResult.contextExpansionStats,
+      idxProse,
+      idxCode,
+      idxRecords,
+      showStats,
+      showMatched,
+      verboseCache,
+      elapsedMs
+    });
+
+    await recordSearchArtifacts({
+      metricsDir,
+      query,
+      queryTokens: queryPlan.queryTokens,
+      proseHits: searchResult.proseHits,
+      codeHits: searchResult.codeHits,
+      recordHits: searchResult.recordHits,
+      elapsedMs,
+      cacheHit: searchResult.cache.hit
+    });
+
+    recordSearchMetrics('ok');
+    return payload;
+  } catch (err) {
+    recordSearchMetrics('error');
+    if (emitOutput && jsonOutput && !err?.emitted) {
+      const message = err?.message || 'Search failed.';
+      const code = isErrorCode(err?.code) ? err.code : ERROR_CODES.INTERNAL;
+      console.log(JSON.stringify({ ok: false, code, message }, null, 2));
+      if (err) err.emitted = true;
+    }
+    throw err;
+  }
+}
+
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  runSearchCli().catch((err) => {
+    console.error(err?.message || err);
+    process.exit(1);
+  });
+}
diff --git a/src/retrieval/cli/ansi.js b/src/retrieval/cli/ansi.js
new file mode 100644
index 000000000..82328bf51
--- /dev/null
+++ b/src/retrieval/cli/ansi.js
@@ -0,0 +1,11 @@
+export const color = {
+  green: (text) => `\x1b[32m${text}\x1b[0m`,
+  yellow: (text) => `\x1b[33m${text}\x1b[0m`,
+  red: (text) => `\x1b[31m${text}\x1b[0m`,
+  cyan: (text) => `\x1b[36m${text}\x1b[0m`,
+  magenta: (text) => `\x1b[35m${text}\x1b[0m`,
+  blue: (text) => `\x1b[34m${text}\x1b[0m`,
+  gray: (text) => `\x1b[90m${text}\x1b[0m`,
+  bold: (text) => `\x1b[1m${text}\x1b[0m`,
+  underline: (text) => `\x1b[4m${text}\x1b[0m`
+};
diff --git a/src/retrieval/cli/backend-context.js b/src/retrieval/cli/backend-context.js
new file mode 100644
index 000000000..8d3a8c20f
--- /dev/null
+++ b/src/retrieval/cli/backend-context.js
@@ -0,0 +1,128 @@
+import { createLmdbBackend } from '../cli-lmdb.js';
+import { createSqliteBackend } from '../cli-sqlite.js';
+import { resolveIndexDir } from '../cli-index.js';
+import { createLmdbHelpers } from '../lmdb-helpers.js';
+import { createSqliteHelpers } from '../sqlite-helpers.js';
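+
+// Opens both candidate backends and prefers SQLite when it initializes,
+// giving the resolution order: sqlite (or sqlite-fts) -> lmdb -> memory
+// (file-backed artifacts). Helper factories receive mode-aware db getters.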
+
+export const createBackendContext = async ({
+  backendPolicy,
+  useSqlite: useSqliteInput,
+  useLmdb: useLmdbInput,
+  needsCode,
+  needsProse,
+  sqliteCodePath,
+  sqliteProsePath,
+  sqliteFtsRequested,
+  backendForcedSqlite,
+  backendForcedLmdb,
+  vectorExtension,
+  vectorAnnEnabled,
+  dbCache,
+  sqliteStates,
+  lmdbCodePath,
+  lmdbProsePath,
+  lmdbStates,
+  postingsConfig,
+  sqliteFtsWeights,
+  queryVectorAnn,
+  modelIdDefault,
+  fileChargramN,
+  hnswConfig,
+  root,
+  userConfig
+}) => {
+  const lmdbBackend = await createLmdbBackend({
+    useLmdb: useLmdbInput,
+    needsCode,
+    needsProse,
+    lmdbCodePath,
+    lmdbProsePath,
+    backendForcedLmdb,
+    lmdbStates
+  });
+  let useLmdb = lmdbBackend.useLmdb;
+
+  const sqliteBackend = await createSqliteBackend({
+    useSqlite: useSqliteInput,
+    needsCode,
+    needsProse,
+    sqliteCodePath,
+    sqliteProsePath,
+    sqliteFtsRequested,
+    backendForcedSqlite,
+    vectorExtension,
+    vectorAnnEnabled,
+    dbCache,
+    sqliteStates
+  });
+  let useSqlite = sqliteBackend.useSqlite;
+  let dbCode = sqliteBackend.dbCode;
+  let dbProse = sqliteBackend.dbProse;
+  let lmdbCode = lmdbBackend.dbCode;
+  let lmdbProse = lmdbBackend.dbProse;
+
+  if (useSqlite) {
+    useLmdb = false;
+    lmdbCode = null;
+    lmdbProse = null;
+  }
+
+  const vectorAnnState = sqliteBackend.vectorAnnState;
+  const vectorAnnUsed = sqliteBackend.vectorAnnUsed;
+  const backendLabel = useSqlite
+    ? (sqliteFtsRequested ? 'sqlite-fts' : 'sqlite')
+    : (useLmdb ? 'lmdb' : 'memory');
+  const backendPolicyInfo = backendPolicy ? { ...backendPolicy, backendLabel } : { backendLabel };
+
+  const getSqliteDb = (mode) => {
+    if (!useSqlite) return null;
+    if (mode === 'code') return dbCode;
+    if (mode === 'prose') return dbProse;
+    return null;
+  };
+
+  const getLmdbDb = (mode) => {
+    if (!useLmdb) return null;
+    if (mode === 'code') return lmdbCode;
+    if (mode === 'prose') return lmdbProse;
+    return null;
+  };
+
+  const sqliteHelpers = createSqliteHelpers({
+    getDb: getSqliteDb,
+    postingsConfig,
+    sqliteFtsWeights,
+    vectorExtension,
+    vectorAnnState,
+    queryVectorAnn,
+    modelIdDefault,
+    fileChargramN
+  });
+
+  const lmdbIndexDirs = {
+    code: resolveIndexDir(root, 'code', userConfig),
+    prose: resolveIndexDir(root, 'prose', userConfig)
+  };
+  const lmdbHelpers = createLmdbHelpers({
+    getDb: getLmdbDb,
+    hnswConfig,
+    modelIdDefault,
+    fileChargramN,
+    indexDirs: lmdbIndexDirs
+  });
+
+  return {
+    useSqlite,
+    useLmdb,
+    dbCode,
+    dbProse,
+    lmdbCode,
+    lmdbProse,
+    backendLabel,
+    backendPolicyInfo,
+    vectorAnnState,
+    vectorAnnUsed,
+    sqliteHelpers,
+    lmdbHelpers
+  };
+};
diff --git a/src/retrieval/cli/branch-filter.js b/src/retrieval/cli/branch-filter.js
new file mode 100644
index 000000000..06e643cae
--- /dev/null
+++ b/src/retrieval/cli/branch-filter.js
@@ -0,0 +1,70 @@
+import simpleGit from 'simple-git';
+import { loadBranchFromMetrics } from './options.js';
+
+export const resolveRepoBranch = async ({ root, metricsDir, runCode, runProse }) => {
+  const fromMetrics = runCode ? loadBranchFromMetrics(metricsDir, 'code') : null;
+  const fromProse = !fromMetrics && runProse ? loadBranchFromMetrics(metricsDir, 'prose') : null;
+  if (fromMetrics || fromProse) return fromMetrics || fromProse;
+  try {
+    const git = simpleGit(root);
+    const status = await git.status();
+    return status.current || null;
+  } catch {
+    return null;
+  }
+};
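+
+// Returns { matched, repoBranch, payload }; a non-null payload is the final
+// empty result set the CLI should emit when the branch filter misses. An
+// unresolvable branch only warns and lets the search proceed.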
+export const applyBranchFilter = async ({
+  branchFilter,
+  caseSensitive,
+  root,
+  metricsDir,
+  runCode,
+  runProse,
+  backendLabel,
+  backendPolicy,
+  emitOutput,
+  jsonOutput,
+  recordSearchMetrics,
+  warn = console.warn,
+  repoBranch: repoBranchInput,
+  resolveBranch
+} = {}) => {
+  if (!branchFilter) {
+    return { matched: true, repoBranch: null, payload: null };
+  }
+  const resolve = resolveBranch || resolveRepoBranch;
+  const repoBranch = repoBranchInput ?? await resolve({ root, metricsDir, runCode, runProse });
+  const normalizedBranch = caseSensitive ? branchFilter : branchFilter.toLowerCase();
+  const normalizedRepo = repoBranch ? (caseSensitive ? repoBranch : repoBranch.toLowerCase()) : null;
+  const branchMatches = normalizedRepo ? normalizedRepo === normalizedBranch : true;
+  if (repoBranch && !branchMatches) {
+    const payload = {
+      backend: backendLabel,
+      prose: [],
+      code: [],
+      records: [],
+      stats: {
+        branch: repoBranch,
+        branchFilter,
+        branchMatch: false,
+        backendPolicy
+      }
+    };
+    if (emitOutput) {
+      if (jsonOutput) {
+        console.log(JSON.stringify(payload, null, 2));
+      } else {
+        console.log(`Branch filter ${branchFilter} did not match current branch ${repoBranch}; returning no results.`);
+      }
+    }
+    if (recordSearchMetrics) {
+      recordSearchMetrics('ok');
+    }
+    return { matched: false, repoBranch, payload };
+  }
+  if (!repoBranch && warn) {
+    warn('Branch filter requested but repo branch is unavailable; continuing without branch validation.');
+  }
+  return { matched: true, repoBranch, payload: null };
+};
diff --git a/src/retrieval/cli/highlight.js b/src/retrieval/cli/highlight.js
new file mode 100644
index 000000000..ce45e7486
--- /dev/null
+++ b/src/retrieval/cli/highlight.js
@@ -0,0 +1,12 @@
+const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+
+export function buildHighlightRegex(queryTokens) {
+  const highlightTokens = [...new Set(queryTokens.map((tok) => tok.trim()).filter(Boolean))];
+  if (!highlightTokens.length) return null;
+  try {
+    const pattern = highlightTokens.map((tok) => escapeRegExp(tok)).join('|');
+    return pattern ? new RegExp(`(${pattern})`, 'ig') : null;
+  } catch {
+    return null;
+  }
+}
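+
+// Example: buildHighlightRegex(['foo', 'Bar']) -> /(foo|Bar)/ig; tokens are
+// trimmed, deduplicated, and regex-escaped, and an empty list yields null.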
diff --git a/src/retrieval/cli/index-loader.js b/src/retrieval/cli/index-loader.js
new file mode 100644
index 000000000..14eb67a4f
--- /dev/null
+++ b/src/retrieval/cli/index-loader.js
@@ -0,0 +1,98 @@
+import fs from 'node:fs';
+import path from 'node:path';
+import { loadIndexWithCache } from '../index-cache.js';
+import { resolveIndexDir } from '../cli-index.js';
+
+export function hasLmdbStore(storePath) {
+  if (!storePath || !fs.existsSync(storePath)) return false;
+  return fs.existsSync(path.join(storePath, 'data.mdb'));
+}
+
+export function loadIndexCached({
+  indexCache,
+  dir,
+  modelIdDefault,
+  fileChargramN,
+  includeHnsw = true,
+  hnswConfig,
+  loadIndex
+}) {
+  return loadIndexWithCache(
+    indexCache,
+    dir,
+    {
+      modelIdDefault,
+      fileChargramN,
+      includeHnsw,
+      hnswConfig
+    },
+    loadIndex
+  );
+}
+
+export function hasIndexMeta(dir) {
+  if (!dir) return false;
+  const metaPath = path.join(dir, 'chunk_meta.json');
+  const metaJsonlPath = path.join(dir, 'chunk_meta.jsonl');
+  const metaPartsPath = path.join(dir, 'chunk_meta.meta.json');
+  const metaPartsDir = path.join(dir, 'chunk_meta.parts');
+  return fs.existsSync(metaPath)
+    || fs.existsSync(metaJsonlPath)
+    || fs.existsSync(metaPartsPath)
+    || fs.existsSync(metaPartsDir);
+}
+
+export function warnPendingState(idx, label, { emitOutput, useSqlite, annActive }) {
+  if (!emitOutput) return;
+  const state = idx?.state;
+  if (!state || useSqlite) return;
+  if (state.enrichment?.pending) {
+    console.warn(`[search] ${label} index enrichment pending (stage1).`);
+  }
+  if (annActive && state.embeddings?.enabled && state.embeddings.ready === false) {
+    console.warn(`[search] ${label} embeddings pending; ANN may be limited.`);
+  }
+}
+
+export function resolveDenseVector(idx, mode, denseVectorMode) {
+  if (!idx) return null;
+  if (denseVectorMode === 'code') return idx.denseVecCode || idx.denseVec || null;
+  if (denseVectorMode === 'doc') return idx.denseVecDoc || idx.denseVec || null;
+  if (denseVectorMode === 'auto') {
+    if (mode === 'code') return idx.denseVecCode || idx.denseVec || null;
+    if (mode === 'prose' || mode === 'extracted-prose') {
+      return idx.denseVecDoc || idx.denseVec || null;
+    }
+  }
+  return idx.denseVec || null;
+}
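+
+// Selection sketch: 'code' -> denseVecCode, 'doc' -> denseVecDoc, and 'auto'
+// picks by index mode; every branch falls back to the merged denseVec.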
+
+export function loadFileRelations(rootDir, userConfig, mode) {
+  try {
+    const dir = resolveIndexDir(rootDir, mode, userConfig);
+    const relPath = path.join(dir, 'file_relations.json');
+    if (!fs.existsSync(relPath)) return null;
+    const raw = JSON.parse(fs.readFileSync(relPath, 'utf8'));
+    if (!Array.isArray(raw)) return null;
+    const map = new Map();
+    for (const entry of raw) {
+      if (!entry?.file) continue;
+      map.set(entry.file, entry.relations || null);
+    }
+    return map;
+  } catch {
+    return null;
+  }
+}
+
+export function loadRepoMap(rootDir, userConfig, mode) {
+  try {
+    const dir = resolveIndexDir(rootDir, mode, userConfig);
+    const mapPath = path.join(dir, 'repo_map.json');
+    if (!fs.existsSync(mapPath)) return null;
+    const raw = JSON.parse(fs.readFileSync(mapPath, 'utf8'));
+    return Array.isArray(raw) ? raw : null;
+  } catch {
+    return null;
+  }
+}
diff --git a/src/retrieval/cli/load-indexes.js b/src/retrieval/cli/load-indexes.js
new file mode 100644
index 000000000..d98028e9f
--- /dev/null
+++ b/src/retrieval/cli/load-indexes.js
@@ -0,0 +1,189 @@
+import {
+  hasIndexMeta,
+  loadFileRelations,
+  loadIndexCached,
+  loadRepoMap,
+  resolveDenseVector,
+  warnPendingState
+} from './index-loader.js';
+import { loadIndex, requireIndexDir, resolveIndexDir } from '../cli-index.js';
+import { resolveModelIds } from './model-ids.js';
+
+const EMPTY_INDEX = { chunkMeta: [], denseVec: null, minhash: null };
+
+export function loadSearchIndexes({
+  rootDir,
+  userConfig,
+  searchMode,
+  runProse,
+  runExtractedProse,
+  runCode,
+  runRecords,
+  useSqlite,
+  useLmdb,
+  emitOutput,
+  exitOnError,
+  annActive,
+  filtersActive,
+  contextExpansionEnabled,
+  sqliteFtsRequested,
+  indexCache,
+  modelIdDefault,
+  fileChargramN,
+  hnswConfig,
+  loadIndexFromSqlite,
+  loadIndexFromLmdb,
+  resolvedDenseVectorMode
+}) {
+  const sqliteLazyChunks = sqliteFtsRequested && !filtersActive;
+  const sqliteContextChunks = contextExpansionEnabled ? true : !sqliteLazyChunks;
+
+  const proseDir = runProse && !useSqlite
+    ? requireIndexDir(rootDir, 'prose', userConfig, { emitOutput, exitOnError })
+    : null;
+  const codeDir = runCode && !useSqlite
+    ? requireIndexDir(rootDir, 'code', userConfig, { emitOutput, exitOnError })
+    : null;
+  const recordsDir = runRecords
+    ? requireIndexDir(rootDir, 'records', userConfig, { emitOutput, exitOnError })
+    : null;
+
+  const loadIndexCachedLocal = (dir, includeHnsw = true) => loadIndexCached({
+    indexCache,
+    dir,
+    modelIdDefault,
+    fileChargramN,
+    includeHnsw,
+    hnswConfig,
+    loadIndex
+  });
+
+  let extractedProseDir = null;
+  let resolvedRunExtractedProse = runExtractedProse;
+  if (resolvedRunExtractedProse) {
+    if (searchMode === 'extracted-prose') {
+      extractedProseDir = requireIndexDir(rootDir, 'extracted-prose', userConfig, { emitOutput, exitOnError });
+    } else {
+      extractedProseDir = resolveIndexDir(rootDir, 'extracted-prose', userConfig);
+      if (!hasIndexMeta(extractedProseDir)) {
+        resolvedRunExtractedProse = false;
+        if (emitOutput) {
+          console.warn('[search] extracted-prose index not found; skipping.');
+        }
+      }
+    }
+  }
+
+  const idxProse = runProse
+    ? (useSqlite ? loadIndexFromSqlite('prose', {
+      includeDense: annActive,
+      includeMinhash: annActive,
+      includeChunks: sqliteContextChunks,
+      includeFilterIndex: filtersActive
+    }) : (useLmdb ? loadIndexFromLmdb('prose', {
+      includeDense: annActive,
+      includeMinhash: annActive,
+      includeChunks: true,
+      includeFilterIndex: filtersActive
+    }) : loadIndexCachedLocal(proseDir, annActive)))
+    : { ...EMPTY_INDEX };
+  const idxExtractedProse = resolvedRunExtractedProse
+    ? loadIndexCachedLocal(extractedProseDir, annActive)
+    : { ...EMPTY_INDEX };
+  const idxCode = runCode
+    ? (useSqlite ? loadIndexFromSqlite('code', {
+      includeDense: annActive,
+      includeMinhash: annActive,
+      includeChunks: sqliteContextChunks,
+      includeFilterIndex: filtersActive
+    }) : (useLmdb ? loadIndexFromLmdb('code', {
+      includeDense: annActive,
+      includeMinhash: annActive,
+      includeChunks: true,
+      includeFilterIndex: filtersActive
+    }) : loadIndexCachedLocal(codeDir, annActive)))
+    : { ...EMPTY_INDEX };
+  const idxRecords = runRecords
+    ? loadIndexCachedLocal(recordsDir, annActive)
+    : { ...EMPTY_INDEX };
+
+  warnPendingState(idxCode, 'code', { emitOutput, useSqlite, annActive });
+  warnPendingState(idxProse, 'prose', { emitOutput, useSqlite, annActive });
+  warnPendingState(idxExtractedProse, 'extracted-prose', { emitOutput, useSqlite, annActive });
+
+  const hnswAnnState = {
+    code: { available: Boolean(idxCode?.hnsw?.available) },
+    prose: { available: Boolean(idxProse?.hnsw?.available) },
+    records: { available: Boolean(idxRecords?.hnsw?.available) },
+    'extracted-prose': { available: Boolean(idxExtractedProse?.hnsw?.available) }
+  };
+  const hnswAnnUsed = {
+    code: false,
+    prose: false,
+    records: false,
+    'extracted-prose': false
+  };
+
+  if (runCode) {
+    idxCode.denseVec = resolveDenseVector(idxCode, 'code', resolvedDenseVectorMode);
+    if ((useSqlite || useLmdb) && !idxCode.fileRelations) {
+      idxCode.fileRelations = loadFileRelations(rootDir, userConfig, 'code');
+    }
+    if ((useSqlite || useLmdb) && !idxCode.repoMap) {
+      idxCode.repoMap = loadRepoMap(rootDir, userConfig, 'code');
+    }
+  }
+  if (runProse) {
+    idxProse.denseVec = resolveDenseVector(idxProse, 'prose', resolvedDenseVectorMode);
+    if ((useSqlite || useLmdb) && !idxProse.fileRelations) {
+      idxProse.fileRelations = loadFileRelations(rootDir, userConfig, 'prose');
+    }
+    if ((useSqlite || useLmdb) && !idxProse.repoMap) {
+      idxProse.repoMap = loadRepoMap(rootDir, userConfig, 'prose');
+    }
+  }
+  if (resolvedRunExtractedProse) {
+    idxExtractedProse.denseVec = resolveDenseVector(
+      idxExtractedProse,
+      'extracted-prose',
+      resolvedDenseVectorMode
+    );
+    if (!idxExtractedProse.fileRelations) {
+      idxExtractedProse.fileRelations = loadFileRelations(rootDir, userConfig, 'extracted-prose');
+    }
+    if (!idxExtractedProse.repoMap) {
+      idxExtractedProse.repoMap = loadRepoMap(rootDir, userConfig, 'extracted-prose');
+    }
+  }
+
+  const {
+    modelIdForCode,
+    modelIdForProse,
+    modelIdForExtractedProse,
+    modelIdForRecords
+  } = resolveModelIds({
+    modelIdDefault,
+    runCode,
+    runProse,
+    runExtractedProse: resolvedRunExtractedProse,
+    runRecords,
+    idxCode,
+    idxProse,
+    idxExtractedProse,
+    idxRecords
+  });
+
+  return {
+    idxProse,
+    idxExtractedProse,
+    idxCode,
+    idxRecords,
+    runExtractedProse: resolvedRunExtractedProse,
+    hnswAnnState,
+    hnswAnnUsed,
+    modelIdForCode,
+    modelIdForProse,
+    modelIdForExtractedProse,
+    modelIdForRecords
+  };
+}
diff --git a/src/retrieval/cli/model-ids.js b/src/retrieval/cli/model-ids.js
new file mode 100644
index 000000000..7178e9570
--- /dev/null
+++ b/src/retrieval/cli/model-ids.js
@@ -0,0 +1,20 @@
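+// Prefer the model id recorded in each index's dense vectors so query
+// embeddings match what was indexed; fall back to the configured default.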
+export const resolveModelIds = ({
+  modelIdDefault,
+  runCode,
+  runProse,
+  runExtractedProse,
+  runRecords,
+  idxCode,
+  idxProse,
+  idxExtractedProse,
+  idxRecords
+}) => {
+  return {
+    modelIdForCode: runCode ? (idxCode?.denseVec?.model || modelIdDefault) : null,
+    modelIdForProse: runProse ? (idxProse?.denseVec?.model || modelIdDefault) : null,
+    modelIdForExtractedProse: runExtractedProse
+      ? (idxExtractedProse?.denseVec?.model || modelIdDefault)
+      : null,
+    modelIdForRecords: runRecords ? (idxRecords?.denseVec?.model || modelIdDefault) : null
+  };
+};
diff --git a/src/retrieval/cli/normalize-options.js b/src/retrieval/cli/normalize-options.js
new file mode 100644
index 000000000..affc4bdb9
--- /dev/null
+++ b/src/retrieval/cli/normalize-options.js
@@ -0,0 +1,247 @@
+import { getVectorExtensionConfig } from '../../../tools/vector-extension.js';
+import { normalizeHnswConfig } from '../../shared/hnsw.js';
+import { normalizeEmbeddingProvider, normalizeOnnxConfig } from '../../shared/onnx-embeddings.js';
+import { normalizePostingsConfig } from '../../shared/postings-config.js';
+import { resolveFtsWeights } from '../fts.js';
+import { parseJson } from '../query-cache.js';
+import { parseChurnArg, parseModifiedArgs } from '../query-parse.js';
+import { mergeExtFilters, normalizeExtFilter, normalizeLangFilter, parseMetaFilters } from '../filters.js';
+import { resolveSearchMode } from '../cli-args.js';
+import { getMissingFlagMessages, resolveBm25Defaults } from './options.js';
+
+const normalizeOptionalNumber = (value) => (
+  Number.isFinite(Number(value)) ? Number(value) : null
+);
+
+const normalizeOptionalPositive = (value, fallback) => {
+  const parsed = normalizeOptionalNumber(value);
+  if (!Number.isFinite(parsed)) return fallback;
+  return Math.max(0, parsed);
+};
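+
+// Examples: normalizeOptionalPositive('-2', 5) -> 0 (clamped at zero) and
+// normalizeOptionalPositive('abc', 5) -> 5 (fallback on non-numeric input).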
+export function normalizeSearchOptions({
+  argv,
+  rawArgs,
+  rootDir,
+  userConfig,
+  envConfig,
+  metricsDir
+}) {
+  const jsonCompact = argv['json-compact'] === true;
+  const jsonOutput = argv.json || jsonCompact;
+  const missingValueMessages = getMissingFlagMessages(argv, rawArgs);
+  const query = argv._.join(' ').trim();
+
+  const embeddingsConfig = userConfig.indexing?.embeddings || {};
+  const embeddingProvider = normalizeEmbeddingProvider(embeddingsConfig.provider);
+  const embeddingOnnx = normalizeOnnxConfig(embeddingsConfig.onnx || {});
+  const hnswConfig = normalizeHnswConfig(embeddingsConfig.hnsw || {});
+
+  const sqliteConfig = userConfig.sqlite || {};
+  const sqliteAutoChunkThresholdRaw = userConfig.search?.sqliteAutoChunkThreshold;
+  const sqliteAutoChunkThreshold = normalizeOptionalPositive(sqliteAutoChunkThresholdRaw, 0);
+  const sqliteAutoArtifactBytesRaw = userConfig.search?.sqliteAutoArtifactBytes;
+  const sqliteAutoArtifactBytes = normalizeOptionalPositive(sqliteAutoArtifactBytesRaw, 0);
+
+  const postingsConfig = normalizePostingsConfig(userConfig.indexing?.postings || {});
+  const filePrefilterConfig = userConfig.search?.filePrefilter || {};
+  const filePrefilterEnabled = filePrefilterConfig.enabled !== false;
+  const searchRegexConfig = userConfig.search?.regex || null;
+  const fileChargramN = Number.isFinite(Number(filePrefilterConfig.chargramN))
+    ? Math.max(2, Math.floor(Number(filePrefilterConfig.chargramN)))
+    : postingsConfig.chargramMinN;
+
+  const vectorExtension = getVectorExtensionConfig(rootDir, userConfig);
+
+  const contextLines = Math.max(0, parseInt(argv.context, 10) || 0);
+  const searchType = argv.type || null;
+  const searchAuthor = argv.author || null;
+  const searchImport = argv.import || null;
+  const chunkAuthorFilter = argv['chunk-author'] || null;
+
+  const searchModeInfo = resolveSearchMode(argv.mode);
+  const {
+    searchMode,
+    runCode,
+    runProse,
+    runRecords,
+    runExtractedProse: runExtractedProseRaw
+  } = searchModeInfo;
+  const runExtractedProse = runExtractedProseRaw;
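+
+  // BM25 precedence: CLI flag, then user config, then metrics-derived
+  // defaults, then the 1.2 / 0.75 constants.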
+  const bm25Config = userConfig.search?.bm25 || {};
+  const bm25K1Arg = normalizeOptionalNumber(argv['bm25-k1']);
+  const bm25BArg = normalizeOptionalNumber(argv['bm25-b']);
+  const bm25Defaults = resolveBm25Defaults(metricsDir, { runCode, runProse, runExtractedProse });
+  const bm25K1 = bm25K1Arg
+    ?? normalizeOptionalNumber(bm25Config.k1)
+    ?? (bm25Defaults ? bm25Defaults.k1 : null)
+    ?? 1.2;
+  const bm25B = bm25BArg
+    ?? normalizeOptionalNumber(bm25Config.b)
+    ?? (bm25Defaults ? bm25Defaults.b : null)
+    ?? 0.75;
+
+  const branchesMin = normalizeOptionalNumber(argv.branches);
+  const loopsMin = normalizeOptionalNumber(argv.loops);
+  const breaksMin = normalizeOptionalNumber(argv.breaks);
+  const continuesMin = normalizeOptionalNumber(argv.continues);
+  const churnMin = argv.churn ? parseChurnArg(argv.churn) : null;
+  const modifiedArgs = parseModifiedArgs(argv['modified-after'], argv['modified-since']);
+  const modifiedAfter = modifiedArgs.modifiedAfter;
+  const modifiedSinceDays = modifiedArgs.modifiedSinceDays;
+
+  const fileFilters = [];
+  if (argv.path) fileFilters.push(argv.path);
+  if (argv.file) fileFilters.push(argv.file);
+  const fileFilter = fileFilters.length ? fileFilters.flat() : null;
+  const caseAll = argv.case === true;
+  const caseFile = argv['case-file'] === true || caseAll;
+  const caseTokens = argv['case-tokens'] === true || caseAll;
+  const branchFilter = argv.branch ? String(argv.branch).trim() : null;
+
+  const extFilterRaw = normalizeExtFilter(argv.ext);
+  const langFilter = normalizeLangFilter(argv.lang);
+  const extFilter = mergeExtFilters(extFilterRaw, langFilter);
+  const metaFilters = parseMetaFilters(argv.meta, argv['meta-json']);
+
+  const annFlagPresent = rawArgs.includes('--ann') || rawArgs.includes('--no-ann');
+  const annDefault = userConfig.search?.annDefault !== false;
+  const annEnabled = annFlagPresent ? argv.ann : annDefault;
+
+  const scoreBlendConfig = userConfig.search?.scoreBlend || {};
+  const scoreBlendEnabled = scoreBlendConfig.enabled === true;
+  const scoreBlendSparseWeight = normalizeOptionalNumber(scoreBlendConfig.sparseWeight) ?? 1;
+  const scoreBlendAnnWeight = normalizeOptionalNumber(scoreBlendConfig.annWeight) ?? 1;
+
+  const symbolBoostConfig = userConfig.search?.symbolBoost || {};
+  const symbolBoostEnabled = symbolBoostConfig.enabled !== false;
+  const symbolBoostDefinitionWeight = normalizeOptionalNumber(symbolBoostConfig.definitionWeight) ?? 1.2;
+  const symbolBoostExportWeight = normalizeOptionalNumber(symbolBoostConfig.exportWeight) ?? 1.1;
+
+  const minhashMaxDocs = Number.isFinite(Number(userConfig.search?.minhashMaxDocs))
+    ? Math.max(0, Number(userConfig.search.minhashMaxDocs))
+    : 5000;
+
+  const queryCacheConfig = userConfig.search?.queryCache || {};
+  const queryCacheEnabled = queryCacheConfig.enabled === true;
+  const queryCacheMaxEntries = Number.isFinite(Number(queryCacheConfig.maxEntries))
+    ? Math.max(1, Number(queryCacheConfig.maxEntries))
+    : 200;
+  const queryCacheTtlMs = Number.isFinite(Number(queryCacheConfig.ttlMs))
+    ? Math.max(0, Number(queryCacheConfig.ttlMs))
+    : 0;
+
+  const rrfConfig = userConfig.search?.rrf || {};
+  const rrfEnabled = rrfConfig.enabled !== false;
+  const rrfK = Number.isFinite(Number(rrfConfig.k)) ? Math.max(1, Number(rrfConfig.k)) : 60;
+
+  const contextExpansionConfig = userConfig.search?.contextExpansion || {};
+  const contextExpansionEnabled = contextExpansionConfig.enabled === true;
+  const contextExpansionOptions = {
+    maxPerHit: contextExpansionConfig.maxPerHit,
+    maxTotal: contextExpansionConfig.maxTotal,
+    includeCalls: contextExpansionConfig.includeCalls,
+    includeImports: contextExpansionConfig.includeImports,
+    includeExports: contextExpansionConfig.includeExports,
+    includeUsages: contextExpansionConfig.includeUsages
+  };
+  const contextExpansionRespectFilters = contextExpansionConfig.respectFilters !== false;
+
+  const sqliteFtsNormalize = userConfig.search?.sqliteFtsNormalize === true;
+  const sqliteFtsProfile = (argv['fts-profile']
+    || envConfig.ftsProfile
+    || userConfig.search?.sqliteFtsProfile
+    || 'balanced').toLowerCase();
+  let sqliteFtsWeightsConfig = userConfig.search?.sqliteFtsWeights || null;
+  if (argv['fts-weights']) {
+    const parsed = parseJson(argv['fts-weights'], null);
+    if (parsed) {
+      sqliteFtsWeightsConfig = parsed;
+    } else {
+      const values = String(argv['fts-weights'])
+        .split(/[,\s]+/)
+        .filter(Boolean)
+        .map((val) => Number(val))
+        .filter((val) => Number.isFinite(val));
+      sqliteFtsWeightsConfig = values.length ? values : sqliteFtsWeightsConfig;
+    }
+  }
+  const sqliteFtsWeights = resolveFtsWeights(sqliteFtsProfile, sqliteFtsWeightsConfig);
+
+  const explain = argv.explain === true || argv.why === true;
+  const denseVectorMode = typeof userConfig.search?.denseVectorMode === 'string'
+    ? userConfig.search.denseVectorMode.toLowerCase()
+    : 'merged';
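+
+  // Lowercased so policy.js can treat '' and 'auto' as auto-selection.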
+  const backendArg = typeof argv.backend === 'string' ? argv.backend.toLowerCase() : '';
+
+  return {
+    jsonCompact,
+    jsonOutput,
+    missingValueMessages,
+    query,
+    contextLines,
+    searchType,
+    searchAuthor,
+    searchImport,
+    chunkAuthorFilter,
+    searchMode,
+    runCode,
+    runProse,
+    runRecords,
+    runExtractedProse,
+    embeddingsConfig,
+    embeddingProvider,
+    embeddingOnnx,
+    hnswConfig,
+    sqliteConfig,
+    sqliteAutoChunkThreshold,
+    sqliteAutoArtifactBytes,
+    postingsConfig,
+    filePrefilterConfig,
+    filePrefilterEnabled,
+    searchRegexConfig,
+    fileChargramN,
+    vectorExtension,
+    bm25Config,
+    bm25K1,
+    bm25B,
+    branchesMin,
+    loopsMin,
+    breaksMin,
+    continuesMin,
+    churnMin,
+    modifiedAfter,
+    modifiedSinceDays,
+    fileFilter,
+    caseFile,
+    caseTokens,
+    branchFilter,
+    extFilter,
+    metaFilters,
+    annEnabled,
+    scoreBlendEnabled,
+    scoreBlendSparseWeight,
+    scoreBlendAnnWeight,
+    symbolBoostEnabled,
+    symbolBoostDefinitionWeight,
+    symbolBoostExportWeight,
+    minhashMaxDocs,
+    queryCacheEnabled,
+    queryCacheMaxEntries,
+    queryCacheTtlMs,
+    rrfEnabled,
+    rrfK,
+    contextExpansionEnabled,
+    contextExpansionOptions,
+    contextExpansionRespectFilters,
+    sqliteFtsNormalize,
+    sqliteFtsProfile,
+    sqliteFtsWeights,
+    fieldWeightsConfig: userConfig.search?.fieldWeights,
+    explain,
+    denseVectorMode,
+    backendArg
+  };
+}
diff --git a/src/retrieval/cli/options.js b/src/retrieval/cli/options.js
new file mode 100644
index 000000000..fffa6e927
--- /dev/null
+++ b/src/retrieval/cli/options.js
@@ -0,0 +1,139 @@
+import fs from 'node:fs';
+import path from 'node:path';
+
+export function getMissingFlagMessages(argv, rawArgs = []) {
+  const args = Array.isArray(rawArgs) ? rawArgs : [];
+  const hasMissingValue = (flag) => {
+    const flagEq = `${flag}=`;
+    for (let i = 0; i < args.length; i += 1) {
+      const arg = String(args[i] || '');
+      if (arg === flag) {
+        const next = args[i + 1];
+        if (next === undefined) return true;
+        const nextValue = String(next);
+        if (!nextValue.trim() || nextValue.startsWith('-')) return true;
+        continue;
+      }
+      if (arg.startsWith(flagEq)) {
+        const value = arg.slice(flagEq.length);
+        if (!String(value).trim()) return true;
+      }
+    }
+    return false;
+  };
+
+  const missingValueFlags = [
+    { key: 'type', flag: '--type', example: '--type Function' },
+    { key: 'author', flag: '--author', example: '--author "Jane Doe"' },
+    { key: 'import', flag: '--import', example: '--import lodash' }
+  ];
+  return missingValueFlags
+    .filter((entry) => {
+      const value = argv?.[entry.key];
+      if (value === true) return true;
+      if (typeof value === 'string' && !value.trim()) return true;
+      if (value === undefined && hasMissingValue(entry.flag)) return true;
+      return false;
+    })
+    .map((entry) => `Missing value for ${entry.flag}. Example: ${entry.example}`);
+}
+
+export function estimateIndexBytes(indexDir) {
+  if (!indexDir || !fs.existsSync(indexDir)) return 0;
+  const targets = [
+    'chunk_meta.json',
+    'chunk_meta.jsonl',
+    'chunk_meta.meta.json',
+    'token_postings.json',
+    'token_postings.meta.json',
+    'phrase_ngrams.json',
+    'chargram_postings.json',
+    'dense_vectors_uint8.json',
+    'filter_index.json'
+  ];
+  const sumFile = (targetPath) => {
+    try {
+      const stat = fs.statSync(targetPath);
+      return stat.size;
+    } catch {
+      return 0;
+    }
+  };
+  let total = 0;
+  for (const name of targets) {
+    total += sumFile(path.join(indexDir, name));
+  }
+  const chunkMetaPartsDir = path.join(indexDir, 'chunk_meta.parts');
+  if (fs.existsSync(chunkMetaPartsDir)) {
+    for (const entry of fs.readdirSync(chunkMetaPartsDir)) {
+      total += sumFile(path.join(chunkMetaPartsDir, entry));
+    }
+  }
+  const tokenPostingsShardsDir = path.join(indexDir, 'token_postings.shards');
+  if (fs.existsSync(tokenPostingsShardsDir)) {
+    for (const entry of fs.readdirSync(tokenPostingsShardsDir)) {
+      total += sumFile(path.join(tokenPostingsShardsDir, entry));
+    }
+  }
+  return total;
+}
+
+export function resolveIndexedFileCount(metricsRoot, modeFlags) {
+  if (!metricsRoot || !fs.existsSync(metricsRoot)) return null;
+  const modes = [];
+  if (modeFlags?.runCode) modes.push('code');
+  if (modeFlags?.runProse) modes.push('prose');
+  if (modeFlags?.runExtractedProse) modes.push('extracted-prose');
+  if (!modes.length) return null;
+  const counts = [];
+  for (const mode of modes) {
+    const metricsPath = path.join(metricsRoot, `index-${mode}.json`);
+    if (!fs.existsSync(metricsPath)) continue;
+    try {
+      const raw = JSON.parse(fs.readFileSync(metricsPath, 'utf8'));
+      const count = Number(raw?.files?.candidates);
+      if (Number.isFinite(count) && count > 0) counts.push(count);
+    } catch {
+      // ignore
+    }
+  }
+  if (!counts.length) return null;
+  return Math.max(...counts);
+}
+
+export function resolveBm25Defaults(metricsRoot, modeFlags) {
+  if (!metricsRoot || !fs.existsSync(metricsRoot)) return null;
+  const targets = [];
+  if (modeFlags?.runCode) targets.push('code');
+  if (modeFlags?.runProse) targets.push('prose');
+  if (modeFlags?.runExtractedProse) targets.push('extracted-prose');
+  if (!targets.length) return null;
+  const values = [];
+  for (const mode of targets) {
+    const metricsPath = path.join(metricsRoot, `index-${mode}.json`);
+    if (!fs.existsSync(metricsPath)) continue;
+    try {
+      const raw = JSON.parse(fs.readFileSync(metricsPath, 'utf8'));
+      const k1 = Number(raw?.bm25?.k1);
+      const b = Number(raw?.bm25?.b);
+      if (Number.isFinite(k1) && Number.isFinite(b)) values.push({ k1, b });
+    } catch {
+      // ignore
+    }
+  }
+  if (!values.length) return null;
+  const k1 = values.reduce((sum, v) => sum + v.k1, 0) / values.length;
+  const b = values.reduce((sum, v) => sum + v.b, 0) / values.length;
+  return { k1, b };
+}
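+
+// resolveBm25Defaults averages k1/b over the modes that have metrics, e.g.
+// code {k1:1.2, b:0.75} and prose {k1:1.4, b:0.65} -> {k1:1.3, b:0.7}
+// (values illustrative); it returns null when no metrics files exist.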
+
+export function loadBranchFromMetrics(metricsDir, mode) {
+  try {
+    const metricsPath = path.join(metricsDir, `index-${mode}.json`);
+    if (!fs.existsSync(metricsPath)) return null;
+    const raw = JSON.parse(fs.readFileSync(metricsPath, 'utf8'));
+    return raw?.git?.branch || null;
+  } catch {
+    return null;
+  }
+}
diff --git a/src/retrieval/cli/persist.js b/src/retrieval/cli/persist.js
new file mode 100644
index 000000000..eb1714cae
--- /dev/null
+++ b/src/retrieval/cli/persist.js
@@ -0,0 +1,58 @@
+import fs from 'node:fs/promises';
+import path from 'node:path';
+
+export async function recordSearchArtifacts({
+  metricsDir,
+  query,
+  queryTokens,
+  proseHits,
+  codeHits,
+  recordHits,
+  elapsedMs,
+  cacheHit
+}) {
+  try {
+    const metricsPath = path.join(metricsDir, 'metrics.json');
+    const historyPath = path.join(metricsDir, 'searchHistory');
+    const noResultPath = path.join(metricsDir, 'noResultQueries');
+    await fs.mkdir(path.dirname(metricsPath), { recursive: true });
+
+    let metrics = {};
+    try {
+      metrics = JSON.parse(await fs.readFile(metricsPath, 'utf8'));
+    } catch {
+      metrics = {};
+    }
+    const inc = (file, key) => {
+      if (!metrics[file]) metrics[file] = { md: 0, code: 0, records: 0, terms: [] };
+      metrics[file][key] = (metrics[file][key] || 0) + 1;
+      queryTokens.forEach((token) => {
+        if (!metrics[file].terms.includes(token)) metrics[file].terms.push(token);
+      });
+    };
+    proseHits.forEach((hit) => inc(hit.file, 'md'));
+    codeHits.forEach((hit) => inc(hit.file, 'code'));
+    recordHits.forEach((hit) => inc(hit.file, 'records'));
+    await fs.writeFile(metricsPath, JSON.stringify(metrics) + '\n');
+
+    await fs.appendFile(
+      historyPath,
+      JSON.stringify({
+        time: new Date().toISOString(),
+        query,
+        mdFiles: proseHits.length,
+        codeFiles: codeHits.length,
+        recordFiles: recordHits.length,
+        ms: elapsedMs,
+        cached: cacheHit
+      }) + '\n'
+    );
+
+    if (proseHits.length === 0 && codeHits.length === 0 && recordHits.length === 0) {
+      await fs.appendFile(
+        noResultPath,
+        JSON.stringify({ time: new Date().toISOString(), query }) + '\n'
+      );
+    }
+  } catch {}
+}
diff --git a/src/retrieval/cli/policy.js b/src/retrieval/cli/policy.js
new file mode 100644
index 000000000..e3f7583e6
--- /dev/null
+++ b/src/retrieval/cli/policy.js
@@ -0,0 +1,102 @@
+import { getIndexDir } from '../../../tools/dict-utils.js';
+import { resolveBackendPolicy } from '../../storage/backend-policy.js';
+import { getSqliteChunkCount } from '../cli-sqlite.js';
+import { estimateIndexBytes } from './options.js';
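+
+// In auto mode ('' or 'auto'), this probes chunk counts and artifact byte
+// sizes (only when the corresponding thresholds are configured) before
+// delegating to resolveBackendPolicy; an explicit --backend skips the probes,
+// and a policy error reports which per-mode indexes are missing.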
!sqliteProseAvailable) missing.push(`prose=${sqliteProsePath}`); + } + const suffix = missing.length + ? missing.join(', ') + : (backendPolicy.backendLabel === 'lmdb' ? 'missing lmdb index' : 'missing sqlite index'); + return { + backendPolicy, + error: { + message: `${backendPolicy.error} (${suffix}).`, + missing + } + }; + } + + if (!needsSqlite && backendPolicy.backendForcedSqlite) { + onWarn?.('SQLite backend requested, but records-only mode selected; using file-backed records index.'); + } + if (!needsSqlite && backendPolicy.backendForcedLmdb) { + onWarn?.('LMDB backend requested, but records-only mode selected; using file-backed records index.'); + } + if (backendPolicy.backendDisabled) { + onWarn?.(`Unknown backend "${backendArg}". Falling back to memory.`); + } + + let useSqlite = backendPolicy.useSqlite; + let useLmdb = backendPolicy.useLmdb; + if (useLmdb) { + useSqlite = false; + } + + return { + backendPolicy, + useSqlite, + useLmdb, + sqliteFtsRequested: backendPolicy.sqliteFtsRequested, + backendForcedSqlite: backendPolicy.backendForcedSqlite, + backendForcedLmdb: backendPolicy.backendForcedLmdb + }; +}; diff --git a/src/retrieval/cli/query-plan.js b/src/retrieval/cli/query-plan.js new file mode 100644 index 000000000..21279e90b --- /dev/null +++ b/src/retrieval/cli/query-plan.js @@ -0,0 +1,195 @@ +import { hasActiveFilters } from '../filters.js'; +import { buildHighlightRegex } from './highlight.js'; +import { + buildPhraseNgrams, + parseQueryInput, + tokenizePhrase, + tokenizeQueryTerms +} from '../query-parse.js'; +import { + classifyQuery, + resolveIntentFieldWeights, + resolveIntentVectorMode +} from '../query-intent.js'; + +export function buildQueryPlan({ + query, + argv, + dict, + dictConfig, + postingsConfig, + caseTokens, + fileFilter, + caseFile, + searchRegexConfig, + filePrefilterEnabled, + fileChargramN, + searchType, + searchAuthor, + searchImport, + chunkAuthorFilter, + branchesMin, + loopsMin, + breaksMin, + continuesMin, + churnMin, + extFilter, + metaFilters, + modifiedAfter, + modifiedSinceDays, + fieldWeightsConfig, + denseVectorMode, + branchFilter +}) { + const parsedQuery = parseQueryInput(query); + const includeTokens = tokenizeQueryTerms(parsedQuery.includeTerms, dict, { ...dictConfig, caseSensitive: caseTokens }); + const phraseTokens = parsedQuery.phrases + .map((phrase) => tokenizePhrase(phrase, dict, { ...dictConfig, caseSensitive: caseTokens })) + .filter((tokens) => tokens.length); + const phraseInfo = buildPhraseNgrams(phraseTokens, postingsConfig); + const phraseNgrams = phraseInfo.ngrams; + const phraseNgramSet = phraseNgrams.length ? new Set(phraseNgrams) : null; + const phraseRange = { min: phraseInfo.minLen, max: phraseInfo.maxLen }; + const excludeTokens = tokenizeQueryTerms(parsedQuery.excludeTerms, dict, { ...dictConfig, caseSensitive: caseTokens }); + const excludePhraseTokens = parsedQuery.excludePhrases + .map((phrase) => tokenizePhrase(phrase, dict, { ...dictConfig, caseSensitive: caseTokens })) + .filter((tokens) => tokens.length); + const excludePhraseInfo = buildPhraseNgrams(excludePhraseTokens, postingsConfig); + const excludePhraseNgrams = excludePhraseInfo.ngrams; + const excludePhraseRange = excludePhraseInfo.minLen && excludePhraseInfo.maxLen + ? 
{ min: excludePhraseInfo.minLen, max: excludePhraseInfo.maxLen } + : null; + const queryTokens = [...includeTokens, ...phraseTokens.flat()]; + const rx = buildHighlightRegex(queryTokens); + const embeddingQueryText = [...parsedQuery.includeTerms, ...parsedQuery.phrases] + .join(' ') + .trim() || query; + const intentInfo = classifyQuery({ + query, + tokens: queryTokens, + phrases: parsedQuery.phrases, + filters: { file: fileFilter } + }); + const fieldWeights = resolveIntentFieldWeights(fieldWeightsConfig, intentInfo); + const resolvedDenseVectorMode = resolveIntentVectorMode(denseVectorMode, intentInfo); + + const filters = { + type: searchType, + author: searchAuthor, + importName: searchImport, + lint: argv.lint, + churn: churnMin, + calls: argv.calls, + uses: argv.uses, + signature: argv.signature, + param: argv.param, + decorator: argv.decorator, + inferredType: argv['inferred-type'], + returnType: argv['return-type'], + throws: argv.throws, + reads: argv.reads, + writes: argv.writes, + mutates: argv.mutates, + alias: argv.alias, + risk: argv.risk, + riskTag: argv['risk-tag'], + riskSource: argv['risk-source'], + riskSink: argv['risk-sink'], + riskCategory: argv['risk-category'], + riskFlow: argv['risk-flow'], + structPack: argv['struct-pack'], + structRule: argv['struct-rule'], + structTag: argv['struct-tag'], + awaits: argv.awaits, + branches: branchesMin, + loops: loopsMin, + breaks: breaksMin, + continues: continuesMin, + visibility: argv.visibility, + extends: argv.extends, + async: argv.async, + generator: argv.generator, + returns: argv.returns, + file: fileFilter, + caseFile, + caseTokens, + regexConfig: fileFilter ? searchRegexConfig : null, + filePrefilter: { + enabled: filePrefilterEnabled, + chargramN: fileChargramN + }, + ext: extFilter, + meta: metaFilters, + chunkAuthor: chunkAuthorFilter, + modifiedAfter, + excludeTokens, + excludePhrases: excludePhraseNgrams, + excludePhraseRange + }; + const filtersActive = hasActiveFilters(filters); + + const cacheFilters = { + type: searchType, + author: searchAuthor, + calls: argv.calls || null, + uses: argv.uses || null, + signature: argv.signature || null, + param: argv.param || null, + import: searchImport, + lint: argv.lint || false, + churn: churnMin, + decorator: argv.decorator || null, + inferredType: argv['inferred-type'] || null, + returnType: argv['return-type'] || null, + throws: argv.throws || null, + reads: argv.reads || null, + writes: argv.writes || null, + mutates: argv.mutates || null, + risk: argv.risk || null, + riskTag: argv['risk-tag'] || null, + riskSource: argv['risk-source'] || null, + riskSink: argv['risk-sink'] || null, + riskCategory: argv['risk-category'] || null, + riskFlow: argv['risk-flow'] || null, + structPack: argv['struct-pack'] || null, + structRule: argv['struct-rule'] || null, + structTag: argv['struct-tag'] || null, + awaits: argv.awaits || null, + visibility: argv.visibility || null, + extends: argv.extends || null, + async: argv.async || false, + generator: argv.generator || false, + returns: argv.returns || false, + file: fileFilter || null, + ext: extFilter || null, + branch: branchFilter || null, + caseFile, + caseTokens, + regexConfig: fileFilter ? 
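+    // searchRegexConfig only applies when a file filter is present.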
searchRegexConfig : null, + meta: metaFilters, + chunkAuthor: chunkAuthorFilter || null, + modifiedAfter, + modifiedSinceDays + }; + + return { + parsedQuery, + includeTokens, + phraseTokens, + phraseNgrams, + phraseNgramSet, + phraseRange, + excludeTokens, + excludePhraseNgrams, + excludePhraseRange, + queryTokens, + highlightRegex: rx, + embeddingQueryText, + intentInfo, + fieldWeights, + resolvedDenseVectorMode, + filters, + filtersActive, + cacheFilters + }; +} diff --git a/src/retrieval/cli/render-output.js b/src/retrieval/cli/render-output.js new file mode 100644 index 000000000..2dd5c72a2 --- /dev/null +++ b/src/retrieval/cli/render-output.js @@ -0,0 +1,31 @@ +export function compactHit(hit, includeExplain = false) { + if (!hit || typeof hit !== 'object') return hit; + const compact = {}; + const fields = [ + 'id', + 'file', + 'start', + 'end', + 'startLine', + 'endLine', + 'ext', + 'kind', + 'name', + 'headline', + 'score', + 'scoreType', + 'sparseScore', + 'sparseType', + 'annScore', + 'annSource', + 'annType', + 'context' + ]; + for (const field of fields) { + if (hit[field] !== undefined) compact[field] = hit[field]; + } + if (includeExplain && hit.scoreBreakdown !== undefined) { + compact.scoreBreakdown = hit.scoreBreakdown; + } + return compact; +} diff --git a/src/retrieval/cli/render.js b/src/retrieval/cli/render.js new file mode 100644 index 000000000..225a9c323 --- /dev/null +++ b/src/retrieval/cli/render.js @@ -0,0 +1,320 @@ +import { compactHit } from './render-output.js'; +import { formatFullChunk, formatShortChunk, getOutputCacheReporter } from '../output.js'; + +export function renderSearchOutput({ + emitOutput, + jsonOutput, + jsonCompact, + explain, + color, + rootDir, + backendLabel, + backendPolicyInfo, + runCode, + runProse, + runExtractedProse, + runRecords, + topN, + queryTokens, + highlightRegex, + contextExpansionEnabled, + expandedHits, + baseHits, + annEnabled, + annActive, + annBackend, + vectorExtension, + vectorAnnEnabled, + vectorAnnState, + vectorAnnUsed, + hnswConfig, + hnswAnnState, + modelIds, + embeddingProvider, + embeddingOnnx, + cacheInfo, + intentInfo, + resolvedDenseVectorMode, + fieldWeights, + contextExpansionStats, + idxProse, + idxCode, + idxRecords, + showStats, + showMatched, + verboseCache, + elapsedMs +}) { + const proseHitsFinal = expandedHits.prose.hits; + const extractedProseHitsFinal = expandedHits.extractedProse.hits; + const codeHitsFinal = expandedHits.code.hits; + const recordHitsFinal = expandedHits.records.hits; + + const memory = process.memoryUsage(); + const payload = { + backend: backendLabel, + prose: jsonCompact ? proseHitsFinal.map((hit) => compactHit(hit, explain)) : proseHitsFinal, + extractedProse: jsonCompact + ? extractedProseHitsFinal.map((hit) => compactHit(hit, explain)) + : extractedProseHitsFinal, + code: jsonCompact ? codeHitsFinal.map((hit) => compactHit(hit, explain)) : codeHitsFinal, + records: jsonCompact ? recordHitsFinal.map((hit) => compactHit(hit, explain)) : recordHitsFinal, + stats: { + elapsedMs, + annEnabled, + annActive, + annMode: vectorExtension.annMode, + annBackend, + backendPolicy: backendPolicyInfo, + annExtension: vectorAnnEnabled ? { + provider: vectorExtension.provider, + table: vectorExtension.table, + available: { + code: vectorAnnState.code.available, + prose: vectorAnnState.prose.available, + records: vectorAnnState.records.available + } + } : null, + annHnsw: hnswConfig.enabled ? 
{ + available: { + code: hnswAnnState.code.available, + prose: hnswAnnState.prose.available, + records: hnswAnnState.records.available, + extractedProse: hnswAnnState['extracted-prose'].available + }, + space: hnswConfig.space, + efSearch: hnswConfig.efSearch + } : null, + models: { + code: modelIds.code, + prose: modelIds.prose, + extractedProse: modelIds.extractedProse, + records: modelIds.records + }, + embeddings: { + provider: embeddingProvider, + onnxModel: embeddingOnnx.modelPath || null, + onnxTokenizer: embeddingOnnx.tokenizerId || null + }, + cache: { + enabled: cacheInfo.enabled, + hit: cacheInfo.hit, + key: cacheInfo.key + }, + memory: { + rss: memory.rss, + heapTotal: memory.heapTotal, + heapUsed: memory.heapUsed, + external: memory.external, + arrayBuffers: memory.arrayBuffers + } + } + }; + + if (explain) { + payload.stats.intent = { + ...intentInfo, + denseVectorMode: resolvedDenseVectorMode, + fieldWeights + }; + payload.stats.contextExpansion = contextExpansionStats; + } + + if (emitOutput && jsonOutput) { + console.log(JSON.stringify(payload, null, 2)); + } + + if (emitOutput && !jsonOutput) { + let showProse = runProse ? topN : 0; + let showExtractedProse = runExtractedProse ? topN : 0; + let showCode = runCode ? topN : 0; + let showRecords = runRecords ? topN : 0; + + if (runProse && runCode) { + if (baseHits.proseHits.length < topN) { + showCode += showProse; + } + if (baseHits.codeHits.length < topN) { + showProse += showCode; + } + } + if (contextExpansionEnabled) { + showProse += expandedHits.prose.contextHits.length; + showExtractedProse += expandedHits.extractedProse.contextHits.length; + showCode += expandedHits.code.contextHits.length; + showRecords += expandedHits.records.contextHits.length; + } + + if (runProse) { + console.log(color.bold(`\n===== Markdown Results (${backendLabel}) =====`)); + const summaryState = { lastCount: 0 }; + proseHitsFinal.slice(0, showProse).forEach((hit, index) => { + if (index < 2) { + process.stdout.write(formatFullChunk({ + chunk: hit, + index, + mode: 'prose', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched, + rootDir, + summaryState + })); + } else { + process.stdout.write(formatShortChunk({ + chunk: hit, + index, + mode: 'prose', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched + })); + } + }); + console.log('\n'); + } + + if (runExtractedProse) { + console.log(color.bold(`===== Extracted Prose Results (${backendLabel}) =====`)); + const summaryState = { lastCount: 0 }; + extractedProseHitsFinal.slice(0, showExtractedProse).forEach((hit, index) => { + if (index < 2) { + process.stdout.write(formatFullChunk({ + chunk: hit, + index, + mode: 'extracted-prose', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched, + rootDir, + summaryState + })); + } else { + process.stdout.write(formatShortChunk({ + chunk: hit, + index, + mode: 'extracted-prose', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched + })); + } + }); + console.log('\n'); + } + + if (runCode) { + console.log(color.bold(`===== Code Results (${backendLabel}) =====`)); + const summaryState = { lastCount: 0 }; + codeHitsFinal.slice(0, showCode).forEach((hit, index) => { + if (index < 1) { + process.stdout.write(formatFullChunk({ + chunk: hit, + index, + mode: 
'code', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched, + rootDir, + summaryState + })); + } else { + process.stdout.write(formatShortChunk({ + chunk: hit, + index, + mode: 'code', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched + })); + } + }); + console.log('\n'); + } + + if (runRecords) { + console.log(color.bold(`===== Records Results (${backendLabel}) =====`)); + recordHitsFinal.slice(0, showRecords).forEach((hit, index) => { + if (index < 2) { + process.stdout.write(formatFullChunk({ + chunk: hit, + index, + mode: 'records', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched, + rootDir: null, + summaryState: null + })); + } else { + process.stdout.write(formatShortChunk({ + chunk: hit, + index, + mode: 'records', + score: hit.score, + scoreType: hit.scoreType, + explain, + color, + queryTokens, + rx: highlightRegex, + matched: showMatched + })); + } + }); + console.log('\n'); + } + + if (showStats) { + const proseCount = idxProse?.chunkMeta?.length ?? 0; + const codeCount = idxCode?.chunkMeta?.length ?? 0; + const recordsCount = idxRecords?.chunkMeta?.length ?? 0; + const cacheTag = cacheInfo.enabled ? (cacheInfo.hit ? 'cache=hit' : 'cache=miss') : 'cache=off'; + const statsParts = [ + `prose chunks=${proseCount}`, + `code chunks=${codeCount}`, + runRecords ? `records chunks=${recordsCount}` : null, + `(${cacheTag})` + ].filter(Boolean); + if (explain && backendPolicyInfo?.reason) { + statsParts.push(`backend=${backendLabel}`); + statsParts.push(`policy=${backendPolicyInfo.reason}`); + } + console.log(color.gray(`Stats: ${statsParts.join(', ')}`)); + } + } + + const outputCacheReporter = getOutputCacheReporter(); + if (emitOutput && verboseCache && outputCacheReporter) { + outputCacheReporter.report(); + } + + return payload; +} diff --git a/src/retrieval/cli/run-search-session.js b/src/retrieval/cli/run-search-session.js new file mode 100644 index 000000000..abf81106b --- /dev/null +++ b/src/retrieval/cli/run-search-session.js @@ -0,0 +1,342 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { incCacheEvent } from '../../shared/metrics.js'; +import { createSearchPipeline } from '../pipeline.js'; +import { buildQueryCacheKey, getIndexSignature } from '../cli-index.js'; +import { getQueryEmbedding } from '../embedding.js'; +import { expandContext } from '../context-expansion.js'; +import { loadQueryCache, pruneQueryCache } from '../query-cache.js'; +import { filterChunks } from '../output.js'; +import { runSearchByMode } from './search-runner.js'; + +export async function runSearchSession({ + rootDir, + userConfig, + metricsDir, + query, + searchMode, + runCode, + runProse, + runExtractedProse, + runRecords, + topN, + useSqlite, + annEnabled, + annActive, + vectorExtension, + vectorAnnEnabled, + vectorAnnState, + vectorAnnUsed, + hnswConfig, + hnswAnnState, + hnswAnnUsed, + sqliteFtsRequested, + sqliteFtsNormalize, + sqliteFtsProfile, + sqliteFtsWeights, + sqliteCodePath, + sqliteProsePath, + bm25K1, + bm25B, + fieldWeights, + postingsConfig, + queryTokens, + phraseNgramSet, + phraseRange, + symbolBoost, + filters, + filtersActive, + scoreBlend, + rrf, + minhashMaxDocs, + buildCandidateSetSqlite, + getTokenIndexForQuery, + rankSqliteFts, + rankVectorAnnSqlite, + idxProse, + idxExtractedProse, + idxCode, + idxRecords, + 
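+  // Embedding/model configuration below is used for ANN scoring and cache keying.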
modelConfig,
+  modelIds,
+  embeddingProvider,
+  embeddingOnnx,
+  embeddingQueryText,
+  useStubEmbeddings,
+  contextExpansionEnabled,
+  contextExpansionOptions,
+  contextExpansionRespectFilters,
+  cacheFilters,
+  queryCacheEnabled,
+  queryCacheMaxEntries,
+  queryCacheTtlMs,
+  backendLabel,
+  resolvedDenseVectorMode,
+  intentInfo
+}) {
+  const searchPipeline = createSearchPipeline({
+    useSqlite,
+    sqliteFtsRequested,
+    sqliteFtsNormalize,
+    sqliteFtsProfile,
+    sqliteFtsWeights,
+    bm25K1,
+    bm25B,
+    fieldWeights,
+    postingsConfig,
+    queryTokens,
+    phraseNgramSet,
+    phraseRange,
+    symbolBoost,
+    filters,
+    filtersActive,
+    topN,
+    annEnabled: annActive,
+    scoreBlend,
+    rrf,
+    minhashMaxDocs,
+    vectorAnnState,
+    vectorAnnUsed,
+    hnswAnnState,
+    hnswAnnUsed,
+    buildCandidateSetSqlite,
+    getTokenIndexForQuery,
+    rankSqliteFts,
+    rankVectorAnnSqlite
+  });
+
+  let cacheHit = false;
+  let cacheKey = null;
+  let cacheSignature = null;
+  let cacheData = null;
+  let cachedPayload = null;
+
+  const queryCachePath = path.join(metricsDir, 'queryCache.json');
+  if (queryCacheEnabled) {
+    const signature = getIndexSignature({
+      useSqlite,
+      backendLabel,
+      sqliteCodePath,
+      sqliteProsePath,
+      runRecords,
+      runExtractedProse,
+      root: rootDir,
+      userConfig
+    });
+    cacheSignature = JSON.stringify(signature);
+    const cacheKeyInfo = buildQueryCacheKey({
+      query,
+      backend: backendLabel,
+      mode: searchMode,
+      topN,
+      ann: annActive,
+      annMode: vectorExtension.annMode,
+      annProvider: vectorExtension.provider,
+      annExtension: vectorAnnEnabled,
+      scoreBlend,
+      fieldWeights,
+      denseVectorMode: resolvedDenseVectorMode,
+      intent: intentInfo?.type || null,
+      minhashMaxDocs,
+      sqliteFtsNormalize,
+      sqliteFtsProfile,
+      sqliteFtsWeights,
+      models: modelIds,
+      embeddings: {
+        provider: embeddingProvider,
+        onnxModel: embeddingOnnx.modelPath || null,
+        onnxTokenizer: embeddingOnnx.tokenizerId || null
+      },
+      contextExpansion: {
+        enabled: contextExpansionEnabled,
+        maxPerHit: contextExpansionOptions.maxPerHit || null,
+        maxTotal: contextExpansionOptions.maxTotal || null,
+        includeCalls: contextExpansionOptions.includeCalls !== false,
+        includeImports: contextExpansionOptions.includeImports !== false,
+        includeExports: contextExpansionOptions.includeExports === true,
+        includeUsages: contextExpansionOptions.includeUsages === true,
+        respectFilters: contextExpansionRespectFilters
+      },
+      filters: cacheFilters
+    });
+    cacheKey = cacheKeyInfo.key;
+    cacheData = loadQueryCache(queryCachePath);
+    const entry = cacheData.entries.find((e) => e.key === cacheKey && e.signature === cacheSignature);
+    if (entry) {
+      const ttl = Number.isFinite(Number(entry.ttlMs)) ? Number(entry.ttlMs) : queryCacheTtlMs;
+      if (!ttl || (Date.now() - entry.ts) <= ttl) {
+        cachedPayload = entry.payload || null;
+        if (cachedPayload) {
+          const hasCode = !runCode || Array.isArray(cachedPayload.code);
+          const hasProse = !runProse || Array.isArray(cachedPayload.prose);
+          const hasExtractedProse = !runExtractedProse || Array.isArray(cachedPayload.extractedProse);
+          const hasRecords = !runRecords || Array.isArray(cachedPayload.records);
+          if (hasCode && hasProse && hasExtractedProse && hasRecords) {
+            cacheHit = true;
+            entry.ts = Date.now();
+          }
+        }
+      }
+    }
+  }
+  if (queryCacheEnabled) {
+    incCacheEvent({ cache: 'query', result: cacheHit ?
'hit' : 'miss' }); + } + + const needsEmbedding = !cacheHit && annActive && ( + (runProse && (idxProse.denseVec?.vectors?.length || vectorAnnState.prose.available || hnswAnnState.prose.available)) + || (runCode && (idxCode.denseVec?.vectors?.length || vectorAnnState.code.available || hnswAnnState.code.available)) + || (runExtractedProse && idxExtractedProse?.denseVec?.vectors?.length) + || (runRecords && idxRecords.denseVec?.vectors?.length) + ); + const embeddingCache = new Map(); + const getEmbeddingForModel = async (modelId, dims) => { + if (!modelId) return null; + const cacheKeyLocal = useStubEmbeddings ? `${modelId}:${dims || 'default'}` : modelId; + if (embeddingCache.has(cacheKeyLocal)) { + incCacheEvent({ cache: 'embedding', result: 'hit' }); + return embeddingCache.get(cacheKeyLocal); + } + incCacheEvent({ cache: 'embedding', result: 'miss' }); + const embedding = await getQueryEmbedding({ + text: embeddingQueryText, + modelId, + dims, + modelDir: modelConfig.dir, + useStub: useStubEmbeddings, + provider: embeddingProvider, + onnxConfig: embeddingOnnx, + rootDir + }); + embeddingCache.set(cacheKeyLocal, embedding); + return embedding; + }; + const queryEmbeddingCode = needsEmbedding && runCode && ( + idxCode.denseVec?.vectors?.length + || vectorAnnState.code.available + || hnswAnnState.code.available + ) + ? await getEmbeddingForModel(modelIds.code, idxCode.denseVec?.dims || null) + : null; + const queryEmbeddingProse = needsEmbedding && runProse && ( + idxProse.denseVec?.vectors?.length + || vectorAnnState.prose.available + || hnswAnnState.prose.available + ) + ? await getEmbeddingForModel(modelIds.prose, idxProse.denseVec?.dims || null) + : null; + const queryEmbeddingExtractedProse = needsEmbedding && runExtractedProse && idxExtractedProse?.denseVec?.vectors?.length + ? await getEmbeddingForModel(modelIds.extractedProse, idxExtractedProse.denseVec?.dims || null) + : null; + const queryEmbeddingRecords = needsEmbedding && runRecords && idxRecords.denseVec?.vectors?.length + ? await getEmbeddingForModel(modelIds.records, idxRecords.denseVec?.dims || null) + : null; + + const cachedHits = cacheHit && cachedPayload + ? { + proseHits: cachedPayload.prose || [], + extractedProseHits: cachedPayload.extractedProse || [], + codeHits: cachedPayload.code || [], + recordHits: cachedPayload.records || [] + } + : null; + const { proseHits, extractedProseHits, codeHits, recordHits } = cachedHits || runSearchByMode({ + searchPipeline, + runProse, + runExtractedProse, + runCode, + runRecords, + idxProse, + idxExtractedProse, + idxCode, + idxRecords, + queryEmbeddingProse, + queryEmbeddingExtractedProse, + queryEmbeddingCode, + queryEmbeddingRecords + }); + + const contextExpansionStats = { + enabled: contextExpansionEnabled, + code: 0, + prose: 0, + 'extracted-prose': 0, + records: 0 + }; + const expandModeHits = (mode, idx, hits) => { + if (!contextExpansionEnabled || !hits.length || !idx?.chunkMeta?.length) { + return { hits, contextHits: [] }; + } + const allowedIds = contextExpansionRespectFilters && filtersActive + ? new Set( + filterChunks(idx.chunkMeta, filters, idx.filterIndex, idx.fileRelations) + .map((chunk) => chunk.id) + ) + : null; + const contextHits = expandContext({ + hits, + chunkMeta: idx.chunkMeta, + fileRelations: idx.fileRelations, + repoMap: idx.repoMap, + options: contextExpansionOptions, + allowedIds + }); + contextExpansionStats[mode] = contextHits.length; + return { hits: hits.concat(contextHits), contextHits }; + }; + const proseExpanded = runProse ? 
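+    // Expand each active mode in turn; inactive modes pass through unchanged.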
expandModeHits('prose', idxProse, proseHits) : { hits: proseHits, contextHits: [] };
+  const extractedProseExpanded = runExtractedProse
+    ? expandModeHits('extracted-prose', idxExtractedProse, extractedProseHits)
+    : { hits: extractedProseHits, contextHits: [] };
+  const codeExpanded = runCode ? expandModeHits('code', idxCode, codeHits) : { hits: codeHits, contextHits: [] };
+  const recordExpanded = runRecords ? expandModeHits('records', idxRecords, recordHits) : { hits: recordHits, contextHits: [] };
+
+  const hnswActive = Object.values(hnswAnnUsed).some(Boolean);
+  const annBackend = vectorAnnEnabled && (vectorAnnUsed.code || vectorAnnUsed.prose)
+    ? 'sqlite-extension'
+    : (hnswActive ? 'hnsw' : 'js');
+
+  if (queryCacheEnabled && cacheKey) {
+    if (!cacheData) cacheData = { version: 1, entries: [] };
+    if (!cacheHit) {
+      cacheData.entries = cacheData.entries.filter((entry) => entry.key !== cacheKey);
+      cacheData.entries.push({
+        key: cacheKey,
+        ts: Date.now(),
+        ttlMs: queryCacheTtlMs,
+        signature: cacheSignature,
+        meta: {
+          query,
+          backend: backendLabel
+        },
+        payload: {
+          prose: proseHits,
+          extractedProse: extractedProseHits,
+          code: codeHits,
+          records: recordHits
+        }
+      });
+    }
+    pruneQueryCache(cacheData, queryCacheMaxEntries);
+    try {
+      await fs.mkdir(path.dirname(queryCachePath), { recursive: true });
+      await fs.writeFile(queryCachePath, JSON.stringify(cacheData, null, 2));
+    } catch {}
+  }
+
+  return {
+    proseHits,
+    extractedProseHits,
+    codeHits,
+    recordHits,
+    proseExpanded,
+    extractedProseExpanded,
+    codeExpanded,
+    recordExpanded,
+    contextExpansionStats,
+    annBackend,
+    cache: {
+      enabled: queryCacheEnabled,
+      hit: cacheHit,
+      key: cacheKey
+    }
+  };
+}
diff --git a/src/retrieval/cli/search-runner.js b/src/retrieval/cli/search-runner.js
new file mode 100644
index 000000000..12e82b4dc
--- /dev/null
+++ b/src/retrieval/cli/search-runner.js
@@ -0,0 +1,29 @@
+export function runSearchByMode({
+  searchPipeline,
+  runProse,
+  runExtractedProse,
+  runCode,
+  runRecords,
+  idxProse,
+  idxExtractedProse,
+  idxCode,
+  idxRecords,
+  queryEmbeddingProse,
+  queryEmbeddingExtractedProse,
+  queryEmbeddingCode,
+  queryEmbeddingRecords
+}) {
+  const proseHits = runProse
+    ? searchPipeline(idxProse, 'prose', queryEmbeddingProse)
+    : [];
+  const extractedProseHits = runExtractedProse
+    ? searchPipeline(idxExtractedProse, 'extracted-prose', queryEmbeddingExtractedProse)
+    : [];
+  const codeHits = runCode
+    ? searchPipeline(idxCode, 'code', queryEmbeddingCode)
+    : [];
+  const recordHits = runRecords
+    ?
searchPipeline(idxRecords, 'records', queryEmbeddingRecords) + : []; + return { proseHits, extractedProseHits, codeHits, recordHits }; +} diff --git a/src/retrieval/cli/telemetry.js b/src/retrieval/cli/telemetry.js new file mode 100644 index 000000000..7a4ea02f7 --- /dev/null +++ b/src/retrieval/cli/telemetry.js @@ -0,0 +1,29 @@ +import { observeSearchDuration } from '../../shared/metrics.js'; + +export function createSearchTelemetry() { + const metricsStart = process.hrtime.bigint(); + let metricsRecorded = false; + let metricsMode = 'unknown'; + let metricsBackend = 'unknown'; + let metricsAnn = 'unknown'; + const recordSearchMetrics = (status) => { + if (metricsRecorded) return; + metricsRecorded = true; + const elapsed = Number(process.hrtime.bigint() - metricsStart) / 1e9; + try { + observeSearchDuration({ + mode: metricsMode, + backend: metricsBackend, + ann: metricsAnn, + status, + seconds: elapsed + }); + } catch {} + }; + return { + setMode: (mode) => { metricsMode = mode; }, + setBackend: (backend) => { metricsBackend = backend; }, + setAnn: (ann) => { metricsAnn = ann; }, + record: recordSearchMetrics + }; +} diff --git a/src/retrieval/context-expansion.js b/src/retrieval/context-expansion.js new file mode 100644 index 000000000..2af8902e9 --- /dev/null +++ b/src/retrieval/context-expansion.js @@ -0,0 +1,124 @@ +const pushIds = (acc, ids, reason) => { + for (const id of ids) { + if (id == null) continue; + acc.push({ id, reason }); + } +}; + +export function expandContext({ + hits, + chunkMeta, + fileRelations, + repoMap, + options = {}, + allowedIds = null +}) { + if (!Array.isArray(hits) || !hits.length || !Array.isArray(chunkMeta)) { + return []; + } + const maxPerHit = Number.isFinite(Number(options.maxPerHit)) ? Math.max(0, Number(options.maxPerHit)) : 4; + const maxTotal = Number.isFinite(Number(options.maxTotal)) ? Math.max(0, Number(options.maxTotal)) : 40; + const includeCalls = options.includeCalls !== false; + const includeImports = options.includeImports !== false; + const includeExports = options.includeExports === true; + const includeUsages = options.includeUsages === true; + + const byName = new Map(); + const byFile = new Map(); + for (const chunk of chunkMeta) { + if (!chunk) continue; + if (chunk.name) { + const list = byName.get(chunk.name) || []; + list.push(chunk.id); + byName.set(chunk.name, list); + } + if (chunk.file) { + const list = byFile.get(chunk.file) || []; + list.push(chunk.id); + byFile.set(chunk.file, list); + } + } + + const repoMapByName = new Map(); + if (Array.isArray(repoMap)) { + for (const entry of repoMap) { + if (!entry?.name || !entry?.file) continue; + const list = repoMapByName.get(entry.name) || []; + list.push(entry.file); + repoMapByName.set(entry.name, list); + } + } + + const primaryIds = new Set(hits.map((hit) => hit?.id).filter((id) => id != null)); + const addedIds = new Set(); + const contextHits = []; + + for (const hit of hits) { + if (contextHits.length >= maxTotal) break; + const sourceId = hit?.id; + const sourceChunk = sourceId != null ? chunkMeta[sourceId] : null; + if (!sourceChunk) continue; + const candidates = []; + if (includeCalls) { + const calls = sourceChunk.codeRelations?.calls || []; + for (const entry of calls) { + const callee = Array.isArray(entry) ? 
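+        // A calls entry is assumed to be a [caller, callee] tuple; use the callee name.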
entry[1] : null; + if (!callee) continue; + const ids = byName.get(callee) || []; + if (ids.length) { + pushIds(candidates, ids, `call:${callee}`); + } else { + const files = repoMapByName.get(callee) || []; + for (const file of files) { + pushIds(candidates, byFile.get(file) || [], `call:${callee}`); + } + } + } + } + if (fileRelations && sourceChunk.file) { + const relations = typeof fileRelations.get === 'function' + ? fileRelations.get(sourceChunk.file) + : fileRelations[sourceChunk.file]; + if (relations) { + if (includeImports && Array.isArray(relations.importLinks)) { + for (const file of relations.importLinks) { + pushIds(candidates, byFile.get(file) || [], `import:${file}`); + } + } + if (includeUsages && Array.isArray(relations.usages)) { + for (const usage of relations.usages) { + pushIds(candidates, byName.get(usage) || [], `usage:${usage}`); + } + } + if (includeExports && Array.isArray(relations.exports)) { + for (const exp of relations.exports) { + pushIds(candidates, byName.get(exp) || [], `export:${exp}`); + } + } + } + } + + let addedForHit = 0; + for (const candidate of candidates) { + if (contextHits.length >= maxTotal || addedForHit >= maxPerHit) break; + const id = candidate.id; + if (primaryIds.has(id) || addedIds.has(id)) continue; + if (allowedIds && !allowedIds.has(id)) continue; + const chunk = chunkMeta[id]; + if (!chunk) continue; + addedIds.add(id); + addedForHit += 1; + contextHits.push({ + ...chunk, + score: 0, + scoreType: 'context', + context: { + sourceId, + reason: candidate.reason + } + }); + } + } + + return contextHits; +} diff --git a/src/retrieval/embedding.js b/src/retrieval/embedding.js new file mode 100644 index 000000000..45e36c382 --- /dev/null +++ b/src/retrieval/embedding.js @@ -0,0 +1,80 @@ +import fs from 'node:fs'; +import { stubEmbedding } from '../shared/embedding.js'; +import { createOnnxEmbedder, normalizeEmbeddingProvider } from '../shared/onnx-embeddings.js'; + +const embedderCache = new Map(); + +async function getEmbedder({ provider, modelId, modelDir, rootDir, onnxConfig }) { + const resolvedProvider = normalizeEmbeddingProvider(provider); + const cacheKey = JSON.stringify({ + provider: resolvedProvider, + modelId, + modelDir, + onnxConfig: onnxConfig || null, + rootDir + }); + if (embedderCache.has(cacheKey)) return embedderCache.get(cacheKey); + if (resolvedProvider === 'onnx') { + const embedder = createOnnxEmbedder({ + rootDir, + modelId, + modelsDir: modelDir, + onnxConfig + }); + embedderCache.set(cacheKey, embedder); + return embedder; + } + const { pipeline, env } = await import('@xenova/transformers'); + if (modelDir) { + try { + fs.mkdirSync(modelDir, { recursive: true }); + } catch {} + env.cacheDir = modelDir; + } + const embedder = await pipeline('feature-extraction', modelId); + embedderCache.set(cacheKey, embedder); + return embedder; +} + +/** + * Compute a query embedding using the configured model. + * Returns null when embeddings are unavailable. 
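+ *
+ * @example
+ * // Illustrative only; the model id, dims, and cache dir shown here are
+ * // assumptions, not defaults. Call from an async context:
+ * const vec = await getQueryEmbedding({
+ *   text: 'backend policy fallback',
+ *   modelId: 'Xenova/all-MiniLM-L6-v2',
+ *   dims: 384,
+ *   modelDir: '.pairofcleats/models',
+ *   useStub: false
+ * });
+ * // => number[] on success, or null if the provider could not be loaded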
+ * @param {object} options
+ * @param {string} options.text
+ * @param {string} options.modelId
+ * @param {number} options.dims
+ * @param {string} options.modelDir
+ * @param {boolean} options.useStub
+ * @param {string} [options.provider]
+ * @param {object} [options.onnxConfig]
+ * @param {string} [options.rootDir]
+ * @returns {Promise<number[]|null>}
+ */
+export async function getQueryEmbedding({
+  text,
+  modelId,
+  dims,
+  modelDir,
+  useStub,
+  provider,
+  onnxConfig,
+  rootDir
+}) {
+  if (useStub) {
+    return stubEmbedding(text, dims);
+  }
+  try {
+    const resolvedProvider = normalizeEmbeddingProvider(provider);
+    const embedder = await getEmbedder({
+      provider: resolvedProvider,
+      modelId,
+      modelDir,
+      rootDir,
+      onnxConfig
+    });
+    if (resolvedProvider === 'onnx') {
+      return await embedder.getEmbedding(text);
+    }
+    const output = await embedder(text, { pooling: 'mean', normalize: true });
+    return Array.from(output.data);
+  } catch {
+    return null;
+  }
+}
diff --git a/src/retrieval/filter-index.js b/src/retrieval/filter-index.js
new file mode 100644
index 000000000..4e8d5b28e
--- /dev/null
+++ b/src/retrieval/filter-index.js
@@ -0,0 +1,146 @@
+import { tri } from '../shared/tokenize.js';
+import { buildBitmapIndex } from './bitmap.js';
+
+/**
+ * Build lookup maps for common search filters.
+ * @param {Array<object>} chunkMeta
+ * @param {{fileChargramN?:number}} [options]
+ * @returns {object}
+ */
+export function buildFilterIndex(chunkMeta = [], options = {}) {
+  const fileChargramN = Number.isFinite(Number(options.fileChargramN))
+    ? Math.max(2, Math.floor(Number(options.fileChargramN)))
+    : 3;
+  const includeBitmaps = options.includeBitmaps !== false;
+  const index = {
+    byExt: new Map(),
+    byKind: new Map(),
+    byAuthor: new Map(),
+    byChunkAuthor: new Map(),
+    byVisibility: new Map(),
+    fileById: [],
+    fileIdByPath: new Map(),
+    fileChunksById: [],
+    fileChargrams: new Map(),
+    fileChargramN
+  };
+
+  const add = (map, value, id) => {
+    if (!value) return;
+    const values = Array.isArray(value) ? value : [value];
+    for (const entry of values) {
+      const key = String(entry || '').toLowerCase();
+      if (!key) continue;
+      let bucket = map.get(key);
+      if (!bucket) {
+        bucket = new Set();
+        map.set(key, bucket);
+      }
+      bucket.add(id);
+    }
+  };
+
+  const normalizeFilePath = (value) => String(value || '').replace(/\\/g, '/').toLowerCase();
+  const addFileChargrams = (fileId, fileValue) => {
+    const grams = new Set(tri(fileValue, fileChargramN));
+    for (const gram of grams) {
+      let bucket = index.fileChargrams.get(gram);
+      if (!bucket) {
+        bucket = new Set();
+        index.fileChargrams.set(gram, bucket);
+      }
+      bucket.add(fileId);
+    }
+  };
+  const addFile = (fileValue, chunkId) => {
+    if (!fileValue) return;
+    const normalized = normalizeFilePath(fileValue);
+    let fileId = index.fileIdByPath.get(normalized);
+    if (fileId == null) {
+      fileId = index.fileById.length;
+      index.fileIdByPath.set(normalized, fileId);
+      index.fileById.push(normalized);
+      index.fileChunksById[fileId] = new Set();
+      addFileChargrams(fileId, normalized);
+    }
+    index.fileChunksById[fileId].add(chunkId);
+  };
+
+  for (const chunk of chunkMeta) {
+    if (!chunk) continue;
+    const id = chunk.id;
+    if (!Number.isFinite(id)) continue;
+    addFile(chunk.file, id);
+    add(index.byExt, chunk.ext, id);
+    add(index.byKind, chunk.kind, id);
+    add(index.byAuthor, chunk.last_author, id);
+    const visibility = chunk.docmeta?.visibility || chunk.docmeta?.modifiers?.visibility || null;
+    add(index.byVisibility, visibility, id);
+    const chunkAuthors = Array.isArray(chunk.chunk_authors) ?
chunk.chunk_authors : []; + for (const author of chunkAuthors) add(index.byChunkAuthor, author, id); + } + + if (includeBitmaps) { + index.bitmap = buildBitmapIndex(index); + } + return index; +} + +const serializeMap = (map) => { + const out = {}; + if (!map || typeof map.entries !== 'function') return out; + for (const [key, value] of map.entries()) { + out[key] = Array.from(value || []); + } + return out; +}; + +const hydrateMap = (value) => { + const map = new Map(); + if (!value || typeof value !== 'object') return map; + for (const [key, list] of Object.entries(value)) { + map.set(key, new Set(Array.isArray(list) ? list : [])); + } + return map; +}; + +export function serializeFilterIndex(index) { + if (!index) return null; + return { + fileChargramN: index.fileChargramN || 3, + byExt: serializeMap(index.byExt), + byKind: serializeMap(index.byKind), + byAuthor: serializeMap(index.byAuthor), + byChunkAuthor: serializeMap(index.byChunkAuthor), + byVisibility: serializeMap(index.byVisibility), + fileById: Array.isArray(index.fileById) ? index.fileById : [], + fileChunksById: Array.isArray(index.fileChunksById) + ? index.fileChunksById.map((set) => Array.from(set || [])) + : [], + fileChargrams: serializeMap(index.fileChargrams) + }; +} + +export function hydrateFilterIndex(raw) { + if (!raw || typeof raw !== 'object') return null; + const fileById = Array.isArray(raw.fileById) ? raw.fileById : []; + const fileIdByPath = new Map(fileById.map((value, idx) => [value, idx])); + const index = { + fileChargramN: Number.isFinite(Number(raw.fileChargramN)) + ? Math.max(2, Math.floor(Number(raw.fileChargramN))) + : 3, + byExt: hydrateMap(raw.byExt), + byKind: hydrateMap(raw.byKind), + byAuthor: hydrateMap(raw.byAuthor), + byChunkAuthor: hydrateMap(raw.byChunkAuthor), + byVisibility: hydrateMap(raw.byVisibility), + fileById, + fileIdByPath, + fileChunksById: Array.isArray(raw.fileChunksById) + ? raw.fileChunksById.map((list) => new Set(Array.isArray(list) ? 
list : [])) + : [], + fileChargrams: hydrateMap(raw.fileChargrams) + }; + index.bitmap = buildBitmapIndex(index); + return index; +} diff --git a/src/retrieval/filters.js b/src/retrieval/filters.js new file mode 100644 index 000000000..d8807c571 --- /dev/null +++ b/src/retrieval/filters.js @@ -0,0 +1,204 @@ +import { parseJson } from './query-cache.js'; +import { + CLIKE_EXTS, + CSHARP_EXTS, + CSS_EXTS, + GO_EXTS, + HTML_EXTS, + JAVA_EXTS, + JS_EXTS, + KOTLIN_EXTS, + LUA_EXTS, + OBJC_EXTS, + PERL_EXTS, + PHP_EXTS, + RUBY_EXTS, + SHELL_EXTS, + SQL_EXTS, + TS_EXTS +} from '../index/constants.js'; + +const PY_EXTS = new Set(['.py']); +const SWIFT_EXTS = new Set(['.swift']); +const DOC_EXTS = new Set(['.md', '.rst', '.adoc', '.asciidoc']); +const CONFIG_EXTS = new Set(['.json', '.toml', '.ini', '.cfg', '.conf', '.xml', '.yml', '.yaml']); + +const LANG_EXT_MAP = new Map([ + ['javascript', JS_EXTS], + ['js', JS_EXTS], + ['typescript', TS_EXTS], + ['ts', TS_EXTS], + ['python', PY_EXTS], + ['py', PY_EXTS], + ['swift', SWIFT_EXTS], + ['rust', new Set(['.rs'])], + ['go', GO_EXTS], + ['java', JAVA_EXTS], + ['csharp', CSHARP_EXTS], + ['c#', CSHARP_EXTS], + ['kotlin', KOTLIN_EXTS], + ['ruby', RUBY_EXTS], + ['php', PHP_EXTS], + ['lua', LUA_EXTS], + ['sql', SQL_EXTS], + ['perl', PERL_EXTS], + ['shell', SHELL_EXTS], + ['bash', SHELL_EXTS], + ['zsh', SHELL_EXTS], + ['clike', CLIKE_EXTS], + ['c', new Set(['.c', '.h'])], + ['cpp', new Set(['.cc', '.cpp', '.hpp', '.hh'])], + ['c++', new Set(['.cc', '.cpp', '.hpp', '.hh'])], + ['objc', OBJC_EXTS], + ['objective-c', OBJC_EXTS], + ['html', HTML_EXTS], + ['css', CSS_EXTS], + ['json', new Set(['.json'])], + ['yaml', new Set(['.yml', '.yaml'])], + ['toml', new Set(['.toml'])], + ['ini', new Set(['.ini', '.cfg', '.conf'])], + ['xml', new Set(['.xml'])], + ['markdown', new Set(['.md'])], + ['rst', new Set(['.rst'])], + ['asciidoc', new Set(['.adoc', '.asciidoc'])], + ['docs', DOC_EXTS], + ['config', CONFIG_EXTS] +]); + +/** + * Normalize extension filters into a lowercase list. + * @param {string|string[]|null|undefined} extArg + * @returns {string[]|null} + */ +export function normalizeExtFilter(extArg) { + const entries = Array.isArray(extArg) ? extArg : (extArg ? [extArg] : []); + if (!entries.length) return null; + const normalized = []; + for (const entry of entries) { + String(entry || '') + .split(/[,\s]+/) + .map((raw) => raw.trim()) + .filter(Boolean) + .forEach((raw) => { + let value = raw.toLowerCase(); + value = value.replace(/^\*+/, ''); + if (!value) return; + if (!value.startsWith('.')) value = `.${value}`; + normalized.push(value); + }); + } + return normalized.length ? Array.from(new Set(normalized)) : null; +} + +/** + * Normalize language filters into a list of extensions. + * @param {string|string[]|null|undefined} langArg + * @returns {string[]|null} + */ +export function normalizeLangFilter(langArg) { + const entries = Array.isArray(langArg) ? langArg : (langArg ? [langArg] : []); + if (!entries.length) return null; + const exts = new Set(); + for (const entry of entries) { + String(entry || '') + .split(/[,\s]+/) + .map((raw) => raw.trim().toLowerCase()) + .filter(Boolean) + .forEach((raw) => { + const mapped = LANG_EXT_MAP.get(raw); + if (!mapped) return; + for (const ext of mapped) exts.add(ext); + }); + } + return exts.size ? Array.from(exts) : null; +} + +/** + * Merge extension filters with language filters. 
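+ *
+ * @example
+ * // Intersection when both are present; pass-through when one is null:
+ * mergeExtFilters(['.ts', '.js'], ['.js', '.jsx']); // => ['.js']
+ * mergeExtFilters(null, ['.py']); // => ['.py']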
+ * @param {string[]|null} extFilter + * @param {string[]|null} langFilter + * @returns {string[]|null} + */ +export function mergeExtFilters(extFilter, langFilter) { + if (!extFilter && !langFilter) return null; + if (extFilter && langFilter) { + const langSet = new Set(langFilter); + const merged = extFilter.filter((ext) => langSet.has(ext)); + return merged.length ? Array.from(new Set(merged)) : null; + } + return extFilter || langFilter; +} + +/** + * Parse --meta and --meta-json into a normalized filter list. + * @param {string|string[]|null|undefined} metaArg + * @param {string|string[]|null|undefined} metaJsonArg + * @returns {Array<{key:string,value:any}>|null} + */ +export function parseMetaFilters(metaArg, metaJsonArg) { + const filters = []; + const pushFilter = (rawKey, rawValue) => { + const key = String(rawKey || '').trim(); + if (!key) return; + const value = rawValue === undefined ? null : rawValue; + filters.push({ key, value }); + }; + const handleEntry = (entry) => { + const text = String(entry || '').trim(); + if (!text) return; + const split = text.split('='); + const key = split.shift(); + const value = split.length ? split.join('=').trim() : null; + pushFilter(key, value === '' ? null : value); + }; + const metaEntries = Array.isArray(metaArg) ? metaArg : (metaArg ? [metaArg] : []); + for (const entry of metaEntries) handleEntry(entry); + const metaJsonEntries = Array.isArray(metaJsonArg) ? metaJsonArg : (metaJsonArg ? [metaJsonArg] : []); + for (const entry of metaJsonEntries) { + const parsed = parseJson(entry, null); + if (!parsed) continue; + if (Array.isArray(parsed)) { + parsed.forEach((item) => { + if (!item || typeof item !== 'object') return; + Object.entries(item).forEach(([key, value]) => pushFilter(key, value)); + }); + } else if (typeof parsed === 'object') { + Object.entries(parsed).forEach(([key, value]) => pushFilter(key, value)); + } + } + return filters.length ? filters : null; +} + +/** + * Check whether any search filters are active. + * @param {object|null|undefined} filters + * @returns {boolean} + */ +export function hasActiveFilters(filters) { + if (!filters || typeof filters !== 'object') return false; + for (const value of Object.values(filters)) { + if (value == null) continue; + if (typeof value === 'boolean') { + if (value) return true; + continue; + } + if (typeof value === 'number') { + if (Number.isFinite(value)) return true; + continue; + } + if (typeof value === 'string') { + if (value.trim()) return true; + continue; + } + if (Array.isArray(value)) { + if (value.length) return true; + continue; + } + if (typeof value === 'object') { + if (Object.keys(value).length) return true; + continue; + } + return true; + } + return false; +} diff --git a/src/retrieval/fts.js b/src/retrieval/fts.js new file mode 100644 index 000000000..87dcf48bd --- /dev/null +++ b/src/retrieval/fts.js @@ -0,0 +1,88 @@ +/** + * Resolve FTS5 bm25 weights from a profile or config override. 
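+ *
+ * @example
+ * // Object overrides merge into the named profile; the leading 0 is
+ * // assumed to be an unweighted id column:
+ * resolveFtsWeights('balanced', { name: 2.0 });
+ * // => [0, 0.2, 2.0, 1.2, 0.6, 1.5, 1.8, 1.0]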
+ * @param {string} profile + * @param {object|number[]|null} config + * @returns {number[]} + */ +export function resolveFtsWeights(profile, config) { + const profiles = { + balanced: { + file: 0.2, + name: 1.5, + signature: 1.2, + kind: 0.6, + headline: 1.5, + doc: 1.8, + tokens: 1.0 + }, + headline: { + file: 0.1, + name: 1.2, + signature: 1.0, + kind: 0.4, + headline: 3.0, + doc: 2.2, + tokens: 1.0 + }, + name: { + file: 0.2, + name: 2.5, + signature: 1.6, + kind: 0.8, + headline: 1.2, + doc: 1.4, + tokens: 1.0 + } + }; + const base = profiles[profile] || profiles.balanced; + if (Array.isArray(config)) { + const values = config.map((v) => Number(v)).filter((v) => Number.isFinite(v)); + if (values.length >= 8) return values.slice(0, 8); + if (values.length === 7) return [0, ...values]; + if (values.length === 6) { + const [, file, name, kind, headline, tokens] = values; + return [ + 0, + file ?? base.file, + name ?? base.name, + base.signature, + kind ?? base.kind, + headline ?? base.headline, + base.doc, + tokens ?? base.tokens + ]; + } + if (values.length === 5) { + const [file, name, kind, headline, tokens] = values; + return [ + 0, + file ?? base.file, + name ?? base.name, + base.signature, + kind ?? base.kind, + headline ?? base.headline, + base.doc, + tokens ?? base.tokens + ]; + } + } else if (config && typeof config === 'object') { + const merged = { ...base }; + for (const key of ['file', 'name', 'signature', 'kind', 'headline', 'doc', 'tokens']) { + if (Number.isFinite(Number(config[key]))) merged[key] = Number(config[key]); + } + if (Number.isFinite(Number(config.body))) merged.tokens = Number(config.body); + return [0, merged.file, merged.name, merged.signature, merged.kind, merged.headline, merged.doc, merged.tokens]; + } + + return [0, base.file, base.name, base.signature, base.kind, base.headline, base.doc, base.tokens]; +} + +/** + * Build a bm25(chunks_fts, ...) SQL expression from weights. + * @param {number[]} weights + * @returns {string} + */ +export function buildFtsBm25Expr(weights) { + const safe = weights.map((val) => (Number.isFinite(val) ? 
val : 1)); + return `bm25(chunks_fts, ${safe.join(', ')})`; +} diff --git a/src/retrieval/index-cache.js b/src/retrieval/index-cache.js new file mode 100644 index 000000000..7d4880822 --- /dev/null +++ b/src/retrieval/index-cache.js @@ -0,0 +1,106 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; + +const INDEX_FILES = [ + 'phrase_ngrams.json', + 'chargram_postings.json', + 'dense_vectors_uint8.json', + 'dense_vectors_doc_uint8.json', + 'dense_vectors_code_uint8.json', + 'dense_vectors_hnsw.meta.json', + 'dense_vectors_hnsw.bin', + 'field_postings.json', + 'field_tokens.json', + 'minhash_signatures.json', + 'file_relations.json', + 'file_meta.json', + 'filter_index.json', + 'index_state.json' +]; + +const fileSignature = (filePath) => { + try { + let statPath = filePath; + if (!fsSync.existsSync(statPath) && filePath.endsWith('.json')) { + const gzPath = `${filePath}.gz`; + if (fsSync.existsSync(gzPath)) statPath = gzPath; + } + const stat = fsSync.statSync(statPath); + return `${stat.size}:${stat.mtimeMs}`; + } catch { + return null; + } +}; + +const shardSignature = (dir, prefix) => { + try { + if (!fsSync.existsSync(dir)) return null; + const entries = fsSync + .readdirSync(dir) + .filter((name) => name.startsWith(prefix)) + .sort(); + if (!entries.length) return null; + return entries + .map((name) => fileSignature(path.join(dir, name)) || 'missing') + .join(','); + } catch { + return null; + } +}; + +const chunkMetaSignature = (dir) => { + const jsonPath = path.join(dir, 'chunk_meta.json'); + const jsonSig = fileSignature(jsonPath); + if (jsonSig) return `chunk_meta.json:${jsonSig}`; + const jsonlPath = path.join(dir, 'chunk_meta.jsonl'); + const jsonlSig = fileSignature(jsonlPath); + if (jsonlSig) return `chunk_meta.jsonl:${jsonlSig}`; + const metaPath = path.join(dir, 'chunk_meta.meta.json'); + const metaSig = fileSignature(metaPath); + const partsSig = shardSignature(path.join(dir, 'chunk_meta.parts'), 'chunk_meta.part-'); + if (metaSig || partsSig) { + return `chunk_meta.meta.json:${metaSig || 'missing'}|parts:${partsSig || 'missing'}`; + } + return 'chunk_meta.json:missing'; +}; + +const tokenPostingsSignature = (dir) => { + const jsonPath = path.join(dir, 'token_postings.json'); + const jsonSig = fileSignature(jsonPath); + if (jsonSig) return `token_postings.json:${jsonSig}`; + const metaPath = path.join(dir, 'token_postings.meta.json'); + const metaSig = fileSignature(metaPath); + const partsSig = shardSignature(path.join(dir, 'token_postings.shards'), 'token_postings.part-'); + if (metaSig || partsSig) { + return `token_postings.meta.json:${metaSig || 'missing'}|parts:${partsSig || 'missing'}`; + } + return 'token_postings.json:missing'; +}; + +export function buildIndexSignature(dir) { + if (!dir) return null; + const parts = [ + chunkMetaSignature(dir), + tokenPostingsSignature(dir), + ...INDEX_FILES.map((name) => { + const target = path.join(dir, name); + const sig = fileSignature(target); + return `${name}:${sig || 'missing'}`; + }) + ]; + return parts.join('|'); +} + +export function loadIndexWithCache(cache, dir, options, loader) { + if (!cache) return loader(dir, options); + const hnswKey = options?.includeHnsw ? 
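+    // HNSW settings join the cache key so config changes invalidate cached loads.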
JSON.stringify(options?.hnswConfig || {}) : 'no-hnsw'; + const cacheKey = `${dir}::${options?.modelIdDefault || ''}::${options?.fileChargramN || ''}::${hnswKey}`; + const signature = buildIndexSignature(dir); + const cached = cache.get(cacheKey); + if (cached && cached.signature === signature) { + return cached.value; + } + const value = loader(dir, options); + cache.set(cacheKey, { signature, value }); + return value; +} diff --git a/src/retrieval/lancedb.js b/src/retrieval/lancedb.js new file mode 100644 index 000000000..1e587769e --- /dev/null +++ b/src/retrieval/lancedb.js @@ -0,0 +1,172 @@ +import fs from 'node:fs'; +import { tryImport } from '../shared/optional-deps.js'; +import { normalizeLanceDbConfig } from '../shared/lancedb.js'; + +const CANDIDATE_PUSH_LIMIT = 500; + +let cachedModule = null; +let warnedMissing = false; +let warnedQuery = false; + +const warnOnce = (message) => { + if (warnedQuery) return; + warnedQuery = true; + console.warn(message); +}; + +const loadLanceDb = async () => { + if (cachedModule) return cachedModule; + const result = await tryImport('@lancedb/lancedb'); + if (!result.ok) { + if (!warnedMissing) { + warnedMissing = true; + console.warn('[ann] LanceDB unavailable; falling back to other ANN backends.'); + } + return null; + } + cachedModule = result.mod?.default || result.mod; + return cachedModule; +}; + +const connectionCache = new Map(); + +const getConnection = async (dir) => { + if (!dir) return null; + if (connectionCache.has(dir)) return connectionCache.get(dir); + const lancedb = await loadLanceDb(); + const connect = lancedb?.connect || lancedb?.default?.connect; + if (!connect) return null; + const db = await connect(dir); + const entry = { db, tables: new Map() }; + connectionCache.set(dir, entry); + return entry; +}; + +const getTable = async (dir, tableName) => { + const connection = await getConnection(dir); + if (!connection || !tableName) return null; + if (connection.tables.has(tableName)) return connection.tables.get(tableName); + const openTable = connection.db?.openTable; + if (typeof openTable !== 'function') return null; + const table = await openTable.call(connection.db, tableName); + connection.tables.set(tableName, table); + return table; +}; + +const toArray = async (query) => { + if (!query) return []; + if (typeof query.toArray === 'function') return query.toArray(); + if (typeof query.execute === 'function') return query.execute(); + if (typeof query.collect === 'function') return query.collect(); + return []; +}; + +const normalizeSim = (distance, metric) => { + if (!Number.isFinite(distance)) return null; + if (metric === 'l2') return -distance; + if (metric === 'cosine') return 1 - distance; + return distance; +}; + +const readRowId = (row, idColumn) => { + const value = row?.[idColumn] ?? row?.id ?? row?._id ?? row?.idx; + const numeric = Number(value); + if (Number.isFinite(numeric)) return numeric; + return null; +}; + +const readRowScore = (row, metric) => { + const distanceRaw = row?._distance ?? row?.distance; + if (distanceRaw != null) { + return normalizeSim(Number(distanceRaw), metric); + } + const scoreRaw = row?.score ?? row?._score ?? row?.sim ?? row?.similarity; + const score = Number(scoreRaw); + return Number.isFinite(score) ? 
score : null; +}; + +export async function rankLanceDb({ + lancedbInfo, + queryEmbedding, + topN, + candidateSet, + config +}) { + if (!lancedbInfo?.available) return []; + if (!Array.isArray(queryEmbedding) || !queryEmbedding.length) return []; + const resolvedConfig = normalizeLanceDbConfig(config); + if (!resolvedConfig.enabled) return []; + + const meta = lancedbInfo.meta || {}; + const tableName = meta.table || resolvedConfig.table; + const idColumn = meta.idColumn || resolvedConfig.idColumn; + const embeddingColumn = meta.embeddingColumn || resolvedConfig.embeddingColumn; + const metric = meta.metric || resolvedConfig.metric; + const dims = Number.isFinite(Number(meta.dims)) ? Number(meta.dims) : null; + if (dims && queryEmbedding.length !== dims) return []; + + const dir = lancedbInfo.dir; + if (!dir || !fs.existsSync(dir)) return []; + + let table; + try { + table = await getTable(dir, tableName); + } catch (err) { + warnOnce(`[ann] LanceDB table load failed; falling back to other ANN backends. ${err?.message || err}`); + return []; + } + if (!table || typeof table.search !== 'function') return []; + + const limitBase = Math.max(1, Number(topN) || 1); + const candidateCount = candidateSet && candidateSet.size ? candidateSet.size : 0; + const limit = candidateCount + ? Math.min(Math.max(limitBase * 4, limitBase + 10), candidateCount) + : limitBase; + let query; + if (embeddingColumn !== 'vector' && table.search.length > 1) { + query = table.search(queryEmbedding, { vectorColumn: embeddingColumn }); + } else { + query = table.search(queryEmbedding); + } + if (typeof query?.metricType === 'function') { + query = query.metricType(metric); + } else if (typeof query?.metric === 'function') { + query = query.metric(metric); + } else if (typeof query?.distanceType === 'function') { + query = query.distanceType(metric); + } + const canPushdown = candidateCount > 0 + && candidateCount <= CANDIDATE_PUSH_LIMIT + && typeof query?.where === 'function'; + if (canPushdown) { + const ids = Array.from(candidateSet).filter((id) => Number.isFinite(Number(id))); + if (ids.length) { + query = query.where(`${idColumn} IN (${ids.join(',')})`); + } + } + if (typeof query.limit === 'function') query = query.limit(limit); + if (typeof query.select === 'function') query = query.select([idColumn]); + + let rows; + try { + rows = await toArray(query); + } catch (err) { + warnOnce(`[ann] LanceDB query failed; falling back to other ANN backends. ${err?.message || err}`); + return []; + } + + const hits = []; + for (const row of rows || []) { + const idx = readRowId(row, idColumn); + if (idx == null) continue; + const sim = readRowScore(row, metric); + if (sim == null) continue; + hits.push({ idx, sim }); + } + const filtered = !candidateCount || canPushdown + ? hits + : hits.filter((hit) => candidateSet.has(hit.idx)); + return filtered + .sort((a, b) => (b.sim - a.sim) || (a.idx - b.idx)) + .slice(0, limitBase); +} diff --git a/src/retrieval/lmdb-helpers.js b/src/retrieval/lmdb-helpers.js new file mode 100644 index 000000000..b57b77d81 --- /dev/null +++ b/src/retrieval/lmdb-helpers.js @@ -0,0 +1,180 @@ +import { Unpackr } from 'msgpackr'; +import { buildFilterIndex, hydrateFilterIndex } from './filter-index.js'; +import { loadHnswIndex, normalizeHnswConfig, resolveHnswPaths, validateHnswMetaCompatibility } from '../shared/hnsw.js'; +import { LMDB_ARTIFACT_KEYS, LMDB_META_KEYS } from '../storage/lmdb/schema.js'; + +const unpackr = new Unpackr(); +const decode = (value) => (value == null ? 
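+  // Values are msgpackr-packed; nullish values decode to null.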
null : unpackr.unpack(value));
+
+/**
+ * Create LMDB helper functions for search.
+ * @param {object} options
+ * @param {(mode:'code'|'prose')=>object|null} options.getDb
+ * @param {object} options.hnswConfig
+ * @param {string} options.modelIdDefault
+ * @param {number} options.fileChargramN
+ * @param {Record<string, string>} options.indexDirs
+ * @returns {object}
+ */
+export function createLmdbHelpers(options) {
+  const {
+    getDb,
+    hnswConfig: rawHnswConfig,
+    modelIdDefault,
+    fileChargramN,
+    indexDirs
+  } = options;
+  const hnswConfig = normalizeHnswConfig(rawHnswConfig || {});
+
+  const getArtifact = (db, key) => decode(db.get(key));
+
+  /**
+   * Load index artifacts from LMDB into in-memory structures.
+   * @param {'code'|'prose'} mode
+   * @param {object} [options]
+   * @returns {object}
+   */
+  function loadIndexFromLmdb(mode, options = {}) {
+    const db = getDb(mode);
+    if (!db) throw new Error('LMDB backend requested but database is not available.');
+    const includeMinhash = options.includeMinhash !== false;
+    const includeDense = options.includeDense !== false;
+    const includeChunks = options.includeChunks !== false;
+    const includeFilterIndex = options.includeFilterIndex !== false;
+    const includeHnsw = options.includeHnsw !== false;
+
+    const chunkCountRaw = getArtifact(db, LMDB_META_KEYS.chunkCount);
+    const chunkCount = Number.isFinite(Number(chunkCountRaw)) ? Number(chunkCountRaw) : 0;
+    let chunkMeta = includeChunks
+      ? (getArtifact(db, LMDB_ARTIFACT_KEYS.chunkMeta) || [])
+      : (chunkCount ? Array.from({ length: chunkCount }) : []);
+
+    const fileMetaRaw = getArtifact(db, LMDB_ARTIFACT_KEYS.fileMeta);
+    let fileMetaById = null;
+    if (Array.isArray(fileMetaRaw)) {
+      fileMetaById = new Map();
+      for (const entry of fileMetaRaw) {
+        if (!entry || entry.id == null) continue;
+        fileMetaById.set(entry.id, entry);
+      }
+    }
+    if (!fileMetaById && includeChunks) {
+      const missingMeta = chunkMeta.some((chunk) => chunk && chunk.fileId != null && !chunk.file);
+      if (missingMeta) {
+        throw new Error('file_meta.json is required for fileId-based chunk metadata.');
+      }
+    } else if (fileMetaById && includeChunks) {
+      for (const chunk of chunkMeta) {
+        if (!chunk || (chunk.file && chunk.ext)) continue;
+        const meta = fileMetaById.get(chunk.fileId);
+        if (!meta) continue;
+        if (!chunk.file) chunk.file = meta.file;
+        if (!chunk.ext) chunk.ext = meta.ext;
+        if (!chunk.externalDocs) chunk.externalDocs = meta.externalDocs;
+        if (!chunk.last_modified) chunk.last_modified = meta.last_modified;
+        if (!chunk.last_author) chunk.last_author = meta.last_author;
+        if (!chunk.churn) chunk.churn = meta.churn;
+        if (!chunk.churn_added) chunk.churn_added = meta.churn_added;
+        if (!chunk.churn_deleted) chunk.churn_deleted = meta.churn_deleted;
+        if (!chunk.churn_commits) chunk.churn_commits = meta.churn_commits;
+      }
+    }
+
+    const fileRelationsRaw = getArtifact(db, LMDB_ARTIFACT_KEYS.fileRelations);
+    const repoMap = getArtifact(db, LMDB_ARTIFACT_KEYS.repoMap);
+    let fileRelations = null;
+    if (Array.isArray(fileRelationsRaw)) {
+      const map = new Map();
+      for (const entry of fileRelationsRaw) {
+        if (!entry || !entry.file) continue;
+        map.set(entry.file, entry.relations || null);
+      }
+      fileRelations = map;
+    }
+
+    const indexState = getArtifact(db, LMDB_ARTIFACT_KEYS.indexState);
+    const embeddingsState = indexState?.embeddings || null;
+    const embeddingsReady = embeddingsState?.ready !== false && embeddingsState?.pending !== true;
+    const denseVec = embeddingsReady && includeDense
+      ?
getArtifact(db, LMDB_ARTIFACT_KEYS.denseVectors) + : null; + const denseVecDoc = embeddingsReady && includeDense + ? getArtifact(db, LMDB_ARTIFACT_KEYS.denseVectorsDoc) + : null; + const denseVecCode = embeddingsReady && includeDense + ? getArtifact(db, LMDB_ARTIFACT_KEYS.denseVectorsCode) + : null; + if (denseVec && !denseVec.model && modelIdDefault) denseVec.model = modelIdDefault; + if (denseVecDoc && !denseVecDoc.model && modelIdDefault) denseVecDoc.model = modelIdDefault; + if (denseVecCode && !denseVecCode.model && modelIdDefault) denseVecCode.model = modelIdDefault; + const hnswMeta = embeddingsReady && includeDense && includeHnsw && hnswConfig.enabled + ? getArtifact(db, LMDB_ARTIFACT_KEYS.denseHnswMeta) + : null; + let hnswIndex = null; + let hnswAvailable = false; + if (hnswMeta && includeHnsw && hnswConfig.enabled) { + const compatibility = validateHnswMetaCompatibility({ denseVectors: denseVec, hnswMeta }); + if (!compatibility.ok) { + console.warn(`[ann] Skipping HNSW index load due to incompatible metadata: ${compatibility.warnings.join('; ')}`); + } else { + const indexDir = indexDirs?.[mode] || null; + if (indexDir) { + const { indexPath } = resolveHnswPaths(indexDir); + const mergedConfig = { + ...hnswConfig, + space: hnswMeta.space || hnswConfig.space, + efSearch: hnswMeta.efSearch || hnswConfig.efSearch + }; + hnswIndex = loadHnswIndex({ indexPath, dims: hnswMeta.dims, config: mergedConfig }); + hnswAvailable = Boolean(hnswIndex); + } + } + } + + const fieldPostings = getArtifact(db, LMDB_ARTIFACT_KEYS.fieldPostings); + const fieldTokens = getArtifact(db, LMDB_ARTIFACT_KEYS.fieldTokens); + const filterIndexRaw = getArtifact(db, LMDB_ARTIFACT_KEYS.filterIndex); + const idx = { + chunkMeta, + fileRelations, + repoMap, + denseVec, + denseVecDoc, + denseVecCode, + hnsw: hnswMeta ? { + available: hnswAvailable, + index: hnswIndex, + meta: hnswMeta, + space: hnswMeta.space || hnswConfig.space + } : { available: false, index: null, meta: null, space: hnswConfig.space }, + state: indexState, + fieldPostings, + fieldTokens, + minhash: includeMinhash ? getArtifact(db, LMDB_ARTIFACT_KEYS.minhashSignatures) : null, + phraseNgrams: getArtifact(db, LMDB_ARTIFACT_KEYS.phraseNgrams), + chargrams: getArtifact(db, LMDB_ARTIFACT_KEYS.chargramPostings) + }; + if (idx.phraseNgrams?.vocab && !idx.phraseNgrams.vocabIndex) { + idx.phraseNgrams.vocabIndex = new Map(idx.phraseNgrams.vocab.map((term, i) => [term, i])); + } + if (idx.chargrams?.vocab && !idx.chargrams.vocabIndex) { + idx.chargrams.vocabIndex = new Map(idx.chargrams.vocab.map((term, i) => [term, i])); + } + if (idx.fieldPostings?.fields) { + for (const field of Object.keys(idx.fieldPostings.fields)) { + const entry = idx.fieldPostings.fields[field]; + if (!entry?.vocab || entry.vocabIndex) continue; + entry.vocabIndex = new Map(entry.vocab.map((term, i) => [term, i])); + } + } + idx.filterIndex = includeFilterIndex + ? (filterIndexRaw + ? 
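Just above, every postings artifact gets a lazily built `vocabIndex`: the stored vocab array is inverted once into a `Map` so later term lookups are O(1) rather than repeated scans over the array. The same inversion in isolation:

```js
const phraseNgrams = {
  vocab: ['error handler', 'handler registry'],
  postings: [[0, 4], [2]] // chunk ids, parallel to vocab
};
const vocabIndex = new Map(phraseNgrams.vocab.map((term, i) => [term, i]));

const row = vocabIndex.get('handler registry'); // -> 1
const chunkIds = phraseNgrams.postings[row];    // -> [2]
```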
(hydrateFilterIndex(filterIndexRaw) || buildFilterIndex(chunkMeta, { fileChargramN })) + : buildFilterIndex(chunkMeta, { fileChargramN })) + : null; + idx.tokenIndex = getArtifact(db, LMDB_ARTIFACT_KEYS.tokenPostings); + return idx; + } + + return { loadIndexFromLmdb }; +} diff --git a/src/retrieval/output.js b/src/retrieval/output.js new file mode 100644 index 000000000..c85ca9727 --- /dev/null +++ b/src/retrieval/output.js @@ -0,0 +1,4 @@ +export { configureOutputCaches, getOutputCacheReporter } from './output/cache.js'; +export { filterChunks } from './output/filters.js'; +export { cleanContext } from './output/context.js'; +export { formatFullChunk, formatShortChunk } from './output/format.js'; diff --git a/src/retrieval/output/cache.js b/src/retrieval/output/cache.js new file mode 100644 index 000000000..558180312 --- /dev/null +++ b/src/retrieval/output/cache.js @@ -0,0 +1,77 @@ +import { + createCacheReporter, + createLruCache, + DEFAULT_CACHE_MB, + DEFAULT_CACHE_TTL_MS, + estimateStringBytes +} from '../../shared/cache.js'; +import { getEnvConfig } from '../../shared/env.js'; + +const resolveEntryLimit = (raw) => { + const parsed = Number(raw); + return Number.isFinite(parsed) ? Math.max(0, Math.floor(parsed)) : null; +}; + +let outputCacheReporter = createCacheReporter({ enabled: false, log: null }); +let fileTextCache = createLruCache({ + name: 'fileText', + maxMb: DEFAULT_CACHE_MB.fileText, + ttlMs: DEFAULT_CACHE_TTL_MS.fileText, + sizeCalculation: estimateStringBytes, + reporter: outputCacheReporter +}); +let summaryCache = createLruCache({ + name: 'summary', + maxMb: DEFAULT_CACHE_MB.summary, + ttlMs: DEFAULT_CACHE_TTL_MS.summary, + sizeCalculation: estimateStringBytes, + reporter: outputCacheReporter +}); + +export function configureOutputCaches({ cacheConfig = null, verbose = false, log = null } = {}) { + const envConfig = getEnvConfig(); + const entryLimits = { + fileText: resolveEntryLimit(envConfig.fileCacheMax), + summary: resolveEntryLimit(envConfig.summaryCacheMax) + }; + outputCacheReporter = createCacheReporter({ enabled: verbose, log }); + const fileTextConfig = cacheConfig?.fileText || {}; + const summaryConfig = cacheConfig?.summary || {}; + fileTextCache = createLruCache({ + name: 'fileText', + maxMb: Number.isFinite(Number(fileTextConfig.maxMb)) + ? Number(fileTextConfig.maxMb) + : DEFAULT_CACHE_MB.fileText, + ttlMs: Number.isFinite(Number(fileTextConfig.ttlMs)) + ? Number(fileTextConfig.ttlMs) + : DEFAULT_CACHE_TTL_MS.fileText, + maxEntries: entryLimits.fileText, + sizeCalculation: estimateStringBytes, + reporter: outputCacheReporter + }); + summaryCache = createLruCache({ + name: 'summary', + maxMb: Number.isFinite(Number(summaryConfig.maxMb)) + ? Number(summaryConfig.maxMb) + : DEFAULT_CACHE_MB.summary, + ttlMs: Number.isFinite(Number(summaryConfig.ttlMs)) + ? 
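For reference, a hypothetical caller of the `configureOutputCaches` function being assembled here; the option names mirror the code above, while the budgets, TTL, and import path are invented:

```js
import { configureOutputCaches, getOutputCacheReporter } from './output/cache.js';

configureOutputCaches({
  cacheConfig: {
    fileText: { maxMb: 16 },   // larger byte budget for raw file text
    summary: { ttlMs: 60_000 } // expire cached summaries after a minute
  },
  verbose: true,
  log: (line) => console.error(line)
});
const reporter = getOutputCacheReporter(); // shared hit/miss reporter for both caches
```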
Number(summaryConfig.ttlMs) + : DEFAULT_CACHE_TTL_MS.summary, + maxEntries: entryLimits.summary, + sizeCalculation: estimateStringBytes, + reporter: outputCacheReporter + }); + return outputCacheReporter; +} + +export function getOutputCacheReporter() { + return outputCacheReporter; +} + +export function getFileTextCache() { + return fileTextCache; +} + +export function getSummaryCache() { + return summaryCache; +} diff --git a/src/retrieval/output/context.js b/src/retrieval/output/context.js new file mode 100644 index 000000000..0362c0aae --- /dev/null +++ b/src/retrieval/output/context.js @@ -0,0 +1,10 @@ +export function cleanContext(lines) { + return lines + .filter((line) => { + const trimmed = line.trim(); + if (!trimmed || trimmed === '```') return false; + if (!/[a-zA-Z0-9]/.test(trimmed)) return false; + return true; + }) + .map((line) => line.replace(/\s+/g, ' ').trim()); +} diff --git a/src/retrieval/output/explain.js b/src/retrieval/output/explain.js new file mode 100644 index 000000000..05a9b0d8a --- /dev/null +++ b/src/retrieval/output/explain.js @@ -0,0 +1,90 @@ +const formatExplainLine = (label, parts, color) => { + const filtered = parts.filter(Boolean); + if (!filtered.length) return null; + return color.gray(` ${label}: `) + filtered.join(', '); +}; + +export function formatScoreBreakdown(scoreBreakdown, color) { + if (!scoreBreakdown || typeof scoreBreakdown !== 'object') return []; + const lines = []; + const selected = scoreBreakdown.selected || null; + if (selected) { + const parts = []; + if (selected.type) parts.push(`type=${selected.type}`); + if (Number.isFinite(selected.score)) parts.push(`score=${selected.score.toFixed(4)}`); + const line = formatExplainLine('Score', parts, color); + if (line) lines.push(line); + } + const sparse = scoreBreakdown.sparse || null; + if (sparse) { + const parts = []; + if (sparse.type) parts.push(`type=${sparse.type}`); + if (Number.isFinite(sparse.score)) parts.push(`score=${sparse.score.toFixed(4)}`); + if (Number.isFinite(sparse.k1)) parts.push(`k1=${sparse.k1.toFixed(2)}`); + if (Number.isFinite(sparse.b)) parts.push(`b=${sparse.b.toFixed(2)}`); + if (sparse.normalized != null) parts.push(`normalized=${sparse.normalized}`); + if (sparse.profile) parts.push(`profile=${sparse.profile}`); + if (Array.isArray(sparse.weights) && sparse.weights.length) { + const weights = sparse.weights + .map((value) => (Number.isFinite(value) ? 
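The breakdown object this formatter consumes is produced by the retrieval pipeline later in this diff. A representative input and the kind of lines it yields (color wrappers elided):

```js
const breakdown = {
  sparse: { type: 'bm25', score: 7.3124, k1: 1.2, b: 0.75 },
  ann: { score: 0.8421, source: 'hnsw' },
  rrf: { k: 60, sparseRank: 1, annRank: 3, sparseRrf: 1 / 61, annRrf: 1 / 63, score: 1 / 61 + 1 / 63 }
};
// formatScoreBreakdown(breakdown, color) renders roughly:
//   Sparse: type=bm25, score=7.3124, k1=1.20, b=0.75
//   ANN: score=0.8421, source=hnsw
//   RRF: k=60, sparseRank=1, annRank=3, sparseScore=0.0164, annScore=0.0159, score=0.0323
```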
value.toFixed(2) : String(value))) + .join('/'); + parts.push(`weights=${weights}`); + } + const line = formatExplainLine('Sparse', parts, color); + if (line) lines.push(line); + } + const ann = scoreBreakdown.ann || null; + if (ann) { + const parts = []; + if (Number.isFinite(ann.score)) parts.push(`score=${ann.score.toFixed(4)}`); + if (ann.source) parts.push(`source=${ann.source}`); + const line = formatExplainLine('ANN', parts, color); + if (line) lines.push(line); + } + const rrf = scoreBreakdown.rrf || null; + if (rrf) { + const parts = []; + if (Number.isFinite(rrf.k)) parts.push(`k=${rrf.k}`); + if (Number.isFinite(rrf.sparseRank)) parts.push(`sparseRank=${rrf.sparseRank}`); + if (Number.isFinite(rrf.annRank)) parts.push(`annRank=${rrf.annRank}`); + if (Number.isFinite(rrf.sparseRrf)) parts.push(`sparseScore=${rrf.sparseRrf.toFixed(4)}`); + if (Number.isFinite(rrf.annRrf)) parts.push(`annScore=${rrf.annRrf.toFixed(4)}`); + if (Number.isFinite(rrf.score)) parts.push(`score=${rrf.score.toFixed(4)}`); + const line = formatExplainLine('RRF', parts, color); + if (line) lines.push(line); + } + const blend = scoreBreakdown.blend || null; + if (blend) { + const parts = []; + if (Number.isFinite(blend.score)) parts.push(`score=${blend.score.toFixed(4)}`); + if (Number.isFinite(blend.sparseNormalized)) parts.push(`sparseNorm=${blend.sparseNormalized.toFixed(4)}`); + if (Number.isFinite(blend.annNormalized)) parts.push(`annNorm=${blend.annNormalized.toFixed(4)}`); + if (Number.isFinite(blend.sparseWeight) || Number.isFinite(blend.annWeight)) { + const sparseWeight = Number.isFinite(blend.sparseWeight) ? blend.sparseWeight.toFixed(2) : '0.00'; + const annWeight = Number.isFinite(blend.annWeight) ? blend.annWeight.toFixed(2) : '0.00'; + parts.push(`weights=${sparseWeight}/${annWeight}`); + } + const line = formatExplainLine('Blend', parts, color); + if (line) lines.push(line); + } + const phrase = scoreBreakdown.phrase || null; + if (phrase) { + const parts = []; + if (Number.isFinite(phrase.matches)) parts.push(`matches=${phrase.matches}`); + if (Number.isFinite(phrase.boost)) parts.push(`boost=${phrase.boost.toFixed(4)}`); + if (Number.isFinite(phrase.factor)) parts.push(`factor=${phrase.factor.toFixed(2)}`); + const line = formatExplainLine('Phrase', parts, color); + if (line) lines.push(line); + } + const symbol = scoreBreakdown.symbol || null; + if (symbol) { + const parts = []; + if (typeof symbol.definition === 'boolean') parts.push(`definition=${symbol.definition}`); + if (typeof symbol.export === 'boolean') parts.push(`export=${symbol.export}`); + if (Number.isFinite(symbol.factor)) parts.push(`factor=${symbol.factor.toFixed(2)}`); + if (Number.isFinite(symbol.boost)) parts.push(`boost=${symbol.boost.toFixed(4)}`); + const line = formatExplainLine('Symbol', parts, color); + if (line) lines.push(line); + } + return lines; +} diff --git a/src/retrieval/output/filters.js b/src/retrieval/output/filters.js new file mode 100644 index 000000000..6b3d8f4a5 --- /dev/null +++ b/src/retrieval/output/filters.js @@ -0,0 +1,635 @@ +import path from 'node:path'; +import { extractNgrams, tri } from '../../shared/tokenize.js'; +import { createSafeRegex, normalizeSafeRegexConfig } from '../../shared/safe-regex.js'; +import { + bitmapToSet, + createBitmapFromIds, + intersectBitmaps, + intersectSetWithBitmap, + isBitmapEmpty, + isRoaringAvailable, + unionBitmaps +} from '../bitmap.js'; + +/** + * Filter chunk metadata by search constraints. 
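A hypothetical invocation, using filter keys taken from the destructuring below; `/pattern/flags` strings in `file` become safe regexes via `parseFileMatcher`, and the timestamp math is illustrative:

```js
// Keep async function chunks in .ts files under src/ touched in the last week.
const hits = filterChunks(chunkMeta, {
  file: ['/^src\\//'],
  ext: 'ts',                 // normalized to '.ts'
  type: 'FunctionDeclaration',
  async: true,
  modifiedAfter: Date.now() - 7 * 24 * 3600 * 1000
}, filterIndex, fileRelations);
```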
+ * @param {Array} meta + * @param {object} filters + * @returns {Array} + */ +export function filterChunks(meta, filters = {}, filterIndex = null, fileRelations = null) { + const { + type, + author, + importName, + lint, + churn, + calls, + uses, + signature, + param, + decorator, + returnType, + throws, + reads, + writes, + mutates, + alias, + risk, + riskTag, + riskSource, + riskSink, + riskCategory, + riskFlow, + structPack, + structRule, + structTag, + awaits, + branches, + loops, + breaks, + continues, + inferredType, + visibility, + extends: extendsFilter, + async: asyncOnly, + generator: generatorOnly, + returns: returnsOnly, + file, + caseFile, + caseTokens, + ext, + meta: metaFilter, + chunkAuthor, + modifiedAfter, + excludeTokens, + excludePhrases, + excludePhraseRange + } = filters; + const normalize = (value) => String(value || '').toLowerCase(); + const normalizeFilePath = (value) => String(value || '').replace(/\\/g, '/'); + const normalizeFile = (value) => ( + caseFile ? normalizeFilePath(value) : normalize(normalizeFilePath(value)) + ); + const normalizeFilePrefilter = (value) => normalizeFilePath(value).toLowerCase(); + const safeRegexConfig = normalizeSafeRegexConfig(filters.regexConfig || {}, { + flags: caseFile ? '' : 'i' + }); + const normalizeList = (value) => { + if (!value) return []; + const entries = Array.isArray(value) ? value : [value]; + return entries + .flatMap((entry) => String(entry || '').split(/[,\s]+/)) + .map((entry) => entry.trim()) + .filter(Boolean); + }; + const parseFileMatcher = (entry) => { + const raw = String(entry || '').trim(); + if (!raw) return null; + const regexMatch = raw.match(/^\/(.+)\/([a-z]*)$/i); + if (regexMatch) { + const pattern = regexMatch[1]; + const flags = regexMatch[2] || ''; + const matcher = createSafeRegex(pattern, flags, safeRegexConfig); + if (matcher) return { type: 'regex', value: matcher }; + return { type: 'substring', value: normalizeFile(raw) }; + } + return { type: 'substring', value: normalizeFile(raw) }; + }; + const fileMatchers = normalizeList(file).map(parseFileMatcher).filter(Boolean); + const filePrefilterConfig = filters.filePrefilter || {}; + const filePrefilterEnabled = filePrefilterConfig.enabled !== false; + const fileChargramN = Number.isFinite(Number(filePrefilterConfig.chargramN)) + ? Math.max(2, Math.floor(Number(filePrefilterConfig.chargramN))) + : (filterIndex?.fileChargramN || 3); + const extNeedles = normalizeList(ext) + .map((entry) => { + let value = entry.toLowerCase(); + value = value.replace(/^\*+/, ''); + if (value && !value.startsWith('.')) value = `.${value}`; + return value; + }) + .filter(Boolean); + const typeNeedles = normalizeList(type).map(normalize); + const authorNeedles = normalizeList(author).map(normalize); + const metaFilters = Array.isArray(metaFilter) ? metaFilter : (metaFilter ? [metaFilter] : []); + const excludeNeedles = normalizeList(excludeTokens).map((value) => (caseTokens ? String(value || '') : normalize(value))); + const excludePhraseNeedles = normalizeList(excludePhrases).map((value) => (caseTokens ? String(value || '') : normalize(value))); + const structPackNeedles = normalizeList(structPack).map(normalize); + const structRuleNeedles = normalizeList(structRule).map(normalize); + const structTagNeedles = normalizeList(structTag).map(normalize); + const roaringAvailable = isRoaringAvailable(); + const bitmapIndex = filterIndex?.bitmap || null; + const buildCandidate = (sets, bitmaps) => { + const setList = Array.isArray(sets) ? 
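The `buildCandidate` helper opening here normalizes mixed inputs into either a single roaring bitmap or a single plain `Set`. Its Set-only fallback, when roaring support is unavailable, is an ordinary union:

```js
const sets = [new Set([1, 2]), new Set([2, 3])];
const out = new Set();
for (const set of sets) {
  for (const id of set) out.add(id);
}
// out -> Set { 1, 2, 3 }; with roaring available the same union stays in
// compressed bitmap space via createBitmapFromIds/unionBitmaps.
```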
sets.filter(Boolean) : []; + const bitmapList = Array.isArray(bitmaps) ? bitmaps.filter(Boolean) : []; + if (!setList.length && !bitmapList.length) return null; + if (roaringAvailable) { + let bitmap = bitmapList.length ? unionBitmaps(bitmapList) : null; + if (setList.length) { + const ids = []; + for (const set of setList) { + for (const id of set) ids.push(id); + } + const extraBitmap = createBitmapFromIds(ids, { force: true }); + if (extraBitmap) { + bitmap = bitmap ? unionBitmaps([bitmap, extraBitmap]) : extraBitmap; + } + } + if (bitmap) return { bitmap }; + } + const out = new Set(); + for (const set of setList) { + for (const id of set) out.add(id); + } + for (const bitmap of bitmapList) { + for (const id of bitmapToSet(bitmap)) out.add(id); + } + return { set: out }; + }; + const mergeCandidates = (candidates) => { + if (!Array.isArray(candidates) || !candidates.length) return null; + const sets = []; + const bitmaps = []; + for (const candidate of candidates) { + if (!candidate) continue; + if (candidate.bitmap) bitmaps.push(candidate.bitmap); + if (candidate.set) sets.push(candidate.set); + } + return buildCandidate(sets, bitmaps); + }; + const collectExactMatches = (map, values, bitmapMap = null) => { + if (!map || !values.length) return null; + const sets = []; + const bitmaps = []; + for (const value of values) { + if (!value) continue; + const set = map.get(value); + if (!set) continue; + const bitmap = bitmapMap ? bitmapMap.get(value) : null; + if (bitmap) { + bitmaps.push(bitmap); + } else { + sets.push(set); + } + } + if (!sets.length && !bitmaps.length) return { set: new Set() }; + return buildCandidate(sets, bitmaps); + }; + const collectSubstringMatches = (map, needle, bitmapMap = null) => { + if (!map || !needle) return null; + const sets = []; + const bitmaps = []; + for (const [key, set] of map.entries()) { + if (!key.includes(needle)) continue; + const bitmap = bitmapMap ? 
bitmapMap.get(key) : null; + if (bitmap) { + bitmaps.push(bitmap); + } else { + sets.push(set); + } + } + if (!sets.length && !bitmaps.length) return { set: new Set() }; + return buildCandidate(sets, bitmaps); + }; + const collectAnySubstringMatches = (map, values, bitmapMap = null) => { + if (!map || !values.length) return null; + const candidates = values + .map((value) => collectSubstringMatches(map, value, bitmapMap)) + .filter(Boolean); + return mergeCandidates(candidates); + }; + const intersectSets = (sets) => { + if (!sets.length) return null; + let acc = sets[0]; + for (let i = 1; i < sets.length; i += 1) { + const next = sets[i]; + const merged = new Set(); + for (const id of acc) { + if (next.has(id)) merged.add(id); + } + acc = merged; + if (!acc.size) break; + } + return acc; + }; + const intersectTwoSets = (left, right) => { + if (!left || !right) return new Set(); + const out = new Set(); + for (const id of left) { + if (right.has(id)) out.add(id); + } + return out; + }; + const intersectCandidates = (candidates) => { + if (!Array.isArray(candidates) || !candidates.length) return null; + const sets = []; + const bitmaps = []; + for (const candidate of candidates) { + if (!candidate) continue; + if (candidate.set) { + if (!candidate.set.size) return new Set(); + sets.push(candidate.set); + } + if (candidate.bitmap) { + if (isBitmapEmpty(candidate.bitmap)) return new Set(); + bitmaps.push(candidate.bitmap); + } + } + if (bitmaps.length) { + let bitmap = intersectBitmaps(bitmaps); + if (!bitmap || isBitmapEmpty(bitmap)) return new Set(); + if (sets.length) { + const setIntersection = intersectSets(sets); + if (!setIntersection || !setIntersection.size) return new Set(); + const setBitmap = createBitmapFromIds(setIntersection, { force: true }); + if (setBitmap) { + bitmap = intersectBitmaps([bitmap, setBitmap]); + return bitmap ? bitmapToSet(bitmap) : new Set(); + } + return intersectSetWithBitmap(setIntersection, bitmap); + } + return bitmapToSet(bitmap); + } + return intersectSets(sets); + }; + const extractRegexLiteral = (pattern) => { + let best = ''; + let current = ''; + let escaped = false; + for (const ch of pattern) { + if (escaped) { + current += ch; + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if ('^$.*+?()[]{}|'.includes(ch)) { + if (current.length > best.length) best = current; + current = ''; + continue; + } + current += ch; + } + if (current.length > best.length) best = current; + return best; + }; + const collectFilePrefilterMatches = () => { + if (!fileMatchers.length || !filterIndex || !filterIndex.fileChargrams || !filterIndex.fileChunksById) { + return null; + } + const fileIds = new Set(); + for (const matcher of fileMatchers) { + let needle = null; + if (matcher.type === 'substring') { + needle = normalizeFilePrefilter(matcher.value); + } else if (matcher.type === 'regex') { + const literal = extractRegexLiteral(matcher.value.source || ''); + needle = literal ? normalizeFilePrefilter(literal) : null; + } + if (!needle || needle.length < fileChargramN) continue; + const grams = tri(needle, fileChargramN); + if (!grams.length) continue; + let candidateFiles = null; + for (const gram of grams) { + const bucket = filterIndex.fileChargrams.get(gram); + if (!bucket) { + candidateFiles = new Set(); + break; + } + candidateFiles = candidateFiles ? 
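This loop is the heart of the chargram prefilter: a file stays a candidate only if every n-gram of the needle hits its bucket. A self-contained sketch with a hand-rolled trigram helper (`tri` from `shared/tokenize.js` is assumed to emit sliding n-grams like this):

```js
const tri3 = (s) => Array.from({ length: Math.max(0, s.length - 2) }, (_, i) => s.slice(i, i + 3));

const fileChargrams = new Map([
  ['src', new Set([0])], ['rc/', new Set([0])], ['c/a', new Set([0])],
  ['/ap', new Set([0])], ['app', new Set([0, 1])]
]);
let candidateFiles = null;
for (const gram of tri3('src/app')) {
  const bucket = fileChargrams.get(gram);
  if (!bucket) { candidateFiles = new Set(); break; } // any miss disqualifies
  candidateFiles = candidateFiles
    ? new Set([...candidateFiles].filter((id) => bucket.has(id)))
    : new Set(bucket);
}
// candidateFiles -> Set { 0 }: only file 0 contains every trigram of 'src/app'.
```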
intersectTwoSets(candidateFiles, bucket) : new Set(bucket); + if (!candidateFiles.size) break; + } + if (!candidateFiles || !candidateFiles.size) continue; + for (const fileId of candidateFiles) { + fileIds.add(fileId); + } + } + if (!fileIds.size) return null; + const chunkIds = new Set(); + for (const fileId of fileIds) { + const chunks = filterIndex.fileChunksById[fileId]; + if (!chunks) continue; + for (const id of chunks) chunkIds.add(id); + } + return chunkIds; + }; + const matchList = (list, value) => { + if (!value) return true; + if (!Array.isArray(list)) return false; + const needle = normalize(value); + return list.some((entry) => normalize(entry).includes(needle)); + }; + const matchInferredType = (inferred, value) => { + if (!value) return true; + if (!inferred) return false; + const needle = normalize(value); + const types = []; + const collect = (entries) => { + if (!Array.isArray(entries)) return; + for (const entry of entries) { + if (entry?.type) types.push(entry.type); + } + }; + const collectMap = (map) => { + if (!map || typeof map !== 'object') return; + Object.values(map).forEach((entries) => collect(entries)); + }; + collectMap(inferred.params); + collectMap(inferred.fields); + collectMap(inferred.locals); + collect(inferred.returns); + if (!types.length) return false; + return types.some((entry) => normalize(entry).includes(needle)); + }; + const matchStructural = (chunk) => { + if (!structPackNeedles.length && !structRuleNeedles.length && !structTagNeedles.length) { + return true; + } + const structural = chunk?.docmeta?.structural; + if (!Array.isArray(structural) || !structural.length) return false; + return structural.some((entry) => { + if (structPackNeedles.length) { + const packValue = normalize(entry?.pack || ''); + if (!structPackNeedles.some((needle) => packValue.includes(needle))) return false; + } + if (structRuleNeedles.length) { + const ruleValue = normalize(entry?.ruleId || ''); + if (!structRuleNeedles.some((needle) => ruleValue.includes(needle))) return false; + } + if (structTagNeedles.length) { + const tags = Array.isArray(entry?.tags) ? entry.tags : []; + if (!tags.some((tag) => + structTagNeedles.some((needle) => normalize(tag).includes(needle)) + )) { + return false; + } + } + return true; + }); + }; + const truthy = (value) => value === true; + const resolveMetaField = (record, key) => { + if (!record || typeof record !== 'object' || !key) return undefined; + if (!key.includes('.')) return record[key]; + return key.split('.').reduce((acc, part) => (acc && typeof acc === 'object' ? 
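`resolveMetaField` (completing here) walks dotted keys so triage meta filters can address nested record fields. For instance, with an invented record shape:

```js
const record = { package: { name: 'lodash', ecosystem: 'npm' } };
const resolve = (rec, key) => key.split('.')
  .reduce((acc, part) => (acc && typeof acc === 'object' ? acc[part] : undefined), rec);

resolve(record, 'package.name');    // -> 'lodash'
resolve(record, 'package.missing'); // -> undefined, so that filter rejects the chunk
```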
acc[part] : undefined), record); + }; + const matchMetaFilters = (chunk) => { + if (!metaFilters.length) return true; + const recordMeta = chunk?.docmeta?.record; + if (!recordMeta || typeof recordMeta !== 'object') return false; + for (const filter of metaFilters) { + const key = filter?.key; + if (!key) continue; + const value = filter?.value; + const field = resolveMetaField(recordMeta, key); + if (value == null || value === '') { + if (field == null) return false; + if (Array.isArray(field) && field.length === 0) return false; + if (typeof field === 'string' && !field.trim()) return false; + continue; + } + const needle = normalize(value); + if (Array.isArray(field)) { + if (!field.some((entry) => normalize(entry).includes(needle))) return false; + } else if (field && typeof field === 'object') { + if (!normalize(JSON.stringify(field)).includes(needle)) return false; + } else if (!normalize(field).includes(needle)) { + return false; + } + } + return true; + }; + const resolveFileRelations = (file) => { + if (!file || !fileRelations) return null; + if (typeof fileRelations.get === 'function') { + return fileRelations.get(file) || null; + } + return fileRelations[file] || null; + }; + const normalizeToken = caseTokens ? (value) => String(value || '') : normalize; + + const indexedCandidates = []; + if (filterIndex) { + if (extNeedles.length && filterIndex.byExt) { + const candidate = collectExactMatches( + filterIndex.byExt, + extNeedles, + bitmapIndex?.byExt + ); + if (candidate) indexedCandidates.push(candidate); + } + if (typeNeedles.length && filterIndex.byKind) { + const candidate = collectExactMatches( + filterIndex.byKind, + typeNeedles, + bitmapIndex?.byKind + ); + if (candidate) indexedCandidates.push(candidate); + } + if (authorNeedles.length && filterIndex.byAuthor) { + const candidate = collectAnySubstringMatches( + filterIndex.byAuthor, + authorNeedles, + bitmapIndex?.byAuthor + ); + if (candidate) indexedCandidates.push(candidate); + } + if (chunkAuthor && filterIndex.byChunkAuthor) { + const candidate = collectSubstringMatches( + filterIndex.byChunkAuthor, + normalize(chunkAuthor), + bitmapIndex?.byChunkAuthor + ); + if (candidate) indexedCandidates.push(candidate); + } + if (visibility && filterIndex.byVisibility) { + const candidate = collectSubstringMatches( + filterIndex.byVisibility, + normalize(visibility), + bitmapIndex?.byVisibility + ); + if (candidate) indexedCandidates.push(candidate); + } + if (fileMatchers.length && filePrefilterEnabled) { + const filePrefilterIds = collectFilePrefilterMatches(); + if (filePrefilterIds) { + const candidate = buildCandidate([filePrefilterIds], []); + if (candidate) indexedCandidates.push(candidate); + } + } + } + const candidateIds = indexedCandidates.length + ? intersectCandidates(indexedCandidates) + : null; + const sourceMeta = candidateIds + ? 
Array.from(candidateIds).map((id) => meta[id]).filter(Boolean) + : meta; + + return sourceMeta.filter((c) => { + if (!c) return false; + if (fileMatchers.length) { + const fileValue = String(c.file || ''); + const fileValueNorm = normalizeFile(fileValue); + const matches = fileMatchers.some((matcher) => { + if (matcher.type === 'regex') { + matcher.value.lastIndex = 0; + return matcher.value.test(fileValue); + } + return fileValueNorm.includes(matcher.value); + }); + if (!matches) return false; + } + if (extNeedles.length) { + const extValue = normalize(c.ext || path.extname(c.file || '')); + if (!extNeedles.includes(extValue)) return false; + } + if (!matchMetaFilters(c)) return false; + if (excludeNeedles.length || excludePhraseNeedles.length) { + const tokens = Array.isArray(c.tokens) ? c.tokens : []; + let ngrams = Array.isArray(c.ngrams) ? c.ngrams : null; + if (!ngrams && excludePhraseNeedles.length && tokens.length && excludePhraseRange?.min && excludePhraseRange?.max) { + ngrams = extractNgrams(tokens, excludePhraseRange.min, excludePhraseRange.max); + } + const tokenSet = new Set(tokens.map(normalizeToken)); + const ngramSet = new Set((ngrams || []).map(normalizeToken)); + const tokenMatch = excludeNeedles.some((needle) => tokenSet.has(needle) || ngramSet.has(needle)); + if (tokenMatch) return false; + if (excludePhraseNeedles.some((needle) => ngramSet.has(needle))) return false; + } + if (modifiedAfter != null) { + const lastModified = c.last_modified ? Date.parse(c.last_modified) : NaN; + if (!Number.isFinite(lastModified) || lastModified < modifiedAfter) return false; + } + if (typeNeedles.length) { + const kindValue = c.kind; + if (!kindValue) return false; + const kinds = Array.isArray(kindValue) ? kindValue : [kindValue]; + const matches = kinds.some((entry) => typeNeedles.includes(normalize(entry))); + if (!matches) return false; + } + if (authorNeedles.length) { + const authorValue = c.last_author; + if (!authorValue) return false; + const authors = Array.isArray(authorValue) ? 
authorValue : [authorValue]; + const matches = authorNeedles.some((needle) => + authors.some((entry) => normalize(entry).includes(needle)) + ); + if (!matches) return false; + } + if (chunkAuthor && !matchList(c.chunk_authors, chunkAuthor)) return false; + if (importName) { + const imports = c.codeRelations?.imports || resolveFileRelations(c.file)?.imports; + if (!Array.isArray(imports) || !imports.includes(importName)) return false; + } + if (lint && (!c.lint || !c.lint.length)) return false; + if (churn !== null && churn !== undefined) { + const churnValue = Number(c.churn); + if (!Number.isFinite(churnValue) || churnValue < churn) return false; + } + if (calls) { + const callsList = c.codeRelations?.calls; + if (!Array.isArray(callsList)) return false; + const found = callsList.find(([fn, callName]) => fn === calls || callName === calls); + if (!found) return false; + } + if (uses) { + const usages = c.codeRelations?.usages || resolveFileRelations(c.file)?.usages; + if (!Array.isArray(usages)) return false; + if (!usages.includes(uses)) return false; + } + if (signature) { + const sig = c.docmeta?.signature; + if (!sig) return false; + if (!sig.includes(signature)) return false; + } + if (param) { + const params = c.docmeta?.params; + if (!Array.isArray(params)) return false; + if (!params.includes(param)) return false; + } + if (decorator && !matchList(c.docmeta?.decorators, decorator)) return false; + if (returnType) { + const foundReturnType = c.docmeta?.returnType || null; + if (!foundReturnType || !normalize(foundReturnType).includes(normalize(returnType))) { + return false; + } + } + if (inferredType && !matchInferredType(c.docmeta?.inferredTypes, inferredType)) { + return false; + } + if (throws && !matchList(c.docmeta?.throws, throws)) return false; + if (awaits && !matchList(c.docmeta?.awaits, awaits)) return false; + if (reads && !matchList(c.docmeta?.dataflow?.reads, reads)) return false; + if (writes && !matchList(c.docmeta?.dataflow?.writes, writes)) return false; + if (mutates && !matchList(c.docmeta?.dataflow?.mutations, mutates)) return false; + if (alias && !matchList(c.docmeta?.dataflow?.aliases, alias)) return false; + const riskMeta = c.docmeta?.risk || null; + const riskTagValue = riskTag || risk; + if (riskTagValue && !matchList(riskMeta?.tags, riskTagValue)) return false; + if (riskSource) { + const sourceNames = Array.isArray(riskMeta?.sources) + ? riskMeta.sources.map((source) => source.name) + : null; + if (!matchList(sourceNames, riskSource)) return false; + } + if (riskSink) { + const sinkNames = Array.isArray(riskMeta?.sinks) + ? riskMeta.sinks.map((sink) => sink.name) + : null; + if (!matchList(sinkNames, riskSink)) return false; + } + if (riskCategory) { + const categories = Array.isArray(riskMeta?.categories) + ? riskMeta.categories + : (Array.isArray(riskMeta?.sinks) ? riskMeta.sinks.map((sink) => sink.category) : null); + if (!matchList(categories, riskCategory)) return false; + } + if (riskFlow) { + const flows = Array.isArray(riskMeta?.flows) + ? 
riskMeta.flows.map((flow) => `${flow.source}->${flow.sink}`) + : null; + if (!matchList(flows, riskFlow)) return false; + } + if (!matchStructural(c)) return false; + if (branches != null) { + const count = c.docmeta?.controlFlow?.branches; + if (!Number.isFinite(count) || count < branches) return false; + } + if (loops != null) { + const count = c.docmeta?.controlFlow?.loops; + if (!Number.isFinite(count) || count < loops) return false; + } + if (breaks != null) { + const count = c.docmeta?.controlFlow?.breaks; + if (!Number.isFinite(count) || count < breaks) return false; + } + if (continues != null) { + const count = c.docmeta?.controlFlow?.continues; + if (!Number.isFinite(count) || count < continues) return false; + } + if (visibility) { + const docVisibility = c.docmeta?.visibility || c.docmeta?.modifiers?.visibility || null; + if (!docVisibility || !normalize(docVisibility).includes(normalize(visibility))) { + return false; + } + } + if (extendsFilter) { + const parents = c.docmeta?.extends || c.docmeta?.bases || []; + if (!matchList(parents, extendsFilter)) return false; + } + if (truthy(asyncOnly)) { + if (!(c.docmeta?.async || c.docmeta?.modifiers?.async)) return false; + } + if (truthy(generatorOnly)) { + if (!(c.docmeta?.modifiers?.generator || c.docmeta?.yields)) return false; + } + if (truthy(returnsOnly)) { + if (!(c.docmeta?.returnsValue || c.docmeta?.returns)) return false; + } + return true; + }); +} diff --git a/src/search/output.js b/src/retrieval/output/format.js similarity index 54% rename from src/search/output.js rename to src/retrieval/output/format.js index a3f44c44b..17f756e6b 100644 --- a/src/search/output.js +++ b/src/retrieval/output/format.js @@ -1,335 +1,6 @@ -import fs from 'node:fs'; -import path from 'node:path'; -import { extractNgrams } from '../shared/tokenize.js'; - -const fileTextCache = new Map(); -const summaryCache = new Map(); - -/** - * Filter chunk metadata by search constraints. - * @param {Array} meta - * @param {object} filters - * @returns {Array} - */ -export function filterChunks(meta, filters = {}) { - const { - type, - author, - importName, - lint, - churn, - calls, - uses, - signature, - param, - decorator, - returnType, - throws, - reads, - writes, - mutates, - alias, - risk, - riskTag, - riskSource, - riskSink, - riskCategory, - riskFlow, - awaits, - branches, - loops, - breaks, - continues, - inferredType, - visibility, - extends: extendsFilter, - async: asyncOnly, - generator: generatorOnly, - returns: returnsOnly, - file, - ext, - meta: metaFilter, - chunkAuthor, - modifiedAfter, - excludeTokens, - excludePhrases, - excludePhraseRange - } = filters; - const normalize = (value) => String(value || '').toLowerCase(); - const normalizeList = (value) => { - if (!value) return []; - const entries = Array.isArray(value) ? 
value : [value]; - return entries - .flatMap((entry) => String(entry || '').split(/[,\s]+/)) - .map((entry) => entry.trim()) - .filter(Boolean); - }; - const parseFileMatcher = (entry) => { - const raw = String(entry || '').trim(); - if (!raw) return null; - const regexMatch = raw.match(/^\/(.+)\/([a-z]*)$/i); - if (regexMatch) { - const pattern = regexMatch[1]; - let flags = regexMatch[2] || ''; - if (!flags.includes('i')) flags += 'i'; - try { - return { type: 'regex', value: new RegExp(pattern, flags) }; - } catch { - return { type: 'substring', value: normalize(raw) }; - } - } - return { type: 'substring', value: normalize(raw) }; - }; - const fileMatchers = normalizeList(file).map(parseFileMatcher).filter(Boolean); - const extNeedles = normalizeList(ext) - .map((entry) => { - let value = entry.toLowerCase(); - value = value.replace(/^\*+/, ''); - if (value && !value.startsWith('.')) value = `.${value}`; - return value; - }) - .filter(Boolean); - const metaFilters = Array.isArray(metaFilter) ? metaFilter : (metaFilter ? [metaFilter] : []); - const excludeNeedles = normalizeList(excludeTokens).map(normalize); - const excludePhraseNeedles = normalizeList(excludePhrases).map(normalize); - const matchList = (list, value) => { - if (!value) return true; - if (!Array.isArray(list)) return false; - const needle = normalize(value); - return list.some((entry) => normalize(entry).includes(needle)); - }; - const matchInferredType = (inferred, value) => { - if (!value) return true; - if (!inferred) return false; - const needle = normalize(value); - const types = []; - const collect = (entries) => { - if (!Array.isArray(entries)) return; - for (const entry of entries) { - if (entry?.type) types.push(entry.type); - } - }; - const collectMap = (map) => { - if (!map || typeof map !== 'object') return; - Object.values(map).forEach((entries) => collect(entries)); - }; - collectMap(inferred.params); - collectMap(inferred.fields); - collectMap(inferred.locals); - collect(inferred.returns); - if (!types.length) return false; - return types.some((entry) => normalize(entry).includes(needle)); - }; - const truthy = (value) => value === true; - const resolveMetaField = (record, key) => { - if (!record || typeof record !== 'object' || !key) return undefined; - if (!key.includes('.')) return record[key]; - return key.split('.').reduce((acc, part) => (acc && typeof acc === 'object' ? 
acc[part] : undefined), record); - }; - const matchMetaFilters = (chunk) => { - if (!metaFilters.length) return true; - const recordMeta = chunk?.docmeta?.record; - if (!recordMeta || typeof recordMeta !== 'object') return false; - for (const filter of metaFilters) { - const key = filter?.key; - if (!key) continue; - const value = filter?.value; - const field = resolveMetaField(recordMeta, key); - if (value == null || value === '') { - if (field == null) return false; - if (Array.isArray(field) && field.length === 0) return false; - if (typeof field === 'string' && !field.trim()) return false; - continue; - } - const needle = normalize(value); - if (Array.isArray(field)) { - if (!field.some((entry) => normalize(entry).includes(needle))) return false; - } else if (field && typeof field === 'object') { - if (!normalize(JSON.stringify(field)).includes(needle)) return false; - } else if (!normalize(field).includes(needle)) { - return false; - } - } - return true; - }; - - return meta.filter((c) => { - if (!c) return false; - if (fileMatchers.length) { - const fileValue = String(c.file || ''); - const fileValueLower = normalize(fileValue); - const matches = fileMatchers.some((matcher) => { - if (matcher.type === 'regex') { - matcher.value.lastIndex = 0; - return matcher.value.test(fileValue); - } - return fileValueLower.includes(matcher.value); - }); - if (!matches) return false; - } - if (extNeedles.length) { - const extValue = normalize(c.ext || path.extname(c.file || '')); - if (!extNeedles.includes(extValue)) return false; - } - if (!matchMetaFilters(c)) return false; - if (excludeNeedles.length || excludePhraseNeedles.length) { - const tokens = Array.isArray(c.tokens) ? c.tokens : []; - let ngrams = Array.isArray(c.ngrams) ? c.ngrams : null; - if (!ngrams && excludePhraseNeedles.length && tokens.length && excludePhraseRange?.min && excludePhraseRange?.max) { - ngrams = extractNgrams(tokens, excludePhraseRange.min, excludePhraseRange.max); - } - const tokenSet = new Set(tokens.map(normalize)); - const ngramSet = new Set((ngrams || []).map(normalize)); - const tokenMatch = excludeNeedles.some((needle) => tokenSet.has(needle) || ngramSet.has(needle)); - if (tokenMatch) return false; - if (excludePhraseNeedles.some((needle) => ngramSet.has(needle))) return false; - } - if (modifiedAfter != null) { - const lastModified = c.last_modified ? 
Date.parse(c.last_modified) : NaN; - if (!Number.isFinite(lastModified) || lastModified < modifiedAfter) return false; - } - if (type && c.kind && c.kind.toLowerCase() !== type.toLowerCase()) return false; - if (author && c.last_author && !c.last_author.toLowerCase().includes(author.toLowerCase())) return false; - if (chunkAuthor && !matchList(c.chunk_authors, chunkAuthor)) return false; - if (importName && c.codeRelations && c.codeRelations.imports) { - if (!c.codeRelations.imports.includes(importName)) return false; - } - if (lint && (!c.lint || !c.lint.length)) return false; - if (churn !== null && churn !== undefined) { - const churnValue = Number(c.churn); - if (!Number.isFinite(churnValue) || churnValue < churn) return false; - } - if (calls && c.codeRelations && c.codeRelations.calls) { - const found = c.codeRelations.calls.find(([fn, callName]) => fn === calls || callName === calls); - if (!found) return false; - } - if (uses && c.codeRelations && c.codeRelations.usages) { - if (!c.codeRelations.usages.includes(uses)) return false; - } - if (signature && c.docmeta?.signature) { - if (!c.docmeta.signature.includes(signature)) return false; - } - if (param && c.docmeta?.params) { - if (!c.docmeta.params.includes(param)) return false; - } - if (decorator && !matchList(c.docmeta?.decorators, decorator)) return false; - if (returnType) { - const foundReturnType = c.docmeta?.returnType || null; - if (!foundReturnType || !normalize(foundReturnType).includes(normalize(returnType))) { - return false; - } - } - if (inferredType && !matchInferredType(c.docmeta?.inferredTypes, inferredType)) { - return false; - } - if (throws && !matchList(c.docmeta?.throws, throws)) return false; - if (awaits && !matchList(c.docmeta?.awaits, awaits)) return false; - if (reads && !matchList(c.docmeta?.dataflow?.reads, reads)) return false; - if (writes && !matchList(c.docmeta?.dataflow?.writes, writes)) return false; - if (mutates && !matchList(c.docmeta?.dataflow?.mutations, mutates)) return false; - if (alias && !matchList(c.docmeta?.dataflow?.aliases, alias)) return false; - const riskMeta = c.docmeta?.risk || null; - const riskTagValue = riskTag || risk; - if (riskTagValue && !matchList(riskMeta?.tags, riskTagValue)) return false; - if (riskSource) { - const sourceNames = Array.isArray(riskMeta?.sources) - ? riskMeta.sources.map((source) => source.name) - : null; - if (!matchList(sourceNames, riskSource)) return false; - } - if (riskSink) { - const sinkNames = Array.isArray(riskMeta?.sinks) - ? riskMeta.sinks.map((sink) => sink.name) - : null; - if (!matchList(sinkNames, riskSink)) return false; - } - if (riskCategory) { - const categories = Array.isArray(riskMeta?.categories) - ? riskMeta.categories - : (Array.isArray(riskMeta?.sinks) ? riskMeta.sinks.map((sink) => sink.category) : null); - if (!matchList(categories, riskCategory)) return false; - } - if (riskFlow) { - const flows = Array.isArray(riskMeta?.flows) - ? 
riskMeta.flows.map((flow) => `${flow.source}->${flow.sink}`) - : null; - if (!matchList(flows, riskFlow)) return false; - } - if (branches != null) { - const count = c.docmeta?.controlFlow?.branches; - if (!Number.isFinite(count) || count < branches) return false; - } - if (loops != null) { - const count = c.docmeta?.controlFlow?.loops; - if (!Number.isFinite(count) || count < loops) return false; - } - if (breaks != null) { - const count = c.docmeta?.controlFlow?.breaks; - if (!Number.isFinite(count) || count < breaks) return false; - } - if (continues != null) { - const count = c.docmeta?.controlFlow?.continues; - if (!Number.isFinite(count) || count < continues) return false; - } - if (visibility) { - const docVisibility = c.docmeta?.visibility || c.docmeta?.modifiers?.visibility || null; - if (!docVisibility || !normalize(docVisibility).includes(normalize(visibility))) { - return false; - } - } - if (extendsFilter) { - const parents = c.docmeta?.extends || c.docmeta?.bases || []; - if (!matchList(parents, extendsFilter)) return false; - } - if (truthy(asyncOnly)) { - if (!(c.docmeta?.async || c.docmeta?.modifiers?.async)) return false; - } - if (truthy(generatorOnly)) { - if (!(c.docmeta?.modifiers?.generator || c.docmeta?.yields)) return false; - } - if (truthy(returnsOnly)) { - if (!(c.docmeta?.returnsValue || c.docmeta?.returns)) return false; - } - return true; - }); -} - -/** - * Normalize context lines for display. - * @param {string[]} lines - * @returns {string[]} - */ -export function cleanContext(lines) { - return lines - .filter((line) => { - const trimmed = line.trim(); - if (!trimmed || trimmed === '```') return false; - if (!/[a-zA-Z0-9]/.test(trimmed)) return false; - return true; - }) - .map((line) => line.replace(/\s+/g, ' ').trim()); -} - -function getBodySummary(rootDir, chunk, maxWords = 80) { - try { - const absPath = path.join(rootDir, chunk.file); - const cacheKey = `${absPath}:${chunk.start}:${chunk.end}:${maxWords}`; - if (summaryCache.has(cacheKey)) return summaryCache.get(cacheKey); - let text = fileTextCache.get(absPath); - if (!text) { - text = fs.readFileSync(absPath, 'utf8'); - fileTextCache.set(absPath, text); - } - const chunkText = text.slice(chunk.start, chunk.end) - .replace(/\s+/g, ' ') - .trim(); - const words = chunkText.split(/\s+/).slice(0, maxWords).join(' '); - summaryCache.set(cacheKey, words); - return words; - } catch { - return '(Could not load summary)'; - } -} +import { cleanContext } from './context.js'; +import { formatScoreBreakdown } from './explain.js'; +import { getBodySummary } from './summary.js'; const formatInferredEntry = (entry) => { if (!entry?.type) return ''; @@ -371,6 +42,7 @@ export function formatFullChunk({ mode, score, scoreType, + explain = false, color, queryTokens = [], rx, @@ -398,6 +70,13 @@ export function formatFullChunk({ out += line1 + '\n'; + if (explain && chunk.scoreBreakdown) { + const explainLines = formatScoreBreakdown(chunk.scoreBreakdown, c); + if (explainLines.length) { + out += explainLines.join('\n') + '\n'; + } + } + const headlinePart = chunk.headline ? c.bold('Headline: ') + c.underline(chunk.headline) : ''; const lastModPart = chunk.last_modified ? 
c.gray('Last Modified: ') + c.bold(chunk.last_modified) : ''; const secondLine = [headlinePart, lastModPart].filter(Boolean).join(' '); @@ -438,11 +117,31 @@ export function formatFullChunk({ out += c.yellow(' CallSummary: ') + summaries.join(', ') + '\n'; } - if (chunk.codeRelations?.importLinks?.length) { + if (chunk.importLinks?.length) { + out += c.green(' ImportLinks: ') + chunk.importLinks.join(', ') + '\n'; + } else if (chunk.codeRelations?.importLinks?.length) { out += c.green(' ImportLinks: ') + chunk.codeRelations.importLinks.join(', ') + '\n'; } - if (chunk.codeRelations?.usages?.length) { + if (chunk.usages?.length) { + const usageFreq = Object.create(null); + chunk.usages.forEach((raw) => { + const trimmed = typeof raw === 'string' ? raw.trim() : ''; + if (!trimmed) return; + usageFreq[trimmed] = (usageFreq[trimmed] || 0) + 1; + }); + + const usageEntries = Object.entries(usageFreq).sort((a, b) => b[1] - a[1]); + const maxCount = usageEntries[0]?.[1] || 0; + + const usageStr = usageEntries.slice(0, 10).map(([usage, count]) => { + if (count === 1) return usage; + if (count === maxCount) return c.bold(c.yellow(`${usage} (${count})`)); + return c.cyan(`${usage} (${count})`); + }).join(', '); + + if (usageStr.length) out += c.cyan(' Usages: ') + usageStr + '\n'; + } else if (chunk.codeRelations?.usages?.length) { const usageFreq = Object.create(null); chunk.codeRelations.usages.forEach((raw) => { const trimmed = typeof raw === 'string' ? raw.trim() : ''; @@ -661,6 +360,7 @@ export function formatShortChunk({ mode, score, scoreType, + explain = false, color, queryTokens = [], rx, @@ -708,6 +408,13 @@ export function formatShortChunk({ } } + if (explain && chunk.scoreBreakdown) { + const explainLines = formatScoreBreakdown(chunk.scoreBreakdown, color); + if (explainLines.length) { + out += '\n' + explainLines.join('\n'); + } + } + out += '\n'; return out; } diff --git a/src/retrieval/output/summary.js b/src/retrieval/output/summary.js new file mode 100644 index 000000000..33b72a076 --- /dev/null +++ b/src/retrieval/output/summary.js @@ -0,0 +1,27 @@ +import path from 'node:path'; +import { readTextFileSync } from '../../shared/encoding.js'; +import { getFileTextCache, getSummaryCache } from './cache.js'; + +export function getBodySummary(rootDir, chunk, maxWords = 80) { + try { + const absPath = path.join(rootDir, chunk.file); + const cacheKey = `${absPath}:${chunk.start}:${chunk.end}:${maxWords}`; + const summaryCache = getSummaryCache(); + const fileTextCache = getFileTextCache(); + const cached = summaryCache.get(cacheKey); + if (cached != null) return cached; + let text = fileTextCache.get(absPath); + if (text == null) { + ({ text } = readTextFileSync(absPath)); + fileTextCache.set(absPath, text); + } + const chunkText = text.slice(chunk.start, chunk.end) + .replace(/\s+/g, ' ') + .trim(); + const words = chunkText.split(/\s+/).slice(0, maxWords).join(' '); + summaryCache.set(cacheKey, words); + return words; + } catch { + return '(Could not load summary)'; + } +} diff --git a/src/retrieval/pipeline.js b/src/retrieval/pipeline.js new file mode 100644 index 000000000..a2c76a12e --- /dev/null +++ b/src/retrieval/pipeline.js @@ -0,0 +1,498 @@ +import { filterChunks } from './output.js'; +import { hasActiveFilters } from './filters.js'; +import { rankBM25, rankBM25Fields, rankDenseVectors, rankMinhash } from './rankers.js'; +import { extractNgrams, tri } from '../shared/tokenize.js'; +import { rankHnswIndex } from '../shared/hnsw.js'; + +const SQLITE_IN_LIMIT = 900; + +/** + *
Create a search pipeline runner bound to a shared context. + * @param {object} context + * @returns {(idx:object, mode:'code'|'prose'|'records'|'extracted-prose', queryEmbedding:number[]|null)=>Array} + */ +export function createSearchPipeline(context) { + const { + useSqlite, + sqliteFtsRequested, + sqliteFtsNormalize, + sqliteFtsProfile, + sqliteFtsWeights, + bm25K1, + bm25B, + fieldWeights, + postingsConfig, + queryTokens, + phraseNgramSet, + phraseRange, + symbolBoost, + filters, + filtersActive, + topN, + annEnabled, + scoreBlend, + minhashMaxDocs, + vectorAnnState, + vectorAnnUsed, + hnswAnnState, + hnswAnnUsed, + buildCandidateSetSqlite, + getTokenIndexForQuery, + rankSqliteFts, + rankVectorAnnSqlite, + rrf + } = context; + const blendEnabled = scoreBlend?.enabled === true; + const blendSparseWeight = Number.isFinite(Number(scoreBlend?.sparseWeight)) + ? Number(scoreBlend.sparseWeight) + : 1; + const blendAnnWeight = Number.isFinite(Number(scoreBlend?.annWeight)) + ? Number(scoreBlend.annWeight) + : 1; + const symbolBoostEnabled = symbolBoost?.enabled !== false; + const symbolBoostDefinitionWeight = Number.isFinite(Number(symbolBoost?.definitionWeight)) + ? Number(symbolBoost.definitionWeight) + : 1.15; + const symbolBoostExportWeight = Number.isFinite(Number(symbolBoost?.exportWeight)) + ? Number(symbolBoost.exportWeight) + : 1.1; + const rrfEnabled = rrf?.enabled !== false; + const rrfK = Number.isFinite(Number(rrf?.k)) + ? Math.max(1, Number(rrf.k)) + : 60; + const minhashLimit = Number.isFinite(Number(minhashMaxDocs)) && Number(minhashMaxDocs) > 0 + ? Number(minhashMaxDocs) + : null; + const chargramMaxTokenLength = postingsConfig?.chargramMaxTokenLength == null + ? null + : Math.max(2, Math.floor(Number(postingsConfig.chargramMaxTokenLength))); + const fieldWeightsEnabled = fieldWeights + && Object.values(fieldWeights).some((value) => Number.isFinite(Number(value)) && Number(value) > 0); + + const isDefinitionKind = (kind) => typeof kind === 'string' + && /Declaration|Definition|Initializer|Deinitializer/.test(kind); + + const isExportedChunk = (chunk) => { + if (!chunk) return false; + if (chunk.exported === true || chunk?.meta?.exported === true) return true; + const kind = chunk.kind || ''; + if (typeof kind === 'string' && kind.includes('Export')) return true; + const exportsList = Array.isArray(chunk.exports) + ? chunk.exports + : (Array.isArray(chunk?.meta?.exports) ? chunk.meta.exports : null); + if (!exportsList || !chunk.name) return false; + return exportsList.includes(chunk.name); + }; + + /** + * Build a candidate set from file-backed indexes (or SQLite). 
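An example of the expansion this helper performs, assuming `extractNgrams` joins adjacent tokens with single spaces (consistent with the phrase sets used elsewhere in this diff):

```js
// Query tokens -> phrase n-grams for phraseMinN = 2, phraseMaxN = 3.
const tokens = ['open', 'file', 'handle'];
const ngrams = [];
for (let n = 2; n <= 3; n += 1) {
  for (let i = 0; i + n <= tokens.length; i += 1) {
    ngrams.push(tokens.slice(i, i + n).join(' '));
  }
}
// -> ['open file', 'file handle', 'open file handle']; every n-gram (plus each
// token's chargrams) that hits a vocab entry unions its posting list into the
// candidate set.
```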
+ * @param {object} idx + * @param {string[]} tokens + * @param {'code'|'prose'|'records'|'extracted-prose'} mode + * @returns {Set|null} + */ + function buildCandidateSet(idx, tokens, mode) { + if (useSqlite && (mode === 'code' || mode === 'prose')) { + return buildCandidateSetSqlite(mode, tokens); + } + + const candidates = new Set(); + let matched = false; + + if (postingsConfig.enablePhraseNgrams !== false && idx.phraseNgrams?.vocab && idx.phraseNgrams?.postings) { + const vocabIndex = idx.phraseNgrams.vocabIndex + || (idx.phraseNgrams.vocabIndex = new Map(idx.phraseNgrams.vocab.map((t, i) => [t, i]))); + const ngrams = extractNgrams(tokens, postingsConfig.phraseMinN, postingsConfig.phraseMaxN); + for (const ng of ngrams) { + const hit = vocabIndex.get(ng); + if (hit === undefined) continue; + const posting = idx.phraseNgrams.postings[hit] || []; + posting.forEach((id) => candidates.add(id)); + matched = matched || posting.length > 0; + } + } + + if (postingsConfig.enableChargrams !== false && idx.chargrams?.vocab && idx.chargrams?.postings) { + const vocabIndex = idx.chargrams.vocabIndex + || (idx.chargrams.vocabIndex = new Map(idx.chargrams.vocab.map((t, i) => [t, i]))); + for (const token of tokens) { + if (chargramMaxTokenLength && token.length > chargramMaxTokenLength) continue; + for (let n = postingsConfig.chargramMinN; n <= postingsConfig.chargramMaxN; n++) { + for (const gram of tri(token, n)) { + const hit = vocabIndex.get(gram); + if (hit === undefined) continue; + const posting = idx.chargrams.postings[hit] || []; + posting.forEach((id) => candidates.add(id)); + matched = matched || posting.length > 0; + } + } + } + } + + return matched ? candidates : null; + } + + function getPhraseMatchInfo(chunk, phraseSet, range) { + if (!phraseSet || !phraseSet.size || !chunk) return { matches: 0 }; + let ngrams = Array.isArray(chunk.ngrams) && chunk.ngrams.length ? chunk.ngrams : null; + if (!ngrams && Array.isArray(chunk.tokens) && range?.min && range?.max) { + ngrams = extractNgrams(chunk.tokens, range.min, range.max); + } + if (!ngrams || !ngrams.length) return { matches: 0 }; + let matches = 0; + for (const ng of ngrams) { + if (phraseSet.has(ng)) matches += 1; + } + return { matches }; + } + + /** + * Execute the full search pipeline for a mode. + * @param {object} idx + * @param {'code'|'prose'|'records'|'extracted-prose'} mode + * @param {number[]|null} queryEmbedding + * @returns {Array} + */ + return function runSearch(idx, mode, queryEmbedding) { + const meta = idx.chunkMeta; + const sqliteEnabledForMode = useSqlite && (mode === 'code' || mode === 'prose'); + const filtersEnabled = typeof filtersActive === 'boolean' + ? filtersActive + : hasActiveFilters(filters); + + // Filtering + const filteredMeta = filtersEnabled + ? filterChunks(meta, filters, idx.filterIndex, idx.fileRelations) + : meta; + const allowedIdx = filtersEnabled ? 
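Two small invariants sit at this point in `runSearch`: ranking depth is expanded to three times the requested `topN` so later fusion and boosting still have material to reorder, and an active filter set that matches nothing aborts the mode before any ranking work. Numerically:

```js
const topN = 10;
const searchTopN = Math.max(1, Number(topN) || 1);
const expandedTopN = searchTopN * 3; // rank 30 deep, return 10 after fusion/boosts

const filteredMeta = [];             // filters matched no chunks
const allowedIdx = new Set(filteredMeta.map((c) => c.id));
// allowedIdx.size === 0 -> runSearch returns [] immediately for this mode
```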
new Set(filteredMeta.map((c) => c.id)) : null; + if (filtersEnabled && (!allowedIdx || allowedIdx.size === 0)) { + return []; + } + + const intersectCandidateSet = (candidateSet, allowedSet) => { + if (!allowedSet) return candidateSet; + if (!candidateSet) return allowedSet; + const filtered = new Set(); + for (const id of candidateSet) { + if (allowedSet.has(id)) filtered.add(id); + } + return filtered; + }; + + const searchTopN = Math.max(1, Number(topN) || 1); + const expandedTopN = searchTopN * 3; + + // Main search: BM25 token match (with optional SQLite FTS first pass) + let candidates = null; + let bmHits = []; + let sparseType = fieldWeightsEnabled ? 'bm25-fielded' : 'bm25'; + let sqliteFtsUsed = false; + const sqliteFtsAllowed = allowedIdx && allowedIdx.size ? allowedIdx : null; + const sqliteFtsCanPushdown = !!(sqliteFtsAllowed && sqliteFtsAllowed.size <= SQLITE_IN_LIMIT); + const sqliteFtsEligible = sqliteEnabledForMode + && sqliteFtsRequested + && (!filtersEnabled || sqliteFtsCanPushdown); + if (sqliteFtsEligible) { + bmHits = rankSqliteFts( + idx, + queryTokens, + mode, + expandedTopN, + sqliteFtsNormalize, + sqliteFtsCanPushdown ? sqliteFtsAllowed : null + ); + sqliteFtsUsed = bmHits.length > 0; + if (sqliteFtsUsed) { + sparseType = 'fts'; + candidates = new Set(bmHits.map((h) => h.idx)); + } + } + if (!bmHits.length) { + const tokenIndexOverride = sqliteEnabledForMode ? getTokenIndexForQuery(queryTokens, mode) : null; + candidates = buildCandidateSet(idx, queryTokens, mode); + bmHits = fieldWeightsEnabled + ? rankBM25Fields({ + idx, + tokens: queryTokens, + topN: expandedTopN, + fieldWeights, + allowedIdx, + k1: bm25K1, + b: bm25B + }) + : rankBM25({ + idx, + tokens: queryTokens, + topN: expandedTopN, + tokenIndexOverride, + allowedIdx, + k1: bm25K1, + b: bm25B + }); + sparseType = fieldWeightsEnabled ? 'bm25-fielded' : 'bm25'; + sqliteFtsUsed = false; + } + + // MinHash (embedding) ANN, if requested + let annHits = []; + let annSource = null; + const annCandidates = intersectCandidateSet(candidates, allowedIdx); + const annFallback = candidates && allowedIdx ? allowedIdx : null; + const annCandidatesEmpty = annCandidates && annCandidates.size === 0; + if (annEnabled) { + if (queryEmbedding && vectorAnnState?.[mode]?.available) { + if (!annCandidatesEmpty) { + annHits = rankVectorAnnSqlite(mode, queryEmbedding, expandedTopN, annCandidates); + } + if (!annHits.length && annFallback) { + annHits = rankVectorAnnSqlite(mode, queryEmbedding, expandedTopN, annFallback); + } + if (annHits.length) { + vectorAnnUsed[mode] = true; + annSource = 'sqlite-vector'; + } + } + if (!annHits.length && queryEmbedding && (idx.hnsw?.available || hnswAnnState?.[mode]?.available)) { + if (!annCandidatesEmpty) { + annHits = rankHnswIndex(idx.hnsw || {}, queryEmbedding, expandedTopN, annCandidates); + } + if (!annHits.length && annFallback) { + annHits = rankHnswIndex(idx.hnsw || {}, queryEmbedding, expandedTopN, annFallback); + } + if (annHits.length) { + if (hnswAnnUsed && mode in hnswAnnUsed) hnswAnnUsed[mode] = true; + annSource = 'hnsw'; + } + } + if (!annHits.length && queryEmbedding && idx.denseVec?.vectors?.length) { + if (!annCandidatesEmpty) { + annHits = rankDenseVectors(idx, queryEmbedding, expandedTopN, annCandidates); + } + if (!annHits.length && annFallback) { + annHits = rankDenseVectors(idx, queryEmbedding, expandedTopN, annFallback); + } + if (annHits.length) annSource = 'dense'; + } + if (!annHits.length) { + const minhashBase = candidates || (bmHits.length ? 
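The ANN stage above is a fallback cascade: SQLite vector ANN, then HNSW, then brute-force dense cosine, and finally MinHash (the branch continuing below) as a token-based stand-in when no embedding route produced hits; each backend tries the intersected candidate set first, then the filter allow-set. A runnable outline with stand-in rankers:

```js
const backends = [
  () => [],                      // sqlite vector ANN: unavailable in this sketch
  () => [],                      // hnsw: no index on disk
  () => [{ idx: 4, sim: 0.91 }], // brute-force dense cosine succeeds
  () => [{ idx: 9, sim: 0.35 }]  // minhash is never reached
];
let annHits = [];
for (const backend of backends) {
  annHits = backend();
  if (annHits.length) break;
}
// annHits -> [{ idx: 4, sim: 0.91 }]; annSource would be 'dense'
```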
new Set(bmHits.map((h) => h.idx)) : null); + const minhashCandidates = intersectCandidateSet(minhashBase, allowedIdx); + const minhashFallback = minhashBase && allowedIdx ? allowedIdx : null; + const minhashCandidatesEmpty = minhashCandidates && minhashCandidates.size === 0; + const minhashTotal = minhashCandidates ? minhashCandidates.size : (idx.minhash?.signatures?.length || 0); + const allowMinhash = minhashTotal > 0 && (!minhashLimit || minhashTotal <= minhashLimit); + if (allowMinhash && !minhashCandidatesEmpty) { + annHits = rankMinhash(idx, queryTokens, expandedTopN, minhashCandidates); + if (annHits.length) annSource = 'minhash'; + } + if (!annHits.length && allowMinhash && minhashFallback) { + annHits = rankMinhash(idx, queryTokens, expandedTopN, minhashFallback); + if (annHits.length) annSource = 'minhash'; + } + } + } + + const useRrf = rrfEnabled && !blendEnabled && bmHits.length && annHits.length; + const sparseRanks = new Map(); + const annRanks = new Map(); + if (useRrf) { + bmHits.forEach((hit, index) => sparseRanks.set(hit.idx, index + 1)); + annHits.forEach((hit, index) => annRanks.set(hit.idx, index + 1)); + } + + if (idx.loadChunkMetaByIds) { + const idsToLoad = new Set(); + bmHits.forEach((h) => idsToLoad.add(h.idx)); + annHits.forEach((h) => idsToLoad.add(h.idx)); + const missing = Array.from(idsToLoad).filter((id) => !meta[id]); + if (missing.length) idx.loadChunkMetaByIds(mode, missing, meta); + } + + // Combine and dedup + const allHits = new Map(); + const recordHit = (idxVal, update) => { + const current = allHits.get(idxVal) || { bm25: null, fts: null, ann: null, annSource: null }; + allHits.set(idxVal, { ...current, ...update }); + }; + bmHits.forEach((h) => { + recordHit(h.idx, sparseType === 'fts' ? { fts: h.score } : { bm25: h.score }); + }); + annHits.forEach((h) => { + recordHit(h.idx, { ann: h.sim, annSource }); + }); + + const sparseMaxScore = bmHits.length + ? Math.max(...bmHits.map((hit) => (hit.score ?? hit.sim ?? 0))) + : null; + const scored = [...allHits.entries()] + .filter(([idxVal]) => !allowedIdx || allowedIdx.has(idxVal)) + .map(([idxVal, scores]) => { + const sparseScore = scores.fts ?? scores.bm25 ?? null; + const annScore = scores.ann ?? null; + const sparseTypeValue = scores.fts != null + ? 'fts' + : (scores.bm25 != null ? (fieldWeightsEnabled ? 'bm25-fielded' : 'bm25') : null); + let scoreType = null; + let score = null; + let blendInfo = null; + if (useRrf) { + const sparseRank = sparseRanks.get(idxVal) ?? null; + const annRank = annRanks.get(idxVal) ?? null; + const sparseRrf = sparseRank ? 1 / (rrfK + sparseRank) : 0; + const annRrf = annRank ? 1 / (rrfK + annRank) : 0; + scoreType = 'rrf'; + score = sparseRrf + annRrf; + blendInfo = { + k: rrfK, + sparseRank, + annRank, + sparseRrf, + annRrf, + score + }; + } else if (blendEnabled && (sparseScore != null || annScore != null)) { + const sparseMax = sparseScore != null + ? Math.max(sparseScore, sparseMaxScore || 0) + : 0; + const normalizedSparse = sparseScore != null && sparseMax > 0 + ? sparseScore / sparseMax + : null; + const clippedAnn = annScore != null + ? Math.max(-1, Math.min(1, annScore)) + : null; + const normalizedAnn = clippedAnn != null ? (clippedAnn + 1) / 2 : null; + const activeSparseWeight = normalizedSparse != null ? blendSparseWeight : 0; + const activeAnnWeight = normalizedAnn != null ? blendAnnWeight : 0; + const weightSum = activeSparseWeight + activeAnnWeight; + const blended = weightSum > 0 + ? ((normalizedSparse ?? 0) * activeSparseWeight + (normalizedAnn ?? 
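+          // Worked example with hypothetical weights 0.6 sparse / 0.4 ann:
+          // sparse 8.0 against max 10.0 → 0.8; ann cosine 0.5 → (0.5 + 1) / 2 = 0.75;
+          // blended = (0.8 * 0.6 + 0.75 * 0.4) / 1.0 = 0.78.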
0) * activeAnnWeight) / weightSum + : 0; + scoreType = 'blend'; + score = blended; + blendInfo = { + score: blended, + sparseNormalized: normalizedSparse, + annNormalized: normalizedAnn, + sparseWeight: activeSparseWeight, + annWeight: activeAnnWeight + }; + } else if (sparseScore != null) { + scoreType = sparseTypeValue; + score = sparseScore; + } else if (annScore != null) { + scoreType = 'ann'; + score = annScore; + } else { + scoreType = 'none'; + score = 0; + } + const chunk = meta[idxVal]; + if (!chunk) return null; + const fileRelations = idx.fileRelations + ? (typeof idx.fileRelations.get === 'function' + ? idx.fileRelations.get(chunk.file) + : idx.fileRelations[chunk.file]) + : null; + const enrichedChunk = fileRelations + ? { + ...chunk, + imports: fileRelations.imports || chunk.imports, + exports: fileRelations.exports || chunk.exports, + usages: fileRelations.usages || chunk.usages, + importLinks: fileRelations.importLinks || chunk.importLinks + } + : chunk; + let phraseMatches = 0; + let phraseBoost = 0; + let phraseFactor = 0; + if (phraseNgramSet && phraseRange?.min && phraseRange?.max) { + const matchInfo = getPhraseMatchInfo(chunk, phraseNgramSet, phraseRange); + phraseMatches = matchInfo.matches; + if (phraseMatches) { + phraseFactor = Math.min(0.5, phraseMatches * 0.1); + phraseBoost = score * phraseFactor; + score += phraseBoost; + } + } + let symbolBoost = 0; + let symbolFactor = 1; + let symbolInfo = null; + if (symbolBoostEnabled) { + const isDefinition = isDefinitionKind(chunk.kind); + const isExported = isExportedChunk(enrichedChunk); + let factor = 1; + if (isDefinition) factor *= symbolBoostDefinitionWeight; + if (isExported) factor *= symbolBoostExportWeight; + symbolFactor = factor; + if (factor !== 1) { + symbolBoost = score * (factor - 1); + score *= factor; + } + symbolInfo = { + definition: isDefinition, + export: isExported, + factor: symbolFactor, + boost: symbolBoost + }; + } + const scoreBreakdown = { + sparse: sparseScore != null ? { + type: sparseTypeValue, + score: sparseScore, + normalized: scores.fts != null ? sqliteFtsNormalize : null, + weights: scores.fts != null ? sqliteFtsWeights : null, + profile: scores.fts != null ? sqliteFtsProfile : null, + fielded: fieldWeightsEnabled || false, + k1: scores.bm25 != null ? bm25K1 : null, + b: scores.bm25 != null ? bm25B : null, + ftsFallback: sqliteFtsRequested ? !sqliteFtsUsed : false + } : null, + ann: annScore != null ? { + score: annScore, + source: scores.annSource || null + } : null, + rrf: useRrf ? blendInfo : null, + phrase: phraseNgramSet ? { + matches: phraseMatches, + boost: phraseBoost, + factor: phraseFactor + } : null, + symbol: symbolInfo, + blend: blendEnabled && !useRrf ? 
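+          // RRF worked example (hypothetical rrfK = 60): sparseRank 1 and
+          // annRank 3 give 1/(60 + 1) + 1/(60 + 3) ≈ 0.0323.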
blendInfo : null, + selected: { + type: scoreType, + score + } + }; + return { + idx: idxVal, + score, + scoreType, + scoreBreakdown, + chunk: enrichedChunk, + sparseScore, + sparseType: sparseTypeValue, + annScore, + annSource: scores.annSource || null + }; + }) + .filter(Boolean) + .sort((a, b) => (b.score - a.score) || (a.idx - b.idx)) + .slice(0, searchTopN); + + const ranked = scored + .map((entry) => ({ + ...entry.chunk, + score: entry.score, + scoreType: entry.scoreType, + sparseScore: entry.sparseScore, + sparseType: entry.sparseType, + annScore: entry.annScore, + annSource: entry.annSource, + annType: entry.annSource, + scoreBreakdown: entry.scoreBreakdown + })) + .filter(Boolean); + + return ranked; + }; +} diff --git a/src/search/query-cache.js b/src/retrieval/query-cache.js similarity index 100% rename from src/search/query-cache.js rename to src/retrieval/query-cache.js diff --git a/src/retrieval/query-intent.js b/src/retrieval/query-intent.js new file mode 100644 index 000000000..516238842 --- /dev/null +++ b/src/retrieval/query-intent.js @@ -0,0 +1,84 @@ +const PATH_PATTERN = /(^|[\s"'`])(\.{1,2}[\\/]|[A-Za-z]:[\\/]|~[\\/]|\/)/; +const CODE_TOKEN_PATTERN = /[{}()[\];:<>.=]|=>|->|::|\+\+|--|\|\||&&/; +const CAMEL_PATTERN = /[a-z][A-Z]/; +const SNAKE_PATTERN = /_/; + +const DEFAULT_FIELD_WEIGHTS = { + code: { name: 2.0, signature: 1.5, doc: 1.2, comment: 0.6, body: 1.0 }, + prose: { name: 1.2, signature: 0.9, doc: 2.1, comment: 1.8, body: 1.7 }, + path: { name: 2.4, signature: 1.7, doc: 0.9, comment: 0.4, body: 0.7 }, + mixed: { name: 1.8, signature: 1.3, doc: 1.6, comment: 1.2, body: 1.2 } +}; + +const detectSignals = (query, tokens) => { + const normalized = query || ''; + const words = tokens.filter((token) => /^[a-z0-9_]+$/i.test(token)); + const symbolTokens = tokens.filter((token) => /[^a-z0-9_]/i.test(token)); + const hasPath = PATH_PATTERN.test(normalized) || /[\\/]/.test(normalized); + const hasCodePunctuation = CODE_TOKEN_PATTERN.test(normalized) + || symbolTokens.length > 0; + const hasCamel = CAMEL_PATTERN.test(normalized); + const hasSnake = SNAKE_PATTERN.test(normalized); + const wordCount = words.length; + return { + hasPath, + hasCodePunctuation, + hasCamel, + hasSnake, + wordCount, + symbolCount: symbolTokens.length + }; +}; + +export const classifyQuery = ({ query, tokens = [], phrases = [], filters = {} }) => { + const signals = detectSignals(query, tokens); + const scores = { code: 0, prose: 0, path: 0 }; + + if (signals.hasPath || filters?.file || filters?.path) scores.path += 3; + if (signals.hasCodePunctuation) scores.code += 2; + if (signals.hasCamel || signals.hasSnake) scores.code += 1; + if (signals.wordCount >= 3) scores.prose += 2; + if (phrases.length >= 2) scores.prose += 1; + if (signals.symbolCount >= 2) scores.code += 1; + + const sorted = Object.entries(scores).sort((a, b) => b[1] - a[1]); + const [topType, topScore] = sorted[0]; + const secondScore = sorted[1]?.[1] ?? 0; + let type = topScore === 0 ? 'mixed' : topType; + if (topScore >= 2 && (topScore - secondScore <= 1)) { + type = 'mixed'; + } + if (scores.path >= 3 && scores.path >= scores.code && scores.path >= scores.prose) { + type = 'path'; + } + const vectorMode = type === 'prose' ? 'doc' : (type === 'code' || type === 'path' ? 'code' : null); + + return { + type, + scores, + signals, + vectorMode, + reason: type === 'mixed' ? 
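+    // Illustrative outcomes, assuming straightforward tokenization:
+    // 'getUserById(' → code (punctuation + camelCase), 'how does caching work'
+    // → prose (3+ plain words), 'src/retrieval/query.js' → path (the path
+    // override wins even when code/prose signals pull toward mixed).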
'signals mixed or weak' : `dominant ${type} signals` + }; +}; + +export const resolveIntentVectorMode = (denseVectorMode, intent) => { + if (denseVectorMode !== 'auto') return denseVectorMode; + if (intent?.vectorMode) return intent.vectorMode; + return denseVectorMode; +}; + +export const resolveIntentFieldWeights = (fieldWeightsInput, intent) => { + if (fieldWeightsInput === false) return null; + const key = intent?.type && DEFAULT_FIELD_WEIGHTS[intent.type] + ? intent.type + : 'code'; + const resolved = { ...DEFAULT_FIELD_WEIGHTS[key] }; + if (fieldWeightsInput && typeof fieldWeightsInput === 'object') { + for (const [field, value] of Object.entries(resolved)) { + const override = Number(fieldWeightsInput[field]); + if (Number.isFinite(override)) resolved[field] = override; + } + } + return resolved; +}; diff --git a/src/search/query-parse.js b/src/retrieval/query-parse.js similarity index 100% rename from src/search/query-parse.js rename to src/retrieval/query-parse.js diff --git a/src/search/query.js b/src/retrieval/query.js similarity index 80% rename from src/search/query.js rename to src/retrieval/query.js index a8502a128..6fc216bdc 100644 --- a/src/search/query.js +++ b/src/retrieval/query.js @@ -1,4 +1,10 @@ -import { extractNgrams, splitId, splitWordsWithDict } from '../shared/tokenize.js'; +import { + extractNgrams, + extractPunctuationTokens, + splitId, + splitIdPreserveCase, + splitWordsWithDict +} from '../shared/tokenize.js'; /** * Parse churn arg into a numeric threshold. @@ -78,11 +84,13 @@ export function parseQueryInput(raw) { const normalizeToken = (value) => String(value || '').normalize('NFKD'); -const expandQueryToken = (raw, dict) => { +const expandQueryToken = (raw, dict, options) => { + const caseSensitive = options?.caseSensitive === true; const normalized = normalizeToken(raw); if (!normalized) return []; + if (caseSensitive) return [normalized]; if (normalized.length <= 3 || dict.has(normalized)) return [normalized]; - const expanded = splitWordsWithDict(normalized, dict); + const expanded = splitWordsWithDict(normalized, dict, options); return expanded.length ? expanded : [normalized]; }; @@ -92,13 +100,16 @@ const expandQueryToken = (raw, dict) => { * @param {Set} dict * @returns {string[]} */ -export function tokenizeQueryTerms(rawTerms, dict) { +export function tokenizeQueryTerms(rawTerms, dict, options) { + const caseSensitive = options?.caseSensitive === true; + const splitter = caseSensitive ? splitIdPreserveCase : splitId; const tokens = []; const entries = Array.isArray(rawTerms) ? rawTerms : (rawTerms ? [rawTerms] : []); for (const entry of entries) { - const parts = splitId(String(entry || '')).map(normalizeToken).filter(Boolean); + tokens.push(...extractPunctuationTokens(entry)); + const parts = splitter(String(entry || '')).map(normalizeToken).filter(Boolean); for (const part of parts) { - tokens.push(...expandQueryToken(part, dict)); + tokens.push(...expandQueryToken(part, dict, options)); } } return tokens.filter(Boolean); @@ -110,11 +121,14 @@ export function tokenizeQueryTerms(rawTerms, dict) { * @param {Set} dict * @returns {string[]} */ -export function tokenizePhrase(phrase, dict) { - const parts = splitId(String(phrase || '')).map(normalizeToken).filter(Boolean); +export function tokenizePhrase(phrase, dict, options) { + const caseSensitive = options?.caseSensitive === true; + const splitter = caseSensitive ? 
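+  // With caseSensitive set, splitIdPreserveCase is assumed to keep identifier
+  // casing intact, and expandQueryToken returns tokens unexpanded rather than
+  // consulting the dictionary.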
splitIdPreserveCase : splitId; + const parts = splitter(String(phrase || '')).map(normalizeToken).filter(Boolean); const tokens = []; + tokens.push(...extractPunctuationTokens(phrase)); for (const part of parts) { - tokens.push(...expandQueryToken(part, dict)); + tokens.push(...expandQueryToken(part, dict, options)); } return tokens.filter(Boolean); } diff --git a/src/search/rankers.js b/src/retrieval/rankers.js similarity index 58% rename from src/search/rankers.js rename to src/retrieval/rankers.js index f12be5947..7a11d0ffc 100644 --- a/src/search/rankers.js +++ b/src/retrieval/rankers.js @@ -1,15 +1,17 @@ -import { SimpleMinHash } from '../indexer/minhash.js'; +import { SimpleMinHash } from '../index/minhash.js'; /** * Legacy BM25-like scoring using chunk metadata fields directly. * @param {object} idx * @param {string[]} tokens * @param {number} topN + * @param {Set|null} [allowedIdx] * @returns {Array<{idx:number,score:number}>} */ -export function rankBM25Legacy(idx, tokens, topN) { +export function rankBM25Legacy(idx, tokens, topN, allowedIdx = null) { + if (allowedIdx && allowedIdx.size === 0) return []; const scores = new Map(); - const ids = idx.chunkMeta.map((_, i) => i); + const ids = allowedIdx ? Array.from(allowedIdx) : idx.chunkMeta.map((_, i) => i); ids.forEach((i) => { const chunk = idx.chunkMeta[i]; if (!chunk) return; @@ -56,13 +58,25 @@ export function getTokenIndex(idx) { * @param {string[]} params.tokens * @param {number} params.topN * @param {object|null} [params.tokenIndexOverride] + * @param {Set|null} [params.allowedIdx] * @param {number} [params.k1] * @param {number} [params.b] * @returns {Array<{idx:number,score:number}>} */ -export function rankBM25({ idx, tokens, topN, tokenIndexOverride = null, k1 = 1.2, b = 0.75 }) { +export function rankBM25({ + idx, + tokens, + topN, + tokenIndexOverride = null, + allowedIdx = null, + k1 = 1.2, + b = 0.75 +}) { const tokenIndex = tokenIndexOverride || getTokenIndex(idx); - if (!tokenIndex || !tokenIndex.vocab || !tokenIndex.postings) return rankBM25Legacy(idx, tokens, topN); + if (!tokenIndex || !tokenIndex.vocab || !tokenIndex.postings) { + return rankBM25Legacy(idx, tokens, topN, allowedIdx); + } + if (allowedIdx && allowedIdx.size === 0) return []; const scores = new Map(); const docLengths = tokenIndex.docLengths; @@ -81,6 +95,7 @@ export function rankBM25({ idx, tokens, topN, tokenIndexOverride = null, k1 = 1. const idf = Math.log(1 + (totalDocs - df + 0.5) / (df + 0.5)); for (const [docId, tf] of posting) { + if (allowedIdx && !allowedIdx.has(docId)) continue; const dl = docLengths[docId] || 0; const denom = tf + k1 * (1 - b + b * (dl / avgDocLen)); const score = idf * ((tf * (k1 + 1)) / denom) * qCount; @@ -99,6 +114,79 @@ export function rankBM25({ idx, tokens, topN, tokenIndexOverride = null, k1 = 1. .slice(0, topN); } +/** + * Rank documents using BM25 across fielded postings. 
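+ * Per-field BM25 scores are scaled by their weight and summed; e.g. with
+ * fieldWeights { name: 2, body: 1 } and equal per-field statistics, a hit in
+ * `name` contributes twice what the same hit would in `body`.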
+ * @param {object} params + * @param {object} params.idx + * @param {string[]} params.tokens + * @param {number} params.topN + * @param {object} params.fieldWeights + * @param {Set|null} [params.allowedIdx] + * @param {number} [params.k1] + * @param {number} [params.b] + * @returns {Array<{idx:number,score:number}>} + */ +export function rankBM25Fields({ + idx, + tokens, + topN, + fieldWeights, + allowedIdx = null, + k1 = 1.2, + b = 0.75 +}) { + const fields = idx.fieldPostings?.fields; + if (!fields || !fieldWeights || !tokens.length) { + return rankBM25({ idx, tokens, topN, k1, b, allowedIdx }); + } + if (allowedIdx && allowedIdx.size === 0) return []; + + const qtf = new Map(); + tokens.forEach((tok) => qtf.set(tok, (qtf.get(tok) || 0) + 1)); + + const scores = new Map(); + for (const [field, weight] of Object.entries(fieldWeights)) { + const fieldWeight = Number(weight); + if (!Number.isFinite(fieldWeight) || fieldWeight <= 0) continue; + const index = fields[field]; + if (!index || !index.vocab || !index.postings) continue; + if (!index.vocabIndex) { + index.vocabIndex = new Map(index.vocab.map((t, i) => [t, i])); + } + const docLengths = Array.isArray(index.docLengths) ? index.docLengths : []; + const avgDocLen = Number.isFinite(index.avgDocLen) ? index.avgDocLen : 1; + const totalDocs = Number.isFinite(index.totalDocs) ? index.totalDocs : docLengths.length; + if (!totalDocs) continue; + + for (const [tok, qCount] of qtf.entries()) { + const tokIdx = index.vocabIndex.get(tok); + if (tokIdx === undefined) continue; + const posting = index.postings[tokIdx] || []; + const df = posting.length; + if (!df) continue; + const idf = Math.log(1 + (totalDocs - df + 0.5) / (df + 0.5)); + + for (const [docId, tf] of posting) { + if (allowedIdx && !allowedIdx.has(docId)) continue; + const dl = docLengths[docId] || 0; + const denom = tf + k1 * (1 - b + b * (dl / avgDocLen)); + const score = idf * ((tf * (k1 + 1)) / denom) * qCount * fieldWeight; + scores.set(docId, (scores.get(docId) || 0) + score); + } + } + } + + const weighted = [...scores.entries()].map(([docId, score]) => { + const weight = idx.chunkMeta[docId]?.weight || 1; + return { idx: docId, score: score * weight }; + }); + + return weighted + .filter(({ score }) => score > 0) + .sort((a, b) => (b.score - a.score) || (a.idx - b.idx)) + .slice(0, topN); +} + function minhashSigForTokens(tokens) { const mh = new SimpleMinHash(); tokens.forEach((t) => mh.update(t)); @@ -118,14 +206,20 @@ function jaccard(sigA, sigB) { * @param {number} topN * @returns {Array<{idx:number,sim:number}>} */ -export function rankMinhash(idx, tokens, topN) { +export function rankMinhash(idx, tokens, topN, candidateSet = null) { if (!idx.minhash?.signatures?.length) return []; + if (!Array.isArray(tokens) || !tokens.length) return []; const qSig = minhashSigForTokens(tokens); - const scored = idx.minhash.signatures - .map((sig, i) => ({ idx: i, sim: jaccard(qSig, sig) })) + const ids = candidateSet ? 
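+  // candidateSet, when provided, restricts Jaccard scoring to prefiltered doc
+  // ids instead of scanning every stored signature.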
Array.from(candidateSet) : idx.minhash.signatures.map((_, i) => i); + const scored = []; + for (const id of ids) { + const sig = idx.minhash.signatures[id]; + if (!sig) continue; + scored.push({ idx: id, sim: jaccard(qSig, sig) }); + } + return scored .sort((a, b) => (b.sim - a.sim) || (a.idx - b.idx)) .slice(0, topN); - return scored; } /** @@ -140,10 +234,10 @@ export function rankDenseVectors(idx, queryEmbedding, topN, candidateSet) { const vectors = idx.denseVec?.vectors; if (!queryEmbedding || !Array.isArray(vectors) || !vectors.length) return []; const dims = idx.denseVec?.dims || queryEmbedding.length; - const levels = 256; const minVal = -1; - const maxVal = 1; - const scale = (maxVal - minVal) / (levels - 1); + const scale = Number.isFinite(idx.denseVec?.scale) + ? idx.denseVec.scale + : (2 / 255); const ids = candidateSet ? Array.from(candidateSet) : vectors.map((_, i) => i); const scored = []; diff --git a/src/retrieval/sqlite-cache.js b/src/retrieval/sqlite-cache.js new file mode 100644 index 000000000..2b0c62a21 --- /dev/null +++ b/src/retrieval/sqlite-cache.js @@ -0,0 +1,56 @@ +import fsSync from 'node:fs'; + +const fileSignature = (filePath) => { + try { + const stat = fsSync.statSync(filePath); + return `${stat.size}:${stat.mtimeMs}`; + } catch { + return null; + } +}; + +export function createSqliteDbCache() { + const entries = new Map(); + + const get = (dbPath) => { + const entry = entries.get(dbPath); + if (!entry) return null; + const signature = fileSignature(dbPath); + if (!signature || signature !== entry.signature) { + try { + entry.db?.close?.(); + } catch {} + entries.delete(dbPath); + return null; + } + return entry.db || null; + }; + + const set = (dbPath, db) => { + const signature = fileSignature(dbPath); + entries.set(dbPath, { db, signature }); + }; + + const close = (dbPath) => { + const entry = entries.get(dbPath); + if (!entry) return; + try { + entry.db?.close?.(); + } catch {} + entries.delete(dbPath); + }; + + const closeAll = () => { + for (const dbPath of entries.keys()) { + close(dbPath); + } + }; + + return { + get, + set, + close, + closeAll, + size: () => entries.size + }; +} diff --git a/src/search/sqlite-helpers.js b/src/retrieval/sqlite-helpers.js similarity index 60% rename from src/search/sqlite-helpers.js rename to src/retrieval/sqlite-helpers.js index 7c87b2848..6bb673abd 100644 --- a/src/search/sqlite-helpers.js +++ b/src/retrieval/sqlite-helpers.js @@ -1,8 +1,10 @@ import { extractNgrams, tri } from '../shared/tokenize.js'; import { parseArrayField, parseJson } from './query-cache.js'; import { buildFtsBm25Expr } from './fts.js'; +import { buildFilterIndex } from './filter-index.js'; const SQLITE_IN_LIMIT = 900; +const FTS_TOKEN_SAFE = /^[\p{L}\p{N}_]+$/u; /** * Create SQLite helper functions for search. @@ -24,8 +26,12 @@ export function createSqliteHelpers(options) { vectorExtension, vectorAnnState, queryVectorAnn, - modelIdDefault + modelIdDefault, + fileChargramN } = options; + const chargramMaxTokenLength = postingsConfig?.chargramMaxTokenLength == null + ? null + : Math.max(2, Math.floor(Number(postingsConfig.chargramMaxTokenLength))); const sqliteCache = { tokenStats: new Map(), @@ -43,76 +49,134 @@ export function createSqliteHelpers(options) { return Array.from(view); } + /** + * Map a chunk row into the in-memory metadata shape. + * @param {object} row + * @returns {object} + */ + function mapChunkRow(row) { + const start = Number.isFinite(row.start) ? row.start : null; + const end = Number.isFinite(row.end) ? 
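+    // Char offsets and line numbers normalize to null when the row stores
+    // non-finite values, so callers can tell "unknown" apart from 0.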
row.end : null; + const startLine = Number.isFinite(row.startLine) ? row.startLine : null; + const endLine = Number.isFinite(row.endLine) ? row.endLine : null; + const metaV2 = row.chunk_id + ? { + chunkId: row.chunk_id, + file: row.file || null, + segment: null, + range: { + start, + end, + startLine, + endLine + }, + lang: null, + ext: row.ext || null, + kind: row.kind || null, + name: row.name || null + } + : null; + return { + id: row.id, + file: row.file, + start, + end, + startLine, + endLine, + ext: row.ext, + kind: row.kind, + name: row.name, + metaV2, + weight: typeof row.weight === 'number' ? row.weight : 1, + headline: row.headline, + preContext: parseJson(row.preContext, []), + postContext: parseJson(row.postContext, []), + tokens: parseArrayField(row.tokens), + ngrams: parseJson(row.ngrams, []), + codeRelations: parseJson(row.codeRelations, null), + docmeta: parseJson(row.docmeta, null), + stats: parseJson(row.stats, null), + complexity: parseJson(row.complexity, null), + lint: parseJson(row.lint, null), + externalDocs: parseJson(row.externalDocs, null), + last_modified: row.last_modified, + last_author: row.last_author, + churn: row.churn, + chunk_authors: parseJson(row.chunk_authors, null) + }; + } + + /** + * Fill an array of chunk metadata with rows. + * @param {Array} rows + * @param {Array} target + */ + function hydrateChunkMeta(rows, target) { + for (const row of rows) { + target[row.id] = mapChunkRow(row); + } + } + /** * Load index artifacts from SQLite into in-memory structures. * @param {'code'|'prose'} mode * @returns {object} */ - function loadIndexFromSqlite(mode) { + function loadIndexFromSqlite(mode, options = {}) { const db = getDb(mode); if (!db) throw new Error('SQLite backend requested but database is not available.'); - const chunkRows = db.prepare('SELECT * FROM chunks WHERE mode = ? ORDER BY id').all(mode); + const includeMinhash = options.includeMinhash !== false; + const includeDense = options.includeDense !== false; + const includeChunks = options.includeChunks !== false; + const includeFilterIndex = options.includeFilterIndex !== false; let maxLocalId = -1; - for (const row of chunkRows) { - if (row.id > maxLocalId) maxLocalId = row.id; - } - - const chunkMeta = maxLocalId >= 0 ? Array.from({ length: maxLocalId + 1 }) : []; - for (const row of chunkRows) { - chunkMeta[row.id] = { - id: row.id, - file: row.file, - start: row.start, - end: row.end, - startLine: row.startLine, - endLine: row.endLine, - ext: row.ext, - kind: row.kind, - name: row.name, - weight: typeof row.weight === 'number' ? row.weight : 1, - headline: row.headline, - preContext: parseJson(row.preContext, []), - postContext: parseJson(row.postContext, []), - tokens: parseArrayField(row.tokens), - ngrams: parseJson(row.ngrams, []), - codeRelations: parseJson(row.codeRelations, null), - docmeta: parseJson(row.docmeta, null), - stats: parseJson(row.stats, null), - complexity: parseJson(row.complexity, null), - lint: parseJson(row.lint, null), - externalDocs: parseJson(row.externalDocs, null), - last_modified: row.last_modified, - last_author: row.last_author, - churn: row.churn, - chunk_authors: parseJson(row.chunk_authors, null) - }; + let chunkMeta = []; + if (includeChunks) { + const chunkRows = db.prepare('SELECT * FROM chunks WHERE mode = ? ORDER BY id').all(mode); + for (const row of chunkRows) { + if (row.id > maxLocalId) maxLocalId = row.id; + } + chunkMeta = maxLocalId >= 0 ? 
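+      // chunkMeta is a sparse array indexed by chunk id, sized to the highest
+      // id seen so hydrateChunkMeta() can assign rows at their ids.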
Array.from({ length: maxLocalId + 1 }) : []; + hydrateChunkMeta(chunkRows, chunkMeta); + } else { + const maxRow = db.prepare('SELECT MAX(id) as maxId FROM chunks WHERE mode = ?').get(mode); + maxLocalId = Number.isFinite(maxRow?.maxId) ? maxRow.maxId : -1; + chunkMeta = maxLocalId >= 0 ? Array.from({ length: maxLocalId + 1 }) : []; } - const signatures = Array.from({ length: chunkMeta.length }); - const sigStmt = db.prepare('SELECT doc_id, sig FROM minhash_signatures WHERE mode = ? ORDER BY doc_id'); - for (const row of sigStmt.iterate(mode)) { - signatures[row.doc_id] = unpackUint32(row.sig); + let minhash = null; + if (includeMinhash) { + const signatures = Array.from({ length: chunkMeta.length }); + const sigStmt = db.prepare('SELECT doc_id, sig FROM minhash_signatures WHERE mode = ? ORDER BY doc_id'); + for (const row of sigStmt.iterate(mode)) { + signatures[row.doc_id] = unpackUint32(row.sig); + } + minhash = signatures.length ? { signatures } : null; } - const minhash = signatures.length ? { signatures } : null; - const denseMeta = db.prepare('SELECT dims, scale, model FROM dense_meta WHERE mode = ?').get(mode) || {}; - const vectors = Array.from({ length: chunkMeta.length }); - const denseStmt = db.prepare('SELECT doc_id, vector FROM dense_vectors WHERE mode = ? ORDER BY doc_id'); - for (const row of denseStmt.iterate(mode)) { - vectors[row.doc_id] = row.vector; + let denseVec = null; + if (includeDense) { + const denseMeta = db.prepare('SELECT dims, scale, model FROM dense_meta WHERE mode = ?').get(mode) || {}; + const vectors = Array.from({ length: chunkMeta.length }); + const denseStmt = db.prepare('SELECT doc_id, vector FROM dense_vectors WHERE mode = ? ORDER BY doc_id'); + for (const row of denseStmt.iterate(mode)) { + vectors[row.doc_id] = row.vector; + } + const fallbackVec = vectors.find((vec) => vec && vec.length); + denseVec = vectors.length ? { + model: denseMeta.model || modelIdDefault, + dims: denseMeta.dims || (fallbackVec ? fallbackVec.length : 0), + scale: typeof denseMeta.scale === 'number' ? denseMeta.scale : 1.0, + vectors + } : null; } - const fallbackVec = vectors.find((vec) => vec && vec.length); - const denseVec = vectors.length ? { - model: denseMeta.model || modelIdDefault, - dims: denseMeta.dims || (fallbackVec ? fallbackVec.length : 0), - scale: typeof denseMeta.scale === 'number' ? denseMeta.scale : 1.0, - vectors - } : null; return { chunkMeta, denseVec, - minhash + minhash, + filterIndex: includeFilterIndex ? buildFilterIndex(chunkMeta, { fileChargramN }) : null, + loadChunkMetaByIds }; } @@ -130,6 +194,30 @@ export function createSqliteHelpers(options) { return chunks; } + /** + * Load chunk metadata rows for a list of ids. + * @param {'code'|'prose'} mode + * @param {number[]} ids + * @param {Array|null} target + * @returns {Array} + */ + function loadChunkMetaByIds(mode, ids, target = null) { + const db = getDb(mode); + if (!db || !ids || !ids.length) return target || []; + const unique = Array.from(new Set(ids.filter((id) => Number.isFinite(id)))); + if (!unique.length) return target || []; + const out = target || []; + for (const chunk of chunkArray(unique)) { + const placeholders = chunk.map(() => '?').join(','); + const stmt = db.prepare( + `SELECT * FROM chunks WHERE mode = ? AND id IN (${placeholders})` + ); + const rows = stmt.all(mode, ...chunk); + hydrateChunkMeta(rows, out); + } + return out; + } + /** * Fetch vocabulary rows for a list of values. 
* @param {'code'|'prose'} mode @@ -312,6 +400,7 @@ export function createSqliteHelpers(options) { if (postingsConfig.enableChargrams !== false) { const gramSet = new Set(); for (const token of tokens) { + if (chargramMaxTokenLength && token.length > chargramMaxTokenLength) continue; for (let n = postingsConfig.chargramMinN; n <= postingsConfig.chargramMaxN; n++) { for (const gram of tri(token, n)) { gramSet.add(gram); @@ -340,16 +429,33 @@ export function createSqliteHelpers(options) { * @param {'code'|'prose'} mode * @param {number} topN * @param {boolean} [normalizeScores] + * @param {Set|null} [allowedIds] * @returns {Array<{idx:number,score:number}>} */ - function rankSqliteFts(idx, queryTokens, mode, topN, normalizeScores = false) { + function rankSqliteFts(idx, queryTokens, mode, topN, normalizeScores = false, allowedIds = null) { const db = getDb(mode); if (!db || !queryTokens.length) return []; - const ftsQuery = queryTokens.join(' '); + if (allowedIds && allowedIds.size === 0) return []; + const ftsTokens = queryTokens.filter((token) => FTS_TOKEN_SAFE.test(token)); + if (!ftsTokens.length) return []; + const ftsQuery = ftsTokens.join(' '); const bm25Expr = buildFtsBm25Expr(sqliteFtsWeights); + const allowedList = allowedIds && allowedIds.size ? Array.from(allowedIds) : null; + const canPushdown = !!(allowedList && allowedList.length <= SQLITE_IN_LIMIT); + const allowedClause = canPushdown + ? ` AND chunks_fts.rowid IN (${allowedList.map(() => '?').join(',')})` + : ''; + const params = canPushdown + ? [ftsQuery, mode, ...allowedList, topN] + : [ftsQuery, mode, topN]; const rows = db.prepare( - `SELECT rowid AS id, ${bm25Expr} AS score FROM chunks_fts WHERE chunks_fts MATCH ? AND mode = ? ORDER BY score ASC, rowid ASC LIMIT ?` - ).all(ftsQuery, mode, topN); + `SELECT chunks_fts.rowid AS id, ${bm25Expr} AS score, chunks.weight AS weight + FROM chunks_fts + JOIN chunks ON chunks.id = chunks_fts.rowid + WHERE chunks_fts MATCH ? AND chunks.mode = ? + ${allowedClause} + ORDER BY score ASC, chunks_fts.rowid ASC LIMIT ?` + ).all(...params); const rawScores = rows.map((row) => -row.score); let min = 0; let max = 0; @@ -360,15 +466,23 @@ export function createSqliteHelpers(options) { const hits = []; for (let i = 0; i < rows.length; i++) { const row = rows[i]; - if (row.id < 0 || row.id >= idx.chunkMeta.length) continue; - const weight = idx.chunkMeta[row.id]?.weight || 1; + if (row.id == null || row.id < 0) continue; + const weight = typeof row.weight === 'number' + ? row.weight + : (idx.chunkMeta?.[row.id]?.weight || 1); const raw = rawScores[i]; const normalized = normalizeScores ? (max > min ? 
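+      // FTS5 bm25() is lower-is-better; scores were negated into rawScores
+      // above, so min-max scaling maps them onto 0..1 with the best hit at 1.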
(raw - min) / (max - min) : 1) : raw; hits.push({ idx: row.id, score: normalized * weight }); } - return hits; + let filteredHits = hits; + if (allowedIds && allowedIds.size && !canPushdown) { + filteredHits = filteredHits.filter((hit) => allowedIds.has(hit.idx)); + } + return filteredHits + .sort((a, b) => (b.score - a.score) || (a.idx - b.idx)) + .slice(0, topN); } /** diff --git a/src/search/cli.js b/src/search/cli.js deleted file mode 100644 index 5046970b8..000000000 --- a/src/search/cli.js +++ /dev/null @@ -1,976 +0,0 @@ -/** - * Ultra-Complete Search Utility for Rich Semantic Index (Pretty Output) - * By: ChatGPT & Nick, 2025 - * [--calls function] Filter for call relationships (calls to/from function) - * [--uses ident] Filter for usage of identifier - */ - -import fs from 'node:fs/promises'; -import fsSync from 'node:fs'; -import path from 'node:path'; -import crypto from 'node:crypto'; -import minimist from 'minimist'; -import { DEFAULT_MODEL_ID, getDictionaryPaths, getDictConfig, getIndexDir, getMetricsDir, getModelConfig, loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from '../../tools/dict-utils.js'; -import { getVectorExtensionConfig, hasVectorTable, loadVectorExtension, queryVectorAnn, resolveVectorExtensionPath } from '../../tools/vector-extension.js'; -import { resolveFtsWeights } from './fts.js'; -import { getQueryEmbedding } from './embedding.js'; -import { loadQueryCache, parseJson, pruneQueryCache } from './query-cache.js'; -import { normalizeExtFilter, parseMetaFilters } from './filters.js'; -import { formatFullChunk, formatShortChunk } from './output.js'; -import { parseChurnArg, parseModifiedArgs, parseQueryInput, tokenizePhrase, tokenizeQueryTerms, buildPhraseNgrams } from './query-parse.js'; -import { normalizePostingsConfig } from '../shared/postings-config.js'; -import { createSqliteHelpers } from './sqlite-helpers.js'; -import { createSearchPipeline } from './pipeline.js'; - -const argv = minimist(process.argv.slice(2), { - boolean: ['json', 'json-compact', 'human', 'stats', 'ann', 'headline', 'lint', 'matched', 'async', 'generator', 'returns'], - alias: { n: 'top', c: 'context', t: 'type' }, - default: { n: 5, context: 3 }, - string: [ - 'calls', - 'uses', - 'signature', - 'param', - 'decorator', - 'inferred-type', - 'return-type', - 'throws', - 'reads', - 'writes', - 'mutates', - 'churn', - 'alias', - 'awaits', - 'branches', - 'loops', - 'breaks', - 'continues', - 'risk', - 'risk-tag', - 'risk-source', - 'risk-sink', - 'risk-category', - 'risk-flow', - 'meta', - 'meta-json', - 'file', - 'ext', - 'chunk-author', - 'modified-after', - 'modified-since', - 'visibility', - 'extends', - 'mode', - 'backend', - 'path', - 'model', - 'repo', - 'fts-profile', - 'fts-weights', - 'bm25-k1', - 'bm25-b' - ], -}); -const t0 = Date.now(); -const rootArg = argv.repo ? path.resolve(argv.repo) : null; -const ROOT = rootArg || resolveRepoRoot(process.cwd()); -const userConfig = loadUserConfig(ROOT); -const modelConfig = getModelConfig(ROOT, userConfig); -const modelIdDefault = argv.model || modelConfig.id || DEFAULT_MODEL_ID; -const sqliteConfig = userConfig.sqlite || {}; -const postingsConfig = normalizePostingsConfig(userConfig.indexing?.postings || {}); -const vectorExtension = getVectorExtensionConfig(ROOT, userConfig); -const bm25Config = userConfig.search?.bm25 || {}; -const bm25K1 = Number.isFinite(Number(argv['bm25-k1'])) - ? Number(argv['bm25-k1']) - : (Number.isFinite(Number(bm25Config.k1)) ? 
Number(bm25Config.k1) : 1.2); -const bm25B = Number.isFinite(Number(argv['bm25-b'])) - ? Number(argv['bm25-b']) - : (Number.isFinite(Number(bm25Config.b)) ? Number(bm25Config.b) : 0.75); -const sqliteFtsNormalize = userConfig.search?.sqliteFtsNormalize === true; -const sqliteFtsProfile = (argv['fts-profile'] || process.env.PAIROFCLEATS_FTS_PROFILE || userConfig.search?.sqliteFtsProfile || 'balanced').toLowerCase(); -let sqliteFtsWeightsConfig = userConfig.search?.sqliteFtsWeights || null; -if (argv['fts-weights']) { - const parsed = parseJson(argv['fts-weights'], null); - if (parsed) { - sqliteFtsWeightsConfig = parsed; - } else { - const values = String(argv['fts-weights']) - .split(/[,\s]+/) - .filter(Boolean) - .map((val) => Number(val)) - .filter((val) => Number.isFinite(val)); - sqliteFtsWeightsConfig = values.length ? values : sqliteFtsWeightsConfig; - } -} -const metricsDir = getMetricsDir(ROOT, userConfig); -const useStubEmbeddings = process.env.PAIROFCLEATS_EMBEDDINGS === 'stub'; -const rawArgs = process.argv.slice(2); -const query = argv._.join(' ').trim(); -if (!query) { - console.error('usage: search "query" [--repo path|--json|--json-compact|--human|--stats|--no-ann|--context N|--type T|--backend memory|sqlite|sqlite-fts|...]|--mode code|prose|both|records|all|--meta key=value|--meta-json {...}|--path path|--file path|--ext .ext|--churn [min]|--modified-after date|--modified-since days|--chunk-author name|--signature|--param|--decorator|--inferred-type|--return-type|--throws|--reads|--writes|--mutates|--alias|--awaits|--branches|--loops|--breaks|--continues|--risk|--risk-tag|--risk-source|--risk-sink|--risk-category|--risk-flow|--extends|--visibility|--async|--generator|--returns'); - process.exit(1); -} -const contextLines = Math.max(0, parseInt(argv.context, 10) || 0); -const searchType = argv.type || null; -const searchAuthor = argv.author || null; -const searchImport = argv.import || null; -const chunkAuthorFilter = argv['chunk-author'] || null; -const searchMode = String(argv.mode || 'both').toLowerCase(); -const allowedModes = new Set(['code', 'prose', 'both', 'records', 'all']); -if (!allowedModes.has(searchMode)) { - console.error(`Invalid --mode ${searchMode}. Use code|prose|both|records|all.`); - process.exit(1); -} -const runCode = searchMode === 'code' || searchMode === 'both' || searchMode === 'all'; -const runProse = searchMode === 'prose' || searchMode === 'both' || searchMode === 'all'; -const runRecords = searchMode === 'records' || searchMode === 'all'; -const branchesMin = Number.isFinite(Number(argv.branches)) ? Number(argv.branches) : null; -const loopsMin = Number.isFinite(Number(argv.loops)) ? Number(argv.loops) : null; -const breaksMin = Number.isFinite(Number(argv.breaks)) ? Number(argv.breaks) : null; -const continuesMin = Number.isFinite(Number(argv.continues)) ? Number(argv.continues) : null; -let churnMin = null; -try { - churnMin = parseChurnArg(argv.churn); -} catch (err) { - console.error(err.message); - process.exit(1); -} -let modifiedArgs; -try { - modifiedArgs = parseModifiedArgs(argv['modified-after'], argv['modified-since']); -} catch (err) { - console.error(err.message); - process.exit(1); -} -const modifiedAfter = modifiedArgs.modifiedAfter; -const modifiedSinceDays = modifiedArgs.modifiedSinceDays; -const fileFilters = []; -if (argv.path) fileFilters.push(argv.path); -if (argv.file) fileFilters.push(argv.file); -const fileFilter = fileFilters.length ? 
fileFilters.flat() : null; -const extFilter = normalizeExtFilter(argv.ext); -const metaFilters = parseMetaFilters(argv.meta, argv['meta-json']); -const sqlitePaths = resolveSqlitePaths(ROOT, userConfig); -const sqliteCodePath = sqlitePaths.codePath; -const sqliteProsePath = sqlitePaths.prosePath; -const needsCode = runCode; -const needsProse = runProse; -const backendArg = typeof argv.backend === 'string' ? argv.backend.toLowerCase() : ''; -const sqliteScoreModeConfig = sqliteConfig.scoreMode === 'fts'; -const sqliteFtsRequested = backendArg === 'sqlite-fts' || backendArg === 'fts' || (!backendArg && sqliteScoreModeConfig); -const backendForcedSqlite = backendArg === 'sqlite' || sqliteFtsRequested; -const backendDisabled = backendArg && !(backendArg === 'sqlite' || sqliteFtsRequested); -const sqliteConfigured = sqliteConfig.use === true; -const sqliteCodeAvailable = fsSync.existsSync(sqliteCodePath); -const sqliteProseAvailable = fsSync.existsSync(sqliteProsePath); -const sqliteAvailable = (!needsCode || sqliteCodeAvailable) && (!needsProse || sqliteProseAvailable); -const annFlagPresent = rawArgs.includes('--ann') || rawArgs.includes('--no-ann'); -const annDefault = userConfig.search?.annDefault !== false; -const annEnabled = annFlagPresent ? argv.ann : annDefault; -const vectorAnnEnabled = annEnabled && vectorExtension.enabled; -const queryCacheConfig = userConfig.search?.queryCache || {}; -const queryCacheEnabled = queryCacheConfig.enabled === true; -const queryCacheMaxEntries = Number.isFinite(Number(queryCacheConfig.maxEntries)) - ? Math.max(1, Number(queryCacheConfig.maxEntries)) - : 200; -const queryCacheTtlMs = Number.isFinite(Number(queryCacheConfig.ttlMs)) - ? Math.max(0, Number(queryCacheConfig.ttlMs)) - : 0; -const queryCachePath = path.join(metricsDir, 'queryCache.json'); -const jsonCompact = argv['json-compact'] === true; -const jsonOutput = argv.json || jsonCompact; - -const sqliteFtsWeights = resolveFtsWeights(sqliteFtsProfile, sqliteFtsWeightsConfig); - -if (backendForcedSqlite && !sqliteAvailable) { - const missing = []; - if (needsCode && !sqliteCodeAvailable) missing.push(`code=${sqliteCodePath}`); - if (needsProse && !sqliteProseAvailable) missing.push(`prose=${sqliteProsePath}`); - const suffix = missing.length ? missing.join(', ') : 'missing sqlite index'; - console.error(`SQLite backend requested but index not found (${suffix}).`); - process.exit(1); -} - -const needsSqlite = runCode || runProse; -if (!needsSqlite && backendForcedSqlite) { - console.warn('SQLite backend requested, but records-only mode selected; using file-backed records index.'); -} -let useSqlite = needsSqlite && (backendForcedSqlite || (!backendDisabled && sqliteConfigured)) && sqliteAvailable; -let dbCode = null; -let dbProse = null; -const vectorAnnState = { - code: { available: false }, - prose: { available: false }, - records: { available: false } -}; -const vectorAnnUsed = { code: false, prose: false, records: false }; -let vectorAnnWarned = false; -if (useSqlite) { - let Database; - try { - ({ default: Database } = await import('better-sqlite3')); - } catch (err) { - console.error('better-sqlite3 is required for the SQLite backend. Run npm install first.'); - process.exit(1); - } - - const requiredTables = sqliteFtsRequested - ? 
[ - 'chunks', - 'chunks_fts', - 'minhash_signatures', - 'dense_vectors', - 'dense_meta' - ] - : [ - 'chunks', - 'token_vocab', - 'token_postings', - 'doc_lengths', - 'token_stats', - 'phrase_vocab', - 'phrase_postings', - 'chargram_vocab', - 'chargram_postings', - 'minhash_signatures', - 'dense_vectors', - 'dense_meta' - ]; - - const openSqlite = (dbPath, label) => { - const db = new Database(dbPath, { readonly: true }); - const tableRows = db.prepare("SELECT name FROM sqlite_master WHERE type='table'").all(); - const tableNames = new Set(tableRows.map((row) => row.name)); - const missing = requiredTables.filter((name) => !tableNames.has(name)); - if (missing.length) { - const message = `SQLite index ${label} is missing required tables (${missing.join(', ')}). Rebuild with npm run build-sqlite-index.`; - if (backendForcedSqlite) { - console.error(message); - process.exit(1); - } - console.warn(`${message} Falling back to file-backed indexes.`); - db.close(); - return null; - } - return db; - }; - - const initVectorAnn = (db, mode) => { - if (!vectorAnnEnabled || !db) return; - const loadResult = loadVectorExtension(db, vectorExtension, `sqlite ${mode}`); - if (!loadResult.ok) { - if (!vectorAnnWarned) { - const extPath = resolveVectorExtensionPath(vectorExtension); - console.warn(`[ann] SQLite vector extension unavailable (${loadResult.reason}).`); - console.warn(`[ann] Expected extension at ${extPath || 'unset'}; falling back to JS ANN.`); - vectorAnnWarned = true; - } - return; - } - if (!hasVectorTable(db, vectorExtension.table)) { - if (!vectorAnnWarned) { - console.warn(`[ann] SQLite vector table missing (${vectorExtension.table}). Rebuild with npm run build-sqlite-index.`); - vectorAnnWarned = true; - } - return; - } - vectorAnnState[mode].available = true; - }; - - if (needsCode) dbCode = openSqlite(sqliteCodePath, 'code'); - if (needsProse) dbProse = openSqlite(sqliteProsePath, 'prose'); - if (needsCode) initVectorAnn(dbCode, 'code'); - if (needsProse) initVectorAnn(dbProse, 'prose'); - if ((needsCode && !dbCode) || (needsProse && !dbProse)) { - if (dbCode) dbCode.close(); - if (dbProse) dbProse.close(); - dbCode = null; - dbProse = null; - useSqlite = false; - } -} - -const backendLabel = useSqlite - ? (sqliteFtsRequested ? 'sqlite-fts' : 'sqlite') - : 'memory'; -let modelIdForCode = null; -let modelIdForProse = null; -let modelIdForRecords = null; - -/** - * Return the active SQLite connection for a mode. 
- * @param {'code'|'prose'} mode - * @returns {import('better-sqlite3').Database|null} - */ -function getSqliteDb(mode) { - if (!useSqlite) return null; - if (mode === 'code') return dbCode; - if (mode === 'prose') return dbProse; - return null; -} - -const sqliteHelpers = createSqliteHelpers({ - getDb: getSqliteDb, - postingsConfig, - sqliteFtsWeights, - vectorExtension, - vectorAnnState, - queryVectorAnn, - modelIdDefault -}); -const { - loadIndexFromSqlite, - buildCandidateSetSqlite, - getTokenIndexForQuery, - rankSqliteFts, - rankVectorAnnSqlite -} = sqliteHelpers; - - -const dictConfig = getDictConfig(ROOT, userConfig); -const dictionaryPaths = await getDictionaryPaths(ROOT, dictConfig); -const dict = new Set(); -for (const dictFile of dictionaryPaths) { - try { - const contents = fsSync.readFileSync(dictFile, 'utf8'); - contents - .split(/\r?\n/) - .map((w) => w.trim().toLowerCase()) - .filter(Boolean) - .forEach((w) => dict.add(w)); - } catch {} -} - -const color = { - green: (t) => `\x1b[32m${t}\x1b[0m`, - yellow: (t) => `\x1b[33m${t}\x1b[0m`, - red: (t) => `\x1b[31m${t}\x1b[0m`, - cyan: (t) => `\x1b[36m${t}\x1b[0m`, - magenta: (t) => `\x1b[35m${t}\x1b[0m`, - blue: (t) => `\x1b[34m${t}\x1b[0m`, - gray: (t) => `\x1b[90m${t}\x1b[0m`, - bold: (t) => `\x1b[1m${t}\x1b[0m`, - underline: (t) => `\x1b[4m${t}\x1b[0m` -}; - -// --- LOAD INDEX --- -/** - * Load file-backed index artifacts from a directory. - * @param {string} dir - * @returns {object} - */ -function loadIndex(dir) { - const readJson = (name) => JSON.parse(fsSync.readFileSync(path.join(dir, name), 'utf8')); - const loadOptional = (name) => { - try { - return readJson(name); - } catch { - return null; - } - }; - const chunkMeta = readJson('chunk_meta.json'); - const denseVec = loadOptional('dense_vectors_uint8.json'); - if (denseVec && !denseVec.model) denseVec.model = modelIdDefault; - const idx = { - chunkMeta, - denseVec, - minhash: loadOptional('minhash_signatures.json'), - phraseNgrams: loadOptional('phrase_ngrams.json'), - chargrams: loadOptional('chargram_postings.json') - }; - try { - idx.tokenIndex = readJson('token_postings.json'); - } catch {} - return idx; -} -/** - * Resolve the index directory (cache-first, local fallback). - * @param {'code'|'prose'|'records'} mode - * @returns {string} - */ -function resolveIndexDir(mode) { - const cached = getIndexDir(ROOT, mode, userConfig); - const cachedMeta = path.join(cached, 'chunk_meta.json'); - if (fsSync.existsSync(cachedMeta)) return cached; - const local = path.join(ROOT, `index-${mode}`); - const localMeta = path.join(local, 'chunk_meta.json'); - if (fsSync.existsSync(localMeta)) return local; - return cached; -} - -/** - * Build a size/mtime signature for a file. - * @param {string} filePath - * @returns {string|null} - */ -function fileSignature(filePath) { - try { - const stat = fsSync.statSync(filePath); - return `${stat.size}:${stat.mtimeMs}`; - } catch { - return null; - } -} - -/** - * Build a signature payload for cache invalidation. - * @returns {object} - */ -function getIndexSignature() { - if (useSqlite) { - const recordDir = runRecords ? resolveIndexDir('records') : null; - const recordMeta = recordDir ? path.join(recordDir, 'chunk_meta.json') : null; - const recordDense = recordDir ? path.join(recordDir, 'dense_vectors_uint8.json') : null; - return { - backend: backendLabel, - code: fileSignature(sqliteCodePath), - prose: fileSignature(sqliteProsePath), - records: recordMeta ? fileSignature(recordMeta) : null, - recordsDense: recordDense ? 
fileSignature(recordDense) : null - }; - } - const codeDir = resolveIndexDir('code'); - const proseDir = resolveIndexDir('prose'); - const codeMeta = path.join(codeDir, 'chunk_meta.json'); - const proseMeta = path.join(proseDir, 'chunk_meta.json'); - const codeDense = path.join(codeDir, 'dense_vectors_uint8.json'); - const proseDense = path.join(proseDir, 'dense_vectors_uint8.json'); - const recordDir = runRecords ? resolveIndexDir('records') : null; - const recordMeta = recordDir ? path.join(recordDir, 'chunk_meta.json') : null; - const recordDense = recordDir ? path.join(recordDir, 'dense_vectors_uint8.json') : null; - return { - backend: backendLabel, - code: fileSignature(codeMeta), - prose: fileSignature(proseMeta), - codeDense: fileSignature(codeDense), - proseDense: fileSignature(proseDense), - records: recordMeta ? fileSignature(recordMeta) : null, - recordsDense: recordDense ? fileSignature(recordDense) : null - }; -} - -/** - * Build a deterministic cache key for the current query + settings. - * @returns {{key:string,payload:object}} - */ -function buildQueryCacheKey() { - const payload = { - query, - backend: backendLabel, - mode: searchMode, - topN: argv.n, - ann: annEnabled, - annMode: vectorExtension.annMode, - annProvider: vectorExtension.provider, - annExtension: vectorAnnEnabled, - sqliteFtsNormalize, - sqliteFtsProfile, - sqliteFtsWeights, - models: { - code: modelIdForCode, - prose: modelIdForProse, - records: modelIdForRecords - }, - filters: { - type: searchType, - author: searchAuthor, - calls: argv.calls || null, - uses: argv.uses || null, - signature: argv.signature || null, - param: argv.param || null, - import: searchImport, - lint: argv.lint || false, - churn: churnMin, - decorator: argv.decorator || null, - inferredType: argv['inferred-type'] || null, - returnType: argv['return-type'] || null, - throws: argv.throws || null, - reads: argv.reads || null, - writes: argv.writes || null, - mutates: argv.mutates || null, - risk: argv.risk || null, - riskTag: argv['risk-tag'] || null, - riskSource: argv['risk-source'] || null, - riskSink: argv['risk-sink'] || null, - riskCategory: argv['risk-category'] || null, - riskFlow: argv['risk-flow'] || null, - awaits: argv.awaits || null, - visibility: argv.visibility || null, - extends: argv.extends || null, - async: argv.async || false, - generator: argv.generator || false, - returns: argv.returns || false, - file: fileFilter || null, - ext: extFilter || null, - meta: metaFilters, - chunkAuthor: chunkAuthorFilter || null, - modifiedAfter, - modifiedSinceDays - } - }; - const raw = JSON.stringify(payload); - const key = crypto.createHash('sha1').update(raw).digest('hex'); - return { key, payload }; -} - - -const idxProse = runProse - ? (useSqlite ? loadIndexFromSqlite('prose') : loadIndex(resolveIndexDir('prose'))) - : { chunkMeta: [], denseVec: null, minhash: null }; -const idxCode = runCode - ? (useSqlite ? loadIndexFromSqlite('code') : loadIndex(resolveIndexDir('code'))) - : { chunkMeta: [], denseVec: null, minhash: null }; -const idxRecords = runRecords - ? loadIndex(resolveIndexDir('records')) - : { chunkMeta: [], denseVec: null, minhash: null }; -modelIdForCode = runCode ? (idxCode?.denseVec?.model || modelIdDefault) : null; -modelIdForProse = runProse ? (idxProse?.denseVec?.model || modelIdDefault) : null; -modelIdForRecords = runRecords ? 
(idxRecords?.denseVec?.model || modelIdDefault) : null; - -// --- QUERY TOKENIZATION --- -const parsedQuery = parseQueryInput(query); -const includeTokens = tokenizeQueryTerms(parsedQuery.includeTerms, dict); -const phraseTokens = parsedQuery.phrases - .map((phrase) => tokenizePhrase(phrase, dict)) - .filter((tokens) => tokens.length); -const phraseInfo = buildPhraseNgrams(phraseTokens, postingsConfig); -const phraseNgrams = phraseInfo.ngrams; -const phraseNgramSet = phraseNgrams.length ? new Set(phraseNgrams) : null; -const phraseRange = { min: phraseInfo.minLen, max: phraseInfo.maxLen }; -const excludeTokens = tokenizeQueryTerms(parsedQuery.excludeTerms, dict); -const excludePhraseTokens = parsedQuery.excludePhrases - .map((phrase) => tokenizePhrase(phrase, dict)) - .filter((tokens) => tokens.length); -const excludePhraseInfo = buildPhraseNgrams(excludePhraseTokens, postingsConfig); -const excludePhraseNgrams = excludePhraseInfo.ngrams; -const excludePhraseRange = excludePhraseInfo.minLen && excludePhraseInfo.maxLen - ? { min: excludePhraseInfo.minLen, max: excludePhraseInfo.maxLen } - : null; -const queryTokens = [...includeTokens, ...phraseTokens.flat()]; -const rx = queryTokens.length ? new RegExp(`(${queryTokens.join('|')})`, 'ig') : null; -const embeddingQueryText = [...parsedQuery.includeTerms, ...parsedQuery.phrases] - .join(' ') - .trim() || query; -const filters = { - type: searchType, - author: searchAuthor, - importName: searchImport, - lint: argv.lint, - churn: churnMin, - calls: argv.calls, - uses: argv.uses, - signature: argv.signature, - param: argv.param, - decorator: argv.decorator, - inferredType: argv['inferred-type'], - returnType: argv['return-type'], - throws: argv.throws, - reads: argv.reads, - writes: argv.writes, - mutates: argv.mutates, - alias: argv.alias, - risk: argv.risk, - riskTag: argv['risk-tag'], - riskSource: argv['risk-source'], - riskSink: argv['risk-sink'], - riskCategory: argv['risk-category'], - riskFlow: argv['risk-flow'], - awaits: argv.awaits, - branches: branchesMin, - loops: loopsMin, - breaks: breaksMin, - continues: continuesMin, - visibility: argv.visibility, - extends: argv.extends, - async: argv.async, - generator: argv.generator, - returns: argv.returns, - file: fileFilter, - ext: extFilter, - meta: metaFilters, - chunkAuthor: chunkAuthorFilter, - modifiedAfter, - excludeTokens, - excludePhrases: excludePhraseNgrams, - excludePhraseRange -}; -const searchPipeline = createSearchPipeline({ - useSqlite, - sqliteFtsRequested, - sqliteFtsNormalize, - sqliteFtsProfile, - sqliteFtsWeights, - bm25K1, - bm25B, - postingsConfig, - queryTokens, - phraseNgramSet, - phraseRange, - filters, - topN: argv.n, - annEnabled, - vectorAnnState, - vectorAnnUsed, - buildCandidateSetSqlite, - getTokenIndexForQuery, - rankSqliteFts, - rankVectorAnnSqlite -}); -// --- SEARCH BM25 TOKENS/PHRASES --- - -/** - * Build a compact search hit payload for tooling. 
- * @param {object} hit - * @returns {object} - */ -function compactHit(hit) { - if (!hit || typeof hit !== 'object') return hit; - const compact = {}; - const fields = [ - 'id', - 'file', - 'start', - 'end', - 'startLine', - 'endLine', - 'ext', - 'kind', - 'name', - 'headline', - 'score', - 'scoreType', - 'sparseScore', - 'sparseType', - 'annScore', - 'annSource', - 'annType' - ]; - for (const field of fields) { - if (hit[field] !== undefined) compact[field] = hit[field]; - } - return compact; -} - - -// --- MAIN --- -(async () => { - let cacheHit = false; - let cacheKey = null; - let cacheSignature = null; - let cacheData = null; - let cachedPayload = null; - - if (queryCacheEnabled) { - const signature = getIndexSignature(); - cacheSignature = JSON.stringify(signature); - const cacheKeyInfo = buildQueryCacheKey(); - cacheKey = cacheKeyInfo.key; - cacheData = loadQueryCache(queryCachePath); - const entry = cacheData.entries.find((e) => e.key === cacheKey && e.signature === cacheSignature); - if (entry) { - const ttl = Number.isFinite(Number(entry.ttlMs)) ? Number(entry.ttlMs) : queryCacheTtlMs; - if (!ttl || (Date.now() - entry.ts) <= ttl) { - cachedPayload = entry.payload || null; - if (cachedPayload) { - const hasCode = !runCode || Array.isArray(cachedPayload.code); - const hasProse = !runProse || Array.isArray(cachedPayload.prose); - const hasRecords = !runRecords || Array.isArray(cachedPayload.records); - if (hasCode && hasProse && hasRecords) { - cacheHit = true; - entry.ts = Date.now(); - } - } - } - } - } - - const needsEmbedding = !cacheHit && annEnabled && ( - (runProse && (idxProse.denseVec?.vectors?.length || vectorAnnState.prose.available)) || - (runCode && (idxCode.denseVec?.vectors?.length || vectorAnnState.code.available)) || - (runRecords && idxRecords.denseVec?.vectors?.length) - ); - const embeddingCache = new Map(); - const getEmbeddingForModel = async (modelId, dims) => { - if (!modelId) return null; - const cacheKey = useStubEmbeddings ? `${modelId}:${dims || 'default'}` : modelId; - if (embeddingCache.has(cacheKey)) return embeddingCache.get(cacheKey); - const embedding = await getQueryEmbedding({ - text: embeddingQueryText, - modelId, - dims, - modelDir: modelConfig.dir, - useStub: useStubEmbeddings - }); - embeddingCache.set(cacheKey, embedding); - return embedding; - }; - const queryEmbeddingCode = needsEmbedding && runCode && (idxCode.denseVec?.vectors?.length || vectorAnnState.code.available) - ? await getEmbeddingForModel(modelIdForCode, idxCode.denseVec?.dims || null) - : null; - const queryEmbeddingProse = needsEmbedding && runProse && (idxProse.denseVec?.vectors?.length || vectorAnnState.prose.available) - ? await getEmbeddingForModel(modelIdForProse, idxProse.denseVec?.dims || null) - : null; - const queryEmbeddingRecords = needsEmbedding && runRecords && idxRecords.denseVec?.vectors?.length - ? await getEmbeddingForModel(modelIdForRecords, idxRecords.denseVec?.dims || null) - : null; - const proseHits = cacheHit && cachedPayload - ? (cachedPayload.prose || []) - : (runProse ? searchPipeline(idxProse, 'prose', queryEmbeddingProse) : []); - const codeHits = cacheHit && cachedPayload - ? (cachedPayload.code || []) - : (runCode ? searchPipeline(idxCode, 'code', queryEmbeddingCode) : []); - const recordHits = cacheHit && cachedPayload - ? (cachedPayload.records || []) - : (runRecords ? searchPipeline(idxRecords, 'records', queryEmbeddingRecords) : []); - const annBackend = vectorAnnEnabled && (vectorAnnUsed.code || vectorAnnUsed.prose) - ? 
'sqlite-extension' - : 'js'; - - // Output - if (jsonOutput) { - // Full JSON - const memory = process.memoryUsage(); - console.log(JSON.stringify({ - backend: backendLabel, - prose: jsonCompact ? proseHits.map(compactHit) : proseHits, - code: jsonCompact ? codeHits.map(compactHit) : codeHits, - records: jsonCompact ? recordHits.map(compactHit) : recordHits, - stats: { - elapsedMs: Date.now() - t0, - annEnabled, - annMode: vectorExtension.annMode, - annBackend, - annExtension: vectorAnnEnabled ? { - provider: vectorExtension.provider, - table: vectorExtension.table, - available: { - code: vectorAnnState.code.available, - prose: vectorAnnState.prose.available, - records: vectorAnnState.records.available - } - } : null, - models: { - code: modelIdForCode, - prose: modelIdForProse, - records: modelIdForRecords - }, - cache: { - enabled: queryCacheEnabled, - hit: cacheHit, - key: cacheKey - }, - memory: { - rss: memory.rss, - heapTotal: memory.heapTotal, - heapUsed: memory.heapUsed, - external: memory.external, - arrayBuffers: memory.arrayBuffers - } - } - }, null, 2)); - } - - if (!jsonOutput) { - let showProse = runProse ? argv.n : 0; - let showCode = runCode ? argv.n : 0; - let showRecords = runRecords ? argv.n : 0; - - if (runProse && runCode) { - if (proseHits.length < argv.n) { - showCode += showProse; - } - if (codeHits.length < argv.n) { - showProse += showCode; - } - } - - // Human output, enhanced formatting and summaries - if (runProse) { - console.log(color.bold(`\n===== 📖 Markdown Results (${backendLabel}) =====`)); - const summaryState = { lastCount: 0 }; - proseHits.slice(0, showProse).forEach((h, i) => { - if (i < 2) { - process.stdout.write(formatFullChunk({ - chunk: h, - index: i, - mode: 'prose', - score: h.score, - scoreType: h.scoreType, - color, - queryTokens, - rx, - matched: argv.matched, - rootDir: ROOT, - summaryState - })); - } else { - process.stdout.write(formatShortChunk({ - chunk: h, - index: i, - mode: 'prose', - score: h.score, - scoreType: h.scoreType, - color, - queryTokens, - rx, - matched: argv.matched - })); - } - }); - console.log('\n'); - } - - if (runCode) { - console.log(color.bold(`===== 🔨 Code Results (${backendLabel}) =====`)); - const summaryState = { lastCount: 0 }; - codeHits.slice(0, showCode).forEach((h, i) => { - if (i < 1) { - process.stdout.write(formatFullChunk({ - chunk: h, - index: i, - mode: 'code', - score: h.score, - scoreType: h.scoreType, - color, - queryTokens, - rx, - matched: argv.matched, - rootDir: ROOT, - summaryState - })); - } else { - process.stdout.write(formatShortChunk({ - chunk: h, - index: i, - mode: 'code', - score: h.score, - scoreType: h.scoreType, - color, - queryTokens, - rx, - matched: argv.matched - })); - } - }); - console.log('\n'); - } - - if (runRecords) { - console.log(color.bold(`===== 🧾 Records Results (${backendLabel}) =====`)); - recordHits.slice(0, showRecords).forEach((h, i) => { - if (i < 2) { - process.stdout.write(formatFullChunk({ - chunk: h, - index: i, - mode: 'records', - score: h.score, - scoreType: h.scoreType, - color, - queryTokens, - rx, - matched: argv.matched, - rootDir: null, - summaryState: null - })); - } else { - process.stdout.write(formatShortChunk({ - chunk: h, - index: i, - mode: 'records', - score: h.score, - scoreType: h.scoreType, - color, - queryTokens, - rx, - matched: argv.matched - })); - } - }); - console.log('\n'); - } - - // Optionally stats - if (argv.stats) { - const cacheTag = queryCacheEnabled ? (cacheHit ? 
'cache=hit' : 'cache=miss') : 'cache=off'; - const statsParts = [ - `prose chunks=${idxProse.chunkMeta.length}`, - `code chunks=${idxCode.chunkMeta.length}`, - runRecords ? `records chunks=${idxRecords.chunkMeta.length}` : null, - `(${cacheTag})` - ].filter(Boolean); - console.log(color.gray(`Stats: ${statsParts.join(', ')}`)); - } - } - - /* ---------- Update .repoMetrics and .searchHistory ---------- */ - const metricsPath = path.join(metricsDir, 'metrics.json'); - const historyPath = path.join(metricsDir, 'searchHistory'); - const noResultPath = path.join(metricsDir, 'noResultQueries'); - await fs.mkdir(path.dirname(metricsPath), { recursive: true }); - - let metrics = {}; - try { - metrics = JSON.parse(await fs.readFile(metricsPath, 'utf8')); - } catch { - metrics = {}; - } - const inc = (f, key) => { - if (!metrics[f]) metrics[f] = { md: 0, code: 0, records: 0, terms: [] }; - metrics[f][key] = (metrics[f][key] || 0) + 1; - queryTokens.forEach((t) => { - if (!metrics[f].terms.includes(t)) metrics[f].terms.push(t); - }); - }; - proseHits.forEach((h) => inc(h.file, 'md')); - codeHits.forEach((h) => inc(h.file, 'code')); - recordHits.forEach((h) => inc(h.file, 'records')); - await fs.writeFile(metricsPath, JSON.stringify(metrics) + '\n'); - - await fs.appendFile( - historyPath, - JSON.stringify({ - time: new Date().toISOString(), - query, - mdFiles: proseHits.length, - codeFiles: codeHits.length, - recordFiles: recordHits.length, - ms: Date.now() - t0, - cached: cacheHit, - }) + '\n' - ); - - if (proseHits.length === 0 && codeHits.length === 0 && recordHits.length === 0) { - await fs.appendFile( - noResultPath, - JSON.stringify({ time: new Date().toISOString(), query }) + '\n' - ); - } - - if (queryCacheEnabled && cacheKey) { - if (!cacheData) cacheData = { version: 1, entries: [] }; - if (!cacheHit) { - cacheData.entries = cacheData.entries.filter((entry) => entry.key !== cacheKey); - cacheData.entries.push({ - key: cacheKey, - ts: Date.now(), - ttlMs: queryCacheTtlMs, - signature: cacheSignature, - meta: { - query, - backend: backendLabel - }, - payload: { - prose: proseHits, - code: codeHits, - records: recordHits - } - }); - } - pruneQueryCache(cacheData, queryCacheMaxEntries); - try { - await fs.mkdir(path.dirname(queryCachePath), { recursive: true }); - await fs.writeFile(queryCachePath, JSON.stringify(cacheData, null, 2)); - } catch {} - } -})(); diff --git a/src/search/embedding.js b/src/search/embedding.js deleted file mode 100644 index 466c0bbd8..000000000 --- a/src/search/embedding.js +++ /dev/null @@ -1,42 +0,0 @@ -import fs from 'node:fs'; -import { stubEmbedding } from '../shared/embedding.js'; - -const embedderCache = new Map(); - -async function getEmbedder(modelId, modelDir) { - if (embedderCache.has(modelId)) return embedderCache.get(modelId); - const { pipeline, env } = await import('@xenova/transformers'); - if (modelDir) { - try { - fs.mkdirSync(modelDir, { recursive: true }); - } catch {} - env.cacheDir = modelDir; - } - const embedder = await pipeline('feature-extraction', modelId); - embedderCache.set(modelId, embedder); - return embedder; -} - -/** - * Compute a query embedding using the configured model. - * Returns null when embeddings are unavailable. 
- * @param {object} options
- * @param {string} options.text
- * @param {string} options.modelId
- * @param {number} options.dims
- * @param {string} options.modelDir
- * @param {boolean} options.useStub
- * @returns {Promise}
- */
-export async function getQueryEmbedding({ text, modelId, dims, modelDir, useStub }) {
-  if (useStub) {
-    return stubEmbedding(text, dims);
-  }
-  try {
-    const embedder = await getEmbedder(modelId, modelDir);
-    const output = await embedder(text, { pooling: 'mean', normalize: true });
-    return Array.from(output.data);
-  } catch {
-    return null;
-  }
-}
diff --git a/src/search/filters.js b/src/search/filters.js
deleted file mode 100644
index 4e2c5e6bf..000000000
--- a/src/search/filters.js
+++ /dev/null
@@ -1,66 +0,0 @@
-import { parseJson } from './query-cache.js';
-
-/**
- * Normalize extension filters into a lowercase list.
- * @param {string|string[]|null|undefined} extArg
- * @returns {string[]|null}
- */
-export function normalizeExtFilter(extArg) {
-  const entries = Array.isArray(extArg) ? extArg : (extArg ? [extArg] : []);
-  if (!entries.length) return null;
-  const normalized = [];
-  for (const entry of entries) {
-    String(entry || '')
-      .split(/[,\s]+/)
-      .map((raw) => raw.trim())
-      .filter(Boolean)
-      .forEach((raw) => {
-        let value = raw.toLowerCase();
-        value = value.replace(/^\*+/, '');
-        if (!value) return;
-        if (!value.startsWith('.')) value = `.${value}`;
-        normalized.push(value);
-      });
-  }
-  return normalized.length ? Array.from(new Set(normalized)) : null;
-}
-
-/**
- * Parse --meta and --meta-json into a normalized filter list.
- * @param {string|string[]|null|undefined} metaArg
- * @param {string|string[]|null|undefined} metaJsonArg
- * @returns {Array<{key:string,value:any}>|null}
- */
-export function parseMetaFilters(metaArg, metaJsonArg) {
-  const filters = [];
-  const pushFilter = (rawKey, rawValue) => {
-    const key = String(rawKey || '').trim();
-    if (!key) return;
-    const value = rawValue === undefined ? null : rawValue;
-    filters.push({ key, value });
-  };
-  const handleEntry = (entry) => {
-    const text = String(entry || '').trim();
-    if (!text) return;
-    const split = text.split('=');
-    const key = split.shift();
-    const value = split.length ? split.join('=').trim() : null;
-    pushFilter(key, value === '' ? null : value);
-  };
-  const metaEntries = Array.isArray(metaArg) ? metaArg : (metaArg ? [metaArg] : []);
-  for (const entry of metaEntries) handleEntry(entry);
-  const metaJsonEntries = Array.isArray(metaJsonArg) ? metaJsonArg : (metaJsonArg ? [metaJsonArg] : []);
-  for (const entry of metaJsonEntries) {
-    const parsed = parseJson(entry, null);
-    if (!parsed) continue;
-    if (Array.isArray(parsed)) {
-      parsed.forEach((item) => {
-        if (!item || typeof item !== 'object') return;
-        Object.entries(item).forEach(([key, value]) => pushFilter(key, value));
-      });
-    } else if (typeof parsed === 'object') {
-      Object.entries(parsed).forEach(([key, value]) => pushFilter(key, value));
-    }
-  }
-  return filters.length ? filters : null;
-}
diff --git a/src/search/fts.js b/src/search/fts.js
deleted file mode 100644
index 631c42584..000000000
--- a/src/search/fts.js
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Resolve FTS5 bm25 weights from a profile or config override.
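// Behavior sketch for the filter parsers in src/search/filters.js above
// (inputs illustrative):
// normalizeExtFilter(['JS, *.ts'])         // -> ['.js', '.ts']
// parseMetaFilters('team=search', null)    // -> [{ key: 'team', value: 'search' }]
// parseMetaFilters(null, '{"env":"prod"}') // -> [{ key: 'env', value: 'prod' }]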
- * @param {string} profile - * @param {object|number[]|null} config - * @returns {number[]} - */ -export function resolveFtsWeights(profile, config) { - const profiles = { - balanced: { file: 0.2, name: 1.5, kind: 0.6, headline: 2.0, tokens: 1.0 }, - headline: { file: 0.1, name: 1.2, kind: 0.4, headline: 3.0, tokens: 1.0 }, - name: { file: 0.2, name: 2.5, kind: 0.8, headline: 1.2, tokens: 1.0 } - }; - const base = profiles[profile] || profiles.balanced; - - if (Array.isArray(config)) { - const values = config.map((v) => Number(v)).filter((v) => Number.isFinite(v)); - if (values.length >= 6) return values.slice(0, 6); - if (values.length === 5) return [0, ...values]; - } else if (config && typeof config === 'object') { - const merged = { ...base }; - for (const key of ['file', 'name', 'kind', 'headline', 'tokens']) { - if (Number.isFinite(Number(config[key]))) merged[key] = Number(config[key]); - } - return [0, merged.file, merged.name, merged.kind, merged.headline, merged.tokens]; - } - - return [0, base.file, base.name, base.kind, base.headline, base.tokens]; -} - -/** - * Build a bm25(chunks_fts, ...) SQL expression from weights. - * @param {number[]} weights - * @returns {string} - */ -export function buildFtsBm25Expr(weights) { - const safe = weights.map((val) => (Number.isFinite(val) ? val : 1)); - return `bm25(chunks_fts, ${safe.join(', ')})`; -} diff --git a/src/search/pipeline.js b/src/search/pipeline.js deleted file mode 100644 index 54cd95337..000000000 --- a/src/search/pipeline.js +++ /dev/null @@ -1,256 +0,0 @@ -import { filterChunks } from './output.js'; -import { rankBM25, rankDenseVectors, rankMinhash } from './rankers.js'; -import { extractNgrams, tri } from '../shared/tokenize.js'; - -/** - * Create a search pipeline runner bound to a shared context. - * @param {object} context - * @returns {(idx:object, mode:'code'|'prose'|'records', queryEmbedding:number[]|null)=>Array} - */ -export function createSearchPipeline(context) { - const { - useSqlite, - sqliteFtsRequested, - sqliteFtsNormalize, - sqliteFtsProfile, - sqliteFtsWeights, - bm25K1, - bm25B, - postingsConfig, - queryTokens, - phraseNgramSet, - phraseRange, - filters, - topN, - annEnabled, - vectorAnnState, - vectorAnnUsed, - buildCandidateSetSqlite, - getTokenIndexForQuery, - rankSqliteFts, - rankVectorAnnSqlite - } = context; - - /** - * Build a candidate set from file-backed indexes (or SQLite). 
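// Worked example for resolveFtsWeights/buildFtsBm25Expr above: with the
// 'headline' profile and no override, resolveFtsWeights('headline', null)
// returns [0, 0.1, 1.2, 0.4, 3.0, 1.0] (the leading 0 appears to zero-weight
// the table's first column), and buildFtsBm25Expr renders that as
//   bm25(chunks_fts, 0, 0.1, 1.2, 0.4, 3, 1)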
- * @param {object} idx - * @param {string[]} tokens - * @param {'code'|'prose'|'records'} mode - * @returns {Set|null} - */ - function buildCandidateSet(idx, tokens, mode) { - if (useSqlite && (mode === 'code' || mode === 'prose')) { - return buildCandidateSetSqlite(mode, tokens); - } - - const candidates = new Set(); - let matched = false; - - if (postingsConfig.enablePhraseNgrams !== false && idx.phraseNgrams?.vocab && idx.phraseNgrams?.postings) { - const vocabIndex = new Map(idx.phraseNgrams.vocab.map((t, i) => [t, i])); - const ngrams = extractNgrams(tokens, postingsConfig.phraseMinN, postingsConfig.phraseMaxN); - for (const ng of ngrams) { - const hit = vocabIndex.get(ng); - if (hit === undefined) continue; - const posting = idx.phraseNgrams.postings[hit] || []; - posting.forEach((id) => candidates.add(id)); - matched = matched || posting.length > 0; - } - } - - if (postingsConfig.enableChargrams !== false && idx.chargrams?.vocab && idx.chargrams?.postings) { - const vocabIndex = new Map(idx.chargrams.vocab.map((t, i) => [t, i])); - for (const token of tokens) { - for (let n = postingsConfig.chargramMinN; n <= postingsConfig.chargramMaxN; n++) { - for (const gram of tri(token, n)) { - const hit = vocabIndex.get(gram); - if (hit === undefined) continue; - const posting = idx.chargrams.postings[hit] || []; - posting.forEach((id) => candidates.add(id)); - matched = matched || posting.length > 0; - } - } - } - } - - return matched ? candidates : null; - } - - function getPhraseMatchInfo(chunk, phraseSet, range) { - if (!phraseSet || !phraseSet.size || !chunk) return { matches: 0 }; - let ngrams = Array.isArray(chunk.ngrams) && chunk.ngrams.length ? chunk.ngrams : null; - if (!ngrams && Array.isArray(chunk.tokens) && range?.min && range?.max) { - ngrams = extractNgrams(chunk.tokens, range.min, range.max); - } - if (!ngrams || !ngrams.length) return { matches: 0 }; - let matches = 0; - for (const ng of ngrams) { - if (phraseSet.has(ng)) matches += 1; - } - return { matches }; - } - - /** - * Execute the full search pipeline for a mode. - * @param {object} idx - * @param {'code'|'prose'|'records'} mode - * @param {number[]|null} queryEmbedding - * @returns {Array} - */ - return function runSearch(idx, mode, queryEmbedding) { - const meta = idx.chunkMeta; - const sqliteEnabledForMode = useSqlite && (mode === 'code' || mode === 'prose'); - - // Filtering - const filteredMeta = filterChunks(meta, filters); - const allowedIdx = new Set(filteredMeta.map((c) => c.id)); - - const searchTopN = Math.max(1, Number(topN) || 1); - const expandedTopN = searchTopN * 3; - - // Main search: BM25 token match - let candidates = null; - let bmHits = []; - if (sqliteEnabledForMode && sqliteFtsRequested) { - bmHits = rankSqliteFts(idx, queryTokens, mode, expandedTopN, sqliteFtsNormalize); - candidates = bmHits.length ? new Set(bmHits.map((h) => h.idx)) : null; - } else { - const tokenIndexOverride = sqliteEnabledForMode ? 
getTokenIndexForQuery(queryTokens, mode) : null; - candidates = buildCandidateSet(idx, queryTokens, mode); - bmHits = rankBM25({ - idx, - tokens: queryTokens, - topN: expandedTopN, - tokenIndexOverride, - k1: bm25K1, - b: bm25B - }); - } - - // MinHash (embedding) ANN, if requested - let annHits = []; - let annSource = null; - if (annEnabled) { - if (queryEmbedding && vectorAnnState?.[mode]?.available) { - annHits = rankVectorAnnSqlite(mode, queryEmbedding, expandedTopN, candidates); - if (!annHits.length && candidates && candidates.size) { - annHits = rankVectorAnnSqlite(mode, queryEmbedding, expandedTopN, null); - } - if (annHits.length) { - vectorAnnUsed[mode] = true; - annSource = 'sqlite-vector'; - } - } - if (!annHits.length && queryEmbedding && idx.denseVec?.vectors?.length) { - annHits = rankDenseVectors(idx, queryEmbedding, expandedTopN, candidates); - if (annHits.length) annSource = 'dense'; - } - if (!annHits.length) { - annHits = rankMinhash(idx, queryTokens, expandedTopN); - if (annHits.length) annSource = 'minhash'; - } - } - - // Combine and dedup - const allHits = new Map(); - const sparseType = (sqliteEnabledForMode && sqliteFtsRequested) ? 'fts' : 'bm25'; - const recordHit = (idxVal, update) => { - const current = allHits.get(idxVal) || { bm25: null, fts: null, ann: null, annSource: null }; - allHits.set(idxVal, { ...current, ...update }); - }; - bmHits.forEach((h) => { - recordHit(h.idx, sparseType === 'fts' ? { fts: h.score } : { bm25: h.score }); - }); - annHits.forEach((h) => { - recordHit(h.idx, { ann: h.sim, annSource }); - }); - - const scored = [...allHits.entries()] - .filter(([idxVal]) => allowedIdx.has(idxVal)) - .map(([idxVal, scores]) => { - const sparseScore = scores.fts ?? scores.bm25 ?? null; - const annScore = scores.ann ?? null; - const sparseTypeValue = scores.fts != null ? 'fts' : (scores.bm25 != null ? 'bm25' : null); - let scoreType = null; - let score = null; - if (annScore != null && (sparseScore == null || annScore > sparseScore)) { - scoreType = 'ann'; - score = annScore; - } else if (sparseScore != null) { - scoreType = sparseTypeValue; - score = sparseScore; - } else { - scoreType = 'none'; - score = 0; - } - const chunk = meta[idxVal]; - if (!chunk) return null; - let phraseMatches = 0; - let phraseBoost = 0; - let phraseFactor = 0; - if (phraseNgramSet && phraseRange?.min && phraseRange?.max) { - const matchInfo = getPhraseMatchInfo(chunk, phraseNgramSet, phraseRange); - phraseMatches = matchInfo.matches; - if (phraseMatches) { - phraseFactor = Math.min(0.5, phraseMatches * 0.1); - phraseBoost = score * phraseFactor; - score += phraseBoost; - } - } - const scoreBreakdown = { - sparse: sparseScore != null ? { - type: sparseTypeValue, - score: sparseScore, - normalized: scores.fts != null ? sqliteFtsNormalize : null, - weights: scores.fts != null ? sqliteFtsWeights : null, - profile: scores.fts != null ? sqliteFtsProfile : null, - k1: scores.bm25 != null ? bm25K1 : null, - b: scores.bm25 != null ? bm25B : null - } : null, - ann: annScore != null ? { - score: annScore, - source: scores.annSource || null - } : null, - phrase: phraseNgramSet ? 
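// Worked example for the phrase boost above: with 3 matching phrase n-grams,
// phraseFactor = Math.min(0.5, 3 * 0.1) = 0.3, so a selected score of 2.0
// becomes 2.0 + 2.0 * 0.3 = 2.6. The factor caps at 0.5 once five or more
// phrase n-grams match.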
{ - matches: phraseMatches, - boost: phraseBoost, - factor: phraseFactor - } : null, - selected: { - type: scoreType, - score - } - }; - return { - idx: idxVal, - score, - scoreType, - scoreBreakdown, - chunk, - sparseScore, - sparseType: sparseTypeValue, - annScore, - annSource: scores.annSource || null - }; - }) - .filter(Boolean) - .sort((a, b) => (b.score - a.score) || (a.idx - b.idx)) - .slice(0, searchTopN); - - const ranked = scored - .map((entry) => ({ - ...entry.chunk, - score: entry.score, - scoreType: entry.scoreType, - sparseScore: entry.sparseScore, - sparseType: entry.sparseType, - annScore: entry.annScore, - annSource: entry.annSource, - annType: entry.annSource, - scoreBreakdown: entry.scoreBreakdown - })) - .filter(Boolean); - - return ranked; - }; -} diff --git a/src/shared/artifact-io.js b/src/shared/artifact-io.js new file mode 100644 index 000000000..d1dcb0825 --- /dev/null +++ b/src/shared/artifact-io.js @@ -0,0 +1,255 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { gunzipSync } from 'node:zlib'; + +const MAX_JSON_BYTES_ENV = Number(process.env.PAIROFCLEATS_MAX_JSON_BYTES); +export const MAX_JSON_BYTES = Number.isFinite(MAX_JSON_BYTES_ENV) && MAX_JSON_BYTES_ENV > 0 + ? Math.floor(MAX_JSON_BYTES_ENV) + : 512 * 1024 * 1024 - 1024; + +const toJsonTooLargeError = (filePath, size) => { + const err = new Error( + `JSON artifact too large to load (${size} bytes): ${filePath}` + ); + err.code = 'ERR_JSON_TOO_LARGE'; + return err; +}; + +const getBakPath = (filePath) => `${filePath}.bak`; + +const cleanupBak = (filePath) => { + const bakPath = getBakPath(filePath); + if (!fs.existsSync(bakPath)) return; + try { + fs.rmSync(bakPath, { force: true }); + } catch {} +}; + +const PIECE_CACHE_LIMIT = 8; +const pieceCache = new Map(); + +const buildCacheKey = (filePath) => { + try { + const stat = fs.statSync(filePath); + return `${filePath}:${stat.size}:${stat.mtimeMs}`; + } catch { + return null; + } +}; + +const readCache = (filePath) => { + const key = buildCacheKey(filePath); + if (!key) return null; + const cached = pieceCache.get(key); + if (!cached) return null; + pieceCache.delete(key); + pieceCache.set(key, cached); + return cached; +}; + +const writeCache = (filePath, value) => { + const key = buildCacheKey(filePath); + if (!key) return; + if (pieceCache.has(key)) pieceCache.delete(key); + pieceCache.set(key, value); + if (pieceCache.size > PIECE_CACHE_LIMIT) { + const firstKey = pieceCache.keys().next().value; + if (firstKey) pieceCache.delete(firstKey); + } +}; + +const shouldTreatAsTooLarge = (err) => { + if (!err) return false; + if (err.code === 'ERR_STRING_TOO_LONG') return true; + const message = typeof err.message === 'string' ? 
err.message : ''; + return message.includes('Invalid string length'); +}; + +const readBuffer = (targetPath, maxBytes) => { + const stat = fs.statSync(targetPath); + if (stat.size > maxBytes) { + throw toJsonTooLargeError(targetPath, stat.size); + } + return fs.readFileSync(targetPath); +}; + +export const readJsonFile = (filePath, { maxBytes = MAX_JSON_BYTES } = {}) => { + const parseBuffer = (buffer, sourcePath) => { + if (buffer.length > maxBytes) { + throw toJsonTooLargeError(sourcePath, buffer.length); + } + try { + return JSON.parse(buffer.toString('utf8')); + } catch (err) { + if (shouldTreatAsTooLarge(err)) { + throw toJsonTooLargeError(sourcePath, buffer.length); + } + throw err; + } + }; + const tryRead = (targetPath, options = {}) => { + const { gzip = false, cleanup = false } = options; + const buffer = readBuffer(targetPath, maxBytes); + const parsed = parseBuffer(gzip ? gunzipSync(buffer) : buffer, targetPath); + if (cleanup) cleanupBak(targetPath); + return parsed; + }; + const bakPath = getBakPath(filePath); + if (fs.existsSync(filePath)) { + try { + return tryRead(filePath, { cleanup: true }); + } catch (err) { + if (fs.existsSync(bakPath)) { + return tryRead(bakPath); + } + throw err; + } + } + if (filePath.endsWith('.json')) { + const gzPath = `${filePath}.gz`; + const gzBakPath = getBakPath(gzPath); + if (fs.existsSync(gzPath)) { + try { + return tryRead(gzPath, { gzip: true, cleanup: true }); + } catch (err) { + if (fs.existsSync(gzBakPath)) { + return tryRead(gzBakPath, { gzip: true }); + } + throw err; + } + } + } + if (fs.existsSync(bakPath)) { + return tryRead(bakPath); + } + if (filePath.endsWith('.json')) { + const gzBakPath = getBakPath(`${filePath}.gz`); + if (fs.existsSync(gzBakPath)) { + return tryRead(gzBakPath, { gzip: true }); + } + } + throw new Error(`Missing JSON artifact: ${filePath}`); +}; + +const readJsonFileCached = (filePath, { maxBytes = MAX_JSON_BYTES } = {}) => { + const cached = readCache(filePath); + if (cached) return cached; + const data = readJsonFile(filePath, { maxBytes }); + writeCache(filePath, data); + return data; +}; + +export const readJsonLinesArraySync = (filePath, { maxBytes = MAX_JSON_BYTES } = {}) => { + const cached = readCache(filePath); + if (cached) return cached; + const tryRead = (targetPath, cleanup = false) => { + const stat = fs.statSync(targetPath); + if (stat.size > maxBytes) { + throw toJsonTooLargeError(targetPath, stat.size); + } + let raw = ''; + try { + raw = fs.readFileSync(targetPath, 'utf8'); + } catch (err) { + if (shouldTreatAsTooLarge(err)) { + throw toJsonTooLargeError(targetPath, stat.size); + } + throw err; + } + if (!raw.trim()) return []; + const parsed = raw + .split(/\r?\n/) + .filter((line) => line.trim().length > 0) + .map((line) => JSON.parse(line)); + if (cleanup) cleanupBak(targetPath); + writeCache(targetPath, parsed); + return parsed; + }; + const bakPath = getBakPath(filePath); + if (fs.existsSync(filePath)) { + try { + return tryRead(filePath, true); + } catch (err) { + if (fs.existsSync(bakPath)) { + return tryRead(bakPath); + } + throw err; + } + } + if (fs.existsSync(bakPath)) { + return tryRead(bakPath); + } + throw new Error(`Missing JSONL artifact: ${filePath}`); +}; + +const readShardFiles = (dir, prefix) => { + if (!fs.existsSync(dir)) return []; + return fs + .readdirSync(dir) + .filter((name) => name.startsWith(prefix) && (name.endsWith('.json') || name.endsWith('.jsonl'))) + .sort() + .map((name) => path.join(dir, name)); +}; + +const existsOrBak = (filePath) => 
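// Resolution-order sketch for readJsonFile above (path illustrative):
// readJsonFile('/repo/index-code/index_state.json')
// - primary exists but fails to read/parse -> retry index_state.json.bak
// - primary missing -> index_state.json.gz, then index_state.json.bak,
//   then index_state.json.gz.bak, before throwing 'Missing JSON artifact'.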
fs.existsSync(filePath) || fs.existsSync(getBakPath(filePath)); + +export const loadChunkMeta = (dir, { maxBytes = MAX_JSON_BYTES } = {}) => { + const metaPath = path.join(dir, 'chunk_meta.meta.json'); + const partsDir = path.join(dir, 'chunk_meta.parts'); + if (existsOrBak(metaPath) || fs.existsSync(partsDir)) { + const meta = existsOrBak(metaPath) ? readJsonFile(metaPath, { maxBytes }) : null; + const parts = Array.isArray(meta?.parts) && meta.parts.length + ? meta.parts.map((name) => path.join(dir, name)) + : readShardFiles(partsDir, 'chunk_meta.part-'); + if (!parts.length) { + throw new Error(`Missing chunk_meta shard files in ${partsDir}`); + } + return parts.flatMap((partPath) => readJsonLinesArraySync(partPath, { maxBytes })); + } + const jsonlPath = path.join(dir, 'chunk_meta.jsonl'); + if (existsOrBak(jsonlPath)) { + return readJsonLinesArraySync(jsonlPath, { maxBytes }); + } + const jsonPath = path.join(dir, 'chunk_meta.json'); + if (existsOrBak(jsonPath)) { + return readJsonFile(jsonPath, { maxBytes }); + } + throw new Error(`Missing index artifact: chunk_meta.json`); +}; + +export const loadTokenPostings = (dir, { maxBytes = MAX_JSON_BYTES } = {}) => { + const metaPath = path.join(dir, 'token_postings.meta.json'); + const shardsDir = path.join(dir, 'token_postings.shards'); + if (existsOrBak(metaPath) || fs.existsSync(shardsDir)) { + const meta = existsOrBak(metaPath) ? readJsonFile(metaPath, { maxBytes }) : {}; + const shards = Array.isArray(meta?.parts) && meta.parts.length + ? meta.parts.map((name) => path.join(dir, name)) + : readShardFiles(shardsDir, 'token_postings.part-'); + if (!shards.length) { + throw new Error(`Missing token_postings shard files in ${shardsDir}`); + } + const vocab = []; + const postings = []; + for (const shardPath of shards) { + const shard = readJsonFileCached(shardPath, { maxBytes }); + const shardVocab = Array.isArray(shard?.vocab) ? shard.vocab : (Array.isArray(shard?.arrays?.vocab) ? shard.arrays.vocab : []); + const shardPostings = Array.isArray(shard?.postings) ? shard.postings : (Array.isArray(shard?.arrays?.postings) ? shard.arrays.postings : []); + vocab.push(...shardVocab); + postings.push(...shardPostings); + } + const docLengths = Array.isArray(meta?.docLengths) + ? meta.docLengths + : (Array.isArray(meta?.arrays?.docLengths) ? 
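// Layout sketch for loadChunkMeta above (shard names follow the prefixes in
// the code; counts illustrative):
//   chunk_meta.meta.json  -> { parts: ['chunk_meta.parts/chunk_meta.part-0000.jsonl', ...] }
//   chunk_meta.parts/     -> chunk_meta.part-0000.jsonl, chunk_meta.part-0001.jsonl, ...
// Parts are read as JSONL and concatenated in manifest (or sorted directory)
// order; a single chunk_meta.jsonl or chunk_meta.json is the fallback.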
meta.arrays.docLengths : []); + return { + ...meta, + vocab, + postings, + docLengths + }; + } + const jsonPath = path.join(dir, 'token_postings.json'); + if (existsOrBak(jsonPath)) { + return readJsonFile(jsonPath, { maxBytes }); + } + throw new Error(`Missing index artifact: token_postings.json`); +}; diff --git a/src/shared/artifact-schemas.js b/src/shared/artifact-schemas.js new file mode 100644 index 000000000..d4c189d0a --- /dev/null +++ b/src/shared/artifact-schemas.js @@ -0,0 +1,320 @@ +import Ajv from 'ajv'; + +const ajv = new Ajv({ + allErrors: true, + allowUnionTypes: true, + strict: true +}); + +const intId = { type: 'integer', minimum: 0 }; +const nullableString = { type: ['string', 'null'] }; +const nullableInt = { type: ['integer', 'null'], minimum: 0 }; + +const chunkMetaEntry = { + type: 'object', + required: ['id', 'start', 'end'], + properties: { + id: intId, + fileId: nullableInt, + start: intId, + end: intId, + startLine: nullableInt, + endLine: nullableInt, + kind: nullableString, + name: nullableString, + ext: nullableString + }, + additionalProperties: true +}; + +const postingEntry = { + type: 'array', + minItems: 2, + maxItems: 2, + items: [intId, intId] +}; + +const postingsList = { + type: 'array', + items: { type: 'array', items: postingEntry } +}; + +const vocabArray = { + type: 'array', + items: { type: 'string' } +}; + +const docLengthsArray = { + type: 'array', + items: intId +}; + +const graphNode = { + type: 'object', + required: ['id', 'out', 'in'], + properties: { + id: { type: 'string' }, + file: nullableString, + name: nullableString, + kind: nullableString, + chunkId: nullableString, + out: { type: 'array', items: { type: 'string' } }, + in: { type: 'array', items: { type: 'string' } } + }, + additionalProperties: true +}; + +const graphPayload = { + type: 'object', + required: ['nodeCount', 'edgeCount', 'nodes'], + properties: { + nodeCount: intId, + edgeCount: intId, + nodes: { type: 'array', items: graphNode } + }, + additionalProperties: true +}; + +const idPostingList = { + type: 'array', + items: { type: 'array', items: intId } +}; + +const denseVectorArray = { + type: 'array', + items: intId +}; + +const validators = { + chunk_meta: ajv.compile({ + type: 'array', + items: chunkMetaEntry + }), + file_meta: ajv.compile({ + type: 'array', + items: { + type: 'object', + required: ['id', 'file'], + properties: { + id: intId, + file: { type: 'string' }, + ext: nullableString + }, + additionalProperties: true + } + }), + repo_map: ajv.compile({ + type: 'array', + items: { + type: 'object', + required: ['file', 'name'], + properties: { + file: { type: 'string' }, + name: { type: 'string' }, + kind: nullableString, + signature: nullableString, + exported: { type: ['boolean', 'null'] } + }, + additionalProperties: true + } + }), + file_relations: ajv.compile({ + type: 'array', + items: { + type: 'object', + required: ['file', 'relations'], + properties: { + file: { type: 'string' }, + relations: { type: 'object' } + }, + additionalProperties: true + } + }), + token_postings: ajv.compile({ + type: 'object', + required: ['vocab', 'postings', 'docLengths'], + properties: { + vocab: vocabArray, + postings: postingsList, + docLengths: docLengthsArray, + avgDocLen: { type: 'number' }, + totalDocs: { type: 'integer' } + }, + additionalProperties: true + }), + field_postings: ajv.compile({ + type: 'object', + required: ['fields'], + properties: { + fields: { + type: 'object', + additionalProperties: { + type: 'object', + required: ['vocab', 'postings', 
'docLengths'], + properties: { + vocab: vocabArray, + postings: postingsList, + docLengths: docLengthsArray, + avgDocLen: { type: 'number' }, + totalDocs: { type: 'integer' } + }, + additionalProperties: true + } + } + }, + additionalProperties: true + }), + field_tokens: ajv.compile({ + type: 'array', + items: { + type: 'object', + properties: { + name: { type: 'array', items: { type: 'string' } }, + signature: { type: 'array', items: { type: 'string' } }, + doc: { type: 'array', items: { type: 'string' } }, + comment: { type: 'array', items: { type: 'string' } }, + body: { type: 'array', items: { type: 'string' } } + }, + additionalProperties: true + } + }), + minhash_signatures: ajv.compile({ + type: 'object', + required: ['signatures'], + properties: { + signatures: { + type: 'array', + items: { type: 'array', items: intId } + } + }, + additionalProperties: true + }), + dense_vectors: ajv.compile({ + type: 'object', + required: ['dims', 'vectors'], + properties: { + dims: { type: 'integer', minimum: 1 }, + model: nullableString, + scale: { type: 'number' }, + vectors: { type: 'array', items: denseVectorArray } + }, + additionalProperties: true + }), + dense_vectors_hnsw_meta: ajv.compile({ + type: 'object', + required: ['dims', 'count', 'space', 'm', 'efConstruction', 'efSearch'], + properties: { + version: { type: 'integer', minimum: 1 }, + generatedAt: nullableString, + model: nullableString, + dims: { type: 'integer', minimum: 1 }, + count: { type: 'integer', minimum: 0 }, + space: { type: 'string' }, + m: { type: 'integer', minimum: 1 }, + efConstruction: { type: 'integer', minimum: 1 }, + efSearch: { type: 'integer', minimum: 1 } + }, + additionalProperties: true + }), + phrase_ngrams: ajv.compile({ + type: 'object', + required: ['vocab', 'postings'], + properties: { + vocab: vocabArray, + postings: idPostingList + }, + additionalProperties: true + }), + chargram_postings: ajv.compile({ + type: 'object', + required: ['vocab', 'postings'], + properties: { + vocab: vocabArray, + postings: idPostingList + }, + additionalProperties: true + }), + filter_index: ajv.compile({ + type: 'object', + required: ['fileById', 'fileChunksById'], + properties: { + fileChargramN: { type: 'integer', minimum: 2 }, + fileById: { type: 'array', items: { type: 'string' } }, + fileChunksById: idPostingList, + byExt: { type: 'object' }, + byKind: { type: 'object' }, + byAuthor: { type: 'object' }, + byChunkAuthor: { type: 'object' }, + byVisibility: { type: 'object' }, + fileChargrams: { type: 'object' } + }, + additionalProperties: true + }), + pieces_manifest: ajv.compile({ + type: 'object', + required: ['version', 'pieces'], + properties: { + version: { type: 'integer' }, + generatedAt: nullableString, + updatedAt: nullableString, + mode: nullableString, + stage: nullableString, + pieces: { + type: 'array', + items: { + type: 'object', + required: ['type', 'name', 'format', 'path'], + properties: { + type: { type: 'string' }, + name: { type: 'string' }, + format: { type: 'string' }, + path: { type: 'string' }, + bytes: { type: ['integer', 'null'] }, + checksum: nullableString + }, + additionalProperties: true + } + } + }, + additionalProperties: true + }), + index_state: ajv.compile({ + type: 'object', + required: ['generatedAt', 'mode'], + properties: { + generatedAt: { type: 'string' }, + updatedAt: nullableString, + mode: { type: 'string' }, + stage: nullableString + }, + additionalProperties: true + }), + graph_relations: ajv.compile({ + type: 'object', + required: ['version', 'generatedAt', 
'callGraph', 'usageGraph', 'importGraph'], + properties: { + version: { type: 'integer', minimum: 1 }, + generatedAt: { type: 'string' }, + callGraph: graphPayload, + usageGraph: graphPayload, + importGraph: graphPayload + }, + additionalProperties: true + }) +}; + +const formatError = (error) => { + const path = error.instancePath || '/'; + const message = error.message || 'schema error'; + return `${path} ${message}`.trim(); +}; + +export function validateArtifact(name, data) { + const validator = validators[name]; + if (!validator) return { ok: true, errors: [] }; + const ok = Boolean(validator(data)); + const errors = ok || !validator.errors + ? [] + : validator.errors.map(formatError); + return { ok, errors }; +} diff --git a/src/shared/bench-progress.js b/src/shared/bench-progress.js new file mode 100644 index 000000000..77ff2e158 --- /dev/null +++ b/src/shared/bench-progress.js @@ -0,0 +1,29 @@ +export function formatShardFileProgress(entry, options = {}) { + const shardByLabel = options.shardByLabel instanceof Map ? options.shardByLabel : new Map(); + const lineTotal = options.lineTotal; + const count = Number.isFinite(entry.fileIndex) ? entry.fileIndex : entry.count; + const total = Number.isFinite(entry.fileTotal) ? entry.fileTotal : entry.total; + const pct = Number.isFinite(entry.pct) + ? entry.pct + : (Number.isFinite(count) && Number.isFinite(total) && total > 0) + ? (count / total) * 100 + : null; + const pctText = Number.isFinite(pct) ? `${pct.toFixed(1)}%` : null; + const shardLabel = entry.shardLabel; + const shardInfo = shardLabel ? shardByLabel.get(shardLabel) : null; + const shardText = shardInfo + ? `${shardInfo.index}/${shardInfo.total}` + : (shardLabel || null); + const shardPrefix = shardText ? `[shard ${shardText}]` : '[shard]'; + const countText = Number.isFinite(count) && Number.isFinite(total) + ? `${count}/${total}` + : null; + const lineText = Number.isFinite(lineTotal) && lineTotal > 0 + ? `lines ${lineTotal.toLocaleString()}` + : null; + const head = [shardPrefix, countText, pctText ? `(${pctText})` : null] + .filter(Boolean) + .join(' '); + const tail = [lineText, entry.file].filter(Boolean); + return tail.length ? 
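// Usage sketch for validateArtifact above (data illustrative); names without
// a registered validator pass through as ok.
// const { ok, errors } = validateArtifact('dense_vectors', { dims: 384, vectors: [[0, 3]] });
// if (!ok) console.warn(`dense_vectors invalid: ${errors.join('; ')}`);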
`${head} | ${tail.join(' | ')}` : head; +} diff --git a/src/shared/bundle-io.js b/src/shared/bundle-io.js new file mode 100644 index 000000000..6a75616ec --- /dev/null +++ b/src/shared/bundle-io.js @@ -0,0 +1,116 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { Packr, Unpackr } from 'msgpackr'; +import { sha1, checksumString } from './hash.js'; +import { stableStringify } from './stable-json.js'; + +const BUNDLE_FORMAT_TAG = 'pairofcleats.bundle'; +const BUNDLE_VERSION = 1; +const MSGPACK_EXTENSIONS = new Set(['.mpk', '.msgpack', '.msgpackr']); + +const packr = new Packr({ useRecords: false, structuredClone: true }); +const unpackr = new Unpackr({ useRecords: false }); + +const normalizeBundlePayload = (value) => { + if (Array.isArray(value)) { + return value.map((entry) => normalizeBundlePayload(entry)); + } + if (!value || typeof value !== 'object' || value.constructor !== Object) { + return value; + } + const out = {}; + for (const key of Object.keys(value).sort()) { + out[key] = normalizeBundlePayload(value[key]); + } + return out; +}; + +const checksumBundlePayload = async (payload) => ( + checksumString(stableStringify(payload)) +); + +export function normalizeBundleFormat(raw) { + if (typeof raw !== 'string') return 'json'; + const normalized = raw.trim().toLowerCase(); + if (normalized === 'msgpack' || normalized === 'msgpackr' || normalized === 'mpk') { + return 'msgpack'; + } + return 'json'; +} + +export function resolveBundleFilename(relKey, format) { + const ext = format === 'msgpack' ? 'mpk' : 'json'; + return `${sha1(relKey)}.${ext}`; +} + +export function resolveBundleFormatFromName(bundleName, fallback = 'json') { + if (typeof bundleName !== 'string' || !bundleName) return fallback; + const ext = path.extname(bundleName).toLowerCase(); + return MSGPACK_EXTENSIONS.has(ext) ? 'msgpack' : 'json'; +} + +export async function writeBundleFile({ bundlePath, bundle, format = 'json' }) { + const resolvedFormat = normalizeBundleFormat(format); + if (resolvedFormat === 'msgpack') { + const normalized = normalizeBundlePayload(bundle); + const checksum = await checksumBundlePayload(normalized); + const envelope = { + format: BUNDLE_FORMAT_TAG, + version: BUNDLE_VERSION, + checksum: checksum ? { algo: checksum.algo, value: checksum.value } : null, + payload: normalized + }; + const encoded = packr.pack(envelope); + await fs.writeFile(bundlePath, Buffer.from(encoded)); + return { + format: resolvedFormat, + checksum: checksum?.value ?? null, + checksumAlgo: checksum?.algo ?? 
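// Example output for formatShardFileProgress above (an en-US locale is
// assumed for the thousands separator):
// formatShardFileProgress(
//   { shardLabel: 'a', fileIndex: 3, fileTotal: 12, file: 'src/x.js' },
//   { lineTotal: 4200 }
// )
// -> '[shard a] 3/12 (25.0%) | lines 4,200 | src/x.js'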
null + }; + } + await fs.writeFile(bundlePath, `${JSON.stringify(bundle)}\n`); + return { format: resolvedFormat, checksum: null, checksumAlgo: null }; +} + +export async function readBundleFile(bundlePath, { format = null } = {}) { + const resolvedFormat = format || resolveBundleFormatFromName(bundlePath); + if (resolvedFormat === 'msgpack') { + const buffer = await fs.readFile(bundlePath); + const envelope = unpackr.unpack(buffer); + if (!envelope || typeof envelope !== 'object') { + return { ok: false, reason: 'invalid bundle envelope' }; + } + if (envelope.format !== BUNDLE_FORMAT_TAG || envelope.version !== BUNDLE_VERSION) { + return { ok: false, reason: 'unsupported bundle envelope' }; + } + const payload = envelope.payload; + if (!payload || !Array.isArray(payload.chunks)) { + return { ok: false, reason: 'invalid bundle payload' }; + } + const checksum = envelope.checksum?.value; + if (checksum) { + const normalized = normalizeBundlePayload(payload); + if (envelope.checksum?.algo === 'xxh64') { + const expected = await checksumBundlePayload(normalized); + if (!expected || expected.value !== checksum) { + return { ok: false, reason: 'bundle checksum mismatch' }; + } + return { ok: true, bundle: normalized }; + } + if (envelope.checksum?.algo === 'sha1') { + const expected = sha1(stableStringify(normalized)); + if (expected !== checksum) { + return { ok: false, reason: 'bundle checksum mismatch' }; + } + return { ok: true, bundle: normalized }; + } + } + return { ok: true, bundle: payload }; + } + const raw = await fs.readFile(bundlePath, 'utf8'); + const bundle = JSON.parse(raw); + if (!bundle || !Array.isArray(bundle.chunks)) { + return { ok: false, reason: 'invalid bundle' }; + } + return { ok: true, bundle }; +} diff --git a/src/shared/cache.js b/src/shared/cache.js new file mode 100644 index 000000000..e7ad09fcb --- /dev/null +++ b/src/shared/cache.js @@ -0,0 +1,136 @@ +import { LRUCache } from 'lru-cache'; + +const BYTES_PER_MB = 1024 * 1024; + +export const DEFAULT_CACHE_MB = { + fileText: 64, + summary: 32, + lint: 16, + complexity: 16, + gitMeta: 16 +}; + +export const DEFAULT_CACHE_TTL_MS = { + fileText: 0, + summary: 0, + lint: 0, + complexity: 0, + gitMeta: 0 +}; + +export const mbToBytes = (value) => { + const parsed = Number(value); + if (!Number.isFinite(parsed)) return 0; + return Math.max(0, Math.floor(parsed * BYTES_PER_MB)); +}; + +export const estimateStringBytes = (value) => { + if (typeof value !== 'string') return 0; + return Buffer.byteLength(value, 'utf8'); +}; + +export const estimateJsonBytes = (value) => { + try { + return Buffer.byteLength(JSON.stringify(value), 'utf8'); + } catch { + return 0; + } +}; + +export function createCacheReporter({ enabled = false, log = null } = {}) { + const entries = []; + return { + track(stats) { + if (stats) entries.push(stats); + }, + report() { + if (!enabled || !log || !entries.length) return; + log('Cache stats:'); + for (const stats of entries) { + const sizeMb = stats.maxSizeBytes ? (stats.maxSizeBytes / BYTES_PER_MB).toFixed(1) : 'n/a'; + const ttlMs = Number.isFinite(stats.ttlMs) ? stats.ttlMs : 0; + log(`- ${stats.name}: hits=${stats.hits}, misses=${stats.misses}, evictions=${stats.evictions}, sets=${stats.sets}, maxEntries=${stats.maxEntries ?? 'n/a'}, maxMb=${sizeMb}, ttlMs=${ttlMs}`); + } + } + }; +} + +export function createLruCache({ + name, + maxMb, + ttlMs, + maxEntries, + sizeCalculation, + reporter +}) { + const entryLimit = Number.isFinite(Number(maxEntries)) + ? 
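// Round-trip sketch for the bundle helpers above (path illustrative):
// await writeBundleFile({ bundlePath: '/tmp/ab.mpk', bundle: { chunks: [] }, format: 'msgpack' });
// const res = await readBundleFile('/tmp/ab.mpk'); // -> { ok: true, bundle: { chunks: [] } }
// A tampered payload instead yields { ok: false, reason: 'bundle checksum mismatch' }.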
Math.max(0, Math.floor(Number(maxEntries))) + : null; + const hasEntryLimit = entryLimit !== null; + const maxSizeBytes = hasEntryLimit ? 0 : mbToBytes(maxMb); + const ttlValue = Number.isFinite(Number(ttlMs)) ? Math.max(0, Number(ttlMs)) : 0; + + const stats = { + name, + hits: 0, + misses: 0, + evictions: 0, + sets: 0, + maxEntries: hasEntryLimit ? entryLimit : null, + maxSizeBytes, + ttlMs: ttlValue + }; + + if (reporter && typeof reporter.track === 'function') { + reporter.track(stats); + } + + if ((hasEntryLimit && entryLimit > 0) || maxSizeBytes > 0) { + const options = { + allowStale: false, + updateAgeOnGet: true, + dispose: (_value, _key, reason) => { + if (reason === 'evict') stats.evictions += 1; + } + }; + if (hasEntryLimit && entryLimit > 0) { + options.max = entryLimit; + } else { + options.maxSize = maxSizeBytes; + options.sizeCalculation = typeof sizeCalculation === 'function' + ? sizeCalculation + : estimateJsonBytes; + } + if (ttlValue > 0) options.ttl = ttlValue; + const cache = new LRUCache(options); + return { + get(key) { + const value = cache.get(key); + if (value === undefined) { + stats.misses += 1; + return null; + } + stats.hits += 1; + return value; + }, + set(key, value) { + stats.sets += 1; + cache.set(key, value); + }, + cache, + stats + }; + } + + return { + get() { + stats.misses += 1; + return null; + }, + set() { + stats.sets += 1; + }, + cache: null, + stats + }; +} diff --git a/src/shared/capabilities.js b/src/shared/capabilities.js new file mode 100644 index 000000000..333608867 --- /dev/null +++ b/src/shared/capabilities.js @@ -0,0 +1,44 @@ +import { tryRequire } from './optional-deps.js'; + +let cached = null; + +const check = (name, options) => tryRequire(name, options).ok; + +export function getCapabilities(options = {}) { + if (cached && options.refresh !== true) return cached; + const opts = { + verbose: options.verbose === true, + logger: options.logger + }; + cached = { + watcher: { + chokidar: check('chokidar', opts), + parcel: check('@parcel/watcher', opts) + }, + regex: { + re2: check('re2', opts), + re2js: check('re2js', opts) + }, + hash: { + nodeRsXxhash: check('@node-rs/xxhash', opts), + wasmXxhash: check('xxhash-wasm', opts) + }, + compression: { + gzip: true, + zstd: check('@mongodb-js/zstd', opts) + }, + extractors: { + pdf: check('pdfjs-dist', opts), + docx: check('mammoth', opts) + }, + mcp: { + sdk: check('@modelcontextprotocol/sdk', opts), + legacy: true + }, + externalBackends: { + tantivy: check('tantivy', opts), + lancedb: check('@lancedb/lancedb', opts) + } + }; + return cached; +} diff --git a/src/shared/cli-options.js b/src/shared/cli-options.js new file mode 100644 index 000000000..27283abeb --- /dev/null +++ b/src/shared/cli-options.js @@ -0,0 +1,123 @@ +import { validateConfig } from '../config/validate.js'; + +export const INDEX_BUILD_OPTIONS = { + mode: { type: 'string', default: 'all' }, + stage: { type: 'string' }, + dims: { type: 'number', default: 384 }, + threads: { type: 'number' }, + incremental: { type: 'boolean', default: false, alias: 'i' }, + 'stub-embeddings': { type: 'boolean', default: false }, + watch: { type: 'boolean', default: false }, + 'watch-poll': { type: 'number', default: 2000 }, + 'watch-debounce': { type: 'number', default: 500 }, + sqlite: { type: 'boolean' }, + 'debug-crash': { type: 'boolean', default: false }, + model: { type: 'string' }, + repo: { type: 'string' } +}; + +export const BENCH_OPTIONS = { + ann: { type: 'boolean' }, + 'no-ann': { type: 'boolean' }, + json: { type: 'boolean', 
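// Usage sketch for createLruCache above, wired to the exported defaults:
const reporter = createCacheReporter({ enabled: true, log: console.log });
const fileTextCache = createLruCache({
  name: 'fileText',
  maxMb: DEFAULT_CACHE_MB.fileText, // 64 MB byte budget, no entry cap
  ttlMs: DEFAULT_CACHE_TTL_MS.fileText, // 0 disables expiry
  sizeCalculation: estimateStringBytes,
  reporter
});
fileTextCache.set('src/a.js', 'file contents');
fileTextCache.get('src/a.js'); // counted as a hit
reporter.report();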
default: false }, + 'write-report': { type: 'boolean', default: false }, + build: { type: 'boolean', default: false }, + 'build-index': { type: 'boolean', default: false }, + 'build-sqlite': { type: 'boolean', default: false }, + incremental: { type: 'boolean', default: false }, + 'stub-embeddings': { type: 'boolean', default: false }, + 'index-profile': { type: 'string' }, + 'no-index-profile': { type: 'boolean', default: false }, + 'real-embeddings': { type: 'boolean', default: false }, + queries: { type: 'string' }, + backend: { type: 'string' }, + out: { type: 'string' }, + 'bm25-k1': { type: 'number' }, + 'bm25-b': { type: 'number' }, + 'fts-profile': { type: 'string' }, + 'fts-weights': { type: 'string' }, + repo: { type: 'string' }, + top: { type: 'number', default: 5 }, + limit: { type: 'number', default: 0 }, + 'heap-mb': { type: 'number' }, + 'query-concurrency': { type: 'number' }, + threads: { type: 'number' } +}; + +export function mergeCliOptions(...sets) { + const merged = {}; + for (const set of sets) { + if (!set || typeof set !== 'object') continue; + for (const [key, value] of Object.entries(set)) { + merged[key] = value; + } + } + return merged; +} + +const INDEX_BUILD_SCHEMA = { + type: 'object', + properties: { + mode: { type: 'string' }, + stage: { type: 'string' }, + dims: { type: 'number' }, + threads: { type: 'number' }, + incremental: { type: 'boolean' }, + watch: { type: 'boolean' }, + sqlite: { type: 'boolean' }, + model: { type: 'string' }, + repo: { type: 'string' } + } +}; + +const BENCH_SCHEMA = { + type: 'object', + properties: { + ann: { type: 'boolean' }, + 'no-ann': { type: 'boolean' }, + build: { type: 'boolean' }, + 'build-index': { type: 'boolean' }, + 'build-sqlite': { type: 'boolean' }, + incremental: { type: 'boolean' }, + 'stub-embeddings': { type: 'boolean' }, + 'index-profile': { type: 'string' }, + 'real-embeddings': { type: 'boolean' }, + backend: { type: 'string' }, + top: { type: 'number' }, + limit: { type: 'number' }, + 'bm25-k1': { type: 'number' }, + 'bm25-b': { type: 'number' }, + 'fts-profile': { type: 'string' }, + 'fts-weights': { type: 'string' }, + 'query-concurrency': { type: 'number' }, + threads: { type: 'number' }, + 'heap-mb': { type: 'number' } + } +}; + +const throwOnErrors = (label, errors) => { + if (!errors.length) return; + const message = errors.join('; '); + throw new Error(`${label} validation failed: ${message}`); +}; + +export function validateBuildArgs(argv) { + const result = validateConfig(INDEX_BUILD_SCHEMA, argv); + if (!result.ok) throwOnErrors('build-index args', result.errors); +} + +export function validateBenchArgs(argv) { + const result = validateConfig(BENCH_SCHEMA, argv); + if (!result.ok) throwOnErrors('bench args', result.errors); + const conflicts = []; + if (argv.ann && argv['no-ann']) { + conflicts.push('ann and no-ann cannot both be set'); + } + if (argv['stub-embeddings'] && argv['real-embeddings']) { + conflicts.push('stub-embeddings and real-embeddings cannot both be set'); + } + if (argv['index-profile'] && argv['no-index-profile']) { + conflicts.push('index-profile and no-index-profile cannot both be set'); + } + throwOnErrors('bench args', conflicts); +} diff --git a/src/shared/cli.js b/src/shared/cli.js new file mode 100644 index 000000000..7e3fe9d61 --- /dev/null +++ b/src/shared/cli.js @@ -0,0 +1,47 @@ +import path from 'node:path'; +import yargs from 'yargs/yargs'; +import { hideBin } from 'yargs/helpers'; + +const DEFAULT_PARSER_CONFIG = { + 'camel-case-expansion': false, + 
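// Conflict-check sketch for validateBenchArgs above:
// validateBenchArgs({ ann: true, 'no-ann': true });
// -> throws 'bench args validation failed: ann and no-ann cannot both be set'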
'dot-notation': false +}; + +/** + * Create a configured yargs instance for CLI tools. + * @param {{argv?:string[],scriptName?:string,usage?:string,options?:object,aliases?:object}} input + * @returns {import('yargs').Argv} + */ +export function createCli(input = {}) { + const { + argv = process.argv, + scriptName, + usage, + options = {}, + aliases = {} + } = input; + const name = scriptName || path.basename(argv[1] || 'cli'); + const mergedOptions = { ...options }; + if (!Object.prototype.hasOwnProperty.call(mergedOptions, 'profile')) { + mergedOptions.profile = { + type: 'string', + describe: 'Profile name from profiles/*.json' + }; + } + const parser = yargs(hideBin(argv)) + .scriptName(name) + .parserConfiguration(DEFAULT_PARSER_CONFIG) + .strict(false) + .help() + .alias('h', 'help') + .wrap(100); + if (usage) parser.usage(usage); + if (Object.keys(mergedOptions).length) parser.options(mergedOptions); + if (Object.keys(aliases).length) parser.alias(aliases); + parser.middleware((args) => { + if (args.profile) { + process.env.PAIROFCLEATS_PROFILE = String(args.profile).trim(); + } + }); + return parser; +} diff --git a/src/shared/concurrency.js b/src/shared/concurrency.js index c700070bd..248c02158 100644 --- a/src/shared/concurrency.js +++ b/src/shared/concurrency.js @@ -1,22 +1,95 @@ +import PQueue from 'p-queue'; + /** - * Run async work over items with a concurrency limit. + * Create shared task queues for IO, CPU, and embeddings work. + * @param {{ioConcurrency:number,cpuConcurrency:number,embeddingConcurrency?:number,ioPendingLimit?:number,cpuPendingLimit?:number,embeddingPendingLimit?:number}} input + * @returns {{io:PQueue,cpu:PQueue,embedding:PQueue}} + */ +export function createTaskQueues({ + ioConcurrency, + cpuConcurrency, + embeddingConcurrency, + ioPendingLimit, + cpuPendingLimit, + embeddingPendingLimit +}) { + const io = new PQueue({ concurrency: Math.max(1, Math.floor(ioConcurrency || 1)) }); + const cpu = new PQueue({ concurrency: Math.max(1, Math.floor(cpuConcurrency || 1)) }); + const embeddingLimit = Number.isFinite(Number(embeddingConcurrency)) + ? Math.max(1, Math.floor(Number(embeddingConcurrency))) + : Math.max(1, Math.floor(cpuConcurrency || 1)); + const embedding = new PQueue({ concurrency: embeddingLimit }); + const applyLimit = (queue, limit) => { + if (!Number.isFinite(limit) || limit <= 0) return; + queue.maxPending = Math.floor(limit); + }; + applyLimit(io, ioPendingLimit); + applyLimit(cpu, cpuPendingLimit); + applyLimit(embedding, embeddingPendingLimit); + return { io, cpu, embedding }; +} + +/** + * Run async work over items using a shared queue. + * @param {PQueue} queue * @param {Array} items - * @param {number} limit * @param {(item:any, index:number)=>Promise} worker - * @returns {Promise} + * @param {{collectResults?:boolean,onResult?:(result:any, index:number)=>Promise,retries?:number,retryDelayMs?:number}} [options] + * @returns {Promise} */ -export async function runWithConcurrency(items, limit, worker) { - if (!items.length) return []; - const results = new Array(items.length); - const workerCount = Math.max(1, Math.min(limit, items.length)); - let nextIndex = 0; - const runners = Array.from({ length: workerCount }, async () => { +export async function runWithQueue(queue, items, worker, options = {}) { + if (!items.length) return options.collectResults === false ? null : []; + const collectResults = options.collectResults !== false; + const onResult = typeof options.onResult === 'function' ? 
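// Usage sketch for createCli above (script name and options illustrative):
// const argv = createCli({
//   scriptName: 'pairofcleats-search',
//   usage: 'Usage: $0 <query> [options]',
//   options: { repo: { type: 'string' }, n: { type: 'number', default: 10 } }
// }).parse();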
options.onResult : null; + const retries = Number.isFinite(Number(options.retries)) ? Math.max(0, Math.floor(Number(options.retries))) : 0; + const retryDelayMs = Number.isFinite(Number(options.retryDelayMs)) ? Math.max(0, Math.floor(Number(options.retryDelayMs))) : 0; + const results = collectResults ? new Array(items.length) : null; + const pending = new Set(); + const maxPending = Number.isFinite(queue?.maxPending) ? queue.maxPending : null; + const enqueue = async (item, index) => { + if (maxPending) { + while (pending.size >= maxPending) { + await Promise.race(pending); + } + } + const task = queue.add(async () => { + let attempt = 0; + let result; while (true) { - const idx = nextIndex++; - if (idx >= items.length) break; - results[idx] = await worker(items[idx], idx); + try { + result = await worker(item, index); + break; + } catch (err) { + attempt += 1; + if (attempt > retries) throw err; + if (retryDelayMs > 0) { + await new Promise((resolve) => setTimeout(resolve, retryDelayMs)); + } + } } - }); - await Promise.all(runners); + if (collectResults) results[index] = result; + if (onResult) await onResult(result, index); + return result; + }); + pending.add(task); + task.finally(() => pending.delete(task)); + }; + for (let index = 0; index < items.length; index += 1) { + await enqueue(items[index], index); + } + await Promise.all(pending); return results; } + +/** + * Run async work over items with a per-call concurrency limit. + * @param {Array} items + * @param {number} limit + * @param {(item:any, index:number)=>Promise} worker + * @param {{collectResults?:boolean,onResult?:(result:any, index:number)=>Promise}} [options] + * @returns {Promise} + */ +export async function runWithConcurrency(items, limit, worker, options = {}) { + const queue = new PQueue({ concurrency: Math.max(1, Math.floor(limit || 1)) }); + return runWithQueue(queue, items, worker, options); +} diff --git a/src/shared/config.js b/src/shared/config.js new file mode 100644 index 000000000..a7f93f88c --- /dev/null +++ b/src/shared/config.js @@ -0,0 +1,17 @@ +export function isPlainObject(value) { + return value && typeof value === 'object' && !Array.isArray(value); +} + +export function mergeConfig(base, overrides) { + if (!isPlainObject(base)) return overrides; + if (!isPlainObject(overrides)) return base; + const next = { ...base }; + for (const [key, value] of Object.entries(overrides)) { + if (isPlainObject(value) && isPlainObject(next[key])) { + next[key] = mergeConfig(next[key], value); + } else { + next[key] = value; + } + } + return next; +} diff --git a/src/shared/dictionary.js b/src/shared/dictionary.js new file mode 100644 index 000000000..18d22f067 --- /dev/null +++ b/src/shared/dictionary.js @@ -0,0 +1,99 @@ +import { TextEncoder } from 'node:util'; + +const encoder = new TextEncoder(); + +const compareBytes = (query, bytes, start, end) => { + const entryLen = end - start; + const minLen = Math.min(query.length, entryLen); + for (let i = 0; i < minLen; i += 1) { + const diff = query[i] - bytes[start + i]; + if (diff < 0) return -1; + if (diff > 0) return 1; + } + if (query.length === entryLen) return 0; + return query.length < entryLen ? -1 : 1; +}; + +const compareEntry = (a, b) => { + const minLen = Math.min(a.bytes.length, b.bytes.length); + for (let i = 0; i < minLen; i += 1) { + const diff = a.bytes[i] - b.bytes[i]; + if (diff < 0) return -1; + if (diff > 0) return 1; + } + if (a.bytes.length === b.bytes.length) return 0; + return a.bytes.length < b.bytes.length ? 
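// Usage sketch for the queue helpers above; `paths` (an array of file paths)
// and the concurrency limits are assumptions for illustration.
import fsp from 'node:fs/promises';
const queues = createTaskQueues({ ioConcurrency: 8, cpuConcurrency: 4, ioPendingLimit: 64 });
const sizes = await runWithQueue(
  queues.io,
  paths,
  async (p) => (await fsp.stat(p)).size,
  { retries: 2, retryDelayMs: 100 }
);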
-1 : 1; +}; + +const normalizeWord = (word) => (typeof word === 'string' ? word : ''); + +export const createSharedDictionary = (words) => { + const list = Array.isArray(words) ? words : Array.from(words || []); + const entries = []; + let maxLen = 0; + for (const word of list) { + const normalized = normalizeWord(word); + if (!normalized) continue; + if (normalized.length > maxLen) maxLen = normalized.length; + entries.push({ word: normalized, bytes: encoder.encode(normalized) }); + } + if (!entries.length) return null; + entries.sort(compareEntry); + let totalBytes = 0; + for (const entry of entries) totalBytes += entry.bytes.length; + if (!Number.isFinite(totalBytes) || totalBytes <= 0) return null; + if (totalBytes > 0xffffffff) return null; + + const bytesBuffer = new SharedArrayBuffer(totalBytes); + const bytesView = new Uint8Array(bytesBuffer); + const offsetsBuffer = new SharedArrayBuffer((entries.length + 1) * Uint32Array.BYTES_PER_ELEMENT); + const offsetsView = new Uint32Array(offsetsBuffer); + + let offset = 0; + for (let i = 0; i < entries.length; i += 1) { + const entry = entries[i]; + offsetsView[i] = offset; + bytesView.set(entry.bytes, offset); + offset += entry.bytes.length; + } + offsetsView[entries.length] = offset; + + return { + bytes: bytesBuffer, + offsets: offsetsBuffer, + count: entries.length, + maxLen + }; +}; + +export const createSharedDictionaryView = (payload) => { + if (!payload?.bytes || !payload?.offsets) return null; + const bytes = new Uint8Array(payload.bytes); + const offsets = new Uint32Array(payload.offsets); + const countRaw = Number.isFinite(payload.count) ? payload.count : Math.max(0, offsets.length - 1); + const count = Math.max(0, Math.min(countRaw, offsets.length - 1)); + const maxLen = Number.isFinite(payload.maxLen) ? payload.maxLen : 0; + const has = (value) => { + if (typeof value !== 'string' || !value) return false; + if (maxLen && value.length > maxLen) return false; + const query = encoder.encode(value); + let low = 0; + let high = count - 1; + while (low <= high) { + const mid = (low + high) >> 1; + const start = offsets[mid]; + const end = offsets[mid + 1]; + const cmp = compareBytes(query, bytes, start, end); + if (cmp === 0) return true; + if (cmp < 0) high = mid - 1; + else low = mid + 1; + } + return false; + }; + return { + size: count, + has, + __sharedDict: true, + __maxTokenLength: maxLen + }; +}; diff --git a/src/shared/embedding.js b/src/shared/embedding.js index 32b5173eb..9c7f313c8 100644 --- a/src/shared/embedding.js +++ b/src/shared/embedding.js @@ -7,7 +7,8 @@ import crypto from 'node:crypto'; * @returns {number[]} */ export function stubEmbedding(text, dims) { - const safeDims = Number.isFinite(dims) && dims > 0 ? Math.floor(dims) : 512; + // Keep stub embeddings aligned with the default index dimensions. + const safeDims = Number.isFinite(dims) && dims > 0 ? 
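// Usage sketch for the shared dictionary above: build it once, post the
// SharedArrayBuffers to a worker, and rebuild a read-only view there
// (worker wiring omitted).
// const payload = createSharedDictionary(['alpha', 'beta', 'gamma']);
// const view = createSharedDictionaryView(payload);
// view.has('beta');  // true (binary search over the shared byte buffer)
// view.has('delta'); // false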
Math.floor(dims) : 384; const hash = crypto.createHash('sha256').update(text).digest(); let seed = 0; for (const byte of hash) seed = (seed * 31 + byte) >>> 0; diff --git a/src/shared/encoding.js b/src/shared/encoding.js new file mode 100644 index 000000000..baf0aca58 --- /dev/null +++ b/src/shared/encoding.js @@ -0,0 +1,82 @@ +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import chardet from 'chardet'; +import iconv from 'iconv-lite'; +import { sha1 } from './hash.js'; + +const utf8Decoder = new TextDecoder('utf-8', { fatal: true }); + +const normalizeEncoding = (value) => { + if (!value) return null; + return String(value).trim().replace(/_/g, '-').toLowerCase(); +}; + +const detectEncoding = (buffer) => { + if (!buffer || !buffer.length) return { encoding: null, confidence: null }; + try { + const detected = chardet.analyse(buffer) || []; + if (Array.isArray(detected) && detected.length) { + const best = detected[0]; + return { + encoding: normalizeEncoding(best?.name), + confidence: Number.isFinite(best?.confidence) ? best.confidence : null + }; + } + } catch {} + try { + const detected = normalizeEncoding(chardet.detect(buffer)); + return { encoding: detected, confidence: null }; + } catch {} + return { encoding: null, confidence: null }; +}; + +export const decodeTextBuffer = (buffer) => { + if (!buffer || !buffer.length) { + return { + text: '', + encoding: 'utf8', + usedFallback: false, + confidence: null + }; + } + try { + return { + text: utf8Decoder.decode(buffer), + encoding: 'utf8', + usedFallback: false, + confidence: null + }; + } catch {} + const { encoding: detected, confidence } = detectEncoding(buffer); + let encoding = detected || 'latin1'; + if (!iconv.encodingExists(encoding)) { + encoding = 'latin1'; + } + return { + text: iconv.decode(buffer, encoding), + encoding, + usedFallback: true, + confidence + }; +}; + +export const readTextFile = async (filePath) => { + const buffer = await fsPromises.readFile(filePath); + return decodeTextBuffer(buffer); +}; + +export const readTextFileWithHash = async (filePath, options = {}) => { + const buffer = options.buffer ?? await fsPromises.readFile(filePath); + const decoded = decodeTextBuffer(buffer); + const hash = sha1(buffer); + return { + ...decoded, + hash, + buffer + }; +}; + +export const readTextFileSync = (filePath) => { + const buffer = fs.readFileSync(filePath); + return decodeTextBuffer(buffer); +}; diff --git a/src/shared/env.js b/src/shared/env.js new file mode 100644 index 000000000..3b1471d1b --- /dev/null +++ b/src/shared/env.js @@ -0,0 +1,65 @@ +const TRUE_VALUES = new Set(['1', 'true', 'yes', 'on']); +const FALSE_VALUES = new Set(['0', 'false', 'no', 'off']); + +const normalizeString = (value) => { + if (typeof value !== 'string') return ''; + return value.trim(); +}; + +const parseBool = (value) => { + if (value == null) return null; + const normalized = String(value).trim().toLowerCase(); + if (TRUE_VALUES.has(normalized)) return true; + if (FALSE_VALUES.has(normalized)) return false; + return null; +}; + +const parseNumber = (value) => { + if (value == null || value === '') return null; + const parsed = Number(value); + return Number.isFinite(parsed) ? 
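/*
  Editor's note: illustrative sketch for src/shared/encoding.js; it is not part
  of the patch. The file path is a placeholder.

    import { decodeTextBuffer, readTextFile } from './encoding.js';

    // Strict UTF-8 is tried first; on failure the charset is detected with
    // chardet and decoded through iconv-lite.
    const decoded = decodeTextBuffer(Buffer.from([0xe9, 0x74, 0xe9])); // "été" in latin1
    // decoded.usedFallback === true; decoded.encoding is the detected charset or 'latin1'

    const fileText = await readTextFile('./legacy-latin1.txt');
    // => { text, encoding, usedFallback, confidence }
*/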
parsed : null; +}; + +export function getEnvConfig(env = process.env) { + return { + profile: normalizeString(env.PAIROFCLEATS_PROFILE), + cacheRoot: normalizeString(env.PAIROFCLEATS_CACHE_ROOT), + home: normalizeString(env.PAIROFCLEATS_HOME), + dictDir: normalizeString(env.PAIROFCLEATS_DICT_DIR), + model: normalizeString(env.PAIROFCLEATS_MODEL), + modelsDir: normalizeString(env.PAIROFCLEATS_MODELS_DIR), + toolingDir: normalizeString(env.PAIROFCLEATS_TOOLING_DIR), + toolingInstallScope: normalizeString(env.PAIROFCLEATS_TOOLING_INSTALL_SCOPE), + extensionsDir: normalizeString(env.PAIROFCLEATS_EXTENSIONS_DIR), + embeddings: normalizeString(env.PAIROFCLEATS_EMBEDDINGS), + debugCrash: parseBool(env.PAIROFCLEATS_DEBUG_CRASH), + threads: parseNumber(env.PAIROFCLEATS_THREADS), + bundleThreads: parseNumber(env.PAIROFCLEATS_BUNDLE_THREADS), + workerPool: normalizeString(env.PAIROFCLEATS_WORKER_POOL), + maxOldSpaceMb: parseNumber(env.PAIROFCLEATS_MAX_OLD_SPACE_MB), + uvThreadpoolSize: parseNumber(env.PAIROFCLEATS_UV_THREADPOOL_SIZE), + nodeOptions: normalizeString(env.PAIROFCLEATS_NODE_OPTIONS), + stage: normalizeString(env.PAIROFCLEATS_STAGE), + ftsProfile: normalizeString(env.PAIROFCLEATS_FTS_PROFILE), + vectorExtension: normalizeString(env.PAIROFCLEATS_VECTOR_EXTENSION), + verbose: parseBool(env.PAIROFCLEATS_VERBOSE), + progressFiles: parseBool(env.PAIROFCLEATS_PROGRESS_FILES), + progressLines: parseBool(env.PAIROFCLEATS_PROGRESS_LINES), + fileCacheMax: parseNumber(env.PAIROFCLEATS_FILE_CACHE_MAX), + summaryCacheMax: parseNumber(env.PAIROFCLEATS_SUMMARY_CACHE_MAX), + logFormat: normalizeString(env.PAIROFCLEATS_LOG_FORMAT), + logLevel: normalizeString(env.PAIROFCLEATS_LOG_LEVEL) + }; +} + +export function parseEnvBool(value) { + return parseBool(value); +} + +export function normalizeEnvString(value) { + return normalizeString(value); +} + +export function parseEnvNumber(value) { + return parseNumber(value); +} diff --git a/src/shared/error-codes.js b/src/shared/error-codes.js new file mode 100644 index 000000000..a16fc79af --- /dev/null +++ b/src/shared/error-codes.js @@ -0,0 +1,24 @@ +export const ERROR_CODES = Object.freeze({ + INVALID_REQUEST: 'INVALID_REQUEST', + NOT_FOUND: 'NOT_FOUND', + NO_INDEX: 'NO_INDEX', + INTERNAL: 'INTERNAL', + QUEUE_OVERLOADED: 'QUEUE_OVERLOADED', + TOOL_TIMEOUT: 'TOOL_TIMEOUT', + DOWNLOAD_VERIFY_FAILED: 'DOWNLOAD_VERIFY_FAILED', + ARCHIVE_UNSAFE: 'ARCHIVE_UNSAFE', + ARCHIVE_TOO_LARGE: 'ARCHIVE_TOO_LARGE' +}); + +export const isErrorCode = (value) => ( + typeof value === 'string' && Object.values(ERROR_CODES).includes(value) +); + +export const createError = (code, message, details = null) => { + const err = new Error(message || 'Error'); + err.code = code; + if (details && typeof details === 'object') { + Object.assign(err, details); + } + return err; +}; diff --git a/src/shared/file-stats.js b/src/shared/file-stats.js new file mode 100644 index 000000000..7240659e5 --- /dev/null +++ b/src/shared/file-stats.js @@ -0,0 +1,36 @@ +import fs from 'node:fs'; +import { runWithConcurrency } from './concurrency.js'; +import { toPosix } from './files.js'; + +export async function countFileLines(filePath) { + return new Promise((resolve) => { + let count = 0; + let sawData = false; + const stream = fs.createReadStream(filePath); + stream.on('data', (chunk) => { + sawData = sawData || chunk.length > 0; + for (let i = 0; i < chunk.length; i += 1) { + if (chunk[i] === 10) count += 1; + } + }); + stream.on('error', () => resolve(0)); + stream.on('end', () => resolve(sawData ? 
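/*
  Editor's note: illustrative sketch for src/shared/env.js and
  src/shared/error-codes.js; it is not part of the patch. The script name is a
  placeholder.

    import { getEnvConfig } from './env.js';
    import { ERROR_CODES, createError } from './error-codes.js';

    // PAIROFCLEATS_THREADS=8 PAIROFCLEATS_VERBOSE=yes node build-index.js
    const env = getEnvConfig();
    // env.threads === 8, env.verbose === true; unset string vars come back as ''

    if (!env.cacheRoot) {
      throw createError(ERROR_CODES.INVALID_REQUEST, 'Cache root not configured',
        { hint: 'set PAIROFCLEATS_CACHE_ROOT' });
    }
*/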
count + 1 : 0)); + }); +} + +export async function countLinesForEntries(entries, { concurrency = 8 } = {}) { + const lineCounts = new Map(); + if (!Array.isArray(entries) || entries.length === 0) return lineCounts; + await runWithConcurrency( + entries, + concurrency, + async (entry) => { + const rel = toPosix(entry.rel || entry.abs || ''); + if (!rel) return; + const lines = await countFileLines(entry.abs); + lineCounts.set(rel, lines); + }, + { collectResults: false } + ); + return lineCounts; +} diff --git a/src/shared/hash.js b/src/shared/hash.js index aa67f74eb..b2547a5b4 100644 --- a/src/shared/hash.js +++ b/src/shared/hash.js @@ -1,4 +1,27 @@ import crypto from 'node:crypto'; +import fs from 'node:fs'; +import xxhash from 'xxhash-wasm'; + +const XXHASH_HEX_WIDTH = 16; +let xxhashState = null; + +const loadXxhash = async () => { + if (!xxhashState) { + xxhashState = xxhash(); + } + return xxhashState; +}; + +const formatXxhashHex = (value) => { + if (typeof value === 'bigint') { + return value.toString(16).padStart(XXHASH_HEX_WIDTH, '0'); + } + if (typeof value === 'number') { + return Math.floor(value).toString(16).padStart(XXHASH_HEX_WIDTH, '0'); + } + if (typeof value === 'string') return value; + return ''; +}; /** * Compute a SHA1 hash hex string. @@ -8,3 +31,34 @@ import crypto from 'node:crypto'; export function sha1(str) { return crypto.createHash('sha1').update(str).digest('hex'); } + +/** + * Compute a SHA1 hash for a file on disk. + * @param {string} filePath + * @returns {Promise} + */ +export function sha1File(filePath) { + return new Promise((resolve, reject) => { + const hash = crypto.createHash('sha1'); + const stream = fs.createReadStream(filePath); + stream.on('error', reject); + stream.on('data', (chunk) => hash.update(chunk)); + stream.on('end', () => resolve(hash.digest('hex'))); + }); +} + +export async function checksumString(input) { + const { h64ToString } = await loadXxhash(); + return { algo: 'xxh64', value: h64ToString(input) }; +} + +export async function checksumFile(filePath) { + const { create64 } = await loadXxhash(); + return new Promise((resolve, reject) => { + const hasher = create64(); + const stream = fs.createReadStream(filePath); + stream.on('error', reject); + stream.on('data', (chunk) => hasher.update(chunk)); + stream.on('end', () => resolve({ algo: 'xxh64', value: formatXxhashHex(hasher.digest()) })); + }); +} diff --git a/src/shared/hash/xxhash-backend.js b/src/shared/hash/xxhash-backend.js new file mode 100644 index 000000000..39f4e44ca --- /dev/null +++ b/src/shared/hash/xxhash-backend.js @@ -0,0 +1,106 @@ +import fs from 'node:fs'; +import xxhashWasm from 'xxhash-wasm'; +import { tryRequire } from '../optional-deps.js'; + +const XXHASH_HEX_WIDTH = 16; +let wasmStatePromise = null; +let wasmBackendPromise = null; + +const loadWasmState = async () => { + if (!wasmStatePromise) { + wasmStatePromise = xxhashWasm(); + } + return wasmStatePromise; +}; + +export const formatXxhashHex = (value) => { + if (typeof value === 'bigint') { + return value.toString(16).padStart(XXHASH_HEX_WIDTH, '0'); + } + if (typeof value === 'number') { + return Math.floor(value).toString(16).padStart(XXHASH_HEX_WIDTH, '0'); + } + if (typeof value === 'string') { + const trimmed = value.startsWith('0x') ? 
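/*
  Editor's note: illustrative sketch for the hashing helpers in
  src/shared/hash.js; it is not part of the patch. File paths are placeholders.

    import { sha1, sha1File, checksumFile } from './hash.js';

    sha1('hello'); // synchronous hex SHA-1 of a string

    // Streaming SHA-1 for content identity vs. xxh64 for cheap change detection.
    const fileSha = await sha1File('./package.json');
    const quick = await checksumFile('./package.json');
    // quick => { algo: 'xxh64', value: '<16 hex chars>' }
*/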
value.slice(2) : value; + return trimmed.padStart(XXHASH_HEX_WIDTH, '0'); + } + return ''; +}; + +const createWasmBackend = async () => { + if (wasmBackendPromise) return wasmBackendPromise; + wasmBackendPromise = (async () => { + const { h64ToString, create64 } = await loadWasmState(); + return { + name: 'wasm', + hash64: async (input) => formatXxhashHex(h64ToString(input)), + hash64Stream: async (stream) => new Promise((resolve, reject) => { + const hasher = create64(); + stream.on('error', reject); + stream.on('data', (chunk) => hasher.update(chunk)); + stream.on('end', () => resolve(formatXxhashHex(hasher.digest()))); + }) + }; + })(); + return wasmBackendPromise; +}; + +const resolveNativeFns = (mod) => { + const hash64 = mod?.xxh64 || mod?.xxhash64 || mod?.hash64 || mod?.xxh64Raw; + const create64 = mod?.createXXHash64 || mod?.createXxh64 || mod?.createHash64 || mod?.create64; + return { hash64, create64 }; +}; + +const createNativeBackend = async (options = {}) => { + const result = tryRequire('@node-rs/xxhash', options); + if (!result.ok || !result.mod) return null; + const { hash64, create64 } = resolveNativeFns(result.mod); + if (typeof hash64 !== 'function') return null; + const base = { + name: 'native', + hash64: async (input) => formatXxhashHex(hash64(input)) + }; + if (typeof create64 === 'function') { + return { + ...base, + hash64Stream: async (stream) => new Promise((resolve, reject) => { + const hasher = create64(); + stream.on('error', reject); + stream.on('data', (chunk) => hasher.update(chunk)); + stream.on('end', () => resolve(formatXxhashHex(hasher.digest()))); + }) + }; + } + const wasmBackend = await createWasmBackend(); + return { + ...base, + hash64Stream: wasmBackend.hash64Stream + }; +}; + +const maybeLogFallback = (message, options = {}) => { + if (!options?.verbose && options?.verbose !== true) return; + const logger = typeof options.logger === 'function' ? options.logger : console.warn; + logger(`[hash] ${message}`); +}; + +export const resolveXxhashBackend = async ({ backend = 'auto', logger, verbose } = {}) => { + const normalized = typeof backend === 'string' ? backend.trim().toLowerCase() : 'auto'; + const options = { logger, verbose }; + if (normalized === 'native') { + const nativeBackend = await createNativeBackend(options); + if (nativeBackend) return nativeBackend; + maybeLogFallback('Native xxhash unavailable; falling back to wasm.', options); + return createWasmBackend(); + } + if (normalized === 'wasm') { + return createWasmBackend(); + } + const nativeBackend = await createNativeBackend(options); + if (nativeBackend) return nativeBackend; + return createWasmBackend(); +}; + +export const hash64Stream = (stream, backend) => backend.hash64Stream(stream); + +export const hashFileStream = (filePath) => fs.createReadStream(filePath); diff --git a/src/shared/hnsw.js b/src/shared/hnsw.js new file mode 100644 index 000000000..dbe01e5b1 --- /dev/null +++ b/src/shared/hnsw.js @@ -0,0 +1,220 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); + +const parseNodeMajor = () => { + const raw = process.versions?.node || ''; + const major = Number(String(raw).split('.')[0]); + return Number.isFinite(major) ? 
major : null; +}; + +const supportsHnswRuntime = () => { + const major = parseNodeMajor(); + if (!Number.isFinite(major)) return true; + return major < 24; +}; + +let warnedRuntimeUnsupported = false; +let warnedLoadFailure = false; +let warnedFallbackUsed = false; +const warnRuntimeUnsupported = () => { + if (warnedRuntimeUnsupported) return; + warnedRuntimeUnsupported = true; + console.warn(`[ann] HNSW disabled on Node ${process.versions.node}; use Node 20/22 or disable embeddings.hnsw.`); +}; + +const warnLoadFailure = (message) => { + if (warnedLoadFailure) return; + warnedLoadFailure = true; + console.warn(`[ann] HNSW index load failed; falling back to JS ANN. ${message || ''}`.trim()); +}; + +const warnFallbackUsed = (message) => { + if (warnedFallbackUsed) return; + warnedFallbackUsed = true; + console.warn(`[ann] HNSW primary index unreadable; using backup. ${message || ''}`.trim()); +}; + +const resolveHnswLib = () => { + if (!supportsHnswRuntime()) { + warnRuntimeUnsupported(); + return null; + } + try { + return require('hnswlib-node'); + } catch { + return null; + } +}; + +const getBakPath = (filePath) => `${filePath}.bak`; + +const resolveIndexPath = (indexPath) => { + if (!indexPath) return null; + if (fs.existsSync(indexPath)) { + return { path: indexPath, cleanup: true }; + } + const bakPath = getBakPath(indexPath); + if (fs.existsSync(bakPath)) { + return { path: bakPath, cleanup: false }; + } + return null; +}; + +const cleanupBak = (indexPath) => { + const bakPath = getBakPath(indexPath); + if (!fs.existsSync(bakPath)) return; + try { + fs.rmSync(bakPath, { force: true }); + } catch {} +}; + +const SPACES = new Set(['cosine', 'l2', 'ip']); + +const normalizeInt = (value, fallback) => { + const parsed = Number(value); + if (!Number.isFinite(parsed) || parsed <= 0) return fallback; + return Math.floor(parsed); +}; + +const normalizeSpace = (value) => { + if (typeof value !== 'string') return 'cosine'; + const trimmed = value.trim().toLowerCase(); + return SPACES.has(trimmed) ? trimmed : 'cosine'; +}; + +export function normalizeHnswConfig(raw = {}) { + if (raw === false) return { enabled: false }; + const config = raw && typeof raw === 'object' ? raw : {}; + return { + enabled: config.enabled !== false, + space: normalizeSpace(config.space), + m: normalizeInt(config.m, 16), + efConstruction: normalizeInt(config.efConstruction, 200), + efSearch: normalizeInt(config.efSearch, 64), + randomSeed: normalizeInt(config.randomSeed, 100), + allowReplaceDeleted: config.allowReplaceDeleted === true + }; +} + +export function resolveHnswPaths(indexDir) { + return { + indexPath: path.join(indexDir, 'dense_vectors_hnsw.bin'), + metaPath: path.join(indexDir, 'dense_vectors_hnsw.meta.json') + }; +} + +export function validateHnswMetaCompatibility({ denseVectors, hnswMeta } = {}) { + const warnings = []; + if (!denseVectors || !hnswMeta) { + return { ok: true, warnings }; + } + const vecDims = Number(denseVectors.dims); + const metaDims = Number(hnswMeta.dims); + if (Number.isFinite(vecDims) && Number.isFinite(metaDims) && vecDims !== metaDims) { + warnings.push(`dims mismatch (vectors=${vecDims}, meta=${metaDims})`); + } + const vecModel = typeof denseVectors.model === 'string' ? denseVectors.model : null; + const metaModel = typeof hnswMeta.model === 'string' ? hnswMeta.model : null; + if (vecModel && metaModel && vecModel !== metaModel) { + warnings.push(`model mismatch (vectors=${vecModel}, meta=${metaModel})`); + } + const vecCount = Array.isArray(denseVectors.vectors) ? 
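/*
  Editor's note: illustrative sketch for normalizeHnswConfig and
  resolveHnswPaths in src/shared/hnsw.js; it is not part of the patch. The
  index directory is a placeholder.

    import { normalizeHnswConfig, resolveHnswPaths } from './hnsw.js';

    // Unknown or malformed fields fall back to defaults; `false` disables HNSW.
    normalizeHnswConfig({ space: 'L2', m: '32' });
    // => { enabled: true, space: 'l2', m: 32, efConstruction: 200,
    //      efSearch: 64, randomSeed: 100, allowReplaceDeleted: false }

    resolveHnswPaths('index-code');
    // => { indexPath: 'index-code/dense_vectors_hnsw.bin',
    //      metaPath: 'index-code/dense_vectors_hnsw.meta.json' }
*/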
denseVectors.vectors.length : null; + const metaCount = Number(hnswMeta.count); + if (Number.isFinite(metaCount) && metaCount >= 0 && Number.isFinite(vecCount) && vecCount !== metaCount) { + warnings.push(`count mismatch (vectors=${vecCount}, meta=${metaCount})`); + } + const metaSpace = typeof hnswMeta.space === 'string' ? hnswMeta.space.trim().toLowerCase() : null; + if (metaSpace && !SPACES.has(metaSpace)) { + warnings.push(`space invalid (meta=${metaSpace})`); + } + return { ok: warnings.length === 0, warnings }; +} + +export function loadHnswIndex({ indexPath, dims, config, lib } = {}) { + const resolved = resolveIndexPath(indexPath); + if (!resolved) return null; + if (!Number.isFinite(dims) || dims <= 0) return null; + const normalized = normalizeHnswConfig(config); + if (!normalized.enabled) return null; + const resolvedLib = lib || resolveHnswLib(); + const HNSW = resolvedLib?.HierarchicalNSW || resolvedLib?.default?.HierarchicalNSW || resolvedLib?.default; + if (!HNSW) return null; + const buildIndex = () => new HNSW(normalized.space, dims); + const applyEfSearch = (index) => { + if (!normalized.efSearch) return; + try { + index.setEf(normalized.efSearch); + } catch {} + }; + const tryLoad = (candidatePath) => { + const index = buildIndex(); + index.readIndexSync(candidatePath, normalized.allowReplaceDeleted); + applyEfSearch(index); + return index; + }; + + try { + const index = tryLoad(resolved.path); + if (resolved.cleanup) cleanupBak(indexPath); + return index; + } catch (err) { + // If the primary file exists but is unreadable/corrupt, fall back to the + // backup if available. This avoids hard failures when a prior atomic + // replace left a valid .bak behind. + const primaryPath = indexPath; + const bakPath = getBakPath(indexPath); + const altPath = resolved.path === primaryPath ? bakPath : primaryPath; + if (altPath && altPath !== resolved.path && fs.existsSync(altPath)) { + try { + const index = tryLoad(altPath); + warnFallbackUsed(path.basename(altPath)); + return index; + } catch (altErr) { + warnLoadFailure(altErr?.message ? `(${altErr.message})` : ''); + return null; + } + } + warnLoadFailure(err?.message ? `(${err.message})` : ''); + return null; + } +} + +export function rankHnswIndex({ index, space }, queryEmbedding, topN, candidateSet) { + const embedding = Array.isArray(queryEmbedding) + ? queryEmbedding + : (ArrayBuffer.isView(queryEmbedding) ? Array.from(queryEmbedding) : null); + if (!index || !embedding || !embedding.length) return []; + // If a candidate set is provided but empty, the correct answer is an empty + // hit list (consistent with other rankers) rather than an unfiltered search. + if (candidateSet && typeof candidateSet.size === 'number' && candidateSet.size === 0) return []; + const requested = Math.max(1, Number(topN) || 1); + const maxElements = typeof index.getCurrentCount === 'function' + ? index.getCurrentCount() + : (typeof index.getMaxElements === 'function' + ? index.getMaxElements() + : index.maxElements); + const cap = Number.isFinite(maxElements) && maxElements > 0 + ? Math.min(requested, Math.floor(maxElements)) + : requested; + const limit = candidateSet && typeof candidateSet.size === 'number' + ? Math.max(1, Math.min(cap, candidateSet.size)) + : cap; + const filter = candidateSet && typeof candidateSet.size === 'number' + ? 
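/*
  Editor's note: illustrative end-to-end sketch for loadHnswIndex and
  rankHnswIndex in src/shared/hnsw.js; it is not part of the patch.
  `queryVector` and the index directory are placeholders.

    import { loadHnswIndex, rankHnswIndex, resolveHnswPaths } from './hnsw.js';

    const { indexPath } = resolveHnswPaths('index-code');
    const index = loadHnswIndex({ indexPath, dims: 384, config: { efSearch: 128 } });
    if (index) {
      // Restrict the search to pre-filtered labels; an empty set yields no hits.
      const hits = rankHnswIndex({ index, space: 'cosine' }, queryVector, 10, new Set([1, 5, 9]));
      // hits: [{ idx, sim }, ...] sorted by similarity, ties broken by idx
    }
*/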
(label) => candidateSet.has(label) + : undefined; + const result = index.searchKnn(embedding, limit, filter); + const distances = result?.distances || []; + const neighbors = result?.neighbors || []; + const hits = []; + for (let i = 0; i < neighbors.length; i += 1) { + const idx = neighbors[i]; + if (idx == null) continue; + const distance = distances[i]; + const sim = space === 'l2' ? -distance : 1 - distance; + hits.push({ idx, sim }); + } + return hits.sort((a, b) => (b.sim - a.sim) || (a.idx - b.idx)); +} diff --git a/src/shared/json-stream.js b/src/shared/json-stream.js new file mode 100644 index 000000000..78ea0cca7 --- /dev/null +++ b/src/shared/json-stream.js @@ -0,0 +1,274 @@ +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import { once } from 'node:events'; +import { Transform } from 'node:stream'; +import { Gzip } from 'fflate'; + +const writeChunk = async (stream, chunk) => { + if (!stream.write(chunk)) { + await once(stream, 'drain'); + } +}; + +const waitForFinish = (stream) => new Promise((resolve, reject) => { + stream.on('error', reject); + stream.on('finish', resolve); +}); + +const createTempPath = (filePath) => ( + `${filePath}.tmp-${process.pid}-${Date.now()}-${Math.random().toString(16).slice(2, 8)}` +); + +const createFflateGzipStream = (options = {}) => { + const level = Number.isFinite(Number(options.level)) ? Math.floor(Number(options.level)) : 6; + const gzip = new Gzip({ level }); + const stream = new Transform({ + transform(chunk, encoding, callback) { + try { + const buffer = typeof chunk === 'string' ? Buffer.from(chunk, encoding) : Buffer.from(chunk); + gzip.push(buffer, false); + callback(); + } catch (err) { + callback(err); + } + }, + flush(callback) { + try { + gzip.push(new Uint8Array(0), true); + callback(); + } catch (err) { + callback(err); + } + } + }); + gzip.ondata = (chunk) => { + if (chunk && chunk.length) { + stream.push(Buffer.from(chunk)); + } + }; + return stream; +}; + +const getBakPath = (filePath) => `${filePath}.bak`; + +const replaceFile = async (tempPath, finalPath) => { + const bakPath = getBakPath(finalPath); + const finalExists = fs.existsSync(finalPath); + let backupAvailable = fs.existsSync(bakPath); + const copyFallback = async () => { + try { + await fsPromises.copyFile(tempPath, finalPath); + await fsPromises.rm(tempPath, { force: true }); + return true; + } catch { + return false; + } + }; + if (finalExists && !backupAvailable) { + try { + await fsPromises.rename(finalPath, bakPath); + backupAvailable = true; + } catch (err) { + if (err?.code !== 'ENOENT') { + backupAvailable = fs.existsSync(bakPath); + } + } + } + try { + await fsPromises.rename(tempPath, finalPath); + } catch (err) { + if (err?.code !== 'EEXIST' + && err?.code !== 'EPERM' + && err?.code !== 'ENOTEMPTY' + && err?.code !== 'EACCES' + && err?.code !== 'EXDEV') { + throw err; + } + if (!backupAvailable) { + if (await copyFallback()) return; + throw err; + } + try { + await fsPromises.rm(finalPath, { force: true }); + } catch {} + try { + await fsPromises.rename(tempPath, finalPath); + } catch (renameErr) { + if (await copyFallback()) return; + throw renameErr; + } + } +}; + +const createJsonWriteStream = (filePath, options = {}) => { + const { compression = null, atomic = false } = options; + const targetPath = atomic ? 
createTempPath(filePath) : filePath; + const fileStream = fs.createWriteStream(targetPath); + if (compression === 'gzip') { + const gzip = createFflateGzipStream(); + gzip.pipe(fileStream); + return { + stream: gzip, + done: Promise.all([waitForFinish(gzip), waitForFinish(fileStream)]) + .then(async () => { + if (atomic) { + await replaceFile(targetPath, filePath); + } + }) + .catch(async (err) => { + if (atomic) { + try { await fsPromises.rm(targetPath, { force: true }); } catch {} + } + throw err; + }) + }; + } + return { + stream: fileStream, + done: waitForFinish(fileStream) + .then(async () => { + if (atomic) { + await replaceFile(targetPath, filePath); + } + }) + .catch(async (err) => { + if (atomic) { + try { await fsPromises.rm(targetPath, { force: true }); } catch {} + } + throw err; + }) + }; +}; + +const normalizeJsonValue = (value) => { + if (value && typeof value === 'object' && typeof value.toJSON === 'function') { + try { + return value.toJSON(); + } catch { + return value; + } + } + return value; +}; + +const writeJsonValue = async (stream, value) => { + const normalized = normalizeJsonValue(value); + if (normalized === null || typeof normalized !== 'object') { + if (normalized === undefined || typeof normalized === 'function' || typeof normalized === 'symbol') { + await writeChunk(stream, 'null'); + return; + } + await writeChunk(stream, JSON.stringify(normalized)); + return; + } + if (Array.isArray(normalized)) { + await writeChunk(stream, '['); + let first = true; + for (const item of normalized) { + if (!first) await writeChunk(stream, ','); + const itemValue = normalizeJsonValue(item); + if (itemValue === undefined || typeof itemValue === 'function' || typeof itemValue === 'symbol') { + await writeChunk(stream, 'null'); + } else { + await writeJsonValue(stream, itemValue); + } + first = false; + } + await writeChunk(stream, ']'); + return; + } + await writeChunk(stream, '{'); + let first = true; + for (const [key, entry] of Object.entries(normalized)) { + const entryValue = normalizeJsonValue(entry); + if (entryValue === undefined || typeof entryValue === 'function' || typeof entryValue === 'symbol') { + continue; + } + if (!first) await writeChunk(stream, ','); + await writeChunk(stream, `${JSON.stringify(key)}:`); + await writeJsonValue(stream, entryValue); + first = false; + } + await writeChunk(stream, '}'); +}; + +const writeArrayItems = async (stream, items) => { + let first = true; + for (const item of items) { + if (!first) await writeChunk(stream, ','); + await writeJsonValue(stream, item); + first = false; + } +}; + +/** + * Stream JSON lines to disk (one JSON object per line). + * @param {string} filePath + * @param {Iterable} items + * @param {{trailingNewline?:boolean,compression?:string|null}} [options] + * @returns {Promise} + */ +export async function writeJsonLinesFile(filePath, items, options = {}) { + const { compression = null, atomic = false } = options; + const { stream, done } = createJsonWriteStream(filePath, { compression, atomic }); + for (const item of items) { + await writeJsonValue(stream, item); + await writeChunk(stream, '\n'); + } + stream.end(); + await done; +} + +/** + * Stream a JSON array to disk without holding the full string in memory. 
+ * @param {string} filePath + * @param {Iterable} items + * @param {{trailingNewline?:boolean}} [options] + * @returns {Promise} + */ +export async function writeJsonArrayFile(filePath, items, options = {}) { + const { trailingNewline = true, compression = null, atomic = false } = options; + const { stream, done } = createJsonWriteStream(filePath, { compression, atomic }); + await writeChunk(stream, '['); + await writeArrayItems(stream, items); + await writeChunk(stream, ']'); + if (trailingNewline) await writeChunk(stream, '\n'); + stream.end(); + await done; +} + +/** + * Stream a JSON object with one or more array fields to disk. + * @param {string} filePath + * @param {{fields?:object,arrays?:object,trailingNewline?:boolean}} input + * @returns {Promise} + */ +export async function writeJsonObjectFile(filePath, input = {}) { + const { + fields = {}, + arrays = {}, + trailingNewline = true, + compression = null, + atomic = false + } = input; + const { stream, done } = createJsonWriteStream(filePath, { compression, atomic }); + await writeChunk(stream, '{'); + let first = true; + for (const [key, value] of Object.entries(fields)) { + if (!first) await writeChunk(stream, ','); + await writeChunk(stream, `${JSON.stringify(key)}:`); + await writeJsonValue(stream, value); + first = false; + } + for (const [key, items] of Object.entries(arrays)) { + const header = `${JSON.stringify(key)}:[`; + await writeChunk(stream, `${first ? '' : ','}${header}`); + first = false; + await writeArrayItems(stream, items); + await writeChunk(stream, ']'); + } + await writeChunk(stream, '}'); + if (trailingNewline) await writeChunk(stream, '\n'); + stream.end(); + await done; +} diff --git a/src/shared/jsonc.js b/src/shared/jsonc.js new file mode 100644 index 000000000..04b492c45 --- /dev/null +++ b/src/shared/jsonc.js @@ -0,0 +1,26 @@ +import fs from 'node:fs'; +import { parse as parseJsonc, printParseErrorCode } from 'jsonc-parser'; + +const describeSource = (source) => (source ? ` ${source}` : ''); + +export function parseJsoncText(rawText, source = '') { + const text = typeof rawText === 'string' ? rawText : String(rawText ?? ''); + if (!text.trim()) { + throw new Error(`Failed to parse${describeSource(source)}: empty file.`); + } + const errors = []; + const parsed = parseJsonc(text, errors, { allowTrailingComma: false }); + if (errors.length) { + const first = errors[0]; + const code = typeof printParseErrorCode === 'function' + ? 
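/*
  Editor's note: illustrative sketch for the streaming JSON writers in
  src/shared/json-stream.js; it is not part of the patch. File names and the
  iterables are placeholders.

    import { writeJsonArrayFile, writeJsonObjectFile } from './json-stream.js';

    // Stream a large array to disk, gzipped, using an atomic temp-file swap.
    await writeJsonArrayFile('chunks.json.gz', chunkIterable,
      { compression: 'gzip', atomic: true });

    // Combine scalar fields with streamed array fields in a single object file.
    await writeJsonObjectFile('postings.json', {
      fields: { version: 2, mode: 'code' },
      arrays: { docs: docIterable },
      atomic: true
    });
*/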
printParseErrorCode(first.error) + : String(first.error); + throw new Error(`Failed to parse${describeSource(source)}: ${code}`); + } + return parsed; +} + +export function readJsoncFile(filePath) { + const raw = fs.readFileSync(filePath, 'utf8'); + return parseJsoncText(raw, filePath); +} diff --git a/src/shared/jsonrpc.js b/src/shared/jsonrpc.js new file mode 100644 index 000000000..3786b1889 --- /dev/null +++ b/src/shared/jsonrpc.js @@ -0,0 +1,102 @@ +import { PassThrough } from 'node:stream'; +import { StreamMessageReader, StreamMessageWriter } from 'vscode-jsonrpc'; + +const writerCache = new WeakMap(); + +const getWriterState = (outputStream) => { + let state = writerCache.get(outputStream); + if (state) return state; + const writer = new StreamMessageWriter(outputStream); + state = { writer, closed: false, queue: Promise.resolve() }; + const markClosed = () => { + state.closed = true; + }; + if (typeof outputStream.once === 'function') { + outputStream.once('close', markClosed); + outputStream.once('finish', markClosed); + outputStream.once('error', markClosed); + } + writerCache.set(outputStream, state); + return state; +}; + +/** + * Get a JSON-RPC writer bound to a specific stream with serialized writes. + * @param {import('node:stream').Writable} outputStream + * @returns {{write:(payload:object)=>Promise,close:()=>void}} + */ +export function getJsonRpcWriter(outputStream) { + if (!outputStream || typeof outputStream.write !== 'function') { + throw new Error('getJsonRpcWriter requires a writable stream.'); + } + const state = getWriterState(outputStream); + const write = (payload) => { + const run = async () => { + if (state.closed || outputStream.destroyed || outputStream.writableEnded) { + throw new Error('JSON-RPC stream closed.'); + } + return state.writer.write(payload); + }; + state.queue = state.queue.then(run, run); + return state.queue.catch((err) => { + if (err?.code === 'ERR_STREAM_DESTROYED') { + state.closed = true; + } + throw err; + }); + }; + const close = () => { + state.closed = true; + state.writer.dispose?.(); + writerCache.delete(outputStream); + }; + return { write, close }; +} + +/** + * Close and dispose a cached JSON-RPC writer for a stream. + * @param {import('node:stream').Writable} outputStream + */ +export function closeJsonRpcWriter(outputStream) { + const state = writerCache.get(outputStream); + if (!state) return; + state.closed = true; + state.writer.dispose?.(); + writerCache.delete(outputStream); +} + +/** + * Write a JSON-RPC message with Content-Length framing. + * @param {import('node:stream').Writable} outputStream + * @param {object} payload + * @returns {Promise|void} + */ +export function writeFramedJsonRpc(outputStream, payload) { + return getJsonRpcWriter(outputStream).write(payload); +} + +/** + * Create a framed JSON-RPC parser for Content-Length-delimited payloads. + * @param {{onMessage?:(msg:object)=>void,onError?:(err:Error)=>void,maxBufferBytes?:number}} input + * @returns {{push:(chunk:Buffer|string)=>void,dispose:()=>void}} + */ +export function createFramedJsonRpcParser({ onMessage, onError } = {}) { + const stream = new PassThrough(); + const reader = new StreamMessageReader(stream); + const handleMessage = typeof onMessage === 'function' ? onMessage : () => {}; + const handleError = typeof onError === 'function' ? onError : () => {}; + + reader.onError(handleError); + reader.listen(handleMessage); + + return { + push(chunk) { + if (!chunk || chunk.length === 0) return; + stream.write(Buffer.isBuffer(chunk) ? 
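/*
  Editor's note: illustrative sketch for the JSON-RPC helpers in
  src/shared/jsonrpc.js; it is not part of the patch.

    import { getJsonRpcWriter, createFramedJsonRpcParser } from './jsonrpc.js';

    // Writes on a given stream are serialized and framed with Content-Length.
    const writer = getJsonRpcWriter(process.stdout);
    await writer.write({ jsonrpc: '2.0', id: 1, method: 'ping' });

    // Feed raw chunks to the parser; whole messages arrive via onMessage.
    const parser = createFramedJsonRpcParser({
      onMessage: (msg) => console.error('received', msg.method),
      onError: (err) => console.error(err)
    });
    process.stdin.on('data', (chunk) => parser.push(chunk));
*/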
chunk : Buffer.from(chunk)); + }, + dispose() { + reader.dispose(); + stream.end(); + } + }; +} diff --git a/src/shared/lancedb.js b/src/shared/lancedb.js new file mode 100644 index 000000000..63974e967 --- /dev/null +++ b/src/shared/lancedb.js @@ -0,0 +1,65 @@ +import path from 'node:path'; + +const METRICS = new Set(['cosine', 'l2', 'dot']); + +const normalizeText = (value, fallback) => { + if (typeof value !== 'string') return fallback; + const trimmed = value.trim(); + return trimmed ? trimmed : fallback; +}; + +const normalizeInt = (value, fallback) => { + const parsed = Number(value); + if (!Number.isFinite(parsed) || parsed <= 0) return fallback; + return Math.floor(parsed); +}; + +const normalizeMetric = (value) => { + if (typeof value !== 'string') return 'cosine'; + const trimmed = value.trim().toLowerCase(); + if (trimmed === 'ip') return 'dot'; + return METRICS.has(trimmed) ? trimmed : 'cosine'; +}; + +export function normalizeLanceDbConfig(raw = {}) { + if (raw === false) return { enabled: false }; + const config = raw && typeof raw === 'object' ? raw : {}; + return { + enabled: config.enabled !== false, + table: normalizeText(config.table, 'vectors'), + embeddingColumn: normalizeText(config.embeddingColumn, 'vector'), + idColumn: normalizeText(config.idColumn, 'id'), + metric: normalizeMetric(config.metric), + batchSize: normalizeInt(config.batchSize, 1024) + }; +} + +export function resolveLanceDbPaths(indexDir) { + return { + merged: { + dir: path.join(indexDir, 'dense_vectors.lancedb'), + metaPath: path.join(indexDir, 'dense_vectors.lancedb.meta.json') + }, + doc: { + dir: path.join(indexDir, 'dense_vectors_doc.lancedb'), + metaPath: path.join(indexDir, 'dense_vectors_doc.lancedb.meta.json') + }, + code: { + dir: path.join(indexDir, 'dense_vectors_code.lancedb'), + metaPath: path.join(indexDir, 'dense_vectors_code.lancedb.meta.json') + } + }; +} + +export function resolveLanceDbTarget(mode, denseVectorMode) { + const resolved = typeof denseVectorMode === 'string' + ? denseVectorMode.trim().toLowerCase() + : ''; + if (resolved === 'code') return 'code'; + if (resolved === 'doc') return 'doc'; + if (resolved === 'auto') { + if (mode === 'code') return 'code'; + if (mode === 'prose' || mode === 'extracted-prose') return 'doc'; + } + return 'merged'; +} diff --git a/src/shared/metrics.js b/src/shared/metrics.js new file mode 100644 index 000000000..235752663 --- /dev/null +++ b/src/shared/metrics.js @@ -0,0 +1,271 @@ +import { Counter, Gauge, Histogram, Registry } from 'prom-client'; + +const registry = new Registry(); +let initialized = false; +let metrics = null; + +const normalizeString = (value) => ( + typeof value === 'string' ? value.trim().toLowerCase() : '' +); + +const normalizeLabel = (value, allowed, fallback = 'unknown') => { + const normalized = normalizeString(value); + if (!normalized) return fallback; + return allowed.has(normalized) ? 
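/*
  Editor's note: illustrative sketch for the LanceDB config helpers in
  src/shared/lancedb.js; it is not part of the patch.

    import { normalizeLanceDbConfig, resolveLanceDbTarget } from './lancedb.js';

    normalizeLanceDbConfig({ metric: 'IP', batchSize: '256' });
    // => { enabled: true, table: 'vectors', embeddingColumn: 'vector',
    //      idColumn: 'id', metric: 'dot', batchSize: 256 }

    // 'auto' maps prose modes to the doc table and code mode to the code table.
    resolveLanceDbTarget('prose', 'auto');   // => 'doc'
    resolveLanceDbTarget('code', 'merged');  // => 'merged'
*/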
normalized : fallback; +}; + +const STAGES = new Set(['stage1', 'stage2', 'stage3', 'stage4', 'unknown']); +const MODES = new Set(['code', 'prose', 'all', 'records', 'extracted-prose', 'unknown']); +const BACKENDS = new Set(['memory', 'sqlite', 'sqlite-fts', 'unknown']); +const STATUSES = new Set(['ok', 'error', 'unknown']); +const ANN = new Set(['on', 'off', 'unknown']); +const POOLS = new Set(['tokenize', 'quantize', 'watch', 'unknown']); +const TASKS = new Set(['tokenize', 'quantize', 'unknown']); +const WATCH_EVENTS = new Set(['add', 'change', 'unlink', 'error', 'unknown']); +const DEBOUNCE = new Set(['scheduled', 'fired', 'canceled', 'unknown']); +const CACHES = new Set(['query', 'embedding', 'output', 'unknown']); +const CACHE_RESULTS = new Set(['hit', 'miss', 'unknown']); +const SURFACES = new Set(['cli', 'api', 'mcp', 'search', 'index', 'unknown']); +const FALLBACKS = new Set(['backend', 'vector-candidates', 'unknown']); +const TIMEOUTS = new Set(['tool', 'search', 'index', 'unknown']); + +const normalizeStage = (value) => normalizeLabel(value, STAGES); +const normalizeMode = (value) => normalizeLabel(value, MODES); +const normalizeBackend = (value) => normalizeLabel(value, BACKENDS); +const normalizeStatus = (value) => normalizeLabel(value, STATUSES); +const normalizePool = (value) => normalizeLabel(value, POOLS); +const normalizeTask = (value) => normalizeLabel(value, TASKS); +const normalizeWatchEvent = (value) => normalizeLabel(value, WATCH_EVENTS); +const normalizeDebounce = (value) => normalizeLabel(value, DEBOUNCE); +const normalizeCache = (value) => normalizeLabel(value, CACHES); +const normalizeCacheResult = (value) => normalizeLabel(value, CACHE_RESULTS); +const normalizeSurface = (value) => normalizeLabel(value, SURFACES); +const normalizeFallback = (value) => normalizeLabel(value, FALLBACKS); +const normalizeTimeout = (value) => normalizeLabel(value, TIMEOUTS); +const normalizeAnn = (value) => { + if (value === true || value === 'on') return 'on'; + if (value === false || value === 'off') return 'off'; + return normalizeLabel(value, ANN); +}; + +const ensureMetrics = () => { + if (initialized) return; + metrics = { + indexDuration: new Histogram({ + name: 'pairofcleats_index_duration_seconds', + help: 'Index build duration in seconds.', + labelNames: ['stage', 'mode', 'status'], + buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600, 1200, 3600], + registers: [registry] + }), + searchDuration: new Histogram({ + name: 'pairofcleats_search_duration_seconds', + help: 'Search duration in seconds.', + labelNames: ['mode', 'backend', 'ann', 'status'], + buckets: [0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 60], + registers: [registry] + }), + indexRuns: new Counter({ + name: 'pairofcleats_index_runs_total', + help: 'Count of index runs.', + labelNames: ['stage', 'mode', 'status'], + registers: [registry] + }), + searchRuns: new Counter({ + name: 'pairofcleats_search_runs_total', + help: 'Count of search runs.', + labelNames: ['mode', 'backend', 'ann', 'status'], + registers: [registry] + }), + workerQueueDepth: new Gauge({ + name: 'pairofcleats_worker_queue_depth', + help: 'Worker pool queue depth.', + labelNames: ['pool'], + registers: [registry] + }), + workerActiveTasks: new Gauge({ + name: 'pairofcleats_worker_active_tasks', + help: 'Active worker pool tasks.', + labelNames: ['pool'], + registers: [registry] + }), + workerTaskDuration: new Histogram({ + name: 'pairofcleats_worker_task_duration_seconds', + help: 'Worker task duration in seconds.', + 
labelNames: ['pool', 'task', 'worker', 'status'], + buckets: [0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10], + registers: [registry] + }), + workerRetries: new Counter({ + name: 'pairofcleats_worker_retries_total', + help: 'Worker pool restart attempts.', + labelNames: ['pool'], + registers: [registry] + }), + watchBacklog: new Gauge({ + name: 'pairofcleats_watch_backlog', + help: 'Pending watch backlog size.', + labelNames: ['pool'], + registers: [registry] + }), + watchEvents: new Counter({ + name: 'pairofcleats_watch_events_total', + help: 'Total watch events observed.', + labelNames: ['event'], + registers: [registry] + }), + watchDebounce: new Counter({ + name: 'pairofcleats_watch_debounce_total', + help: 'Watch debounce schedule events.', + labelNames: ['type'], + registers: [registry] + }), + watchBuildDuration: new Histogram({ + name: 'pairofcleats_watch_build_duration_seconds', + help: 'Watch-triggered build duration in seconds.', + labelNames: ['status'], + buckets: [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 30, 60, 120, 300], + registers: [registry] + }), + watchBursts: new Counter({ + name: 'pairofcleats_watch_bursts_total', + help: 'Detected watch event bursts.', + labelNames: ['pool'], + registers: [registry] + }), + cacheEvents: new Counter({ + name: 'pairofcleats_cache_events_total', + help: 'Cache hit/miss events.', + labelNames: ['cache', 'result'], + registers: [registry] + }), + fallbacks: new Counter({ + name: 'pairofcleats_fallbacks_total', + help: 'Fallback events by surface.', + labelNames: ['surface', 'reason'], + registers: [registry] + }), + timeouts: new Counter({ + name: 'pairofcleats_timeouts_total', + help: 'Timeout events by surface.', + labelNames: ['surface', 'operation'], + registers: [registry] + }) + }; + initialized = true; +}; + +const normalizeSeconds = (value) => { + const parsed = Number(value); + if (!Number.isFinite(parsed) || parsed < 0) return 0; + return parsed; +}; + +export function observeIndexDuration({ stage, mode, status, seconds }) { + ensureMetrics(); + const labels = { + stage: normalizeStage(stage), + mode: normalizeMode(mode), + status: normalizeStatus(status) + }; + const duration = normalizeSeconds(seconds); + metrics.indexDuration.observe(labels, duration); + metrics.indexRuns.inc(labels); +} + +export function observeSearchDuration({ mode, backend, ann, status, seconds }) { + ensureMetrics(); + const labels = { + mode: normalizeMode(mode), + backend: normalizeBackend(backend), + ann: normalizeAnn(ann), + status: normalizeStatus(status) + }; + const duration = normalizeSeconds(seconds); + metrics.searchDuration.observe(labels, duration); + metrics.searchRuns.inc(labels); +} + +export function setWorkerQueueDepth({ pool, value }) { + ensureMetrics(); + metrics.workerQueueDepth.set({ pool: normalizePool(pool) }, Number(value) || 0); +} + +export function setWorkerActiveTasks({ pool, value }) { + ensureMetrics(); + metrics.workerActiveTasks.set({ pool: normalizePool(pool) }, Number(value) || 0); +} + +export function observeWorkerTaskDuration({ pool, task, worker, status, seconds }) { + ensureMetrics(); + metrics.workerTaskDuration.observe({ + pool: normalizePool(pool), + task: normalizeTask(task), + worker: worker ? 
String(worker) : 'unknown', + status: normalizeStatus(status) + }, normalizeSeconds(seconds)); +} + +export function incWorkerRetries({ pool }) { + ensureMetrics(); + metrics.workerRetries.inc({ pool: normalizePool(pool) }); +} + +export function setWatchBacklog(value) { + ensureMetrics(); + metrics.watchBacklog.set({ pool: 'watch' }, Number(value) || 0); +} + +export function incWatchEvent(eventType) { + ensureMetrics(); + metrics.watchEvents.inc({ event: normalizeWatchEvent(eventType) }); +} + +export function incWatchDebounce(type) { + ensureMetrics(); + metrics.watchDebounce.inc({ type: normalizeDebounce(type) }); +} + +export function observeWatchBuildDuration({ status, seconds }) { + ensureMetrics(); + metrics.watchBuildDuration.observe({ status: normalizeStatus(status) }, normalizeSeconds(seconds)); +} + +export function incWatchBurst() { + ensureMetrics(); + metrics.watchBursts.inc({ pool: 'watch' }); +} + +export function incCacheEvent({ cache, result }) { + ensureMetrics(); + metrics.cacheEvents.inc({ + cache: normalizeCache(cache), + result: normalizeCacheResult(result) + }); +} + +export function incFallback({ surface, reason }) { + ensureMetrics(); + metrics.fallbacks.inc({ + surface: normalizeSurface(surface), + reason: normalizeFallback(reason) + }); +} + +export function incTimeout({ surface, operation }) { + ensureMetrics(); + metrics.timeouts.inc({ + surface: normalizeSurface(surface), + operation: normalizeTimeout(operation) + }); +} + +export function getMetricsRegistry() { + ensureMetrics(); + return registry; +} + +export async function getMetricsText() { + ensureMetrics(); + return registry.metrics(); +} diff --git a/src/shared/onnx-embeddings.js b/src/shared/onnx-embeddings.js new file mode 100644 index 000000000..6f5715bdf --- /dev/null +++ b/src/shared/onnx-embeddings.js @@ -0,0 +1,334 @@ +import fs from 'node:fs'; +import path from 'node:path'; + +const GRAPH_LEVELS = new Set(['disabled', 'basic', 'extended', 'all']); +const PROVIDER_ALIASES = new Map([ + ['onnx', 'onnx'], + ['onnxruntime', 'onnx'], + ['onnxruntime-node', 'onnx'], + ['xenova', 'xenova'], + ['transformers', 'xenova'] +]); + +const normalizeProvider = (value) => { + // Default to Xenova/Transformers when unset; reject unknown values. + if (typeof value !== 'string') return 'xenova'; + const trimmed = value.trim().toLowerCase(); + if (!trimmed) return 'xenova'; + const resolved = PROVIDER_ALIASES.get(trimmed); + if (!resolved) { + const supported = Array.from(new Set(PROVIDER_ALIASES.values())).sort(); + throw new Error( + `[embeddings] Unknown embedding provider: ${JSON.stringify(value)}. ` + + `Expected one of: ${supported.join(', ')}.` + ); + } + return resolved; +}; + +const normalizeProviders = (value) => { + if (Array.isArray(value)) { + return value.map((entry) => String(entry).trim()).filter(Boolean); + } + if (typeof value === 'string') { + return value.split(',').map((entry) => entry.trim()).filter(Boolean); + } + return null; +}; + +const normalizeThread = (value) => { + const parsed = Number(value); + if (!Number.isFinite(parsed) || parsed <= 0) return null; + return Math.floor(parsed); +}; + +const normalizeGraphLevel = (value) => { + if (typeof value !== 'string') return null; + const normalized = value.trim().toLowerCase(); + return GRAPH_LEVELS.has(normalized) ? 
normalized : null; +}; + +const LARGE_MODEL_BYTES = Math.floor(1.5 * 1024 * 1024 * 1024); + +const statSize = (filePath) => { + try { + return fs.statSync(filePath).size; + } catch { + return null; + } +}; + +export function normalizeOnnxConfig(raw = {}) { + const config = raw && typeof raw === 'object' ? raw : {}; + const executionProviders = normalizeProviders(config.executionProviders); + return { + modelPath: typeof config.modelPath === 'string' ? config.modelPath.trim() : '', + tokenizerId: typeof config.tokenizerId === 'string' ? config.tokenizerId.trim() : '', + executionProviders: executionProviders && executionProviders.length ? executionProviders : null, + intraOpNumThreads: normalizeThread(config.intraOpNumThreads), + interOpNumThreads: normalizeThread(config.interOpNumThreads), + graphOptimizationLevel: normalizeGraphLevel(config.graphOptimizationLevel) + }; +} + +export function normalizeEmbeddingProvider(raw) { + return normalizeProvider(raw); +} + +export function resolveOnnxModelPath({ rootDir, modelPath, modelsDir, modelId }) { + const root = rootDir ? path.resolve(rootDir) : process.cwd(); + const trimmed = typeof modelPath === 'string' ? modelPath.trim() : ''; + const tryPath = (candidate) => { + if (!candidate || !fs.existsSync(candidate)) return null; + try { + const stat = fs.statSync(candidate); + if (stat.isDirectory()) { + const nested = [ + path.join(candidate, 'model.onnx'), + path.join(candidate, 'model_quantized.onnx'), + path.join(candidate, 'onnx', 'model.onnx'), + path.join(candidate, 'onnx', 'model_quantized.onnx') + ]; + for (const entry of nested) { + if (fs.existsSync(entry)) return entry; + } + return null; + } + } catch { + return null; + } + return candidate; + }; + if (trimmed) { + const resolved = path.isAbsolute(trimmed) ? trimmed : path.join(root, trimmed); + const stat = tryPath(resolved); + if (stat) return stat; + } + const modelRoot = modelId && modelsDir ? path.join(modelsDir, modelId) : null; + const candidates = [ + modelRoot ? path.join(modelRoot, 'onnx', 'model.onnx') : null, + modelRoot ? path.join(modelRoot, 'onnx', 'model_quantized.onnx') : null, + modelRoot ? path.join(modelRoot, 'model.onnx') : null, + modelRoot ? path.join(modelRoot, 'model_quantized.onnx') : null + ]; + for (const candidate of candidates) { + const resolved = tryPath(candidate); + if (resolved) return resolved; + } + return null; +} + +const onnxCache = new Map(); + +const normalizeVec = (vec) => { + if (!Array.isArray(vec) || vec.length === 0) return vec || []; + const norm = Math.sqrt(vec.reduce((sum, v) => sum + v * v, 0)); + if (!Number.isFinite(norm) || norm === 0) return vec; + return vec.map((v) => v / norm); +}; + +const normalizeExecutionProviders = (providers, lowMemory) => { + if (!providers || !lowMemory) return providers; + return providers.map((entry) => { + if (typeof entry === 'string') { + return entry === 'cpu' ? 
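/*
  Editor's note: illustrative sketch for normalizeOnnxConfig and
  resolveOnnxModelPath in src/shared/onnx-embeddings.js; it is not part of the
  patch. Directory and model names are placeholders.

    import { normalizeOnnxConfig, resolveOnnxModelPath } from './onnx-embeddings.js';

    const onnx = normalizeOnnxConfig({ executionProviders: 'cpu,cuda', intraOpNumThreads: '4' });
    // => executionProviders: ['cpu', 'cuda'], intraOpNumThreads: 4, ...

    // Checks the explicit modelPath first (file or directory), then
    // <modelsDir>/<modelId>/ for model.onnx or model_quantized.onnx.
    const modelPath = resolveOnnxModelPath({
      rootDir: process.cwd(),
      modelsDir: './models',
      modelId: 'Xenova/all-MiniLM-L6-v2'
    });
*/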
{ name: 'cpu', useArena: false } : entry; + } + if (entry && entry.name === 'cpu' && entry.useArena === undefined) { + return { ...entry, useArena: false }; + } + return entry; + }); +}; + +const buildSessionOptions = (config, { lowMemory = false } = {}) => { + const options = {}; + const providers = normalizeExecutionProviders(config.executionProviders, lowMemory); + if (providers && providers.length) { + options.executionProviders = providers; + } else if (lowMemory) { + options.executionProviders = [{ name: 'cpu', useArena: false }]; + } + if (config.intraOpNumThreads) options.intraOpNumThreads = config.intraOpNumThreads; + if (config.interOpNumThreads) options.interOpNumThreads = config.interOpNumThreads; + if (config.graphOptimizationLevel) { + options.graphOptimizationLevel = config.graphOptimizationLevel; + } else if (lowMemory) { + options.graphOptimizationLevel = 'basic'; + } + if (lowMemory) { + options.enableCpuMemArena = false; + options.enableMemPattern = false; + options.executionMode = 'sequential'; + } + return Object.keys(options).length ? options : undefined; +}; + +const flatten = (nested) => nested.flatMap((row) => row.map((value) => BigInt(value))); + +const toTensor = (TensorCtor, rows) => { + const batch = rows.length; + const width = rows[0]?.length || 0; + if (!batch || !width) return null; + const data = BigInt64Array.from(flatten(rows)); + return new TensorCtor('int64', data, [batch, width]); +}; + +const buildFeeds = (session, encoded, TensorCtor) => { + const inputNames = Array.isArray(session.inputNames) ? session.inputNames : []; + const feeds = {}; + const inputs = { + input_ids: encoded.input_ids, + attention_mask: encoded.attention_mask, + token_type_ids: encoded.token_type_ids + }; + for (const name of inputNames) { + const values = inputs[name]; + if (!values) continue; + const tensor = toTensor(TensorCtor, values); + if (tensor) feeds[name] = tensor; + } + return feeds; +}; + +const findOutput = (outputs) => { + if (!outputs) return null; + const preferred = [ + 'sentence_embedding', + 'embeddings', + 'pooler_output', + 'last_hidden_state', + 'output_0' + ]; + for (const key of preferred) { + if (outputs[key]) return outputs[key]; + } + const fallbackKey = Object.keys(outputs)[0]; + return fallbackKey ? outputs[fallbackKey] : null; +}; + +const meanPool = (tensor, attentionMask) => { + const dims = tensor?.dims || []; + if (dims.length !== 3) return []; + const [batch, seq, hidden] = dims; + const data = tensor.data || []; + const flatMask = attentionMask ? attentionMask.flat() : new Array(batch * seq).fill(1); + const output = new Array(batch).fill(null); + for (let b = 0; b < batch; b += 1) { + const vec = new Array(hidden).fill(0); + let count = 0; + for (let t = 0; t < seq; t += 1) { + const maskVal = Number(flatMask[b * seq + t] ?? 
0); + if (!maskVal) continue; + count += 1; + const offset = (b * seq + t) * hidden; + for (let h = 0; h < hidden; h += 1) { + vec[h] += data[offset + h]; + } + } + if (count > 0) { + for (let h = 0; h < hidden; h += 1) { + vec[h] = vec[h] / count; + } + } + output[b] = normalizeVec(vec); + } + return output; +}; + +const rowsFromTensor = (tensor) => { + const dims = tensor?.dims || []; + if (dims.length !== 2) return []; + const [rows, cols] = dims; + const data = tensor.data || []; + const out = new Array(rows); + for (let r = 0; r < rows; r += 1) { + const start = r * cols; + out[r] = normalizeVec(Array.from(data.slice(start, start + cols))); + } + return out; +}; + +export function createOnnxEmbedder({ rootDir, modelId, modelsDir, onnxConfig }) { + const normalized = normalizeOnnxConfig(onnxConfig); + const resolvedModelPath = resolveOnnxModelPath({ + rootDir, + modelPath: normalized.modelPath, + modelsDir, + modelId + }); + if (!resolvedModelPath) { + const hint = modelId ? ` (modelId=${JSON.stringify(modelId)})` : ''; + throw new Error( + `ONNX model path not found${hint}. ` + + 'Set indexing.embeddings.onnx.modelPath or run "npm run download-models".' + ); + } + const modelSize = statSize(resolvedModelPath); + const lowMemory = Number.isFinite(modelSize) && modelSize >= LARGE_MODEL_BYTES; + const tokenizerId = normalized.tokenizerId || modelId; + const cacheKey = JSON.stringify({ + resolvedModelPath, + tokenizerId, + executionProviders: normalized.executionProviders || null, + lowMemory, + intraOpNumThreads: normalized.intraOpNumThreads || null, + interOpNumThreads: normalized.interOpNumThreads || null, + graphOptimizationLevel: normalized.graphOptimizationLevel || null + }); + if (!onnxCache.has(cacheKey)) { + const sessionOptions = buildSessionOptions(normalized, { lowMemory }); + const promise = (async () => { + const { AutoTokenizer, env } = await import('@xenova/transformers'); + if (modelsDir) { + env.cacheDir = modelsDir; + } + const tokenizer = await AutoTokenizer.from_pretrained(tokenizerId); + const { InferenceSession, Tensor } = await import('onnxruntime-node'); + let session; + try { + session = await InferenceSession.create(resolvedModelPath, sessionOptions); + } catch (err) { + if (!lowMemory) { + const fallbackOptions = buildSessionOptions(normalized, { lowMemory: true }); + session = await InferenceSession.create(resolvedModelPath, fallbackOptions); + } else { + throw err; + } + } + return { tokenizer, session, Tensor }; + })(); + onnxCache.set(cacheKey, promise); + } + const embedderPromise = onnxCache.get(cacheKey); + const getEmbeddings = async (texts) => { + const list = Array.isArray(texts) ? 
texts : []; + if (!list.length) return []; + const { tokenizer, session, Tensor } = await embedderPromise; + const wantsTokenTypeIds = Array.isArray(session.inputNames) + && session.inputNames.includes('token_type_ids'); + const encoded = tokenizer(list, { + padding: true, + truncation: true, + return_tensor: false, + return_token_type_ids: wantsTokenTypeIds + }); + const feeds = buildFeeds(session, encoded, Tensor); + if (!Object.keys(feeds).length) return Array.from({ length: list.length }, () => []); + const outputs = await session.run(feeds); + const mainOutput = findOutput(outputs); + if (!mainOutput) return Array.from({ length: list.length }, () => []); + if (Array.isArray(mainOutput?.dims) && mainOutput.dims.length === 2) { + return rowsFromTensor(mainOutput); + } + const mask = encoded.attention_mask; + return meanPool(mainOutput, mask); + }; + return { + embedderPromise, + getEmbeddings, + getEmbedding: async (text) => { + const list = await getEmbeddings([text]); + return list[0] || []; + } + }; +} diff --git a/src/shared/optional-deps.js b/src/shared/optional-deps.js new file mode 100644 index 000000000..149fc895c --- /dev/null +++ b/src/shared/optional-deps.js @@ -0,0 +1,48 @@ +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const TRUE_VALUES = new Set(['1', 'true', 'yes', 'on']); + +const isVerbose = (options = {}) => { + if (options.verbose === true) return true; + const raw = String(process.env.PAIROFCLEATS_VERBOSE || '').trim().toLowerCase(); + return TRUE_VALUES.has(raw); +}; + +const normalizeErrorReason = (err) => { + const code = err?.code; + if (code === 'MODULE_NOT_FOUND' || code === 'ERR_MODULE_NOT_FOUND') { + return 'missing'; + } + if (code === 'ERR_REQUIRE_ESM') return 'unsupported'; + return 'error'; +}; + +const maybeLog = (message, err, options = {}) => { + if (!isVerbose(options)) return; + const logger = typeof options.logger === 'function' ? options.logger : console.warn; + const detail = err?.message ? ` (${err.message})` : ''; + logger(`[deps] ${message}${detail}`); +}; + +export function tryRequire(name, options = {}) { + try { + const mod = require(name); + return { ok: true, mod }; + } catch (err) { + const reason = normalizeErrorReason(err); + maybeLog(`Optional dependency unavailable: ${name}`, err, options); + return { ok: false, error: err, reason }; + } +} + +export async function tryImport(name, options = {}) { + try { + const mod = await import(name); + return { ok: true, mod }; + } catch (err) { + const reason = normalizeErrorReason(err); + maybeLog(`Optional dependency unavailable: ${name}`, err, options); + return { ok: false, error: err, reason }; + } +} diff --git a/src/shared/postings-config.js b/src/shared/postings-config.js index b439012ec..16ee2d647 100644 --- a/src/shared/postings-config.js +++ b/src/shared/postings-config.js @@ -7,13 +7,23 @@ * phraseMinN:number, * phraseMaxN:number, * chargramMinN:number, - * chargramMaxN:number + * chargramMaxN:number, + * chargramMaxTokenLength:number|null, + * chargramSource:string, + * fielded:boolean * }} */ export function normalizePostingsConfig(input = {}) { const cfg = input && typeof input === 'object' ? input : {}; const enablePhraseNgrams = cfg.enablePhraseNgrams !== false; const enableChargrams = cfg.enableChargrams !== false; + const fielded = cfg.fielded !== false; + const chargramSourceRaw = typeof cfg.chargramSource === 'string' + ? 
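/*
  Editor's note: illustrative sketch for normalizePostingsConfig in
  src/shared/postings-config.js; it is not part of the patch.

    import { normalizePostingsConfig } from './postings-config.js';

    normalizePostingsConfig({ chargramSource: 'FULL', chargramMaxTokenLength: 0 });
    // => { enablePhraseNgrams: true, enableChargrams: true,
    //      phraseMinN: 2, phraseMaxN: 4, chargramMinN: 3, chargramMaxN: 5,
    //      chargramMaxTokenLength: null,   // 0 or false removes the length cap
    //      chargramSource: 'full', fielded: true }
*/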
cfg.chargramSource.trim().toLowerCase() + : ''; + const chargramSource = ['full', 'fields'].includes(chargramSourceRaw) + ? chargramSourceRaw + : 'fields'; const toInt = (value) => { const num = Number(value); @@ -31,6 +41,15 @@ export function normalizePostingsConfig(input = {}) { const phraseRange = normalizeRange(cfg.phraseMinN, cfg.phraseMaxN, { min: 2, max: 4 }); const chargramRange = normalizeRange(cfg.chargramMinN, cfg.chargramMaxN, { min: 3, max: 5 }); + let chargramMaxTokenLength = 48; + if (cfg.chargramMaxTokenLength === 0 || cfg.chargramMaxTokenLength === false) { + chargramMaxTokenLength = null; + } else { + const maxTokenRaw = Number(cfg.chargramMaxTokenLength); + if (Number.isFinite(maxTokenRaw)) { + chargramMaxTokenLength = Math.max(2, Math.floor(maxTokenRaw)); + } + } return { enablePhraseNgrams, @@ -38,6 +57,9 @@ export function normalizePostingsConfig(input = {}) { phraseMinN: phraseRange.min, phraseMaxN: phraseRange.max, chargramMinN: chargramRange.min, - chargramMaxN: chargramRange.max + chargramMaxN: chargramRange.max, + chargramMaxTokenLength, + chargramSource, + fielded }; } diff --git a/src/shared/progress.js b/src/shared/progress.js index f6428d700..68a12d974 100644 --- a/src/shared/progress.js +++ b/src/shared/progress.js @@ -1,19 +1,196 @@ +import pino from 'pino'; + /** * Write a simple progress line to stderr. * @param {string} step * @param {number} i * @param {number} total */ +let lastProgressActive = false; +let lastProgressWidth = 0; +let logger = null; +let structuredEnabled = false; +let logContext = {}; +let ringMax = 200; +let ringMaxBytes = 2 * 1024 * 1024; +const ringEvents = []; +const ringSizes = []; +let ringBytes = 0; +const defaultRedactPaths = [ + 'password', + 'token', + 'secret', + 'apiKey', + 'authorization', + 'headers.authorization', + 'headers.cookie', + 'headers.set-cookie', + 'auth', + 'credentials' +]; + +const normalizeRedact = (value) => { + if (value === false) return null; + if (Array.isArray(value)) { + return value.length ? { paths: value, censor: '[redacted]' } : null; + } + if (value && typeof value === 'object') { + const paths = Array.isArray(value.paths) ? value.paths : []; + const censor = typeof value.censor === 'string' ? value.censor : '[redacted]'; + const remove = value.remove === true; + return paths.length ? { paths, censor, remove } : null; + } + return { paths: defaultRedactPaths, censor: '[redacted]' }; +}; + +const recordEvent = (level, msg, meta) => { + const payload = { + ts: new Date().toISOString(), + level, + msg, + meta: meta && typeof meta === 'object' ? meta : null + }; + let encoded = ''; + try { + encoded = JSON.stringify(payload); + } catch { + encoded = '{"ts":"[unserializable]","level":"error","msg":"[unserializable]"}'; + } + const size = Buffer.byteLength(encoded, 'utf8'); + ringEvents.push(payload); + ringSizes.push(size); + ringBytes += size; + while (ringEvents.length > ringMax || ringBytes > ringMaxBytes) { + ringBytes -= ringSizes.shift() || 0; + ringEvents.shift(); + } +}; + +export function configureLogger(options = {}) { + const enabled = options.enabled === true; + structuredEnabled = enabled; + if (!enabled) { + logger = null; + logContext = options.context && typeof options.context === 'object' + ? 
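
// Illustrative call showing how normalizePostingsConfig (above) coerces its
// inputs; the import path is assumed relative to the repo root.
import { normalizePostingsConfig } from './src/shared/postings-config.js';

const postings = normalizePostingsConfig({
  chargramMaxTokenLength: 0, // 0 or false disables the cap
  chargramSource: 'FULL'     // trimmed/lowercased; unknown values fall back to 'fields'
});
// postings.chargramMaxTokenLength === null
// postings.chargramSource === 'full'
// postings.fielded === true (defaults on unless explicitly set to false)
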
{ ...options.context }
+    : {};
+    return;
+  }
+  if (Number.isFinite(Number(options.ringMax))) {
+    ringMax = Math.max(1, Math.floor(Number(options.ringMax)));
+  }
+  if (Number.isFinite(Number(options.ringMaxBytes))) {
+    ringMaxBytes = Math.max(1024, Math.floor(Number(options.ringMaxBytes)));
+  }
+  const level = typeof options.level === 'string' && options.level.trim()
+    ? options.level.trim().toLowerCase()
+    : 'info';
+  const redact = normalizeRedact(options.redact);
+  const transport = options.pretty
+    ? {
+      target: 'pino-pretty',
+      options: { colorize: true, translateTime: 'SYS:standard' }
+    }
+    : undefined;
+  // Pass the transport descriptor inside the options object; pino's second
+  // positional argument must be a destination stream, not a transport config.
+  logger = pino({
+    level,
+    base: null,
+    timestamp: pino.stdTimeFunctions.isoTime,
+    ...(redact ? { redact } : {}),
+    ...(transport ? { transport } : {})
+  });
+  logContext = options.context && typeof options.context === 'object'
+    ? { ...options.context }
+    : {};
+}
+
+export function updateLogContext(context = {}) {
+  if (!context || typeof context !== 'object') return;
+  logContext = { ...logContext, ...context };
+}
+
+export function getRecentLogEvents() {
+  return ringEvents.slice();
+}
+
+export function isStructuredLogging() {
+  return structuredEnabled;
+}
+
+function clearProgressLine() {
+  if (!lastProgressActive || !process.stderr.isTTY) return;
+  const width = Math.max(0, lastProgressWidth);
+  if (width > 0) {
+    process.stderr.write(`\r${' '.repeat(width)}\r`);
+  }
+  lastProgressActive = false;
+  lastProgressWidth = 0;
+}
+
 export function showProgress(step, i, total) {
+  if (structuredEnabled) return;
   const pct = ((i / total) * 100).toFixed(1);
-  process.stderr.write(`\r${step.padEnd(40)} ${i}/${total} (${pct}%)`.padEnd(70));
-  if (i === total) process.stderr.write('\n');
+  const line = `${step} ${i}/${total} (${pct}%)`;
+  const isTty = process.stderr.isTTY;
+  if (isTty) {
+    process.stderr.write(`\r${line}\x1b[K`);
+    lastProgressActive = true;
+    lastProgressWidth = line.length;
+    if (i === total) {
+      process.stderr.write('\n');
+      lastProgressActive = false;
+      lastProgressWidth = 0;
+    }
+  } else {
+    process.stderr.write(`${line}\n`);
+    lastProgressActive = false;
+    lastProgressWidth = 0;
+  }
 }
 
 /**
  * Write a log message to stderr.
  * @param {string} msg
+ * @param {object} [meta]
+ */
+export function log(msg, meta = null) {
+  if (logger) {
+    logger.info({ ...logContext, ...(meta || {}) }, msg);
+    recordEvent('info', msg, meta);
+    return;
+  }
+  recordEvent('info', msg, meta);
+  clearProgressLine();
+  process.stderr.write(`\n${msg}\n`);
+}
+
+/**
+ * Write a single log line to stderr without extra spacing.
+ * @param {string} msg
+ * @param {object} [meta]
+ */
+export function logLine(msg, meta = null) {
+  if (logger) {
+    logger.info({ ...logContext, ...(meta || {}) }, msg);
+    recordEvent('info', msg, meta);
+    return;
+  }
+  recordEvent('info', msg, meta);
+  clearProgressLine();
+  process.stderr.write(`${msg}\n`);
+}
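
// Usage sketch for the structured-logging API above; the import path is
// assumed relative to the repo root, and the context keys are illustrative.
import {
  configureLogger,
  updateLogContext,
  logLine,
  logError,
  getRecentLogEvents
} from './src/shared/progress.js';

configureLogger({
  enabled: true,
  level: 'info',
  pretty: process.stderr.isTTY,
  ringMax: 500,
  context: { component: 'indexer' }
});
updateLogContext({ repo: 'sample' });
logLine('index started', { files: 120 });
logError('bundle missing', { file: 'src/a.js' });
// The ring buffer keeps the most recent events regardless of transport:
const tail = getRecentLogEvents(); // [{ ts, level, msg, meta }, ...]

+
+/**
+ * Write an error log message.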
+ * @param {string} msg + * @param {object} [meta] */ -export function log(msg) { +export function logError(msg, meta = null) { + if (logger) { + logger.error({ ...logContext, ...(meta || {}) }, msg); + recordEvent('error', msg, meta); + return; + } + recordEvent('error', msg, meta); + clearProgressLine(); process.stderr.write(`\n${msg}\n`); } diff --git a/src/shared/safe-regex.js b/src/shared/safe-regex.js new file mode 100644 index 000000000..4b3659168 --- /dev/null +++ b/src/shared/safe-regex.js @@ -0,0 +1,173 @@ +import { compileRe2js } from './safe-regex/backends/re2js.js'; +import { compileRe2, isRe2Available } from './safe-regex/backends/re2.js'; + +export const DEFAULT_SAFE_REGEX_CONFIG = { + engine: 'auto', + maxPatternLength: 512, + maxInputLength: 10000, + maxProgramSize: 2000, + timeoutMs: 25, + flags: '' +}; + +const normalizeLimit = (value, fallback) => { + if (value === 0 || value === false) return null; + const parsed = Number(value); + if (Number.isFinite(parsed) && parsed > 0) return Math.floor(parsed); + return fallback; +}; + +const normalizeEngine = (raw, fallback) => { + if (!raw) return fallback; + const key = String(raw).trim().toLowerCase(); + if (key === 'auto') return 'auto'; + if (key === 're2') return 're2'; + if (key === 're2js') return 're2js'; + return fallback; +}; + +const normalizeFlags = (raw) => { + if (!raw) return ''; + const seen = new Set(); + const out = []; + for (const ch of String(raw)) { + if (!'gimsuy'.includes(ch) || seen.has(ch)) continue; + seen.add(ch); + out.push(ch); + } + return out.join(''); +}; + +const mergeFlags = (explicit, fallback) => { + const primary = normalizeFlags(explicit); + const defaults = normalizeFlags(fallback); + if (!defaults) return primary; + if (!primary) return defaults; + const merged = []; + const seen = new Set(); + for (const ch of `${defaults}${primary}`) { + if (seen.has(ch)) continue; + seen.add(ch); + merged.push(ch); + } + return merged.join(''); +}; + +class SafeRegex { + constructor(backend, source, flags, config, requestedEngine) { + this.backend = backend; + this.engine = backend?.engine || 're2js'; + this.requestedEngine = requestedEngine || 'auto'; + this.source = source; + this.flags = flags; + this.config = config; + this.lastIndex = 0; + this.isGlobal = flags.includes('g'); + this.isSticky = flags.includes('y'); + this.usesLastIndex = this.isGlobal || this.isSticky; + } + + exec(input) { + const text = String(input ?? ''); + if (!text) { + if (this.usesLastIndex) this.lastIndex = 0; + return null; + } + + const { maxInputLength, timeoutMs } = this.config || {}; + if (maxInputLength && text.length > maxInputLength) { + if (this.usesLastIndex) this.lastIndex = 0; + return null; + } + + const startIndex = this.usesLastIndex && Number.isFinite(this.lastIndex) + ? Math.max(0, this.lastIndex) + : 0; + + if (this.usesLastIndex && startIndex > text.length) { + this.lastIndex = 0; + return null; + } + + const match = this.backend.match(text, startIndex, { timeoutMs, sticky: this.isSticky }); + if (!match) { + if (this.usesLastIndex) this.lastIndex = 0; + return null; + } + + const groups = Array.isArray(match.groups) ? 
match.groups : []; + const result = groups.slice(); + result.index = match.index; + result.input = text; + + if (this.usesLastIndex) { + if (Number.isFinite(match.nextLastIndex)) { + this.lastIndex = match.nextLastIndex; + } else { + let next = match.end; + if (Number.isFinite(next) && next === match.index) { + next = Math.min(text.length, next + 1); + } + this.lastIndex = Number.isFinite(next) ? next : 0; + } + } + + return result; + } + + test(input) { + return !!this.exec(input); + } +} + +export function normalizeSafeRegexConfig(raw = {}, defaults = {}) { + const base = { ...DEFAULT_SAFE_REGEX_CONFIG, ...defaults }; + const config = raw && typeof raw === 'object' ? raw : {}; + const hasFlagOverride = Object.prototype.hasOwnProperty.call(config, 'flags'); + const hasEngineOverride = Object.prototype.hasOwnProperty.call(config, 'engine'); + return { + engine: normalizeEngine(hasEngineOverride ? config.engine : base.engine, base.engine), + maxPatternLength: normalizeLimit(config.maxPatternLength, base.maxPatternLength), + maxInputLength: normalizeLimit(config.maxInputLength, base.maxInputLength), + maxProgramSize: normalizeLimit(config.maxProgramSize, base.maxProgramSize), + timeoutMs: normalizeLimit(config.timeoutMs, base.timeoutMs), + flags: normalizeFlags(hasFlagOverride ? config.flags : base.flags) + }; +} + +let warnedMissingRe2 = false; +const warnMissingRe2Once = () => { + if (warnedMissingRe2) return; + warnedMissingRe2 = true; + console.warn('SafeRegex: engine "re2" requested but optional dependency "re2" is not available; falling back to re2js.'); +}; + +export function createSafeRegex(pattern, flags = '', config = {}) { + const normalized = normalizeSafeRegexConfig(config); + const source = String(pattern ?? ''); + if (!source) return null; + if (normalized.maxPatternLength && source.length > normalized.maxPatternLength) { + return null; + } + + const combinedFlags = mergeFlags(flags, normalized.flags); + const requestedEngine = normalized.engine || 'auto'; + + // Try native RE2 if requested (auto or explicit) and available. + if (requestedEngine !== 're2js') { + const nativeAvailable = isRe2Available(); + if (nativeAvailable) { + const backend = compileRe2(source, combinedFlags); + if (backend) return new SafeRegex(backend, source, combinedFlags, normalized, requestedEngine); + } else if (requestedEngine === 're2') { + warnMissingRe2Once(); + } + } + + // Fall back to RE2JS. + const backend = compileRe2js(source, combinedFlags, normalized); + if (!backend) return null; + return new SafeRegex(backend, source, combinedFlags, normalized, requestedEngine); +} + +export const isNativeRe2Available = isRe2Available; diff --git a/src/shared/safe-regex/backends/re2.js b/src/shared/safe-regex/backends/re2.js new file mode 100644 index 000000000..cc114988d --- /dev/null +++ b/src/shared/safe-regex/backends/re2.js @@ -0,0 +1,64 @@ +import { createRequire } from 'node:module'; + +let cachedRe2 = undefined; + +const loadRe2 = () => { + if (cachedRe2 !== undefined) return cachedRe2; + try { + const require = createRequire(import.meta.url); + const mod = require('re2'); + const RE2 = (mod && typeof mod === 'object' && 'default' in mod) ? mod.default : mod; + cachedRe2 = typeof RE2 === 'function' ? 
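
// Usage sketch for createSafeRegex (above): the factory returns null rather
// than throwing when the pattern is empty, too long, or fails to compile, so
// callers should guard. The limit overrides are illustrative; import path is
// assumed relative to the repo root.
import { createSafeRegex } from './src/shared/safe-regex.js';

const re = createSafeRegex('TODO\\(([a-z]+)\\)', 'g', {
  maxInputLength: 5000,
  timeoutMs: 10
});
if (re) {
  let m;
  while ((m = re.exec('TODO(alice) and TODO(bob)')) !== null) {
    console.log(m[1], 'at', m.index); // alice at 0, bob at 16
  }
  // exec resets lastIndex to 0 on a miss, so the loop terminates cleanly.
}
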
RE2 : null; + } catch { + cachedRe2 = null; + } + return cachedRe2; +}; + +export const isRe2Available = () => Boolean(loadRe2()); + +export const compileRe2 = (source, flags) => { + const RE2 = loadRe2(); + if (!RE2) return null; + try { + const compiled = new RE2(source, flags); + if (!compiled) return null; + return { + engine: 're2', + source, + flags, + match(text, startIndex, { timeoutMs = 0, sticky = false } = {}) { + const started = timeoutMs ? Date.now() : 0; + + // Keep lastIndex behavior consistent with JS RegExp semantics: + // only meaningful for global (g) or sticky (y). + const usesLastIndex = flags.includes('g') || flags.includes('y') || sticky; + if (typeof compiled.lastIndex === 'number') { + compiled.lastIndex = usesLastIndex ? startIndex : 0; + } + + const result = compiled.exec(text); + + if (timeoutMs && Date.now() - started > timeoutMs) return null; + if (!result) return null; + + const index = Number.isFinite(result.index) ? result.index : 0; + + // Some RE2 builds may ignore 'y'; enforce sticky if requested. + if (sticky && index !== startIndex) return null; + + const groups = Array.from(result); + const matchText = groups[0] ?? ''; + const end = index + String(matchText).length; + + const nextLastIndex = (typeof compiled.lastIndex === 'number' && Number.isFinite(compiled.lastIndex)) + ? compiled.lastIndex + : null; + + return { groups, index, end, nextLastIndex }; + } + }; + } catch { + return null; + } +}; diff --git a/src/shared/safe-regex/backends/re2js.js b/src/shared/safe-regex/backends/re2js.js new file mode 100644 index 000000000..6c9391f9c --- /dev/null +++ b/src/shared/safe-regex/backends/re2js.js @@ -0,0 +1,43 @@ +import { RE2JS } from 're2js'; + +const toFlagMask = (flags) => { + let mask = 0; + if (flags.includes('i')) mask |= RE2JS.CASE_INSENSITIVE; + if (flags.includes('m')) mask |= RE2JS.MULTILINE; + if (flags.includes('s')) mask |= RE2JS.DOTALL; + return mask; +}; + +export const compileRe2js = (source, flags, config = {}) => { + const mask = toFlagMask(flags); + try { + const translated = RE2JS.translateRegExp(source); + const compiled = RE2JS.compile(translated, mask); + if (config.maxProgramSize && compiled.programSize() > config.maxProgramSize) { + return null; + } + const groupCount = compiled.groupCount(); + return { + engine: 're2js', + source, + flags, + match(text, startIndex, { timeoutMs = 0, sticky = false } = {}) { + const started = timeoutMs ? 
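
// The native backend above lazily probes for the optional "re2" package and
// caches the result (including failure) so repeated calls stay cheap. A
// condensed sketch of that probe-and-cache shape:
import { createRequire } from 'node:module';

let cached; // undefined = not probed yet, null = unavailable
function loadOptionalCtor(name) {
  if (cached !== undefined) return cached;
  try {
    const mod = createRequire(import.meta.url)(name);
    const ctor = (mod && typeof mod === 'object' && 'default' in mod) ? mod.default : mod;
    cached = typeof ctor === 'function' ? ctor : null;
  } catch {
    cached = null; // remember the miss; never re-probe
  }
  return cached;
}
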
Date.now() : 0; + const matcher = compiled.matcher(text); + const found = matcher.find(startIndex); + if (timeoutMs && Date.now() - started > timeoutMs) return null; + if (!found) return null; + const index = matcher.start(); + if (sticky && index !== startIndex) return null; + const end = matcher.end(); + const groups = new Array(groupCount + 1); + for (let i = 0; i <= groupCount; i += 1) { + groups[i] = matcher.group(i); + } + return { groups, index, end }; + } + }; + } catch { + return null; + } +}; diff --git a/src/shared/stable-json.js b/src/shared/stable-json.js new file mode 100644 index 000000000..ff75a57ac --- /dev/null +++ b/src/shared/stable-json.js @@ -0,0 +1,17 @@ +export function stableStringify(value) { + return JSON.stringify(normalize(value)); +} + +function normalize(value) { + if (Array.isArray(value)) { + return value.map((entry) => normalize(entry)); + } + if (!value || typeof value !== 'object' || value.constructor !== Object) { + return value; + } + const out = {}; + for (const key of Object.keys(value).sort()) { + out[key] = normalize(value[key]); + } + return out; +} diff --git a/src/shared/threads.js b/src/shared/threads.js new file mode 100644 index 000000000..a933cb66c --- /dev/null +++ b/src/shared/threads.js @@ -0,0 +1,77 @@ +import os from 'node:os'; + +/** + * Resolve thread limits and concurrency defaults. + * @param {object} input + * @returns {object} + */ +export function resolveThreadLimits(input = {}) { + const { + argv = {}, + rawArgv = [], + envConfig = {}, + configConcurrency = null, + importConcurrencyConfig = null, + ioConcurrencyCapConfig = null, + defaultMultiplier = 4 + } = input; + const cpuCount = os.cpus().length; + const defaultFileConcurrency = Math.max(1, Math.min(cpuCount, 16)); + const defaultThreads = Math.max(1, defaultFileConcurrency * defaultMultiplier); + const rawCliThreads = Number(argv.threads); + const envThreads = Number(envConfig.threads); + const threadsArgPresent = Array.isArray(rawArgv) + && rawArgv.some((arg) => arg === '--threads' || String(arg).startsWith('--threads=')); + const envThreadsProvided = Number.isFinite(envThreads) && envThreads > 0; + const cliThreadsProvided = threadsArgPresent + || (Number.isFinite(rawCliThreads) && rawCliThreads !== defaultThreads); + const cliConcurrency = envThreadsProvided + ? envThreads + : (cliThreadsProvided ? rawCliThreads : null); + const requestedConcurrency = Number.isFinite(cliConcurrency) + ? Math.floor(cliConcurrency) + : Number.isFinite(configConcurrency) + ? Math.floor(configConcurrency) + : defaultFileConcurrency; + const cappedConcurrency = Math.max(1, Math.min(cpuCount, requestedConcurrency)); + const maxConcurrencyCap = Math.max(defaultFileConcurrency, cappedConcurrency); + const fileConcurrency = Math.max(1, Math.min(maxConcurrencyCap, cappedConcurrency)); + const importConcurrency = Math.max( + 1, + Math.min( + maxConcurrencyCap, + Number.isFinite(cliConcurrency) + ? fileConcurrency + : Number.isFinite(Number(importConcurrencyConfig)) + ? Number(importConcurrencyConfig) + : fileConcurrency + ) + ); + const ioPlatformCap = process.platform === 'win32' ? 32 : 64; + const ioBase = Math.max(fileConcurrency, importConcurrency); + const configuredIoCap = Number.isFinite(Number(ioConcurrencyCapConfig)) && Number(ioConcurrencyCapConfig) > 0 + ? Math.floor(Number(ioConcurrencyCapConfig)) + : null; + const ioDerived = Math.max(1, Math.min(ioPlatformCap, ioBase * 4)); + const ioConcurrency = configuredIoCap !== null + ? 
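
// stableStringify (above) sorts plain-object keys recursively, so logically
// equal objects serialize to byte-identical strings, which makes it suitable
// for deterministic cache keys. Import path assumed relative to the repo root.
import { stableStringify } from './src/shared/stable-json.js';

stableStringify({ b: 1, a: { d: 2, c: 3 } });
// -> '{"a":{"c":3,"d":2},"b":1}'
stableStringify([{ z: 1, y: 2 }]);
// -> '[{"y":2,"z":1}]' (arrays keep their order; only object keys sort)
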
Math.max(1, Math.min(ioDerived, configuredIoCap)) + : ioDerived; + const cpuConcurrency = Math.max(1, Math.min(maxConcurrencyCap, fileConcurrency)); + const source = envThreadsProvided + ? 'env' + : cliThreadsProvided + ? 'cli' + : Number.isFinite(configConcurrency) + ? 'config' + : 'default'; + return { + cpuCount, + defaultThreads, + maxConcurrencyCap, + fileConcurrency, + importConcurrency, + ioConcurrency, + cpuConcurrency, + source + }; +} diff --git a/src/shared/tokenize.js b/src/shared/tokenize.js index 7e2470302..5b8b7e13b 100644 --- a/src/shared/tokenize.js +++ b/src/shared/tokenize.js @@ -1,3 +1,4 @@ +import AhoCorasick from 'aho-corasick'; import Snowball from 'snowball-stemmers'; const stemmer = Snowball.newStemmer('english'); @@ -32,32 +33,254 @@ export function splitId(s) { } /** - * Split a token into dictionary words when possible. - * @param {string} token - * @param {Set} dict + * Split an identifier into tokens while preserving case. + * @param {string} s * @returns {string[]} */ -export function splitWordsWithDict(token, dict) { - if (!dict || dict.size === 0) return [token]; +export function splitIdPreserveCase(s) { + return s + .replace(/([a-z])([A-Z])/g, '$1 $2') + .replace(/[_\-]+/g, ' ') + .split(/[^a-zA-Z0-9]+/u) + .flatMap((tok) => tok.split(/(?<=.)(?=[A-Z])/)) + .filter(Boolean); +} + +export function extractPunctuationTokens(text) { + if (!text) return []; + const tokens = text.match(/[=<>!:+\-*/%&|^~.?]{1,4}|[()[\]{}.,;:]/g); + return tokens ? tokens.filter(Boolean) : []; +} + +const DEFAULT_DICT_SEGMENTATION = { + mode: 'auto', + dpMaxTokenLength: 32 +}; + +const VALID_DICT_SEGMENT_MODES = new Set(['auto', 'greedy', 'dp', 'aho']); +const MAX_AHO_DICT_SIZE = 200000; + +const normalizeDictSegmentation = (options = {}) => { + const modeRaw = typeof options.segmentation === 'string' + ? options.segmentation.toLowerCase() + : ''; + const mode = VALID_DICT_SEGMENT_MODES.has(modeRaw) + ? modeRaw + : DEFAULT_DICT_SEGMENTATION.mode; + const dpMaxTokenLengthRaw = Number(options.dpMaxTokenLength); + const dpMaxTokenLength = Number.isFinite(dpMaxTokenLengthRaw) + ? Math.max(4, Math.floor(dpMaxTokenLengthRaw)) + : DEFAULT_DICT_SEGMENTATION.dpMaxTokenLength; + return { mode, dpMaxTokenLength }; +}; + +const getDictMaxLen = (dict) => { + if (!dict || dict.size === 0) return 0; + const cached = dict.__maxTokenLength; + if (Number.isFinite(cached) && cached > 0) return cached; + const altMax = Number.isFinite(dict.maxLen) && dict.maxLen > 0 ? 
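
// Usage sketch for resolveThreadLimits: env threads win over CLI, which wins
// over config, which wins over the CPU-derived default. Values shown are
// illustrative; import path assumed relative to the repo root.
import { resolveThreadLimits } from './src/shared/threads.js';

const limits = resolveThreadLimits({
  argv: { threads: 8 },
  rawArgv: ['--threads', '8'],
  envConfig: {},            // no env override in this sketch
  configConcurrency: 4,
  ioConcurrencyCapConfig: 16
});
// limits.source === 'cli'
// limits.fileConcurrency <= os.cpus().length
// limits.ioConcurrency <= 16 (config cap), itself under the platform cap
// (32 on win32, 64 elsewhere)
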
dict.maxLen : 0; + if (altMax) return altMax; + if (dict.__sharedDict) return 0; + if (typeof dict[Symbol.iterator] !== 'function') return 0; + let maxLen = 0; + for (const word of dict) { + if (typeof word === 'string' && word.length > maxLen) maxLen = word.length; + } + dict.__maxTokenLength = maxLen; + return maxLen; +}; + +const buildDictAhoMatcher = (dict) => { + if (!dict || dict.__sharedDict) return null; + if (typeof dict[Symbol.iterator] !== 'function') return null; + if (Number.isFinite(dict.size) && dict.size > MAX_AHO_DICT_SIZE) return null; + const matcher = new AhoCorasick(); + const words = []; + for (const word of dict) { + if (typeof word !== 'string' || !word) continue; + words.push(word); + matcher.add(word, word); + } + if (!words.length) return null; + matcher.build_fail(); + return { matcher, words }; +}; + +const getDictAhoMatcher = (dict) => { + if (!dict || dict.size === 0 || dict.__sharedDict) return null; + const cached = dict.__ahoMatcher; + if (cached && cached.size === dict.size) return cached.matcher; + const built = buildDictAhoMatcher(dict); + if (!built) return null; + dict.__ahoMatcher = { + matcher: built.matcher, + size: dict.size, + words: built.words + }; + return built.matcher; +}; + +const buildAhoMatches = (token, dict) => { + const matcher = getDictAhoMatcher(dict); + if (!matcher || !token) return null; + const matchesByStart = Array.from({ length: token.length }, () => []); + matcher.search(token, (value, _data, offset) => { + if (!value) return; + const start = Number(offset); + if (!Number.isFinite(start) || start < 0) return; + const end = start + value.length; + if (end > token.length) return; + matchesByStart[start].push({ word: value, end }); + }); + return matchesByStart; +}; + +const findLongestMatch = (token, start, dict, maxLen) => { + const endLimit = Math.min(token.length, start + maxLen); + for (let end = endLimit; end > start; end--) { + const sub = token.slice(start, end); + if (dict.has(sub)) return sub; + } + return null; +}; + +const hasDictMatchAt = (token, start, dict, maxLen) => !!findLongestMatch(token, start, dict, maxLen); + +const splitWordsWithDictGreedy = (token, dict, maxLen) => { const result = []; let i = 0; while (i < token.length) { - let found = false; - for (let j = token.length; j > i; j--) { - const sub = token.slice(i, j); - if (dict.has(sub)) { - result.push(sub); - i = j; - found = true; - break; + const match = findLongestMatch(token, i, dict, maxLen); + if (match) { + result.push(match); + i += match.length; + continue; + } + const unknownStart = i; + i += 1; + while (i < token.length && !hasDictMatchAt(token, i, dict, maxLen)) { + i += 1; + } + result.push(token.slice(unknownStart, i)); + } + return result; +}; + +const pickBetterSegment = (current, candidate) => { + if (!current) return candidate; + if (candidate.matchChars > current.matchChars) return candidate; + if (candidate.matchChars < current.matchChars) return current; + if (candidate.segments < current.segments) return candidate; + if (candidate.segments > current.segments) return current; + if (candidate.isDict && !current.isDict) return candidate; + return current; +}; + +const splitWordsWithDictDp = (token, dict, maxLen, matchesByStart = null) => { + const n = token.length; + const best = new Array(n + 1).fill(null); + best[n] = { matchChars: 0, segments: 0, next: n, token: '', isDict: false }; + for (let i = n - 1; i >= 0; i--) { + let bestChoice = null; + const fallback = best[i + 1]; + if (fallback) { + bestChoice = 
pickBetterSegment(bestChoice, { + matchChars: fallback.matchChars, + segments: fallback.segments + 1, + next: i + 1, + token: token.slice(i, i + 1), + isDict: false + }); + } + if (matchesByStart) { + const matches = matchesByStart[i]; + if (matches && matches.length) { + for (const match of matches) { + const nextScore = best[match.end]; + if (!nextScore) continue; + bestChoice = pickBetterSegment(bestChoice, { + matchChars: nextScore.matchChars + match.word.length, + segments: nextScore.segments + 1, + next: match.end, + token: match.word, + isDict: true + }); + } + } + } else { + const endLimit = Math.min(n, i + maxLen); + for (let end = endLimit; end > i; end--) { + const word = token.slice(i, end); + if (!dict.has(word)) continue; + const nextScore = best[end]; + if (!nextScore) continue; + bestChoice = pickBetterSegment(bestChoice, { + matchChars: nextScore.matchChars + word.length, + segments: nextScore.segments + 1, + next: end, + token: word, + isDict: true + }); } } - if (!found) { - result.push(token[i]); - i++; + best[i] = bestChoice; + } + const segments = []; + let idx = 0; + while (idx < n && best[idx]) { + const entry = best[idx]; + segments.push(entry); + idx = entry.next; + } + const result = []; + let buffer = ''; + for (const seg of segments) { + if (!seg.isDict) { + buffer += seg.token; + continue; + } + if (buffer) { + result.push(buffer); + buffer = ''; } + result.push(seg.token); } + if (buffer) result.push(buffer); return result; +}; + +const scoreSegments = (segments, dict) => segments.reduce((sum, seg) => ( + dict.has(seg) ? sum + seg.length : sum +), 0); + +/** + * Split a token into dictionary words when possible. + * @param {string} token + * @param {{size:number,has:function}|Set} dict + * @param {{segmentation?:string,dpMaxTokenLength?:number}} [options] + * @returns {string[]} + */ +export function splitWordsWithDict(token, dict, options = {}) { + if (!dict || dict.size === 0 || typeof dict.has !== 'function') return [token]; + if (!token) return []; + const { mode, dpMaxTokenLength } = normalizeDictSegmentation(options); + const maxLen = getDictMaxLen(dict); + if (!maxLen) return [token]; + const greedy = splitWordsWithDictGreedy(token, dict, maxLen); + if (mode === 'greedy') return greedy; + const shouldUseDp = token.length <= dpMaxTokenLength; + const matchesByStart = shouldUseDp && ['auto', 'dp', 'aho'].includes(mode) + ? buildAhoMatches(token, dict) + : null; + if (mode === 'dp' || mode === 'aho') { + if (!shouldUseDp) return greedy; + return splitWordsWithDictDp(token, dict, maxLen, matchesByStart); + } + if (shouldUseDp) { + const dp = splitWordsWithDictDp(token, dict, maxLen, matchesByStart); + if (scoreSegments(dp, dict) > scoreSegments(greedy, dict)) return dp; + } + return greedy; } /** diff --git a/src/sqlite/utils.js b/src/sqlite/utils.js deleted file mode 100644 index 07dcaf186..000000000 --- a/src/sqlite/utils.js +++ /dev/null @@ -1,90 +0,0 @@ -import fs from 'node:fs'; -import path from 'node:path'; - -/** - * Split an array into fixed-size chunks. - * @param {Array} items - * @param {number} [size] - * @returns {Array>} - */ -export function chunkArray(items, size = 900) { - const chunks = []; - for (let i = 0; i < items.length; i += size) { - chunks.push(items.slice(i, i + size)); - } - return chunks; -} - -/** - * Return the set of table names in a SQLite database. 
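
// Example of the segmentation modes above on a Set-backed dictionary.
// Greedy takes the longest match at each position; 'dp' maximizes total
// matched characters, so it recovers when an early long match strands the
// rest of the token. Import path assumed relative to the repo root.
import { splitWordsWithDict } from './src/shared/tokenize.js';

const dict = new Set(['get', 'getu', 'user', 'name']);
splitWordsWithDict('getusername', dict, { segmentation: 'greedy' });
// -> ['getu', 'ser', 'name']  (longest-first grabs 'getu', stranding 'ser')
splitWordsWithDict('getusername', dict, { segmentation: 'dp' });
// -> ['get', 'user', 'name']  (11 matched chars beats greedy's 8)
// In the default 'auto' mode the dp result wins here too, via scoreSegments.
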
- * @param {import('better-sqlite3').Database} db - * @returns {Set} - */ -export function getTableNames(db) { - const rows = db.prepare("SELECT name FROM sqlite_master WHERE type='table'").all(); - return new Set(rows.map((row) => row.name)); -} - -/** - * Check that all required tables exist. - * @param {import('better-sqlite3').Database} db - * @param {string[]} requiredTables - * @returns {boolean} - */ -export function hasRequiredTables(db, requiredTables) { - const tableNames = getTableNames(db); - return requiredTables.every((name) => tableNames.has(name)); -} - -/** - * Normalize a file path to POSIX separators. - * @param {string} value - * @returns {string} - */ -export function normalizeFilePath(value) { - if (typeof value !== 'string') return value; - return value.replace(/\\/g, '/'); -} - -/** - * Read and parse JSON from disk. - * @param {string} filePath - * @returns {any} - */ -export function readJson(filePath) { - return JSON.parse(fs.readFileSync(filePath, 'utf8')); -} - -/** - * Read JSON from disk if it exists; otherwise return null. - * @param {string} dir - * @param {string} name - * @returns {any|null} - */ -export function loadOptional(dir, name) { - const target = path.join(dir, name); - if (!fs.existsSync(target)) return null; - return readJson(target); -} - -/** - * Load file-backed index artifacts from a directory. - * @param {string} dir - * @param {string} modelId - * @returns {object|null} - */ -export function loadIndex(dir, modelId) { - const chunkMetaPath = path.join(dir, 'chunk_meta.json'); - if (!fs.existsSync(chunkMetaPath)) return null; - const chunkMeta = readJson(chunkMetaPath); - const denseVec = loadOptional(dir, 'dense_vectors_uint8.json'); - if (denseVec && !denseVec.model) denseVec.model = modelId || null; - return { - chunkMeta, - denseVec, - phraseNgrams: loadOptional(dir, 'phrase_ngrams.json'), - chargrams: loadOptional(dir, 'chargram_postings.json'), - minhash: loadOptional(dir, 'minhash_signatures.json'), - tokenPostings: loadOptional(dir, 'token_postings.json') - }; -} diff --git a/src/storage/backend-policy.js b/src/storage/backend-policy.js new file mode 100644 index 000000000..0f90a48f7 --- /dev/null +++ b/src/storage/backend-policy.js @@ -0,0 +1,218 @@ +export function resolveBackendPolicy({ + backendArg, + sqliteScoreModeConfig = false, + sqliteConfigured = true, + sqliteAvailable = false, + lmdbConfigured = true, + lmdbAvailable = false, + sqliteAutoChunkThreshold = 0, + sqliteAutoArtifactBytes = 0, + needsSqlite = true, + chunkCounts = [], + artifactBytes = [] +} = {}) { + const normalized = typeof backendArg === 'string' ? backendArg.toLowerCase() : ''; + const backendAuto = !normalized || normalized === 'auto'; + const sqliteFtsRequested = normalized === 'sqlite-fts' + || normalized === 'fts' + || (backendAuto && sqliteScoreModeConfig === true); + const backendForcedSqlite = normalized === 'sqlite' || sqliteFtsRequested; + const backendForcedLmdb = normalized === 'lmdb'; + const backendForcedMemory = normalized === 'memory'; + const backendDisabled = normalized + && !backendAuto + && !backendForcedSqlite + && !backendForcedLmdb + && !backendForcedMemory; + + const counts = Array.isArray(chunkCounts) + ? chunkCounts.filter((count) => Number.isFinite(count)) + : []; + const maxChunkCount = counts.length ? Math.max(...counts) : null; + const byteTotals = Array.isArray(artifactBytes) + ? artifactBytes.filter((count) => Number.isFinite(count)) + : []; + const totalArtifactBytes = byteTotals.length + ? 
byteTotals.reduce((sum, next) => sum + next, 0) + : null; + + const policy = { + requested: normalized || 'auto', + sqliteAutoChunkThreshold, + sqliteAutoArtifactBytes, + maxChunkCount, + totalArtifactBytes, + lmdbAvailable, + lmdbConfigured + }; + + if (backendDisabled) { + return { + useSqlite: false, + useLmdb: false, + backendLabel: 'memory', + sqliteFtsRequested: false, + backendForcedSqlite: false, + backendForcedLmdb: false, + backendForcedMemory: false, + backendDisabled: true, + reason: 'unknown backend requested', + policy + }; + } + + if (!needsSqlite) { + return { + useSqlite: false, + useLmdb: false, + backendLabel: 'memory', + sqliteFtsRequested: false, + backendForcedSqlite: false, + backendForcedLmdb, + backendForcedMemory, + backendDisabled: false, + reason: 'no sqlite needed for selected mode', + policy + }; + } + + if (backendForcedLmdb && !lmdbAvailable) { + return { + useSqlite: false, + useLmdb: false, + backendLabel: 'lmdb', + sqliteFtsRequested: false, + backendForcedSqlite: false, + backendForcedLmdb, + backendForcedMemory, + backendDisabled: false, + reason: 'lmdb indexes missing', + error: 'LMDB backend requested but index not found', + policy + }; + } + + if (backendForcedLmdb) { + return { + useSqlite: false, + useLmdb: true, + backendLabel: 'lmdb', + sqliteFtsRequested: false, + backendForcedSqlite: false, + backendForcedLmdb, + backendForcedMemory, + backendDisabled: false, + reason: 'lmdb backend forced by flag', + policy + }; + } + + if (backendForcedSqlite && !sqliteAvailable) { + return { + useSqlite: false, + useLmdb: false, + backendLabel: sqliteFtsRequested ? 'sqlite-fts' : 'sqlite', + sqliteFtsRequested, + backendForcedSqlite, + backendForcedLmdb, + backendForcedMemory, + backendDisabled: false, + reason: 'sqlite indexes missing', + error: 'SQLite backend requested but index not found', + policy + }; + } + + if (backendForcedSqlite) { + return { + useSqlite: true, + useLmdb: false, + backendLabel: sqliteFtsRequested ? 'sqlite-fts' : 'sqlite', + sqliteFtsRequested, + backendForcedSqlite, + backendForcedLmdb, + backendForcedMemory, + backendDisabled: false, + reason: 'sqlite backend forced by flag', + policy + }; + } + + if (backendForcedMemory) { + return { + useSqlite: false, + useLmdb: false, + backendLabel: 'memory', + sqliteFtsRequested: false, + backendForcedSqlite: false, + backendForcedLmdb, + backendForcedMemory: true, + backendDisabled: false, + reason: 'memory backend forced by flag', + policy + }; + } + + if (!sqliteConfigured || !sqliteAvailable) { + if (lmdbConfigured && lmdbAvailable) { + return { + useSqlite: false, + useLmdb: true, + backendLabel: 'lmdb', + sqliteFtsRequested: false, + backendForcedSqlite: false, + backendForcedLmdb: false, + backendForcedMemory: false, + backendDisabled: false, + reason: sqliteConfigured ? 'sqlite indexes unavailable; using lmdb' : 'sqlite disabled; using lmdb', + policy + }; + } + return { + useSqlite: false, + useLmdb: false, + backendLabel: 'memory', + sqliteFtsRequested: false, + backendForcedSqlite: false, + backendForcedLmdb, + backendForcedMemory: false, + backendDisabled: false, + reason: sqliteConfigured ? 
'sqlite indexes unavailable' : 'sqlite disabled', + policy + }; + } + + let autoUseSqlite = true; + let autoReason = 'auto default'; + const thresholdsEnabled = sqliteAutoChunkThreshold > 0 || sqliteAutoArtifactBytes > 0; + if (thresholdsEnabled) { + const hits = []; + if (sqliteAutoChunkThreshold > 0 && Number.isFinite(maxChunkCount)) { + hits.push(maxChunkCount >= sqliteAutoChunkThreshold ? 'chunkCount' : null); + } + if (sqliteAutoArtifactBytes > 0 && Number.isFinite(totalArtifactBytes)) { + hits.push(totalArtifactBytes >= sqliteAutoArtifactBytes ? 'artifactBytes' : null); + } + const hitReasons = hits.filter(Boolean); + if (hitReasons.length) { + autoUseSqlite = true; + autoReason = `auto threshold met (${hitReasons.join(', ')})`; + } else if (hits.length) { + autoUseSqlite = false; + autoReason = 'auto threshold not met'; + } + } + + return { + useSqlite: autoUseSqlite, + useLmdb: false, + backendLabel: autoUseSqlite ? (sqliteFtsRequested ? 'sqlite-fts' : 'sqlite') : 'memory', + sqliteFtsRequested, + backendForcedSqlite: false, + backendForcedLmdb: false, + backendForcedMemory: false, + backendDisabled: false, + reason: autoReason, + policy + }; +} diff --git a/src/storage/lmdb/schema.js b/src/storage/lmdb/schema.js new file mode 100644 index 000000000..3b65652ed --- /dev/null +++ b/src/storage/lmdb/schema.js @@ -0,0 +1,31 @@ +export const LMDB_SCHEMA_VERSION = 1; + +export const LMDB_META_KEYS = { + schemaVersion: 'meta:schemaVersion', + createdAt: 'meta:createdAt', + mode: 'meta:mode', + artifacts: 'meta:artifacts', + chunkCount: 'meta:chunkCount', + sourceIndex: 'meta:sourceIndex' +}; + +export const LMDB_ARTIFACT_KEYS = { + chunkMeta: 'artifact:chunk_meta', + tokenPostings: 'artifact:token_postings', + fileMeta: 'artifact:file_meta', + fileRelations: 'artifact:file_relations', + repoMap: 'artifact:repo_map', + filterIndex: 'artifact:filter_index', + fieldPostings: 'artifact:field_postings', + fieldTokens: 'artifact:field_tokens', + phraseNgrams: 'artifact:phrase_ngrams', + chargramPostings: 'artifact:chargram_postings', + minhashSignatures: 'artifact:minhash_signatures', + denseVectors: 'artifact:dense_vectors_uint8', + denseVectorsDoc: 'artifact:dense_vectors_doc_uint8', + denseVectorsCode: 'artifact:dense_vectors_code_uint8', + denseHnswMeta: 'artifact:dense_vectors_hnsw_meta', + indexState: 'artifact:index_state' +}; + +export const LMDB_ARTIFACT_LIST = Object.values(LMDB_ARTIFACT_KEYS); diff --git a/src/sqlite/build-helpers.js b/src/storage/sqlite/build-helpers.js similarity index 90% rename from src/sqlite/build-helpers.js rename to src/storage/sqlite/build-helpers.js index 663f8013a..8b563473f 100644 --- a/src/sqlite/build-helpers.js +++ b/src/storage/sqlite/build-helpers.js @@ -9,8 +9,14 @@ import { normalizeFilePath } from './utils.js'; */ export function buildChunkRow(chunk, mode, id) { const tokensArray = Array.isArray(chunk.tokens) ? chunk.tokens : []; + const chunkId = chunk?.metaV2?.chunkId || chunk?.chunkId || null; + const signature = typeof chunk.docmeta?.signature === 'string' + ? chunk.docmeta.signature + : (typeof chunk.signature === 'string' ? chunk.signature : null); + const doc = typeof chunk.docmeta?.doc === 'string' ? 
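
// Decision sketch for resolveBackendPolicy (above): forced backends fail fast
// with an error string when their index is missing, while 'auto' applies the
// chunk/byte thresholds. Numbers are illustrative; import path assumed
// relative to the repo root.
import { resolveBackendPolicy } from './src/storage/backend-policy.js';

const decision = resolveBackendPolicy({
  backendArg: 'auto',
  sqliteConfigured: true,
  sqliteAvailable: true,
  sqliteAutoChunkThreshold: 50000,
  chunkCounts: [12000, 3400]
});
// decision.useSqlite === false  (reason: 'auto threshold not met', 12000 < 50000)
// decision.backendLabel === 'memory'
// decision.policy.maxChunkCount === 12000

const forced = resolveBackendPolicy({ backendArg: 'lmdb', lmdbAvailable: false });
// forced.error === 'LMDB backend requested but index not found'
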
chunk.docmeta.doc : null; return { id, + chunk_id: chunkId, mode, file: normalizeFilePath(chunk.file), start: chunk.start, @@ -20,7 +26,9 @@ export function buildChunkRow(chunk, mode, id) { ext: chunk.ext || null, kind: chunk.kind || null, name: chunk.name || null, + signature, headline: chunk.headline || null, + doc, preContext: chunk.preContext ? JSON.stringify(chunk.preContext) : null, postContext: chunk.postContext ? JSON.stringify(chunk.postContext) : null, weight: typeof chunk.weight === 'number' ? chunk.weight : 1, diff --git a/src/storage/sqlite/build/bundle-loader.js b/src/storage/sqlite/build/bundle-loader.js new file mode 100644 index 000000000..d1a4ed7a9 --- /dev/null +++ b/src/storage/sqlite/build/bundle-loader.js @@ -0,0 +1,46 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import Piscina from 'piscina'; +import { readBundleFile } from '../../../shared/bundle-io.js'; + +export const createBundleLoader = ({ bundleThreads, workerPath }) => { + const useWorkers = Number.isFinite(bundleThreads) && bundleThreads > 1; + const pool = useWorkers && workerPath + ? new Piscina({ filename: workerPath, maxThreads: bundleThreads }) + : null; + + const loadBundle = async ({ bundleDir, entry, file }) => { + const bundleName = entry?.bundle; + if (!bundleName) { + return { file, ok: false, reason: 'missing bundle entry' }; + } + const bundlePath = path.join(bundleDir, bundleName); + if (!fsSync.existsSync(bundlePath)) { + return { file, ok: false, reason: 'bundle file missing' }; + } + try { + if (pool) { + const result = await pool.run({ bundlePath }); + if (!result?.ok) { + return { file, ok: false, reason: result?.reason || 'invalid bundle' }; + } + return { file, ok: true, bundle: result.bundle }; + } + const result = await readBundleFile(bundlePath); + if (!result.ok) { + return { file, ok: false, reason: result.reason || 'invalid bundle' }; + } + return { file, ok: true, bundle: result.bundle }; + } catch (err) { + return { file, ok: false, reason: err?.message || String(err) }; + } + }; + + const close = async () => { + if (pool) { + await pool.destroy(); + } + }; + + return { loadBundle, close, useWorkers }; +}; diff --git a/src/storage/sqlite/build/delete.js b/src/storage/sqlite/build/delete.js new file mode 100644 index 000000000..513f86b78 --- /dev/null +++ b/src/storage/sqlite/build/delete.js @@ -0,0 +1,45 @@ +import { chunkArray } from '../utils.js'; + +export function deleteDocIds(db, mode, docIds, extraTables = []) { + if (!docIds.length) return; + const deleteTargets = [ + { table: 'chunks', column: 'id' }, + { table: 'chunks_fts', column: 'rowid' }, + { table: 'token_postings', column: 'doc_id' }, + { table: 'phrase_postings', column: 'doc_id' }, + { table: 'chargram_postings', column: 'doc_id' }, + { table: 'minhash_signatures', column: 'doc_id' }, + { table: 'dense_vectors', column: 'doc_id' }, + { table: 'doc_lengths', column: 'doc_id' } + ]; + for (const extra of extraTables) { + if (extra?.table && extra?.column) deleteTargets.push(extra); + } + for (const chunk of chunkArray(docIds)) { + const placeholders = chunk.map(() => '?').join(','); + for (const target of deleteTargets) { + const withMode = target.withMode !== false; + const values = target.transform ? chunk.map(target.transform) : chunk; + const where = withMode + ? `mode = ? 
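
// Usage sketch for createBundleLoader (above): with bundleThreads > 1 and a
// worker script, bundle parsing fans out to a Piscina pool; otherwise it
// reads inline on the main thread. The worker path and directories below are
// illustrative assumptions.
import { createBundleLoader } from './src/storage/sqlite/build/bundle-loader.js';

const loader = createBundleLoader({
  bundleThreads: 4,
  workerPath: '/repo/scripts/bundle-worker.js' // assumed worker script
});
const result = await loader.loadBundle({
  bundleDir: '/tmp/index/bundles',
  entry: { bundle: 'src__a.js.json' },
  file: 'src/a.js'
});
if (!result.ok) console.warn(`skip ${result.file}: ${result.reason}`);
await loader.close(); // drains the pool when workers were used
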
AND ${target.column} IN (${placeholders})` + : `${target.column} IN (${placeholders})`; + const stmt = db.prepare(`DELETE FROM ${target.table} WHERE ${where}`); + if (withMode) { + stmt.run(mode, ...values); + } else { + stmt.run(...values); + } + } + } +} + +export function updateTokenStats(db, mode, insertTokenStats) { + const row = db.prepare( + 'SELECT COUNT(*) AS total_docs, AVG(len) AS avg_doc_len FROM doc_lengths WHERE mode = ?' + ).get(mode) || {}; + insertTokenStats.run( + mode, + typeof row.avg_doc_len === 'number' ? row.avg_doc_len : 0, + typeof row.total_docs === 'number' ? row.total_docs : 0 + ); +} diff --git a/src/storage/sqlite/build/from-artifacts.js b/src/storage/sqlite/build/from-artifacts.js new file mode 100644 index 000000000..0db127f19 --- /dev/null +++ b/src/storage/sqlite/build/from-artifacts.js @@ -0,0 +1,557 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import readline from 'node:readline'; +import { + buildChunkRow, + buildTokenFrequency, + prepareVectorAnnTable +} from '../build-helpers.js'; +import { CREATE_INDEXES_SQL, CREATE_TABLES_BASE_SQL, SCHEMA_VERSION } from '../schema.js'; +import { normalizeFilePath, readJson, loadOptional } from '../utils.js'; +import { packUint32, packUint8, dequantizeUint8ToFloat32, toVectorId } from '../vector.js'; +import { applyBuildPragmas, restoreBuildPragmas } from './pragmas.js'; +import { normalizeManifestFiles } from './manifest.js'; +import { validateSqliteDatabase } from './validate.js'; +import { createInsertStatements } from './statements.js'; + +const listShardFiles = (dir, prefix) => { + if (!fsSync.existsSync(dir)) return []; + return fsSync + .readdirSync(dir) + .filter((name) => name.startsWith(prefix) && (name.endsWith('.json') || name.endsWith('.jsonl'))) + .sort() + .map((name) => path.join(dir, name)); +}; + +const resolveChunkMetaSources = (dir) => { + const metaPath = path.join(dir, 'chunk_meta.meta.json'); + const partsDir = path.join(dir, 'chunk_meta.parts'); + if (fsSync.existsSync(metaPath) || fsSync.existsSync(partsDir)) { + let parts = []; + if (fsSync.existsSync(metaPath)) { + try { + const meta = readJson(metaPath); + if (Array.isArray(meta?.parts) && meta.parts.length) { + parts = meta.parts.map((name) => path.join(dir, name)); + } + } catch {} + } + if (!parts.length) { + parts = listShardFiles(partsDir, 'chunk_meta.part-'); + } + return parts.length ? { format: 'jsonl', paths: parts } : null; + } + const jsonlPath = path.join(dir, 'chunk_meta.jsonl'); + if (fsSync.existsSync(jsonlPath)) { + return { format: 'jsonl', paths: [jsonlPath] }; + } + const jsonPath = path.join(dir, 'chunk_meta.json'); + if (fsSync.existsSync(jsonPath)) { + return { format: 'json', paths: [jsonPath] }; + } + return null; +}; + +const resolveTokenPostingsSources = (dir) => { + const metaPath = path.join(dir, 'token_postings.meta.json'); + const shardsDir = path.join(dir, 'token_postings.shards'); + if (!fsSync.existsSync(metaPath) && !fsSync.existsSync(shardsDir)) return null; + let parts = []; + if (fsSync.existsSync(metaPath)) { + try { + const meta = readJson(metaPath); + if (Array.isArray(meta?.parts) && meta.parts.length) { + parts = meta.parts.map((name) => path.join(dir, name)); + } + } catch {} + } + if (!parts.length) { + parts = listShardFiles(shardsDir, 'token_postings.part-'); + } + return parts.length ? 
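
// The chunked DELETE above batches doc ids (chunkArray defaults to 900 per
// statement) to stay under SQLite's classic 999 bound-parameter limit. A
// condensed sketch of the placeholder pattern for one batch; the source also
// supports extra tables without a mode column and per-id transforms.
function deleteBatch(db, table, column, mode, ids) {
  const placeholders = ids.map(() => '?').join(',');
  db.prepare(`DELETE FROM ${table} WHERE mode = ? AND ${column} IN (${placeholders})`)
    .run(mode, ...ids);
}
// deleteBatch(db, 'token_postings', 'doc_id', 'code', [1, 2, 3]);
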
{ metaPath, parts } : null; +}; + +const readJsonLinesFile = async (filePath, onEntry) => { + const stream = fsSync.createReadStream(filePath, { encoding: 'utf8' }); + const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); + for await (const line of rl) { + const trimmed = line.trim(); + if (!trimmed) continue; + onEntry(JSON.parse(trimmed)); + } +}; + +export const loadIndexPieces = (dir, modelId) => { + const sources = resolveChunkMetaSources(dir); + if (!sources) return null; + const denseVec = loadOptional(dir, 'dense_vectors_uint8.json'); + if (denseVec && !denseVec.model) denseVec.model = modelId || null; + return { + chunkMeta: null, + dir, + fileMeta: loadOptional(dir, 'file_meta.json'), + denseVec, + phraseNgrams: loadOptional(dir, 'phrase_ngrams.json'), + chargrams: loadOptional(dir, 'chargram_postings.json'), + minhash: loadOptional(dir, 'minhash_signatures.json'), + tokenPostings: null + }; +}; + +export async function buildDatabaseFromArtifacts({ + Database, + outPath, + index, + indexDir, + mode, + manifestFiles, + emitOutput, + validateMode, + vectorConfig, + modelConfig +}) { + if (!index) return 0; + const manifestLookup = normalizeManifestFiles(manifestFiles || {}); + if (emitOutput && manifestLookup.conflicts.length) { + console.warn(`[sqlite] Manifest path conflicts for ${mode}; using normalized entries.`); + } + const manifestByNormalized = manifestLookup.map; + const validationStats = { chunks: 0, dense: 0, minhash: 0 }; + const vectorExtension = vectorConfig?.extension || {}; + const encodeVector = vectorConfig?.encodeVector; + + const db = new Database(outPath); + applyBuildPragmas(db); + + let count = 0; + let succeeded = false; + try { + db.exec(CREATE_TABLES_BASE_SQL); + db.pragma(`user_version = ${SCHEMA_VERSION}`); + const vectorAnn = prepareVectorAnnTable({ db, indexData: index, mode, vectorConfig }); + + const statements = createInsertStatements(db); + const { + insertChunk, + insertFts, + insertTokenVocab, + insertTokenPosting, + insertDocLength, + insertTokenStats, + insertPhraseVocab, + insertPhrasePosting, + insertChargramVocab, + insertChargramPosting, + insertMinhash, + insertDense, + insertDenseMeta, + insertFileManifest + } = statements; + + function ingestTokenIndex(tokenIndex, targetMode) { + if (!tokenIndex?.vocab || !tokenIndex?.postings) return; + const vocab = tokenIndex.vocab; + const postings = tokenIndex.postings; + const docLengths = Array.isArray(tokenIndex.docLengths) ? tokenIndex.docLengths : []; + const avgDocLen = typeof tokenIndex.avgDocLen === 'number' ? tokenIndex.avgDocLen : null; + const totalDocs = typeof tokenIndex.totalDocs === 'number' ? 
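
// Streaming sketch matching readJsonLinesFile above: shard parts are read
// line by line, so chunk metadata never has to fit in memory at once.
import fsSync from 'node:fs';
import readline from 'node:readline';

async function countJsonl(filePath) {
  const rl = readline.createInterface({
    input: fsSync.createReadStream(filePath, { encoding: 'utf8' }),
    crlfDelay: Infinity
  });
  let rows = 0;
  for await (const line of rl) {
    if (!line.trim()) continue; // skip blank lines, as the original does
    JSON.parse(line); // throws on a corrupt shard line, like the original
    rows += 1;
  }
  return rows;
}
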
tokenIndex.totalDocs : docLengths.length; + + const insertVocabTx = db.transaction(() => { + for (let i = 0; i < vocab.length; i += 1) { + insertTokenVocab.run(targetMode, i, vocab[i]); + } + }); + insertVocabTx(); + + const insertPostingsTx = db.transaction(() => { + for (let tokenId = 0; tokenId < postings.length; tokenId += 1) { + const posting = postings[tokenId] || []; + for (const entry of posting) { + if (!entry) continue; + const docId = entry[0]; + const tf = entry[1]; + insertTokenPosting.run(targetMode, tokenId, docId, tf); + } + } + }); + insertPostingsTx(); + + const insertLengthsTx = db.transaction(() => { + for (let docId = 0; docId < docLengths.length; docId += 1) { + insertDocLength.run(targetMode, docId, docLengths[docId]); + } + }); + insertLengthsTx(); + + insertTokenStats.run(targetMode, avgDocLen, totalDocs); + } + + function ingestTokenIndexFromPieces(targetMode, indexDir) { + const directPath = path.join(indexDir, 'token_postings.json'); + const directPathGz = `${directPath}.gz`; + const sources = resolveTokenPostingsSources(indexDir); + if (!sources && !fsSync.existsSync(directPath) && !fsSync.existsSync(directPathGz)) { + return false; + } + if (!sources) { + const tokenIndex = readJson(directPath); + ingestTokenIndex(tokenIndex, targetMode); + return true; + } + const meta = fsSync.existsSync(sources.metaPath) ? readJson(sources.metaPath) : {}; + const docLengths = Array.isArray(meta?.docLengths) + ? meta.docLengths + : (Array.isArray(meta?.arrays?.docLengths) ? meta.arrays.docLengths : []); + const totalDocs = Number.isFinite(meta?.totalDocs) ? meta.totalDocs : docLengths.length; + const avgDocLen = Number.isFinite(meta?.avgDocLen) + ? meta.avgDocLen + : (Number.isFinite(meta?.fields?.avgDocLen) ? meta.fields.avgDocLen : ( + docLengths.length + ? docLengths.reduce((sum, len) => sum + (Number.isFinite(len) ? len : 0), 0) / docLengths.length + : 0 + )); + const insertLengthsTx = db.transaction(() => { + for (let docId = 0; docId < docLengths.length; docId += 1) { + insertDocLength.run(targetMode, docId, docLengths[docId]); + } + }); + insertLengthsTx(); + insertTokenStats.run(targetMode, avgDocLen, totalDocs); + let tokenId = 0; + for (const shardPath of sources.parts) { + const shard = readJson(shardPath); + const vocab = Array.isArray(shard?.vocab) + ? shard.vocab + : (Array.isArray(shard?.arrays?.vocab) ? shard.arrays.vocab : []); + const postings = Array.isArray(shard?.postings) + ? shard.postings + : (Array.isArray(shard?.arrays?.postings) ? shard.arrays.postings : []); + const insertVocabTx = db.transaction(() => { + for (let i = 0; i < vocab.length; i += 1) { + insertTokenVocab.run(targetMode, tokenId + i, vocab[i]); + } + }); + insertVocabTx(); + const insertPostingsTx = db.transaction(() => { + for (let i = 0; i < postings.length; i += 1) { + const posting = postings[i] || []; + const postingTokenId = tokenId + i; + for (const entry of posting) { + if (!entry) continue; + insertTokenPosting.run(targetMode, postingTokenId, entry[0], entry[1]); + } + } + }); + insertPostingsTx(); + tokenId += vocab.length; + } + return true; + } + + function ingestTokenIndexFromChunks(chunks, targetMode) { + if (!Array.isArray(chunks) || !chunks.length) return; + const tokenIdMap = new Map(); + let nextTokenId = 0; + let totalDocs = 0; + let totalLen = 0; + const insertTx = db.transaction(() => { + for (let i = 0; i < chunks.length; i += 1) { + const chunk = chunks[i]; + if (!chunk) continue; + const docId = Number.isFinite(chunk.id) ? 
chunk.id : i; + const tokensArray = Array.isArray(chunk.tokens) ? chunk.tokens : []; + const docLen = tokensArray.length; + totalDocs += 1; + totalLen += docLen; + insertDocLength.run(targetMode, docId, docLen); + if (!docLen) continue; + const freq = buildTokenFrequency(tokensArray); + for (const [token, tf] of freq.entries()) { + let tokenId = tokenIdMap.get(token); + if (tokenId === undefined) { + tokenId = nextTokenId; + nextTokenId += 1; + tokenIdMap.set(token, tokenId); + insertTokenVocab.run(targetMode, tokenId, token); + } + insertTokenPosting.run(targetMode, tokenId, docId, tf); + } + } + }); + insertTx(); + insertTokenStats.run(targetMode, totalDocs ? totalLen / totalDocs : 0, totalDocs); + } + + function ingestPostingIndex(indexData, targetMode, insertVocabStmt, insertPostingStmt) { + if (!indexData?.vocab || !indexData?.postings) return; + const vocab = indexData.vocab; + const postings = indexData.postings; + + const insertVocabTx = db.transaction(() => { + for (let i = 0; i < vocab.length; i += 1) { + insertVocabStmt.run(targetMode, i, vocab[i]); + } + }); + insertVocabTx(); + + const insertPostingsTx = db.transaction(() => { + for (let tokenId = 0; tokenId < postings.length; tokenId += 1) { + const posting = postings[tokenId] || []; + for (const docId of posting) { + insertPostingStmt.run(targetMode, tokenId, docId); + } + } + }); + insertPostingsTx(); + } + + function ingestMinhash(minhash, targetMode) { + if (!minhash?.signatures || !minhash.signatures.length) return; + const insertTx = db.transaction(() => { + for (let docId = 0; docId < minhash.signatures.length; docId += 1) { + const sig = minhash.signatures[docId]; + if (!sig) continue; + insertMinhash.run(targetMode, docId, packUint32(sig)); + validationStats.minhash += 1; + } + }); + insertTx(); + } + + function ingestDense(dense, targetMode) { + if (!dense?.vectors || !dense.vectors.length) return; + insertDenseMeta.run( + targetMode, + dense.dims || null, + typeof dense.scale === 'number' ? dense.scale : 1.0, + dense.model || modelConfig.id || null + ); + const insertTx = db.transaction(() => { + for (let docId = 0; docId < dense.vectors.length; docId += 1) { + const vec = dense.vectors[docId]; + if (!vec) continue; + insertDense.run(targetMode, docId, packUint8(vec)); + validationStats.dense += 1; + if (vectorAnn?.insert && encodeVector) { + const floatVec = dequantizeUint8ToFloat32(vec); + const encoded = encodeVector(floatVec, vectorExtension); + if (encoded) vectorAnn.insert.run(toVectorId(docId), encoded); + } + } + }); + insertTx(); + } + + const buildChunkRowWithMeta = (chunk, targetMode, fileMetaById) => { + const fileMeta = Number.isFinite(chunk.fileId) + ? fileMetaById.get(chunk.fileId) + : null; + const resolvedFile = normalizeFilePath(chunk.file || fileMeta?.file); + const resolvedExt = chunk.ext || fileMeta?.ext || null; + const resolvedExternalDocs = chunk.externalDocs || fileMeta?.externalDocs || null; + const resolvedLastModified = chunk.last_modified || fileMeta?.last_modified || null; + const resolvedLastAuthor = chunk.last_author || fileMeta?.last_author || null; + const resolvedChurn = typeof chunk.churn === 'number' + ? chunk.churn + : (typeof fileMeta?.churn === 'number' ? fileMeta.churn : null); + const resolvedChurnAdded = typeof chunk.churn_added === 'number' + ? chunk.churn_added + : (typeof fileMeta?.churn_added === 'number' ? fileMeta.churn_added : null); + const resolvedChurnDeleted = typeof chunk.churn_deleted === 'number' + ? 
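
// buildTokenFrequency (imported above, defined elsewhere in this repo) is
// assumed here to fold a token array into a Map of token -> term frequency;
// the rebuild path then assigns dense integer token ids on first sight. A
// self-contained sketch of that assumed fold:
function tokenFrequencySketch(tokens) {
  const freq = new Map();
  for (const token of tokens) {
    freq.set(token, (freq.get(token) || 0) + 1);
  }
  return freq;
}
// tokenFrequencySketch(['a', 'b', 'a']) -> Map { 'a' => 2, 'b' => 1 }
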
chunk.churn_deleted + : (typeof fileMeta?.churn_deleted === 'number' ? fileMeta.churn_deleted : null); + const resolvedChurnCommits = typeof chunk.churn_commits === 'number' + ? chunk.churn_commits + : (typeof fileMeta?.churn_commits === 'number' ? fileMeta.churn_commits : null); + const tokensArray = Array.isArray(chunk.tokens) ? chunk.tokens : []; + const tokensText = tokensArray.join(' '); + const signatureText = typeof chunk.docmeta?.signature === 'string' + ? chunk.docmeta.signature + : (typeof chunk.signature === 'string' ? chunk.signature : null); + const docText = typeof chunk.docmeta?.doc === 'string' ? chunk.docmeta.doc : null; + const stableChunkId = chunk?.metaV2?.chunkId || chunk?.chunkId || null; + return { + id: Number.isFinite(chunk.id) ? chunk.id : null, + chunk_id: stableChunkId, + mode: targetMode, + file: resolvedFile, + start: chunk.start, + end: chunk.end, + startLine: chunk.startLine || null, + endLine: chunk.endLine || null, + ext: resolvedExt, + kind: chunk.kind || null, + name: chunk.name || null, + signature: signatureText, + headline: chunk.headline || null, + doc: docText, + preContext: chunk.preContext ? JSON.stringify(chunk.preContext) : null, + postContext: chunk.postContext ? JSON.stringify(chunk.postContext) : null, + weight: typeof chunk.weight === 'number' ? chunk.weight : 1, + tokens: tokensArray.length ? JSON.stringify(tokensArray) : null, + tokensText, + ngrams: chunk.ngrams ? JSON.stringify(chunk.ngrams) : null, + codeRelations: chunk.codeRelations ? JSON.stringify(chunk.codeRelations) : null, + docmeta: chunk.docmeta ? JSON.stringify(chunk.docmeta) : null, + stats: chunk.stats ? JSON.stringify(chunk.stats) : null, + complexity: chunk.complexity ? JSON.stringify(chunk.complexity) : null, + lint: chunk.lint ? JSON.stringify(chunk.lint) : null, + externalDocs: resolvedExternalDocs ? JSON.stringify(resolvedExternalDocs) : null, + last_modified: resolvedLastModified, + last_author: resolvedLastAuthor, + churn: resolvedChurn, + churn_added: resolvedChurnAdded, + churn_deleted: resolvedChurnDeleted, + churn_commits: resolvedChurnCommits, + chunk_authors: chunk.chunk_authors ? 
JSON.stringify(chunk.chunk_authors) : null + }; + }; + + const ingestChunkMetaPieces = async (targetMode, indexDir, fileMetaById) => { + const sources = resolveChunkMetaSources(indexDir); + if (!sources) return { count: 0, fileCounts: new Map() }; + const fileCounts = new Map(); + const rows = []; + const insert = db.transaction((batch) => { + for (const row of batch) { + insertChunk.run(row); + insertFts.run(row); + } + }); + const flush = () => { + if (!rows.length) return; + insert(rows); + rows.length = 0; + }; + let chunkCount = 0; + const handleChunk = (chunk) => { + if (!chunk) return; + if (!Number.isFinite(chunk.id)) { + chunk.id = chunkCount; + } + const row = buildChunkRowWithMeta(chunk, targetMode, fileMetaById); + if (row.file) { + fileCounts.set(row.file, (fileCounts.get(row.file) || 0) + 1); + } + rows.push(row); + chunkCount += 1; + if (rows.length >= 500) flush(); + }; + if (sources.format === 'json') { + const data = readJson(sources.paths[0]); + if (Array.isArray(data)) { + for (const chunk of data) handleChunk(chunk); + } + } else { + for (const chunkPath of sources.paths) { + await readJsonLinesFile(chunkPath, handleChunk); + } + } + flush(); + return { count: chunkCount, fileCounts }; + }; + + async function ingestIndex(indexData, targetMode, indexDir) { + if (!indexData && !indexDir) return 0; + const fileMetaById = new Map(); + if (Array.isArray(indexData?.fileMeta)) { + for (const entry of indexData.fileMeta) { + if (!entry || !Number.isFinite(entry.id)) continue; + fileMetaById.set(entry.id, entry); + } + } + let chunkCount = 0; + let fileCounts = new Map(); + if (Array.isArray(indexData?.chunkMeta)) { + const insert = db.transaction((rows) => { + for (const row of rows) { + insertChunk.run(row); + insertFts.run(row); + } + }); + const rows = []; + for (let i = 0; i < indexData.chunkMeta.length; i += 1) { + const chunk = indexData.chunkMeta[i]; + if (!chunk) continue; + if (!Number.isFinite(chunk.id)) { + chunk.id = i; + } + const row = buildChunkRowWithMeta(chunk, targetMode, fileMetaById); + rows.push(row); + if (row.file) { + fileCounts.set(row.file, (fileCounts.get(row.file) || 0) + 1); + } + chunkCount += 1; + } + insert(rows); + } else if (indexDir) { + const result = await ingestChunkMetaPieces(targetMode, indexDir, fileMetaById); + chunkCount = result.count; + fileCounts = result.fileCounts; + } + + let tokenIngested = false; + if (indexData?.tokenPostings) { + ingestTokenIndex(indexData.tokenPostings, targetMode); + tokenIngested = true; + } + if (!tokenIngested && indexDir) { + tokenIngested = ingestTokenIndexFromPieces(targetMode, indexDir); + } + if (!tokenIngested) { + console.warn(`[sqlite] token_postings missing; rebuilding tokens for ${targetMode}.`); + if (Array.isArray(indexData?.chunkMeta)) { + ingestTokenIndexFromChunks(indexData.chunkMeta, targetMode); + } else { + console.warn(`[sqlite] chunk_meta unavailable for token rebuild (${targetMode}).`); + } + } + + ingestPostingIndex(indexData?.phraseNgrams, targetMode, insertPhraseVocab, insertPhrasePosting); + ingestPostingIndex(indexData?.chargrams, targetMode, insertChargramVocab, insertChargramPosting); + ingestMinhash(indexData?.minhash, targetMode); + ingestDense(indexData?.denseVec, targetMode); + ingestFileManifest(fileCounts, targetMode); + + return chunkCount; + } + + function ingestFileManifest(fileCounts, targetMode) { + if (!fileCounts || !fileCounts.size) return; + const insertTx = db.transaction(() => { + for (const [file, count] of fileCounts.entries()) { + const normalizedFile = 
normalizeFilePath(file); + const entry = manifestByNormalized.get(normalizedFile)?.entry || null; + insertFileManifest.run( + targetMode, + normalizedFile, + entry?.hash || null, + Number.isFinite(entry?.mtimeMs) ? entry.mtimeMs : null, + Number.isFinite(entry?.size) ? entry.size : null, + count + ); + } + }); + insertTx(); + } + + count = await ingestIndex(index, mode, indexDir); + validationStats.chunks = count; + db.exec(CREATE_INDEXES_SQL); + validateSqliteDatabase(db, mode, { + validateMode, + expected: validationStats, + emitOutput + }); + succeeded = true; + } finally { + restoreBuildPragmas(db); + db.close(); + if (!succeeded) { + try { + fsSync.rmSync(outPath, { force: true }); + } catch {} + } + } + return count; +} diff --git a/src/storage/sqlite/build/from-bundles.js b/src/storage/sqlite/build/from-bundles.js new file mode 100644 index 000000000..05e5a1df0 --- /dev/null +++ b/src/storage/sqlite/build/from-bundles.js @@ -0,0 +1,301 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import { buildChunkRow, buildTokenFrequency } from '../build-helpers.js'; +import { CREATE_INDEXES_SQL, CREATE_TABLES_BASE_SQL, SCHEMA_VERSION } from '../schema.js'; +import { normalizeFilePath } from '../utils.js'; +import { packUint32, packUint8, quantizeVec, toVectorId } from '../vector.js'; +import { applyBuildPragmas, restoreBuildPragmas } from './pragmas.js'; +import { normalizeManifestFiles } from './manifest.js'; +import { validateSqliteDatabase } from './validate.js'; +import { createInsertStatements } from './statements.js'; +import { createBundleLoader } from './bundle-loader.js'; + +export async function buildDatabaseFromBundles({ + Database, + outPath, + mode, + incrementalData, + envConfig, + threadLimits, + emitOutput, + validateMode, + vectorConfig, + modelConfig, + workerPath +}) { + if (!incrementalData?.manifest) { + return { count: 0, reason: 'missing incremental manifest' }; + } + const manifestFiles = incrementalData.manifest.files || {}; + const manifestLookup = normalizeManifestFiles(manifestFiles); + const manifestEntries = manifestLookup.entries; + if (!manifestEntries.length) { + return { count: 0, reason: 'incremental manifest empty' }; + } + if (emitOutput && manifestLookup.conflicts.length) { + console.warn(`[sqlite] Manifest path conflicts for ${mode}; using normalized entries.`); + } + const totalFiles = manifestEntries.length; + let processedFiles = 0; + let lastProgressLog = 0; + const progressIntervalMs = 1000; + const envBundleThreads = Number(envConfig.bundleThreads); + const bundleThreads = Number.isFinite(envBundleThreads) && envBundleThreads > 0 + ? Math.floor(envBundleThreads) + : Math.max(1, Math.floor(threadLimits.fileConcurrency)); + const bundleLoader = createBundleLoader({ bundleThreads, workerPath }); + const useBundleWorkers = bundleLoader.useWorkers; + const logBundleProgress = (file, force = false) => { + if (!emitOutput) return; + const now = Date.now(); + if (!force && now - lastProgressLog < progressIntervalMs) return; + lastProgressLog = now; + const percent = ((processedFiles / totalFiles) * 100).toFixed(1); + const suffix = file ? 
` | ${file}` : ''; + console.log(`[sqlite] bundles ${processedFiles}/${totalFiles} (${percent}%)${suffix}`); + }; + if (emitOutput) { + console.log(`[sqlite] Using incremental bundles for ${mode} (${totalFiles} files).`); + if (useBundleWorkers) { + console.log(`[sqlite] Bundle parser workers: ${bundleThreads}.`); + } + } + + const db = new Database(outPath); + applyBuildPragmas(db); + db.exec(CREATE_TABLES_BASE_SQL); + db.pragma(`user_version = ${SCHEMA_VERSION}`); + let succeeded = false; + try { + const statements = createInsertStatements(db); + const { + insertChunk, + insertFts, + insertTokenVocab, + insertTokenPosting, + insertDocLength, + insertTokenStats, + insertPhraseVocab, + insertPhrasePosting, + insertChargramVocab, + insertChargramPosting, + insertMinhash, + insertDense, + insertDenseMeta, + insertFileManifest + } = statements; + + const tokenIdMap = new Map(); + const phraseIdMap = new Map(); + const chargramIdMap = new Map(); + let nextTokenId = 0; + let nextPhraseId = 0; + let nextChargramId = 0; + let nextDocId = 0; + let totalDocs = 0; + let totalLen = 0; + const validationStats = { chunks: 0, dense: 0, minhash: 0 }; + + const fileCounts = new Map(); + for (const record of manifestEntries) { + fileCounts.set(record.normalized, 0); + } + + const vectorExtension = vectorConfig?.extension || {}; + const vectorAnnEnabled = vectorConfig?.enabled === true; + const encodeVector = vectorConfig?.encodeVector; + let denseMetaSet = false; + let denseDims = null; + let denseWarned = false; + let vectorAnnLoaded = false; + let vectorAnnReady = false; + let vectorAnnTable = vectorExtension.table || 'dense_vectors_ann'; + let vectorAnnColumn = vectorExtension.column || 'embedding'; + let insertVectorAnn = null; + if (vectorAnnEnabled) { + const loadResult = vectorConfig.loadVectorExtension(db, vectorExtension, `sqlite ${mode}`); + if (loadResult.ok) { + vectorAnnLoaded = true; + if (vectorConfig.hasVectorTable(db, vectorAnnTable)) { + vectorAnnReady = true; + } + } else { + console.warn(`[sqlite] Vector extension unavailable for ${mode}: ${loadResult.reason}`); + } + } + + const insertBundle = db.transaction((bundle, fileKey) => { + const normalizedFile = normalizeFilePath(fileKey); + let chunkCount = 0; + for (const chunk of bundle.chunks || []) { + const docId = nextDocId; + nextDocId += 1; + + const row = buildChunkRow({ ...chunk, file: chunk.file || fileKey }, mode, docId); + insertChunk.run(row); + insertFts.run(row); + + const tokensArray = Array.isArray(chunk.tokens) ? 
chunk.tokens : []; + insertDocLength.run(mode, docId, tokensArray.length); + totalDocs += 1; + totalLen += tokensArray.length; + + if (tokensArray.length) { + const freq = buildTokenFrequency(tokensArray); + for (const [token, tf] of freq.entries()) { + let tokenId = tokenIdMap.get(token); + if (tokenId === undefined) { + tokenId = nextTokenId; + nextTokenId += 1; + tokenIdMap.set(token, tokenId); + insertTokenVocab.run(mode, tokenId, token); + } + insertTokenPosting.run(mode, tokenId, docId, tf); + } + } + + if (Array.isArray(chunk.ngrams)) { + const unique = new Set(chunk.ngrams); + for (const ng of unique) { + let phraseId = phraseIdMap.get(ng); + if (phraseId === undefined) { + phraseId = nextPhraseId; + nextPhraseId += 1; + phraseIdMap.set(ng, phraseId); + insertPhraseVocab.run(mode, phraseId, ng); + } + insertPhrasePosting.run(mode, phraseId, docId); + } + } + + if (Array.isArray(chunk.chargrams)) { + const unique = new Set(chunk.chargrams); + for (const gram of unique) { + let gramId = chargramIdMap.get(gram); + if (gramId === undefined) { + gramId = nextChargramId; + nextChargramId += 1; + chargramIdMap.set(gram, gramId); + insertChargramVocab.run(mode, gramId, gram); + } + insertChargramPosting.run(mode, gramId, docId); + } + } + + if (Array.isArray(chunk.minhashSig) && chunk.minhashSig.length) { + insertMinhash.run(mode, docId, packUint32(chunk.minhashSig)); + validationStats.minhash += 1; + } + + if (Array.isArray(chunk.embedding) && chunk.embedding.length) { + const dims = chunk.embedding.length; + if (!denseMetaSet) { + insertDenseMeta.run(mode, dims, 1.0, modelConfig.id || null); + denseMetaSet = true; + denseDims = dims; + } else if (denseDims !== null && dims !== denseDims && !denseWarned) { + console.warn(`Dense vector dims mismatch for ${mode}: expected ${denseDims}, got ${dims}`); + denseWarned = true; + } + insertDense.run(mode, docId, packUint8(quantizeVec(chunk.embedding))); + validationStats.dense += 1; + if (vectorAnnLoaded) { + if (!vectorAnnReady) { + const created = vectorConfig.ensureVectorTable(db, vectorExtension, dims); + if (created.ok) { + vectorAnnReady = true; + vectorAnnTable = created.tableName; + vectorAnnColumn = created.column; + insertVectorAnn = db.prepare( + `INSERT OR REPLACE INTO ${vectorAnnTable} (rowid, ${vectorAnnColumn}) VALUES (?, ?)` + ); + } + } + if (vectorAnnReady && insertVectorAnn && encodeVector) { + const encoded = encodeVector(chunk.embedding, vectorExtension); + if (encoded) insertVectorAnn.run(toVectorId(docId), encoded); + } + } + } + + chunkCount += 1; + } + + fileCounts.set(normalizedFile, (fileCounts.get(normalizedFile) || 0) + chunkCount); + }); + + let count = 0; + let bundleFailure = null; + const batchSize = useBundleWorkers + ? 
Math.max(1, Math.min(totalFiles, Math.max(1, bundleThreads * 2))) + : 1; + try { + for (let i = 0; i < manifestEntries.length; i += batchSize) { + const batch = manifestEntries.slice(i, i + batchSize); + const tasks = batch.map((record) => bundleLoader.loadBundle({ + bundleDir: incrementalData.bundleDir, + entry: record.entry, + file: record.file + })); + const results = await Promise.all(tasks); + const failure = results.find((result) => !result.ok); + if (failure) { + bundleFailure = `${failure.reason} for ${failure.file}`; + break; + } + for (const result of results) { + insertBundle(result.bundle, result.file); + count += result.bundle.chunks.length; + processedFiles += 1; + logBundleProgress(result.file, processedFiles === totalFiles); + } + if (bundleFailure) break; + } + } finally { + await bundleLoader.close(); + } + + if (bundleFailure) { + if (emitOutput) { + console.warn(`[sqlite] Bundle build failed for ${mode}: ${bundleFailure}.`); + } + return { count: 0, reason: bundleFailure }; + } + + validationStats.chunks = count; + insertTokenStats.run(mode, totalDocs ? totalLen / totalDocs : 0, totalDocs); + + const insertManifestTx = db.transaction(() => { + for (const [file, chunkCount] of fileCounts.entries()) { + const normalizedFile = normalizeFilePath(file); + const entry = manifestLookup.map.get(normalizedFile)?.entry || null; + insertFileManifest.run( + mode, + normalizedFile, + entry?.hash || null, + Number.isFinite(entry?.mtimeMs) ? entry.mtimeMs : null, + Number.isFinite(entry?.size) ? entry.size : null, + chunkCount + ); + } + }); + insertManifestTx(); + + db.exec(CREATE_INDEXES_SQL); + validateSqliteDatabase(db, mode, { + validateMode, + expected: validationStats, + emitOutput + }); + succeeded = true; + return { count }; + } finally { + restoreBuildPragmas(db); + db.close(); + if (!succeeded) { + try { + fsSync.rmSync(outPath, { force: true }); + } catch {} + } + } +} diff --git a/src/storage/sqlite/build/incremental-update.js b/src/storage/sqlite/build/incremental-update.js new file mode 100644 index 000000000..7be740463 --- /dev/null +++ b/src/storage/sqlite/build/incremental-update.js @@ -0,0 +1,436 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import { readBundleFile } from '../../../shared/bundle-io.js'; +import { buildChunkRow, buildTokenFrequency } from '../build-helpers.js'; +import { REQUIRED_TABLES, SCHEMA_VERSION } from '../schema.js'; +import { hasRequiredTables, normalizeFilePath } from '../utils.js'; +import { packUint32, packUint8, quantizeVec, toVectorId } from '../vector.js'; +import { deleteDocIds, updateTokenStats } from './delete.js'; +import { diffFileManifests, getFileManifest, normalizeManifestFiles } from './manifest.js'; +import { createInsertStatements } from './statements.js'; +import { getSchemaVersion, validateSqliteDatabase } from './validate.js'; +import { ensureVocabIds } from './vocab.js'; + +const MAX_INCREMENTAL_CHANGE_RATIO = 0.35; +const VOCAB_GROWTH_LIMITS = { + token_vocab: { ratio: 0.4, absolute: 200000 }, + phrase_vocab: { ratio: 0.5, absolute: 150000 }, + chargram_vocab: { ratio: 1.0, absolute: 250000 } +}; + +class IncrementalSkipError extends Error { + constructor(reason) { + super(reason); + this.reason = reason; + } +} + +export async function incrementalUpdateDatabase({ + Database, + outPath, + mode, + incrementalData, + modelConfig, + vectorConfig, + emitOutput, + validateMode, + expectedDense +}) { + if (!incrementalData?.manifest) { + return { used: false, reason: 'missing incremental manifest' }; + } + 
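+  // Note: each bail-out below returns { used: false, reason } instead of
+  // throwing, so callers can log the reason and fall back to a full rebuild.
+  // The checks run roughly cheapest-first: db presence, schema version and
+  // required tables, dense model/dims, manifest shape and change ratio, and
+  // finally per-file bundle reads.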
if (!fsSync.existsSync(outPath)) { + return { used: false, reason: 'sqlite db missing' }; + } + + const expectedModel = expectedDense?.model || modelConfig.id || null; + const expectedDims = Number.isFinite(expectedDense?.dims) ? expectedDense.dims : null; + + const db = new Database(outPath); + try { + db.pragma('journal_mode = WAL'); + db.pragma('synchronous = NORMAL'); + } catch {} + + const schemaVersion = getSchemaVersion(db); + if (schemaVersion !== SCHEMA_VERSION) { + db.close(); + return { + used: false, + reason: `schema mismatch (db=${schemaVersion ?? 'unknown'}, expected=${SCHEMA_VERSION})` + }; + } + + if (!hasRequiredTables(db, REQUIRED_TABLES)) { + db.close(); + return { used: false, reason: 'schema missing' }; + } + + const dbDenseMeta = db.prepare( + 'SELECT dims, scale, model FROM dense_meta WHERE mode = ?' + ).get(mode); + const dbDims = Number.isFinite(dbDenseMeta?.dims) ? dbDenseMeta.dims : null; + const dbModel = dbDenseMeta?.model || null; + if ((expectedModel || expectedDims !== null) && !dbDenseMeta) { + db.close(); + return { used: false, reason: 'dense metadata missing' }; + } + if (expectedModel) { + if (!dbModel) { + db.close(); + return { used: false, reason: 'dense metadata model missing' }; + } + if (dbModel !== expectedModel) { + db.close(); + return { used: false, reason: `model mismatch (db=${dbModel}, expected=${expectedModel})` }; + } + } + if (expectedDims !== null) { + if (dbDims === null) { + db.close(); + return { used: false, reason: 'dense metadata dims missing' }; + } + if (dbDims !== expectedDims) { + db.close(); + return { used: false, reason: `dense dims mismatch (db=${dbDims}, expected=${expectedDims})` }; + } + } + + const manifestFiles = incrementalData.manifest.files || {}; + const manifestLookup = normalizeManifestFiles(manifestFiles); + if (!manifestLookup.entries.length) { + db.close(); + return { used: false, reason: 'incremental manifest empty' }; + } + if (manifestLookup.conflicts.length) { + db.close(); + return { used: false, reason: 'manifest path conflicts' }; + } + + const dbFiles = getFileManifest(db, mode); + if (!dbFiles.size) { + const chunkRow = db.prepare('SELECT COUNT(*) AS total FROM chunks WHERE mode = ?') + .get(mode) || {}; + if (Number.isFinite(chunkRow.total) && chunkRow.total > 0) { + db.close(); + return { used: false, reason: 'file manifest empty' }; + } + } + + const { changed, deleted } = diffFileManifests(manifestLookup.entries, dbFiles); + const totalFiles = manifestLookup.entries.length; + if (totalFiles) { + const changeRatio = (changed.length + deleted.length) / totalFiles; + if (changeRatio > MAX_INCREMENTAL_CHANGE_RATIO) { + db.close(); + return { + used: false, + reason: `change ratio ${changeRatio.toFixed(2)} exceeds ${MAX_INCREMENTAL_CHANGE_RATIO}` + }; + } + } + if (!changed.length && !deleted.length) { + db.close(); + return { used: true, changedFiles: 0, deletedFiles: 0, insertedChunks: 0 }; + } + + const bundles = new Map(); + for (const record of changed) { + const fileKey = record.file; + const normalizedFile = record.normalized; + const entry = record.entry; + const bundleName = entry?.bundle; + if (!bundleName) { + db.close(); + return { used: false, reason: `missing bundle for ${fileKey}` }; + } + const bundlePath = path.join(incrementalData.bundleDir, bundleName); + if (!fsSync.existsSync(bundlePath)) { + db.close(); + return { used: false, reason: `bundle missing for ${fileKey}` }; + } + const result = await readBundleFile(bundlePath); + if (!result.ok) { + db.close(); + return { used: 
false, reason: `invalid bundle for ${fileKey}` }; + } + bundles.set(normalizedFile, { bundle: result.bundle, entry, fileKey, normalizedFile }); + } + + const tokenValues = []; + const phraseValues = []; + const chargramValues = []; + const incomingDimsSet = new Set(); + for (const bundleEntry of bundles.values()) { + const bundle = bundleEntry.bundle; + for (const chunk of bundle.chunks || []) { + const tokensArray = Array.isArray(chunk.tokens) ? chunk.tokens : []; + if (tokensArray.length) tokenValues.push(...tokensArray); + if (Array.isArray(chunk.ngrams)) phraseValues.push(...chunk.ngrams); + if (Array.isArray(chunk.chargrams)) chargramValues.push(...chunk.chargrams); + if (Array.isArray(chunk.embedding) && chunk.embedding.length) { + incomingDimsSet.add(chunk.embedding.length); + } + } + } + if (incomingDimsSet.size > 1) { + db.close(); + return { used: false, reason: 'embedding dims mismatch across bundles' }; + } + const incomingDims = incomingDimsSet.size ? [...incomingDimsSet][0] : null; + if (incomingDims !== null && dbDims !== null && incomingDims !== dbDims) { + db.close(); + return { used: false, reason: `embedding dims mismatch (db=${dbDims}, incoming=${incomingDims})` }; + } + if (incomingDims !== null && expectedDims !== null && incomingDims !== expectedDims) { + db.close(); + return { used: false, reason: `embedding dims mismatch (expected=${expectedDims}, incoming=${incomingDims})` }; + } + + const statements = createInsertStatements(db); + const { + insertChunk, + insertFts, + insertTokenVocab, + insertTokenPosting, + insertDocLength, + insertTokenStats, + insertPhraseVocab, + insertPhrasePosting, + insertChargramVocab, + insertChargramPosting, + insertMinhash, + insertDense, + insertDenseMeta, + insertFileManifest + } = statements; + + const existingIdsByFile = new Map(); + const fileRows = db.prepare('SELECT id, file FROM chunks WHERE mode = ? ORDER BY id') + .all(mode); + for (const row of fileRows) { + const normalized = normalizeFilePath(row.file); + const entry = existingIdsByFile.get(normalized) || { file: normalized, ids: [] }; + entry.ids.push(row.id); + existingIdsByFile.set(normalized, entry); + } + + const maxRow = db.prepare('SELECT MAX(id) AS maxId FROM chunks WHERE mode = ?') + .get(mode); + let nextDocId = Number.isFinite(maxRow?.maxId) ? maxRow.maxId + 1 : 0; + const freeDocIds = []; + let insertedChunks = 0; + + const vectorExtension = vectorConfig?.extension || {}; + const vectorAnnEnabled = vectorConfig?.enabled === true; + const encodeVector = vectorConfig?.encodeVector; + let denseMetaSet = false; + let denseDims = null; + let denseWarned = false; + let vectorAnnLoaded = false; + let vectorAnnReady = false; + let vectorAnnTable = vectorExtension.table || 'dense_vectors_ann'; + let vectorAnnColumn = vectorExtension.column || 'embedding'; + let insertVectorAnn = null; + if (vectorAnnEnabled) { + const loadResult = vectorConfig.loadVectorExtension(db, vectorExtension, `sqlite ${mode}`); + if (loadResult.ok) { + vectorAnnLoaded = true; + if (vectorConfig.hasVectorTable(db, vectorAnnTable)) { + vectorAnnReady = true; + } + } else if (emitOutput) { + console.warn(`[sqlite] Vector extension unavailable for ${mode}: ${loadResult.reason}`); + } + } + + const vectorDeleteTargets = vectorAnnLoaded && vectorAnnReady + ? 
[{ table: vectorAnnTable, column: 'rowid', withMode: false, transform: toVectorId }] + : []; + + const applyChanges = db.transaction(() => { + const tokenVocab = ensureVocabIds( + db, + mode, + 'token_vocab', + 'token_id', + 'token', + tokenValues, + insertTokenVocab, + { limits: VOCAB_GROWTH_LIMITS.token_vocab } + ); + if (tokenVocab.skip) { + throw new IncrementalSkipError(tokenVocab.reason || 'token vocab growth too large'); + } + + const phraseVocab = ensureVocabIds( + db, + mode, + 'phrase_vocab', + 'phrase_id', + 'ngram', + phraseValues, + insertPhraseVocab, + { limits: VOCAB_GROWTH_LIMITS.phrase_vocab } + ); + if (phraseVocab.skip) { + throw new IncrementalSkipError(phraseVocab.reason || 'phrase vocab growth too large'); + } + + const chargramVocab = ensureVocabIds( + db, + mode, + 'chargram_vocab', + 'gram_id', + 'gram', + chargramValues, + insertChargramVocab, + { limits: VOCAB_GROWTH_LIMITS.chargram_vocab } + ); + if (chargramVocab.skip) { + throw new IncrementalSkipError(chargramVocab.reason || 'chargram vocab growth too large'); + } + + const tokenIdMap = tokenVocab.map; + const phraseIdMap = phraseVocab.map; + const chargramIdMap = chargramVocab.map; + + for (const file of deleted) { + const normalizedFile = normalizeFilePath(file); + const entry = existingIdsByFile.get(normalizedFile); + const docIds = entry?.ids || []; + deleteDocIds(db, mode, docIds, vectorDeleteTargets); + db.prepare('DELETE FROM file_manifest WHERE mode = ? AND file = ?') + .run(mode, normalizedFile); + } + + for (const record of changed) { + const normalizedFile = record.normalized; + const entry = existingIdsByFile.get(normalizedFile); + const reuseIds = entry?.ids || []; + const docIds = reuseIds; + let reuseIndex = 0; + deleteDocIds(db, mode, docIds, vectorDeleteTargets); + + const bundleEntry = bundles.get(normalizedFile); + const bundle = bundleEntry?.bundle; + let chunkCount = 0; + for (const chunk of bundle?.chunks || []) { + let docId; + if (reuseIndex < reuseIds.length) { + docId = reuseIds[reuseIndex]; + reuseIndex += 1; + } else if (freeDocIds.length) { + docId = freeDocIds.pop(); + } else { + docId = nextDocId; + nextDocId += 1; + } + const row = buildChunkRow( + { ...chunk, file: chunk.file || normalizedFile }, + mode, + docId + ); + insertChunk.run(row); + insertFts.run(row); + + const tokensArray = Array.isArray(chunk.tokens) ? 
chunk.tokens : []; + insertDocLength.run(mode, docId, tokensArray.length); + const freq = buildTokenFrequency(tokensArray); + for (const [token, tf] of freq.entries()) { + const tokenId = tokenIdMap.get(token); + if (tokenId === undefined) continue; + insertTokenPosting.run(mode, tokenId, docId, tf); + } + + if (Array.isArray(chunk.ngrams)) { + const unique = new Set(chunk.ngrams); + for (const ng of unique) { + const phraseId = phraseIdMap.get(ng); + if (phraseId === undefined) continue; + insertPhrasePosting.run(mode, phraseId, docId); + } + } + + if (Array.isArray(chunk.chargrams)) { + const unique = new Set(chunk.chargrams); + for (const gram of unique) { + const gramId = chargramIdMap.get(gram); + if (gramId === undefined) continue; + insertChargramPosting.run(mode, gramId, docId); + } + } + + if (Array.isArray(chunk.minhashSig) && chunk.minhashSig.length) { + insertMinhash.run(mode, docId, packUint32(chunk.minhashSig)); + } + + if (Array.isArray(chunk.embedding) && chunk.embedding.length) { + const dims = chunk.embedding.length; + if (!denseMetaSet) { + insertDenseMeta.run(mode, dims, 1.0, modelConfig.id || null); + denseMetaSet = true; + denseDims = dims; + } else if (denseDims !== null && dims !== denseDims && !denseWarned) { + console.warn(`Dense vector dims mismatch for ${mode}: expected ${denseDims}, got ${dims}`); + denseWarned = true; + } + insertDense.run(mode, docId, packUint8(quantizeVec(chunk.embedding))); + if (vectorAnnLoaded) { + if (!vectorAnnReady) { + const created = vectorConfig.ensureVectorTable(db, vectorExtension, dims); + if (created.ok) { + vectorAnnReady = true; + vectorAnnTable = created.tableName; + vectorAnnColumn = created.column; + insertVectorAnn = db.prepare( + `INSERT OR REPLACE INTO ${vectorAnnTable} (rowid, ${vectorAnnColumn}) VALUES (?, ?)` + ); + } + } + if (vectorAnnReady && insertVectorAnn && encodeVector) { + const encoded = encodeVector(chunk.embedding, vectorExtension); + if (encoded) insertVectorAnn.run(toVectorId(docId), encoded); + } + } + } + + chunkCount += 1; + insertedChunks += 1; + } + if (reuseIndex < reuseIds.length) { + freeDocIds.push(...reuseIds.slice(reuseIndex)); + } + + const manifestEntry = record.entry || bundleEntry?.entry || {}; + insertFileManifest.run( + mode, + normalizedFile, + manifestEntry?.hash || null, + Number.isFinite(manifestEntry?.mtimeMs) ? manifestEntry.mtimeMs : null, + Number.isFinite(manifestEntry?.size) ? 
manifestEntry.size : null, + chunkCount + ); + } + + updateTokenStats(db, mode, insertTokenStats); + validateSqliteDatabase(db, mode, { validateMode, emitOutput }); + }); + + try { + applyChanges(); + } catch (err) { + db.close(); + if (err instanceof IncrementalSkipError) { + return { used: false, reason: err.reason }; + } + throw err; + } + db.close(); + return { + used: true, + changedFiles: changed.length, + deletedFiles: deleted.length, + insertedChunks + }; +} diff --git a/src/storage/sqlite/build/manifest.js b/src/storage/sqlite/build/manifest.js new file mode 100644 index 000000000..40bc86c5f --- /dev/null +++ b/src/storage/sqlite/build/manifest.js @@ -0,0 +1,74 @@ +import { normalizeFilePath } from '../utils.js'; + +export function getFileManifest(db, mode) { + const rows = db.prepare('SELECT file, hash, mtimeMs, size FROM file_manifest WHERE mode = ?') + .all(mode); + const map = new Map(); + for (const row of rows) { + map.set(normalizeFilePath(row.file), row); + } + return map; +} + +export function isManifestMatch(entry, dbEntry) { + if (!dbEntry) return false; + if (entry?.hash && dbEntry.hash) return entry.hash === dbEntry.hash; + const mtimeMatch = Number.isFinite(entry?.mtimeMs) && Number.isFinite(dbEntry?.mtimeMs) + ? entry.mtimeMs === dbEntry.mtimeMs + : false; + const sizeMatch = Number.isFinite(entry?.size) && Number.isFinite(dbEntry?.size) + ? entry.size === dbEntry.size + : false; + return mtimeMatch && sizeMatch; +} + +export function normalizeManifestFiles(manifestFiles) { + const entries = []; + const map = new Map(); + const conflicts = []; + for (const [file, entry] of Object.entries(manifestFiles || {})) { + const normalized = normalizeFilePath(file); + const record = { file, normalized, entry }; + const existing = map.get(normalized); + if (!existing) { + map.set(normalized, record); + continue; + } + if (isManifestMatch(entry, existing.entry)) { + if (!existing.entry?.hash && entry?.hash) { + map.set(normalized, record); + } + continue; + } + const score = (candidate) => (candidate?.hash ? 3 : 0) + + (Number.isFinite(candidate?.mtimeMs) ? 1 : 0) + + (Number.isFinite(candidate?.size) ? 
1 : 0); + if (score(entry) > score(existing.entry)) { + map.set(normalized, record); + } + conflicts.push(normalized); + } + entries.push(...map.values()); + return { entries, map, conflicts }; +} + +export function diffFileManifests(manifestEntries, dbFiles) { + const changed = []; + const deleted = []; + const manifestSet = new Set(); + + for (const record of manifestEntries || []) { + if (!record?.normalized) continue; + manifestSet.add(record.normalized); + const dbEntry = dbFiles.get(record.normalized); + if (!isManifestMatch(record.entry, dbEntry)) { + changed.push(record); + } + } + + for (const [file] of dbFiles.entries()) { + if (!manifestSet.has(file)) deleted.push(file); + } + + return { changed, deleted }; +} diff --git a/src/storage/sqlite/build/pragmas.js b/src/storage/sqlite/build/pragmas.js new file mode 100644 index 000000000..0dbeea388 --- /dev/null +++ b/src/storage/sqlite/build/pragmas.js @@ -0,0 +1,12 @@ +export const applyBuildPragmas = (db) => { + try { db.pragma('journal_mode = WAL'); } catch {} + try { db.pragma('synchronous = OFF'); } catch {} + try { db.pragma('temp_store = MEMORY'); } catch {} + try { db.pragma('cache_size = -200000'); } catch {} + try { db.pragma('mmap_size = 268435456'); } catch {} +}; + +export const restoreBuildPragmas = (db) => { + try { db.pragma('synchronous = NORMAL'); } catch {} + try { db.pragma('temp_store = DEFAULT'); } catch {} +}; diff --git a/src/storage/sqlite/build/statements.js b/src/storage/sqlite/build/statements.js new file mode 100644 index 000000000..b0736b503 --- /dev/null +++ b/src/storage/sqlite/build/statements.js @@ -0,0 +1,74 @@ +export const createInsertStatements = (db) => { + const insertChunk = db.prepare(` + INSERT OR REPLACE INTO chunks ( + id, chunk_id, mode, file, start, end, startLine, endLine, ext, kind, name, + headline, preContext, postContext, weight, tokens, ngrams, codeRelations, + docmeta, stats, complexity, lint, externalDocs, last_modified, last_author, + churn, chunk_authors + ) VALUES ( + @id, @chunk_id, @mode, @file, @start, @end, @startLine, @endLine, @ext, @kind, + @name, @headline, @preContext, @postContext, @weight, @tokens, @ngrams, + @codeRelations, @docmeta, @stats, @complexity, @lint, @externalDocs, + @last_modified, @last_author, @churn, @chunk_authors + ); + `); + + const insertFts = db.prepare(` + INSERT OR REPLACE INTO chunks_fts (rowid, mode, file, name, signature, kind, headline, doc, tokens) + VALUES (@id, @mode, @file, @name, @signature, @kind, @headline, @doc, @tokensText); + `); + + const insertTokenVocab = db.prepare( + 'INSERT OR REPLACE INTO token_vocab (mode, token_id, token) VALUES (?, ?, ?)' + ); + const insertTokenPosting = db.prepare( + 'INSERT OR REPLACE INTO token_postings (mode, token_id, doc_id, tf) VALUES (?, ?, ?, ?)' + ); + const insertDocLength = db.prepare( + 'INSERT OR REPLACE INTO doc_lengths (mode, doc_id, len) VALUES (?, ?, ?)' + ); + const insertTokenStats = db.prepare( + 'INSERT OR REPLACE INTO token_stats (mode, avg_doc_len, total_docs) VALUES (?, ?, ?)' + ); + const insertPhraseVocab = db.prepare( + 'INSERT OR REPLACE INTO phrase_vocab (mode, phrase_id, ngram) VALUES (?, ?, ?)' + ); + const insertPhrasePosting = db.prepare( + 'INSERT OR REPLACE INTO phrase_postings (mode, phrase_id, doc_id) VALUES (?, ?, ?)' + ); + const insertChargramVocab = db.prepare( + 'INSERT OR REPLACE INTO chargram_vocab (mode, gram_id, gram) VALUES (?, ?, ?)' + ); + const insertChargramPosting = db.prepare( + 'INSERT OR REPLACE INTO chargram_postings (mode, gram_id, doc_id) 
VALUES (?, ?, ?)' + ); + const insertMinhash = db.prepare( + 'INSERT OR REPLACE INTO minhash_signatures (mode, doc_id, sig) VALUES (?, ?, ?)' + ); + const insertDense = db.prepare( + 'INSERT OR REPLACE INTO dense_vectors (mode, doc_id, vector) VALUES (?, ?, ?)' + ); + const insertDenseMeta = db.prepare( + 'INSERT OR REPLACE INTO dense_meta (mode, dims, scale, model) VALUES (?, ?, ?, ?)' + ); + const insertFileManifest = db.prepare( + 'INSERT OR REPLACE INTO file_manifest (mode, file, hash, mtimeMs, size, chunk_count) VALUES (?, ?, ?, ?, ?, ?)' + ); + + return { + insertChunk, + insertFts, + insertTokenVocab, + insertTokenPosting, + insertDocLength, + insertTokenStats, + insertPhraseVocab, + insertPhrasePosting, + insertChargramVocab, + insertChargramPosting, + insertMinhash, + insertDense, + insertDenseMeta, + insertFileManifest + }; +}; diff --git a/src/storage/sqlite/build/validate.js b/src/storage/sqlite/build/validate.js new file mode 100644 index 000000000..829d31585 --- /dev/null +++ b/src/storage/sqlite/build/validate.js @@ -0,0 +1,83 @@ +import { REQUIRED_TABLES } from '../schema.js'; +import { hasRequiredTables } from '../utils.js'; + +export function getSchemaVersion(db) { + try { + const value = db.pragma('user_version', { simple: true }); + return Number.isFinite(value) ? value : null; + } catch { + return null; + } +} + +export function validateSqliteDatabase(db, mode, options = {}) { + const validateMode = options.validateMode || 'off'; + if (validateMode === 'off') return; + + const errors = []; + if (!hasRequiredTables(db, REQUIRED_TABLES)) { + errors.push('missing required tables'); + } + + const pragmaName = validateMode === 'full' ? 'integrity_check' : 'quick_check'; + try { + const rows = db.prepare(`PRAGMA ${pragmaName}`).all(); + const messages = []; + for (const row of rows) { + for (const value of Object.values(row)) { + if (value !== 'ok') messages.push(value); + } + } + if (messages.length) { + errors.push(`${pragmaName} failed: ${messages.join('; ')}`); + } + } catch (err) { + errors.push(`${pragmaName} failed: ${err?.message || err}`); + } + + const expected = options.expected || {}; + const expectedChunks = Number.isFinite(expected.chunks) ? expected.chunks : null; + if (expectedChunks !== null) { + const chunkCount = db.prepare('SELECT COUNT(*) AS total FROM chunks WHERE mode = ?') + .get(mode)?.total ?? 0; + if (chunkCount !== expectedChunks) { + errors.push(`chunks=${chunkCount} expected=${expectedChunks}`); + } + const ftsCount = db.prepare('SELECT COUNT(*) AS total FROM chunks_fts WHERE mode = ?') + .get(mode)?.total ?? 0; + if (ftsCount !== expectedChunks) { + errors.push(`chunks_fts=${ftsCount} expected=${expectedChunks}`); + } + const lengthCount = db.prepare('SELECT COUNT(*) AS total FROM doc_lengths WHERE mode = ?') + .get(mode)?.total ?? 0; + if (lengthCount !== expectedChunks) { + errors.push(`doc_lengths=${lengthCount} expected=${expectedChunks}`); + } + } + + const expectedDense = Number.isFinite(expected.dense) ? expected.dense : null; + if (expectedDense !== null) { + const denseCount = db.prepare('SELECT COUNT(*) AS total FROM dense_vectors WHERE mode = ?') + .get(mode)?.total ?? 0; + if (denseCount !== expectedDense) { + errors.push(`dense_vectors=${denseCount} expected=${expectedDense}`); + } + } + + const expectedMinhash = Number.isFinite(expected.minhash) ? expected.minhash : null; + if (expectedMinhash !== null) { + const minhashCount = db.prepare( + 'SELECT COUNT(*) AS total FROM minhash_signatures WHERE mode = ?' + ).get(mode)?.total ?? 
0; + if (minhashCount !== expectedMinhash) { + errors.push(`minhash_signatures=${minhashCount} expected=${expectedMinhash}`); + } + } + + if (errors.length) { + throw new Error(`[sqlite] Validation (${validateMode}) failed for ${mode}: ${errors.join(', ')}`); + } + if (options.emitOutput) { + console.log(`[sqlite] Validation (${validateMode}) ok for ${mode}.`); + } +} diff --git a/src/storage/sqlite/build/vocab.js b/src/storage/sqlite/build/vocab.js new file mode 100644 index 000000000..1fd8e585c --- /dev/null +++ b/src/storage/sqlite/build/vocab.js @@ -0,0 +1,76 @@ +import { chunkArray } from '../utils.js'; + +export function getVocabCount(db, mode, table) { + const row = db.prepare(`SELECT COUNT(*) AS total FROM ${table} WHERE mode = ?`) + .get(mode) || {}; + return Number.isFinite(row.total) ? row.total : 0; +} + +export function fetchVocabRows(db, mode, table, idColumn, valueColumn, values) { + const unique = Array.from(new Set(values.filter(Boolean))); + if (!unique.length) return []; + const rows = []; + for (const chunk of chunkArray(unique)) { + const placeholders = chunk.map(() => '?').join(','); + const stmt = db.prepare( + `SELECT ${idColumn} AS id, ${valueColumn} AS value FROM ${table} ` + + `WHERE mode = ? AND ${valueColumn} IN (${placeholders})` + ); + rows.push(...stmt.all(mode, ...chunk)); + } + return rows; +} + +export function ensureVocabIds( + db, + mode, + table, + idColumn, + valueColumn, + values, + insertStmt, + options = {} +) { + const unique = Array.from(new Set(values.filter(Boolean))); + const totalBefore = getVocabCount(db, mode, table); + if (!unique.length) { + return { map: new Map(), inserted: 0, total: totalBefore, skip: false }; + } + const existing = fetchVocabRows(db, mode, table, idColumn, valueColumn, unique); + const map = new Map(existing.map((row) => [row.value, row.id])); + const missing = unique.filter((value) => !map.has(value)); + if (!missing.length) { + return { map, inserted: 0, total: totalBefore, skip: false }; + } + + const limits = options?.limits || null; + if (limits && totalBefore > 0) { + const ratio = missing.length / totalBefore; + const ratioLimit = Number.isFinite(limits.ratio) ? limits.ratio : null; + const absLimit = Number.isFinite(limits.absolute) ? limits.absolute : null; + if ((ratioLimit !== null && ratio > ratioLimit) || (absLimit !== null && missing.length > absLimit)) { + return { + map, + inserted: 0, + total: totalBefore, + skip: true, + reason: `${table} growth ${missing.length}/${totalBefore}` + }; + } + } + + missing.sort(); + const maxRow = db.prepare(`SELECT MAX(${idColumn}) AS maxId FROM ${table} WHERE mode = ?`) + .get(mode); + let nextId = Number.isFinite(maxRow?.maxId) ? 
maxRow.maxId + 1 : 0; + const insertTx = db.transaction(() => { + for (const value of missing) { + insertStmt.run(mode, nextId, value); + map.set(value, nextId); + nextId += 1; + } + }); + insertTx(); + + return { map, inserted: missing.length, total: totalBefore + missing.length, skip: false }; +} diff --git a/src/sqlite/incremental.js b/src/storage/sqlite/incremental.js similarity index 100% rename from src/sqlite/incremental.js rename to src/storage/sqlite/incremental.js diff --git a/src/sqlite/schema.js b/src/storage/sqlite/schema.js similarity index 94% rename from src/sqlite/schema.js rename to src/storage/sqlite/schema.js index e2efc89f5..57e19b395 100644 --- a/src/sqlite/schema.js +++ b/src/storage/sqlite/schema.js @@ -1,4 +1,4 @@ -export const SCHEMA_VERSION = 5; +export const SCHEMA_VERSION = 7; export const REQUIRED_TABLES = [ 'chunks', @@ -17,7 +17,7 @@ export const REQUIRED_TABLES = [ 'file_manifest' ]; -export const CREATE_TABLES_SQL = ` +export const CREATE_TABLES_BASE_SQL = ` DROP TABLE IF EXISTS chunks_fts; DROP TABLE IF EXISTS chunks; DROP TABLE IF EXISTS token_postings; @@ -35,6 +35,7 @@ export const CREATE_TABLES_SQL = ` CREATE TABLE chunks ( id INTEGER PRIMARY KEY, + chunk_id TEXT, mode TEXT, file TEXT, start INTEGER, @@ -61,13 +62,14 @@ export const CREATE_TABLES_SQL = ` churn REAL, chunk_authors TEXT ); - CREATE INDEX idx_chunks_file ON chunks (mode, file); CREATE VIRTUAL TABLE chunks_fts USING fts5( mode UNINDEXED, file, name, + signature, kind, headline, + doc, tokens, tokenize = 'unicode61' ); @@ -85,7 +87,6 @@ export const CREATE_TABLES_SQL = ` tf INTEGER NOT NULL, PRIMARY KEY (mode, token_id, doc_id) ); - CREATE INDEX idx_token_postings_token ON token_postings (mode, token_id); CREATE TABLE doc_lengths ( mode TEXT NOT NULL, doc_id INTEGER NOT NULL, @@ -110,7 +111,6 @@ export const CREATE_TABLES_SQL = ` doc_id INTEGER NOT NULL, PRIMARY KEY (mode, phrase_id, doc_id) ); - CREATE INDEX idx_phrase_postings_phrase ON phrase_postings (mode, phrase_id); CREATE TABLE chargram_vocab ( mode TEXT NOT NULL, gram_id INTEGER NOT NULL, @@ -124,7 +124,6 @@ export const CREATE_TABLES_SQL = ` doc_id INTEGER NOT NULL, PRIMARY KEY (mode, gram_id, doc_id) ); - CREATE INDEX idx_chargram_postings_gram ON chargram_postings (mode, gram_id); CREATE TABLE minhash_signatures ( mode TEXT NOT NULL, doc_id INTEGER NOT NULL, @@ -152,5 +151,17 @@ export const CREATE_TABLES_SQL = ` chunk_count INTEGER, PRIMARY KEY (mode, file) ); +`; + +export const CREATE_INDEXES_SQL = ` + CREATE INDEX idx_chunks_file ON chunks (mode, file); + CREATE INDEX idx_token_postings_token ON token_postings (mode, token_id); + CREATE INDEX idx_phrase_postings_phrase ON phrase_postings (mode, phrase_id); + CREATE INDEX idx_chargram_postings_gram ON chargram_postings (mode, gram_id); CREATE INDEX idx_file_manifest_mode_file ON file_manifest (mode, file); `; + +export const CREATE_TABLES_SQL = ` +${CREATE_TABLES_BASE_SQL} +${CREATE_INDEXES_SQL} +`; diff --git a/src/storage/sqlite/utils.js b/src/storage/sqlite/utils.js new file mode 100644 index 000000000..35952e5a5 --- /dev/null +++ b/src/storage/sqlite/utils.js @@ -0,0 +1,184 @@ +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { + MAX_JSON_BYTES, + loadChunkMeta, + loadTokenPostings, + readJsonFile +} from '../../shared/artifact-io.js'; + +/** + * Split an array into fixed-size chunks. 
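+ * The default size of 900 presumably keeps `WHERE ... IN (...)` parameter
+ * lists under SQLite's classic 999 bound-variable limit. For example,
+ * chunkArray(['a', 'b', 'c'], 2) returns [['a', 'b'], ['c']].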
+ * @param {Array} items + * @param {number} [size] + * @returns {Array>} + */ +export function chunkArray(items, size = 900) { + const chunks = []; + for (let i = 0; i < items.length; i += size) { + chunks.push(items.slice(i, i + size)); + } + return chunks; +} + +/** + * Return the set of table names in a SQLite database. + * @param {import('better-sqlite3').Database} db + * @returns {Set} + */ +export function getTableNames(db) { + const rows = db.prepare("SELECT name FROM sqlite_master WHERE type='table'").all(); + return new Set(rows.map((row) => row.name)); +} + +/** + * Check that all required tables exist. + * @param {import('better-sqlite3').Database} db + * @param {string[]} requiredTables + * @returns {boolean} + */ +export function hasRequiredTables(db, requiredTables) { + const tableNames = getTableNames(db); + return requiredTables.every((name) => tableNames.has(name)); +} + +/** + * Normalize a file path to POSIX separators. + * @param {string} value + * @returns {string} + */ +export function normalizeFilePath(value) { + if (typeof value !== 'string') return value; + return value.replace(/\\/g, '/'); +} + +/** + * Read and parse JSON from disk. + * @param {string} filePath + * @returns {any} + */ +export function readJson(filePath) { + return readJsonFile(filePath, { maxBytes: MAX_JSON_BYTES }); +} + +/** + * Read JSON from disk if it exists; otherwise return null. + * @param {string} dir + * @param {string} name + * @returns {any|null} + */ +export function loadOptional(dir, name) { + const target = path.join(dir, name); + const hasTarget = fs.existsSync(target) || fs.existsSync(`${target}.bak`); + const hasGz = name.endsWith('.json') + && (fs.existsSync(`${target}.gz`) || fs.existsSync(`${target}.gz.bak`)); + if (!hasTarget && !hasGz) { + return null; + } + try { + return readJson(target); + } catch (err) { + if (err?.code === 'ERR_JSON_TOO_LARGE') { + console.warn(`[sqlite] Skipping ${name}: ${err.message}`); + return null; + } + throw err; + } +} + +/** + * Load file-backed index artifacts from a directory. + * @param {string} dir + * @param {string} modelId + * @returns {object|null} + */ +export function loadIndex(dir, modelId) { + const chunkMetaPath = path.join(dir, 'chunk_meta.json'); + const chunkMetaJsonlPath = path.join(dir, 'chunk_meta.jsonl'); + const chunkMetaMetaPath = path.join(dir, 'chunk_meta.meta.json'); + if (!fs.existsSync(chunkMetaPath) + && !fs.existsSync(chunkMetaJsonlPath) + && !fs.existsSync(chunkMetaMetaPath)) { + return null; + } + const chunkMeta = loadChunkMeta(dir, { maxBytes: MAX_JSON_BYTES }); + const denseVec = loadOptional(dir, 'dense_vectors_uint8.json'); + if (denseVec && !denseVec.model) denseVec.model = modelId || null; + return { + chunkMeta, + fileMeta: loadOptional(dir, 'file_meta.json'), + denseVec, + phraseNgrams: loadOptional(dir, 'phrase_ngrams.json'), + chargrams: loadOptional(dir, 'chargram_postings.json'), + minhash: loadOptional(dir, 'minhash_signatures.json'), + tokenPostings: (() => { + const direct = loadOptional(dir, 'token_postings.json'); + if (direct) return direct; + try { + return loadTokenPostings(dir, { maxBytes: MAX_JSON_BYTES }); + } catch { + return null; + } + })() + }; +} + +const SQLITE_SIDECARS = ['-wal', '-shm']; + +async function removeSqliteSidecars(basePath) { + await Promise.all(SQLITE_SIDECARS.map(async (suffix) => { + try { + await fsPromises.rm(`${basePath}${suffix}`, { force: true }); + } catch {} + })); +} + +/** + * Atomically replace a sqlite database, cleaning up WAL/SHM sidecars. 
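+ * Strategy, as implemented below: rename the live db aside to `backupPath`,
+ * rename the temp db into place, and if that rename fails with
+ * EEXIST/EPERM/ENOTEMPTY (typical Windows rename semantics), force-remove
+ * the target and retry once. The backup is deleted afterwards unless
+ * `options.keepBackup` is true.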
+ * @param {string} tempDbPath + * @param {string} finalDbPath + * @param {{keepBackup?:boolean,backupPath?:string}} [options] + */ +export async function replaceSqliteDatabase(tempDbPath, finalDbPath, options = {}) { + const keepBackup = options.keepBackup === true; + const backupPath = options.backupPath || `${finalDbPath}.bak`; + const finalExists = fs.existsSync(finalDbPath); + + await removeSqliteSidecars(finalDbPath); + await removeSqliteSidecars(tempDbPath); + + let backupAvailable = fs.existsSync(backupPath); + if (finalExists && !backupAvailable) { + try { + await fsPromises.rename(finalDbPath, backupPath); + backupAvailable = true; + } catch (err) { + if (err?.code !== 'ENOENT') { + backupAvailable = fs.existsSync(backupPath); + } + } + } + + try { + await fsPromises.rename(tempDbPath, finalDbPath); + } catch (err) { + if (err?.code !== 'EEXIST' && err?.code !== 'EPERM' && err?.code !== 'ENOTEMPTY') { + throw err; + } + if (!backupAvailable) { + throw err; + } + try { + await fsPromises.rm(finalDbPath, { force: true }); + } catch {} + await fsPromises.rename(tempDbPath, finalDbPath); + } + + if (!keepBackup) { + try { + await fsPromises.rm(backupPath, { force: true }); + } catch {} + } + await removeSqliteSidecars(backupPath); +} diff --git a/src/sqlite/vector.js b/src/storage/sqlite/vector.js similarity index 100% rename from src/sqlite/vector.js rename to src/storage/sqlite/vector.js diff --git a/sublime/PairOfCleats/Default.sublime-commands b/sublime/PairOfCleats/Default.sublime-commands new file mode 100644 index 000000000..673771531 --- /dev/null +++ b/sublime/PairOfCleats/Default.sublime-commands @@ -0,0 +1,94 @@ +[ + { + "caption": "PairOfCleats: Open Settings", + "command": "pair_of_cleats_open_settings" + }, + { + "caption": "PairOfCleats: Validate Settings", + "command": "pair_of_cleats_validate_settings" + }, + { + "caption": "PairOfCleats: Search", + "command": "pair_of_cleats_search" + }, + { + "caption": "PairOfCleats: Search (With Options)", + "command": "pair_of_cleats_search_with_options" + }, + { + "caption": "PairOfCleats: Search Selection", + "command": "pair_of_cleats_search_selection" + }, + { + "caption": "PairOfCleats: Search Symbol Under Cursor", + "command": "pair_of_cleats_search_symbol_under_cursor" + }, + { + "caption": "PairOfCleats: Search History", + "command": "pair_of_cleats_search_history" + }, + { + "caption": "PairOfCleats: Repeat Last Search", + "command": "pair_of_cleats_repeat_last_search" + }, + { + "caption": "PairOfCleats: Explain Search", + "command": "pair_of_cleats_explain_search" + }, + { + "caption": "PairOfCleats: Index Build (Code)", + "command": "pair_of_cleats_index_build_code" + }, + { + "caption": "PairOfCleats: Index Build (Prose)", + "command": "pair_of_cleats_index_build_prose" + }, + { + "caption": "PairOfCleats: Index Build (All)", + "command": "pair_of_cleats_index_build_all" + }, + { + "caption": "PairOfCleats: Index Watch Start", + "command": "pair_of_cleats_index_watch_start" + }, + { + "caption": "PairOfCleats: Index Watch Stop", + "command": "pair_of_cleats_index_watch_stop" + }, + { + "caption": "PairOfCleats: Index Validate", + "command": "pair_of_cleats_index_validate" + }, + { + "caption": "PairOfCleats: Open Index Directory", + "command": "pair_of_cleats_open_index_directory" + }, + { + "caption": "PairOfCleats: Map (Repo)", + "command": "pair_of_cleats_map_repo" + }, + { + "caption": "PairOfCleats: Map (Current Folder)", + "command": "pair_of_cleats_map_current_folder" + }, + { + "caption": "PairOfCleats: Map 
(Current File)", + "command": "pair_of_cleats_map_current_file" + }, + { + "caption": "PairOfCleats: Map (Symbol Under Cursor)", + "command": "pair_of_cleats_map_symbol_under_cursor" + }, + { + "caption": "PairOfCleats: Map (Selection)", + "command": "pair_of_cleats_map_selection" + }, + { + "caption": "PairOfCleats: Map Jump to Node", + "command": "pair_of_cleats_map_jump_to_node" + }, + { + "caption": "PairOfCleats: Map Open Last Viewer", + "command": "pair_of_cleats_map_open_last_viewer" + } +] diff --git a/sublime/PairOfCleats/Default.sublime-keymap b/sublime/PairOfCleats/Default.sublime-keymap new file mode 100644 index 000000000..fe51488c7 --- /dev/null +++ b/sublime/PairOfCleats/Default.sublime-keymap @@ -0,0 +1 @@ +[] diff --git a/sublime/PairOfCleats/Main.sublime-menu b/sublime/PairOfCleats/Main.sublime-menu new file mode 100644 index 000000000..db1842c2d --- /dev/null +++ b/sublime/PairOfCleats/Main.sublime-menu @@ -0,0 +1,15 @@ +[ + { + "id": "preferences", + "children": [ + { + "caption": "PairOfCleats Settings", + "command": "pair_of_cleats_open_settings" + }, + { + "caption": "PairOfCleats Validate Settings", + "command": "pair_of_cleats_validate_settings" + } + ] + } +] diff --git a/sublime/PairOfCleats/PairOfCleats.sublime-settings b/sublime/PairOfCleats/PairOfCleats.sublime-settings new file mode 100644 index 000000000..0e15ef13f --- /dev/null +++ b/sublime/PairOfCleats/PairOfCleats.sublime-settings @@ -0,0 +1,41 @@ +{ + "pairofcleats_path": "", + "node_path": "", + "index_mode_default": "both", + "search_backend_default": "", + "open_results_in": "quick_panel", + "search_limit": 25, + "results_buffer_threshold": 50, + "history_limit": 25, + "search_prompt_options": false, + "index_watch_scope": "repo", + "index_watch_folder": "", + "index_watch_mode": "all", + "index_watch_poll_ms": 2000, + "index_watch_debounce_ms": 500, + "map_type_default": "combined", + "map_format_default": "html-iso", + "map_prompt_options": false, + "map_output_dir": ".pairofcleats/maps", + "map_only_exported": false, + "map_collapse_default": "none", + "map_max_files": 200, + "map_max_members_per_file": 60, + "map_max_edges": 3000, + "map_top_k_by_degree": false, + "map_show_report_panel": null, + "map_stream_output": false, + "map_open_uri_template": "subl://open?file={file}&line={line}&column={column}", + "map_three_url": "", + "map_index_mode": "code", + "map_wasd_sensitivity": 16000, + "map_wasd_acceleration": 6000, + "map_wasd_max_speed": 24000, + "map_wasd_drag": 6, + "map_zoom_sensitivity": 0.1, + "profile": "", + "cache_root": "", + "embeddings_mode": "", + "node_options": "", + "env": {} +} diff --git a/sublime/PairOfCleats/README.md b/sublime/PairOfCleats/README.md new file mode 100644 index 000000000..76e994c5a --- /dev/null +++ b/sublime/PairOfCleats/README.md @@ -0,0 +1,115 @@ +# PairOfCleats Sublime Text + +PairOfCleats integration for Sublime Text 3. + +## Install + +- Copy or symlink `sublime/PairOfCleats` into your Sublime `Packages` directory. +- Ensure Node.js 18+ is available on PATH (or set `node_path`). +- Install the PairOfCleats CLI (global npm install or local repo checkout). + +## Package Control notes + +This package avoids external Python dependencies. It relies on the Node runtime +and the PairOfCleats CLI or local repo binaries. 
+ +## CLI discovery + +Resolution order: +1) `pairofcleats_path` setting (absolute or repo-relative) +2) `node_modules/.bin/pairofcleats` (repo-local) +3) `bin/pairofcleats.js` (repo-local) +4) `pairofcleats` on PATH + +If the selected path ends in `.js`, the plugin runs it with `node_path` (or `node`). + +## Settings + +Open the command palette and run `PairOfCleats: Open Settings` or `PairOfCleats: Validate Settings`. + +- `pairofcleats_path`: Path to the CLI binary or `bin/pairofcleats.js`. +- `node_path`: Optional override for the Node.js binary. +- `index_mode_default`: `code`, `prose`, or `both`. +- `search_backend_default`: `memory`, `sqlite`, `sqlite-fts`, or `lmdb`. +- `open_results_in`: `quick_panel`, `new_tab`, or `output_panel`. +- `search_limit`: Default `--top` value. +- `results_buffer_threshold`: When using `quick_panel`, switch to the output panel once results reach this count (0 disables). +- `history_limit`: Maximum queries stored per project. +- `search_prompt_options`: Prompt for mode/backend/limit each search. +- `index_watch_scope`: `repo` or `folder` for watch root selection. +- `index_watch_folder`: Optional folder path (absolute or repo-relative) when using `folder` scope. +- `index_watch_mode`: `all`, `code`, `prose`, `records`, or `extracted-prose`. +- `index_watch_poll_ms`: Watch polling interval in ms (when polling is enabled). +- `index_watch_debounce_ms`: Debounce interval for watch rebuilds (ms). +- `map_type_default`: `combined`, `imports`, `calls`, `usages`, or `dataflow`. +- `map_format_default`: `html-iso`, `html`, `svg`, `dot`, or `json`. +- `map_prompt_options`: Prompt for map type/format each run. +- `map_output_dir`: Output directory for map artifacts (absolute or repo-relative). +- `map_only_exported`: When true, include exported symbols only. +- `map_collapse_default`: `none`, `file`, or `dir`. +- `map_max_files`: Guardrail for file nodes. +- `map_max_members_per_file`: Guardrail for members per file. +- `map_max_edges`: Guardrail for edges. +- `map_top_k_by_degree`: Prefer top-k files by edge degree when truncating. +- `map_show_report_panel`: Set to true to show warnings/summary in an output panel. +- `map_stream_output`: Stream CLI output to the map panel. +- `map_open_uri_template`: URI template for the isometric viewer (Sublime links). +- `map_three_url`: Override three.js module path (default resolves from node_modules). +- `map_index_mode`: Index mode to read (`code` or `prose`). +- `map_wasd_sensitivity`: Isometric viewer WASD sensitivity. +- `map_wasd_acceleration`: Isometric viewer WASD acceleration. +- `map_wasd_max_speed`: Isometric viewer WASD max speed. +- `map_wasd_drag`: Isometric viewer WASD damping. +- `map_zoom_sensitivity`: Isometric viewer zoom sensitivity. +- `profile`: Sets `PAIROFCLEATS_PROFILE`. +- `cache_root`: Sets `PAIROFCLEATS_CACHE_ROOT`. +- `embeddings_mode`: Sets `PAIROFCLEATS_EMBEDDINGS`. +- `node_options`: Sets `PAIROFCLEATS_NODE_OPTIONS`. +- `env`: Extra environment overrides (merged with defaults). 
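+
+For reference, a minimal user override might look like this sketch (values are
+illustrative; any key left out falls back to the defaults listed above):
+
+```json
+{
+    "pairofcleats_path": "./bin/pairofcleats.js",
+    "index_mode_default": "code",
+    "search_backend_default": "sqlite",
+    "search_limit": 50,
+    "open_results_in": "output_panel"
+}
+```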
+ +## Commands + +- `PairOfCleats: Search` +- `PairOfCleats: Search (With Options)` +- `PairOfCleats: Search Selection` +- `PairOfCleats: Search Symbol Under Cursor` +- `PairOfCleats: Search History` +- `PairOfCleats: Repeat Last Search` +- `PairOfCleats: Explain Search` +- `PairOfCleats: Index Build (Code)` +- `PairOfCleats: Index Build (Prose)` +- `PairOfCleats: Index Build (All)` +- `PairOfCleats: Index Watch Start` +- `PairOfCleats: Index Watch Stop` +- `PairOfCleats: Index Validate` +- `PairOfCleats: Open Index Directory` +- `PairOfCleats: Map (Repo)` +- `PairOfCleats: Map (Current Folder)` +- `PairOfCleats: Map (Current File)` +- `PairOfCleats: Map (Symbol Under Cursor)` +- `PairOfCleats: Map (Selection)` +- `PairOfCleats: Map Jump to Node` +- `PairOfCleats: Map Open Last Viewer` + +## Project overrides + +In your `.sublime-project` file: + +```json +{ + "settings": { + "pairofcleats": { + "pairofcleats_path": "./bin/pairofcleats.js", + "env": { + "PAIROFCLEATS_PROFILE": "balanced" + } + } + } +} +``` + +## CLI output contract + +The Sublime integration is designed to use `--json` output so it can access full +metadata when available. It does not assume the compact JSON contract used by +other editors. diff --git a/sublime/PairOfCleats/__init__.py b/sublime/PairOfCleats/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sublime/PairOfCleats/commands/__init__.py b/sublime/PairOfCleats/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sublime/PairOfCleats/commands/index.py b/sublime/PairOfCleats/commands/index.py new file mode 100644 index 000000000..079a44b06 --- /dev/null +++ b/sublime/PairOfCleats/commands/index.py @@ -0,0 +1,373 @@ +import sublime +import sublime_plugin + +from ..lib import config +from ..lib import index_state +from ..lib import indexing +from ..lib import paths +from ..lib import runner +from ..lib import ui +from ..lib import watch + +INDEX_PANEL = 'pairofcleats-index' + + +def _resolve_repo_root(window): + return paths.resolve_repo_root(window, return_reason=True) + + +def _has_repo_root(window): + return paths.has_repo_root(window) + + +def _run_index_build(window, mode): + settings = config.get_settings(window) + repo_root, reason = _resolve_repo_root(window) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + if reason: + ui.show_status('PairOfCleats: {0}'.format(reason)) + + errors = config.validate_settings(settings, repo_root) + if errors: + ui.show_error('PairOfCleats settings need attention:\n- {0}'.format('\n- '.join(errors))) + return + + args = indexing.build_index_args(mode, repo_root=repo_root) + cli = paths.resolve_cli(settings, repo_root) + command = cli['command'] + full_args = list(cli.get('args_prefix') or []) + args + env = config.build_env(settings) + + ui.show_status('PairOfCleats: index build started ({0}).'.format(mode)) + + def on_done(result): + if result.returncode == 0: + index_state.record_last_build(window, mode) + ui.show_status('PairOfCleats: index build complete ({0}).'.format(mode)) + return + message = result.output.strip() or 'PairOfCleats index build failed.' 
+ ui.show_error(message) + + runner.run_process( + command, + full_args, + cwd=repo_root, + env=env, + window=window, + title='PairOfCleats index build', + capture_json=False, + on_done=on_done, + stream_output=True, + panel_name=INDEX_PANEL + ) + + +def _run_index_watch(window): + settings = config.get_settings(window) + repo_root, reason = _resolve_repo_root(window) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + if reason: + ui.show_status('PairOfCleats: {0}'.format(reason)) + + errors = config.validate_settings(settings, repo_root) + if errors: + ui.show_error('PairOfCleats settings need attention:\n- {0}'.format('\n- '.join(errors))) + return + + if watch.is_running(window): + active_root = watch.current_root(window) + message = 'PairOfCleats: watch already running.' + if active_root: + message = '{0} ({1})'.format(message, active_root) + ui.show_status(message) + return + + watch_root = paths.resolve_watch_root(window, settings) + if not watch_root: + ui.show_error('PairOfCleats: unable to resolve watch root.') + return + + mode = settings.get('index_watch_mode') or 'all' + poll_ms = settings.get('index_watch_poll_ms') + debounce_ms = settings.get('index_watch_debounce_ms') + + args = indexing.build_index_args( + mode, + repo_root=watch_root, + watch=True, + watch_poll_ms=poll_ms, + watch_debounce_ms=debounce_ms + ) + + cli = paths.resolve_cli(settings, repo_root) + command = cli['command'] + full_args = list(cli.get('args_prefix') or []) + args + env = config.build_env(settings) + + ui.show_status('PairOfCleats: watch started ({0}).'.format(watch_root)) + + def on_done(result): + watch.clear_if_done(window) + if result.returncode == 0: + ui.show_status('PairOfCleats: watch stopped.') + return + message = result.output.strip() or 'PairOfCleats watch failed.' + ui.show_error(message) + + handle = runner.run_process( + command, + full_args, + cwd=watch_root, + env=env, + window=window, + title='PairOfCleats index watch', + capture_json=False, + on_done=on_done, + stream_output=True, + panel_name=INDEX_PANEL + ) + watch.register(window, handle, watch_root) + + +def _run_index_watch_stop(window): + if watch.stop(window): + ui.show_status('PairOfCleats: watch stopping...') + else: + ui.show_status('PairOfCleats: no watch to stop.') + + +def _run_index_validate(window): + settings = config.get_settings(window) + repo_root, reason = _resolve_repo_root(window) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + if reason: + ui.show_status('PairOfCleats: {0}'.format(reason)) + + errors = config.validate_settings(settings, repo_root) + if errors: + ui.show_error('PairOfCleats settings need attention:\n- {0}'.format('\n- '.join(errors))) + return + + args = indexing.build_validate_args(repo_root=repo_root, json_output=True) + cli = paths.resolve_cli(settings, repo_root) + command = cli['command'] + full_args = list(cli.get('args_prefix') or []) + args + env = config.build_env(settings) + + ui.show_status('PairOfCleats: validating index...') + + def on_done(result): + if result.error: + ui.show_error(result.error) + return + if result.returncode not in (0, 1): + message = result.output.strip() or 'PairOfCleats index validate failed.' 
+ ui.show_error(message) + return + payload = result.payload + if not isinstance(payload, dict): + ui.show_error('PairOfCleats index validate returned invalid JSON.') + return + text = _format_validate_report(payload) + ui.write_output_panel(window, 'pairofcleats-validate', text) + if payload.get('ok'): + ui.show_status('PairOfCleats: index validation ok.') + else: + ui.show_error('PairOfCleats: index validation found issues.') + + runner.run_process( + command, + full_args, + cwd=repo_root, + env=env, + window=window, + title='PairOfCleats index validate', + capture_json=True, + on_done=on_done, + stream_output=False + ) + + +def _format_validate_report(payload): + lines = ['PairOfCleats index validation', ''] + root = payload.get('root') or '' + if root: + lines.append('Repo: {0}'.format(root)) + lines.append('Status: {0}'.format('ok' if payload.get('ok') else 'issues')) + lines.append('') + + modes = payload.get('modes') or {} + if isinstance(modes, dict): + for mode, entry in modes.items(): + if not isinstance(entry, dict): + continue + status = 'ok' if entry.get('ok') else 'missing' + path = entry.get('path') or '' + lines.append('{0}: {1}'.format(mode, status)) + if path: + lines.append(' {0}'.format(path)) + missing = entry.get('missing') + if isinstance(missing, list) and missing: + lines.append(' missing: {0}'.format(', '.join(missing))) + warnings = entry.get('warnings') + if isinstance(warnings, list) and warnings: + lines.append(' warnings: {0}'.format(', '.join(warnings))) + lines.append('') + + issues = payload.get('issues') + if isinstance(issues, list) and issues: + lines.append('Issues:') + for issue in issues: + lines.append('- {0}'.format(issue)) + lines.append('') + + warnings = payload.get('warnings') + if isinstance(warnings, list) and warnings: + lines.append('Warnings:') + for warning in warnings: + lines.append('- {0}'.format(warning)) + lines.append('') + + hints = payload.get('hints') + if isinstance(hints, list) and hints: + lines.append('Hints:') + for hint in hints: + lines.append('- {0}'.format(hint)) + lines.append('') + + return '\n'.join(lines).rstrip() + '\n' + + +def _run_open_index_dir(window): + settings = config.get_settings(window) + repo_root, reason = _resolve_repo_root(window) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + if reason: + ui.show_status('PairOfCleats: {0}'.format(reason)) + + errors = config.validate_settings(settings, repo_root) + if errors: + ui.show_error('PairOfCleats settings need attention:\n- {0}'.format('\n- '.join(errors))) + return + + args = indexing.build_config_dump_args(repo_root=repo_root, json_output=True) + cli = paths.resolve_cli(settings, repo_root) + command = cli['command'] + full_args = list(cli.get('args_prefix') or []) + args + env = config.build_env(settings) + + def on_done(result): + if result.error: + ui.show_error(result.error) + return + if result.returncode != 0: + message = result.output.strip() or 'PairOfCleats config dump failed.' 
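+            # Without a successful config dump there is no derived.repoCacheRoot
+            # to open, so surface the raw CLI output instead.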
+ ui.show_error(message) + return + payload = result.payload + if not isinstance(payload, dict): + ui.show_error('PairOfCleats config dump returned invalid JSON.') + return + derived = payload.get('derived') or {} + repo_cache_root = derived.get('repoCacheRoot') + if not repo_cache_root: + ui.show_error('PairOfCleats: repo cache root unavailable.') + return + window.run_command('open_dir', {'dir': repo_cache_root}) + + runner.run_process( + command, + full_args, + cwd=repo_root, + env=env, + window=window, + title='PairOfCleats config dump', + capture_json=True, + on_done=on_done, + stream_output=False + ) + + +class PairOfCleatsIndexBuildCodeCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_index_build(self.window, 'code') + + +class PairOfCleatsIndexBuildProseCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_index_build(self.window, 'prose') + + +class PairOfCleatsIndexBuildAllCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_index_build(self.window, 'all') + + +class PairOfCleatsIndexWatchStartCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_index_watch(self.window) + + +class PairOfCleatsIndexWatchStopCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_index_watch_stop(self.window) + + +class PairOfCleatsIndexValidateCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_index_validate(self.window) + + +class PairOfCleatsOpenIndexDirectoryCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_open_index_dir(self.window) diff --git a/sublime/PairOfCleats/commands/map.py b/sublime/PairOfCleats/commands/map.py new file mode 100644 index 000000000..385b075c3 --- /dev/null +++ b/sublime/PairOfCleats/commands/map.py @@ -0,0 +1,418 @@ +import json +import os +import webbrowser +from urllib.parse import quote + +import sublime +import sublime_plugin + +from ..lib import config +from ..lib import map as map_lib +from ..lib import map_state +from ..lib import paths +from ..lib import results +from ..lib import runner +from ..lib import ui + +MAP_TYPE_CHOICES = [ + ('combined', 'combined (imports + calls + usages + dataflow)'), + ('imports', 'imports only'), + ('calls', 'calls only'), + ('usages', 'usages only'), + ('dataflow', 'dataflow only') +] + +MAP_FORMAT_CHOICES = [ + ('html-iso', 'isometric HTML (three.js)'), + ('html', 'graphviz HTML'), + ('svg', 'graphviz SVG'), + ('dot', 'graphviz DOT'), + ('json', 'map model JSON') +] + + +def _resolve_repo_root(window, path_hint=None): + return paths.resolve_repo_root(window, return_reason=True, path_hint=path_hint) + + +def _has_repo_root(window, path_hint=None): + return paths.has_repo_root(window, path_hint=path_hint) + + +def _extract_selection(view): + if view is None: + return '' + for region in view.sel(): + if not region.empty(): + return view.substr(region) + return '' + + +def _extract_symbol(view): + if view is None: + return '' + selection = view.sel() + if not selection: + return '' + region = selection[0] + 
word = view.word(region) + return view.substr(word) + + +def _relative_focus(repo_root, path_value): + if not path_value: + return '' + if os.path.isabs(path_value): + try: + rel = os.path.relpath(path_value, repo_root) + return rel.replace('\\', '/') + except Exception: + return path_value.replace('\\', '/') + return path_value.replace('\\', '/') + + +def _open_in_browser(path_value): + if not path_value: + return + try: + resolved = os.path.abspath(path_value) + url = 'file:///{0}'.format(quote(resolved.replace('\\', '/'))) + except Exception: + url = 'file:///{0}'.format(path_value.replace('\\', '/')) + try: + webbrowser.open_new_tab(url) + except Exception: + ui.show_error('PairOfCleats: failed to open browser.') + + +def _render_report(payload): + lines = ['PairOfCleats map report', ''] + if not isinstance(payload, dict): + return '\n'.join(lines) + summary = payload.get('summary') or {} + counts = summary.get('counts') or {} + lines.append('files: {0}'.format(counts.get('files') or 0)) + lines.append('members: {0}'.format(counts.get('members') or 0)) + lines.append('edges: {0}'.format(counts.get('edges') or 0)) + warnings = payload.get('warnings') or [] + if warnings: + lines.append('') + lines.append('Warnings:') + for warning in warnings: + lines.append('- {0}'.format(warning)) + return '\n'.join(lines) + '\n' + + +def _offer_rebuild(window, warnings): + if not warnings or window is None: + return + needs = any( + 'dataflow metadata missing' in warning or 'controlFlow metadata missing' in warning + for warning in warnings + ) + if not needs: + return + + def on_select(index): + if index == 0: + window.run_command('pair_of_cleats_index_build_all') + + window.show_quick_panel( + ['Rebuild index with dataflow/control-flow enabled', 'Dismiss'], + on_select + ) + +def _prompt_map_type(window, settings, on_done): + default_type = map_lib.resolve_map_type(settings) + labels = [entry[1] for entry in MAP_TYPE_CHOICES] + selected_index = 0 + for idx, (value, _) in enumerate(MAP_TYPE_CHOICES): + if value == default_type: + selected_index = idx + break + + def on_select(index): + if index < 0: + return + on_done(MAP_TYPE_CHOICES[index][0]) + + window.show_quick_panel(labels, on_select, selected_index=selected_index) + + +def _prompt_map_format(window, settings, on_done): + default_format = map_lib.resolve_map_format(settings) + labels = [entry[1] for entry in MAP_FORMAT_CHOICES] + selected_index = 0 + for idx, (value, _) in enumerate(MAP_FORMAT_CHOICES): + if value == default_format: + selected_index = idx + break + + def on_select(index): + if index < 0: + return + on_done(MAP_FORMAT_CHOICES[index][0]) + + window.show_quick_panel(labels, on_select, selected_index=selected_index) + + +def _dispatch_map(window, scope, focus, map_type=None, map_format=None, path_hint=None): + settings = config.get_settings(window) + repo_root, reason = _resolve_repo_root(window, path_hint=path_hint) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + if reason: + ui.show_status('PairOfCleats: {0}'.format(reason)) + + errors = config.validate_settings(settings, repo_root) + if errors: + message = 'PairOfCleats settings need attention:\n- {0}'.format( + '\n- '.join(errors) + ) + ui.show_error(message) + return + + map_type = map_type or map_lib.resolve_map_type(settings) + map_format = map_format or map_lib.resolve_map_format(settings) + output_path, model_path, node_list_path = map_lib.build_output_paths( + repo_root, settings, scope, map_type, map_format + ) + args = 
map_lib.build_map_args( + repo_root, + settings, + scope, + focus, + map_type, + map_format, + output_path, + model_path, + node_list_path + ) + + cli = paths.resolve_cli(settings, repo_root) + command = cli['command'] + full_args = list(cli.get('args_prefix') or []) + args + env = config.build_env(settings) + + ui.show_status('PairOfCleats: generating map...') + + def on_done(result): + if result.returncode != 0: + message = result.output.strip() or 'PairOfCleats map failed.' + ui.show_error(message) + return + if result.error: + ui.show_error(result.error) + return + payload = result.payload + if not isinstance(payload, dict) or not payload.get('ok'): + ui.show_error('PairOfCleats map returned invalid JSON.') + return + + map_state.record_last_map(window, payload) + report_text = _render_report(payload) + if settings.get('map_show_report_panel'): + ui.write_output_panel(window, 'pairofcleats-map', report_text) + _offer_rebuild(window, payload.get('warnings') or []) + + resolved_path = payload.get('outPath') or output_path + resolved_format = payload.get('format') or map_format + + if resolved_format in ('html', 'html-iso', 'svg'): + _open_in_browser(resolved_path) + elif resolved_path: + window.open_file(resolved_path) + + runner.run_process( + command, + full_args, + cwd=repo_root, + env=env, + window=window, + title='PairOfCleats map', + capture_json=True, + on_done=on_done, + stream_output=settings.get('map_stream_output') is True, + panel_name='pairofcleats-map' + ) + + +def _run_with_options(window, scope, focus, map_type=None, map_format=None, path_hint=None): + settings = config.get_settings(window) + if not settings.get('map_prompt_options'): + _dispatch_map(window, scope, focus, map_type=map_type, map_format=map_format, path_hint=path_hint) + return + + def after_type(selected_type): + def after_format(selected_format): + _dispatch_map(window, scope, focus, map_type=selected_type, map_format=selected_format, path_hint=path_hint) + _prompt_map_format(window, settings, after_format) + + _prompt_map_type(window, settings, after_type) + + +class PairOfCleatsMapRepoCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + _run_with_options(self.window, 'repo', '', path_hint=None) + + +class PairOfCleatsMapCurrentFolderCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + view = self.window.active_view() + folder = None + if view and view.file_name(): + folder = os.path.dirname(view.file_name()) + if not folder and self.window.folders(): + folder = self.window.folders()[0] + repo_root, reason = _resolve_repo_root(self.window, path_hint=folder) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + focus = _relative_focus(repo_root, folder) if folder else '' + _run_with_options(self.window, 'dir', focus, path_hint=folder) + + +class PairOfCleatsMapCurrentFileCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + view = self.window.active_view() + return bool(view and view.file_name()) + + def is_visible(self): + return True + + def run(self): + view = self.window.active_view() + if not view or not view.file_name(): + ui.show_status('PairOfCleats: no active file.') + return + repo_root, reason = _resolve_repo_root(self.window, path_hint=view.file_name()) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + focus = _relative_focus(repo_root, view.file_name()) 
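+        # Scope the map to the active file; focus is its repo-relative path
+        # with forward slashes (see _relative_focus).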
+ _run_with_options(self.window, 'file', focus, path_hint=view.file_name()) + + +class PairOfCleatsMapSymbolUnderCursorCommand(sublime_plugin.TextCommand): + def is_enabled(self): + return bool(self.view and self.view.file_name()) + + def is_visible(self): + return True + + def run(self, edit): + symbol = _extract_symbol(self.view) + if not symbol: + ui.show_status('PairOfCleats: no symbol under cursor.') + return + file_name = self.view.file_name() if self.view else None + repo_root, reason = _resolve_repo_root(self.view.window(), path_hint=file_name) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + focus = '{0}::{1}'.format(_relative_focus(repo_root, file_name), symbol) if file_name else symbol + _run_with_options(self.view.window(), 'symbol', focus, path_hint=file_name) + + +class PairOfCleatsMapSelectionCommand(sublime_plugin.TextCommand): + def is_enabled(self): + return bool(self.view) + + def is_visible(self): + return True + + def run(self, edit): + selection = _extract_selection(self.view) + if not selection: + ui.show_status('PairOfCleats: no selection.') + return + file_name = self.view.file_name() if self.view else None + _run_with_options(self.view.window(), 'symbol', selection.strip(), path_hint=file_name) + + +class PairOfCleatsMapJumpToNodeCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + state = map_state.get_last_map(self.window) + if not state: + ui.show_status('PairOfCleats: no map history yet.') + return + node_list_path = state.get('nodeListPath') + if not node_list_path or not os.path.exists(node_list_path): + ui.show_status('PairOfCleats: node list unavailable.') + return + try: + with open(node_list_path, 'r') as handle: + payload = json.load(handle) + except Exception: + ui.show_error('PairOfCleats: failed to read node list.') + return + nodes = payload.get('nodes') if isinstance(payload, dict) else None + if not isinstance(nodes, list) or not nodes: + ui.show_status('PairOfCleats: node list empty.') + return + + items = [] + for node in nodes: + label = node.get('label') or node.get('id') + detail = node.get('file') or '' + items.append([label, detail]) + + repo_root, reason = _resolve_repo_root(self.window) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + + def on_select(index): + if index < 0: + return + node = nodes[index] + hit = { + 'file': node.get('file'), + 'startLine': node.get('startLine'), + 'endLine': node.get('endLine') + } + results.open_hit(self.window, hit, repo_root=repo_root) + + self.window.show_quick_panel(items, on_select) + + +class PairOfCleatsMapOpenLastViewerCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + state = map_state.get_last_map(self.window) + if not state: + ui.show_status('PairOfCleats: no map history yet.') + return + path_value = state.get('outPath') + if not path_value: + ui.show_status('PairOfCleats: no map output yet.') + return + format_value = state.get('format') or '' + if format_value in ('html', 'html-iso', 'svg'): + _open_in_browser(path_value) + else: + self.window.open_file(path_value) diff --git a/sublime/PairOfCleats/commands/search.py b/sublime/PairOfCleats/commands/search.py new file mode 100644 index 000000000..6b814fc48 --- /dev/null +++ b/sublime/PairOfCleats/commands/search.py @@ -0,0 +1,425 @@ +import sublime +import sublime_plugin + +from ..lib import config +from 
..lib import history +from ..lib import paths +from ..lib import results +from ..lib import runner +from ..lib import search as search_lib +from ..lib import ui + +LIMIT_CHOICES = [10, 25, 50, 100, 200] + + +def _resolve_repo_root(window): + return paths.resolve_repo_root(window, return_reason=True) + + +def _has_repo_root(window): + return paths.has_repo_root(window) + + +def _resolve_defaults(settings, overrides=None): + overrides = overrides or {} + mode = overrides.get('mode') or settings.get('index_mode_default') or 'both' + backend = overrides.get('backend') or settings.get('search_backend_default') or '' + limit = overrides.get('limit') or settings.get('search_limit') or 25 + return { + 'mode': mode, + 'backend': backend, + 'limit': limit + } + + +def _resolve_results_target(settings, hit_count): + target = settings.get('open_results_in') or 'quick_panel' + threshold = settings.get('results_buffer_threshold') + if target == 'quick_panel' and isinstance(threshold, int) and threshold > 0: + if hit_count >= threshold: + return 'output_panel' + if target in ('quick_panel', 'new_tab', 'output_panel'): + return target + return 'quick_panel' + + +def _prompt_query(window, initial, on_done): + window.show_input_panel( + 'PairOfCleats search query', + initial or '', + lambda value: on_done(value.strip()), + None, + None + ) + + +def _prompt_options(window, settings, defaults, on_done, force_prompt=False): + if not force_prompt and not settings.get('search_prompt_options'): + on_done(defaults) + return + + options = dict(defaults) + mode_choices = ['code', 'prose', 'both'] + default_mode = options.get('mode') + mode_index = mode_choices.index(default_mode) if default_mode in mode_choices else 2 + + def on_mode_select(index): + if index < 0: + on_done(options) + return + options['mode'] = mode_choices[index] + _prompt_backend(window, options, on_done) + + window.show_quick_panel( + mode_choices, + on_mode_select, + selected_index=mode_index + ) + + +def _prompt_backend(window, options, on_done): + backend_choices = [ + ('', 'auto'), + ('memory', 'memory'), + ('sqlite', 'sqlite'), + ('sqlite-fts', 'sqlite-fts'), + ('lmdb', 'lmdb') + ] + labels = ['backend: {0}'.format(label) for _, label in backend_choices] + current = options.get('backend') or '' + current_index = 0 + for idx, (value, _) in enumerate(backend_choices): + if value == current: + current_index = idx + break + + def on_backend_select(index): + if index < 0: + on_done(options) + return + options['backend'] = backend_choices[index][0] + _prompt_limit(window, options, on_done) + + window.show_quick_panel(labels, on_backend_select, selected_index=current_index) + + +def _prompt_limit(window, options, on_done): + limit_default = options.get('limit') + limit_values = [] + if isinstance(limit_default, int) and limit_default > 0: + limit_values.append(limit_default) + for value in LIMIT_CHOICES: + if value not in limit_values: + limit_values.append(value) + + choices = ['limit: {0}'.format(value) for value in limit_values] + choices.append('limit: custom') + + def on_limit_select(index): + if index < 0: + on_done(options) + return + if index < len(limit_values): + options['limit'] = limit_values[index] + on_done(options) + return + + def on_custom_done(value): + value = value.strip() + if not value: + on_done(options) + return + try: + parsed = int(value) + except Exception: + ui.show_error('Limit must be an integer.') + on_done(options) + return + if parsed < 1: + ui.show_error('Limit must be at least 1.') + on_done(options) + return + 
options['limit'] = parsed + on_done(options) + + window.show_input_panel( + 'PairOfCleats result limit', + str(limit_default or ''), + on_custom_done, + None, + None + ) + + window.show_quick_panel(choices, on_limit_select, selected_index=0) + + +def _execute_search(window, query, overrides=None, explain=False): + if not query: + return + + settings = config.get_settings(window) + repo_root, reason = _resolve_repo_root(window) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + if reason: + ui.show_status('PairOfCleats: {0}'.format(reason)) + + errors = config.validate_settings(settings, repo_root) + if errors: + message = 'PairOfCleats settings need attention:\n- {0}'.format( + '\n- '.join(errors) + ) + ui.show_error(message) + return + + resolved = _resolve_defaults(settings, overrides) + args = search_lib.build_search_args( + query, + repo_root=repo_root, + mode=resolved.get('mode'), + backend=resolved.get('backend') or None, + limit=resolved.get('limit'), + explain=explain + ) + + cli = paths.resolve_cli(settings, repo_root) + command = cli['command'] + full_args = list(cli.get('args_prefix') or []) + args + env = config.build_env(settings) + + ui.show_status('PairOfCleats: searching...') + + def on_done(result): + if result.returncode != 0: + message = result.output.strip() or 'PairOfCleats search failed.' + ui.show_error(message) + return + if result.error: + ui.show_error(result.error) + return + payload = result.payload + if not isinstance(payload, dict): + ui.show_error('PairOfCleats search returned invalid JSON.') + return + if payload.get('ok') is False: + ui.show_error(payload.get('message') or 'PairOfCleats search failed.') + return + + hits = results.collect_hits(payload) + history_limit = settings.get('history_limit') + history.record_query(window, query, resolved, history_limit) + + if explain: + text = results.format_explain_text(hits) + ui.write_output_panel(window, results.RESULTS_PANEL, text) + return + + if not hits: + ui.show_status('PairOfCleats: no results.') + return + + target = _resolve_results_target(settings, len(hits)) + if target == 'output_panel': + text = results.format_results_text(hits) + ui.write_output_panel(window, results.RESULTS_PANEL, text) + return + if target == 'new_tab': + text = results.format_results_text(hits) + results.open_results_view(window, text) + return + + items = [results.format_quick_panel_item(hit) for hit in hits] + + def on_select(index): + if index < 0: + return + results.open_hit(window, hits[index], repo_root) + + window.show_quick_panel(items, on_select) + + runner.run_process( + command, + full_args, + cwd=repo_root, + env=env, + window=window, + title='PairOfCleats search', + capture_json=True, + on_done=on_done, + stream_output=False + ) + + +def _extract_selection(view): + if view is None: + return '' + for region in view.sel(): + if not region.empty(): + return view.substr(region) + return '' + + +def _extract_symbol(view): + if view is None: + return '' + selection = view.sel() + if not selection: + return '' + region = selection[0] + word = view.word(region) + return view.substr(word) + + +def _search_with_query(window, query, overrides=None, force_prompt=False): + if not query: + ui.show_status('PairOfCleats: empty query.') + return + settings = config.get_settings(window) + defaults = _resolve_defaults(settings, overrides) + + def after_options(options): + _execute_search(window, query, options) + + _prompt_options(window, settings, defaults, after_options, force_prompt=force_prompt) + 
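+# _search_with_query above searches the given text (prompting for options only
+# when enabled); _search_with_prompt below asks for the query first, seeding
+# the input panel with the most recent history entry.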
+ +def _search_with_prompt(window, overrides=None, force_prompt=False): + settings = config.get_settings(window) + defaults = _resolve_defaults(settings, overrides) + last = history.get_last_query(window) + initial = last.get('query') if isinstance(last, dict) else '' + + def on_query(value): + if not value: + return + def after_options(options): + _execute_search(window, value, options) + _prompt_options(window, settings, defaults, after_options, force_prompt=force_prompt) + + _prompt_query(window, initial, on_query) + + +class PairOfCleatsSearchCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self, query=None): + if query: + _search_with_query(self.window, query) + return + _search_with_prompt(self.window) + + +class PairOfCleatsSearchWithOptionsCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self, query=None): + if query: + _search_with_query(self.window, query, force_prompt=True) + return + _search_with_prompt(self.window, force_prompt=True) + + +class PairOfCleatsSearchSelectionCommand(sublime_plugin.TextCommand): + def is_enabled(self): + return bool(self.view) + + def is_visible(self): + return True + + def run(self, edit): + query = _extract_selection(self.view) + if not query: + ui.show_status('PairOfCleats: no selection to search.') + return + _search_with_query(self.view.window(), query) + + +class PairOfCleatsSearchSymbolUnderCursorCommand(sublime_plugin.TextCommand): + def is_enabled(self): + return bool(self.view) + + def is_visible(self): + return True + + def run(self, edit): + query = _extract_symbol(self.view) + if not query: + ui.show_status('PairOfCleats: no symbol under cursor.') + return + _search_with_query(self.view.window(), query) + + +class PairOfCleatsSearchHistoryCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + entries = history.load_history(self.window) + if not entries: + ui.show_status('PairOfCleats: no history yet.') + return + + items = [] + for entry in entries: + query = entry.get('query') or '' + mode = entry.get('mode') or 'both' + backend = entry.get('backend') or 'auto' + limit = entry.get('limit') or '' + detail = 'mode {0} | backend {1} | limit {2}'.format(mode, backend, limit) + items.append([query, detail]) + + def on_select(index): + if index < 0: + return + entry = entries[index] + _execute_search(self.window, entry.get('query'), entry) + + self.window.show_quick_panel(items, on_select) + + +class PairOfCleatsRepeatLastSearchCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + entry = history.get_last_query(self.window) + if not entry: + ui.show_status('PairOfCleats: no previous search to repeat.') + return + _execute_search(self.window, entry.get('query'), entry) + + +class PairOfCleatsExplainSearchCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + entry = history.get_last_query(self.window) + if entry and entry.get('query'): + _execute_search(self.window, entry.get('query'), entry, explain=True) + return + + def on_query(value): + if not value: + return + _execute_search(self.window, value, explain=True) + + _prompt_query(self.window, '', on_query) diff --git a/sublime/PairOfCleats/commands/settings.py 
b/sublime/PairOfCleats/commands/settings.py new file mode 100644 index 000000000..60210abee --- /dev/null +++ b/sublime/PairOfCleats/commands/settings.py @@ -0,0 +1,19 @@ +import sublime +import sublime_plugin + + +class PairOfCleatsOpenSettingsCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + self.window.run_command( + 'edit_settings', + { + 'base_file': '${packages}/PairOfCleats/PairOfCleats.sublime-settings', + 'user_file': '${packages}/User/PairOfCleats.sublime-settings' + } + ) diff --git a/sublime/PairOfCleats/commands/validate.py b/sublime/PairOfCleats/commands/validate.py new file mode 100644 index 000000000..e3ae8c0e9 --- /dev/null +++ b/sublime/PairOfCleats/commands/validate.py @@ -0,0 +1,30 @@ +import sublime_plugin + +from ..lib import config +from ..lib import paths +from ..lib import ui + + +class PairOfCleatsValidateSettingsCommand(sublime_plugin.WindowCommand): + def is_enabled(self): + return True + + def is_visible(self): + return True + + def run(self): + settings = config.get_settings(self.window) + repo_root, reason = paths.resolve_repo_root(self.window, return_reason=True) + if not repo_root: + ui.show_error('PairOfCleats: {0}'.format(reason)) + return + if reason: + ui.show_status('PairOfCleats: {0}'.format(reason)) + errors = config.validate_settings(settings, repo_root) + if errors: + message = 'PairOfCleats settings need attention:\n- {0}'.format( + '\n- '.join(errors) + ) + ui.show_error(message) + return + ui.show_status('PairOfCleats settings look good.') diff --git a/sublime/PairOfCleats/lib/__init__.py b/sublime/PairOfCleats/lib/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sublime/PairOfCleats/lib/api_client.py b/sublime/PairOfCleats/lib/api_client.py new file mode 100644 index 000000000..21e569ea0 --- /dev/null +++ b/sublime/PairOfCleats/lib/api_client.py @@ -0,0 +1,201 @@ +import json +import os +import urllib.parse +import urllib.request +import urllib.error + + +def normalize_base_url(value): + if not value: + return '' + value = str(value).strip() + if value.endswith('/'): + value = value[:-1] + return value + + +def build_url(base_url, path, params=None): + base_url = normalize_base_url(base_url) + if not base_url: + return '' + params = params or {} + filtered = {} + for key, value in params.items(): + if value is None or value == '': + continue + filtered[str(key)] = str(value) + query = urllib.parse.urlencode(filtered, doseq=True) + if query: + return '{0}{1}?{2}'.format(base_url, path, query) + return '{0}{1}'.format(base_url, path) + + +def _open_url(url, timeout_ms=5000): + timeout = float(timeout_ms or 5000) / 1000.0 + if timeout <= 0: + timeout = 5.0 + + try: + resp = urllib.request.urlopen(url, timeout=timeout) + try: + status = resp.getcode() or 0 + headers = dict(resp.headers.items()) + data = resp.read() + finally: + try: + resp.close() + except Exception: + pass + return status, headers, data + except urllib.error.HTTPError as err: + try: + data = err.read() + except Exception: + data = b'' + headers = dict(getattr(err, 'headers', {}).items()) if getattr(err, 'headers', None) else {} + status = getattr(err, 'code', 0) or 0 + return status, headers, data + + +def request_json(url, timeout_ms=5000): + status, headers, data = _open_url(url, timeout_ms=timeout_ms) + text = (data or b'').decode('utf-8', 'replace') + if status < 200 or status >= 300: + raise RuntimeError('API request failed ({0}): {1}'.format(status, text.strip() or 
url)) + try: + return json.loads(text or '{}'), headers + except Exception as exc: + raise RuntimeError('API returned invalid JSON: {0}'.format(exc)) + + +def request_text(url, timeout_ms=5000): + status, headers, data = _open_url(url, timeout_ms=timeout_ms) + text = (data or b'').decode('utf-8', 'replace') + if status < 200 or status >= 300: + raise RuntimeError('API request failed ({0}): {1}'.format(status, text.strip() or url)) + return text, headers + + +def _ensure_parent_dir(path_value): + if not path_value: + return + parent = os.path.dirname(path_value) + if not parent: + return + if os.path.isdir(parent): + return + try: + os.makedirs(parent) + except Exception: + pass + + +def _write_text(path_value, text): + _ensure_parent_dir(path_value) + with open(path_value, 'w') as handle: + handle.write(text or '') + + +def _write_json(path_value, payload): + _ensure_parent_dir(path_value) + with open(path_value, 'w') as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + + +def generate_map_report( + base_url, + repo_root, + settings, + scope, + focus, + include, + map_format, + output_path, + model_path, + node_list_path): + base_url = normalize_base_url(base_url) + if not base_url: + raise RuntimeError('api_server_url is not set') + + timeout_ms = settings.get('api_timeout_ms') if isinstance(settings, dict) else None + if not isinstance(timeout_ms, int) or timeout_ms <= 0: + timeout_ms = 5000 + + params = { + 'repo': repo_root, + 'mode': settings.get('map_index_mode') or 'code', + 'scope': scope, + 'focus': focus, + 'include': include, + 'collapse': settings.get('map_collapse_default') or 'none' + } + + if settings.get('map_only_exported'): + params['onlyExported'] = '1' + + max_files = settings.get('map_max_files') + if isinstance(max_files, int) and max_files > 0: + params['maxFiles'] = str(max_files) + + max_members = settings.get('map_max_members_per_file') + if isinstance(max_members, int) and max_members > 0: + params['maxMembersPerFile'] = str(max_members) + + max_edges = settings.get('map_max_edges') + if isinstance(max_edges, int) and max_edges > 0: + params['maxEdges'] = str(max_edges) + + if settings.get('map_top_k_by_degree') is True: + params['topKByDegree'] = '1' + + open_uri = settings.get('map_open_uri_template') + if open_uri: + params['openUriTemplate'] = open_uri + + three_url = settings.get('map_three_url') + if three_url: + params['threeUrl'] = three_url + + # Viewer controls (only used by html-iso) + for setting_key, param_key in [ + ('map_wasd_sensitivity', 'wasdSensitivity'), + ('map_wasd_acceleration', 'wasdAcceleration'), + ('map_wasd_max_speed', 'wasdMaxSpeed'), + ('map_wasd_drag', 'wasdDrag'), + ('map_zoom_sensitivity', 'zoomSensitivity')]: + value = settings.get(setting_key) + if isinstance(value, (int, float)): + params[param_key] = str(value) + + model_url = build_url(base_url, '/map', dict(params, **{'format': 'json'})) + map_model, model_headers = request_json(model_url, timeout_ms=timeout_ms) + _write_json(model_path, map_model) + + nodes_url = build_url(base_url, '/map/nodes', params) + node_list, _headers = request_json(nodes_url, timeout_ms=timeout_ms) + _write_json(node_list_path, node_list) + + out_url = build_url(base_url, '/map', dict(params, **{'format': map_format})) + out_path = output_path + + if map_format in ('json', 'dot'): + text, _headers = request_text(out_url, timeout_ms=timeout_ms) + _write_text(output_path, text) + out_path = output_path + else: + out_path = out_url + + cache_key = 
model_headers.get('X-PairofCleats-Map-CacheKey') or '' + + return { + 'ok': True, + 'source': 'api', + 'repo': repo_root, + 'format': map_format, + 'outPath': out_path, + 'modelPath': model_path, + 'nodeListPath': node_list_path, + 'cacheKey': cache_key, + 'summary': map_model.get('summary') if isinstance(map_model, dict) else None, + 'warnings': map_model.get('warnings') if isinstance(map_model, dict) else None + } diff --git a/sublime/PairOfCleats/lib/config.py b/sublime/PairOfCleats/lib/config.py new file mode 100644 index 000000000..bb46e30ab --- /dev/null +++ b/sublime/PairOfCleats/lib/config.py @@ -0,0 +1,251 @@ +import os + +import sublime + +SETTINGS_FILE = 'PairOfCleats.sublime-settings' + +DEFAULT_SETTINGS = { + 'pairofcleats_path': '', + 'node_path': '', + 'index_mode_default': 'both', + 'search_backend_default': '', + 'open_results_in': 'quick_panel', + 'search_limit': 25, + 'results_buffer_threshold': 50, + 'history_limit': 25, + 'search_prompt_options': False, + 'index_watch_scope': 'repo', + 'index_watch_folder': '', + 'index_watch_mode': 'all', + 'index_watch_poll_ms': 2000, + 'index_watch_debounce_ms': 500, + 'map_type_default': 'combined', + 'map_format_default': 'html-iso', + 'map_prompt_options': False, + 'map_output_dir': '.pairofcleats/maps', + 'map_only_exported': False, + 'map_collapse_default': 'none', + 'map_max_files': 200, + 'map_max_members_per_file': 60, + 'map_max_edges': 3000, + 'map_top_k_by_degree': False, + 'map_show_report_panel': None, + 'map_stream_output': False, + 'map_open_uri_template': 'subl://open?file={file}&line={line}&column={column}', + 'map_three_url': '', + 'map_index_mode': 'code', + 'map_wasd_sensitivity': 16000, + 'map_wasd_acceleration': 6000, + 'map_wasd_max_speed': 24000, + 'map_wasd_drag': 6, + 'map_zoom_sensitivity': 0.1, + 'profile': '', + 'cache_root': '', + 'embeddings_mode': '', + 'node_options': '', + 'env': {} +} + +VALID_INDEX_MODES = {'code', 'prose', 'both'} +VALID_BACKENDS = {'memory', 'sqlite', 'sqlite-fts', 'lmdb'} +VALID_OPEN_TARGETS = {'quick_panel', 'new_tab', 'output_panel'} +VALID_WATCH_SCOPES = {'repo', 'folder'} +VALID_WATCH_MODES = {'all', 'code', 'prose', 'records', 'extracted-prose'} +VALID_MAP_TYPES = {'combined', 'imports', 'calls', 'usages', 'dataflow'} +VALID_MAP_FORMATS = {'json', 'dot', 'svg', 'html', 'html-iso'} +VALID_MAP_COLLAPSE = {'none', 'file', 'dir'} +VALID_MAP_MODES = {'code', 'prose'} + + +def prime_settings(): + try: + sublime.load_settings(SETTINGS_FILE) + except Exception: + pass + + +def get_settings(window=None): + base = _load_base_settings() + overrides = extract_project_settings(window) + return merge_settings(base, overrides) + + +def extract_project_settings(window): + if window is None: + return {} + data = window.project_data() or {} + settings = data.get('settings') if isinstance(data, dict) else {} + if not isinstance(settings, dict): + settings = {} + + override = settings.get('pairofcleats') or settings.get('PairOfCleats') + if override is None: + override = data.get('pairofcleats') or data.get('PairOfCleats') + if isinstance(override, dict): + return override + return {} + + +def merge_settings(base, overrides): + merged = dict(base) + for key, value in overrides.items(): + if key == 'env' and isinstance(value, dict): + env = dict(merged.get('env') or {}) + env.update(value) + merged['env'] = env + else: + merged[key] = value + return merged + + +def build_env(settings): + env = dict(os.environ) + extra = settings.get('env') or {} + if isinstance(extra, dict): + for key, value 
in extra.items(): + if key: + env[str(key)] = str(value) + + if settings.get('profile'): + env['PAIROFCLEATS_PROFILE'] = str(settings['profile']) + if settings.get('cache_root'): + env['PAIROFCLEATS_CACHE_ROOT'] = str(settings['cache_root']) + if settings.get('embeddings_mode'): + env['PAIROFCLEATS_EMBEDDINGS'] = str(settings['embeddings_mode']) + if settings.get('node_options'): + env['PAIROFCLEATS_NODE_OPTIONS'] = str(settings['node_options']) + return env + + +def validate_settings(settings, repo_root=None): + errors = [] + + mode = settings.get('index_mode_default') + if mode and mode not in VALID_INDEX_MODES: + errors.append( + 'index_mode_default must be one of: code, prose, both.' + ) + + backend = settings.get('search_backend_default') + if backend and backend not in VALID_BACKENDS: + errors.append( + 'search_backend_default must be one of: memory, sqlite, sqlite-fts, lmdb.' + ) + + target = settings.get('open_results_in') + if target and target not in VALID_OPEN_TARGETS: + errors.append( + 'open_results_in must be one of: quick_panel, new_tab, output_panel.' + ) + + env = settings.get('env') + if env is not None and not isinstance(env, dict): + errors.append('env must be a JSON object (dictionary).') + + cli_path = settings.get('pairofcleats_path') + if cli_path and (os.path.isabs(cli_path) or repo_root): + resolved = _resolve_path(repo_root, cli_path) + if resolved and not os.path.exists(resolved): + errors.append( + 'pairofcleats_path does not exist: {0}'.format(resolved) + ) + + node_path = settings.get('node_path') + if node_path and os.path.isabs(node_path): + if not os.path.exists(node_path): + errors.append( + 'node_path does not exist: {0}'.format(node_path) + ) + + _validate_int_setting(errors, settings, 'search_limit', allow_zero=False) + _validate_int_setting(errors, settings, 'results_buffer_threshold', allow_zero=True) + _validate_int_setting(errors, settings, 'history_limit', allow_zero=True) + _validate_int_setting(errors, settings, 'index_watch_poll_ms', allow_zero=False) + _validate_int_setting(errors, settings, 'index_watch_debounce_ms', allow_zero=False) + + watch_scope = settings.get('index_watch_scope') + if watch_scope and watch_scope not in VALID_WATCH_SCOPES: + errors.append('index_watch_scope must be repo or folder.') + + watch_mode = settings.get('index_watch_mode') + if watch_mode and watch_mode not in VALID_WATCH_MODES: + errors.append('index_watch_mode must be one of: all, code, prose, records, extracted-prose.') + + watch_folder = settings.get('index_watch_folder') + if watch_folder and (os.path.isabs(watch_folder) or repo_root): + resolved = _resolve_path(repo_root, watch_folder) + if resolved and not os.path.exists(resolved): + errors.append('index_watch_folder does not exist: {0}'.format(resolved)) + + map_type = settings.get('map_type_default') + if map_type and map_type not in VALID_MAP_TYPES: + errors.append('map_type_default must be one of: combined, imports, calls, usages, dataflow.') + + map_format = settings.get('map_format_default') + if map_format and map_format not in VALID_MAP_FORMATS: + errors.append('map_format_default must be one of: json, dot, svg, html, html-iso.') + + map_collapse = settings.get('map_collapse_default') + if map_collapse and map_collapse not in VALID_MAP_COLLAPSE: + errors.append('map_collapse_default must be one of: none, file, dir.') + + map_mode = settings.get('map_index_mode') + if map_mode and map_mode not in VALID_MAP_MODES: + errors.append('map_index_mode must be code or prose.') + + 
_validate_int_setting(errors, settings, 'map_max_files', allow_zero=False) + _validate_int_setting(errors, settings, 'map_max_members_per_file', allow_zero=False) + _validate_int_setting(errors, settings, 'map_max_edges', allow_zero=False) + _validate_number_setting(errors, settings, 'map_wasd_sensitivity', allow_zero=False) + _validate_number_setting(errors, settings, 'map_wasd_acceleration', allow_zero=False) + _validate_number_setting(errors, settings, 'map_wasd_max_speed', allow_zero=False) + _validate_number_setting(errors, settings, 'map_wasd_drag', allow_zero=False) + _validate_number_setting(errors, settings, 'map_zoom_sensitivity', allow_zero=False) + + return errors + + +def _load_base_settings(): + settings = sublime.load_settings(SETTINGS_FILE) + values = dict(DEFAULT_SETTINGS) + for key in DEFAULT_SETTINGS: + values[key] = settings.get(key, DEFAULT_SETTINGS[key]) + return values + + +def _resolve_path(repo_root, raw_path): + if not raw_path: + return None + if os.path.isabs(raw_path): + return raw_path + if repo_root: + return os.path.join(repo_root, raw_path) + return raw_path + + +def _validate_int_setting(errors, settings, key, allow_zero=False): + value = settings.get(key) + if value is None or value == '': + return + if isinstance(value, bool) or not isinstance(value, int): + errors.append('{0} must be an integer.'.format(key)) + return + if allow_zero: + if value < 0: + errors.append('{0} must be 0 or higher.'.format(key)) + elif value < 1: + errors.append('{0} must be 1 or higher.'.format(key)) + + +def _validate_number_setting(errors, settings, key, allow_zero=False): + value = settings.get(key) + if value is None or value == '': + return + if isinstance(value, bool) or not isinstance(value, (int, float)): + errors.append('{0} must be a number.'.format(key)) + return + if allow_zero: + if value < 0: + errors.append('{0} must be 0 or higher.'.format(key)) + elif value <= 0: + errors.append('{0} must be greater than 0.'.format(key)) diff --git a/sublime/PairOfCleats/lib/history.py b/sublime/PairOfCleats/lib/history.py new file mode 100644 index 000000000..c47c7e8d0 --- /dev/null +++ b/sublime/PairOfCleats/lib/history.py @@ -0,0 +1,69 @@ +def load_history(window): + if window is None: + return [] + _, state = _load_state(window) + history = state.get('history') + if isinstance(history, list): + return list(history) + return [] + + +def get_last_query(window): + if window is None: + return None + _, state = _load_state(window) + last = state.get('last_search') + if isinstance(last, dict) and last.get('query'): + return dict(last) + history = state.get('history') + if isinstance(history, list) and history: + entry = history[0] + if isinstance(entry, dict) and entry.get('query'): + return dict(entry) + return None + + +def record_query(window, query, options, limit): + if window is None or not query: + return + data, state = _load_state(window) + history = state.get('history') + if not isinstance(history, list): + history = [] + entry = _build_entry(query, options) + history = [item for item in history if not _matches_entry(item, entry)] + history.insert(0, entry) + if isinstance(limit, int) and limit > 0: + history = history[:limit] + state['history'] = history + state['last_search'] = entry + data['pairofcleats_state'] = state + window.set_project_data(data) + + +def _load_state(window): + data = window.project_data() or {} + state = data.get('pairofcleats_state') + if not isinstance(state, dict): + state = {} + return data, state + + +def _build_entry(query, options): + 
entry = { + 'query': query + } + if isinstance(options, dict): + for key in ('mode', 'backend', 'limit'): + if key in options: + entry[key] = options.get(key) + return entry + + +def _matches_entry(existing, target): + if not isinstance(existing, dict): + return False + for key in ('query', 'mode', 'backend', 'limit'): + if existing.get(key) != target.get(key): + return False + return True diff --git a/sublime/PairOfCleats/lib/index_state.py b/sublime/PairOfCleats/lib/index_state.py new file mode 100644 index 000000000..f04e24dfe --- /dev/null +++ b/sublime/PairOfCleats/lib/index_state.py @@ -0,0 +1,33 @@ +import datetime + + +def record_last_build(window, mode): + if window is None: + return None + timestamp = datetime.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z' + data = window.project_data() or {} + state = data.get('pairofcleats_state') + if not isinstance(state, dict): + state = {} + index_state = state.get('index') + if not isinstance(index_state, dict): + index_state = {} + index_state['last_mode'] = mode + index_state['last_time'] = timestamp + state['index'] = index_state + data['pairofcleats_state'] = state + window.set_project_data(data) + return index_state + + +def get_last_build(window): + if window is None: + return None + data = window.project_data() or {} + state = data.get('pairofcleats_state') + if not isinstance(state, dict): + return None + index_state = state.get('index') + if isinstance(index_state, dict): + return dict(index_state) + return None diff --git a/sublime/PairOfCleats/lib/indexing.py b/sublime/PairOfCleats/lib/indexing.py new file mode 100644 index 000000000..be1eb6328 --- /dev/null +++ b/sublime/PairOfCleats/lib/indexing.py @@ -0,0 +1,32 @@ +def build_index_args(mode, repo_root=None, watch=False, watch_poll_ms=None, watch_debounce_ms=None): + args = ['index', 'watch' if watch else 'build'] + if mode: + args.extend(['--mode', mode]) + if watch: + if watch_poll_ms is not None: + args.extend(['--watch-poll', str(watch_poll_ms)]) + if watch_debounce_ms is not None: + args.extend(['--watch-debounce', str(watch_debounce_ms)]) + if repo_root: + args.extend(['--repo', repo_root]) + return args + + +def build_validate_args(repo_root=None, modes=None, json_output=True): + args = ['index', 'validate'] + if json_output: + args.append('--json') + if modes: + args.extend(['--mode', modes]) + if repo_root: + args.extend(['--repo', repo_root]) + return args + + +def build_config_dump_args(repo_root=None, json_output=True): + args = ['config', 'dump'] + if json_output: + args.append('--json') + if repo_root: + args.extend(['--repo', repo_root]) + return args diff --git a/sublime/PairOfCleats/lib/map.py b/sublime/PairOfCleats/lib/map.py new file mode 100644 index 000000000..e876d87e4 --- /dev/null +++ b/sublime/PairOfCleats/lib/map.py @@ -0,0 +1,133 @@ +import os +import time + + +MAP_TYPES = { + 'imports': 'imports', + 'calls': 'calls', + 'usages': 'usages', + 'dataflow': 'dataflow,aliases', + 'combined': 'imports,calls,usages,dataflow,exports' +} + +MAP_FORMATS = { + 'json': '.json', + 'dot': '.dot', + 'svg': '.svg', + 'html': '.html', + 'html-iso': '.iso.html' +} + + +def resolve_output_dir(repo_root, settings): + output_dir = settings.get('map_output_dir') or '.pairofcleats/maps' + if os.path.isabs(output_dir): + return output_dir + return os.path.normpath(os.path.join(repo_root, output_dir)) + + +def build_output_paths(repo_root, settings, scope, map_type, map_format): + output_dir = resolve_output_dir(repo_root, settings) + timestamp = 
time.strftime('%Y%m%d-%H%M%S') + safe_scope = (scope or 'repo').replace(' ', '_') + safe_type = (map_type or 'combined').replace(' ', '_') + base = 'map_{0}_{1}_{2}'.format(safe_scope, safe_type, timestamp) + + extension = MAP_FORMATS.get(map_format, '.json') + output_path = os.path.join(output_dir, base + extension) + model_path = os.path.join(output_dir, base + '.model.json') + node_list_path = os.path.join(output_dir, base + '.nodes.json') + return output_path, model_path, node_list_path + + +def resolve_map_type(settings, override=None): + if override: + return override + return settings.get('map_type_default') or 'combined' + + +def resolve_map_format(settings, override=None): + if override: + return override + return settings.get('map_format_default') or 'html-iso' + + +def build_map_args( + repo_root, + settings, + scope, + focus, + map_type, + map_format, + output_path, + model_path, + node_list_path): + args = ['report', 'map', '--repo', repo_root] + + mode = settings.get('map_index_mode') or 'code' + args += ['--mode', mode] + + args += ['--scope', scope] + if focus: + args += ['--focus', focus] + + include = MAP_TYPES.get(map_type) + if include: + args += ['--include', include] + + if settings.get('map_only_exported'): + args.append('--only-exported') + + collapse = settings.get('map_collapse_default') + if collapse: + args += ['--collapse', collapse] + + max_files = settings.get('map_max_files') + if isinstance(max_files, int) and max_files > 0: + args += ['--max-files', str(max_files)] + + max_members = settings.get('map_max_members_per_file') + if isinstance(max_members, int) and max_members > 0: + args += ['--max-members-per-file', str(max_members)] + + max_edges = settings.get('map_max_edges') + if isinstance(max_edges, int) and max_edges > 0: + args += ['--max-edges', str(max_edges)] + + if settings.get('map_top_k_by_degree') is True: + args.append('--top-k-by-degree') + + if map_format: + args += ['--format', map_format] + + if output_path: + args += ['--out', output_path] + + if model_path: + args += ['--model-out', model_path] + + if node_list_path: + args += ['--node-list-out', node_list_path] + + open_uri = settings.get('map_open_uri_template') + if open_uri: + args += ['--open-uri-template', open_uri] + + three_url = settings.get('map_three_url') + if three_url: + args += ['--three-url', three_url] + + _append_number(args, settings, 'map_wasd_sensitivity', '--wasd-sensitivity') + _append_number(args, settings, 'map_wasd_acceleration', '--wasd-acceleration') + _append_number(args, settings, 'map_wasd_max_speed', '--wasd-max-speed') + _append_number(args, settings, 'map_wasd_drag', '--wasd-drag') + _append_number(args, settings, 'map_zoom_sensitivity', '--zoom-sensitivity') + + args.append('--json') + return args + + +def _append_number(args, settings, key, flag): + value = settings.get(key) + if isinstance(value, (int, float)): + args += [flag, str(value)] diff --git a/sublime/PairOfCleats/lib/map_state.py b/sublime/PairOfCleats/lib/map_state.py new file mode 100644 index 000000000..501aa349c --- /dev/null +++ b/sublime/PairOfCleats/lib/map_state.py @@ -0,0 +1,25 @@ +def get_last_map(window): + if window is None: + return None + _, state = _load_state(window) + entry = state.get('last_map') + if isinstance(entry, dict): + return dict(entry) + return None + + +def record_last_map(window, payload): + if window is None or not isinstance(payload, dict): + return + data, state = _load_state(window) + state['last_map'] = dict(payload) + data['pairofcleats_state'] = state + 
window.set_project_data(data) + + +def _load_state(window): + data = window.project_data() or {} + state = data.get('pairofcleats_state') + if not isinstance(state, dict): + state = {} + return data, state diff --git a/sublime/PairOfCleats/lib/paths.py b/sublime/PairOfCleats/lib/paths.py new file mode 100644 index 000000000..e250a8c8b --- /dev/null +++ b/sublime/PairOfCleats/lib/paths.py @@ -0,0 +1,159 @@ +import os + + +def find_repo_root(start_path): + if not start_path: + return None + + path = start_path + if os.path.isfile(path): + path = os.path.dirname(path) + path = os.path.abspath(path) + + while True: + if os.path.isfile(os.path.join(path, '.pairofcleats.json')): + return path + if os.path.isdir(os.path.join(path, '.git')): + return path + + parent = os.path.dirname(path) + if parent == path: + break + path = parent + + return None + + +def resolve_repo_root(window, return_reason=False, path_hint=None): + root, reason = _resolve_repo_root(window, path_hint=path_hint) + if return_reason: + return root, reason + return root + + +def has_repo_root(window, path_hint=None): + root, _ = resolve_repo_root(window, return_reason=True, path_hint=path_hint) + return root is not None + + +def _resolve_repo_root(window, path_hint=None): + if window is None: + return None, 'No active window.' + + hint_root = None + if path_hint: + hint_path = path_hint + if os.path.isfile(hint_path): + hint_path = os.path.dirname(hint_path) + if hint_path: + root = find_repo_root(hint_path) + if root: + return root, None + hint_root = os.path.abspath(hint_path) + + candidates = [] + active_file = None + folders = window.folders() or [] + folders = sorted([os.path.abspath(folder) for folder in folders if folder]) + if folders: + candidates.extend(folders) + else: + view = window.active_view() + active_file = view.file_name() if view else None + if active_file: + candidates.append(active_file) + + for candidate in candidates: + root = find_repo_root(candidate) + if root: + return root, None + + if hint_root: + return hint_root, 'Repo root not found; using hint path.' + if folders: + return folders[0], 'Repo root not found; using open folder.' + if active_file: + return os.path.dirname(active_file), 'Repo root not found; using active file folder.' + + if candidates: + return None, 'Repo root not found. Open a folder with .pairofcleats.json or .git.' + + return None, 'No folders are open. Add a folder or project to enable PairOfCleats.' 
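+# Resolution order: explicit path hint, then open folders (sorted), then the
+# active file. Each candidate walks upward via find_repo_root looking for a
+# .pairofcleats.json marker or a .git directory; the reason string is only
+# set when a fallback root is used.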
+ + +def resolve_watch_root(window, settings): + repo_root, _ = resolve_repo_root(window, return_reason=True) + scope = (settings.get('index_watch_scope') or 'repo').strip().lower() + folder_override = settings.get('index_watch_folder') or '' + if scope == 'folder': + if folder_override: + resolved = resolve_path(repo_root, folder_override) + if resolved: + return resolved + folders = window.folders() if window is not None else [] + if folders: + return folders[0] + return repo_root + + +def resolve_cli(settings, repo_root): + node_path = settings.get('node_path') or 'node' + configured = (settings.get('pairofcleats_path') or '').strip() + if configured: + resolved = resolve_path(repo_root, configured) + return _cli_for_path(resolved, node_path, 'settings') + + local_bin = _find_local_binary(repo_root) + if local_bin: + return _cli_for_path(local_bin, node_path, 'node_modules') + + if repo_root: + local_js = os.path.join(repo_root, 'bin', 'pairofcleats.js') + if os.path.exists(local_js): + return _cli_for_path(local_js, node_path, 'repo-bin') + + return { + 'command': 'pairofcleats', + 'args_prefix': [], + 'source': 'path' + } + + +def resolve_path(repo_root, value): + if not value: + return None + if os.path.isabs(value): + return value + if repo_root: + return os.path.normpath(os.path.join(repo_root, value)) + return value + + +def _find_local_binary(repo_root): + if not repo_root: + return None + bin_dir = os.path.join(repo_root, 'node_modules', '.bin') + candidates = [ + 'pairofcleats', + 'pairofcleats.cmd', + 'pairofcleats.ps1' + ] + for name in candidates: + candidate = os.path.join(bin_dir, name) + if os.path.exists(candidate): + return candidate + return None + + +def _cli_for_path(path_value, node_path, source): + if path_value and path_value.lower().endswith('.js'): + return { + 'command': node_path or 'node', + 'args_prefix': [path_value], + 'source': source + } + return { + 'command': path_value, + 'args_prefix': [], + 'source': source + } diff --git a/sublime/PairOfCleats/lib/results.py b/sublime/PairOfCleats/lib/results.py new file mode 100644 index 000000000..7c4174114 --- /dev/null +++ b/sublime/PairOfCleats/lib/results.py @@ -0,0 +1,181 @@ +import os + +import sublime + +HIGHLIGHT_KEY = 'pairofcleats.search.highlight' +HIGHLIGHT_SCOPE = 'region.yellowish' +RESULTS_PANEL = 'pairofcleats-results' + + +def collect_hits(payload): + hits = [] + if not isinstance(payload, dict): + return hits + + def add(section, items): + if not isinstance(items, list): + return + for hit in items: + if not isinstance(hit, dict): + continue + merged = dict(hit) + merged['section'] = section + hits.append(merged) + + add('code', payload.get('code')) + add('prose', payload.get('prose')) + add('extracted-prose', payload.get('extractedProse')) + add('records', payload.get('records')) + return hits + + +def format_quick_panel_item(hit): + file_label = format_file_label(hit) + score_label = format_score_label(hit) + section = hit.get('section') or '' + name = hit.get('name') or hit.get('symbol') or '' + headline = hit.get('headline') or hit.get('preview') or '' + + label = name or headline or file_label + detail_parts = [file_label] + if section: + detail_parts.append(section) + if score_label: + detail_parts.append(score_label) + detail = ' | '.join([part for part in detail_parts if part]) + + if headline and headline != label: + return [label, detail, headline] + return [label, detail] + + +def format_results_text(hits): + lines = ['PairOfCleats results ({0})'.format(len(hits)), ''] + for idx, hit 
in enumerate(hits, start=1): + file_label = format_file_label(hit) + section = hit.get('section') or '' + score_label = format_score_label(hit) + name = hit.get('name') or hit.get('symbol') or '' + headline = hit.get('headline') or hit.get('preview') or '' + + header_parts = ['{0}.'.format(idx), file_label] + if section: + header_parts.append('[{0}]'.format(section)) + if score_label: + header_parts.append(score_label) + lines.append(' '.join([part for part in header_parts if part])) + + if name: + lines.append(' {0}'.format(name)) + if headline and headline != name: + lines.append(' {0}'.format(headline)) + lines.append('') + return '\n'.join(lines).rstrip() + '\n' + + +def format_explain_text(hits): + lines = ['PairOfCleats explain ({0})'.format(len(hits)), ''] + for idx, hit in enumerate(hits, start=1): + file_label = format_file_label(hit) + section = hit.get('section') or '' + score_label = format_score_label(hit) + lines.append('{0}. {1}'.format(idx, file_label)) + if section or score_label: + detail = ' '.join([part for part in [section, score_label] if part]) + if detail: + lines.append(' {0}'.format(detail)) + + breakdown = hit.get('scoreBreakdown') + if isinstance(breakdown, dict) and breakdown: + for key in sorted(breakdown.keys()): + value = breakdown[key] + lines.append(' {0}: {1}'.format(key, value)) + else: + lines.append(' (no score breakdown)') + lines.append('') + return '\n'.join(lines).rstrip() + '\n' + + +def open_hit(window, hit, repo_root=None): + file_path = resolve_hit_path(hit, repo_root) + if not file_path: + return None + + start_line = hit.get('startLine') + encoded_path = file_path + if isinstance(start_line, int) and start_line > 0: + encoded_path = '{0}:{1}'.format(file_path, start_line) + + view = window.open_file(encoded_path, sublime.ENCODED_POSITION) + highlight_hit(view, hit) + return view + + +def open_results_view(window, text): + if window is None: + return None + view = window.new_file() + view.set_name('PairOfCleats Results') + view.set_scratch(True) + view.set_read_only(False) + view.run_command('append', {'characters': text, 'force': True}) + view.set_read_only(True) + return view + + +def resolve_hit_path(hit, repo_root): + if not isinstance(hit, dict): + return None + file_path = hit.get('file') + if not file_path: + return None + if os.path.isabs(file_path): + return file_path + if repo_root: + return os.path.join(repo_root, file_path) + return file_path + + +def highlight_hit(view, hit): + if view is None or not isinstance(hit, dict): + return + start_line = hit.get('startLine') + end_line = hit.get('endLine') or start_line + if not isinstance(start_line, int) or start_line <= 0: + return + if not isinstance(end_line, int) or end_line <= 0: + end_line = start_line + + def apply(): + if view.is_loading(): + sublime.set_timeout(apply, 10) + return + view.erase_regions(HIGHLIGHT_KEY) + start_pt = view.text_point(start_line - 1, 0) + end_pt = view.text_point(end_line - 1, 0) + region = view.full_line(sublime.Region(start_pt, end_pt)) + view.add_regions(HIGHLIGHT_KEY, [region], HIGHLIGHT_SCOPE, flags=0) + + sublime.set_timeout(apply, 0) + + +def format_file_label(hit): + file_path = hit.get('file') or '' + start_line = hit.get('startLine') + end_line = hit.get('endLine') + if isinstance(start_line, int) and start_line > 0: + if isinstance(end_line, int) and end_line > start_line: + return '{0}:{1}-{2}'.format(file_path, start_line, end_line) + return '{0}:{1}'.format(file_path, start_line) + return file_path + + +def format_score_label(hit): + 
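+    # Produce a "score 1.23 bm25"-style label from the numeric score plus optional scoreType; returns '' when the score is not numeric.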
score = hit.get('score') + score_type = hit.get('scoreType') or '' + if isinstance(score, (int, float)): + label = '{0:.2f}'.format(score) + if score_type: + label = '{0} {1}'.format(label, score_type) + return 'score {0}'.format(label) + return '' diff --git a/sublime/PairOfCleats/lib/runner.py b/sublime/PairOfCleats/lib/runner.py new file mode 100644 index 000000000..c45eaecc6 --- /dev/null +++ b/sublime/PairOfCleats/lib/runner.py @@ -0,0 +1,121 @@ +import json +import os +import subprocess +import threading + +import sublime + + +class ProcessResult(object): + def __init__(self, returncode, output, payload=None, error=None): + self.returncode = returncode + self.output = output + self.payload = payload + self.error = error + + +class ProcessHandle(object): + def __init__(self, process, thread, on_cancel=None): + self.process = process + self.thread = thread + self._on_cancel = on_cancel + + def cancel(self): + if self.process.poll() is not None: + return + try: + self.process.terminate() + except Exception: + pass + if self._on_cancel: + self._on_cancel() + timer = threading.Timer(1.5, self._kill_if_running) + timer.daemon = True + timer.start() + + def _kill_if_running(self): + if self.process.poll() is not None: + return + try: + self.process.kill() + except Exception: + pass + + +def run_process(command, args, cwd=None, env=None, window=None, title='PairOfCleats', + capture_json=False, on_done=None, stream_output=True, + panel_name='pairofcleats'): + if window is None: + window = sublime.active_window() + panel = None + if stream_output: + panel = _ensure_panel(window, panel_name) + _show_panel(window, panel_name) + + full_env = dict(os.environ) + if env: + full_env.update(env) + + proc = subprocess.Popen( + [command] + list(args), + cwd=cwd or None, + env=full_env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + output_lines = [] + + def append_line(line): + output_lines.append(line) + if panel is not None: + _append_panel(panel, line) + + def done_callback(result): + if on_done: + on_done(result) + + def worker(): + try: + for line in proc.stdout: + append_line(line) + finally: + proc.wait() + + output = ''.join(output_lines) + payload = None + error = None + if capture_json: + try: + payload = json.loads(output or '{}') + except Exception as exc: + error = 'Failed to parse JSON output: {0}'.format(exc) + result = ProcessResult(proc.returncode, output, payload=payload, error=error) + sublime.set_timeout(lambda: done_callback(result), 0) + + thread = threading.Thread(target=worker) + thread.daemon = True + thread.start() + + return ProcessHandle(proc, thread) + + +def _ensure_panel(window, name): + panel = window.create_output_panel(name) + panel.set_read_only(False) + return panel + + +def _show_panel(window, name): + window.run_command('show_panel', {'panel': 'output.{0}'.format(name)}) + + +def _append_panel(panel, text): + def append(): + panel.run_command('append', { + 'characters': text, + 'force': True, + 'scroll_to_end': True + }) + sublime.set_timeout(append, 0) diff --git a/sublime/PairOfCleats/lib/search.py b/sublime/PairOfCleats/lib/search.py new file mode 100644 index 000000000..b4f98f561 --- /dev/null +++ b/sublime/PairOfCleats/lib/search.py @@ -0,0 +1,13 @@ +def build_search_args(query, repo_root=None, mode=None, backend=None, limit=None, explain=False): + args = ['search', query, '--json'] + if mode and mode != 'both': + args.extend(['--mode', mode]) + if backend: + args.extend(['--backend', backend]) + if limit: + 
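+        # A falsy limit (0 or None) skips --top so the CLI default result count applies.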
args.extend(['--top', str(limit)]) + if explain: + args.append('--explain') + if repo_root: + args.extend(['--repo', repo_root]) + return args diff --git a/sublime/PairOfCleats/lib/ui.py b/sublime/PairOfCleats/lib/ui.py new file mode 100644 index 000000000..a2ba08db2 --- /dev/null +++ b/sublime/PairOfCleats/lib/ui.py @@ -0,0 +1,35 @@ +import sublime + + +def show_error(message): + try: + sublime.error_message(message) + except Exception: + print(message) + + +def show_status(message): + try: + sublime.status_message(message) + except Exception: + print(message) + + +def write_output_panel(window, name, text): + if window is None: + window = sublime.active_window() + if window is None: + return None + + panel = window.create_output_panel(name) + panel.set_read_only(False) + panel.run_command('select_all') + panel.run_command('right_delete') + panel.run_command('append', { + 'characters': text, + 'force': True, + 'scroll_to_end': False + }) + panel.set_read_only(True) + window.run_command('show_panel', {'panel': 'output.{0}'.format(name)}) + return panel diff --git a/sublime/PairOfCleats/lib/watch.py b/sublime/PairOfCleats/lib/watch.py new file mode 100644 index 000000000..13c9b1d4c --- /dev/null +++ b/sublime/PairOfCleats/lib/watch.py @@ -0,0 +1,71 @@ +_WATCHERS = {} + + +def _window_key(window): + if window is None: + return 'global' + try: + return str(window.id()) + except Exception: + return 'global' + + +def register(window, handle, root): + key = _window_key(window) + _WATCHERS[key] = { + 'handle': handle, + 'root': root + } + + +def is_running(window): + key = _window_key(window) + entry = _WATCHERS.get(key) + if not entry: + return False + handle = entry.get('handle') + process = getattr(handle, 'process', None) + if process is None: + return False + return process.poll() is None + + +def stop(window): + key = _window_key(window) + entry = _WATCHERS.pop(key, None) + if not entry: + return False + handle = entry.get('handle') + if handle: + handle.cancel() + return True + + +def stop_all(): + keys = list(_WATCHERS.keys()) + for key in keys: + entry = _WATCHERS.pop(key, None) + if not entry: + continue + handle = entry.get('handle') + if handle: + handle.cancel() + + +def clear_if_done(window): + key = _window_key(window) + entry = _WATCHERS.get(key) + if not entry: + return + handle = entry.get('handle') + process = getattr(handle, 'process', None) + if process is None or process.poll() is not None: + _WATCHERS.pop(key, None) + + +def current_root(window): + key = _window_key(window) + entry = _WATCHERS.get(key) + if not entry: + return None + return entry.get('root') diff --git a/sublime/PairOfCleats/messages/install.txt b/sublime/PairOfCleats/messages/install.txt new file mode 100644 index 000000000..ae510cdde --- /dev/null +++ b/sublime/PairOfCleats/messages/install.txt @@ -0,0 +1,7 @@ +PairOfCleats Sublime Text + +Install requirements: +- Node.js 18+ available on PATH (or set node_path in settings) +- PairOfCleats CLI available via npm or a local repo checkout + +Open the settings command palette entry: "PairOfCleats: Open Settings" to configure the CLI path and environment overrides. 
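+
+To confirm the plugin can reach the CLI, one quick manual check (paths illustrative, assuming a repo checkout) is to run a search directly:
+
+    node <repo>/bin/pairofcleats.js search "example" --json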
\ No newline at end of file diff --git a/sublime/PairOfCleats/plugin.py b/sublime/PairOfCleats/plugin.py new file mode 100644 index 000000000..389555c97 --- /dev/null +++ b/sublime/PairOfCleats/plugin.py @@ -0,0 +1,33 @@ +import sublime +import sublime_plugin + +from .lib import config +from .lib import watch +from .commands import index as _index_commands +from .commands import map as _map_commands +from .commands import search as _search_commands +from .commands import settings as _settings_commands +from .commands import validate as _validate_commands + +PLUGIN_NAME = 'PairOfCleats' + + +def plugin_loaded(): + config.prime_settings() + + +def plugin_unloaded(): + watch.stop_all() + + +class PairOfCleatsWindowListener(sublime_plugin.EventListener): + def on_window_command(self, window, command_name, args): + if command_name == 'close_window': + watch.stop(window) + + def on_post_window_command(self, window, command_name, args): + if command_name == 'close_window': + watch.stop(window) + + def on_exit(self): + watch.stop_all() diff --git a/tests/all.js b/tests/all.js index 3ba3c7085..09f77f4a4 100644 --- a/tests/all.js +++ b/tests/all.js @@ -1,12 +1,17 @@ #!/usr/bin/env node import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['skip-bench', 'skip-script-coverage'], - default: { 'skip-bench': false, 'skip-script-coverage': false } -}); +const argv = createCli({ + scriptName: 'test-all', + options: { + 'skip-bench': { type: 'boolean', default: false }, + 'skip-script-coverage': { type: 'boolean', default: false }, + retries: { type: 'number', default: 2 }, + 'log-dir': { type: 'string', default: '' } + } +}).parse(); const envSkipBench = process.env.PAIROFCLEATS_SKIP_BENCH === 'true' || process.env.PAIROFCLEATS_SKIP_BENCH === '1' @@ -29,7 +34,15 @@ const run = (label, args) => { }; if (!skipScriptCoverage) { - run('script-coverage-test', [path.join(root, 'tests', 'script-coverage.js')]); + const args = [path.join(root, 'tests', 'script-coverage.js')]; + const passRetries = process.argv.some((arg) => arg === '--retries' || arg.startsWith('--retries=')); + if (passRetries) { + args.push('--retries', String(argv.retries)); + } + if (argv['log-dir']) { + args.push('--log-dir', argv['log-dir']); + } + run('script-coverage-test', args); } if (!skipBench) { diff --git a/tests/api-server-stream.js b/tests/api-server-stream.js new file mode 100644 index 000000000..43b04e12c --- /dev/null +++ b/tests/api-server-stream.js @@ -0,0 +1,206 @@ +#!/usr/bin/env node +import http from 'node:http'; +import path from 'node:path'; +import readline from 'node:readline'; +import fsPromises from 'node:fs/promises'; +import { spawn, spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const cacheRoot = path.join(root, 'tests', '.cache', 'api-server-stream'); +const serverPath = path.join(root, 'tools', 'api-server.js'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const build = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot], + { env, stdio: 'inherit' } +); +if (build.status !== 0) { + 
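+  // Bail out before starting the server: the SSE assertions below need a freshly built index.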
console.error('api-server stream test failed: build_index failed'); + process.exit(build.status ?? 1); +} + +const server = spawn( + process.execPath, + [serverPath, '--port', '0', '--json', '--quiet', '--repo', fixtureRoot], + { env, stdio: ['ignore', 'pipe', 'pipe'] } +); + +let stderr = ''; +server.stderr?.on('data', (chunk) => { + stderr += chunk.toString(); +}); + +const readStartup = async () => { + const rl = readline.createInterface({ input: server.stdout }); + return await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + rl.close(); + reject(new Error('api-server startup timed out')); + }, 10000); + rl.once('line', (line) => { + clearTimeout(timeout); + rl.close(); + resolve(line); + }); + }); +}; + +const parseSse = (block) => { + const lines = block.split(/\r?\n/); + let event = 'message'; + let data = ''; + for (const line of lines) { + if (line.startsWith('event:')) { + event = line.replace('event:', '').trim(); + continue; + } + if (line.startsWith('data:')) { + data += line.replace('data:', '').trim(); + } + } + const payload = data ? JSON.parse(data) : null; + return { event, data: payload }; +}; + +const readSse = async (method, requestPath, body) => await new Promise((resolve, reject) => { + const payload = body ? JSON.stringify(body) : null; + const events = []; + let buffer = ''; + const req = http.request( + { + host: serverInfo.host, + port: serverInfo.port, + path: requestPath, + method, + headers: payload + ? { + 'Content-Type': 'application/json', + 'Content-Length': Buffer.byteLength(payload) + } + : {} + }, + (res) => { + res.on('data', (chunk) => { + buffer += chunk.toString(); + while (true) { + const idx = buffer.indexOf('\n\n'); + if (idx === -1) break; + const block = buffer.slice(0, idx).trim(); + buffer = buffer.slice(idx + 2); + if (!block) continue; + const parsed = parseSse(block); + events.push(parsed); + if (parsed.event === 'done') { + resolve(events); + req.destroy(); + break; + } + } + }); + res.on('end', () => resolve(events)); + } + ); + req.on('error', reject); + if (payload) req.write(payload); + req.end(); +}); + +const abortStream = async (method, requestPath, body) => await new Promise((resolve, reject) => { + const payload = body ? JSON.stringify(body) : null; + const req = http.request( + { + host: serverInfo.host, + port: serverInfo.port, + path: requestPath, + method, + headers: payload + ? 
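+          // JSON bodies carry Content-Type and an explicit Content-Length so the server reads the full payload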
{ + 'Content-Type': 'application/json', + 'Content-Length': Buffer.byteLength(payload) + } + : {} + }, + (res) => { + const timeout = setTimeout(() => { + req.destroy(); + resolve(); + }, 1000); + res.once('data', () => { + clearTimeout(timeout); + req.destroy(); + resolve(); + }); + res.on('error', (err) => { + clearTimeout(timeout); + if (err?.code === 'ECONNRESET') return resolve(); + reject(err); + }); + } + ); + req.on('error', (err) => { + if (err?.code === 'ECONNRESET') return resolve(); + reject(err); + }); + if (payload) req.write(payload); + req.end(); +}); + +let serverInfo = null; +try { + const line = await readStartup(); + serverInfo = JSON.parse(line || '{}'); + if (!serverInfo?.port) { + throw new Error('api-server did not report a listening port'); + } + + const statusEvents = await readSse('GET', '/status/stream'); + const statusResult = statusEvents.find((evt) => evt.event === 'result'); + if (!statusResult?.data?.status?.repo?.root) { + throw new Error('status stream missing repo payload'); + } + + const searchEvents = await readSse('POST', '/search/stream', { query: 'return', mode: 'code' }); + const searchResult = searchEvents.find((evt) => evt.event === 'result'); + const hits = searchResult?.data?.result?.code || []; + if (!hits.length) { + throw new Error('search stream returned no results'); + } + + await abortStream('POST', '/search/stream', { query: 'return', mode: 'code' }); + const followUp = await readSse('GET', '/status/stream'); + const followResult = followUp.find((evt) => evt.event === 'result'); + if (!followResult?.data?.status?.repo?.root) { + throw new Error('stream abort should not break subsequent requests'); + } +} catch (err) { + console.error(err?.message || err); + if (stderr.trim()) { + console.error(stderr.trim()); + } + server.kill('SIGKILL'); + process.exit(1); +} + +await new Promise((resolve) => { + const timeout = setTimeout(() => { + server.kill('SIGKILL'); + resolve(); + }, 5000); + server.once('exit', () => { + clearTimeout(timeout); + resolve(); + }); + server.kill('SIGTERM'); +}); + +console.log('api-server stream tests passed'); diff --git a/tests/api-server.js b/tests/api-server.js new file mode 100644 index 000000000..0f7b038c8 --- /dev/null +++ b/tests/api-server.js @@ -0,0 +1,164 @@ +#!/usr/bin/env node +import http from 'node:http'; +import path from 'node:path'; +import readline from 'node:readline'; +import fsPromises from 'node:fs/promises'; +import { spawn, spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const cacheRoot = path.join(root, 'tests', '.cache', 'api-server'); +const emptyRepo = path.join(cacheRoot, 'empty'); +const serverPath = path.join(root, 'tools', 'api-server.js'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +await fsPromises.mkdir(emptyRepo, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const build = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot], + { env, stdio: 'inherit' } +); +if (build.status !== 0) { + console.error('api-server test failed: build_index failed'); + process.exit(build.status ?? 
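+  // spawnSync reports status null when the child dies on a signal; coalesce that to a failing exit code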
1); +} + +const server = spawn( + process.execPath, + [serverPath, '--port', '0', '--json', '--quiet', '--repo', fixtureRoot], + { env, stdio: ['ignore', 'pipe', 'pipe'] } +); + +let stderr = ''; +server.stderr?.on('data', (chunk) => { + stderr += chunk.toString(); +}); + +const readStartup = async () => { + const rl = readline.createInterface({ input: server.stdout }); + return await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + rl.close(); + reject(new Error('api-server startup timed out')); + }, 10000); + rl.once('line', (line) => { + clearTimeout(timeout); + rl.close(); + resolve(line); + }); + }); +}; + +const requestJson = async (method, requestPath, body) => await new Promise((resolve, reject) => { + const host = serverInfo?.host || '127.0.0.1'; + const port = serverInfo?.port || 0; + const payload = body ? JSON.stringify(body) : null; + const req = http.request( + { + host, + port, + path: requestPath, + method, + headers: payload + ? { + 'Content-Type': 'application/json', + 'Content-Length': Buffer.byteLength(payload) + } + : {} + }, + (res) => { + let data = ''; + res.on('data', (chunk) => { + data += chunk.toString(); + }); + res.on('end', () => { + try { + resolve({ status: res.statusCode || 0, body: JSON.parse(data || '{}') }); + } catch (err) { + reject(err); + } + }); + } + ); + req.on('error', reject); + if (payload) req.write(payload); + req.end(); +}); + +let serverInfo = null; +try { + const line = await readStartup(); + serverInfo = JSON.parse(line || '{}'); + if (!serverInfo?.port) { + throw new Error('api-server did not report a listening port'); + } + + const health = await requestJson('GET', '/health'); + if (!health.body?.ok || typeof health.body.uptimeMs !== 'number') { + throw new Error('api-server /health response invalid'); + } + + const status = await requestJson('GET', '/status'); + if (!status.body?.ok || !status.body.status?.repo?.root) { + throw new Error('api-server /status response missing repo info'); + } + + const search = await requestJson('POST', '/search', { query: 'return', mode: 'code', top: 3 }); + const hits = search.body?.result?.code || []; + if (!search.body?.ok || !hits.length) { + throw new Error('api-server /search returned no results'); + } + if (hits[0]?.tokens !== undefined) { + throw new Error('api-server /search should default to compact JSON output'); + } + + const invalid = await requestJson('POST', '/search', {}); + if (invalid.status !== 400 || invalid.body?.ok !== false || invalid.body?.code !== 'INVALID_REQUEST') { + throw new Error('api-server should reject missing query'); + } + + const unknownField = await requestJson('POST', '/search', { + query: 'return', + extraField: true + }); + if (unknownField.status !== 400 || unknownField.body?.code !== 'INVALID_REQUEST') { + throw new Error('api-server should reject unknown fields'); + } + + const noIndex = await requestJson('POST', '/search', { + repoPath: emptyRepo, + query: 'return' + }); + if (noIndex.status !== 409 || noIndex.body?.code !== 'NO_INDEX') { + throw new Error('api-server should return NO_INDEX when indexes are missing'); + } +} catch (err) { + console.error(err?.message || err); + if (stderr.trim()) { + console.error(stderr.trim()); + } + server.kill('SIGKILL'); + process.exit(1); +} + +await new Promise((resolve) => { + const timeout = setTimeout(() => { + server.kill('SIGKILL'); + resolve(); + }, 5000); + server.once('exit', () => { + clearTimeout(timeout); + resolve(); + }); + server.kill('SIGTERM'); +}); + +console.log('api-server 
tests passed'); diff --git a/tests/artifact-bak-recovery.js b/tests/artifact-bak-recovery.js new file mode 100644 index 000000000..c2cdb26d4 --- /dev/null +++ b/tests/artifact-bak-recovery.js @@ -0,0 +1,85 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { readJsonFile, readJsonLinesArraySync } from '../src/shared/artifact-io.js'; +import { writeJsonLinesFile } from '../src/shared/json-stream.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'artifact-bak-recovery'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); + +const primaryPath = path.join(tempRoot, 'primary.json'); +const primaryBak = `${primaryPath}.bak`; +await fsPromises.writeFile(primaryPath, JSON.stringify({ ok: true })); +await fsPromises.writeFile(primaryBak, JSON.stringify({ ok: false })); + +const primary = readJsonFile(primaryPath); +if (!primary?.ok) { + console.error('artifact bak test failed: primary read did not return expected payload.'); + process.exit(1); +} +if (fs.existsSync(primaryBak)) { + console.error('artifact bak test failed: backup was not cleaned up after primary read.'); + process.exit(1); +} + +const corruptPath = path.join(tempRoot, 'corrupt.json'); +const corruptBak = `${corruptPath}.bak`; +await fsPromises.writeFile(corruptPath, '{bad json'); +await fsPromises.writeFile(corruptBak, JSON.stringify({ ok: 'backup' })); + +const fallback = readJsonFile(corruptPath); +if (fallback?.ok !== 'backup') { + console.error('artifact bak test failed: fallback did not return backup payload.'); + process.exit(1); +} +if (!fs.existsSync(corruptBak)) { + console.error('artifact bak test failed: backup should remain after fallback read.'); + process.exit(1); +} + +const missingPath = path.join(tempRoot, 'missing.json'); +const missingBak = `${missingPath}.bak`; +await fsPromises.writeFile(missingBak, JSON.stringify({ ok: 'onlybak' })); +const missing = readJsonFile(missingPath); +if (missing?.ok !== 'onlybak') { + console.error('artifact bak test failed: missing primary did not fall back to backup.'); + process.exit(1); +} +if (!fs.existsSync(missingBak)) { + console.error('artifact bak test failed: backup should remain when primary is missing.'); + process.exit(1); +} + +const jsonlPath = path.join(tempRoot, 'lines.jsonl'); +const jsonlBak = `${jsonlPath}.bak`; +await writeJsonLinesFile(jsonlPath, [{ id: 1 }, { id: 2 }], { atomic: false }); +await fsPromises.writeFile(jsonlBak, '{"id":3}\n'); +const jsonl = readJsonLinesArraySync(jsonlPath); +if (jsonl.length !== 2) { + console.error('artifact bak test failed: jsonl primary read did not return expected rows.'); + process.exit(1); +} +if (fs.existsSync(jsonlBak)) { + console.error('artifact bak test failed: jsonl backup was not cleaned up after primary read.'); + process.exit(1); +} + +const jsonlCorruptPath = path.join(tempRoot, 'lines-corrupt.jsonl'); +const jsonlCorruptBak = `${jsonlCorruptPath}.bak`; +await fsPromises.writeFile(jsonlCorruptPath, '{bad\n'); +await fsPromises.writeFile(jsonlCorruptBak, '{"id":4}\n{"id":5}\n'); +const jsonlFallback = readJsonLinesArraySync(jsonlCorruptPath); +if (jsonlFallback.length !== 2) { + console.error('artifact bak test failed: jsonl backup fallback did not return expected rows.'); + process.exit(1); +} +if (!fs.existsSync(jsonlCorruptBak)) { + console.error('artifact bak test failed: jsonl backup should remain after fallback 
read.'); + process.exit(1); +} + +console.log('artifact bak recovery tests passed'); diff --git a/tests/artifact-formats.js b/tests/artifact-formats.js new file mode 100644 index 000000000..b9bf9a416 --- /dev/null +++ b/tests/artifact-formats.js @@ -0,0 +1,78 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { loadIndex } from '../src/retrieval/cli-index.js'; + +const root = process.cwd(); +const cacheRoot = path.join(root, 'tests', '.cache', 'artifact-formats'); + +await fs.rm(cacheRoot, { recursive: true, force: true }); +await fs.mkdir(cacheRoot, { recursive: true }); + +const chunkMetaLines = [ + { id: 0, file: 'src/a.js', start: 0, end: 10, ext: '.js', kind: 'Function', name: 'alpha' }, + { id: 1, file: 'src/b.js', start: 0, end: 20, ext: '.js', kind: 'Function', name: 'beta' } +]; +await fs.writeFile( + path.join(cacheRoot, 'chunk_meta.jsonl'), + `${chunkMetaLines.map((row) => JSON.stringify(row)).join('\n')}\n` +); +await fs.writeFile( + path.join(cacheRoot, 'chunk_meta.json'), + JSON.stringify([{ id: 99, file: 'src/legacy.js', start: 0, end: 1, ext: '.js' }], null, 2) +); + +const shardsDir = path.join(cacheRoot, 'token_postings.shards'); +await fs.mkdir(shardsDir, { recursive: true }); + +const partA = { + vocab: ['alpha'], + postings: [[[0, 1]]] +}; +const partB = { + vocab: ['beta'], + postings: [[[1, 2]]] +}; + +const partAName = 'token_postings.part-00000.json'; +const partBName = 'token_postings.part-00001.json'; +await fs.writeFile(path.join(shardsDir, partAName), JSON.stringify(partA, null, 2)); +await fs.writeFile(path.join(shardsDir, partBName), JSON.stringify(partB, null, 2)); + +const meta = { + avgDocLen: 1.5, + totalDocs: 2, + format: 'sharded', + shardSize: 1, + vocabCount: 2, + parts: [ + path.join('token_postings.shards', partAName), + path.join('token_postings.shards', partBName) + ], + docLengths: [1, 2] +}; +await fs.writeFile( + path.join(cacheRoot, 'token_postings.meta.json'), + JSON.stringify(meta, null, 2) +); +await fs.writeFile( + path.join(cacheRoot, 'token_postings.json'), + JSON.stringify({ vocab: ['legacy'], postings: [[[0, 1]]], docLengths: [1], avgDocLen: 1, totalDocs: 1 }, null, 2) +); + +const idx = loadIndex(cacheRoot, { modelIdDefault: null, fileChargramN: 3 }); + +if (!idx || !Array.isArray(idx.chunkMeta) || idx.chunkMeta.length !== 2) { + console.error('Expected chunk_meta to load from JSONL.'); + process.exit(1); +} +if (!idx.tokenIndex || idx.tokenIndex.vocab?.length !== 2) { + console.error('Expected token_postings shards to load into tokenIndex.'); + process.exit(1); +} +if (!Array.isArray(idx.tokenIndex.docLengths) || idx.tokenIndex.docLengths.length !== 2) { + console.error('Expected docLengths to load from token_postings meta.'); + process.exit(1); +} + +console.log('artifact formats test passed'); diff --git a/tests/artifact-size-guardrails.js b/tests/artifact-size-guardrails.js new file mode 100644 index 000000000..179db2e73 --- /dev/null +++ b/tests/artifact-size-guardrails.js @@ -0,0 +1,118 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const cacheRoot = path.join(root, 'tests', '.cache', 'artifact-size-guardrails'); +const repoRoot = path.join(cacheRoot, 'repo'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await 
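+// rebuild the fixture repo from a clean slate so both guardrail runs see identical inputs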
fsPromises.mkdir(repoRoot, { recursive: true }); + +const tokens = []; +for (let i = 0; i < 200; i += 1) { + tokens.push(`token_${i}_${'x'.repeat(24)}`); +} +const lines = []; +for (let i = 0; i < tokens.length; i += 20) { + lines.push(tokens.slice(i, i + 20).join(' ')); +} +const content = `${lines.join('\n')}\n`; +for (let i = 0; i < 3; i += 1) { + await fsPromises.writeFile(path.join(repoRoot, `big-${i}.js`), content); +} +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ + sqlite: { + use: false + }, + indexing: { + fileScan: { + minified: { sampleMinBytes: 20000 } + }, + chunkTokenMode: 'full', + artifacts: { + chunkMetaFormat: 'json', + chunkMetaShardSize: 0, + tokenPostingsFormat: 'json' + } + } + }, null, 2) +); + +const baseEnv = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const runBuild = (label, envOverrides) => { + const result = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--mode', 'code', '--repo', repoRoot], + { cwd: repoRoot, env: { ...baseEnv, ...envOverrides }, stdio: 'inherit' } + ); + if (result.status !== 0) { + console.error(`Failed: build_index (${label})`); + process.exit(result.status ?? 1); + } +}; + +runBuild('artifact guardrails (small max)', { PAIROFCLEATS_MAX_JSON_BYTES: '2048' }); + +const userConfig = loadUserConfig(repoRoot); +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const indexDir = getIndexDir(repoRoot, 'code', userConfig); + +const chunkMetaMetaPath = path.join(indexDir, 'chunk_meta.meta.json'); +const chunkMetaPartsDir = path.join(indexDir, 'chunk_meta.parts'); +if (!fs.existsSync(chunkMetaMetaPath) || !fs.existsSync(chunkMetaPartsDir)) { + console.error('Expected chunk_meta sharding when max JSON bytes is small.'); + process.exit(1); +} +if (fs.existsSync(path.join(indexDir, 'chunk_meta.json'))) { + console.error('Expected chunk_meta.json to be suppressed when sharding.'); + process.exit(1); +} + +const tokenMetaPath = path.join(indexDir, 'token_postings.meta.json'); +const tokenShardsDir = path.join(indexDir, 'token_postings.shards'); +if (!fs.existsSync(tokenMetaPath) || !fs.existsSync(tokenShardsDir)) { + console.error('Expected token_postings shards when max JSON bytes is small.'); + process.exit(1); +} +if (fs.existsSync(path.join(indexDir, 'token_postings.json'))) { + console.error('Expected token_postings.json to be suppressed when sharding.'); + process.exit(1); +} + +runBuild('artifact guardrails (large max)', { PAIROFCLEATS_MAX_JSON_BYTES: '52428800' }); + +const nextIndexDir = getIndexDir(repoRoot, 'code', userConfig); +const nextChunkMetaMeta = path.join(nextIndexDir, 'chunk_meta.meta.json'); +const nextChunkMetaParts = path.join(nextIndexDir, 'chunk_meta.parts'); +if (fs.existsSync(nextChunkMetaMeta) || fs.existsSync(nextChunkMetaParts)) { + console.error('Expected chunk_meta to remain unsharded when max JSON bytes is large.'); + process.exit(1); +} +if (!fs.existsSync(path.join(nextIndexDir, 'chunk_meta.json'))) { + console.error('Expected chunk_meta.json when max JSON bytes is large.'); + process.exit(1); +} + +const nextTokenMetaPath = path.join(nextIndexDir, 'token_postings.meta.json'); +const nextTokenShardsDir = path.join(nextIndexDir, 'token_postings.shards'); +if (fs.existsSync(nextTokenMetaPath) || fs.existsSync(nextTokenShardsDir)) { + console.error('Expected token_postings to remain unsharded when max JSON bytes is large.'); + process.exit(1); +} +if 
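+// with a large byte budget the monolithic token_postings.json must be written instead of shards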
(!fs.existsSync(path.join(nextIndexDir, 'token_postings.json'))) { + console.error('Expected token_postings.json when max JSON bytes is large.'); + process.exit(1); +} + +console.log('artifact size guardrails test passed'); diff --git a/tests/artifacts/file-meta.test.js b/tests/artifacts/file-meta.test.js new file mode 100644 index 000000000..e23bad955 --- /dev/null +++ b/tests/artifacts/file-meta.test.js @@ -0,0 +1,31 @@ +#!/usr/bin/env node +import { buildFileMeta } from '../../src/index/build/artifacts/file-meta.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const state = { + chunks: [ + { file: 'a.js', ext: '.js' }, + { file: 'a.js', ext: '.js' }, + { file: 'b.js', ext: '.js' } + ] +}; + +const { fileMeta, fileIdByPath } = buildFileMeta(state); +if (fileMeta.length !== 2) { + fail('Expected fileMeta to contain one entry per file.'); +} +if (fileMeta[0].file !== 'a.js' || fileMeta[0].id !== 0) { + fail('Expected a.js to be assigned id 0.'); +} +if (fileMeta[1].file !== 'b.js' || fileMeta[1].id !== 1) { + fail('Expected b.js to be assigned id 1.'); +} +if (fileIdByPath.get('a.js') !== 0 || fileIdByPath.get('b.js') !== 1) { + fail('Expected fileIdByPath to map files to stable ids.'); +} + +console.log('artifact file meta tests passed'); diff --git a/tests/artifacts/token-mode.test.js b/tests/artifacts/token-mode.test.js new file mode 100644 index 000000000..b242a5f34 --- /dev/null +++ b/tests/artifacts/token-mode.test.js @@ -0,0 +1,40 @@ +#!/usr/bin/env node +import { resolveTokenMode } from '../../src/index/build/artifacts/token-mode.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const baseState = { + chunks: [{ tokens: ['a', 'b', 'c'] }] +}; + +const autoSample = resolveTokenMode({ + indexingConfig: {}, + state: baseState, + fileCounts: { candidates: 6000 } +}); +if (autoSample.resolvedTokenMode !== 'sample') { + fail('Expected auto mode to resolve to sample when file count exceeds max.'); +} + +const tokenBudgetSample = resolveTokenMode({ + indexingConfig: { chunkTokenMaxTokens: 1 }, + state: { chunks: [{ tokens: ['a', 'b'] }] }, + fileCounts: { candidates: 1 } +}); +if (tokenBudgetSample.resolvedTokenMode !== 'sample') { + fail('Expected auto mode to resolve to sample when token budget exceeded.'); +} + +const noneMode = resolveTokenMode({ + indexingConfig: { chunkTokenMode: 'none' }, + state: baseState, + fileCounts: { candidates: 0 } +}); +if (noneMode.resolvedTokenMode !== 'none') { + fail('Expected explicit chunkTokenMode=none to be respected.'); +} + +console.log('artifact token mode tests passed'); diff --git a/tests/backend-policy.js b/tests/backend-policy.js new file mode 100644 index 000000000..cb817b6a9 --- /dev/null +++ b/tests/backend-policy.js @@ -0,0 +1,83 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { resolveBackendPolicy } from '../src/storage/backend-policy.js'; + +const autoDefault = resolveBackendPolicy({ + backendArg: 'auto', + sqliteScoreModeConfig: false, + sqliteConfigured: true, + sqliteAvailable: true, + lmdbAvailable: true, + needsSqlite: true +}); +assert.equal(autoDefault.useSqlite, true); +assert.equal(autoDefault.useLmdb, false); +assert.equal(autoDefault.backendLabel, 'sqlite'); + +const autoChunkThreshold = resolveBackendPolicy({ + backendArg: 'auto', + sqliteConfigured: true, + sqliteAvailable: true, + sqliteAutoChunkThreshold: 10, + needsSqlite: true, + chunkCounts: [5] +}); +assert.equal(autoChunkThreshold.useSqlite, false); 
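+// Illustrative summary of the auto policy, as exercised by these assertions:
+//   chunk count below sqliteAutoChunkThreshold   -> skip sqlite (and lmdb)
+//   artifact bytes above sqliteAutoArtifactBytes -> keep sqlite
+//   sqlite unavailable but lmdb present          -> fall back to lmdb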
+assert.equal(autoChunkThreshold.useLmdb, false); + +const autoArtifactThreshold = resolveBackendPolicy({ + backendArg: 'auto', + sqliteConfigured: true, + sqliteAvailable: true, + sqliteAutoArtifactBytes: 100, + needsSqlite: true, + artifactBytes: [200] +}); +assert.equal(autoArtifactThreshold.useSqlite, true); + +const forcedMemory = resolveBackendPolicy({ + backendArg: 'memory', + sqliteConfigured: true, + sqliteAvailable: true, + lmdbAvailable: true, + needsSqlite: true +}); +assert.equal(forcedMemory.useSqlite, false); +assert.equal(forcedMemory.useLmdb, false); +assert.equal(forcedMemory.backendLabel, 'memory'); + +const forcedSqliteMissing = resolveBackendPolicy({ + backendArg: 'sqlite', + sqliteConfigured: true, + sqliteAvailable: false, + lmdbAvailable: true, + needsSqlite: true +}); +assert.ok(forcedSqliteMissing.error); + +const forcedLmdb = resolveBackendPolicy({ + backendArg: 'lmdb', + lmdbAvailable: true, + needsSqlite: true +}); +assert.equal(forcedLmdb.useLmdb, true); +assert.equal(forcedLmdb.backendLabel, 'lmdb'); + +const forcedLmdbMissing = resolveBackendPolicy({ + backendArg: 'lmdb', + lmdbAvailable: false, + needsSqlite: true +}); +assert.ok(forcedLmdbMissing.error); + +const autoFallbackLmdb = resolveBackendPolicy({ + backendArg: 'auto', + sqliteConfigured: true, + sqliteAvailable: false, + lmdbAvailable: true, + needsSqlite: true +}); +assert.equal(autoFallbackLmdb.useLmdb, true); +assert.equal(autoFallbackLmdb.backendLabel, 'lmdb'); + +console.log('backend-policy test passed'); diff --git a/tests/bench-language-lock-semantics.js b/tests/bench-language-lock-semantics.js new file mode 100644 index 000000000..4abc50b5d --- /dev/null +++ b/tests/bench-language-lock-semantics.js @@ -0,0 +1,45 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { checkIndexLock } from '../tools/bench/language/locks.js'; + +const root = path.join(process.cwd(), 'tests', '.cache', 'bench-language-lock-semantics'); +const locksDir = path.join(root, 'locks'); +const lockPath = path.join(locksDir, 'index.lock'); + +await fsPromises.rm(root, { recursive: true, force: true }); +await fsPromises.mkdir(locksDir, { recursive: true }); + +const staleStartedAt = new Date(Date.now() - 5000).toISOString(); +await fsPromises.writeFile(lockPath, JSON.stringify({ pid: process.pid, startedAt: staleStartedAt })); + +const staleResult = await checkIndexLock({ + repoCacheRoot: root, + repoLabel: 'repo', + lockMode: 'fail-fast', + lockWaitMs: 0, + lockStaleMs: 1000, + onLog: () => {} +}); +assert.equal(staleResult.ok, true, 'expected stale lock to be cleared'); +assert.equal(staleResult.cleared, true, 'expected stale lock to report cleared'); +const staleExists = await fsPromises.stat(lockPath).then(() => true).catch(() => false); +assert.equal(staleExists, false, 'expected stale lock file to be removed'); + +await fsPromises.mkdir(locksDir, { recursive: true }); +await fsPromises.writeFile(lockPath, JSON.stringify({ pid: process.pid, startedAt: new Date().toISOString() })); + +const activeResult = await checkIndexLock({ + repoCacheRoot: root, + repoLabel: 'repo', + lockMode: 'fail-fast', + lockWaitMs: 0, + lockStaleMs: 60 * 60 * 1000, + onLog: () => {} +}); +assert.equal(activeResult.ok, false, 'expected active lock to block'); +assert.equal(activeResult.detail.pid, process.pid, 'expected active lock pid to be reported'); +assert.equal(activeResult.detail.alive, true, 'expected active lock pid to be alive'); + 
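+// Recap: locks older than lockStaleMs are cleared and reported; a fresh lock held by a live pid blocks fail-fast runs.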
+console.log('bench-language lock semantics test passed'); diff --git a/tests/bench-language-lock.js b/tests/bench-language-lock.js new file mode 100644 index 000000000..b811d639f --- /dev/null +++ b/tests/bench-language-lock.js @@ -0,0 +1,82 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getRepoCacheRoot } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'bench-language-lock'); +const reposRoot = path.join(tempRoot, 'repos'); +const cacheRoot = path.join(tempRoot, 'cache'); +const resultsRoot = path.join(tempRoot, 'results'); +const configPath = path.join(tempRoot, 'repos.json'); +const queriesPath = path.join(root, 'tests', 'fixtures', 'sample', 'queries.txt'); +const repoId = 'test/lock-repo'; +const repoPath = path.join(reposRoot, 'javascript', repoId.replace('/', '__')); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoPath, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +await fsPromises.mkdir(resultsRoot, { recursive: true }); + +await fsPromises.writeFile(path.join(repoPath, 'README.md'), 'bench lock test'); + +const config = { + javascript: { + label: 'JavaScript', + queries: queriesPath, + repos: { + typical: [repoId] + } + } +}; +await fsPromises.writeFile(configPath, JSON.stringify(config, null, 2)); + +const repoCacheRoot = getRepoCacheRoot(repoPath, { cache: { root: cacheRoot } }); +const lockDir = path.join(repoCacheRoot, 'locks'); +await fsPromises.mkdir(lockDir, { recursive: true }); +await fsPromises.writeFile( + path.join(lockDir, 'index.lock'), + JSON.stringify({ pid: process.pid, startedAt: new Date().toISOString() }) +); + +const scriptPath = path.join(root, 'tools', 'bench-language-repos.js'); +const result = spawnSync( + process.execPath, + [ + scriptPath, + '--config', + configPath, + '--root', + reposRoot, + '--cache-root', + cacheRoot, + '--results', + resultsRoot, + '--no-clone', + '--dry-run', + '--lock-mode', + 'fail-fast', + '--json' + ], + { encoding: 'utf8' } +); + +if (result.status !== 0) { + console.error(result.stderr || 'bench-language-lock test failed'); + process.exit(result.status ?? 1); +} + +const payload = JSON.parse(result.stdout || '{}'); +const task = Array.isArray(payload.tasks) ? 
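+  // the dry run lists exactly one task for the single configured repo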
payload.tasks[0] : null; +if (!task || !task.skipped) { + console.error('Expected bench task to be skipped due to lock.'); + process.exit(1); +} +if (task.skipReason !== 'lock') { + console.error(`Expected skipReason=lock, got ${task.skipReason}`); + process.exit(1); +} + +console.log('bench-language lock test passed'); diff --git a/tests/bench-language-progress-parse.js b/tests/bench-language-progress-parse.js new file mode 100644 index 000000000..4139ecca4 --- /dev/null +++ b/tests/bench-language-progress-parse.js @@ -0,0 +1,50 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { + parseFileProgressLine, + parseImportStatsLine, + parseLineProgress, + parseProgressLine, + parseScanMode, + parseShardLine +} from '../tools/bench/language/progress/parse.js'; + +const shard = parseShardLine('-> Shard 2/5: python (42 files)'); +assert.deepEqual(shard, { + index: 2, + total: 5, + shardLabel: 'python', + fileCount: 42 +}); + +const importStats = parseImportStatsLine('\u2192 Imports: modules=12, edges=34, files=56'); +assert.deepEqual(importStats, { modules: 12, edges: 34, files: 56 }); + +const combined = parseFileProgressLine('Files 10/100 (10.0%) [shard 2/5] File 3/10 lines 1,234 src/index.js'); +assert.equal(combined.count, 10); +assert.equal(combined.total, 100); +assert.equal(combined.pct, 10); +assert.equal(combined.shardLabel, '2/5'); +assert.equal(combined.fileIndex, 3); +assert.equal(combined.fileTotal, 10); +assert.equal(combined.file, 'src/index.js'); + +const fileOnly = parseFileProgressLine('File 7/12 src/lib.rs'); +assert.equal(fileOnly.count, null); +assert.equal(fileOnly.total, null); +assert.equal(fileOnly.pct, null); +assert.equal(fileOnly.shardLabel, ''); +assert.equal(fileOnly.fileIndex, 7); +assert.equal(fileOnly.fileTotal, 12); +assert.equal(fileOnly.file, 'src/lib.rs'); + +const progress = parseProgressLine('Files 90/200 (45.0%)'); +assert.deepEqual(progress, { step: 'Files', count: 90, total: 200, pct: 45 }); + +const lineProgress = parseLineProgress('Line 5 / 20'); +assert.deepEqual(lineProgress, { current: 5, total: 20 }); + +assert.equal(parseScanMode('Scanning code'), 'code'); +assert.equal(parseScanMode('Scanning prose'), 'prose'); + +console.log('bench-language progress parse test passed'); diff --git a/tests/bench-language-repos.js b/tests/bench-language-repos.js new file mode 100644 index 000000000..7f1733450 --- /dev/null +++ b/tests/bench-language-repos.js @@ -0,0 +1,38 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { formatShardFileProgress } from '../src/shared/bench-progress.js'; + +const root = process.cwd(); +const scriptPath = path.join(root, 'tools', 'bench-language-repos.js'); +const result = spawnSync(process.execPath, [scriptPath, '--list', '--json'], { encoding: 'utf8' }); +if (result.status !== 0) { + console.error(result.stderr || 'bench-language-repos failed'); + process.exit(result.status ?? 
1); +} + +const payload = JSON.parse(result.stdout || '{}'); +assert.ok(Array.isArray(payload.languages), 'languages array missing'); +assert.ok(payload.languages.includes('javascript'), 'javascript language missing'); +assert.ok(payload.languages.includes('shell'), 'shell language missing'); +assert.ok(Array.isArray(payload.tasks), 'tasks array missing'); +assert.ok(payload.tasks.length > 0, 'no benchmark tasks listed'); + +const shardByLabel = new Map([['src', { index: 2, total: 4 }]]); +const progressLine = formatShardFileProgress( + { + fileIndex: 175, + fileTotal: 176, + pct: 99.4, + shardLabel: 'src', + file: 'src/app.js' + }, + { shardByLabel, lineTotal: 123 } +); +assert.ok(progressLine.startsWith('[shard 2/4] 175/176 (99.4%)'), 'shard prefix missing'); +assert.ok(progressLine.includes('lines 123'), 'line count missing'); +assert.ok(progressLine.includes('src/app.js'), 'file path missing'); +assert.ok(!progressLine.includes('Files '), 'legacy Files label should be removed'); + +console.log('bench-language-repos test passed.'); diff --git a/tests/bench-progress-format.js b/tests/bench-progress-format.js new file mode 100644 index 000000000..9894ea7ef --- /dev/null +++ b/tests/bench-progress-format.js @@ -0,0 +1,30 @@ +#!/usr/bin/env node +import { formatShardFileProgress } from '../src/shared/bench-progress.js'; + +const shardByLabel = new Map([['alpha', { index: 2, total: 10 }]]); +const output = formatShardFileProgress({ + shardLabel: 'alpha', + fileIndex: 5, + fileTotal: 20, + pct: 25.0, + file: 'src/app.js' +}, { shardByLabel, lineTotal: 100 }); + +if (!output.includes('[shard 2/10]')) { + console.error('bench progress format test failed: missing shard index'); + process.exit(1); +} +if (!output.includes('5/20')) { + console.error('bench progress format test failed: missing file counts'); + process.exit(1); +} +if (!output.includes('lines 100')) { + console.error('bench progress format test failed: missing line count'); + process.exit(1); +} +if (!output.includes('src/app.js')) { + console.error('bench progress format test failed: missing file path'); + process.exit(1); +} + +console.log('bench progress format test passed'); diff --git a/tests/bench.js b/tests/bench.js index 11d500320..f9fc55836 100644 --- a/tests/bench.js +++ b/tests/bench.js @@ -1,21 +1,53 @@ #!/usr/bin/env node +import fsSync from 'node:fs'; import fs from 'node:fs/promises'; import path from 'node:path'; -import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; - -const argv = minimist(process.argv.slice(2), { - boolean: ['ann', 'no-ann', 'json', 'write-report', 'build', 'build-index', 'build-sqlite', 'incremental', 'stub-embeddings'], - string: ['queries', 'backend', 'out', 'bm25-k1', 'bm25-b', 'fts-profile', 'fts-weights'], - alias: { n: 'top', q: 'queries' }, - default: { top: 5, limit: 0, json: false, 'write-report': false } +import { spawn, spawnSync } from 'node:child_process'; +import { createCli } from '../src/shared/cli.js'; +import { BENCH_OPTIONS, validateBenchArgs } from '../src/shared/cli-options.js'; +import { getIndexDir, getRuntimeConfig, loadUserConfig, resolveRuntimeEnv, resolveSqlitePaths } from '../tools/dict-utils.js'; +import { getEnvConfig } from '../src/shared/env.js'; +import { runWithConcurrency } from '../src/shared/concurrency.js'; +import os from 'node:os'; +import { createSafeRegex, normalizeSafeRegexConfig } from '../src/shared/safe-regex.js'; +import { build as buildHistogram } from 'hdr-histogram-js'; + +const rawArgs = process.argv.slice(2); +const 
argv = createCli({ + scriptName: 'bench', + options: BENCH_OPTIONS, + aliases: { n: 'top', q: 'queries' } +}).parse(); +validateBenchArgs(argv); + +const safeRegexConfig = normalizeSafeRegexConfig({ + maxPatternLength: 64, + maxInputLength: 64, + timeoutMs: 10, + flags: 'i' }); +const safeRegex = createSafeRegex('a+b', '', safeRegexConfig); +if (!safeRegex || !safeRegex.test('Aaab')) { + console.error('Safe regex self-check failed.'); + process.exit(1); +} +const rejected = createSafeRegex('a'.repeat(128), '', safeRegexConfig); +if (rejected) { + console.error('Safe regex maxPatternLength guard failed.'); + process.exit(1); +} +if (safeRegex.test('a'.repeat(100))) { + console.error('Safe regex maxInputLength guard failed.'); + process.exit(1); +} const root = process.cwd(); +const repoArg = argv.repo ? path.resolve(argv.repo) : null; const searchPath = path.join(root, 'search.js'); const reportPath = path.join(root, 'tools', 'report-artifacts.js'); const buildIndexPath = path.join(root, 'build_index.js'); const buildSqlitePath = path.join(root, 'tools', 'build-sqlite-index.js'); +const indexerServicePath = path.join(root, 'tools', 'indexer-service.js'); const defaultQueriesPath = path.join(root, 'tests', 'parity-queries.txt'); const queriesPath = argv.queries ? path.resolve(argv.queries) : defaultQueriesPath; @@ -41,8 +73,10 @@ if (!queries.length) { const topN = Math.max(1, parseInt(argv.top, 10) || 5); const limit = Math.max(0, parseInt(argv.limit, 10) || 0); const selectedQueries = limit > 0 ? queries.slice(0, limit) : queries; -const annEnabled = argv.ann !== false; +const annFlagPresent = rawArgs.includes('--ann') || rawArgs.includes('--no-ann'); +const annEnabled = annFlagPresent ? argv.ann === true : true; const annArg = annEnabled ? '--ann' : '--no-ann'; +const jsonOutput = argv.json === true; const bm25K1Arg = argv['bm25-k1']; const bm25BArg = argv['bm25-b']; const ftsProfileArg = argv['fts-profile']; @@ -56,16 +90,128 @@ function resolveBackends(value) { return Array.from(new Set(list.map((entry) => entry.trim()).filter(Boolean))); } const backends = resolveBackends(argv.backend); -const buildIndex = argv['build-index'] || argv.build; -const buildSqlite = argv['build-sqlite'] || argv.build; -const buildIncremental = argv.incremental === true; -const stubEmbeddings = argv['stub-embeddings'] === true; +let buildIndex = argv['build-index'] || argv.build; +let buildSqlite = argv['build-sqlite'] || argv.build; +const buildIncremental = argv.incremental === true || buildSqlite; +const envConfig = getEnvConfig(); +const indexProfileArg = typeof argv['index-profile'] === 'string' + ? argv['index-profile'].trim() + : ''; +const noIndexProfile = rawArgs.includes('--no-index-profile'); +const originalEnvProfile = process.env.PAIROFCLEATS_PROFILE; +const indexProfileRaw = indexProfileArg; +const suppressEnvProfile = noIndexProfile && !indexProfileRaw; +if (suppressEnvProfile) { + delete process.env.PAIROFCLEATS_PROFILE; +} +const runtimeRoot = repoArg || root; +const userConfig = loadUserConfig( + runtimeRoot, + indexProfileRaw ? 
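+  // pass the explicit --index-profile through to config loading when provided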
{ profile: indexProfileRaw } : {} +); +if (suppressEnvProfile) { + if (originalEnvProfile === undefined) { + delete process.env.PAIROFCLEATS_PROFILE; + } else { + process.env.PAIROFCLEATS_PROFILE = originalEnvProfile; + } +} +const runtimeConfig = getRuntimeConfig(runtimeRoot, userConfig); +const embeddingProvider = userConfig.indexing?.embeddings?.provider || 'xenova'; +const needsMemory = backends.includes('memory'); +const needsSqlite = backends.some((entry) => entry.startsWith('sqlite')); +const hasIndex = (mode) => { + const dir = getIndexDir(runtimeRoot, mode, userConfig); + const metaPaths = [ + 'chunk_meta.json', + 'chunk_meta.jsonl', + 'chunk_meta.meta.json', + 'chunk_meta.parts' + ]; + return metaPaths.some((entry) => fsSync.existsSync(path.join(dir, entry))); +}; +const hasSqliteIndex = (mode) => { + const paths = resolveSqlitePaths(runtimeRoot, userConfig); + const target = mode === 'prose' ? paths.prosePath : paths.codePath; + return fsSync.existsSync(target); +}; +if (needsMemory && !buildIndex && (!hasIndex('code') || !hasIndex('prose'))) { + buildIndex = true; + if (!jsonOutput) { + console.log('[bench] Missing file-backed index; enabling build-index.'); + } +} +if (needsSqlite && !buildSqlite && (!hasSqliteIndex('code') || !hasSqliteIndex('prose'))) { + buildSqlite = true; + if (!jsonOutput) { + console.log('[bench] Missing sqlite index; enabling build-sqlite.'); + } +} +if (buildSqlite && !buildIndex) buildIndex = true; +const heapArgRaw = argv['heap-mb']; +const heapArg = Number.isFinite(Number(heapArgRaw)) ? Math.floor(Number(heapArgRaw)) : null; +const heapRecommendation = getRecommendedHeapMb(); +const baseNodeOptions = stripMaxOldSpaceFlag(process.env.NODE_OPTIONS || ''); +const hasHeapFlag = baseNodeOptions.includes('--max-old-space-size'); +let heapOverride = null; +if (Number.isFinite(heapArg) && heapArg > 0) { + heapOverride = heapArg; +} else if ( + !Number.isFinite(runtimeConfig.maxOldSpaceMb) + && !envConfig.maxOldSpaceMb + && !hasHeapFlag +) { + heapOverride = heapRecommendation.recommendedMb; +} +const runtimeConfigForRun = heapOverride + ? { ...runtimeConfig, maxOldSpaceMb: heapOverride } + : runtimeConfig; +const envStubEmbeddings = envConfig.embeddings === 'stub'; +const realEmbeddings = argv['real-embeddings'] === true; +const stubEmbeddings = argv['stub-embeddings'] === true + || (!realEmbeddings && envStubEmbeddings); +const baseEnvInput = { ...process.env }; +if (baseNodeOptions) { + baseEnvInput.NODE_OPTIONS = baseNodeOptions; +} else { + delete baseEnvInput.NODE_OPTIONS; +} +const baseEnv = resolveRuntimeEnv(runtimeConfigForRun, baseEnvInput); +const profileArgPresent = rawArgs.includes('--profile') || rawArgs.includes('--index-profile'); +if (noIndexProfile && !profileArgPresent && baseEnv.PAIROFCLEATS_PROFILE) { + delete baseEnv.PAIROFCLEATS_PROFILE; +} +if (realEmbeddings && baseEnv.PAIROFCLEATS_EMBEDDINGS) { + delete baseEnv.PAIROFCLEATS_EMBEDDINGS; +} +if (heapOverride) { + baseEnv.PAIROFCLEATS_MAX_OLD_SPACE_MB = String(heapOverride); + if (!jsonOutput) { + console.log( + `[bench] heap ${formatGb(heapOverride)} (${heapOverride} MB) ` + + `(override with --heap-mb or PAIROFCLEATS_MAX_OLD_SPACE_MB)` + ); + } +} +const benchEnvWithProfile = indexProfileRaw + ? 
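+  // forward the explicit profile to child build/search processes via PAIROFCLEATS_PROFILE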
{ ...baseEnv, PAIROFCLEATS_PROFILE: indexProfileRaw } + : baseEnv; + +function logBench(message) { + if (!message) return; + if (jsonOutput) { + process.stderr.write(`${message}\n`); + } else { + console.log(message); + } +} function runSearch(query, backend) { const args = [ searchPath, query, '--json', + '--json-compact', '--stats', '--backend', backend, @@ -73,17 +219,47 @@ function runSearch(query, backend) { String(topN), annArg ]; + if (repoArg) args.push('--repo', repoArg); if (bm25K1Arg) args.push('--bm25-k1', String(bm25K1Arg)); if (bm25BArg) args.push('--bm25-b', String(bm25BArg)); if (ftsProfileArg) args.push('--fts-profile', String(ftsProfileArg)); if (ftsWeightsArg) args.push('--fts-weights', String(ftsWeightsArg)); - const result = spawnSync(process.execPath, args, { encoding: 'utf8' }); - if (result.status !== 0) { - console.error(`Search failed for backend=${backend} query="${query}"`); - if (result.stderr) console.error(result.stderr.trim()); - process.exit(result.status ?? 1); + const env = { ...benchEnvWithProfile }; + if (stubEmbeddings) { + env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + } else { + delete env.PAIROFCLEATS_EMBEDDINGS; } - return JSON.parse(result.stdout || '{}'); + return new Promise((resolve) => { + const child = spawn(process.execPath, args, { env, stdio: ['ignore', 'pipe', 'pipe'] }); + let stdout = ''; + let stderr = ''; + child.stdout.on('data', (chunk) => { + stdout += chunk; + }); + child.stderr.on('data', (chunk) => { + stderr += chunk; + }); + child.on('error', (err) => { + console.error(`Search failed to start for backend=${backend} query="${query}"`); + if (err?.message) console.error(err.message); + process.exit(1); + }); + child.on('close', (code) => { + if (code !== 0) { + console.error(`Search failed for backend=${backend} query="${query}"`); + if (stderr) console.error(stderr.trim()); + process.exit(code ?? 1); + } + try { + resolve(JSON.parse(stdout || '{}')); + } catch (err) { + console.error(`Search response parse failed for backend=${backend} query="${query}"`); + if (stderr) console.error(stderr.trim()); + process.exit(1); + } + }); + }); } function mean(values) { @@ -91,27 +267,88 @@ function mean(values) { return values.reduce((a, b) => a + b, 0) / values.length; } -function percentile(sortedValues, pct) { - if (!sortedValues.length) return 0; - const idx = Math.min(sortedValues.length - 1, Math.max(0, Math.floor((pct / 100) * (sortedValues.length - 1)))); - return sortedValues[idx]; +function buildPercentileHistogram(values, scale) { + if (!values.length) return null; + const scaled = values.map((value) => Math.max(1, Math.round(value * scale))); + const maxValue = Math.max(...scaled, 1); + const histogram = buildHistogram({ + lowestDiscernibleValue: 1, + highestTrackableValue: maxValue, + numberOfSignificantValueDigits: 3 + }); + scaled.forEach((value) => histogram.recordValue(value)); + return histogram; } -function buildStats(values) { - if (!values.length) return { mean: 0, p50: 0, p95: 0, min: 0, max: 0 }; - const sorted = [...values].sort((a, b) => a - b); +function buildStats(values, { scale = 1 } = {}) { + if (!values.length) return { mean: 0, p50: 0, p95: 0, p99: 0, min: 0, max: 0 }; + const histogram = buildPercentileHistogram(values, scale); + const pct = (value) => (histogram ? 
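+  // samples were scaled to integers >= 1 before recording, so divide the percentile back by the same scale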
histogram.getValueAtPercentile(value) / scale : 0); return { mean: mean(values), - p50: percentile(sorted, 50), - p95: percentile(sorted, 95), - min: sorted[0], - max: sorted[sorted.length - 1] + p50: pct(50), + p95: pct(95), + p99: pct(99), + min: Math.min(...values), + max: Math.max(...values) + }; +} + +function stripMaxOldSpaceFlag(options) { + if (!options) return ''; + return options + .replace(/--max-old-space-size=\d+/g, '') + .replace(/\s+/g, ' ') + .trim(); +} + +function formatGb(mb) { + return `${(mb / 1024).toFixed(1)} GB`; +} + +function formatDuration(ms) { + const total = Math.max(0, Math.floor(ms / 1000)); + const hours = Math.floor(total / 3600); + const minutes = Math.floor((total % 3600) / 60); + const seconds = total % 60; + if (hours > 0) return `${hours}h ${minutes}m ${seconds}s`; + if (minutes > 0) return `${minutes}m ${seconds}s`; + return `${seconds}s`; +} + +function formatDurationMs(ms) { + if (!Number.isFinite(ms)) return 'n/a'; + if (ms < 1000) return `${Math.max(0, Math.round(ms))}ms`; + return formatDuration(ms); +} + +function formatRate(value, unit) { + if (!Number.isFinite(value)) return 'n/a'; + const rounded = value >= 100 ? value.toFixed(0) : value >= 10 ? value.toFixed(1) : value.toFixed(2); + return `${rounded} ${unit}/s`; +} + +function getRecommendedHeapMb() { + const totalMb = Math.floor(os.totalmem() / (1024 * 1024)); + const recommended = Math.max(4096, Math.floor(totalMb * 0.75)); + const rounded = Math.floor(recommended / 256) * 256; + return { + totalMb, + recommendedMb: Math.max(4096, rounded) }; } function runBuild(args, label, env) { const start = Date.now(); - const result = spawnSync(process.execPath, args, { env, stdio: 'inherit' }); + const result = spawnSync(process.execPath, args, { + env, + encoding: 'utf8', + stdio: jsonOutput ? ['ignore', 'pipe', 'pipe'] : 'inherit' + }); + if (jsonOutput) { + if (result.stdout) process.stderr.write(result.stdout); + if (result.stderr) process.stderr.write(result.stderr); + } if (result.status !== 0) { console.error(`Build failed: ${label}`); process.exit(result.status ?? 1); @@ -119,98 +356,322 @@ function runBuild(args, label, env) { return Date.now() - start; } +function runServiceQueue(queueName, env) { + const args = [indexerServicePath, 'work', '--queue', queueName, '--concurrency', '1']; + const result = spawnSync(process.execPath, args, { + env, + encoding: 'utf8', + stdio: jsonOutput ? ['ignore', 'pipe', 'pipe'] : 'inherit' + }); + if (jsonOutput) { + if (result.stdout) process.stderr.write(result.stdout); + if (result.stderr) process.stderr.write(result.stderr); + } + if (result.status !== 0) { + console.error(`Service queue failed: ${queueName}`); + process.exit(result.status ?? 1); + } +} + const buildMs = {}; if (buildIndex || buildSqlite) { - const buildEnv = { ...process.env }; - if (stubEmbeddings) buildEnv.PAIROFCLEATS_EMBEDDINGS = 'stub'; + const buildEnv = { ...benchEnvWithProfile }; + if (Number.isFinite(Number(argv.threads)) && Number(argv.threads) > 0) { + buildEnv.PAIROFCLEATS_THREADS = String(argv.threads); + } + if (stubEmbeddings) { + buildEnv.PAIROFCLEATS_EMBEDDINGS = 'stub'; + } else { + delete buildEnv.PAIROFCLEATS_EMBEDDINGS; + } + const twoStageConfig = userConfig.indexing?.twoStage || {}; + const useStageQueue = twoStageConfig.enabled === true + && twoStageConfig.background === true + && twoStageConfig.queue !== false; + const embeddingMode = typeof userConfig.indexing?.embeddings?.mode === 'string' + ? 
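+    // Trim and lowercase so the 'service' comparison below tolerates config formatting.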
userConfig.indexing.embeddings.mode.trim().toLowerCase() + : ''; + const embeddingsEnabled = userConfig.indexing?.embeddings?.enabled !== false; + const useEmbeddingService = embeddingsEnabled && embeddingMode === 'service'; if (buildIndex) { const args = [buildIndexPath]; + if (repoArg) args.push('--repo', repoArg); if (stubEmbeddings) args.push('--stub-embeddings'); if (buildIncremental) args.push('--incremental'); + if (argv.threads) args.push('--threads', String(argv.threads)); buildMs.index = runBuild(args, 'build index', buildEnv); + if (useStageQueue) { + runServiceQueue('index', buildEnv); + logBench('[bench] Stage2 enrichment complete; continuing with benchmark queries.'); + } } if (buildSqlite) { const args = [buildSqlitePath]; + if (repoArg) args.push('--repo', repoArg); if (buildIncremental) args.push('--incremental'); buildMs.sqlite = runBuild(args, 'build sqlite', buildEnv); } + if (buildIndex && useEmbeddingService) { + runServiceQueue('embeddings', buildEnv); + } } -const latency = {}; -const memoryRss = {}; -const hitCounts = {}; -const resultCounts = {}; -for (const backend of backends) { - latency[backend] = []; - memoryRss[backend] = []; - hitCounts[backend] = 0; - resultCounts[backend] = []; +const queryTasks = []; +let queryIndex = 0; +for (const query of selectedQueries) { + queryIndex += 1; + for (const backend of backends) { + queryTasks.push({ query, backend, queryIndex }); + } } -for (const query of selectedQueries) { +const queryConcurrencyRaw = Number(argv['query-concurrency']); +const queryConcurrencyList = Number.isFinite(queryConcurrencyRaw) && queryConcurrencyRaw > 0 + ? [Math.floor(queryConcurrencyRaw)] + : [4]; + +const runQueries = async (requestedConcurrency) => { + const latency = {}; + const memoryRss = {}; + const hitCounts = {}; + const resultCounts = {}; for (const backend of backends) { - const payload = runSearch(query, backend); - latency[backend].push(payload.stats?.elapsedMs || 0); + latency[backend] = []; + memoryRss[backend] = []; + hitCounts[backend] = 0; + resultCounts[backend] = []; + } + + const totalSearches = selectedQueries.length * backends.length; + const queryProgress = { + count: 0, + startMs: Date.now(), + lastLogMs: 0, + lastPct: 0 + }; + const logQueryProgress = (force = false) => { + if (!totalSearches) return; + const now = Date.now(); + const pct = (queryProgress.count / totalSearches) * 100; + const elapsedMs = now - queryProgress.startMs; + const rate = elapsedMs > 0 ? queryProgress.count / (elapsedMs / 1000) : 0; + const remaining = totalSearches - queryProgress.count; + const etaMs = rate > 0 && remaining > 0 ? (remaining / rate) * 1000 : 0; + const shouldLog = force + || queryProgress.count === totalSearches + || now - queryProgress.lastLogMs >= 10000 + || pct - queryProgress.lastPct >= 5; + if (!shouldLog) return; + const elapsedText = formatDuration(elapsedMs); + const avgSearchText = queryProgress.count + ? formatDurationMs(elapsedMs / queryProgress.count) + : 'n/a'; + const avgQueryText = selectedQueries.length + ? formatDurationMs(elapsedMs / selectedQueries.length) + : 'n/a'; + const etaText = etaMs > 0 ? 
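+      // ETA extrapolates from the observed search rate; shown as n/a until there is enough data.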
formatDuration(etaMs) : 'n/a'; + logBench( + `[bench] Queries ${queryProgress.count}/${totalSearches} (${pct.toFixed(1)}%) | ` + + `concurrency ${requestedConcurrency} | elapsed ${elapsedText} | ` + + `avg/search ${avgSearchText} | avg/query ${avgQueryText} | eta ${etaText}` + ); + queryProgress.lastLogMs = now; + queryProgress.lastPct = pct; + }; + + logBench( + `[bench] Running ${selectedQueries.length} queries across ${backends.length} backends ` + + `(${totalSearches} searches) with concurrency ${requestedConcurrency}.` + ); + logQueryProgress(true); + + const loggedQueries = new Set(); + const runQueryTask = async (task) => { + if (!loggedQueries.has(task.queryIndex)) { + loggedQueries.add(task.queryIndex); + logBench( + `[bench] (concurrency ${requestedConcurrency}) Query ` + + `${task.queryIndex}/${selectedQueries.length}: ${task.query}` + ); + } + const payload = await runSearch(task.query, task.backend); + queryProgress.count += 1; + logQueryProgress(); + const elapsedMs = Number(payload.stats?.elapsedMs); + if (!Number.isFinite(elapsedMs)) { + console.error(`[bench] Missing timing stats for backend=${task.backend} query="${task.query}"`); + process.exit(1); + } + latency[task.backend].push(elapsedMs); const codeHits = Array.isArray(payload.code) ? payload.code.length : 0; const proseHits = Array.isArray(payload.prose) ? payload.prose.length : 0; const totalHits = codeHits + proseHits; - resultCounts[backend].push(totalHits); - if (totalHits > 0) hitCounts[backend] += 1; + resultCounts[task.backend].push(totalHits); + if (totalHits > 0) hitCounts[task.backend] += 1; const rss = payload.stats?.memory?.rss; - if (Number.isFinite(rss)) memoryRss[backend].push(rss); + if (Number.isFinite(rss)) memoryRss[task.backend].push(rss); + }; + if (queryTasks.length) { + await runWithConcurrency( + queryTasks, + Math.max(1, Math.min(requestedConcurrency, queryTasks.length)), + runQueryTask + ); } + logQueryProgress(true); + const queryWallMs = Date.now() - queryProgress.startMs; + const queryWallMsPerSearch = totalSearches ? queryWallMs / totalSearches : 0; + const queryWallMsPerQuery = selectedQueries.length ? queryWallMs / selectedQueries.length : 0; + + const latencyStats = Object.fromEntries(backends.map((b) => [b, buildStats(latency[b], { scale: 1000 })])); + const memoryStats = Object.fromEntries(backends.map((b) => [b, buildStats(memoryRss[b], { scale: 1 })])); + const hitRate = Object.fromEntries(backends.map((b) => [ + b, + selectedQueries.length ? hitCounts[b] / selectedQueries.length : 0 + ])); + const resultCountAvg = Object.fromEntries(backends.map((b) => [b, mean(resultCounts[b])])); + + const summary = { + queries: selectedQueries.length, + topN, + annEnabled, + embeddingProvider, + backends, + queryConcurrency: requestedConcurrency, + queryWallMs, + queryWallMsPerSearch, + queryWallMsPerQuery, + latencyMsAvg: Object.fromEntries(backends.map((b) => [b, latencyStats[b].mean])), + latencyMs: latencyStats, + hitRate, + resultCountAvg, + memoryRss: memoryStats, + buildMs: Object.keys(buildMs).length ? buildMs : null + }; + + return { summary }; +}; + +const runs = []; +for (const concurrency of queryConcurrencyList) { + runs.push(await runQueries(concurrency)); } -const reportResult = spawnSync(process.execPath, [reportPath, '--json'], { encoding: 'utf8' }); +const reportArgs = [reportPath, '--json']; +if (repoArg) reportArgs.push('--repo', repoArg); +const reportResult = spawnSync(process.execPath, reportArgs, { encoding: 'utf8' }); const artifactReport = reportResult.status === 0 ? 
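+  // Parse the artifact report on success; degrade to an empty report on failure instead of aborting the bench.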
JSON.parse(reportResult.stdout || '{}') : {};
+const corruption = artifactReport?.corruption || null;
+if (corruption && corruption.ok === false) {
+  const issues = Array.isArray(corruption.issues) && corruption.issues.length
+    ? corruption.issues.join('; ')
+    : 'unknown issues';
+  console.error(`[bench] Artifact corruption check failed: ${issues}`);
+  process.exit(1);
+}
 
-const latencyStats = Object.fromEntries(backends.map((b) => [b, buildStats(latency[b])]));
-const memoryStats = Object.fromEntries(backends.map((b) => [b, buildStats(memoryRss[b])]));
-const hitRate = Object.fromEntries(backends.map((b) => [
-  b,
-  selectedQueries.length ? hitCounts[b] / selectedQueries.length : 0
-]));
-const resultCountAvg = Object.fromEntries(backends.map((b) => [b, mean(resultCounts[b])]));
-const summary = {
-  queries: selectedQueries.length,
-  topN,
-  annEnabled,
-  backends,
-  latencyMsAvg: Object.fromEntries(backends.map((b) => [b, latencyStats[b].mean])),
-  latencyMs: latencyStats,
-  hitRate,
-  resultCountAvg,
-  memoryRss: memoryStats,
-  buildMs: Object.keys(buildMs).length ? buildMs : null
-};
+const summaries = runs.map((run) => run.summary).filter(Boolean);
+const concurrencyStats = {};
+for (const runSummary of summaries) {
+  const concurrency = runSummary?.queryConcurrency;
+  if (Number.isFinite(concurrency)) {
+    concurrencyStats[String(concurrency)] = {
+      latencyMsAvg: runSummary.latencyMsAvg,
+      latencyMs: runSummary.latencyMs,
+      hitRate: runSummary.hitRate,
+      resultCountAvg: runSummary.resultCountAvg,
+      memoryRss: runSummary.memoryRss
+    };
+  }
+}
+const summary = summaries[0]
+  ? {
+    ...summaries[0],
+    ...(Object.keys(concurrencyStats).length ? { concurrencyStats } : {})
+  }
+  : null;
 
 const output = {
   generatedAt: new Date().toISOString(),
+  repo: { root: repoArg || root },
   summary,
+  runs: summaries,
   artifacts: artifactReport
 };
 
 if (argv.json) {
   console.log(JSON.stringify(output, null, 2));
 } else {
-  console.log('Benchmark summary');
-  console.log(`- Queries: ${summary.queries}`);
-  console.log(`- TopN: ${summary.topN}`);
-  console.log(`- Ann: ${summary.annEnabled}`);
-  for (const backend of backends) {
-    const stats = latencyStats[backend];
-    console.log(`- ${backend} avg ms: ${stats.mean.toFixed(1)} (p95 ${stats.p95.toFixed(1)})`);
-    console.log(`- ${backend} hit rate: ${(hitRate[backend] * 100).toFixed(1)}% (avg hits ${resultCountAvg[backend].toFixed(1)})`);
-    const mem = memoryStats[backend];
-    if (mem && mem.mean) {
-      console.log(`- ${backend} rss avg mb: ${(mem.mean / (1024 * 1024)).toFixed(1)} (p95 ${(mem.p95 / (1024 * 1024)).toFixed(1)})`);
+  for (const runSummary of summaries) {
+    if (!runSummary) continue;
+    const concurrencyLabel = Number.isFinite(runSummary.queryConcurrency)
+      ?
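+      // Label the summary heading with the run's query concurrency when it is numeric.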
` (concurrency ${runSummary.queryConcurrency})` + : ''; + console.log(`Benchmark summary${concurrencyLabel}`); + console.log(`- Queries: ${runSummary.queries}`); + console.log(`- TopN: ${runSummary.topN}`); + console.log(`- Ann: ${runSummary.annEnabled}`); + if (Number.isFinite(runSummary.queryWallMs)) { + console.log( + `- Query wall time: ${formatDuration(runSummary.queryWallMs)} ` + + `(avg/search ${formatDurationMs(runSummary.queryWallMsPerSearch)}, ` + + `avg/query ${formatDurationMs(runSummary.queryWallMsPerQuery)})` + ); + } + for (const backend of runSummary.backends || backends) { + const stats = runSummary.latencyMs?.[backend]; + if (stats) { + console.log(`- ${backend} avg ms: ${stats.mean.toFixed(1)} (p95 ${stats.p95.toFixed(1)}, p99 ${stats.p99.toFixed(1)})`); + } + const hitRate = runSummary.hitRate?.[backend]; + const resultCount = runSummary.resultCountAvg?.[backend]; + if (Number.isFinite(hitRate) && Number.isFinite(resultCount)) { + console.log(`- ${backend} hit rate: ${(hitRate * 100).toFixed(1)}% (avg hits ${resultCount.toFixed(1)})`); + } + const mem = runSummary.memoryRss?.[backend]; + if (mem && mem.mean) { + console.log(`- ${backend} rss avg mb: ${(mem.mean / (1024 * 1024)).toFixed(1)} (p95 ${(mem.p95 / (1024 * 1024)).toFixed(1)}, p99 ${(mem.p99 / (1024 * 1024)).toFixed(1)})`); + } + } + if (runSummary.buildMs?.index) { + console.log(`- build index ms: ${runSummary.buildMs.index.toFixed(0)}`); + } + if (runSummary.buildMs?.sqlite) { + console.log(`- build sqlite ms: ${runSummary.buildMs.sqlite.toFixed(0)}`); + } + const throughput = artifactReport?.throughput || null; + if (throughput?.code) { + const codeThroughput = throughput.code; + console.log( + `- throughput code: ${formatRate(codeThroughput.chunksPerSec, 'chunks')}, ` + + `${formatRate(codeThroughput.tokensPerSec, 'tokens')}, ` + + `${formatRate(codeThroughput.bytesPerSec, 'bytes')}` + ); + } + if (throughput?.prose) { + const proseThroughput = throughput.prose; + console.log( + `- throughput prose: ${formatRate(proseThroughput.chunksPerSec, 'chunks')}, ` + + `${formatRate(proseThroughput.tokensPerSec, 'tokens')}, ` + + `${formatRate(proseThroughput.bytesPerSec, 'bytes')}` + ); + } + if (throughput?.lmdb?.code) { + const lmdbCode = throughput.lmdb.code; + console.log( + `- throughput lmdb code: ${formatRate(lmdbCode.chunksPerSec, 'chunks')}, ` + + `${formatRate(lmdbCode.tokensPerSec, 'tokens')}, ` + + `${formatRate(lmdbCode.bytesPerSec, 'bytes')}` + ); + } + if (throughput?.lmdb?.prose) { + const lmdbProse = throughput.lmdb.prose; + console.log( + `- throughput lmdb prose: ${formatRate(lmdbProse.chunksPerSec, 'chunks')}, ` + + `${formatRate(lmdbProse.tokensPerSec, 'tokens')}, ` + + `${formatRate(lmdbProse.bytesPerSec, 'bytes')}` + ); } - } - if (buildMs.index) { - console.log(`- build index ms: ${buildMs.index.toFixed(0)}`); - } - if (buildMs.sqlite) { - console.log(`- build sqlite ms: ${buildMs.sqlite.toFixed(0)}`); } } diff --git a/tests/build-embeddings-cache.js b/tests/build-embeddings-cache.js new file mode 100644 index 000000000..a689445ec --- /dev/null +++ b/tests/build-embeddings-cache.js @@ -0,0 +1,63 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getRepoCacheRoot } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'build-embeddings-cache'); +const repoRoot = path.join(tempRoot, 'repo'); +const 
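+// A dedicated cache root keeps this test isolated from the real repo cache.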
cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+await fsPromises.writeFile(
+  path.join(repoRoot, 'src', 'alpha.js'),
+  'export const alpha = () => 1;\n'
+);
+await fsPromises.writeFile(
+  path.join(repoRoot, '.pairofcleats.json'),
+  JSON.stringify({ indexing: { treeSitter: { enabled: false } } }, null, 2)
+);
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const runNode = (label, args) => {
+  const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, stdio: 'inherit' });
+  if (result.status !== 0) {
+    console.error(`Failed: ${label}`);
+    process.exit(result.status ?? 1);
+  }
+};
+
+runNode('build_index', [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot]);
+runNode('build_embeddings', [path.join(root, 'tools', 'build-embeddings.js'), '--stub-embeddings', '--mode', 'code', '--repo', repoRoot]);
+
+const repoCacheRoot = getRepoCacheRoot(repoRoot, { cache: { root: cacheRoot } });
+const cacheDir = path.join(repoCacheRoot, 'embeddings', 'code', 'files');
+const cacheFiles = fs.existsSync(cacheDir)
+  ? fs.readdirSync(cacheDir).filter((name) => name.endsWith('.json'))
+  : [];
+if (!cacheFiles.length) {
+  console.error('Expected embedding cache files to be created');
+  process.exit(1);
+}
+const cachePath = path.join(cacheDir, cacheFiles[0]);
+const before = await fsPromises.stat(cachePath);
+
+runNode('build_embeddings cached', [path.join(root, 'tools', 'build-embeddings.js'), '--stub-embeddings', '--mode', 'code', '--repo', repoRoot]);
+
+const after = await fsPromises.stat(cachePath);
+if (after.mtimeMs !== before.mtimeMs) {
+  console.error('Expected embedding cache file to be reused without rewrite');
+  process.exit(1);
+}
+
+console.log('embedding cache reuse test passed');
diff --git a/tests/build-index-all.js b/tests/build-index-all.js
new file mode 100644
index 000000000..4b49d1fb8
--- /dev/null
+++ b/tests/build-index-all.js
@@ -0,0 +1,57 @@
+#!/usr/bin/env node
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'build-index-all');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(repoRoot, { recursive: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+await fsPromises.writeFile(path.join(repoRoot, 'alpha.js'), 'const alpha = 1;\n');
+await fsPromises.writeFile(path.join(repoRoot, 'beta.md'), '# Beta\n');
+await fsPromises.writeFile(
+  path.join(repoRoot, '.pairofcleats.json'),
+  JSON.stringify({ indexing: { treeSitter: { enabled: false } } }, null, 2)
+);
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const result = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--mode', 'all', '--stub-embeddings', '--repo', repoRoot],
+  { env, stdio: 'inherit' }
+);
+if (result.status !== 0) {
+  console.error('Failed: build_index --mode all');
+  process.exit(result.status ??
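+  // status is null when the child exits via a signal, so fall back to exit code 1.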
1); +} + +const userConfig = loadUserConfig(repoRoot); +const modes = ['code', 'prose', 'extracted-prose']; +const hasChunkMeta = (dir) => ( + fs.existsSync(path.join(dir, 'chunk_meta.json')) + || fs.existsSync(path.join(dir, 'chunk_meta.jsonl')) + || fs.existsSync(path.join(dir, 'chunk_meta.meta.json')) + || fs.existsSync(path.join(dir, 'chunk_meta.parts')) +); + +for (const mode of modes) { + const dir = getIndexDir(repoRoot, mode, userConfig); + if (!hasChunkMeta(dir)) { + console.error(`Expected chunk metadata for ${mode} in ${dir}`); + process.exit(1); + } +} + +console.log('build-index --mode all test passed'); diff --git a/tests/build-runtime/content-hash.test.js b/tests/build-runtime/content-hash.test.js new file mode 100644 index 000000000..db96a3ae4 --- /dev/null +++ b/tests/build-runtime/content-hash.test.js @@ -0,0 +1,67 @@ +#!/usr/bin/env node +import { buildContentConfigHash, normalizeContentConfig } from '../../src/index/build/runtime/hash.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const config = { + indexing: { + concurrency: 12, + importConcurrency: 4, + workerPool: { enabled: true }, + debugCrash: true, + shards: { enabled: true }, + fileListSampleSize: 123, + maxFileBytes: 2048 + } +}; + +const normalized = normalizeContentConfig(config); +if (!normalized.indexing || normalized.indexing.maxFileBytes !== 2048) { + fail('normalizeContentConfig should preserve relevant indexing fields.'); +} +for (const key of ['concurrency', 'importConcurrency', 'workerPool', 'debugCrash', 'shards', 'fileListSampleSize']) { + if (normalized.indexing[key] !== undefined) { + fail(`normalizeContentConfig should remove indexing.${key}.`); + } +} + +const envA = { cacheRoot: '/tmp/a', stage: 'stage1' }; +const envB = { cacheRoot: '/tmp/b', stage: 'stage1' }; +const hashA = buildContentConfigHash(config, envA); +const hashB = buildContentConfigHash(config, envB); +if (hashA !== hashB) { + fail('buildContentConfigHash should ignore cacheRoot differences.'); +} + +const configVariant = { + indexing: { + concurrency: 1, + importConcurrency: 2, + maxFileBytes: 2048 + } +}; +const hashC = buildContentConfigHash(configVariant, envA); +if (hashA !== hashC) { + fail('buildContentConfigHash should ignore concurrency-only changes.'); +} + +const envC = { cacheRoot: '/tmp/a', stage: 'stage2' }; +const hashD = buildContentConfigHash(config, envC); +if (hashA === hashD) { + fail('buildContentConfigHash should change when env fields change.'); +} + +const configDiff = { + indexing: { + maxFileBytes: 4096 + } +}; +const hashE = buildContentConfigHash(configDiff, envA); +if (hashA === hashE) { + fail('buildContentConfigHash should change when config fields change.'); +} + +console.log('build runtime content hash tests passed'); diff --git a/tests/build-runtime/stage-overrides.test.js b/tests/build-runtime/stage-overrides.test.js new file mode 100644 index 000000000..088a85868 --- /dev/null +++ b/tests/build-runtime/stage-overrides.test.js @@ -0,0 +1,53 @@ +#!/usr/bin/env node +import { buildStageOverrides, normalizeStage } from '../../src/index/build/runtime/stage.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +if (normalizeStage('stage1') !== 'stage1') { + fail('normalizeStage should match stage1.'); +} +if (normalizeStage('embed') !== 'stage3') { + fail('normalizeStage should map embed to stage3.'); +} +if (normalizeStage('ann') !== 'stage4') { + fail('normalizeStage should map ann to stage4.'); +} +if (normalizeStage('') !== 
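+// An empty stage name has no mapping, so normalizeStage must return null.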
null) { + fail('normalizeStage should return null for empty input.'); +} + +const stage1Overrides = buildStageOverrides({ stage1: { lint: true } }, 'stage1'); +if (!stage1Overrides || stage1Overrides.lint !== true) { + fail('stage1 overrides should merge explicit values.'); +} +if (stage1Overrides.embeddings?.enabled !== false) { + fail('stage1 overrides should disable embeddings.'); +} +if (stage1Overrides.treeSitter?.enabled !== false) { + fail('stage1 overrides should disable tree-sitter.'); +} +if (stage1Overrides.typeInference !== false) { + fail('stage1 overrides should disable type inference.'); +} + +const stage2Overrides = buildStageOverrides({ stage2: { lint: false, embeddings: { enabled: true } } }, 'stage2'); +if (!stage2Overrides || stage2Overrides.embeddings?.enabled !== true) { + fail('stage2 overrides should preserve explicit embeddings config.'); +} + +const stage3Overrides = buildStageOverrides({ stage3: { lint: true } }, 'stage3'); +if (!stage3Overrides || stage3Overrides.lint !== true) { + fail('stage3 overrides should merge explicit values.'); +} +if (stage3Overrides.treeSitter?.enabled !== false) { + fail('stage3 overrides should disable tree-sitter.'); +} + +if (buildStageOverrides({}, 'unknown') !== null) { + fail('buildStageOverrides should return null for unknown stages.'); +} + +console.log('build runtime stage overrides tests passed'); diff --git a/tests/cache-lru.js b/tests/cache-lru.js new file mode 100644 index 000000000..be8ba225f --- /dev/null +++ b/tests/cache-lru.js @@ -0,0 +1,31 @@ +import assert from 'node:assert/strict'; +import { createLruCache, estimateStringBytes } from '../src/shared/cache.js'; + +const sizeCache = createLruCache({ + name: 'size-test', + maxMb: 0.0001, + ttlMs: 0, + sizeCalculation: estimateStringBytes +}); + +sizeCache.set('a', 'a'.repeat(80)); +sizeCache.set('b', 'b'.repeat(80)); + +const hasA = sizeCache.get('a') !== null; +const hasB = sizeCache.get('b') !== null; +assert.ok(!(hasA && hasB), 'expected size-based eviction'); +assert.ok(sizeCache.stats.evictions >= 1, 'expected at least one eviction'); + +const ttlCache = createLruCache({ + name: 'ttl-test', + maxMb: 1, + ttlMs: 10, + sizeCalculation: estimateStringBytes +}); + +ttlCache.set('x', 'value'); +await new Promise((resolve) => setTimeout(resolve, 25)); +const expired = ttlCache.get('x'); +assert.equal(expired, null, 'expected ttl-based expiration'); + +console.log('cache lru test passed'); diff --git a/tests/capabilities-report.js b/tests/capabilities-report.js new file mode 100644 index 000000000..9feccbc6f --- /dev/null +++ b/tests/capabilities-report.js @@ -0,0 +1,23 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { getCapabilities } from '../src/shared/capabilities.js'; + +const caps = getCapabilities({ refresh: true }); + +assert.ok(caps && typeof caps === 'object', 'capabilities should be an object'); +assert.equal(typeof caps.watcher?.chokidar, 'boolean', 'watcher.chokidar should be boolean'); +assert.equal(typeof caps.watcher?.parcel, 'boolean', 'watcher.parcel should be boolean'); +assert.equal(typeof caps.regex?.re2, 'boolean', 'regex.re2 should be boolean'); +assert.equal(typeof caps.regex?.re2js, 'boolean', 'regex.re2js should be boolean'); +assert.equal(typeof caps.hash?.nodeRsXxhash, 'boolean', 'hash.nodeRsXxhash should be boolean'); +assert.equal(typeof caps.hash?.wasmXxhash, 'boolean', 'hash.wasmXxhash should be boolean'); +assert.equal(typeof caps.compression?.gzip, 'boolean', 'compression.gzip should be boolean'); 
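+// Each probe should report a boolean whether or not the optional dependency is installed.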
+assert.equal(typeof caps.compression?.zstd, 'boolean', 'compression.zstd should be boolean'); +assert.equal(typeof caps.extractors?.pdf, 'boolean', 'extractors.pdf should be boolean'); +assert.equal(typeof caps.extractors?.docx, 'boolean', 'extractors.docx should be boolean'); +assert.equal(typeof caps.mcp?.legacy, 'boolean', 'mcp.legacy should be boolean'); +assert.equal(typeof caps.mcp?.sdk, 'boolean', 'mcp.sdk should be boolean'); +assert.equal(typeof caps.externalBackends?.tantivy, 'boolean', 'externalBackends.tantivy should be boolean'); +assert.equal(typeof caps.externalBackends?.lancedb, 'boolean', 'externalBackends.lancedb should be boolean'); + +console.log('capabilities report tests passed'); diff --git a/tests/chargram-guardrails.js b/tests/chargram-guardrails.js new file mode 100644 index 000000000..83aaaaa85 --- /dev/null +++ b/tests/chargram-guardrails.js @@ -0,0 +1,47 @@ +#!/usr/bin/env node +import { createTokenizationContext, tokenizeChunkText } from '../src/index/build/tokenization.js'; +import { tri } from '../src/shared/tokenize.js'; + +const context = createTokenizationContext({ + dictWords: new Set(), + dictConfig: { segmentation: 'greedy' }, + postingsConfig: { + enableChargrams: true, + chargramMinN: 3, + chargramMaxN: 3, + chargramMaxTokenLength: 5, + chargramSource: 'full' + } +}); + +const payload = tokenizeChunkText({ + text: 'short veryverylongtoken', + mode: 'code', + ext: '.js', + context +}); + +const longGram = tri('veryverylongtoken', 3)[0]; +if (payload.chargrams.includes(longGram)) { + console.error('chargram guardrail test failed: long token chargrams should be skipped.'); + process.exit(1); +} + +const fieldPayload = tokenizeChunkText({ + text: 'short', + mode: 'code', + ext: '.js', + context, + chargramTokens: ['field'] +}); +const fieldGram = tri('field', 3)[0]; +if (!fieldPayload.chargrams.includes(fieldGram)) { + console.error('chargram guardrail test failed: field chargrams missing.'); + process.exit(1); +} +if (fieldPayload.chargrams.includes(tri('short', 3)[0])) { + console.error('chargram guardrail test failed: expected chargrams to use field tokens only.'); + process.exit(1); +} + +console.log('chargram guardrail test passed'); diff --git a/tests/chunk-meta-jsonl-cleanup.js b/tests/chunk-meta-jsonl-cleanup.js new file mode 100644 index 000000000..594440853 --- /dev/null +++ b/tests/chunk-meta-jsonl-cleanup.js @@ -0,0 +1,101 @@ +#!/usr/bin/env node + +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; + +import { + createChunkMetaIterator, + enqueueChunkMetaArtifacts +} from '../src/index/build/artifacts/writers/chunk-meta.js'; + +const root = process.cwd(); +const cacheRoot = path.join(root, 'tests', '.cache', 'chunk-meta-jsonl-cleanup'); +const outDir = path.join(cacheRoot, 'index'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(outDir, { recursive: true }); + +const chunks = [ + { id: 0, file: 'alpha.js', start: 0, end: 10, startLine: 1, endLine: 1, kind: 'code' }, + { id: 1, file: 'beta.js', start: 0, end: 12, startLine: 1, endLine: 1, kind: 'code' }, + { id: 2, file: 'gamma.js', start: 0, end: 14, startLine: 1, endLine: 1, kind: 'code' } +]; + +const chunkMetaIterator = createChunkMetaIterator({ + chunks, + fileIdByPath: new Map(), + resolvedTokenMode: 'none', + tokenSampleSize: 0 +}); + +const runWriter = async (chunkMetaPlan) => { + const writes = []; + const enqueueWrite = (label, job) => { + writes.push({ label, job }); + }; + const 
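+  // Chunk meta must stream through enqueueWrite; a JSON-array fallback would be a regression, so this stub throws.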
enqueueJsonArray = (label, _payload, _options) => { + throw new Error(`Unexpected enqueueJsonArray for chunk meta (${label})`); + }; + const addPieceFile = () => {}; + const formatArtifactLabel = (value) => value; + + const state = { chunks }; + await enqueueChunkMetaArtifacts({ + state, + outDir, + chunkMetaIterator, + chunkMetaPlan, + enqueueJsonArray, + enqueueWrite, + addPieceFile, + formatArtifactLabel + }); + + for (const { label, job } of writes) { + try { + // eslint-disable-next-line no-await-in-loop + await job(); + } catch (err) { + throw new Error(`Failed write job (${label}): ${err?.message || err}`); + } + } +}; + +const metaPath = path.join(outDir, 'chunk_meta.meta.json'); +const partsDir = path.join(outDir, 'chunk_meta.parts'); +const jsonlPath = path.join(outDir, 'chunk_meta.jsonl'); + +await runWriter({ + chunkMetaUseJsonl: true, + chunkMetaUseShards: true, + chunkMetaShardSize: 1, + chunkMetaCount: chunks.length +}); + +if (!fs.existsSync(metaPath) || !fs.existsSync(partsDir)) { + console.error('Expected sharded chunk_meta artifacts (meta + parts).'); + process.exit(1); +} +if (fs.existsSync(jsonlPath)) { + console.error('Did not expect chunk_meta.jsonl when writing sharded chunk_meta.'); + process.exit(1); +} + +await runWriter({ + chunkMetaUseJsonl: true, + chunkMetaUseShards: false, + chunkMetaShardSize: 0, + chunkMetaCount: chunks.length +}); + +if (!fs.existsSync(jsonlPath)) { + console.error('Expected chunk_meta.jsonl when writing unsharded JSONL chunk_meta.'); + process.exit(1); +} +if (fs.existsSync(metaPath) || fs.existsSync(partsDir)) { + console.error('Expected stale sharded chunk_meta artifacts to be removed when writing unsharded JSONL.'); + process.exit(1); +} + +console.log('chunk_meta JSONL cleanup test passed'); diff --git a/tests/chunking-limits.js b/tests/chunking-limits.js new file mode 100644 index 000000000..ebbb79ef2 --- /dev/null +++ b/tests/chunking-limits.js @@ -0,0 +1,80 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { smartChunk } from '../src/index/chunking.js'; +import { buildMetaV2 } from '../src/index/metadata-v2.js'; + +const lineText = [ + 'alpha', + 'bravo', + 'charlie', + 'delta', + 'echo', + 'foxtrot', + 'golf' +].join('\n') + '\n'; +const lineContext = { chunking: { maxLines: 3 } }; + +const first = smartChunk({ + text: lineText, + ext: '.js', + relPath: 'src/sample.js', + mode: 'code', + context: lineContext +}); +const second = smartChunk({ + text: lineText, + ext: '.js', + relPath: 'src/sample.js', + mode: 'code', + context: lineContext +}); + +assert.ok(first.length > 1, 'expected line splitting'); + +const countLines = (value) => { + if (!value) return 0; + const trimmed = value.endsWith('\n') ? value.slice(0, -1) : value; + return trimmed ? 
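+  // An empty string counts as zero lines; otherwise count the newline-separated segments.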
trimmed.split('\n').length : 0; +}; + +for (const chunk of first) { + const slice = lineText.slice(chunk.start, chunk.end); + const lineCount = countLines(slice); + assert.ok(lineCount <= 3, `chunk line count ${lineCount} exceeds maxLines`); +} + +const segment = { + segmentId: 'seg-1', + type: 'code', + languageId: 'javascript', + parentSegmentId: null +}; +const toChunkId = (chunk) => buildMetaV2({ + chunk: { + ...chunk, + file: 'src/sample.js', + ext: '.js', + segment + }, + docmeta: {} +}).chunkId; + +assert.deepEqual(first.map(toChunkId), second.map(toChunkId), 'expected stable chunk IDs'); + +const byteText = 'abcdefghijABCDEFGHIJ'; +const byteChunks = smartChunk({ + text: byteText, + ext: '.txt', + relPath: 'notes.txt', + mode: 'code', + context: { chunking: { maxBytes: 7 } } +}); + +assert.ok(byteChunks.length > 1, 'expected byte splitting'); +for (const chunk of byteChunks) { + const slice = byteText.slice(chunk.start, chunk.end); + const bytes = Buffer.byteLength(slice, 'utf8'); + assert.ok(bytes <= 7, `chunk byte count ${bytes} exceeds maxBytes`); +} + +console.log('chunking limits test passed'); diff --git a/tests/chunking-sql-lua.js b/tests/chunking-sql-lua.js new file mode 100644 index 000000000..63d164acd --- /dev/null +++ b/tests/chunking-sql-lua.js @@ -0,0 +1,30 @@ +#!/usr/bin/env node +import { buildLuaChunks } from '../src/lang/lua.js'; +import { buildSqlChunks } from '../src/lang/sql.js'; + +const luaText = "local function foo(a)\n return a\nend -- done\n"; +const luaChunks = buildLuaChunks(luaText) || []; +if (!luaChunks.some((chunk) => chunk.name === 'foo')) { + console.error('Expected Lua chunk for foo when end has a trailing comment.'); + process.exit(1); +} + +const mysqlSql = "DELIMITER $$\nCREATE FUNCTION add_one(x INT)\nRETURNS INT\nBEGIN\nSELECT x + 1;\nEND $$\nDELIMITER ;\nSELECT 1;"; +const mysqlChunks = buildSqlChunks(mysqlSql, { dialect: 'mysql' }) || []; +if (mysqlChunks.length !== 2) { + console.error(`Expected 2 MySQL statements, got ${mysqlChunks.length}.`); + process.exit(1); +} +if (mysqlChunks[0].kind !== 'FunctionDeclaration') { + console.error('Expected first MySQL chunk to be a FunctionDeclaration.'); + process.exit(1); +} + +const pgSql = "CREATE FUNCTION test_fn() RETURNS text AS $$\nSELECT ';';\n$$ LANGUAGE sql;\nSELECT 2;"; +const pgChunks = buildSqlChunks(pgSql, { dialect: 'postgres' }) || []; +if (pgChunks.length !== 2) { + console.error(`Expected 2 Postgres statements, got ${pgChunks.length}.`); + process.exit(1); +} + +console.log('sql/lua chunking test passed'); diff --git a/tests/chunking-yaml.js b/tests/chunking-yaml.js new file mode 100644 index 000000000..068ac7ce9 --- /dev/null +++ b/tests/chunking-yaml.js @@ -0,0 +1,53 @@ +#!/usr/bin/env node +import { smartChunk } from '../src/index/chunking.js'; + +const text = "alpha: 1\nbeta: 2\n"; +const defaultChunks = smartChunk({ + text, + ext: '.yaml', + relPath: 'config.yaml', + mode: 'code' +}); +if (defaultChunks.length !== 1 || defaultChunks[0].name !== 'root') { + console.error('Expected default YAML chunking to return a root chunk.'); + process.exit(1); +} + +const top = smartChunk({ + text, + ext: '.yaml', + relPath: 'config.yaml', + mode: 'code', + context: { yamlChunking: { mode: 'top-level' } } +}); +const topNames = top.map((chunk) => chunk.name); +if (top.length !== 2 || !topNames.includes('alpha') || !topNames.includes('beta')) { + console.error(`Unexpected top-level YAML chunks: ${topNames.join(',')}`); + process.exit(1); +} + +const rootOnly = smartChunk({ + text, + ext: 
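+  // Same document as above; root mode must collapse it into a single root chunk.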
'.yaml', + relPath: 'config.yaml', + mode: 'code', + context: { yamlChunking: { mode: 'root' } } +}); +if (rootOnly.length !== 1 || rootOnly[0].name !== 'root') { + console.error('Expected root-only YAML chunking.'); + process.exit(1); +} + +const autoLarge = smartChunk({ + text, + ext: '.yaml', + relPath: 'config.yaml', + mode: 'code', + context: { yamlChunking: { mode: 'auto', maxBytes: 4 } } +}); +if (autoLarge.length !== 1 || autoLarge[0].name !== 'root') { + console.error('Expected auto YAML chunking to fall back to root.'); + process.exit(1); +} + +console.log('yaml chunking test passed'); diff --git a/tests/chunking/json.test.js b/tests/chunking/json.test.js new file mode 100644 index 000000000..6ad5c5da8 --- /dev/null +++ b/tests/chunking/json.test.js @@ -0,0 +1,30 @@ +#!/usr/bin/env node +import { chunkJson } from '../../src/index/chunking.js'; + +const expect = (condition, message) => { + if (!condition) { + console.error(message); + process.exit(1); + } +}; + +const jsonText = JSON.stringify({ + name: 'alpha', + config: { enabled: true }, + text: 'escaped \"quote\"' +}); + +const chunks = chunkJson(jsonText, {}) || []; +const names = new Set(chunks.map((chunk) => chunk.name)); +expect(names.has('name'), 'Missing chunk for name key.'); +expect(names.has('config'), 'Missing chunk for config key.'); +expect(names.has('text'), 'Missing chunk for text key.'); + +const arrayChunk = chunkJson('["a","b"]', {}) || []; +expect(arrayChunk.length === 1, 'Expected array JSON to return a single chunk.'); +expect(arrayChunk[0].name === 'root', 'Expected root chunk for array JSON.'); + +const invalid = chunkJson('{', {}); +expect(invalid === null, 'Expected invalid JSON to return null.'); + +console.log('Chunking JSON test passed.'); diff --git a/tests/chunking/limits.test.js b/tests/chunking/limits.test.js new file mode 100644 index 000000000..cac52fe95 --- /dev/null +++ b/tests/chunking/limits.test.js @@ -0,0 +1,40 @@ +#!/usr/bin/env node +import { applyChunkingLimits } from '../../src/index/chunking/limits.js'; + +const expect = (condition, message) => { + if (!condition) { + console.error(message); + process.exit(1); + } +}; + +const lineText = [ + 'alpha', + 'bravo', + 'charlie', + 'delta' +].join('\n'); + +const baseChunk = { start: 0, end: lineText.length, name: 'root', kind: 'Section', meta: {} }; +const lineChunks = applyChunkingLimits([baseChunk], lineText, { chunking: { maxLines: 2 } }); + +expect(lineChunks.length === 2, `Expected 2 chunks for maxLines, got ${lineChunks.length}`); +lineChunks.forEach((chunk) => { + expect(chunk.meta?.startLine >= 1, 'Expected startLine in chunk meta.'); + expect(chunk.meta?.endLine >= chunk.meta?.startLine, 'Expected endLine >= startLine.'); +}); + +const byteText = 'abcdefghij'; +const byteChunks = applyChunkingLimits( + [{ start: 0, end: byteText.length, name: 'root', kind: 'Section', meta: {} }], + byteText, + { chunking: { maxBytes: 4 } } +); + +expect(byteChunks.length >= 3, `Expected multiple chunks for maxBytes, got ${byteChunks.length}`); +byteChunks.forEach((chunk) => { + const slice = byteText.slice(chunk.start, chunk.end); + expect(Buffer.byteLength(slice, 'utf8') <= 4, 'Chunk exceeded maxBytes.'); +}); + +console.log('Chunking limits test passed.'); diff --git a/tests/chunking/yaml.test.js b/tests/chunking/yaml.test.js new file mode 100644 index 000000000..2ef1a5349 --- /dev/null +++ b/tests/chunking/yaml.test.js @@ -0,0 +1,42 @@ +#!/usr/bin/env node +import { chunkYaml } from '../../src/index/chunking.js'; + +const expect = (condition, 
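+  // Minimal assert helper: log the message and exit non-zero when the condition fails.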
message) => { + if (!condition) { + console.error(message); + process.exit(1); + } +}; + +const yamlText = [ + 'defaults: &defaults', + ' name: base', + 'service:', + ' <<: *defaults', + ' port: 80' +].join('\n'); + +const topLevel = chunkYaml(yamlText, 'config.yml', { + yamlChunking: { mode: 'top-level', maxBytes: 1024 } +}) || []; + +const names = new Set(topLevel.map((chunk) => chunk.name)); +expect(names.has('defaults'), 'Missing top-level chunk for defaults.'); +expect(names.has('service'), 'Missing top-level chunk for service.'); + +const rootOnly = chunkYaml(yamlText, 'config.yml', { yamlChunking: { mode: 'root' } }) || []; +expect(rootOnly.length === 1, `Expected root mode to return 1 chunk, got ${rootOnly.length}`); +expect(rootOnly[0].name === 'root', 'Expected root chunk name.'); + +const multiDoc = [ + '---', + 'first: 1', + '---', + 'second: 2' +].join('\n'); +const multiChunks = chunkYaml(multiDoc, 'config.yml', { yamlChunking: { mode: 'top-level' } }) || []; +const multiNames = new Set(multiChunks.map((chunk) => chunk.name)); +expect(multiNames.has('first'), 'Missing first doc chunk.'); +expect(multiNames.has('second'), 'Missing second doc chunk.'); + +console.log('Chunking YAML test passed.'); diff --git a/tests/churn-filter.js b/tests/churn-filter.js index 81d1e3312..878ea27f2 100644 --- a/tests/churn-filter.js +++ b/tests/churn-filter.js @@ -3,7 +3,7 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import { getGitMeta } from '../src/indexer/git.js'; +import { getGitMeta } from '../src/index/git.js'; const root = process.cwd(); const tempRoot = path.join(root, 'tests', '.cache', 'churn-filter'); @@ -64,10 +64,7 @@ const env = { }; const repoArgs = ['--repo', repoRoot]; -const originalCwd = process.cwd(); -process.chdir(repoRoot); -const gitMeta = await getGitMeta('notes.md', 0, 1, { blame: false }); -process.chdir(originalCwd); +const gitMeta = await getGitMeta('notes.md', 1, 2, { blame: false, baseDir: repoRoot }); const expectedChurn = 5; if (gitMeta.churn !== expectedChurn) { console.error(`Expected churn ${expectedChurn}, got ${gitMeta.churn}`); diff --git a/tests/clean-artifacts.js b/tests/clean-artifacts.js index 7e73e56f7..0d240cc59 100644 --- a/tests/clean-artifacts.js +++ b/tests/clean-artifacts.js @@ -53,7 +53,7 @@ await fsPromises.writeFile(path.join(extensionsDir, 'ext.bin'), 'ext'); const result = spawnSync( process.execPath, - [path.join(root, 'tools', 'clean-artifacts.js')], + [path.join(root, 'tools', 'clean-artifacts.js'), '--repo', repoRoot], { cwd: repoRoot, env, stdio: 'inherit' } ); @@ -74,7 +74,7 @@ await fsPromises.writeFile(path.join(repoCacheRoot, 'marker.txt'), 'marker'); const resultAll = spawnSync( process.execPath, - [path.join(root, 'tools', 'clean-artifacts.js'), '--all'], + [path.join(root, 'tools', 'clean-artifacts.js'), '--repo', repoRoot, '--all'], { cwd: repoRoot, env, stdio: 'inherit' } ); diff --git a/tests/clike-doc-comments.js b/tests/clike-doc-comments.js new file mode 100644 index 000000000..0e07ab7de --- /dev/null +++ b/tests/clike-doc-comments.js @@ -0,0 +1,43 @@ +#!/usr/bin/env node +import { buildCLikeChunks } from '../src/lang/clike.js'; + +const expect = (condition, message) => { + if (!condition) { + console.error(message); + process.exit(1); + } +}; + +const cText = [ + '/**', + ' * Greets the user.', + ' */', + 'int greet(int x) {', + ' return x;', + '}' +].join('\n'); + +const cChunks = buildCLikeChunks(cText, '.c', { 
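+  // Disable tree-sitter to exercise the heuristic C-like parser, with a no-op logger.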
treeSitter: { enabled: false }, log: () => {} }) || []; +const greetChunk = cChunks.find((chunk) => chunk.kind === 'FunctionDeclaration' && chunk.name === 'greet'); +expect(!!greetChunk, 'Expected to find a C-like function chunk for greet.'); +expect( + String(greetChunk.meta?.docstring || '').includes('Greets the user'), + `Expected greet docstring to include "Greets the user", got: ${JSON.stringify(greetChunk.meta?.docstring || '')}` +); + +const objcText = [ + '@interface Widget : NSObject', + '/// Greets from ObjC.', + '- (void)greet;', + '@end' +].join('\n'); + +const objcChunks = buildCLikeChunks(objcText, '.m', { treeSitter: { enabled: false }, log: () => {} }) || []; +const objcGreet = objcChunks.find((chunk) => chunk.kind === 'MethodDeclaration' && String(chunk.name || '').includes('greet')); +expect(!!objcGreet, 'Expected to find an ObjC method chunk for greet.'); +expect( + String(objcGreet.meta?.docstring || '').includes('Greets from ObjC'), + `Expected ObjC greet docstring to include "Greets from ObjC", got: ${JSON.stringify(objcGreet.meta?.docstring || '')}` +); + +console.log('C-like doc comment extraction test passed.'); diff --git a/tests/code-map-basic.js b/tests/code-map-basic.js new file mode 100644 index 000000000..f099d74b5 --- /dev/null +++ b/tests/code-map-basic.js @@ -0,0 +1,112 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'code-map-basic'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const config = { + indexing: { + astDataflow: true, + controlFlow: true, + typeInference: true, + typeInferenceCrossFile: true + } +}; + +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify(config, null, 2) +); + +await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'util.js'), + 'export function add(a, b) { return a + b; }\n' + + 'export function mutate(obj) { obj.count = obj.count + 1; return obj; }\n' +); +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'main.js'), + 'import { add, mutate } from "./util.js";\n' + + 'function run(x) {\n' + + ' if (x > 0) { return add(x, 1); }\n' + + ' return add(x, 2);\n' + + '}\n' + + 'async function go(items) {\n' + + ' for (const item of items) {\n' + + ' await Promise.resolve(item);\n' + + ' mutate(item);\n' + + ' }\n' + + '}\n' + + 'export default function main(items) { return go(items); }\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); + +if (buildResult.status !== 0) { + console.error('Failed: build index for code map basic test'); + process.exit(buildResult.status ?? 
1); +} + +const mapResult = spawnSync( + process.execPath, + [path.join(root, 'tools', 'report-code-map.js'), '--format', 'json', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); + +if (mapResult.status !== 0) { + console.error('Failed: map generator'); + if (mapResult.stderr) console.error(mapResult.stderr.trim()); + process.exit(mapResult.status ?? 1); +} + +let payload = null; +try { + payload = JSON.parse(mapResult.stdout || '{}'); +} catch { + console.error('Failed: map output invalid JSON'); + process.exit(1); +} + +if (!Array.isArray(payload.nodes) || payload.nodes.length === 0) { + console.error('Failed: map nodes missing'); + process.exit(1); +} + +const members = payload.nodes.flatMap((node) => node.members || []); +if (!members.length) { + console.error('Failed: map members missing'); + process.exit(1); +} + +const hasControlFlow = members.some((member) => member.controlFlow); +const hasDataflow = members.some((member) => member.dataflow); +if (!hasControlFlow || !hasDataflow) { + console.error('Failed: expected dataflow/controlFlow metadata'); + process.exit(1); +} + +const edgeTypes = new Set(payload.edges.map((edge) => edge.type)); +if (!edgeTypes.has('import') || !edgeTypes.has('call')) { + console.error('Failed: expected import + call edges'); + process.exit(1); +} + +console.log('code map basic tests passed'); diff --git a/tests/code-map-default-guardrails.js b/tests/code-map-default-guardrails.js new file mode 100644 index 000000000..859a54d83 --- /dev/null +++ b/tests/code-map-default-guardrails.js @@ -0,0 +1,84 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { DEFAULT_LIMITS } from '../src/map/constants.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'code-map-default-guardrails'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); + +const functionCount = DEFAULT_LIMITS.maxMembersPerFile + 15; +let source = ''; +for (let i = 0; i < functionCount; i += 1) { + source += `export function fn${i}(value) { return value + ${i}; }\n`; +} + +await fsPromises.writeFile(path.join(repoRoot, 'src', 'many.js'), source); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); + +if (buildResult.status !== 0) { + console.error('Failed: build index for code map default guardrails test'); + process.exit(buildResult.status ?? 1); +} + +const mapResult = spawnSync( + process.execPath, + [path.join(root, 'tools', 'report-code-map.js'), '--format', 'json', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); + +if (mapResult.status !== 0) { + console.error('Failed: report-code-map for default guardrails test'); + if (mapResult.stderr) console.error(mapResult.stderr.trim()); + process.exit(mapResult.status ?? 
1); +} + +let model = null; +try { + model = JSON.parse(mapResult.stdout || '{}'); +} catch { + console.error('Failed: map output invalid JSON (default guardrails test)'); + process.exit(1); +} + +const fileNode = (model.nodes || []).find((node) => node?.path === 'src/many.js'); +if (!fileNode) { + console.error('Failed: map missing src/many.js node (default guardrails test)'); + process.exit(1); +} + +const members = Array.isArray(fileNode.members) ? fileNode.members : []; +if (members.length > DEFAULT_LIMITS.maxMembersPerFile) { + console.error( + `Failed: expected members <= ${DEFAULT_LIMITS.maxMembersPerFile} but saw ${members.length}` + ); + process.exit(1); +} + +const droppedMembers = model?.summary?.dropped?.members ?? 0; +const truncated = model?.summary?.truncated === true; +if (!truncated || droppedMembers <= 0) { + console.error('Failed: expected map summary to indicate truncation (default guardrails test)'); + process.exit(1); +} + +console.log('code map default guardrails tests passed'); diff --git a/tests/code-map-determinism.js b/tests/code-map-determinism.js new file mode 100644 index 000000000..e05c42983 --- /dev/null +++ b/tests/code-map-determinism.js @@ -0,0 +1,76 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'code-map-determinism'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ indexing: { astDataflow: true, controlFlow: true } }, null, 2) +); + +await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'one.js'), + 'export function alpha() { return 1; }\n' +); +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'two.js'), + 'import { alpha } from "./one.js";\nexport function beta() { return alpha(); }\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); + +if (buildResult.status !== 0) { + console.error('Failed: build index for determinism test'); + process.exit(buildResult.status ?? 
1); +} + +const runMap = () => spawnSync( + process.execPath, + [path.join(root, 'tools', 'report-code-map.js'), '--format', 'json', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); + +const first = runMap(); +const second = runMap(); + +if (first.status !== 0 || second.status !== 0) { + console.error('Failed: map generator runs'); + process.exit(1); +} + +const strip = (payload) => { + const clone = JSON.parse(JSON.stringify(payload)); + clone.generatedAt = null; + if (clone.summary) clone.summary.generatedAt = null; + return clone; +}; + +const firstPayload = strip(JSON.parse(first.stdout || '{}')); +const secondPayload = strip(JSON.parse(second.stdout || '{}')); + +if (JSON.stringify(firstPayload) !== JSON.stringify(secondPayload)) { + console.error('Failed: map output not deterministic'); + process.exit(1); +} + +console.log('code map determinism tests passed'); diff --git a/tests/code-map-dot.js b/tests/code-map-dot.js new file mode 100644 index 000000000..8d3b5d63a --- /dev/null +++ b/tests/code-map-dot.js @@ -0,0 +1,73 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'code-map-dot'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ indexing: { astDataflow: true, controlFlow: true } }, null, 2) +); + +await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'a.js'), + 'import { add } from "./b.js";\n' + + 'export function run(x) { return add(x, 1); }\n' +); +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'b.js'), + 'export function add(a, b) { return a + b; }\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); + +if (buildResult.status !== 0) { + console.error('Failed: build index for code map dot test'); + process.exit(buildResult.status ?? 1); +} + +const mapResult = spawnSync( + process.execPath, + [path.join(root, 'tools', 'report-code-map.js'), '--format', 'dot', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); + +if (mapResult.status !== 0) { + console.error('Failed: map dot output'); + process.exit(mapResult.status ?? 
1);
+}
+
+const output = mapResult.stdout || '';
+if (!output.includes('PORT=')) {
+  console.error('Failed: dot output missing ports');
+  process.exit(1);
+}
+if (!output.includes('->')) {
+  console.error('Failed: dot output missing edges');
+  process.exit(1);
+}
+if (!output.includes('style="dashed"')) {
+  console.error('Failed: dot output missing import style');
+  process.exit(1);
+}
+
+console.log('code map dot tests passed');
diff --git a/tests/code-map-graphviz-available.js b/tests/code-map-graphviz-available.js
new file mode 100644
index 000000000..b327d8fda
--- /dev/null
+++ b/tests/code-map-graphviz-available.js
@@ -0,0 +1,120 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+
+const root = process.cwd();
+
+const dotCheck = spawnSync('dot', ['-V'], { encoding: 'utf8' });
+if (dotCheck.status !== 0) {
+  console.log('code map graphviz available test skipped (dot not found)');
+  process.exit(0);
+}
+
+const tempRoot = path.join(root, 'tests', '.cache', 'code-map-graphviz-available');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(repoRoot, { recursive: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+await fsPromises.writeFile(
+  path.join(repoRoot, '.pairofcleats.json'),
+  JSON.stringify({ indexing: { astDataflow: true, controlFlow: true } }, null, 2)
+);
+
+await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true });
+await fsPromises.writeFile(
+  path.join(repoRoot, 'src', 'a.js'),
+  'import { add } from "./b.js";\n'
+    + 'export function run(x) { return add(x, 1); }\n'
+);
+await fsPromises.writeFile(
+  path.join(repoRoot, 'src', 'b.js'),
+  'export function add(a, b) { return a + b; }\n'
+);
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const buildResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot],
+  { cwd: repoRoot, env, stdio: 'inherit' }
+);
+
+if (buildResult.status !== 0) {
+  console.error('Failed: build index for code map graphviz available test');
+  process.exit(buildResult.status ?? 1);
+}
+
+// Verify stdout rendering.
+const mapStdoutResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'tools', 'report-code-map.js'), '--format', 'svg', '--repo', repoRoot],
+  { cwd: repoRoot, env, encoding: 'utf8' }
+);
+
+if (mapStdoutResult.status !== 0) {
+  console.error('Failed: map svg output (stdout)');
+  if (mapStdoutResult.stderr) console.error(mapStdoutResult.stderr.trim());
+  process.exit(mapStdoutResult.status ?? 1);
+}
+
+const stdoutSvg = (mapStdoutResult.stdout || '').trim();
+if (!stdoutSvg.includes('<svg')) {
+  console.error('Failed: svg stdout output missing <svg>');
+  process.exit(1);
+}
+
+// Verify file output through --out + --json.
+const outPath = path.join(tempRoot, 'map.svg');
+const mapFileResult = spawnSync(
+  process.execPath,
+  [
+    path.join(root, 'tools', 'report-code-map.js'),
+    '--format',
+    'svg',
+    '--out',
+    outPath,
+    '--json',
+    '--repo',
+    repoRoot
+  ],
+  { cwd: repoRoot, env, encoding: 'utf8' }
+);
+
+if (mapFileResult.status !== 0) {
+  console.error('Failed: map svg output (file)');
+  if (mapFileResult.stderr) console.error(mapFileResult.stderr.trim());
+  process.exit(mapFileResult.status ??
+let report = null;
+try {
+  report = JSON.parse(mapFileResult.stdout || '{}');
+} catch {
+  console.error('Failed: svg --json output invalid JSON');
+  process.exit(1);
+}
+
+if (report.format !== 'svg') {
+  console.error(`Failed: expected format svg but saw ${report.format}`);
+  process.exit(1);
+}
+if (!report.outPath) {
+  console.error('Failed: svg report missing outPath');
+  process.exit(1);
+}
+
+const fileSvg = (await fsPromises.readFile(report.outPath, 'utf8')).trim();
+if (!fileSvg.includes('<svg')) {
+  console.error('Failed: svg file missing <svg>');
+  process.exit(1);
+}
+
+console.log('code map graphviz available tests passed');
diff --git a/tests/code-map-graphviz-fallback.js b/tests/code-map-graphviz-fallback.js
new file mode 100644
index 000000000..d288c225b
--- /dev/null
+++ b/tests/code-map-graphviz-fallback.js
@@ -0,0 +1,75 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'code-map-graphviz');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(repoRoot, { recursive: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true });
+
+await fsPromises.writeFile(
+  path.join(repoRoot, 'src', 'a.js'),
+  'export function alpha() { return 1; }\n'
+);
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const buildResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot],
+  { cwd: repoRoot, env, stdio: 'inherit' }
+);
+
+if (buildResult.status !== 0) {
+  console.error('Failed: build index for graphviz fallback test');
+  process.exit(buildResult.status ?? 1);
+}
+
+const outPath = path.join(tempRoot, 'map.svg');
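+// Run the tool with an emptied PATH so the dot binary cannot be resolved and the DOT fallback engages.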
+const mapResult = spawnSync(
+  process.execPath,
+  [
+    path.join(root, 'tools', 'report-code-map.js'),
+    '--format', 'svg',
+    '--repo', repoRoot,
+    '--out', outPath,
+    '--json'
+  ],
+  {
+    cwd: repoRoot,
+    env: {
+      ...env,
+      PATH: '',
+      Path: ''
+    },
+    encoding: 'utf8'
+  }
+);
+
+if (mapResult.status !== 0) {
+  console.error('Failed: graphviz fallback map output');
+  process.exit(mapResult.status ?? 1);
+}
+
+const payload = JSON.parse(mapResult.stdout || '{}');
+if (payload.format !== 'dot') {
+  console.error('Failed: expected dot fallback');
+  process.exit(1);
+}
+if (!payload.outPath || !payload.outPath.endsWith('.dot')) {
+  console.error('Failed: expected .dot output path');
+  process.exit(1);
+}
+
+console.log('code map graphviz fallback tests passed');
diff --git a/tests/code-map-guardrails.js b/tests/code-map-guardrails.js
new file mode 100644
index 000000000..de758ffc0
--- /dev/null
+++ b/tests/code-map-guardrails.js
@@ -0,0 +1,70 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'code-map-guardrails');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(repoRoot, { recursive: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true });
+
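+// Generate far more members than the guardrail caps below allow so truncation must kick in.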
+const funcs = [];
+for (let i = 0; i < 120; i += 1) {
+  funcs.push(`export function fn${i}() { return ${i}; }`);
+}
+await fsPromises.writeFile(path.join(repoRoot, 'src', 'many.js'), funcs.join('\n'));
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const buildResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot],
+  { cwd: repoRoot, env, stdio: 'inherit' }
+);
+
+if (buildResult.status !== 0) {
+  console.error('Failed: build index for guardrails test');
+  process.exit(buildResult.status ?? 1);
+}
+
+const mapResult = spawnSync(
+  process.execPath,
+  [
+    path.join(root, 'tools', 'report-code-map.js'),
+    '--format', 'json',
+    '--repo', repoRoot,
+    '--max-members-per-file', '5',
+    '--max-files', '1',
+    '--max-edges', '2'
+  ],
+  { cwd: repoRoot, env, encoding: 'utf8' }
+);
+
+if (mapResult.status !== 0) {
+  console.error('Failed: guardrails map output');
+  process.exit(mapResult.status ?? 1);
+}
+
+const payload = JSON.parse(mapResult.stdout || '{}');
+const summary = payload.summary || {};
+const dropped = summary.dropped || {};
+if (!summary.truncated) {
+  console.error('Failed: guardrails did not truncate');
+  process.exit(1);
+}
+if (!dropped.members || dropped.members < 1) {
+  console.error('Failed: guardrails did not drop members');
+  process.exit(1);
+}
+
+console.log('code map guardrails tests passed');
diff --git a/tests/compact-pieces.js b/tests/compact-pieces.js
new file mode 100644
index 000000000..927a63593
--- /dev/null
+++ b/tests/compact-pieces.js
@@ -0,0 +1,100 @@
+#!/usr/bin/env node
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'compact-pieces');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+const buildIndexPath = path.join(root, 'build_index.js');
+const compactPiecesPath = path.join(root, 'tools', 'compact-pieces.js');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(repoRoot, { recursive: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+await fsPromises.writeFile(path.join(repoRoot, 'alpha.js'), 'const alpha = 1;\n');
+await fsPromises.writeFile(path.join(repoRoot, 'beta.js'), 'const beta = 2;\n');
+await fsPromises.writeFile(
+  path.join(repoRoot, '.pairofcleats.json'),
+  JSON.stringify({
+    indexing: {
+      treeSitter: { enabled: false },
+      artifacts: {
+        chunkMetaFormat: 'jsonl',
+        chunkMetaShardSize: 1,
+        tokenPostingsFormat: 'sharded',
+        tokenPostingsShardSize: 1
+      }
+    }
+  }, null, 2)
+);
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const runNode = (label, args, cwd = repoRoot) => {
+  const result = spawnSync(process.execPath, args, { cwd, env, stdio: 'inherit' });
+  if (result.status !== 0) {
+    console.error(`Failed: ${label}`);
+    process.exit(result.status ?? 1);
+  }
+};
+
+runNode('build_index', [buildIndexPath, '--stub-embeddings', '--mode', 'code', '--repo', repoRoot]);
+
+const userConfig = loadUserConfig(repoRoot);
+const previousCacheRoot = process.env.PAIROFCLEATS_CACHE_ROOT;
+process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot;
+const indexDir = getIndexDir(repoRoot, 'code', userConfig);
+if (previousCacheRoot === undefined) {
+  delete process.env.PAIROFCLEATS_CACHE_ROOT;
+} else {
+  process.env.PAIROFCLEATS_CACHE_ROOT = previousCacheRoot;
+}
+const chunkPartsDir = path.join(indexDir, 'chunk_meta.parts');
+const tokenPartsDir = path.join(indexDir, 'token_postings.shards');
+const beforeChunkParts = fs.existsSync(chunkPartsDir) ? fs.readdirSync(chunkPartsDir).length : 0;
+const beforeTokenParts = fs.existsSync(tokenPartsDir) ? fs.readdirSync(tokenPartsDir).length : 0;
+if (beforeChunkParts < 2) {
+  console.error('Expected multiple chunk_meta parts before compaction.');
+  process.exit(1);
+}
+
+runNode('compact-pieces', [
+  compactPiecesPath,
+  '--repo',
+  repoRoot,
+  '--mode',
+  'code',
+  '--chunk-meta-size',
+  '10',
+  '--token-postings-size',
+  '10'
+]);
+
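+// Shard size 1 above forces many small parts, so compaction should consolidate them.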
+const afterChunkParts = fs.existsSync(chunkPartsDir) ? fs.readdirSync(chunkPartsDir).length : 0;
+const afterTokenParts = fs.existsSync(tokenPartsDir) ? fs.readdirSync(tokenPartsDir).length : 0;
+if (afterChunkParts >= beforeChunkParts) {
+  console.error('Expected chunk_meta parts to shrink after compaction.');
+  process.exit(1);
+}
+if (beforeTokenParts >= 2 && afterTokenParts >= beforeTokenParts) {
+  console.error('Expected token_postings shards to shrink after compaction.');
+  process.exit(1);
+}
+
+const logPath = path.join(indexDir, 'pieces', 'compaction.log');
+if (!fs.existsSync(logPath)) {
+  console.error(`Expected compaction log at ${logPath}`);
+  process.exit(1);
+}
+
+console.log('compact pieces test passed');
diff --git a/tests/compare-models.js b/tests/compare-models.js
deleted file mode 100644
index 26e1681e2..000000000
--- a/tests/compare-models.js
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env node
-import fs from 'node:fs';
-import fsPromises from 'node:fs/promises';
-import path from 'node:path';
-import { spawnSync } from 'node:child_process';
-
-const root = process.cwd();
-const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample');
-const tempRoot = path.join(root, 'tests', '.cache', 'compare-models');
-const cacheRoot = path.join(tempRoot, 'cache');
-
-await fsPromises.rm(tempRoot, { recursive: true, force: true });
-await fsPromises.mkdir(cacheRoot, { recursive: true });
-
-const env = {
-  ...process.env,
-  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
-  PAIROFCLEATS_EMBEDDINGS: 'stub'
-};
-
-const models = [
-  'Xenova/all-MiniLM-L12-v2',
-  'Xenova/all-MiniLM-L6-v2'
-];
-
-const result = spawnSync(
-  process.execPath,
-  [
-    path.join(root, 'tools', 'compare-models.js'),
-    '--models',
-    models.join(','),
-    '--build',
-    '--stub-embeddings',
-    '--no-ann',
-    '--limit',
-    '2',
-    '--json'
-  ],
-  { cwd: fixtureRoot, env, encoding: 'utf8' }
-);
-
-if (result.status !== 0) {
-  console.error('compare models test failed: script error.');
-  if (result.stderr) console.error(result.stderr.trim());
-  process.exit(result.status ?? 1);
-}
-
-const payload = JSON.parse(result.stdout || '{}');
-if (!payload.summary || !payload.settings || !payload.results) {
-  console.error('compare models test failed: missing fields.');
-  process.exit(1);
-}
-if (!Array.isArray(payload.settings.models) || payload.settings.models.length < 2) {
-  console.error('compare models test failed: models missing.');
-  process.exit(1);
-}
-if (!payload.summary.models || !payload.summary.comparisons) {
-  console.error('compare models test failed: summary missing.');
-  process.exit(1);
-}
-
-console.log('compare models test passed');
diff --git a/tests/config-dump.js b/tests/config-dump.js
new file mode 100644
index 000000000..309ef6977
--- /dev/null
+++ b/tests/config-dump.js
@@ -0,0 +1,19 @@
+#!/usr/bin/env node
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { spawnSync } from 'node:child_process';
+
+const root = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
+const scriptPath = path.join(root, 'tools', 'config-dump.js');
+const result = spawnSync(process.execPath, [scriptPath, '--json'], { encoding: 'utf8', cwd: root });
+if (result.status !== 0) {
+  throw new Error(`config-dump failed: ${result.stderr || result.stdout}`);
+}
+const payload = JSON.parse(result.stdout || '{}');
+if (!payload.repoRoot) {
+  throw new Error('config-dump did not report repoRoot.');
+}
+if (!payload.derived || !payload.derived.cacheRoot) {
+  throw new Error('config-dump did not include derived cacheRoot.');
+}
+console.log('Config dump test passed');
diff --git a/tests/config-validate.js b/tests/config-validate.js
index 349208dc8..dca333af9 100644
--- a/tests/config-validate.js
+++ b/tests/config-validate.js
@@ -14,7 +14,7 @@ const invalidPath = path.join(cacheRoot, 'invalid.json');
 
 await fsPromises.writeFile(
   validPath,
-  JSON.stringify({ search: { annDefault: true }, sqlite: { use: true } }, null, 2)
+  JSON.stringify({ search: { annDefault: true }, sqlite: { use: true }, runtime: { uvThreadpoolSize: 8 } }, null, 2)
 );
 await fsPromises.writeFile(
   invalidPath,
diff --git a/tests/context-expansion.js b/tests/context-expansion.js
new file mode 100644
index 000000000..78733b07e
--- /dev/null
+++ b/tests/context-expansion.js
@@ -0,0 +1,36 @@
+#!/usr/bin/env node
+import { expandContext } from '../src/retrieval/context-expansion.js';
+
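+// Minimal relation fixture: alpha calls beta, and src/a.js imports src/c.js.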
+const chunkMeta = [
+  { id: 0, file: 'src/a.js', name: 'alpha', codeRelations: { calls: [['alpha', 'beta']] } },
+  { id: 1, file: 'src/b.js', name: 'beta' },
+  { id: 2, file: 'src/c.js', name: 'gamma' }
+];
+
+const fileRelations = new Map([
+  ['src/a.js', { importLinks: ['src/c.js'], usages: ['beta'], exports: [] }]
+]);
+
+const hits = [{ id: 0, file: 'src/a.js' }];
+const contextHits = expandContext({
+  hits,
+  chunkMeta,
+  fileRelations,
+  repoMap: null,
+  options: {
+    maxPerHit: 5,
+    maxTotal: 10,
+    includeCalls: true,
+    includeImports: true,
+    includeUsages: true
+  }
+});
+
+const ids = new Set(contextHits.map((hit) => hit.id));
+if (!ids.has(1) || !ids.has(2)) {
+  console.error('Expected context expansion to include call and import targets.');
+  process.exit(1);
+}
+
+console.log('context expansion test passed');
diff --git a/tests/core-api.js b/tests/core-api.js
new file mode 100644
index 000000000..c709737d1
--- /dev/null
+++ b/tests/core-api.js
@@ -0,0 +1,50 @@
+#!/usr/bin/env node
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { buildIndex, search, status } from '../src/integrations/core/index.js';
+import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js';
+
+const root = process.cwd();
+const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample');
+const cacheRoot = path.join(root, 'tests', '.cache', 'core-api');
+
+if (!fs.existsSync(fixtureRoot)) {
+  console.error(`Fixture not found: ${fixtureRoot}`);
+  process.exit(1);
+}
+
+await fsPromises.rm(cacheRoot, { recursive: true, force: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot;
+process.env.PAIROFCLEATS_EMBEDDINGS = 'stub';
+
+await buildIndex(fixtureRoot, {
+  mode: 'code',
+  sqlite: false,
+  stubEmbeddings: true,
+  log: () => {}
+});
+
+const userConfig = loadUserConfig(fixtureRoot);
+const indexDir = getIndexDir(fixtureRoot, 'code', userConfig);
+const chunkPath = path.join(indexDir, 'chunk_meta.json');
+if (!fs.existsSync(chunkPath)) {
+  console.error(`Core API test failed: missing ${chunkPath}`);
+  process.exit(1);
+}
+
+const searchPayload = await search(fixtureRoot, { query: 'index', mode: 'code', json: true });
+if (!searchPayload || !Array.isArray(searchPayload.code)) {
+  console.error('Core API test failed: search payload missing code results.');
+  process.exit(1);
+}
+
+const statusPayload = await status(fixtureRoot);
+if (!statusPayload?.repo?.root) {
+  console.error('Core API test failed: status payload missing repo root.');
+  process.exit(1);
+}
+
+console.log('core api test passed');
diff --git a/tests/ctags-ingest.js b/tests/ctags-ingest.js
new file mode 100644
index 000000000..262a721cb
--- /dev/null
+++ b/tests/ctags-ingest.js
@@ -0,0 +1,47 @@
+#!/usr/bin/env node
+import assert from 'node:assert/strict';
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'ctags-ingest');
+const repoRoot = path.join(root, 'tests', 'fixtures', 'sample');
+const inputPath = path.join(root, 'tests', 'fixtures', 'ctags', 'tags.jsonl');
+const outPath = path.join(tempRoot, 'ctags.jsonl');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(tempRoot, { recursive: true });
+
+const result = spawnSync(
+  process.execPath,
+  [path.join(root, 'tools', 'ctags-ingest.js'), '--repo', repoRoot, '--input', inputPath, '--out', outPath, '--json'],
+  { encoding: 'utf8' }
+);
+if (result.status !== 0) {
+  console.error(result.stderr || result.stdout || 'ctags-ingest failed');
+  process.exit(result.status ?? 1);
+}
+
+if (!fs.existsSync(outPath)) {
+  console.error('ctags output not found');
+  process.exit(1);
+}
+
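+// Spot-check the first normalized entry and the sidecar stats metadata.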
+const lines = fs.readFileSync(outPath, 'utf8').trim().split(/\r?\n/).filter(Boolean);
+assert.ok(lines.length >= 2, 'expected ctags output lines');
+
+const first = JSON.parse(lines[0]);
+assert.equal(first.file, 'src/widget.js');
+assert.equal(first.name, 'Widget');
+assert.equal(first.kind, 'class');
+assert.equal(first.language, 'JavaScript');
+assert.equal(first.startLine, 3);
+
+const metaPath = `${outPath}.meta.json`;
+const meta = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
+assert.equal(meta.stats.entries, lines.length);
+
+console.log('ctags ingest test passed');
diff --git a/tests/dict-adaptive.js b/tests/dict-adaptive.js
new file mode 100644
index 000000000..d29171654
--- /dev/null
+++ b/tests/dict-adaptive.js
@@ -0,0 +1,27 @@
+#!/usr/bin/env node
+import { applyAdaptiveDictConfig } from '../tools/dict-utils.js';
+
+const base = {
+  segmentation: 'auto',
+  dpMaxTokenLength: 32,
+  dpMaxTokenLengthByFileCount: [
+    { maxFiles: 5000, dpMaxTokenLength: 32 },
+    { maxFiles: 20000, dpMaxTokenLength: 24 },
+    { maxFiles: 999999, dpMaxTokenLength: 16 }
+  ]
+};
+
+const expect = (actual, expected, label) => {
+  if (actual !== expected) {
+    console.error(`dict adaptive test failed (${label}): expected ${expected}, got ${actual}`);
+    process.exit(1);
+  }
+};
+
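+// Each file-count tier clamps dpMaxTokenLength; an explicit greedy config is left alone.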
+expect(applyAdaptiveDictConfig(base, 100).dpMaxTokenLength, 32, 'small repo');
+expect(applyAdaptiveDictConfig(base, 12000).dpMaxTokenLength, 24, 'mid repo');
+expect(applyAdaptiveDictConfig(base, 80000).dpMaxTokenLength, 16, 'large repo');
+expect(applyAdaptiveDictConfig({ segmentation: 'greedy', dpMaxTokenLength: 12 }, 50000).dpMaxTokenLength, 12, 'greedy override');
+
+console.log('dictionary adaptive config test passed');
diff --git a/tests/discover.js b/tests/discover.js
new file mode 100644
index 000000000..e8101331e
--- /dev/null
+++ b/tests/discover.js
@@ -0,0 +1,99 @@
+import assert from 'node:assert/strict';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { discoverFiles, discoverFilesForModes } from '../src/index/build/discover.js';
+import { buildIgnoreMatcher } from '../src/index/build/ignore.js';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'discover');
+
+await fs.rm(tempRoot, { recursive: true, force: true });
+await fs.mkdir(path.join(tempRoot, 'src'), { recursive: true });
+await fs.mkdir(path.join(tempRoot, 'docs'), { recursive: true });
+await fs.mkdir(path.join(tempRoot, 'src', 'deep', 'nested'), { recursive: true });
+
+const gitCheck = spawnSync('git', ['--version'], { encoding: 'utf8' });
+if (gitCheck.status !== 0) {
+  console.log('skip: git not available');
+  process.exit(0);
+}
+
+const runGit = (args) => {
+  const result = spawnSync('git', args, { cwd: tempRoot, encoding: 'utf8' });
+  if (result.status !== 0) {
+    throw new Error(`git ${args.join(' ')} failed: ${result.stderr || result.stdout}`);
+  }
+};
+
+runGit(['init']);
+runGit(['config', 'user.email', 'tests@example.com']);
+runGit(['config', 'user.name', 'Tests']);
+
+await fs.writeFile(path.join(tempRoot, 'src', 'app.js'), 'console.log("hi")\n');
+await fs.writeFile(path.join(tempRoot, 'src', 'deep', 'nested', 'too-deep.js'), 'console.log("deep")\n');
+await fs.writeFile(path.join(tempRoot, 'docs', 'readme.md'), '# Hello\n');
+await fs.writeFile(path.join(tempRoot, 'Dockerfile.dev'), 'FROM node:20\n');
+await fs.writeFile(path.join(tempRoot, 'Makefile.in'), 'build:\n\t@echo ok\n');
+runGit(['add', '.']);
+runGit(['commit', '-m', 'init']);
+
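+// Written after the commit, so git-based discovery must not surface it.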
+await fs.writeFile(path.join(tempRoot, 'src', 'untracked.js'), 'console.log("no")\n');
+
+const { ignoreMatcher } = await buildIgnoreMatcher({ root: tempRoot, userConfig: {} });
+
+const skipped = [];
+const codeEntries = await discoverFiles({
+  root: tempRoot,
+  mode: 'code',
+  ignoreMatcher,
+  skippedFiles: skipped,
+  maxFileBytes: null
+});
+const codeRel = codeEntries.map((entry) => entry.rel);
+assert.ok(codeRel.includes('src/app.js'), 'tracked code file missing');
+assert.ok(codeRel.includes('Dockerfile.dev'), 'Dockerfile variant missing');
+assert.ok(codeRel.includes('Makefile.in'), 'Makefile variant missing');
+assert.ok(!codeRel.includes('src/untracked.js'), 'untracked file should not be discovered');
+assert.ok(codeEntries[0].stat && typeof codeEntries[0].stat.size === 'number', 'stat missing');
+
+const depthSkipped = [];
+const depthLimited = await discoverFiles({
+  root: tempRoot,
+  mode: 'code',
+  ignoreMatcher,
+  skippedFiles: depthSkipped,
+  maxFileBytes: null,
+  maxDepth: 1
+});
+assert.ok(!depthLimited.some((entry) => entry.rel.includes('deep/nested')), 'maxDepth should skip deep files');
+assert.ok(depthSkipped.some((entry) => entry.reason === 'max-depth'), 'maxDepth skip reason missing');
+
+const countSkipped = [];
+const countLimited = await discoverFiles({
+  root: tempRoot,
+  mode: 'code',
+  ignoreMatcher,
+  skippedFiles: countSkipped,
+  maxFileBytes: null,
+  maxFiles: 1
+});
+assert.ok(countLimited.length <= 1, 'maxFiles should cap entries');
+assert.ok(countSkipped.some((entry) => entry.reason === 'max-files'), 'maxFiles skip reason missing');
+
+const skippedByMode = { code: [], prose: [] };
+const byMode = await discoverFilesForModes({
+  root: tempRoot,
+  modes: ['code', 'prose'],
+  ignoreMatcher,
+  skippedByMode,
+  maxFileBytes: null
+});
+assert.ok(byMode.code.some((entry) => entry.rel === 'src/app.js'), 'code mode missing app.js');
+assert.ok(byMode.prose.some((entry) => entry.rel === 'docs/readme.md'), 'prose mode missing readme');
+assert.ok(!byMode.code.some((entry) => entry.rel === 'src/untracked.js'), 'untracked file should not appear');
+assert.ok(byMode.code.every((entry) => entry.stat), 'code entries missing stat');
+assert.ok(byMode.prose.every((entry) => entry.stat), 'prose entries missing stat');
+
+console.log('discover test passed');
diff --git a/tests/docs-consistency.js b/tests/docs-consistency.js
deleted file mode 100644
index 918aeada7..000000000
--- a/tests/docs-consistency.js
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env node
-import fs from 'node:fs';
-import path from 'node:path';
-
-const root = process.cwd();
-const roadmapPath = path.join(root, 'ROADMAP.md');
-const planPath = path.join(root, 'COMPLETE_PLAN.md');
-const readmePath = path.join(root, 'README.md');
-
-const failures = [];
-const roadmap = fs.existsSync(roadmapPath) ? fs.readFileSync(roadmapPath, 'utf8') : '';
-const plan = fs.existsSync(planPath) ? fs.readFileSync(planPath, 'utf8') : '';
-const readme = fs.existsSync(readmePath) ? fs.readFileSync(readmePath, 'utf8') : '';
-
-if (!plan) failures.push('COMPLETE_PLAN.md missing or empty.');
-if (!roadmap) failures.push('ROADMAP.md missing or empty.');
-if (roadmap && !roadmap.toLowerCase().includes('historical')) {
-  failures.push('ROADMAP.md should be marked as historical.');
-}
-if (roadmap && !roadmap.includes('COMPLETE_PLAN.md')) {
-  failures.push('ROADMAP.md should reference COMPLETE_PLAN.md as the source of truth.');
-}
-if (readme && !readme.includes('COMPLETE_PLAN.md')) {
-  failures.push('README.md should reference COMPLETE_PLAN.md.');
-}
-
-if (failures.length) {
-  failures.forEach((msg) => console.error(msg));
-  process.exit(1);
-}
-
-console.log('Docs consistency test passed');
diff --git a/tests/download-dicts.js b/tests/download-dicts.js
index c3e99f595..84a50b035 100644
--- a/tests/download-dicts.js
+++ b/tests/download-dicts.js
@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 import fs from 'node:fs';
 import fsPromises from 'node:fs/promises';
+import crypto from 'node:crypto';
 import http from 'node:http';
 import path from 'node:path';
 import { spawn } from 'node:child_process';
@@ -17,6 +18,9 @@ if (!fs.existsSync(sourceFile)) {
   console.error(`Missing fixture: ${sourceFile}`);
   process.exit(1);
 }
+const sourceHash = crypto.createHash('sha256')
+  .update(await fsPromises.readFile(sourceFile))
+  .digest('hex');
 
 const server = http.createServer((req, res) => {
   const filePath = sourceFile;
@@ -60,6 +64,8 @@ const result = await run(
   path.join(root, 'tools', 'download-dicts.js'),
   '--url',
   `test=${url}`,
+  '--sha256',
+  `test=${sourceHash}`,
   '--lang',
   'test',
   '--dir',
@@ -98,5 +104,9 @@ if (!manifest.test || manifest.test.url !== url || manifest.test.file !== 'test.
   console.error('download-dicts test failed: manifest entry mismatch.');
   process.exit(1);
 }
+if (manifest.test.sha256 !== sourceHash || manifest.test.verified !== true) {
+  console.error('download-dicts test failed: hash verification missing.');
+  process.exit(1);
+}
 
 console.log('download-dicts test passed');
diff --git a/tests/download-extensions.js b/tests/download-extensions.js
index 722801f52..a397f2cba 100644
--- a/tests/download-extensions.js
+++ b/tests/download-extensions.js
@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 import fs from 'node:fs';
 import fsPromises from 'node:fs/promises';
+import crypto from 'node:crypto';
 import http from 'node:http';
 import path from 'node:path';
 import { spawn, spawnSync } from 'node:child_process';
@@ -37,6 +38,11 @@ function runDownload(args) {
   });
 }
 
+async function hashFile(filePath) {
+  const buffer = await fsPromises.readFile(filePath);
+  return crypto.createHash('sha256').update(buffer).digest('hex');
+}
+
 await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve));
 const address = server.address();
 const port = typeof address === 'object' && address ? address.port : 0;
@@ -45,15 +51,22 @@ const cases = [
   { label: 'zip', archive: 'vec0.zip', expectedArchive: 'zip' },
   { label: 'tar', archive: 'vec0.tar', expectedArchive: 'tar' }
 ];
+const maliciousCases = [
+  { label: 'zip-slip', archive: 'vec0-slip.zip', escapeName: 'pwned-zip.txt' },
+  { label: 'tar-slip', archive: 'vec0-slip.tar', escapeName: 'pwned-tar.txt' }
+];
 
 const failures = [];
 
 for (const entry of cases) {
   const extensionDir = path.join(tempRoot, entry.label);
   const url = `http://127.0.0.1:${port}/${entry.archive}`;
+  const archiveHash = await hashFile(path.join(fixturesRoot, entry.archive));
   const status = await runDownload([
     path.join(root, 'tools', 'download-extensions.js'),
     '--url',
     `vec0=${url}`,
+    '--sha256',
+    `vec0=${archiveHash}`,
     '--dir',
     extensionDir,
     '--provider',
@@ -98,6 +111,9 @@ for (const entry of cases) {
   if (!record.extractedFrom) {
     failures.push(`${entry.label} manifest extractedFrom missing`);
   }
+  if (record.sha256 !== archiveHash || record.verified !== true) {
+    failures.push(`${entry.label} manifest hash verification missing`);
+  }
 
   const verify = spawnSync(
     process.execPath,
@@ -126,6 +142,38 @@ for (const entry of cases) {
   }
 }
 
+for (const entry of maliciousCases) {
+  const extensionDir = path.join(tempRoot, entry.label);
+  const url = `http://127.0.0.1:${port}/${entry.archive}`;
+  const archiveHash = await hashFile(path.join(fixturesRoot, entry.archive));
+  const escapePath = path.join(tempRoot, entry.escapeName);
+  await fsPromises.rm(escapePath, { force: true });
+  await runDownload([
+    path.join(root, 'tools', 'download-extensions.js'),
+    '--url',
+    `vec0=${url}`,
+    '--sha256',
+    `vec0=${archiveHash}`,
+    '--dir',
+    extensionDir,
+    '--provider',
+    'sqlite-vec',
+    '--platform',
+    'win32',
+    '--arch',
+    'x64',
+    '--force'
+  ]);
+  const expectedPath = path.join(extensionDir, 'sqlite-vec', 'win32-x64', 'vec0.dll');
+  if (fs.existsSync(escapePath)) {
+    failures.push(`${entry.label} wrote outside extraction root`);
+    continue;
+  }
+  if (fs.existsSync(expectedPath)) {
+    failures.push(`${entry.label} unexpectedly extracted binary`);
+  }
+}
+
 server.close();
 
 if (failures.length) {
diff --git a/tests/editor-parity.js b/tests/editor-parity.js
new file mode 100644
index 000000000..329a5ec24
--- /dev/null
+++ b/tests/editor-parity.js
@@ -0,0 +1,102 @@
+#!/usr/bin/env node
+import fs from 'node:fs';
+import path from 'node:path';
+import { DEFAULT_LIMITS } from '../src/map/constants.js';
+
+const root = process.cwd();
+
+const readJson = (filePath, label) => {
+  try {
+    return JSON.parse(fs.readFileSync(filePath, 'utf8'));
+  } catch (err) {
+    console.error(`Failed: ${label} invalid JSON (${filePath})`);
+    console.error(String(err?.message || err));
+    process.exit(1);
+  }
+};
+
+const vscodePackagePath = path.join(root, 'extensions', 'vscode', 'package.json');
+const vscodePackage = readJson(vscodePackagePath, 'vscode extension manifest');
+const vscodeConfig = vscodePackage?.contributes?.configuration?.properties || {};
+
+const getVsCodeDefault = (key) => vscodeConfig?.[key]?.default;
+
+const sublimeSettingsPath = path.join(
+  root,
+  'sublime',
+  'PairOfCleats',
+  'PairOfCleats.sublime-settings'
+);
+const sublimeSettings = readJson(sublimeSettingsPath, 'sublime settings');
+
+const sublimeCommandsPath = path.join(
+  root,
+  'sublime',
+  'PairOfCleats',
+  'Default.sublime-commands'
+);
+const sublimeCommands = readJson(sublimeCommandsPath, 'sublime command palette');
+
+const requiredVsCodeKeys = [
+  'pairofcleats.cliPath',
+  'pairofcleats.searchMode',
+  'pairofcleats.searchBackend',
+  'pairofcleats.searchAnn',
+  'pairofcleats.maxResults'
+];
+
+for (const key of requiredVsCodeKeys) {
+  if (!(key in vscodeConfig)) {
+    console.error(`Failed: VSCode extension missing configuration property: ${key}`);
+    process.exit(1);
+  }
+}
+
+const requiredSublimeCommands = [
+  'pair_of_cleats_search',
+  'pair_of_cleats_search_selection',
+  'pair_of_cleats_index_build_all',
+  'pair_of_cleats_map_repo',
+  'pair_of_cleats_map_current_file',
+  'pair_of_cleats_map_jump_to_node'
+];
+
+if (!Array.isArray(sublimeCommands)) {
+  console.error('Failed: Sublime Default.sublime-commands is not a JSON array');
+  process.exit(1);
+}
+
+const sublimeCommandSet = new Set(sublimeCommands.map((entry) => entry?.command).filter(Boolean));
+for (const command of requiredSublimeCommands) {
+  if (!sublimeCommandSet.has(command)) {
+    console.error(`Failed: Sublime command palette missing command: ${command}`);
+    process.exit(1);
+  }
+}
+
+const ensureEqual = (label, actual, expected) => {
+  if (actual !== expected) {
+    console.error(`Failed: ${label} expected ${JSON.stringify(expected)} but saw ${JSON.stringify(actual)}`);
+    process.exit(1);
+  }
+};
+
+// Search defaults parity (Sublime ↔ VSCode).
+ensureEqual('search limit parity', sublimeSettings.search_limit, getVsCodeDefault('pairofcleats.maxResults'));
+ensureEqual('search mode parity', sublimeSettings.index_mode_default, getVsCodeDefault('pairofcleats.searchMode'));
+ensureEqual(
+  'search backend parity',
+  sublimeSettings.search_backend_default,
+  getVsCodeDefault('pairofcleats.searchBackend')
+);
+
+// Guardrail parity (Sublime ↔ CLI defaults).
+ensureEqual('map max files parity', sublimeSettings.map_max_files, DEFAULT_LIMITS.maxFiles);
+ensureEqual(
+  'map max members per file parity',
+  sublimeSettings.map_max_members_per_file,
+  DEFAULT_LIMITS.maxMembersPerFile
+);
+ensureEqual('map max edges parity', sublimeSettings.map_max_edges, DEFAULT_LIMITS.maxEdges);
+
+console.log('editor parity checklist tests passed');
diff --git a/tests/embedding-batch-autotune.js b/tests/embedding-batch-autotune.js
new file mode 100644
index 000000000..7587e78c1
--- /dev/null
+++ b/tests/embedding-batch-autotune.js
@@ -0,0 +1,39 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { parseBuildArgs } from '../src/index/build/args.js';
+import { createBuildRuntime } from '../src/index/build/runtime.js';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'embedding-batch-autotune');
+const repoRoot = path.join(tempRoot, 'repo');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(repoRoot, { recursive: true });
+process.env.PAIROFCLEATS_CACHE_ROOT = tempRoot;
+
+await fsPromises.writeFile(
+  path.join(repoRoot, '.pairofcleats.json'),
+  JSON.stringify({
+    indexing: {
+      embeddings: { enabled: true, mode: 'stub' },
+      treeSitter: { enabled: false }
+    }
+  }, null, 2)
+);
+
+const defaults = parseBuildArgs([]).argv;
+const argv = { ...defaults, 'stub-embeddings': true };
+const runtime = await createBuildRuntime({ root: repoRoot, argv, rawArgv: [] });
+
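+// Auto-tuned values should land inside a sane window rather than on one exact number.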
+if (runtime.embeddingBatchSize < 32 || runtime.embeddingBatchSize > 128) {
+  console.error(`Unexpected embedding batch size: ${runtime.embeddingBatchSize}`);
+  process.exit(1);
+}
+if (runtime.embeddingConcurrency < 1) {
+  console.error(`Unexpected embedding concurrency: ${runtime.embeddingConcurrency}`);
+  process.exit(1);
+}
+
+console.log('embedding auto-tune test passed');
diff --git a/tests/embedding-batch-multipliers.js b/tests/embedding-batch-multipliers.js
new file mode 100644
index 000000000..a6cb4896b
--- /dev/null
+++ b/tests/embedding-batch-multipliers.js
@@ -0,0 +1,18 @@
+#!/usr/bin/env node
+import { normalizeEmbeddingBatchMultipliers, resolveEmbeddingBatchSize } from '../src/index/build/embedding-batch.js';
+
+const multipliers = normalizeEmbeddingBatchMultipliers({ typescript: 4, python: 2 }, { typescript: 3, rust: 1.5 });
+
+const expect = (label, actual, expected) => {
+  if (actual !== expected) {
+    console.error(`embedding batch multiplier failed (${label}): ${actual} !== ${expected}`);
+    process.exit(1);
+  }
+};
+
+expect('typescript', resolveEmbeddingBatchSize(10, 'typescript', multipliers), 40);
+expect('python', resolveEmbeddingBatchSize(10, 'python', multipliers), 20);
+expect('rust fallback', resolveEmbeddingBatchSize(10, 'rust', multipliers), 15);
+expect('unknown', resolveEmbeddingBatchSize(10, 'go', multipliers), 10);
+
+console.log('embedding batch multiplier test passed');
diff --git a/tests/embedding-provider-strict.js b/tests/embedding-provider-strict.js
new file mode 100644
index 000000000..5fc9f99d1
--- /dev/null
+++ b/tests/embedding-provider-strict.js
@@ -0,0 +1,18 @@
+#!/usr/bin/env node
+import assert from 'node:assert/strict';
+
+import { normalizeEmbeddingProvider } from '../src/shared/onnx-embeddings.js';
+
+assert.equal(normalizeEmbeddingProvider(undefined), 'xenova');
+assert.equal(normalizeEmbeddingProvider(' '), 'xenova');
+assert.equal(normalizeEmbeddingProvider('TRANSFORMERS'), 'xenova');
+assert.equal(normalizeEmbeddingProvider('onnxruntime-node'), 'onnx');
+assert.equal(normalizeEmbeddingProvider('xenova'), 'xenova');
+
+assert.throws(
+  () => normalizeEmbeddingProvider('provider-a'),
+  /Unknown embedding provider/i,
+  'expected unknown provider to throw rather than silently falling back'
+);
+
+console.log('embedding provider strict validation test passed');
diff --git a/tests/embeddings-cache-identity.js b/tests/embeddings-cache-identity.js
new file mode 100644
index 000000000..d0a0ed6be
--- /dev/null
+++ b/tests/embeddings-cache-identity.js
@@ -0,0 +1,100 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { getRepoCacheRoot, loadUserConfig } from '../tools/dict-utils.js';
+
+const root = process.cwd();
+const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample');
+const tempRoot = path.join(root, 'tests', '.cache', 'embeddings-cache-identity');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(tempRoot, { recursive: true });
+await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true });
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot;
+process.env.PAIROFCLEATS_EMBEDDINGS = 'stub';
+
+const buildIndex = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot],
+  { cwd: repoRoot, env, stdio: 'inherit' }
+);
+if (buildIndex.status !== 0) {
+  console.error('embeddings cache identity test failed: build_index failed');
+  process.exit(buildIndex.status ?? 1);
+}
+
+const runEmbeddings = (dims) => {
+  const result = spawnSync(
+    process.execPath,
+    [
+      path.join(root, 'tools', 'build-embeddings.js'),
+      '--stub-embeddings',
+      '--mode',
+      'code',
+      '--dims',
+      String(dims),
+      '--repo',
+      repoRoot
+    ],
+    { cwd: repoRoot, env, stdio: 'inherit' }
+  );
+  if (result.status !== 0) {
+    console.error(`embeddings cache identity test failed: build-embeddings dims=${dims} failed`);
+    process.exit(result.status ?? 1);
+  }
+};
+
+runEmbeddings(8);
+
+const userConfig = loadUserConfig(repoRoot);
+const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig);
+const cacheDir = path.join(repoCacheRoot, 'embeddings', 'code', 'files');
+const firstFiles = (await fsPromises.readdir(cacheDir))
+  .filter((name) => name.endsWith('.json'));
+if (!firstFiles.length) {
+  console.error('embeddings cache identity test failed: missing cache files');
+  process.exit(1);
+}
+
+const firstCache = JSON.parse(
+  await fsPromises.readFile(path.join(cacheDir, firstFiles[0]), 'utf8')
+);
+const meta = firstCache?.cacheMeta?.identity;
+if (!meta) {
+  console.error('embeddings cache identity test failed: missing cache metadata');
+  process.exit(1);
+}
+if (meta.dims !== 8 || meta.scale !== 2 / 255 || meta.stub !== true) {
+  console.error('embeddings cache identity test failed: cache identity did not include expected dims/scale/stub');
+  process.exit(1);
+}
+if (!meta.modelId || typeof meta.modelId !== 'string') {
+  console.error('embeddings cache identity test failed: cache identity missing modelId');
+  process.exit(1);
+}
+if (!meta.provider || typeof meta.provider !== 'string') {
+  console.error('embeddings cache identity test failed: cache identity missing provider');
+  process.exit(1);
+}
+
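+// Rebuilding with different dims must mint new cache entries instead of reusing old ones.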
+runEmbeddings(12);
+const secondFiles = (await fsPromises.readdir(cacheDir))
+  .filter((name) => name.endsWith('.json'));
+const firstSet = new Set(firstFiles);
+const hasNew = secondFiles.some((name) => !firstSet.has(name));
+if (!hasNew) {
+  console.error('embeddings cache identity test failed: expected new cache entries after dims change');
+  process.exit(1);
+}
+
+console.log('embeddings cache identity tests passed');
diff --git a/tests/embeddings-cache-invalidation.js b/tests/embeddings-cache-invalidation.js
new file mode 100644
index 000000000..81aea820b
--- /dev/null
+++ b/tests/embeddings-cache-invalidation.js
@@ -0,0 +1,100 @@
+#!/usr/bin/env node
+import assert from 'node:assert/strict';
+import { buildCacheIdentity, buildCacheKey, isCacheValid } from '../tools/build-embeddings/cache.js';
+
+const base = buildCacheIdentity({
+  modelId: 'model-a',
+  provider: 'provider-a',
+  mode: 'inline',
+  stub: false,
+  dims: 384,
+  scale: 0.5
+});
+const dimsChanged = buildCacheIdentity({
+  modelId: 'model-a',
+  provider: 'provider-a',
+  mode: 'inline',
+  stub: false,
+  dims: 768,
+  scale: 0.5
+});
+const modelChanged = buildCacheIdentity({
+  modelId: 'model-b',
+  provider: 'provider-a',
+  mode: 'inline',
+  stub: false,
+  dims: 384,
+  scale: 0.5
+});
+const providerChanged = buildCacheIdentity({
+  modelId: 'model-a',
+  provider: 'provider-b',
+  mode: 'inline',
+  stub: false,
+  dims: 384,
+  scale: 0.5
+});
+
+assert.notEqual(base.key, dimsChanged.key, 'expected cache identity to change with dims');
+assert.notEqual(base.key, modelChanged.key, 'expected cache identity to change with model');
+assert.notEqual(base.key, providerChanged.key, 'expected cache identity to change with provider');
+
+// Provider-specific knobs should participate in cache invalidation.
+const onnxBase = buildCacheIdentity({
+  modelId: 'model-a',
+  provider: 'onnx',
+  mode: 'inline',
+  stub: false,
+  dims: 384,
+  scale: 0.5,
+  onnx: {
+    modelPath: '/models/model-a.onnx',
+    tokenizerId: 'model-a',
+    executionProviders: ['cpu'],
+    intraOpNumThreads: 1,
+    interOpNumThreads: 1,
+    graphOptimizationLevel: 'all'
+  }
+});
+const onnxModelPathChanged = buildCacheIdentity({
+  modelId: 'model-a',
+  provider: 'onnx',
+  mode: 'inline',
+  stub: false,
+  dims: 384,
+  scale: 0.5,
+  onnx: {
+    modelPath: '/models/other.onnx',
+    tokenizerId: 'model-a',
+    executionProviders: ['cpu'],
+    intraOpNumThreads: 1,
+    interOpNumThreads: 1,
+    graphOptimizationLevel: 'all'
+  }
+});
+assert.notEqual(onnxBase.key, onnxModelPathChanged.key, 'expected cache identity to change with onnx modelPath');
+
+const signature = 'sig-1';
+const cached = {
+  chunkSignature: signature,
+  cacheMeta: { identityKey: base.key }
+};
+assert.equal(isCacheValid({ cached, signature, identityKey: base.key }), true, 'expected cache to be valid for matching identity');
+assert.equal(isCacheValid({ cached, signature, identityKey: dimsChanged.key }), false, 'expected cache to be invalid for mismatched identity');
+
+const cacheKey = buildCacheKey({
+  file: 'src/index.js',
+  hash: 'hash-1',
+  signature,
+  identityKey: base.key
+});
+assert.ok(cacheKey, 'expected cache key for hashed file');
+const cacheKeyMismatch = buildCacheKey({
+  file: 'src/index.js',
+  hash: 'hash-1',
+  signature,
+  identityKey: dimsChanged.key
+});
+assert.notEqual(cacheKey, cacheKeyMismatch, 'expected cache key to change with identity');
+
+console.log('embeddings cache invalidation test passed');
diff --git a/tests/embeddings-dims-mismatch.js b/tests/embeddings-dims-mismatch.js
new file mode 100644
index 000000000..dae690f7a
--- /dev/null
+++ b/tests/embeddings-dims-mismatch.js
@@ -0,0 +1,86 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { getRepoCacheRoot, loadUserConfig } from '../tools/dict-utils.js';
+
+const root = process.cwd();
+const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample');
+const tempRoot = path.join(root, 'tests', '.cache', 'embeddings-dims-mismatch');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(tempRoot, { recursive: true });
+await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true });
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot;
+process.env.PAIROFCLEATS_EMBEDDINGS = 'stub';
+
+const buildIndex = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot],
+  { cwd: repoRoot, env, stdio: 'inherit' }
+);
+if (buildIndex.status !== 0) {
+  console.error('embeddings dims mismatch test failed: build_index failed');
+  process.exit(buildIndex.status ?? 1);
+}
+
+const runEmbeddings = () => spawnSync(
+  process.execPath,
+  [
+    path.join(root, 'tools', 'build-embeddings.js'),
+    '--stub-embeddings',
+    '--mode',
+    'code',
+    '--dims',
+    '8',
+    '--repo',
+    repoRoot
+  ],
+  { cwd: repoRoot, env, encoding: 'utf8' }
+);
+
+const firstRun = runEmbeddings();
+if (firstRun.status !== 0) {
+  console.error('embeddings dims mismatch test failed: initial build-embeddings failed');
+  process.exit(firstRun.status ?? 1);
+}
+
+const userConfig = loadUserConfig(repoRoot);
+const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig);
+const cacheDir = path.join(repoCacheRoot, 'embeddings', 'code', 'files');
+const cacheFiles = (await fsPromises.readdir(cacheDir)).filter((name) => name.endsWith('.json'));
+if (!cacheFiles.length) {
+  console.error('embeddings dims mismatch test failed: no cache files found');
+  process.exit(1);
+}
+
+const targetPath = path.join(cacheDir, cacheFiles[0]);
+const cached = JSON.parse(await fsPromises.readFile(targetPath, 'utf8'));
+const bumpVector = (vec) => {
+  if (Array.isArray(vec)) vec.push(0);
+};
+bumpVector(cached?.mergedVectors?.[0]);
+bumpVector(cached?.codeVectors?.[0]);
+bumpVector(cached?.docVectors?.[0]);
+await fsPromises.writeFile(targetPath, JSON.stringify(cached));
+
+const secondRun = runEmbeddings();
+if (secondRun.status === 0) {
+  console.error('embeddings dims mismatch test failed: expected dims mismatch error');
+  process.exit(1);
+}
+const output = `${secondRun.stdout || ''}${secondRun.stderr || ''}`;
+if (!output.includes('embedding dims mismatch')) {
+  console.error('embeddings dims mismatch test failed: missing mismatch error message');
+  process.exit(1);
+}
+
+console.log('embeddings dims mismatch tests passed');
diff --git a/tests/embeddings-dims-validation.js b/tests/embeddings-dims-validation.js
new file mode 100644
index 000000000..5cc270e4f
--- /dev/null
+++ b/tests/embeddings-dims-validation.js
@@ -0,0 +1,21 @@
+#!/usr/bin/env node
+import assert from 'node:assert/strict';
+import { createDimsValidator, isDimsMismatch, validateCachedDims } from '../tools/build-embeddings/embed.js';
+
+const validator = createDimsValidator({ mode: 'code', configuredDims: 4 });
+validator.assertDims(4);
+assert.throws(() => validator.assertDims(5), /embedding dims mismatch/, 'expected configured dims mismatch to throw');
+
+const cachedOk = [[0, 1, 2, 3], [4, 5, 6, 7]];
+validateCachedDims({ vectors: cachedOk, expectedDims: 4, mode: 'code' });
+
+assert.throws(
+  () => validateCachedDims({ vectors: [[0, 1, 2]], expectedDims: 4, mode: 'code' }),
+  /embedding dims mismatch/,
+  'expected cached dims mismatch to throw'
+);
+
+const mismatchError = new Error('[embeddings] code embedding dims mismatch (configured=4, observed=5).');
+assert.equal(isDimsMismatch(mismatchError), true, 'expected dims mismatch error to be detected');
+
+console.log('embeddings dims validation test passed');
diff --git a/tests/embeddings-sqlite-dense.js b/tests/embeddings-sqlite-dense.js
new file mode 100644
index 000000000..fa55be38b
--- /dev/null
+++ b/tests/embeddings-sqlite-dense.js
@@ -0,0 +1,88 @@
+#!/usr/bin/env node
+import assert from 'node:assert/strict';
+import fsPromises from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+import { updateSqliteDense } from '../tools/build-embeddings/sqlite-dense.js';
+
+let Database;
+try {
+  ({ default: Database } = await import('better-sqlite3'));
+} catch (err) {
+  console.error('better-sqlite3 is required for embeddings sqlite dense test.');
+  process.exit(1);
+}
+
+const tempRoot = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'pairofcleats-embeddings-sqlite-'));
+const dbPath = path.join(tempRoot, 'index-code.db');
+const dbMissingPath = path.join(tempRoot, 'index-missing.db');
+
+const vectors = [
+  [1, 2, 3],
+  [4, 5, 6]
+];
+
+const createDbWithTables = (target) => {
+  const db = new Database(target);
+  db.exec('CREATE TABLE dense_vectors (mode TEXT, doc_id INTEGER, vector BLOB)');
+  db.exec('CREATE TABLE dense_meta (mode TEXT, dims INTEGER, scale REAL, model TEXT)');
+  db.close();
+};
+
+createDbWithTables(dbPath);
+new Database(dbMissingPath).close();
+
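+// Exercise three paths: sqlite disabled, dense tables missing, and a successful write.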
+const disabledResult = updateSqliteDense({
+  Database,
+  root: tempRoot,
+  userConfig: { sqlite: { use: false } },
+  mode: 'code',
+  vectors,
+  dims: 3,
+  scale: 1,
+  modelId: 'model-a',
+  dbPath,
+  emitOutput: false
+});
+assert.equal(disabledResult.skipped, true, 'expected sqlite update to skip when disabled');
+
+const missingResult = updateSqliteDense({
+  Database,
+  root: tempRoot,
+  userConfig: { sqlite: { use: true } },
+  mode: 'code',
+  vectors,
+  dims: 3,
+  scale: 1,
+  modelId: 'model-a',
+  dbPath: dbMissingPath,
+  emitOutput: false
+});
+assert.equal(missingResult.skipped, true, 'expected sqlite update to skip when tables missing');
+assert.equal(missingResult.reason, 'missing dense tables', 'expected missing dense tables reason');
+
+const enabledResult = updateSqliteDense({
+  Database,
+  root: tempRoot,
+  userConfig: { sqlite: { use: true } },
+  mode: 'code',
+  vectors,
+  dims: 3,
+  scale: 1,
+  modelId: 'model-a',
+  dbPath,
+  emitOutput: false
+});
+assert.equal(enabledResult.skipped, false, 'expected sqlite update to run when enabled');
+
+const db = new Database(dbPath, { readonly: true });
+const denseCount = db.prepare('SELECT COUNT(*) AS total FROM dense_vectors').get().total;
+const metaCount = db.prepare('SELECT COUNT(*) AS total FROM dense_meta').get().total;
+const modeCount = db.prepare('SELECT COUNT(*) AS total FROM dense_vectors WHERE mode = ?').get('code').total;
+db.close();
+assert.equal(denseCount, vectors.length, 'expected dense vectors to be written');
+assert.equal(metaCount, 1, 'expected dense metadata to be written');
+assert.equal(modeCount, vectors.length, 'expected mode-specific dense vectors');
+
+console.log('embeddings sqlite dense test passed');
diff --git a/tests/embeddings-validate.js b/tests/embeddings-validate.js
new file mode 100644
index 000000000..2262e3462
--- /dev/null
+++ b/tests/embeddings-validate.js
@@ -0,0 +1,81 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js';
+
+const root = process.cwd();
+const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample');
+const cacheRoot = path.join(root, 'tests', '.cache', 'embeddings-validate');
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+await fsPromises.rm(cacheRoot, { recursive: true, force: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+const buildPath = path.join(root, 'build_index.js');
+const embeddingsPath = path.join(root, 'tools', 'build-embeddings.js');
+const validatePath = path.join(root, 'tools', 'index-validate.js');
+
+const run = (args, label) => {
+  const result = spawnSync(process.execPath, args, { env, encoding: 'utf8' });
+  if (result.status !== 0) {
+    console.error(`Failed: ${label}`);
+    if (result.stderr) console.error(result.stderr.trim());
+    process.exit(result.status ?? 1);
+  }
+  return result.stdout || '';
+};
+
+run([buildPath, '--stub-embeddings', '--repo', fixtureRoot], 'build index');
+run([embeddingsPath, '--stub-embeddings', '--repo', fixtureRoot], 'build embeddings');
+
+const validateResult = spawnSync(
+  process.execPath,
+  [validatePath, '--repo', fixtureRoot, '--json'],
+  { env, encoding: 'utf8' }
+);
+if (validateResult.status !== 0) {
+  console.error('Expected index-validate to pass after build-embeddings.');
+  if (validateResult.stderr) console.error(validateResult.stderr.trim());
+  process.exit(validateResult.status ?? 1);
+}
+let payload;
+try {
+  payload = JSON.parse(validateResult.stdout);
+} catch {
+  console.error('index-validate did not return valid JSON.');
+  process.exit(1);
+}
+if (!payload || payload.ok !== true) {
+  console.error('index-validate JSON payload missing ok=true.');
+  process.exit(1);
+}
+
+const previousCacheRoot = process.env.PAIROFCLEATS_CACHE_ROOT;
+process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot;
+const userConfig = loadUserConfig(fixtureRoot);
+const codeDir = getIndexDir(fixtureRoot, 'code', userConfig);
+if (previousCacheRoot === undefined) {
+  delete process.env.PAIROFCLEATS_CACHE_ROOT;
+} else {
+  process.env.PAIROFCLEATS_CACHE_ROOT = previousCacheRoot;
+}
+const statePath = path.join(codeDir, 'index_state.json');
+let state;
+try {
+  state = JSON.parse(await fsPromises.readFile(statePath, 'utf8'));
+} catch {
+  console.error('Failed to read index_state.json after build-embeddings.');
+  process.exit(1);
+}
+const embeddings = state?.embeddings || {};
+if (embeddings.enabled !== true || embeddings.ready !== true || embeddings.pending === true) {
+  console.error('index_state embeddings flags not marked ready after build-embeddings.');
+  process.exit(1);
+}
+
+console.log('Stage3 embeddings validation test passed');
diff --git a/tests/encoding-fallback.js b/tests/encoding-fallback.js
new file mode 100644
index 000000000..d2b1280a3
--- /dev/null
+++ b/tests/encoding-fallback.js
@@ -0,0 +1,65 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { readTextFile } from '../src/shared/encoding.js';
+
+const root = process.cwd();
+const fixtureRoot = path.join(root, 'tests', 'fixtures', 'encoding');
+const cacheRoot = path.join(root, 'tests', '.cache', 'encoding-fallback');
+const sourcePath = path.join(fixtureRoot, 'latin1.js');
+
+await fsPromises.rm(cacheRoot, { recursive: true, force: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+const { text, usedFallback } = await readTextFile(sourcePath);
+if (!text.includes('café')) {
+  console.error('Encoding fallback did not decode latin1.js correctly.');
+  process.exit(1);
+}
+if (!usedFallback) {
+  console.error('Expected encoding fallback to be used for latin1.js.');
+  process.exit(1);
+}
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub',
+  PAIROFCLEATS_WORKER_POOL: 'off'
+};
+
+const buildResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot],
+  { cwd: fixtureRoot, env, stdio: 'inherit' }
+);
+if (buildResult.status !== 0) {
+  console.error('Failed: build_index');
+  process.exit(buildResult.status ?? 1);
+}
+
+const searchResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'search.js'), '--json', '--repo', fixtureRoot, 'café'],
+  { cwd: fixtureRoot, env, encoding: 'utf8' }
+);
+if (searchResult.status !== 0) {
+  console.error('Failed: search');
+  process.exit(searchResult.status ?? 1);
+}
+let payload = null;
+try {
+  payload = JSON.parse(searchResult.stdout || '{}');
+} catch {
+  console.error('Search output is not valid JSON.');
+  process.exit(1);
+}
+const hits = Array.isArray(payload?.code) ? payload.code : [];
+const hit = hits.find((entry) => typeof entry?.file === 'string' && entry.file.endsWith('latin1.js'));
+if (!hit) {
+  console.error('Expected search hit for latin1.js in encoding fixture.');
+  process.exit(1);
+}
+
+console.log('encoding fallback test passed');
diff --git a/tests/encoding-hash.js b/tests/encoding-hash.js
new file mode 100644
index 000000000..b04a2e80b
--- /dev/null
+++ b/tests/encoding-hash.js
@@ -0,0 +1,29 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { readTextFileWithHash } from '../src/shared/encoding.js';
+import { sha1 } from '../src/shared/hash.js';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'encoding-hash');
+const filePath = path.join(tempRoot, 'latin1.txt');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(tempRoot, { recursive: true });
+
+const buffer = Buffer.from([0xff, 0xfe, 0xfd, 0x41]);
+await fsPromises.writeFile(filePath, buffer);
+
+const info = await readTextFileWithHash(filePath);
+const expectedHash = sha1(buffer);
+
+if (info.hash !== expectedHash) {
+  console.error('encoding hash test failed: hash did not match raw bytes.');
+  process.exit(1);
+}
+if (!info.usedFallback) {
+  console.error('encoding hash test failed: expected fallback decoding for invalid UTF-8.');
+  process.exit(1);
+}
+
+console.log('encoding hash tests passed');
diff --git a/tests/eval-quality.js b/tests/eval-quality.js
new file mode 100644
index 000000000..1a51197d4
--- /dev/null
+++ b/tests/eval-quality.js
@@ -0,0 +1,81 @@
+#!/usr/bin/env node
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'eval-quality');
+const cacheRoot = path.join(tempRoot, 'cache');
+const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample');
+const datasetPath = path.join(fixtureRoot, 'eval.json');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const buildResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot],
+  { env, stdio: 'inherit' }
+);
+if (buildResult.status !== 0) {
+  console.error('eval quality test failed: build_index failed');
+  process.exit(buildResult.status ?? 1);
+}
+
+const evalResult = spawnSync(
+  process.execPath,
+  [
+    path.join(root, 'tools', 'eval', 'run.js'),
+    '--repo',
+    fixtureRoot,
+    '--dataset',
+    datasetPath,
+    '--backend',
+    'memory',
+    '--no-ann',
+    '--top',
+    '5'
+  ],
+  { env, encoding: 'utf8' }
+);
+
+if (evalResult.status !== 0) {
+  console.error('eval quality test failed: eval run returned error');
+  if (evalResult.stderr) console.error(evalResult.stderr.trim());
+  process.exit(evalResult.status ?? 1);
+}
+
+let payload = null;
+try {
+  payload = JSON.parse(evalResult.stdout || '{}');
+} catch (err) {
+  console.error('eval quality test failed: invalid JSON output');
+  process.exit(1);
+}
+
+const summary = payload?.summary || {};
+const recallAt5 = summary?.recallAtK?.['5'] ?? 0;
+const ndcgAt5 = summary?.ndcgAtK?.['5'] ?? 0;
+const mrr = summary?.mrr ?? 0;
+
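+// Hard floors for retrieval quality on the fixture dataset; dipping below any of them fails the run.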
+if (recallAt5 < 0.6) {
+  console.error(`eval quality test failed: recall@5 too low (${recallAt5.toFixed(3)})`);
+  process.exit(1);
+}
+if (ndcgAt5 < 0.6) {
+  console.error(`eval quality test failed: ndcg@5 too low (${ndcgAt5.toFixed(3)})`);
+  process.exit(1);
+}
+if (mrr < 0.5) {
+  console.error(`eval quality test failed: mrr too low (${mrr.toFixed(3)})`);
+  process.exit(1);
+}
+
+console.log('eval quality tests passed');
diff --git a/tests/ext-filter.js b/tests/ext-filter.js
new file mode 100644
index 000000000..7eb0eccd4
--- /dev/null
+++ b/tests/ext-filter.js
@@ -0,0 +1,17 @@
+#!/usr/bin/env node
+import { normalizeExtFilter } from '../src/retrieval/filters.js';
+
+const result = normalizeExtFilter(['*.js', 'JS', '.Md']);
+const expected = ['.js', '.md'];
+
+const sorted = (result || []).slice().sort();
+const expectedSorted = expected.slice().sort();
+
+const sameLength = sorted.length === expectedSorted.length;
+const sameValues = sorted.every((value, idx) => value === expectedSorted[idx]);
+if (!sameLength || !sameValues) {
+  console.error(`normalizeExtFilter failed: expected ${expectedSorted.join(', ')}, got ${sorted.join(', ')}`);
+  process.exit(1);
+}
+
+console.log('ext filter test passed');
diff --git a/tests/external-docs.js b/tests/external-docs.js
new file mode 100644
index 000000000..a913d30f3
--- /dev/null
+++ b/tests/external-docs.js
@@ -0,0 +1,72 @@
+#!/usr/bin/env node
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { spawnSync } from 'node:child_process';
+import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js';
+
+const root = process.cwd();
+const tempRoot = path.join(root, 'tests', '.cache', 'external-docs');
+const repoRoot = path.join(tempRoot, 'repo');
+const cacheRoot = path.join(tempRoot, 'cache');
+
+await fsPromises.rm(tempRoot, { recursive: true, force: true });
+await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true });
+await fsPromises.mkdir(cacheRoot, { recursive: true });
+
+await fsPromises.writeFile(
+  path.join(repoRoot, 'src', 'index.js'),
+  [
+    "import foo from '@scope/pkg';",
+    "import bar from 'left-pad';",
+    "console.log(foo, bar);"
+  ].join('\n') + '\n'
+);
+
+process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot;
+process.env.PAIROFCLEATS_EMBEDDINGS = 'stub';
+
+const env = {
+  ...process.env,
+  PAIROFCLEATS_CACHE_ROOT: cacheRoot,
+  PAIROFCLEATS_EMBEDDINGS: 'stub'
+};
+
+const buildResult = spawnSync(
+  process.execPath,
+  [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot],
+  { cwd: repoRoot, env, stdio: 'inherit' }
+);
+if (buildResult.status !== 0) {
+  console.error('external docs test failed: build_index failed');
+ process.exit(buildResult.status ?? 1); +} + +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const fileMetaPath = path.join(codeDir, 'file_meta.json'); +if (!fs.existsSync(fileMetaPath)) { + console.error(`Missing file metadata: ${fileMetaPath}`); + process.exit(1); +} + +const files = JSON.parse(fs.readFileSync(fileMetaPath, 'utf8')); +const expectedScoped = 'https://www.npmjs.com/package/@scope/pkg'; +const expectedUnscoped = 'https://www.npmjs.com/package/left-pad'; +const encodedScoped = 'https://www.npmjs.com/package/%40scope/pkg'; + +const allDocs = files.flatMap((file) => file.externalDocs || []); +if (!allDocs.includes(expectedScoped)) { + console.error(`Missing scoped npm doc link: ${expectedScoped}`); + process.exit(1); +} +if (allDocs.includes(encodedScoped)) { + console.error(`Scoped npm doc link should preserve @: ${encodedScoped}`); + process.exit(1); +} +if (!allDocs.includes(expectedUnscoped)) { + console.error(`Missing npm doc link: ${expectedUnscoped}`); + process.exit(1); +} + +console.log('External docs test passed'); diff --git a/tests/extracted-prose.js b/tests/extracted-prose.js new file mode 100644 index 000000000..b29f5b2f1 --- /dev/null +++ b/tests/extracted-prose.js @@ -0,0 +1,76 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'extracted-prose'); +const repoRoot = path.join(tempRoot, 'repo'); +const srcDir = path.join(repoRoot, 'src'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(srcDir, { recursive: true }); + +const commentText = 'extracted-prose sentinel phrase'; +const source = [ + '/**', + ` * ${commentText}`, + ' */', + 'export function sample() { return 1; }', + '' +].join('\n'); +await fsPromises.writeFile(path.join(srcDir, 'sample.js'), source); + +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ sqlite: { use: false } }, null, 2) +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: path.join(tempRoot, 'cache'), + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = env.PAIROFCLEATS_CACHE_ROOT; +process.env.PAIROFCLEATS_EMBEDDINGS = env.PAIROFCLEATS_EMBEDDINGS; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--repo', repoRoot, '--mode', 'extracted-prose', '--stub-embeddings'], + { env, encoding: 'utf8' } +); +if (buildResult.status !== 0) { + console.error('Extracted-prose test failed: build_index error.'); + if (buildResult.stderr) console.error(buildResult.stderr.trim()); + process.exit(buildResult.status ?? 1); +} + +const searchResult = spawnSync( + process.execPath, + [path.join(root, 'search.js'), '--repo', repoRoot, '--mode', 'extracted-prose', '--json', commentText], + { env, encoding: 'utf8' } +); +if (searchResult.status !== 0) { + console.error('Extracted-prose test failed: search error.'); + if (searchResult.stderr) console.error(searchResult.stderr.trim()); + process.exit(searchResult.status ?? 1); +} + +let payload; +try { + payload = JSON.parse(searchResult.stdout || '{}'); +} catch (err) { + console.error('Extracted-prose test failed: invalid JSON output.'); + if (searchResult.stdout) console.error(searchResult.stdout.trim()); + process.exit(1); +} + +const hits = Array.isArray(payload.extractedProse) ? 
payload.extractedProse : []; +const matched = hits.some((hit) => hit?.file === 'src/sample.js'); +if (!matched) { + console.error('Extracted-prose test failed: expected hit missing.'); + process.exit(1); +} + +console.log('Extracted-prose test passed.'); diff --git a/tests/fielded-bm25.js b/tests/fielded-bm25.js new file mode 100644 index 000000000..ca0d5a9b0 --- /dev/null +++ b/tests/fielded-bm25.js @@ -0,0 +1,87 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'fielded-bm25'); +const cacheRoot = path.join(tempRoot, 'cache'); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot], + { env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('fielded bm25 test failed: build_index failed'); + process.exit(buildResult.status ?? 1); +} + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(fixtureRoot); +const fieldPostings = path.join( + getIndexDir(fixtureRoot, 'code', userConfig), + 'field_postings.json' +); + +if (!fs.existsSync(fieldPostings)) { + console.error('fielded bm25 test failed: field_postings.json missing'); + process.exit(1); +} + +const result = spawnSync( + process.execPath, + [ + path.join(root, 'search.js'), + 'greet', + '--mode', + 'code', + '--no-ann', + '--json', + '--repo', + fixtureRoot + ], + { env, encoding: 'utf8' } +); + +if (result.status !== 0) { + console.error('fielded bm25 test failed: search returned error'); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
1); +} + +let payload = null; +try { + payload = JSON.parse(result.stdout || '{}'); +} catch (err) { + console.error('fielded bm25 test failed: invalid JSON output'); + process.exit(1); +} + +const hit = payload?.code?.[0]; +if (!hit) { + console.error('fielded bm25 test failed: no hits'); + process.exit(1); +} +if (hit.scoreType !== 'bm25-fielded') { + console.error(`fielded bm25 test failed: expected bm25-fielded, got ${hit.scoreType}`); + process.exit(1); +} +if (hit.scoreBreakdown?.sparse?.fielded !== true) { + console.error('fielded bm25 test failed: sparse.fielded not true'); + process.exit(1); +} + +console.log('fielded bm25 tests passed'); diff --git a/tests/file-line-guard.js b/tests/file-line-guard.js new file mode 100644 index 000000000..6dfdc7693 --- /dev/null +++ b/tests/file-line-guard.js @@ -0,0 +1,83 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, getMetricsDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'file-line-guard'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const configPath = path.join(repoRoot, '.pairofcleats.json'); +await fsPromises.writeFile( + configPath, + JSON.stringify({ + indexing: { + fileCaps: { default: { maxLines: 2 } }, + fileListSampleSize: 20, + treeSitter: { enabled: false } + } + }, null, 2) +); + +const largePath = path.join(repoRoot, 'too_many_lines.js'); +const smallPath = path.join(repoRoot, 'ok.js'); +await fsPromises.writeFile(largePath, 'line1\nline2\nline3\nline4\n'); +await fsPromises.writeFile(smallPath, 'function ok() { return 1; }\n'); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('Failed: build_index'); + process.exit(buildResult.status ?? 
1); +} + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const fileListsPath = path.join(codeDir, '.filelists.json'); +if (!fs.existsSync(fileListsPath)) { + console.error('Missing .filelists.json'); + process.exit(1); +} +const fileLists = JSON.parse(await fsPromises.readFile(fileListsPath, 'utf8')); +const skippedSample = fileLists?.skipped?.sample; +if (!Array.isArray(skippedSample)) { + console.error('Skipped sample payload is not an array'); + process.exit(1); +} +const oversize = skippedSample.find((entry) => entry?.file && entry.file.endsWith('too_many_lines.js')); +if (!oversize || oversize.reason !== 'oversize') { + console.error('Expected oversize skip entry for too_many_lines.js'); + process.exit(1); +} + +const metricsDir = getMetricsDir(repoRoot, userConfig); +const metricsPath = path.join(metricsDir, 'index-code.json'); +if (!fs.existsSync(metricsPath)) { + console.error('Missing index-code metrics'); + process.exit(1); +} +const metrics = JSON.parse(await fsPromises.readFile(metricsPath, 'utf8')); +const oversizeCount = metrics?.files?.skippedByReason?.oversize || 0; +if (oversizeCount < 1) { + console.error('Expected skippedByReason.oversize to be >= 1'); + process.exit(1); +} + +console.log('File line guard test passed'); diff --git a/tests/file-processor/cached-bundle.test.js b/tests/file-processor/cached-bundle.test.js new file mode 100644 index 000000000..1b65f35f3 --- /dev/null +++ b/tests/file-processor/cached-bundle.test.js @@ -0,0 +1,94 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { reuseCachedBundle } from '../../src/index/build/file-processor/cached-bundle.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'file-processor-cached'); +const repoRoot = path.join(tempRoot, 'repo'); +await fs.rm(tempRoot, { recursive: true, force: true }); +await fs.mkdir(repoRoot, { recursive: true }); + +const targetPath = path.join(repoRoot, 'cached.js'); +await fs.writeFile(targetPath, 'export const demo = 1;\n'); +const stat = await fs.stat(targetPath); + +const cachedBundle = { + chunks: [ + { + file: 'cached.js', + ext: '.js', + start: 0, + end: 10, + startLine: 1, + endLine: 1, + kind: 'code', + name: 'demo', + lang: 'javascript', + codeRelations: { + imports: ['dep'], + exports: ['demo'], + calls: ['demo'] + }, + docmeta: { signature: 'demo()' }, + tokens: ['demo'], + seq: ['demo'], + ngrams: [], + chargrams: [] + } + ], + fileRelations: null +}; + +const { result, skip } = reuseCachedBundle({ + abs: targetPath, + relKey: 'cached.js', + fileIndex: 0, + fileStat: stat, + fileHash: 'hash', + ext: '.js', + fileCaps: {}, + cachedBundle, + incrementalState: { + manifest: { + files: { + 'cached.js': { bundle: 'cached.json', hash: 'hash' } + } + } + }, + allImports: { + dep: [{ source: 'cached.js', target: 'dep.js' }] + }, + fileStructural: null, + toolInfo: null, + fileStart: Date.now(), + knownLines: 1, + fileLanguageId: null +}); + +if (skip) { + fail('Expected cached bundle to be reused without skip.'); +} +if (!result) { + fail('Expected cached bundle reuse result.'); +} +if (!result.fileRelations?.importLinks?.length) { + fail('Expected importLinks to be rehydrated from allImports.'); +} +const chunk = result.chunks[0]; +if (!chunk?.metaV2?.chunkId) { + fail('Expected cached chunk to have metaV2 
chunkId.'); +} +if (!Array.isArray(chunk?.codeRelations?.calls)) { + fail('Expected cached chunk to preserve non-file relation fields.'); +} +if (!result.fileMetrics?.cached) { + fail('Expected cached file metrics to set cached=true.'); +} + +console.log('file processor cached bundle tests passed'); diff --git a/tests/file-processor/skip.test.js b/tests/file-processor/skip.test.js new file mode 100644 index 000000000..024ca75fa --- /dev/null +++ b/tests/file-processor/skip.test.js @@ -0,0 +1,61 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { createFileScanner } from '../../src/index/build/file-scan.js'; +import { resolveBinarySkip, resolvePreReadSkip } from '../../src/index/build/file-processor/skip.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'file-processor-skip'); +await fs.rm(tempRoot, { recursive: true, force: true }); +await fs.mkdir(tempRoot, { recursive: true }); + +const fileScanner = createFileScanner(); +const runIo = (fn) => fn(); + +const minifiedPath = path.join(tempRoot, 'app.min.js'); +await fs.writeFile(minifiedPath, 'const x=1;'); +const minifiedStat = await fs.stat(minifiedPath); +const minifiedSkip = await resolvePreReadSkip({ + abs: minifiedPath, + fileEntry: { lines: 1, scan: { checkedBinary: true, checkedMinified: true } }, + fileStat: minifiedStat, + ext: '.js', + fileCaps: {}, + fileScanner, + runIo +}); +if (!minifiedSkip || minifiedSkip.reason !== 'minified') { + fail('Expected minified filename to skip with reason=minified.'); +} + +const cappedPath = path.join(tempRoot, 'big.txt'); +await fs.writeFile(cappedPath, 'abcdef'); +const cappedStat = await fs.stat(cappedPath); +const cappedSkip = await resolvePreReadSkip({ + abs: cappedPath, + fileEntry: { lines: 1, scan: { checkedBinary: true, checkedMinified: true } }, + fileStat: cappedStat, + ext: '.txt', + fileCaps: { default: { maxBytes: 1 } }, + fileScanner, + runIo +}); +if (!cappedSkip || cappedSkip.reason !== 'oversize' || cappedSkip.maxBytes !== 1) { + fail('Expected maxBytes to skip with reason=oversize and maxBytes.'); +} + +const binarySkip = await resolveBinarySkip({ + abs: minifiedPath, + fileBuffer: Buffer.from([0, 0, 0, 0, 0]), + fileScanner +}); +if (!binarySkip || binarySkip.reason !== 'binary') { + fail('Expected binary buffer to skip with reason=binary.'); +} + +console.log('file processor skip tests passed'); diff --git a/tests/file-size-guard.js b/tests/file-size-guard.js index 38c52d507..2ffde4722 100644 --- a/tests/file-size-guard.js +++ b/tests/file-size-guard.js @@ -17,7 +17,7 @@ await fsPromises.mkdir(cacheRoot, { recursive: true }); const configPath = path.join(repoRoot, '.pairofcleats.json'); await fsPromises.writeFile( configPath, - JSON.stringify({ indexing: { maxFileBytes: 120 } }, null, 2) + JSON.stringify({ indexing: { maxFileBytes: 120, fileListSampleSize: 10 } }, null, 2) ); const largePath = path.join(repoRoot, 'big.js'); @@ -44,17 +44,18 @@ if (buildResult.status !== 0) { process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; const userConfig = loadUserConfig(repoRoot); const codeDir = getIndexDir(repoRoot, 'code', userConfig); -const skippedPath = path.join(codeDir, '.skippedfiles.json'); -if (!fs.existsSync(skippedPath)) { - console.error('Missing .skippedfiles.json'); +const fileListsPath = path.join(codeDir, '.filelists.json'); +if (!fs.existsSync(fileListsPath)) { + console.error('Missing 
.filelists.json'); process.exit(1); } -const skipped = JSON.parse(await fsPromises.readFile(skippedPath, 'utf8')); -if (!Array.isArray(skipped)) { - console.error('Skipped files payload is not an array'); +const fileLists = JSON.parse(await fsPromises.readFile(fileListsPath, 'utf8')); +const skippedSample = fileLists?.skipped?.sample; +if (!Array.isArray(skippedSample)) { + console.error('Skipped sample payload is not an array'); process.exit(1); } -const oversize = skipped.find((entry) => entry?.file && entry.file.endsWith('big.js')); +const oversize = skippedSample.find((entry) => entry?.file && entry.file.endsWith('big.js')); if (!oversize || oversize.reason !== 'oversize') { console.error('Expected oversize skip entry for big.js'); process.exit(1); diff --git a/tests/filter-index-artifact.js b/tests/filter-index-artifact.js new file mode 100644 index 000000000..18beae329 --- /dev/null +++ b/tests/filter-index-artifact.js @@ -0,0 +1,44 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; +import { readJsonFile } from '../src/shared/artifact-io.js'; +import { loadIndex } from '../src/retrieval/cli-index.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'filter-index-artifact'); +const repoRoot = path.join(tempRoot, 'repo'); +const srcDir = path.join(repoRoot, 'src'); +const configPath = path.join(repoRoot, '.pairofcleats.json'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(srcDir, { recursive: true }); +await fsPromises.writeFile(path.join(srcDir, 'example.js'), 'const a = 1;\n', 'utf8'); +await fsPromises.writeFile( + configPath, + JSON.stringify({ search: { filePrefilter: { chargramN: 4 } } }, null, 2) +); + +const buildResult = spawnSync(process.execPath, [ + path.join(root, 'build_index.js'), + '--stub-embeddings', + '--repo', + repoRoot +], { encoding: 'utf8' }); +if (buildResult.status !== 0) { + console.error(buildResult.stderr || buildResult.stdout || 'build_index failed'); + process.exit(buildResult.status ?? 
1); +} + +const userConfig = loadUserConfig(repoRoot); +const indexDir = getIndexDir(repoRoot, 'code', userConfig); +const filterIndexPath = path.join(indexDir, 'filter_index.json'); +const raw = readJsonFile(filterIndexPath); +assert.equal(raw.fileChargramN, 4, 'expected filter_index.json fileChargramN to match config'); + +const idx = loadIndex(indexDir, { modelIdDefault: 'test', fileChargramN: 2 }); +assert.equal(idx.filterIndex?.fileChargramN, 4, 'expected hydrated filter index to use persisted fileChargramN'); + +console.log('filter index artifact test passed'); diff --git a/tests/filter-index.js b/tests/filter-index.js new file mode 100644 index 000000000..fbfd74b53 --- /dev/null +++ b/tests/filter-index.js @@ -0,0 +1,49 @@ +#!/usr/bin/env node +import { buildFilterIndex } from '../src/retrieval/filter-index.js'; +import { filterChunks } from '../src/retrieval/output.js'; + +const meta = [ + { + id: 0, + ext: '.js', + kind: 'FunctionDeclaration', + last_author: 'Alice', + chunk_authors: ['Alice'], + docmeta: { visibility: 'public' } + }, + { + id: 1, + ext: '.py', + kind: 'ClassDeclaration', + last_author: 'Bob', + chunk_authors: ['Bob', 'Alice'], + docmeta: { visibility: 'private' } + }, + { + id: 2, + ext: '.py', + kind: 'FunctionDeclaration', + last_author: 'Carol', + chunk_authors: ['Carol'], + docmeta: { visibility: 'public' } + } +]; + +const index = buildFilterIndex(meta); + +const expectIds = (filters, expected, label) => { + const results = filterChunks(meta, filters, index).map((entry) => entry.id).sort(); + const expectedSorted = expected.slice().sort(); + const same = results.length === expectedSorted.length + && results.every((id, i) => id === expectedSorted[i]); + if (!same) { + console.error(`${label} failed: expected ${expectedSorted.join(', ')} got ${results.join(', ')}`); + process.exit(1); + } +}; + +expectIds({ ext: '.py', author: 'bob' }, [1], 'author+ext'); +expectIds({ chunkAuthor: 'alice' }, [0, 1], 'chunkAuthor'); +expectIds({ visibility: 'public', type: 'FunctionDeclaration' }, [0, 2], 'visibility+type'); + +console.log('Filter index test passed'); diff --git a/tests/filter-strictness.js b/tests/filter-strictness.js new file mode 100644 index 000000000..6605c51db --- /dev/null +++ b/tests/filter-strictness.js @@ -0,0 +1,68 @@ +#!/usr/bin/env node +import { filterChunks } from '../src/retrieval/output.js'; +import { buildFilterIndex } from '../src/retrieval/filter-index.js'; + +const meta = [ + { + id: 0, + kind: 'FunctionDeclaration', + last_author: 'Alice', + docmeta: { signature: 'foo(bar)', params: ['bar'] }, + codeRelations: { calls: [['foo', 'fetch']], usages: ['config'] }, + file: 'src/a.js' + }, + { + id: 1, + kind: 'FunctionDeclaration', + docmeta: {}, + codeRelations: {}, + file: 'src/b.js' + }, + { + id: 2, + kind: 'ClassDeclaration', + last_author: 'Bob', + docmeta: { signature: 'baz()', params: ['baz'] }, + codeRelations: { calls: [['baz', 'other']], usages: ['other'] }, + file: 'src/c.js' + }, + { + id: 3, + docmeta: {}, + codeRelations: {}, + file: 'docs/readme.md' + }, + { + id: 4, + kind: ['FunctionDeclaration', 'MethodDefinition'], + last_author: ['Carol', 'Dana'], + docmeta: { signature: 'qux()', params: ['qux'] }, + codeRelations: {}, + file: 'src/nested/util.ts' + } +]; +const filterIndex = buildFilterIndex(meta, { fileChargramN: 3 }); + +const expectIds = (filters, expected, label) => { + const result = filterChunks(meta, filters, filterIndex).map((entry) => entry.id).sort(); + const expectedSorted = expected.slice().sort(); + const ok = 
result.length === expectedSorted.length + && result.every((value, idx) => value === expectedSorted[idx]); + if (!ok) { + console.error(`${label} failed: expected [${expectedSorted.join(', ')}], got [${result.join(', ')}]`); + process.exit(1); + } +}; + +expectIds({ signature: 'foo' }, [0], 'signature filter'); +expectIds({ param: 'bar' }, [0], 'param filter'); +expectIds({ calls: 'fetch' }, [0], 'calls filter'); +expectIds({ uses: 'config' }, [0], 'uses filter'); +expectIds({ type: 'FunctionDeclaration' }, [0, 1, 4], 'type filter strict'); +expectIds({ type: 'FunctionDeclaration ClassDeclaration' }, [0, 1, 2, 4], 'type multi filter'); +expectIds({ author: 'Alice' }, [0], 'author filter strict'); +expectIds({ author: 'car' }, [4], 'author filter substring'); +expectIds({ file: 'src/b.js', filePrefilter: { enabled: true, chargramN: 3 } }, [1], 'file filter substring'); +expectIds({ file: '/util\\.ts$/i', filePrefilter: { enabled: true, chargramN: 3 } }, [4], 'file filter regex'); + +console.log('filter strictness test passed'); diff --git a/tests/fixture-eval.js b/tests/fixture-eval.js index d5069143f..e2010f6b2 100644 --- a/tests/fixture-eval.js +++ b/tests/fixture-eval.js @@ -3,14 +3,19 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; - -const argv = minimist(process.argv.slice(2), { - boolean: ['json', 'write-report'], - string: ['backend', 'out'], - alias: { n: 'top' }, - default: { top: 5, backend: 'memory', json: false, 'write-report': false } -}); +import { createCli } from '../src/shared/cli.js'; + +const argv = createCli({ + scriptName: 'fixture-eval', + options: { + json: { type: 'boolean', default: false }, + 'write-report': { type: 'boolean', default: false }, + backend: { type: 'string', default: 'memory' }, + out: { type: 'string' }, + top: { type: 'number', default: 5 } + }, + aliases: { n: 'top' } +}).parse(); const root = process.cwd(); const fixturesRoot = path.join(root, 'tests', 'fixtures'); diff --git a/tests/fixture-parity.js b/tests/fixture-parity.js index 72576d09a..9b759b840 100644 --- a/tests/fixture-parity.js +++ b/tests/fixture-parity.js @@ -3,23 +3,57 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; +import { createCli } from '../src/shared/cli.js'; const root = process.cwd(); const fixturesRoot = path.join(root, 'tests', 'fixtures'); +const argv = createCli({ + scriptName: 'fixture-parity', + options: { + all: { type: 'boolean', default: false }, + fixture: { type: 'string', default: 'sample' }, + fixtures: { type: 'string', default: '' }, + 'timeout-ms': { type: 'number', default: 300000 } + } +}).parse(); +const parsedTimeout = Number.isFinite(argv['timeout-ms']) ? argv['timeout-ms'] : 300000; +const timeoutMs = Math.max(1000, Math.floor(parsedTimeout)); +const defaultProfile = process.platform === 'win32' ? 
'ci-parity' : ''; +const resolvedProfile = process.env.PAIROFCLEATS_PROFILE || defaultProfile; function resolveFixtures() { + if (argv.fixtures) { + const list = argv.fixtures + .split(/[,\s]+/) + .map((entry) => entry.trim()) + .filter(Boolean); + if (list.length) return list; + } const entries = fs.readdirSync(fixturesRoot, { withFileTypes: true }); - return entries.filter((entry) => entry.isDirectory()).map((entry) => entry.name).sort(); + const allFixtures = entries.filter((entry) => entry.isDirectory()).map((entry) => entry.name).sort(); + if (argv.all) return allFixtures; + return [argv.fixture]; } function run(args, label, cwd, env) { const result = spawnSync(process.execPath, args, { cwd, env, + timeout: timeoutMs, + killSignal: 'SIGTERM', stdio: 'inherit' }); if (result.status !== 0) { - console.error(`Failed: ${label}`); + const details = []; + if (result.error?.code === 'ETIMEDOUT') { + details.push(`timeout after ${timeoutMs}ms`); + } + if (result.signal) details.push(`signal ${result.signal}`); + if (result.error && result.error.code !== 'ETIMEDOUT') { + details.push(result.error.message || String(result.error)); + } + const suffix = details.length ? ` (${details.join(', ')})` : ''; + console.error(`Failed: ${label}${suffix}`); process.exit(result.status ?? 1); } } @@ -32,6 +66,10 @@ if (!fixtures.length) { for (const fixtureName of fixtures) { const fixtureRoot = path.join(fixturesRoot, fixtureName); + if (!fs.existsSync(fixtureRoot)) { + console.error(`Fixture not found: ${fixtureRoot}`); + process.exit(1); + } const cacheRoot = path.join(root, 'tests', '.cache', `parity-${fixtureName}`); console.log(`\nFixture parity: ${fixtureName}`); await fsPromises.rm(cacheRoot, { recursive: true, force: true }); @@ -40,8 +78,12 @@ for (const fixtureName of fixtures) { const env = { ...process.env, PAIROFCLEATS_CACHE_ROOT: cacheRoot, - PAIROFCLEATS_EMBEDDINGS: 'stub' + PAIROFCLEATS_EMBEDDINGS: 'stub', + ...(resolvedProfile ? 
{ PAIROFCLEATS_PROFILE: resolvedProfile } : {}) }; + if (resolvedProfile) { + console.log(`[fixture-parity] profile=${resolvedProfile}`); + } run([path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot], `build index (${fixtureName})`, fixtureRoot, env); run([path.join(root, 'tools', 'build-sqlite-index.js'), '--repo', fixtureRoot], `build sqlite index (${fixtureName})`, fixtureRoot, env); diff --git a/tests/fixture-smoke.js b/tests/fixture-smoke.js index 3649e9dd1..c8712ca32 100644 --- a/tests/fixture-smoke.js +++ b/tests/fixture-smoke.js @@ -3,17 +3,19 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { getIndexDir, getMetricsDir, loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; -import { rankMinhash } from '../src/search/rankers.js'; +import { rankMinhash } from '../src/retrieval/rankers.js'; const root = process.cwd(); const fixturesRoot = path.join(root, 'tests', 'fixtures'); -const argv = minimist(process.argv.slice(2), { - boolean: ['all'], - string: ['fixture'], - default: { fixture: 'sample', all: false } -}); +const argv = createCli({ + scriptName: 'fixture-smoke', + options: { + all: { type: 'boolean', default: false }, + fixture: { type: 'string', default: 'sample' } + } +}).parse(); function resolveFixtures() { if (!argv.all) return [argv.fixture]; @@ -101,9 +103,9 @@ if (!fixtures.length) { } for (const fixtureName of fixtures) { - currentFixtureRoot = path.join(fixturesRoot, fixtureName); - if (!fs.existsSync(currentFixtureRoot)) { - console.error(`Fixture not found: ${currentFixtureRoot}`); + const fixtureSourceRoot = path.join(fixturesRoot, fixtureName); + if (!fs.existsSync(fixtureSourceRoot)) { + console.error(`Fixture not found: ${fixtureSourceRoot}`); process.exit(1); } console.log(`\nFixture smoke: ${fixtureName}`); @@ -118,6 +120,21 @@ for (const fixtureName of fixtures) { PAIROFCLEATS_EMBEDDINGS: 'stub' }; process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; + currentFixtureRoot = fixtureSourceRoot; + const generatorPath = path.join(fixtureSourceRoot, 'generate.js'); + if (fs.existsSync(generatorPath)) { + const generatedRoot = path.join(root, 'tests', '.cache', 'fixtures', fixtureName); + const result = spawnSync( + process.execPath, + [generatorPath, '--out', generatedRoot], + { cwd: fixtureSourceRoot, env: currentEnv, stdio: 'inherit' } + ); + if (result.status !== 0) { + console.error(`Fixture generator failed: ${fixtureName}`); + process.exit(result.status ?? 
1); + } + currentFixtureRoot = generatedRoot; + } const repoArgs = ['--repo', currentFixtureRoot]; run([path.join(root, 'build_index.js'), '--stub-embeddings', ...repoArgs], `build index (${fixtureName})`); @@ -132,8 +149,16 @@ for (const fixtureName of fixtures) { const requiredFiles = [ path.join(codeDir, 'chunk_meta.json'), path.join(codeDir, 'token_postings.json'), + path.join(codeDir, 'dense_vectors_uint8.json'), + path.join(codeDir, 'dense_vectors_doc_uint8.json'), + path.join(codeDir, 'dense_vectors_code_uint8.json'), + path.join(codeDir, 'repo_map.json'), path.join(proseDir, 'chunk_meta.json'), path.join(proseDir, 'token_postings.json'), + path.join(proseDir, 'dense_vectors_uint8.json'), + path.join(proseDir, 'dense_vectors_doc_uint8.json'), + path.join(proseDir, 'dense_vectors_code_uint8.json'), + path.join(proseDir, 'repo_map.json'), path.join(metricsDir, 'index-code.json'), path.join(metricsDir, 'index-prose.json'), sqlitePaths.codePath, @@ -147,12 +172,28 @@ for (const fixtureName of fixtures) { } } + const repoMapPath = path.join(codeDir, 'repo_map.json'); + const repoMapRaw = fs.readFileSync(repoMapPath, 'utf8'); + const repoMap = JSON.parse(repoMapRaw); + if (!Array.isArray(repoMap) || !repoMap.length) { + console.error('Fixture repo map missing or empty.'); + process.exit(1); + } + const sampleEntry = repoMap.find((entry) => entry && entry.file && entry.name); + if (!sampleEntry) { + console.error('Fixture repo map missing expected fields.'); + process.exit(1); + } + assertChunkWeights('code', path.join(codeDir, 'chunk_meta.json')); assertChunkWeights('prose', path.join(proseDir, 'chunk_meta.json')); assertMinhashConsistency('code', path.join(codeDir, 'chunk_meta.json'), path.join(codeDir, 'minhash_signatures.json')); assertMinhashConsistency('prose', path.join(proseDir, 'chunk_meta.json'), path.join(proseDir, 'minhash_signatures.json')); - const queries = loadQueries(currentFixtureRoot); + const queriesRoot = fs.existsSync(path.join(currentFixtureRoot, 'queries.txt')) + ? currentFixtureRoot + : fixtureSourceRoot; + const queries = loadQueries(queriesRoot); const backends = ['memory', 'sqlite-fts']; for (const query of queries) { for (const backend of backends) { @@ -344,6 +385,56 @@ for (const fixtureName of fixtures) { } } + if (fixtureName === 'sample') { + const typeScoped = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'sayHello', '--mode', 'code', '--json', '--backend', 'memory', '--no-ann', '--type', 'MethodDeclaration', ...repoArgs], + { cwd: currentFixtureRoot, env: currentEnv, encoding: 'utf8' } + ); + if (typeScoped.status !== 0) { + console.error('Fixture type filter failed: search error.'); + process.exit(typeScoped.status ?? 1); + } + const typePayload = JSON.parse(typeScoped.stdout || '{}'); + const typeHits = typePayload.code || []; + if (!typeHits.some((hit) => hit.file === 'src/sample.swift' && String(hit.name || '').includes('sayHello'))) { + console.error('Fixture type filter returned no sayHello() hits.'); + process.exit(1); + } + + const signatureScoped = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'sayHello', '--mode', 'code', '--json', '--backend', 'memory', '--no-ann', '--signature', 'func sayHello', ...repoArgs], + { cwd: currentFixtureRoot, env: currentEnv, encoding: 'utf8' } + ); + if (signatureScoped.status !== 0) { + console.error('Fixture signature filter failed: search error.'); + process.exit(signatureScoped.status ?? 
1); + } + const signaturePayload = JSON.parse(signatureScoped.stdout || '{}'); + const signatureHits = signaturePayload.code || []; + if (!signatureHits.some((hit) => hit.file === 'src/sample.swift' && String(hit.name || '').includes('sayHello'))) { + console.error('Fixture signature filter returned no sayHello() hits.'); + process.exit(1); + } + + const decoratorScoped = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'sayHello', '--mode', 'code', '--json', '--backend', 'memory', '--no-ann', '--decorator', 'available', ...repoArgs], + { cwd: currentFixtureRoot, env: currentEnv, encoding: 'utf8' } + ); + if (decoratorScoped.status !== 0) { + console.error('Fixture decorator filter failed: search error.'); + process.exit(decoratorScoped.status ?? 1); + } + const decoratorPayload = JSON.parse(decoratorScoped.stdout || '{}'); + const decoratorHits = decoratorPayload.code || []; + if (!decoratorHits.some((hit) => hit.file === 'src/sample.swift' && String(hit.name || '').includes('sayHello'))) { + console.error('Fixture decorator filter returned no sayHello() hits.'); + process.exit(1); + } + } + if (fixtureName === 'sample') { const rustCheck = spawnSync( process.execPath, diff --git a/tests/fixtures/binary/sample.png b/tests/fixtures/binary/sample.png new file mode 100644 index 000000000..c1f499e3b Binary files /dev/null and b/tests/fixtures/binary/sample.png differ diff --git a/tests/fixtures/ctags/tags.jsonl b/tests/fixtures/ctags/tags.jsonl new file mode 100644 index 000000000..5d1d31900 --- /dev/null +++ b/tests/fixtures/ctags/tags.jsonl @@ -0,0 +1,3 @@ +{"_type":"tag","name":"Widget","path":"src/widget.js","kind":"class","line":3,"language":"JavaScript"} +{"_type":"tag","name":"Widget.render","path":"src/widget.js","kind":"method","line":12,"language":"JavaScript","signature":"render()"} +{"_type":"tag","name":"util","path":"src/util.js","kind":"function","line":1,"language":"JavaScript"} diff --git a/tests/fixtures/dict-scan/tokens.txt b/tests/fixtures/dict-scan/tokens.txt new file mode 100644 index 000000000..3db628457 --- /dev/null +++ b/tests/fixtures/dict-scan/tokens.txt @@ -0,0 +1,6 @@ +alphabeta +userprofile +httprequesthandler +tokenScan +gammaDelta +alphazzzbeta diff --git a/tests/fixtures/dict-scan/words.txt b/tests/fixtures/dict-scan/words.txt new file mode 100644 index 000000000..a0d35f127 --- /dev/null +++ b/tests/fixtures/dict-scan/words.txt @@ -0,0 +1,11 @@ +alpha +beta +gamma +delta +user +profile +http +request +handler +token +scan diff --git a/tests/fixtures/encoding/latin1.js b/tests/fixtures/encoding/latin1.js new file mode 100644 index 000000000..e9daecb5c --- /dev/null +++ b/tests/fixtures/encoding/latin1.js @@ -0,0 +1 @@ +const cafe = "caf";\n \ No newline at end of file diff --git a/tests/fixtures/extensions/vec0-slip.tar b/tests/fixtures/extensions/vec0-slip.tar new file mode 100644 index 000000000..f4cdf95a0 Binary files /dev/null and b/tests/fixtures/extensions/vec0-slip.tar differ diff --git a/tests/fixtures/extensions/vec0-slip.zip b/tests/fixtures/extensions/vec0-slip.zip new file mode 100644 index 000000000..3cbec5a34 Binary files /dev/null and b/tests/fixtures/extensions/vec0-slip.zip differ diff --git a/tests/fixtures/formats/.pairofcleats.json b/tests/fixtures/formats/.pairofcleats.json new file mode 100644 index 000000000..3775e9907 --- /dev/null +++ b/tests/fixtures/formats/.pairofcleats.json @@ -0,0 +1,5 @@ +{ + "indexing": { + "yamlChunking": "top-level" + } +} diff --git a/tests/fixtures/formats/src/styles.css 
b/tests/fixtures/formats/src/styles.css new file mode 100644 index 000000000..ed354b5ac --- /dev/null +++ b/tests/fixtures/formats/src/styles.css @@ -0,0 +1,11 @@ +/* Header styles */ +.page-header { + display: flex; + align-items: center; +} + +@media screen and (max-width: 900px) { + .page-header { + flex-direction: column; + } +} diff --git a/tests/fixtures/formats/src/unknown.html b/tests/fixtures/formats/src/unknown.html index b9564a9b0..0a25660b8 100644 --- a/tests/fixtures/formats/src/unknown.html +++ b/tests/fixtures/formats/src/unknown.html @@ -1,5 +1,10 @@

Fallback Chunk
+name: demo
+[build]
+[server]
+# Doc Block
diff --git a/tests/fixtures/graphs/simple/consumer.js b/tests/fixtures/graphs/simple/consumer.js new file mode 100644 index 000000000..9726d931b --- /dev/null +++ b/tests/fixtures/graphs/simple/consumer.js @@ -0,0 +1,6 @@ +import { createGraphWidget, GraphWidget } from './producer.js'; + +export function buildGraphWidget() { + const widget = new GraphWidget(); + return createGraphWidget(); +} diff --git a/tests/fixtures/graphs/simple/producer.js b/tests/fixtures/graphs/simple/producer.js new file mode 100644 index 000000000..4f178e53d --- /dev/null +++ b/tests/fixtures/graphs/simple/producer.js @@ -0,0 +1,9 @@ +export function createGraphWidget() { + return new GraphWidget(); +} + +export class GraphWidget { + constructor() { + this.id = 1; + } +} diff --git a/tests/fixtures/gtags/gtags.txt b/tests/fixtures/gtags/gtags.txt new file mode 100644 index 000000000..11c2b1f87 --- /dev/null +++ b/tests/fixtures/gtags/gtags.txt @@ -0,0 +1,2 @@ +Widget 3 src/widget.js +render 12 src/widget.js diff --git a/tests/fixtures/languages/src/BUILD b/tests/fixtures/languages/src/BUILD new file mode 100644 index 000000000..195586115 --- /dev/null +++ b/tests/fixtures/languages/src/BUILD @@ -0,0 +1,6 @@ +load("//tools:defs.bzl", "widget_lib") + +widget_lib( + name = "widget", + srcs = ["widget.cc"], +) diff --git a/tests/fixtures/languages/src/CMakeLists.txt b/tests/fixtures/languages/src/CMakeLists.txt new file mode 100644 index 000000000..f0c7c7df3 --- /dev/null +++ b/tests/fixtures/languages/src/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.20) +project(Widget) +add_library(widget_lib src/widget.cpp) +add_executable(widget_cli src/main.cpp) +include(cmake/Extras.cmake) diff --git a/tests/fixtures/languages/src/Dockerfile b/tests/fixtures/languages/src/Dockerfile new file mode 100644 index 000000000..2e11f4b2b --- /dev/null +++ b/tests/fixtures/languages/src/Dockerfile @@ -0,0 +1,6 @@ +FROM node:18 AS base +WORKDIR /app +COPY package.json package-lock.json ./ +RUN npm ci +COPY . . 
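+# Default start command; this fixture only exercises Dockerfile indexing, so server.js need not exist.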
+CMD ["node", "server.js"] diff --git a/tests/fixtures/languages/src/Gemfile b/tests/fixtures/languages/src/Gemfile new file mode 100644 index 000000000..f701cd13b --- /dev/null +++ b/tests/fixtures/languages/src/Gemfile @@ -0,0 +1,5 @@ +source 'https://rubygems.org' + +def build_widget + puts 'gemfile widget' +end diff --git a/tests/fixtures/languages/src/Makefile b/tests/fixtures/languages/src/Makefile new file mode 100644 index 000000000..acb15d039 --- /dev/null +++ b/tests/fixtures/languages/src/Makefile @@ -0,0 +1,7 @@ +build: + @echo "Building" + +test: + @echo "Testing" + +include tools.mk diff --git a/tests/fixtures/languages/src/WORKSPACE b/tests/fixtures/languages/src/WORKSPACE new file mode 100644 index 000000000..d10eb11a5 --- /dev/null +++ b/tests/fixtures/languages/src/WORKSPACE @@ -0,0 +1,3 @@ +workspace(name = "widgets") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") diff --git a/tests/fixtures/languages/src/Widget.groovy b/tests/fixtures/languages/src/Widget.groovy new file mode 100644 index 000000000..d5538d5a1 --- /dev/null +++ b/tests/fixtures/languages/src/Widget.groovy @@ -0,0 +1,7 @@ +class Widget { + String name +} + +def buildWidget() { + return new Widget(name: 'ok') +} diff --git a/tests/fixtures/languages/src/Widget.scala b/tests/fixtures/languages/src/Widget.scala new file mode 100644 index 000000000..1cd35d230 --- /dev/null +++ b/tests/fixtures/languages/src/Widget.scala @@ -0,0 +1,5 @@ +case class Widget(name: String) + +object WidgetFactory { + def buildWidget(): Widget = Widget("ok") +} diff --git a/tests/fixtures/languages/src/default.nix b/tests/fixtures/languages/src/default.nix new file mode 100644 index 000000000..1f5e0473d --- /dev/null +++ b/tests/fixtures/languages/src/default.nix @@ -0,0 +1,5 @@ +{ pkgs ? import <nixpkgs> {} }: +let + widget = pkgs.hello; +in + widget diff --git a/tests/fixtures/languages/src/defs.bzl b/tests/fixtures/languages/src/defs.bzl new file mode 100644 index 000000000..62d037e3f --- /dev/null +++ b/tests/fixtures/languages/src/defs.bzl @@ -0,0 +1,6 @@ +def widget_rule(name): + native.genrule( + name = name, + outs = ["widget.txt"], + cmd = "echo hello > $@", + ) diff --git a/tests/fixtures/languages/src/javascript_flow.js b/tests/fixtures/languages/src/javascript_flow.js new file mode 100644 index 000000000..a4974f780 --- /dev/null +++ b/tests/fixtures/languages/src/javascript_flow.js @@ -0,0 +1,15 @@ +/* @flow */ +import type { User } from './types'; +import { parse } from 'flow-parser'; + +export type Id = string; + +export function greet(user: User, id: Id): string { + return `${user.name}-${id}`; +} + +const handler = (name: string): void => { + parse(name); +}; + +export const api = { handler }; diff --git a/tests/fixtures/languages/src/schema.graphql b/tests/fixtures/languages/src/schema.graphql new file mode 100644 index 000000000..6c2f193ec --- /dev/null +++ b/tests/fixtures/languages/src/schema.graphql @@ -0,0 +1,17 @@ +schema { + query: Query +} + +type Query { + widget(id: ID!): Widget +} + +type Widget { + id: ID! + name: String!
+} + +enum WidgetState { + ACTIVE + INACTIVE +} diff --git a/tests/fixtures/languages/src/schema.proto b/tests/fixtures/languages/src/schema.proto new file mode 100644 index 000000000..92b3561da --- /dev/null +++ b/tests/fixtures/languages/src/schema.proto @@ -0,0 +1,24 @@ +syntax = "proto3"; + +package widgets; + +message Widget { + string name = 1; +} + +enum State { + STATE_UNKNOWN = 0; + STATE_ACTIVE = 1; +} + +service WidgetService { + rpc GetWidget (WidgetRequest) returns (WidgetResponse); +} + +message WidgetRequest { + string id = 1; +} + +message WidgetResponse { + Widget widget = 1; +} diff --git a/tests/fixtures/languages/src/types.js b/tests/fixtures/languages/src/types.js new file mode 100644 index 000000000..809f05666 --- /dev/null +++ b/tests/fixtures/languages/src/types.js @@ -0,0 +1,5 @@ +/* @flow */ + +export type User = { + name: string +}; diff --git a/tests/fixtures/languages/src/widget.dart b/tests/fixtures/languages/src/widget.dart new file mode 100644 index 000000000..f77fb9731 --- /dev/null +++ b/tests/fixtures/languages/src/widget.dart @@ -0,0 +1,9 @@ +class Widget { + int size() { + return 42; + } +} + +void buildWidget() { + print('ok'); +} diff --git a/tests/fixtures/languages/src/widget.djhtml b/tests/fixtures/languages/src/widget.djhtml new file mode 100644 index 000000000..4f7e3f3c1 --- /dev/null +++ b/tests/fixtures/languages/src/widget.djhtml @@ -0,0 +1,4 @@ +{% load static %} +{% block body %} + +{% endblock %} diff --git a/tests/fixtures/languages/src/widget.hbs b/tests/fixtures/languages/src/widget.hbs new file mode 100644 index 000000000..77665d115 --- /dev/null +++ b/tests/fixtures/languages/src/widget.hbs @@ -0,0 +1,4 @@ +{{! Widget template }} +{{#widgets}} +
{{name}}
+{{/widgets}} diff --git a/tests/fixtures/languages/src/widget.jinja2 b/tests/fixtures/languages/src/widget.jinja2 new file mode 100644 index 000000000..71baf8435 --- /dev/null +++ b/tests/fixtures/languages/src/widget.jinja2 @@ -0,0 +1,4 @@ +{% extends "base.html" %} +{% block content %} + {{ widget.name }} +{% endblock %} diff --git a/tests/fixtures/languages/src/widget.jl b/tests/fixtures/languages/src/widget.jl new file mode 100644 index 000000000..993b578e2 --- /dev/null +++ b/tests/fixtures/languages/src/widget.jl @@ -0,0 +1,7 @@ +module Widget +export build_widget + +function build_widget(x) + x + 1 +end +end diff --git a/tests/fixtures/languages/src/widget.mustache b/tests/fixtures/languages/src/widget.mustache new file mode 100644 index 000000000..0b16cf341 --- /dev/null +++ b/tests/fixtures/languages/src/widget.mustache @@ -0,0 +1,4 @@ +{{! Widget template }} +{{#widget}} + {{name}} +{{/widget}} diff --git a/tests/fixtures/languages/src/widget.r b/tests/fixtures/languages/src/widget.r new file mode 100644 index 000000000..7256bea32 --- /dev/null +++ b/tests/fixtures/languages/src/widget.r @@ -0,0 +1,3 @@ +build_widget <- function(x) { + x + 1 +} diff --git a/tests/fixtures/languages/src/widget.razor b/tests/fixtures/languages/src/widget.razor new file mode 100644 index 000000000..f634dad99 --- /dev/null +++ b/tests/fixtures/languages/src/widget.razor @@ -0,0 +1,5 @@ +@page "/widget" +@code { + void BuildWidget() { + } +} diff --git a/tests/fixtures/lsif/dump.lsif b/tests/fixtures/lsif/dump.lsif new file mode 100644 index 000000000..a2bcf74db --- /dev/null +++ b/tests/fixtures/lsif/dump.lsif @@ -0,0 +1,7 @@ +{"id":1,"type":"vertex","label":"document","uri":"file:///repo/src/sample.ts","languageId":"typescript"} +{"id":2,"type":"vertex","label":"range","start":{"line":1,"character":0},"end":{"line":1,"character":5},"tag":"foo"} +{"id":3,"type":"vertex","label":"definitionResult"} +{"id":4,"type":"vertex","label":"referenceResult"} +{"id":5,"type":"edge","label":"contains","outV":1,"inVs":[2]} +{"id":6,"type":"edge","label":"item","outV":2,"inVs":[3]} +{"id":7,"type":"edge","label":"item","outV":2,"inVs":[4]} diff --git a/tests/fixtures/lsp/bin/clangd b/tests/fixtures/lsp/bin/clangd new file mode 100644 index 000000000..2e6327493 --- /dev/null +++ b/tests/fixtures/lsp/bin/clangd @@ -0,0 +1,16 @@ +#!/usr/bin/env node +import { spawn } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const args = process.argv.slice(2); +if (args.includes('--version') || args.includes('--help')) { + process.stdout.write('clangd stub\n'); + process.exit(0); +} + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const script = path.join(__dirname, '..', 'stub-lsp-server.js'); +const child = spawn(process.execPath, [script, '--mode', 'clangd'], { stdio: 'inherit' }); +child.on('exit', (code) => process.exit(code ?? 
0)); diff --git a/tests/fixtures/lsp/bin/clangd.cmd b/tests/fixtures/lsp/bin/clangd.cmd new file mode 100644 index 000000000..cacebed48 --- /dev/null +++ b/tests/fixtures/lsp/bin/clangd.cmd @@ -0,0 +1,5 @@ +@echo off +setlocal +if "%1"=="--version" exit /b 0 +if "%1"=="--help" exit /b 0 +node "%~dp0\..\stub-lsp-server.js" --mode clangd diff --git a/tests/fixtures/lsp/bin/pyright-langserver b/tests/fixtures/lsp/bin/pyright-langserver new file mode 100644 index 000000000..e4d0b4828 --- /dev/null +++ b/tests/fixtures/lsp/bin/pyright-langserver @@ -0,0 +1,16 @@ +#!/usr/bin/env node +import { spawn } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const args = process.argv.slice(2); +if (args.includes('--version') || args.includes('--help')) { + process.stdout.write('pyright-langserver stub\n'); + process.exit(0); +} + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const script = path.join(__dirname, '..', 'stub-lsp-server.js'); +const child = spawn(process.execPath, [script, '--mode', 'pyright'], { stdio: 'inherit' }); +child.on('exit', (code) => process.exit(code ?? 0)); diff --git a/tests/fixtures/lsp/bin/pyright-langserver.cmd b/tests/fixtures/lsp/bin/pyright-langserver.cmd new file mode 100644 index 000000000..4ec397478 --- /dev/null +++ b/tests/fixtures/lsp/bin/pyright-langserver.cmd @@ -0,0 +1,5 @@ +@echo off +setlocal +if "%1"=="--version" exit /b 0 +if "%1"=="--help" exit /b 0 +node "%~dp0\..\stub-lsp-server.js" --mode pyright diff --git a/tests/fixtures/lsp/bin/sourcekit-lsp b/tests/fixtures/lsp/bin/sourcekit-lsp new file mode 100644 index 000000000..e22ff0303 --- /dev/null +++ b/tests/fixtures/lsp/bin/sourcekit-lsp @@ -0,0 +1,16 @@ +#!/usr/bin/env node +import { spawn } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const args = process.argv.slice(2); +if (args.includes('--version') || args.includes('--help')) { + process.stdout.write('sourcekit-lsp stub\n'); + process.exit(0); +} + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const script = path.join(__dirname, '..', 'stub-lsp-server.js'); +const child = spawn(process.execPath, [script, '--mode', 'sourcekit'], { stdio: 'inherit' }); +child.on('exit', (code) => process.exit(code ?? 0)); diff --git a/tests/fixtures/lsp/bin/sourcekit-lsp.cmd b/tests/fixtures/lsp/bin/sourcekit-lsp.cmd new file mode 100644 index 000000000..ed465d565 --- /dev/null +++ b/tests/fixtures/lsp/bin/sourcekit-lsp.cmd @@ -0,0 +1,5 @@ +@echo off +setlocal +if "%1"=="--version" exit /b 0 +if "%1"=="--help" exit /b 0 +node "%~dp0\..\stub-lsp-server.js" --mode sourcekit diff --git a/tests/fixtures/lsp/stub-lsp-server.js b/tests/fixtures/lsp/stub-lsp-server.js new file mode 100644 index 000000000..16e810da8 --- /dev/null +++ b/tests/fixtures/lsp/stub-lsp-server.js @@ -0,0 +1,161 @@ +#!/usr/bin/env node +import { createFramedJsonRpcParser, writeFramedJsonRpc } from '../../../src/shared/jsonrpc.js'; + +const args = process.argv.slice(2); +const modeIdx = args.indexOf('--mode'); +const mode = modeIdx !== -1 && args[modeIdx + 1] ? 
args[modeIdx + 1] : 'clangd'; +const exitOnShutdown = args.includes('--exit-on-shutdown'); + +const symbolsByMode = { + clangd: { + name: 'add', + detail: 'int add(int a, int b)', + kind: 12 + }, + sourcekit: { + name: 'greet', + detail: 'func greet(name: String, count: Int) -> String', + kind: 12 + }, + pyright: { + name: 'greet', + detail: 'def greet(name: str) -> str', + kind: 12 + } +}; + +const config = symbolsByMode[mode] || symbolsByMode.clangd; +const pyrightDiagnostic = { + message: 'Stub pyright diagnostic', + severity: 2, + code: 'PYRIGHT_STUB', + source: 'pyright', + range: { + start: { line: 0, character: 0 }, + end: { line: 0, character: 1 } + } +}; +const documents = new Map(); + +const send = (payload) => { + const pending = writeFramedJsonRpc(process.stdout, payload); + if (pending && typeof pending.catch === 'function') { + pending.catch(() => {}); + } +}; + +const lineColForIndex = (text, index) => { + const before = text.slice(0, Math.max(0, index)); + const lines = before.split(/\r?\n/); + const line = Math.max(0, lines.length - 1); + const character = lines.length ? lines[lines.length - 1].length : 0; + return { line, character }; +}; + +const buildSymbol = (text) => { + if (mode === 'pyright') { + const match = text.match(/^\s*(?:async\s+)?def\s+([A-Za-z_][\w]*)\s*\(([^)]*)\)\s*(?:->\s*([^:]+))?\s*:/m); + if (match) { + const name = match[1]; + const params = match[2] || ''; + const returnType = match[3] ? ` -> ${match[3].trim()}` : ''; + const detail = `def ${name}(${params})${returnType}`.trim(); + const idx = text.indexOf(name); + const start = lineColForIndex(text || '', idx >= 0 ? idx : 0); + const end = lineColForIndex(text || '', idx >= 0 ? idx + name.length : 1); + return { + name, + kind: config.kind, + detail, + range: { start, end }, + selectionRange: { start, end } + }; + } + } + const name = config.name; + const detail = config.detail; + const idx = text ? text.indexOf(name) : -1; + const start = lineColForIndex(text || '', idx >= 0 ? idx : 0); + const end = lineColForIndex(text || '', idx >= 0 ? idx + name.length : 1); + return { + name, + kind: config.kind, + detail, + range: { start, end }, + selectionRange: { start, end } + }; +}; + +const respond = (id, result) => send({ jsonrpc: '2.0', id, result }); +const respondError = (id, message) => send({ jsonrpc: '2.0', id, error: { code: -32601, message } }); + +const handleRequest = (message) => { + const { id, method, params } = message; + if (method === 'initialize') { + respond(id, { + capabilities: { + documentSymbolProvider: true, + hoverProvider: true + } + }); + return; + } + if (method === 'shutdown') { + respond(id, null); + if (exitOnShutdown) { + setTimeout(() => process.exit(0), 0); + } + return; + } + if (method === 'textDocument/documentSymbol') { + const uri = params?.textDocument?.uri; + const text = documents.get(uri) || ''; + const symbol = buildSymbol(text); + respond(id, symbol ? 
[symbol] : []); + return; + } + if (method === 'textDocument/hover') { + respond(id, { + contents: { kind: 'plaintext', value: config.detail } + }); + return; + } + respondError(id, `Method not supported: ${method}`); +}; + +const handleNotification = (message) => { + if (!message?.method) return; + if (message.method === 'textDocument/didOpen') { + const uri = message.params?.textDocument?.uri; + const text = message.params?.textDocument?.text || ''; + if (uri) documents.set(uri, text); + if (uri && mode === 'pyright') { + send({ + jsonrpc: '2.0', + method: 'textDocument/publishDiagnostics', + params: { uri, diagnostics: [pyrightDiagnostic] } + }); + } + } else if (message.method === 'textDocument/didClose') { + const uri = message.params?.textDocument?.uri; + if (uri) documents.delete(uri); + } else if (message.method === 'exit') { + process.exit(0); + } +}; + +const parser = createFramedJsonRpcParser({ + onMessage: (message) => { + if (!message || typeof message !== 'object') return; + if (Object.prototype.hasOwnProperty.call(message, 'id') && message.method) { + handleRequest(message); + return; + } + handleNotification(message); + }, + onError: (err) => { + process.stderr.write(`stub lsp parse error: ${err.message}\n`); + } +}); + +process.stdin.on('data', (chunk) => parser.push(chunk)); diff --git a/tests/fixtures/mcp/schema-snapshot.json b/tests/fixtures/mcp/schema-snapshot.json new file mode 100644 index 000000000..bff704c77 --- /dev/null +++ b/tests/fixtures/mcp/schema-snapshot.json @@ -0,0 +1,474 @@ +{ + "tools": [ + { + "name": "index_status", + "required": [], + "properties": [ + "repoPath" + ] + }, + { + "name": "config_status", + "required": [], + "properties": [ + "repoPath" + ] + }, + { + "name": "build_index", + "required": [], + "properties": [ + "artifactsDir", + "incremental", + "mode", + "repoPath", + "sqlite", + "stubEmbeddings", + "useArtifacts" + ] + }, + { + "name": "search", + "required": [ + "query" + ], + "properties": [ + "alias", + "ann", + "async", + "author", + "awaits", + "backend", + "branch", + "branchesMin", + "breaksMin", + "calls", + "case", + "caseFile", + "caseTokens", + "chunkAuthor", + "churnMin", + "context", + "continuesMin", + "decorator", + "ext", + "extends", + "file", + "generator", + "import", + "inferredType", + "lang", + "lint", + "loopsMin", + "meta", + "metaJson", + "mode", + "modifiedAfter", + "modifiedSince", + "mutates", + "output", + "param", + "path", + "query", + "reads", + "repoPath", + "returnType", + "returns", + "risk", + "riskCategory", + "riskFlow", + "riskSink", + "riskSource", + "riskTag", + "signature", + "throws", + "top", + "type", + "uses", + "visibility", + "writes" + ] + }, + { + "name": "triage_ingest", + "required": [ + "inputPath", + "source" + ], + "properties": [ + "buildIndex", + "incremental", + "inputPath", + "meta", + "repoPath", + "source", + "stubEmbeddings" + ] + }, + { + "name": "triage_decision", + "required": [ + "finding", + "status" + ], + "properties": [ + "codes", + "evidence", + "expires", + "finding", + "justification", + "meta", + "repoPath", + "reviewer", + "status" + ] + }, + { + "name": "triage_context_pack", + "required": [ + "recordId" + ], + "properties": [ + "ann", + "outPath", + "recordId", + "repoPath", + "stubEmbeddings" + ] + }, + { + "name": "download_models", + "required": [], + "properties": [ + "cacheDir", + "model", + "repoPath" + ] + }, + { + "name": "download_dictionaries", + "required": [], + "properties": [ + "dir", + "force", + "lang", + "repoPath", + "update", + "url" + ] + 
}, + { + "name": "download_extensions", + "required": [], + "properties": [ + "arch", + "dir", + "force", + "out", + "platform", + "provider", + "repoPath", + "update", + "url" + ] + }, + { + "name": "verify_extensions", + "required": [], + "properties": [ + "annMode", + "arch", + "column", + "dir", + "encoding", + "load", + "module", + "options", + "path", + "platform", + "provider", + "repoPath", + "table" + ] + }, + { + "name": "build_sqlite_index", + "required": [], + "properties": [ + "codeDir", + "compact", + "incremental", + "mode", + "out", + "proseDir", + "repoPath" + ] + }, + { + "name": "compact_sqlite_index", + "required": [], + "properties": [ + "dryRun", + "keepBackup", + "mode", + "repoPath" + ] + }, + { + "name": "cache_gc", + "required": [], + "properties": [ + "dryRun", + "maxAgeDays", + "maxBytes", + "maxGb", + "repoPath" + ] + }, + { + "name": "clean_artifacts", + "required": [], + "properties": [ + "all", + "dryRun", + "repoPath" + ] + }, + { + "name": "bootstrap", + "required": [], + "properties": [ + "incremental", + "repoPath", + "skipArtifacts", + "skipDicts", + "skipIndex", + "skipInstall", + "skipTooling", + "withSqlite" + ] + }, + { + "name": "report_artifacts", + "required": [], + "properties": [ + "repoPath" + ] + } + ], + "responses": { + "index_status": { + "cacheRoot": "", + "dictionaries": { + "dir": "", + "enabled": "", + "files": [], + "includeSlang": "" + }, + "git": { + "isRepo": "", + "warning": "" + }, + "incremental": { + "dir": "", + "exists": "" + }, + "index": { + "code": { + "chunkMeta": { + "bytes": "", + "exists": "", + "mtime": "" + }, + "dir": "", + "tokenPostings": { + "bytes": "", + "exists": "", + "mtime": "" + } + }, + "prose": { + "chunkMeta": { + "bytes": "", + "exists": "", + "mtime": "" + }, + "dir": "", + "tokenPostings": { + "bytes": "", + "exists": "", + "mtime": "" + } + }, + "records": { + "chunkMeta": { + "bytes": "", + "exists": "", + "mtime": "" + }, + "dir": "", + "tokenPostings": { + "bytes": "", + "exists": "", + "mtime": "" + } + } + }, + "metrics": { + "dir": "", + "indexCode": { + "bytes": "", + "exists": "", + "mtime": "" + }, + "indexProse": { + "bytes": "", + "exists": "", + "mtime": "" + }, + "indexRecords": { + "bytes": "", + "exists": "", + "mtime": "" + }, + "queryCache": { + "bytes": "", + "exists": "", + "mtime": "" + } + }, + "models": { + "available": "", + "dir": "", + "hint": "", + "model": "" + }, + "repoCacheRoot": "", + "repoId": "", + "repoPath": "", + "sqlite": { + "code": { + "bytes": "", + "exists": "", + "mtime": "", + "path": "" + }, + "legacy": "", + "prose": { + "bytes": "", + "exists": "", + "mtime": "", + "path": "" + } + } + }, + "config_status": { + "cache": { + "cacheRootExists": "", + "dictionaries": [], + "modelAvailable": "", + "repoCacheExists": "", + "sqlite": { + "codeExists": "", + "proseExists": "" + }, + "vectorExtension": { + "available": "", + "enabled": "", + "path": "" + } + }, + "config": { + "cacheRoot": "", + "dictionary": { + "dir": "", + "dpMaxTokenLength": "", + "dpMaxTokenLengthByFileCount": [ + { + "dpMaxTokenLength": "", + "maxFiles": "" + }, + { + "dpMaxTokenLength": "", + "maxFiles": "" + }, + { + "dpMaxTokenLength": "", + "maxFiles": "" + } + ], + "enableRepoDictionary": "", + "files": [], + "includeSlang": "", + "languages": [ + "" + ], + "segmentation": "", + "slangDirs": [], + "slangFiles": [] + }, + "indexing": { + "astDataflow": "", + "complexity": "", + "controlFlow": "", + "gitBlame": "", + "importScan": "", + "lint": "", + "postings": { + "chargramMaxN": "", 
+ "chargramMaxTokenLength": "", + "chargramMinN": "", + "chargramSource": "", + "enableChargrams": "", + "enablePhraseNgrams": "", + "fielded": "", + "phraseMaxN": "", + "phraseMinN": "" + }, + "pythonAst": { + "enabled": "" + }, + "riskAnalysis": "", + "riskAnalysisCrossFile": "", + "treeSitter": { + "enabled": "" + }, + "typeInference": "", + "typeInferenceCrossFile": "" + }, + "models": { + "dir": "", + "id": "" + }, + "repoCacheRoot": "", + "search": { + "annDefault": "", + "denseVectorMode": "" + }, + "sqlite": { + "annMode": "", + "codeDbPath": "", + "proseDbPath": "", + "use": "" + }, + "tooling": {} + }, + "repoId": "", + "repoPath": "", + "warnings": [ + { + "code": "", + "message": "" + }, + { + "code": "", + "message": "" + }, + { + "code": "", + "message": "" + } + ] + } + } +} diff --git a/tests/fixtures/medium/README.md b/tests/fixtures/medium/README.md new file mode 100644 index 000000000..fd4c8b472 --- /dev/null +++ b/tests/fixtures/medium/README.md @@ -0,0 +1,15 @@ +# Medium fixture + +This fixture is generated on demand to avoid committing thousands of files. + +- Generator: `tests/fixtures/medium/generate.js` +- Default output: `tests/.cache/fixtures/medium` +- Default size: 5,000 files (adjust with `--count`) + +Example: + +```bash +node tests/fixtures/medium/generate.js --out tests/.cache/fixtures/medium --count 8000 +``` + +`tests/fixture-smoke.js` will auto-generate this fixture when it detects `generate.js`. diff --git a/tests/fixtures/medium/generate.js b/tests/fixtures/medium/generate.js new file mode 100644 index 000000000..79962a420 --- /dev/null +++ b/tests/fixtures/medium/generate.js @@ -0,0 +1,173 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import crypto from 'node:crypto'; +import { fileURLToPath } from 'node:url'; +import { createCli } from '../../../src/shared/cli.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(__dirname, '..', '..', '..'); +const argv = createCli({ + scriptName: 'fixture-medium-generate', + options: { + out: { type: 'string' }, + count: { type: 'number', default: 5000 }, + seed: { type: 'string', default: 'medium-fixture' }, + clean: { type: 'boolean', default: false } + } +}).parse(); + +const outRoot = argv.out + ? path.resolve(argv.out) + : path.join(repoRoot, 'tests', '.cache', 'fixtures', 'medium'); +const fileCount = Number.isFinite(argv.count) + ? 
Math.max(1, Math.floor(argv.count))
+  : 5000;
+const seed = String(argv.seed || 'medium-fixture');
+
+const hashSeed = (value) => {
+  let h = 2166136261;
+  for (let i = 0; i < value.length; i += 1) {
+    h ^= value.charCodeAt(i);
+    h = Math.imul(h, 16777619);
+  }
+  return h >>> 0;
+};
+
+const createRng = (value) => {
+  let state = hashSeed(value);
+  return () => {
+    state = (Math.imul(state, 1664525) + 1013904223) >>> 0;
+    return state / 0x100000000;
+  };
+};
+
+const rng = createRng(seed);
+const words = ['alpha', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf', 'hotel'];
+
+const templates = [
+  {
+    dir: path.join('src', 'js'),
+    ext: 'js',
+    render: (i) => {
+      const word = words[i % words.length];
+      const n = Math.floor(rng() * 1000);
+      return [
+        `export function fn_${i}(input) {`,
+        `  const tag = '${word}-${i}';`,
+        `  return input + ${n};`,
+        '}',
+        ''
+      ].join('\n');
+    }
+  },
+  {
+    dir: path.join('src', 'ts'),
+    ext: 'ts',
+    render: (i) => {
+      const n = Math.floor(rng() * 1000);
+      return [
+        `export interface Widget${i} {`,
+        '  id: number;',
+        '  name: string;',
+        '}',
+        `export const widget${i}: Widget${i} = { id: ${n}, name: 'widget-${i}' };`,
+        ''
+      ].join('\n');
+    }
+  },
+  {
+    dir: path.join('src', 'py'),
+    ext: 'py',
+    render: (i) => {
+      const n = Math.floor(rng() * 1000);
+      return [
+        `def handler_${i}(value: int) -> int:`,
+        `    return value + ${n}`,
+        ''
+      ].join('\n');
+    }
+  },
+  {
+    dir: path.join('docs'),
+    ext: 'md',
+    render: (i) => {
+      const word = words[(i + 3) % words.length];
+      return [
+        `# Note ${i}`,
+        '',
+        `This is the ${word} fixture entry for file ${i}.`,
+        '',
+        '```js',
+        `export const sample = ${i};`,
+        '```',
+        ''
+      ].join('\n');
+    }
+  },
+  {
+    dir: path.join('web'),
+    ext: 'html',
+    render: (i) => {
+      const n = Math.floor(rng() * 1000);
+      return [
+        '<!doctype html>',
+        '<html>',
+        '  <head>',
+        '    <meta charset="utf-8">',
+        '    <title></title>',
+        '  </head>',
+        '  <body>',
+        `    <div data-value="${n}">Item ${i}</div>`,
+        '  </body>',
+        '</html>',
+        ''
+      ].join('\n');
+    }
+  }
+];
+
+if (argv.clean) {
+  await fsPromises.rm(outRoot, { recursive: true, force: true });
+}
+
+await fsPromises.mkdir(outRoot, { recursive: true });
+
+const manifest = {
+  seed,
+  fileCount,
+  generatedAt: new Date().toISOString(),
+  filesByExt: {},
+  totalBytes: 0,
+  contentHash: ''
+};
+const hash = crypto.createHash('sha1');
+
+for (let i = 0; i < fileCount; i += 1) {
+  const template = templates[i % templates.length];
+  const group = Math.floor(i / 500);
+  const dir = path.join(outRoot, template.dir, `group-${group}`);
+  await fsPromises.mkdir(dir, { recursive: true });
+  const fileName = `file-${i}.${template.ext}`;
+  const content = template.render(i);
+  const relPath = path.join(template.dir, `group-${group}`, fileName);
+  await fsPromises.writeFile(path.join(outRoot, relPath), content, 'utf8');
+  manifest.filesByExt[template.ext] = (manifest.filesByExt[template.ext] || 0) + 1;
+  manifest.totalBytes += Buffer.byteLength(content, 'utf8');
+  hash.update(relPath.replace(/\\/g, '/'));
+  hash.update('\n');
+  hash.update(content);
+  hash.update('\n');
+}
+
+manifest.contentHash = `sha1:${hash.digest('hex')}`;
+await fsPromises.writeFile(
+  path.join(outRoot, 'manifest.json'),
+  `${JSON.stringify(manifest, null, 2)}\n`,
+  'utf8'
+);
+
+console.log(`Generated medium fixture at ${outRoot} (${fileCount} files).`);
diff --git a/tests/fixtures/scip/index.json b/tests/fixtures/scip/index.json
new file mode 100644
index 000000000..c640a1f8f
--- /dev/null
+++ b/tests/fixtures/scip/index.json
@@ -0,0 +1,28 @@
+{
+  "documents": [
+    {
+      "relativePath": "src/example.js",
+      "language": "JavaScript",
+      "symbols": [
+        {
+          "symbol": "local 1",
+          "kind": "Function",
+          "displayName": "doThing",
+          "signature": "doThing()"
+        }
+      ],
+      "occurrences": [
+        {
+          "range": [1, 0, 1, 7],
+          "symbol": "local 1",
+          "symbolRoles": 1
+        },
+        {
+          "range": [3, 2, 3, 9],
+          "symbol": "local 1",
+          "symbolRoles": 2
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/segments/README.md b/tests/fixtures/segments/README.md
new file mode 100644
index 000000000..86ab279d0
--- /dev/null
+++ b/tests/fixtures/segments/README.md
@@ -0,0 +1,3 @@
+# Segments fixture
+
+Fixtures for segmented document + comment extraction coverage.
diff --git a/tests/fixtures/segments/docs/guide.md b/tests/fixtures/segments/docs/guide.md
new file mode 100644
index 000000000..cd40c5dbc
--- /dev/null
+++ b/tests/fixtures/segments/docs/guide.md
@@ -0,0 +1,21 @@
+---
+title: Segment Guide
+tags:
+  - docs
+---
+
+# Segment Guide
+
+This guide has `short` and `inline_code_span` plus `ok_span_long`.
+
+```js
+const answer = 42;
+```
+
+```json
+{
+  "name": "widget"
+}
+```
+
+More prose here.
diff --git a/tests/fixtures/segments/src/comments.js b/tests/fixtures/segments/src/comments.js
new file mode 100644
index 000000000..97d5726a4
--- /dev/null
+++ b/tests/fixtures/segments/src/comments.js
@@ -0,0 +1,20 @@
+/**
+ * Widget config.
+ *
+ * ```json
+ * {
+ *   "name": "widget",
+ *   "enabled": true
+ * }
+ * ```
+ */
+export function buildWidget() {
+  // short
+  // Longer inline comment that should be indexed by prose tokenization.
+  /* Block comment that should be indexed as well. */
+  // generated by lint
+  console.log('ok');
+  return true;
+}
+
+/* Copyright 2025 Example */
diff --git a/tests/fixtures/segments/src/component.vue b/tests/fixtures/segments/src/component.vue
new file mode 100644
index 000000000..ece3e08e0
--- /dev/null
+++ b/tests/fixtures/segments/src/component.vue
@@ -0,0 +1,11 @@
+<template>
+  <div class="widget">{{ label }}</div>
+</template>
+
+<script>
+export default {
+  data() {
+    return { label: 'Vue' };
+  }
+};
+</script>
diff --git a/tests/fixtures/segments/src/page.astro b/tests/fixtures/segments/src/page.astro
new file mode 100644
index 000000000..20bf98c25
--- /dev/null
+++ b/tests/fixtures/segments/src/page.astro
@@ -0,0 +1,12 @@
+---
+const label = 'Astro';
+---
+<!doctype html>
+<html>
+  <head>
+    <meta charset="utf-8" />
+  </head>
+  <body>
+    <h1>{label}</h1>
+  </body>
+</html>
diff --git a/tests/fixtures/segments/src/widget.svelte b/tests/fixtures/segments/src/widget.svelte
new file mode 100644
index 000000000..24cb1538c
--- /dev/null
+++ b/tests/fixtures/segments/src/widget.svelte
@@ -0,0 +1,9 @@
+<script>
+  const label = 'Svelte';
+</script>
+
+<style>
+  .widget { font-weight: bold; }
+</style>
+
+<div class="widget">{label}</div>
diff --git a/tests/fixtures/structural/bin/comby b/tests/fixtures/structural/bin/comby
new file mode 100644
index 000000000..ff0ee986f
--- /dev/null
+++ b/tests/fixtures/structural/bin/comby
@@ -0,0 +1,19 @@
+#!/usr/bin/env node
+const args = process.argv.slice(2);
+if (args.includes('--version') || args.includes('--help')) {
+  process.stdout.write('comby stub\n');
+  process.exit(0);
+}
+const payload = {
+  uri: 'docs/notes.md',
+  matches: [
+    {
+      matched: 'TODO: update',
+      range: {
+        start: { line: 1, col: 1 },
+        end: { line: 1, col: 12 }
+      }
+    }
+  ]
+};
+process.stdout.write(`${JSON.stringify(payload)}\n`);
diff --git a/tests/fixtures/structural/bin/semgrep b/tests/fixtures/structural/bin/semgrep
new file mode 100644
index 000000000..4859e69df
--- /dev/null
+++ b/tests/fixtures/structural/bin/semgrep
@@ -0,0 +1,23 @@
+#!/usr/bin/env node
+const args = process.argv.slice(2);
+if (args.includes('--version') || args.includes('--help')) {
+  process.stdout.write('semgrep stub\n');
+  process.exit(0);
+}
+const payload = {
+  results: [
+    {
+      check_id: 'semgrep.stub',
+      path: 'src/example.js',
+      start: { line: 2, col: 3 },
+      end: { line: 2, col: 10 },
+      extra: {
+        message: 'stub semgrep match',
+        severity: 'WARNING',
+        lines: 'eval("x")',
+        metadata: { tags: ['security'] }
+      }
+    }
+  ]
+};
+process.stdout.write(`${JSON.stringify(payload)}\n`);
diff --git a/tests/fixtures/structural/bin/sg b/tests/fixtures/structural/bin/sg
new file mode 100644
index 000000000..e59a86f09
--- /dev/null
+++ b/tests/fixtures/structural/bin/sg
@@ -0,0 +1,21 @@
+#!/usr/bin/env node
+const args = process.argv.slice(2);
+if (args.includes('--version') || args.includes('--help')) {
+  process.stdout.write('ast-grep stub\n');
+  process.exit(0);
+}
+const payload = {
+  ruleId: 'astgrep.stub',
+  file: 'src/example.ts',
+  matches: [
+    {
+      message: 'stub ast-grep match',
+      range: {
+        start: { line: 4, column: 1 },
+        end: { line: 4, column: 8 }
+      },
+      text: 'eval(x)'
+    }
+  ]
+};
+process.stdout.write(`${JSON.stringify(payload)}\n`);
diff --git a/tests/fixtures/tree-sitter/clike.c b/tests/fixtures/tree-sitter/clike.c
new file mode 100644
index 000000000..f06c63c96
--- /dev/null
+++ b/tests/fixtures/tree-sitter/clike.c
@@ -0,0 +1,5 @@
+struct Widget { int id; };
+
+int greet(int name) {
+  return name;
+}
diff --git a/tests/fixtures/tree-sitter/cpp.cpp b/tests/fixtures/tree-sitter/cpp.cpp
new file mode 100644
index 000000000..fcbddb061
--- /dev/null
+++ b/tests/fixtures/tree-sitter/cpp.cpp
@@ -0,0 +1,4 @@
+class Widget {
+public:
+  int greet(int name) { return name; }
+};
diff --git a/tests/fixtures/tree-sitter/csharp.cs b/tests/fixtures/tree-sitter/csharp.cs
new file mode 100644
index 000000000..71e8d26fc
--- /dev/null
+++ b/tests/fixtures/tree-sitter/csharp.cs
@@ -0,0 +1,7 @@
+namespace Demo {
+  class Widget {
+    string Greet(string name) {
+      return name;
+    }
+  }
+}
diff --git a/tests/fixtures/tree-sitter/go.go b/tests/fixtures/tree-sitter/go.go
new file mode 100644
index 000000000..1bf52bd1b
--- /dev/null
+++ b/tests/fixtures/tree-sitter/go.go
@@ -0,0 +1,5 @@
+type Widget struct {}
+
+func (w Widget) Greet(name string) string {
+  return name
+}
diff --git a/tests/fixtures/tree-sitter/java.java b/tests/fixtures/tree-sitter/java.java
new file mode 100644
index 000000000..a45ac794c
--- /dev/null
+++ b/tests/fixtures/tree-sitter/java.java
@@ -0,0 +1,5 @@
+class Widget {
+  String greet(String name) {
+    return name;
+  }
+}
diff --git a/tests/fixtures/tree-sitter/kotlin.kt b/tests/fixtures/tree-sitter/kotlin.kt
new file mode 100644
index 000000000..843ec5710 --- /dev/null +++ b/tests/fixtures/tree-sitter/kotlin.kt @@ -0,0 +1,5 @@ +class Widget { + fun greet(name: String): String { + return name + } +} diff --git a/tests/fixtures/tree-sitter/objc.m b/tests/fixtures/tree-sitter/objc.m new file mode 100644 index 000000000..3125bca79 --- /dev/null +++ b/tests/fixtures/tree-sitter/objc.m @@ -0,0 +1,8 @@ +@interface Widget : NSObject +- (void)greet:(NSString *)name; +@end + +@implementation Widget +- (void)greet:(NSString *)name { +} +@end diff --git a/tests/fixtures/tree-sitter/rust.rs b/tests/fixtures/tree-sitter/rust.rs new file mode 100644 index 000000000..2395833bb --- /dev/null +++ b/tests/fixtures/tree-sitter/rust.rs @@ -0,0 +1,7 @@ +struct Widget {} + +impl Widget { + fn greet(&self, name: &str) -> &str { + name + } +} diff --git a/tests/fixtures/tree-sitter/swift.swift b/tests/fixtures/tree-sitter/swift.swift new file mode 100644 index 000000000..3c37acd09 --- /dev/null +++ b/tests/fixtures/tree-sitter/swift.swift @@ -0,0 +1,7 @@ +import Foundation + +class Widget { + func greet(name: String) -> String { + return name + } +} diff --git a/tests/format-fidelity.js b/tests/format-fidelity.js index 9918536e8..d0f6d054b 100644 --- a/tests/format-fidelity.js +++ b/tests/format-fidelity.js @@ -35,11 +35,22 @@ const codeDir = getIndexDir(fixtureRoot, 'code', userConfig); const proseDir = getIndexDir(fixtureRoot, 'prose', userConfig); const codeMeta = JSON.parse(fs.readFileSync(path.join(codeDir, 'chunk_meta.json'), 'utf8')); const proseMeta = JSON.parse(fs.readFileSync(path.join(proseDir, 'chunk_meta.json'), 'utf8')); +const loadFileMap = (dir) => { + const metaPath = path.join(dir, 'file_meta.json'); + if (!fs.existsSync(metaPath)) return new Map(); + const entries = JSON.parse(fs.readFileSync(metaPath, 'utf8')); + return new Map( + (Array.isArray(entries) ? 
entries : []).map((entry) => [entry.id, entry.file]) + ); +}; +const codeFileById = loadFileMap(codeDir); +const proseFileById = loadFileMap(proseDir); -function findChunk(meta, match) { +function findChunk(meta, match, fileById) { return meta.find((chunk) => { - if (!chunk || !chunk.file) return false; - if (match.file && chunk.file !== match.file) return false; + const file = chunk?.file || fileById.get(chunk?.fileId) || null; + if (!chunk || !file) return false; + if (match.file && file !== match.file) return false; if (match.kind && chunk.kind !== match.kind) return false; if (match.nameIncludes && !String(chunk.name || '').includes(match.nameIncludes)) return false; return true; @@ -48,38 +59,53 @@ function findChunk(meta, match) { const failures = []; -if (!findChunk(codeMeta, { file: 'src/config.json', nameIncludes: 'database' })) { +if (!findChunk(codeMeta, { file: 'src/config.json', nameIncludes: 'database' }, codeFileById)) { failures.push('Missing JSON chunk for database.'); } -if (!findChunk(codeMeta, { file: 'src/config.toml', nameIncludes: 'database' })) { +if (!findChunk(codeMeta, { file: 'src/config.toml', nameIncludes: 'database' }, codeFileById)) { failures.push('Missing TOML chunk for database.'); } -if (!findChunk(codeMeta, { file: 'src/config.ini', nameIncludes: 'server' })) { +if (!findChunk(codeMeta, { file: 'src/config.ini', nameIncludes: 'server' }, codeFileById)) { failures.push('Missing INI chunk for server.'); } -if (!findChunk(codeMeta, { file: 'src/config.xml', nameIncludes: 'database' })) { +if (!findChunk(codeMeta, { file: 'src/config.xml', nameIncludes: 'database' }, codeFileById)) { failures.push('Missing XML chunk for database.'); } -if (!findChunk(codeMeta, { file: 'src/Dockerfile', nameIncludes: 'FROM' })) { +if (!findChunk(codeMeta, { file: 'src/Dockerfile', nameIncludes: 'FROM' }, codeFileById)) { failures.push('Missing Dockerfile chunk for FROM.'); } -if (!findChunk(codeMeta, { file: 'src/Makefile', nameIncludes: 'build' })) { +if (!findChunk(codeMeta, { file: 'src/Makefile', nameIncludes: 'build' }, codeFileById)) { failures.push('Missing Makefile chunk for build target.'); } -if (!findChunk(codeMeta, { file: 'src/config.yaml', nameIncludes: 'database' })) { +if (!findChunk(codeMeta, { file: 'src/config.yaml', nameIncludes: 'database' }, codeFileById)) { failures.push('Missing YAML chunk for database.'); } -if (!findChunk(codeMeta, { file: '.github/workflows/ci.yml', nameIncludes: 'build' })) { +if (!findChunk(codeMeta, { file: '.github/workflows/ci.yml', nameIncludes: 'build' }, codeFileById)) { failures.push('Missing GitHub Actions chunk for build job.'); } -if (!findChunk(codeMeta, { file: 'src/unknown.html', kind: 'Blob' })) { - failures.push('Missing fallback blob chunk for unknown.html.'); +if (!findChunk(codeMeta, { file: 'src/unknown.html', kind: 'ElementDeclaration', nameIncludes: 'html' }, codeFileById)) { + failures.push('Missing HTML element chunk for unknown.html.'); +} +if (!findChunk(codeMeta, { file: 'src/unknown.html', kind: 'ConfigSection', nameIncludes: 'settings' }, codeFileById)) { + failures.push('Missing embedded JSON chunk for unknown.html.'); +} +if (!findChunk(codeMeta, { file: 'src/unknown.html', kind: 'ConfigSection', nameIncludes: 'build' }, codeFileById)) { + failures.push('Missing embedded TOML chunk for unknown.html.'); +} +if (!findChunk(codeMeta, { file: 'src/unknown.html', kind: 'ConfigSection', nameIncludes: 'server' }, codeFileById)) { + failures.push('Missing embedded INI chunk for unknown.html.'); +} +if 
(!findChunk(codeMeta, { file: 'src/unknown.html', kind: 'Section', nameIncludes: 'Doc Block' }, codeFileById)) { + failures.push('Missing embedded Markdown chunk for unknown.html.'); +} +if (!findChunk(codeMeta, { file: 'src/styles.css', kind: 'StyleRule', nameIncludes: '.page-header' }, codeFileById)) { + failures.push('Missing CSS chunk for styles.css.'); } -if (!findChunk(proseMeta, { file: 'docs/guide.rst', nameIncludes: 'Guide' })) { +if (!findChunk(proseMeta, { file: 'docs/guide.rst', nameIncludes: 'Guide' }, proseFileById)) { failures.push('Missing RST chunk for Guide.'); } -if (!findChunk(proseMeta, { file: 'docs/manual.adoc', nameIncludes: 'Manual' })) { +if (!findChunk(proseMeta, { file: 'docs/manual.adoc', nameIncludes: 'Manual' }, proseFileById)) { failures.push('Missing AsciiDoc chunk for Manual.'); } diff --git a/tests/git-blame-range.js b/tests/git-blame-range.js new file mode 100644 index 000000000..b7bedcd70 --- /dev/null +++ b/tests/git-blame-range.js @@ -0,0 +1,109 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'git-blame-range'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +const gitCheck = spawnSync('git', ['--version'], { encoding: 'utf8' }); +if (gitCheck.status !== 0) { + console.log('[skip] git not available'); + process.exit(0); +} + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const runGit = (args, label) => { + const result = spawnSync('git', args, { cwd: repoRoot, encoding: 'utf8' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 1); + } +}; + +runGit(['init'], 'git init'); +runGit(['config', 'user.email', 'alpha@example.com'], 'git config email alpha'); +runGit(['config', 'user.name', 'Alpha Author'], 'git config name alpha'); + +const sourcePath = path.join(repoRoot, 'sample.js'); +await fsPromises.writeFile( + sourcePath, + ['function alpha() {', ' return 1;', '}'].join('\n') + '\n' +); +runGit(['add', '.'], 'git add alpha'); +runGit(['commit', '-m', 'alpha'], 'git commit alpha'); + +runGit(['config', 'user.email', 'beta@example.com'], 'git config email beta'); +runGit(['config', 'user.name', 'Beta Author'], 'git config name beta'); +await fsPromises.appendFile( + sourcePath, + ['','function beta() {', ' return 2;', '}'].join('\n') + '\n' +); +runGit(['add', '.'], 'git add beta'); +runGit(['commit', '-m', 'beta'], 'git commit beta'); + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('git blame range test failed: build_index failed'); + process.exit(buildResult.status ?? 
1); +} + +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const meta = JSON.parse(fs.readFileSync(path.join(codeDir, 'chunk_meta.json'), 'utf8')); + +const findChunk = (name) => meta.find((chunk) => chunk.name === name || String(chunk.name || '').includes(name)); +const alphaChunk = findChunk('alpha'); +const betaChunk = findChunk('beta'); +if (!alphaChunk || !betaChunk) { + console.error('Expected alpha and beta chunks in chunk_meta.json'); + process.exit(1); +} +const alphaAuthors = new Set(alphaChunk.chunk_authors || []); +const betaAuthors = new Set(betaChunk.chunk_authors || []); +if (alphaChunk.startLine !== 1 || alphaChunk.endLine !== 3) { + console.error(`Expected alpha chunk line range 1-3, got ${alphaChunk.startLine}-${alphaChunk.endLine}`); + process.exit(1); +} +if (!Number.isFinite(betaChunk.startLine) || betaChunk.startLine < 4) { + console.error(`Expected beta chunk start line >= 4, got ${betaChunk.startLine}`); + process.exit(1); +} +if (!alphaAuthors.has('Alpha Author')) { + console.error(`Expected Alpha Author in alpha chunk authors, got ${Array.from(alphaAuthors).join(', ')}`); + process.exit(1); +} +if (!betaAuthors.has('Beta Author')) { + console.error(`Expected Beta Author in beta chunk authors, got ${Array.from(betaAuthors).join(', ')}`); + process.exit(1); +} +if (alphaAuthors.has('Beta Author')) { + console.error('Unexpected Beta Author in alpha chunk authors (range likely wrong).'); + process.exit(1); +} +if (betaAuthors.has('Alpha Author')) { + console.error('Unexpected Alpha Author in beta chunk authors (range likely wrong).'); + process.exit(1); +} + +console.log('Git blame range test passed'); diff --git a/tests/git-meta.js b/tests/git-meta.js index ae34f1d7b..5c914b57b 100644 --- a/tests/git-meta.js +++ b/tests/git-meta.js @@ -1,7 +1,7 @@ #!/usr/bin/env node import fs from 'node:fs'; import path from 'node:path'; -import { getGitMeta } from '../src/indexer/git.js'; +import { getGitMeta } from '../src/index/git.js'; const root = process.cwd(); const target = path.join(root, 'README.md'); @@ -11,8 +11,8 @@ if (!fs.existsSync(target)) { process.exit(1); } -const blameEnabled = await getGitMeta(target, 0, 0, { blame: true }); -const blameDisabled = await getGitMeta(target, 0, 0, { blame: false }); +const blameEnabled = await getGitMeta(target, 1, 1, { blame: true, baseDir: root }); +const blameDisabled = await getGitMeta(target, 1, 1, { blame: false, baseDir: root }); if (blameDisabled.chunk_authors !== undefined) { console.error('Expected git blame metadata to be disabled, but chunk_authors is present.'); diff --git a/tests/graph-chunk-id.js b/tests/graph-chunk-id.js new file mode 100644 index 000000000..243005322 --- /dev/null +++ b/tests/graph-chunk-id.js @@ -0,0 +1,23 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { buildRelationGraphs } from '../src/index/build/graphs.js'; + +const stableChunkId = 'chunk_graph_1'; +const chunks = [ + { + file: 'src/graph.js', + name: 'buildWidget', + kind: 'Function', + metaV2: { chunkId: stableChunkId }, + codeRelations: { + callLinks: [{ file: 'src/other.js', target: 'helper', kind: 'Function' }] + } + } +]; + +const graphs = buildRelationGraphs({ chunks, fileRelations: new Map() }); +const node = graphs.callGraph.nodes.find((entry) => entry.id === 'src/graph.js::buildWidget'); +assert.ok(node, 'expected call graph node'); +assert.equal(node.chunkId, stableChunkId, 'expected stable chunkId in graph output'); + +console.log('graph chunk id 
test passed'); diff --git a/tests/gtags-ingest.js b/tests/gtags-ingest.js new file mode 100644 index 000000000..5c0cfd329 --- /dev/null +++ b/tests/gtags-ingest.js @@ -0,0 +1,45 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'gtags-ingest'); +const repoRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const inputPath = path.join(root, 'tests', 'fixtures', 'gtags', 'gtags.txt'); +const outPath = path.join(tempRoot, 'gtags.jsonl'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); + +const result = spawnSync( + process.execPath, + [path.join(root, 'tools', 'gtags-ingest.js'), '--repo', repoRoot, '--input', inputPath, '--out', outPath, '--json'], + { encoding: 'utf8' } +); +if (result.status !== 0) { + console.error(result.stderr || result.stdout || 'gtags-ingest failed'); + process.exit(result.status ?? 1); +} + +if (!fs.existsSync(outPath)) { + console.error('gtags output not found'); + process.exit(1); +} + +const lines = fs.readFileSync(outPath, 'utf8').trim().split(/\r?\n/).filter(Boolean); +assert.ok(lines.length >= 2, 'expected gtags output lines'); + +const first = JSON.parse(lines[0]); +assert.equal(first.file, 'src/widget.js'); +assert.equal(first.name, 'Widget'); +assert.equal(first.startLine, 3); +assert.equal(first.source, 'gtags'); + +const metaPath = `${outPath}.meta.json`; +const meta = JSON.parse(fs.readFileSync(metaPath, 'utf8')); +assert.equal(meta.stats.entries, lines.length); + +console.log('gtags ingest test passed'); diff --git a/tests/hnsw-ann.js b/tests/hnsw-ann.js new file mode 100644 index 000000000..6e7d89ce5 --- /dev/null +++ b/tests/hnsw-ann.js @@ -0,0 +1,95 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'hnsw-ann'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + +const config = { + cache: { root: cacheRoot }, + indexing: { + embeddings: { + hnsw: { + enabled: true + } + } + } +}; + +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify(config, null, 2) + '\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +function run(args, label) { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + stdio: 'inherit' + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 
1); + } +} + +run([path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], 'build index'); +run([path.join(root, 'tools', 'build-embeddings.js'), '--stub-embeddings', '--mode', 'code', '--repo', repoRoot], 'build embeddings (code)'); +run([path.join(root, 'tools', 'build-embeddings.js'), '--stub-embeddings', '--mode', 'prose', '--repo', repoRoot], 'build embeddings (prose)'); + +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const proseDir = getIndexDir(repoRoot, 'prose', userConfig); +const codeIndex = path.join(codeDir, 'dense_vectors_hnsw.bin'); +const codeMeta = path.join(codeDir, 'dense_vectors_hnsw.meta.json'); +const proseIndex = path.join(proseDir, 'dense_vectors_hnsw.bin'); +const proseMeta = path.join(proseDir, 'dense_vectors_hnsw.meta.json'); + +if (!fs.existsSync(codeIndex) || !fs.existsSync(codeMeta)) { + console.error('HNSW index missing for code mode.'); + process.exit(1); +} +if (!fs.existsSync(proseIndex) || !fs.existsSync(proseMeta)) { + console.error('HNSW index missing for prose mode.'); + process.exit(1); +} + +const searchResult = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'index', '--backend', 'memory', '--json', '--ann', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); +if (searchResult.status !== 0) { + console.error('search.js failed for HNSW ANN test.'); + if (searchResult.stderr) console.error(searchResult.stderr.trim()); + process.exit(searchResult.status ?? 1); +} + +const payload = JSON.parse(searchResult.stdout || '{}'); +const stats = payload.stats || {}; +if (stats.annBackend !== 'hnsw') { + console.error(`Expected annBackend=hnsw, got ${stats.annBackend}`); + process.exit(1); +} +if (!stats.annHnsw?.available?.code || !stats.annHnsw?.available?.prose) { + console.error('Expected HNSW availability for code and prose.'); + process.exit(1); +} + +console.log('HNSW ANN test passed'); diff --git a/tests/hnsw-atomic.js b/tests/hnsw-atomic.js new file mode 100644 index 000000000..a5251f881 --- /dev/null +++ b/tests/hnsw-atomic.js @@ -0,0 +1,75 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; +import { resolveHnswPaths } from '../src/shared/hnsw.js'; +import { loadChunkMeta, readJsonFile } from '../src/shared/artifact-io.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'hnsw-atomic'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const buildIndex = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildIndex.status !== 0) { + console.error('hnsw atomic test failed: build_index failed'); + process.exit(buildIndex.status ?? 
1); +} + +const userConfig = loadUserConfig(repoRoot); +const codeIndexDir = getIndexDir(repoRoot, 'code', userConfig); +const { indexPath: hnswIndexPath, metaPath: hnswMetaPath } = resolveHnswPaths(codeIndexDir); + +await fsPromises.writeFile(hnswIndexPath, 'stub-index'); +await fsPromises.writeFile(hnswMetaPath, JSON.stringify({ version: 1, dims: 1, count: 0 })); + +const buildEmbeddings = spawnSync( + process.execPath, + [path.join(root, 'tools', 'build-embeddings.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildEmbeddings.status !== 0) { + console.error('hnsw atomic test failed: build-embeddings failed'); + process.exit(buildEmbeddings.status ?? 1); +} + +if (!fs.existsSync(`${hnswIndexPath}.bak`)) { + console.error('hnsw atomic test failed: expected .bak for HNSW index after replace'); + process.exit(1); +} + +const chunkMeta = loadChunkMeta(codeIndexDir); +const meta = readJsonFile(hnswMetaPath); +if (!Number.isFinite(meta?.count) || !Number.isFinite(meta?.expectedCount)) { + console.error('hnsw atomic test failed: missing count fields in HNSW meta'); + process.exit(1); +} +if (meta.count !== meta.expectedCount) { + console.error(`hnsw atomic test failed: count mismatch (${meta.count} vs ${meta.expectedCount})`); + process.exit(1); +} +if (meta.count !== chunkMeta.length) { + console.error(`hnsw atomic test failed: expected ${chunkMeta.length} vectors, got ${meta.count}`); + process.exit(1); +} + +console.log('hnsw atomic tests passed'); diff --git a/tests/hnsw-fallback-and-candidates.js b/tests/hnsw-fallback-and-candidates.js new file mode 100644 index 000000000..f0ea5b7a5 --- /dev/null +++ b/tests/hnsw-fallback-and-candidates.js @@ -0,0 +1,71 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; + +import { loadHnswIndex, rankHnswIndex } from '../src/shared/hnsw.js'; + +{ + // loadHnswIndex should fall back to .bak when the primary exists but is unreadable. + const tmp = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'poc-hnsw-fallback-')); + const primary = path.join(tmp, 'dense_vectors_hnsw.bin'); + const bak = `${primary}.bak`; + await fsPromises.writeFile(primary, 'corrupt'); + await fsPromises.writeFile(bak, 'ok'); + + const readAttempts = []; + class FakeHNSW { + constructor(space, dims) { + this.space = space; + this.dims = dims; + this.ef = 0; + } + readIndexSync(p) { + readAttempts.push(p); + if (p === primary) { + throw new Error('corrupt index'); + } + return true; + } + setEf(ef) { + this.ef = ef; + } + } + + const index = loadHnswIndex({ + indexPath: primary, + dims: 2, + config: { enabled: true, efSearch: 17, space: 'cosine' }, + lib: { HierarchicalNSW: FakeHNSW } + }); + + assert.ok(index, 'expected fallback index to load'); + assert.deepEqual(readAttempts, [primary, bak], 'expected to try primary then .bak'); + assert.equal(index.ef, 17, 'expected efSearch to be applied on loaded index'); + assert.equal(fs.existsSync(bak), true, 'expected .bak to be preserved when used as fallback'); +} + +{ + // rankHnswIndex should treat an empty candidateSet as "no candidates". 
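+  // An empty Set is an explicit "restrict to these ids" filter with no
+  // members, so the expected behavior below is an immediate [] without ever
+  // calling searchKnn, not a fallback to an unfiltered whole-index search.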
+ const calls = []; + const fakeIndex = { + searchKnn: (vec, limit, filter) => { + calls.push({ vec, limit, filter }); + return { neighbors: [1], distances: [0.25] }; + } + }; + + const empty = rankHnswIndex({ index: fakeIndex, space: 'cosine' }, new Float32Array([1, 0]), 5, new Set()); + assert.deepEqual(empty, [], 'expected empty candidate set to yield no results'); + assert.equal(calls.length, 0, 'expected searchKnn to be skipped for empty candidate set'); + + const nonEmpty = rankHnswIndex({ index: fakeIndex, space: 'cosine' }, new Float32Array([1, 0]), 5, new Set([1])); + assert.equal(calls.length, 1, 'expected searchKnn to be invoked'); + assert.equal(Array.isArray(calls[0].vec), true, 'expected query embedding to be coerced to an Array'); + assert.equal(typeof calls[0].filter, 'function', 'expected candidate filter to be passed to searchKnn'); + assert.equal(nonEmpty.length, 1, 'expected a single neighbor'); +} + +console.log('hnsw fallback + candidate-set semantics test passed'); diff --git a/tests/ignore-overrides.js b/tests/ignore-overrides.js new file mode 100644 index 000000000..a7cc54da6 --- /dev/null +++ b/tests/ignore-overrides.js @@ -0,0 +1,40 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { buildIgnoreMatcher } from '../src/index/build/ignore.js'; +import { discoverFiles } from '../src/index/build/discover.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'ignore-overrides'); + +await fs.rm(tempRoot, { recursive: true, force: true }); +await fs.mkdir(path.join(tempRoot, 'dist'), { recursive: true }); +await fs.writeFile(path.join(tempRoot, 'dist', 'allow.js'), 'console.log("ok")'); +await fs.writeFile(path.join(tempRoot, 'dist', 'deny.js'), 'console.log("no")'); + +const { ignoreMatcher } = await buildIgnoreMatcher({ + root: tempRoot, + userConfig: { + extraIgnore: ['!dist/allow.js'] + } +}); + +const entries = await discoverFiles({ + root: tempRoot, + mode: 'code', + ignoreMatcher, + skippedFiles: [], + maxFileBytes: null +}); + +const rels = entries.map((entry) => entry.rel).sort(); +if (!rels.includes('dist/allow.js')) { + console.error('ignore override test failed: allow.js not discovered'); + process.exit(1); +} +if (rels.includes('dist/deny.js')) { + console.error('ignore override test failed: deny.js should be ignored'); + process.exit(1); +} + +console.log('ignore override test passed'); diff --git a/tests/import-links.js b/tests/import-links.js new file mode 100644 index 000000000..8f01277c6 --- /dev/null +++ b/tests/import-links.js @@ -0,0 +1,74 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'import-links'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile(path.join(repoRoot, 'src', 'a.js'), "import x from 'lib';\n"); +await fsPromises.writeFile(path.join(repoRoot, 'src', 'b.js'), "const x = require('lib');\n"); +await fsPromises.writeFile(path.join(repoRoot, 'src', 'c.js'), "import y from 'other';\n"); + 
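+// a.js (ESM) and b.js (CJS) both reference the shared 'lib' specifier, so each
+// should appear in the other's importLinks; c.js imports 'other' and must stay
+// unlinked from the pair.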
+process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('import-links test failed: build_index failed'); + process.exit(buildResult.status ?? 1); +} + +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const relationsPath = path.join(codeDir, 'file_relations.json'); +if (!fs.existsSync(relationsPath)) { + console.error('import-links test failed: file_relations.json missing'); + process.exit(1); +} + +const raw = JSON.parse(fs.readFileSync(relationsPath, 'utf8')); +const map = new Map(raw.map((entry) => [entry.file, entry.relations])); +const relA = map.get('src/a.js'); +const relB = map.get('src/b.js'); + +if (!relA || !Array.isArray(relA.importLinks)) { + console.error('import-links test failed: missing importLinks for a.js'); + process.exit(1); +} +if (!relB || !Array.isArray(relB.importLinks)) { + console.error('import-links test failed: missing importLinks for b.js'); + process.exit(1); +} + +const expected = new Set(['src/a.js', 'src/b.js']); +for (const file of expected) { + if (!relA.importLinks.includes(file)) { + console.error(`import-links test failed: a.js missing link to ${file}`); + process.exit(1); + } +} +if (relA.importLinks.includes('src/c.js')) { + console.error('import-links test failed: a.js should not link to c.js'); + process.exit(1); +} + +console.log('Import links test passed'); diff --git a/tests/import-priority.js b/tests/import-priority.js new file mode 100644 index 000000000..3b2c1a25d --- /dev/null +++ b/tests/import-priority.js @@ -0,0 +1,25 @@ +#!/usr/bin/env node +import { sortImportScanItems } from '../src/index/build/imports.js'; + +const items = [ + { relKey: 'a', stat: { size: 100 }, index: 0 }, + { relKey: 'b', stat: { size: 1000 }, index: 1 }, + { relKey: 'c', stat: { size: 2000 }, index: 2 }, + { relKey: 'd', stat: { size: 150 }, index: 3 } +]; + +const counts = new Map([ + ['a', 10], + ['b', 5], + ['d', 10] +]); + +sortImportScanItems(items, counts); +const order = items.map((item) => item.relKey).join(','); + +if (order !== 'd,a,b,c') { + console.error(`import priority test failed: got ${order}`); + process.exit(1); +} + +console.log('import priority test passed'); diff --git a/tests/incremental-cache-signature.js b/tests/incremental-cache-signature.js new file mode 100644 index 000000000..b4b1c3674 --- /dev/null +++ b/tests/incremental-cache-signature.js @@ -0,0 +1,83 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'incremental-cache-signature'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const configPath = path.join(repoRoot, '.pairofcleats.json'); +const writeConfig = async (controlFlow) => { + await 
fsPromises.writeFile( + configPath, + JSON.stringify({ + indexing: { + controlFlow, + fileListSampleSize: 10, + treeSitter: { enabled: false } + } + }, null, 2) + ); +}; + +await writeConfig(false); +const filePath = path.join(repoRoot, 'src.js'); +await fsPromises.writeFile(filePath, 'function alpha() { return 1; }\n'); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const runBuild = (label) => { + const result = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--incremental', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } + ); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } +}; + +runBuild('initial build'); +runBuild('cache build'); + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const fileListsPath = path.join(codeDir, '.filelists.json'); +if (!fs.existsSync(fileListsPath)) { + console.error('Missing .filelists.json'); + process.exit(1); +} +const fileLists = JSON.parse(await fsPromises.readFile(fileListsPath, 'utf8')); +const cachedEntry = fileLists?.scanned?.sample?.find((entry) => entry?.file?.endsWith('src.js')); +if (!cachedEntry || cachedEntry.cached !== true) { + console.error('Expected cached entry after incremental rebuild'); + process.exit(1); +} + +await writeConfig(true); +runBuild('config signature rebuild'); + +const userConfigAfter = loadUserConfig(repoRoot); +const codeDirAfter = getIndexDir(repoRoot, 'code', userConfigAfter); +const fileListsAfter = JSON.parse(await fsPromises.readFile(path.join(codeDirAfter, '.filelists.json'), 'utf8')); +const rebuildEntry = fileListsAfter?.scanned?.sample?.find((entry) => entry?.file?.endsWith('src.js')); +if (!rebuildEntry || rebuildEntry.cached === true) { + console.error('Expected cache invalidation after config signature change'); + process.exit(1); +} + +console.log('incremental cache signature test passed'); diff --git a/tests/incremental-manifest.js b/tests/incremental-manifest.js new file mode 100644 index 000000000..c1cb49ef3 --- /dev/null +++ b/tests/incremental-manifest.js @@ -0,0 +1,74 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getRepoCacheRoot, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const repoRoot = path.join(root, 'tests', '.cache', 'incremental-manifest'); +const cacheRoot = path.join(repoRoot, '.cache'); +const buildIndexPath = path.join(root, 'build_index.js'); + +await fsPromises.rm(repoRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); + +const filePath = path.join(repoRoot, 'sample.js'); +await fsPromises.writeFile(filePath, 'export function hello() { return 1; }\n'); +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ sqlite: { use: false } }, null, 2) +); + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const run = (args, label) => { + const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, encoding: 'utf8' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) 
console.error(result.stderr.trim()); + process.exit(result.status ?? 1); + } +}; + +run([buildIndexPath, '--incremental', '--stub-embeddings', '--mode', 'code', '--repo', repoRoot], 'initial build'); + +const userConfig = loadUserConfig(repoRoot); +const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const manifestPath = path.join(repoCacheRoot, 'incremental', 'code', 'manifest.json'); +if (!fs.existsSync(manifestPath)) { + console.error('Missing incremental manifest after initial build.'); + process.exit(1); +} + +const manifestBefore = JSON.parse(fs.readFileSync(manifestPath, 'utf8')); +const entryBefore = manifestBefore.files?.['sample.js']; +if (!entryBefore) { + console.error('Missing manifest entry for sample.js.'); + process.exit(1); +} + +const newTime = new Date(Date.now() + 5000); +fs.utimesSync(filePath, newTime, newTime); + +run([buildIndexPath, '--incremental', '--stub-embeddings', '--mode', 'code', '--repo', repoRoot], 'second build'); + +const manifestAfter = JSON.parse(fs.readFileSync(manifestPath, 'utf8')); +const entryAfter = manifestAfter.files?.['sample.js']; +if (!entryAfter) { + console.error('Missing manifest entry after rebuild.'); + process.exit(1); +} + +const statAfter = fs.statSync(filePath); +if (entryAfter.mtimeMs !== statAfter.mtimeMs) { + console.error(`Manifest mtimeMs not updated (${entryAfter.mtimeMs} vs ${statAfter.mtimeMs}).`); + process.exit(1); +} + +console.log('Incremental manifest refresh test passed'); diff --git a/tests/incremental-reuse.js b/tests/incremental-reuse.js new file mode 100644 index 000000000..bda0872bb --- /dev/null +++ b/tests/incremental-reuse.js @@ -0,0 +1,73 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { shouldReuseIncrementalIndex } from '../src/index/build/incremental.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'incremental-reuse'); +const outDir = path.join(tempRoot, 'index'); + +await fs.rm(tempRoot, { recursive: true, force: true }); +await fs.mkdir(path.join(outDir, 'pieces'), { recursive: true }); + +const indexState = { stage: 'stage2' }; +const pieceManifest = { version: 2, pieces: [{ name: 'chunk_meta', path: 'chunk_meta.json' }] }; +await fs.writeFile(path.join(outDir, 'index_state.json'), JSON.stringify(indexState)); +await fs.writeFile(path.join(outDir, 'pieces', 'manifest.json'), JSON.stringify(pieceManifest)); + +const entries = [ + { rel: 'src/a.js', stat: { size: 10, mtimeMs: 123 } }, + { rel: 'src/b.js', stat: { size: 20, mtimeMs: 456 } } +]; + +const manifest = { + files: { + 'src/a.js': { size: 10, mtimeMs: 123 }, + 'src/b.js': { size: 20, mtimeMs: 456 } + } +}; + +const reuse = await shouldReuseIncrementalIndex({ + outDir, + entries, + manifest, + stage: 'stage1' +}); + +if (!reuse) { + console.error('incremental reuse test failed: expected reuse'); + process.exit(1); +} + +const extraManifest = { + files: { + ...manifest.files, + 'src/c.js': { size: 30, mtimeMs: 789 } + } +}; + +const noReuseDeleted = await shouldReuseIncrementalIndex({ + outDir, + entries, + manifest: extraManifest, + stage: 'stage1' +}); + +if (noReuseDeleted) { + console.error('incremental reuse test failed: expected deletion mismatch'); + process.exit(1); +} + +const noReuse = await shouldReuseIncrementalIndex({ + outDir, + entries: [{ rel: 'src/a.js', stat: { size: 11, mtimeMs: 123 } }], + manifest, + stage: 'stage2' +}); + +if (noReuse) { + console.error('incremental reuse test failed: expected mismatch'); + process.exit(1); +} 
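+// Covered above: a clean reuse, a manifest entry with no surviving file (a
+// deletion), and a changed file size; the latter two must both force a rebuild.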
+ +console.log('incremental reuse test passed'); diff --git a/tests/incremental-tokenization-cache.js b/tests/incremental-tokenization-cache.js new file mode 100644 index 000000000..d67bcea3e --- /dev/null +++ b/tests/incremental-tokenization-cache.js @@ -0,0 +1,91 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'incremental-token-cache'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const configPath = path.join(repoRoot, '.pairofcleats.json'); +const writeConfig = async (enableChargrams) => { + await fsPromises.writeFile( + configPath, + JSON.stringify({ + indexing: { + postings: { enableChargrams }, + fileListSampleSize: 10, + treeSitter: { enabled: false } + } + }, null, 2) + ); +}; +await writeConfig(false); + +const filePath = path.join(repoRoot, 'src.js'); +await fsPromises.writeFile(filePath, 'function alpha() { return 1; }\n'); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const runBuild = (label) => { + const result = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--incremental', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } + ); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } +}; + +runBuild('initial build'); +runBuild('cache build'); + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const fileListsPath = path.join(codeDir, '.filelists.json'); +if (!fs.existsSync(fileListsPath)) { + console.error('Missing .filelists.json'); + process.exit(1); +} +const fileLists = JSON.parse(await fsPromises.readFile(fileListsPath, 'utf8')); +const scannedSample = fileLists?.scanned?.sample; +if (!Array.isArray(scannedSample)) { + console.error('Scanned sample payload is not an array'); + process.exit(1); +} +const cachedEntry = scannedSample.find((entry) => entry?.file && entry.file.endsWith('src.js')); +if (!cachedEntry || cachedEntry.cached !== true) { + console.error('Expected cached entry after incremental rebuild'); + process.exit(1); +} + +await writeConfig(true); +runBuild('config change rebuild'); + +const userConfigAfter = loadUserConfig(repoRoot); +const codeDirAfter = getIndexDir(repoRoot, 'code', userConfigAfter); +const fileListsAfter = JSON.parse(await fsPromises.readFile(path.join(codeDirAfter, '.filelists.json'), 'utf8')); +const scannedAfter = fileListsAfter?.scanned?.sample; +const rebuildEntry = Array.isArray(scannedAfter) + ? 
scannedAfter.find((entry) => entry?.file && entry.file.endsWith('src.js')) + : null; +if (!rebuildEntry || rebuildEntry.cached === true) { + console.error('Expected cache invalidation after tokenization config change'); + process.exit(1); +} + +console.log('incremental tokenization cache test passed'); diff --git a/tests/index-cache.js b/tests/index-cache.js new file mode 100644 index 000000000..bd79d9194 --- /dev/null +++ b/tests/index-cache.js @@ -0,0 +1,34 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import { loadIndexWithCache } from '../src/retrieval/index-cache.js'; + +const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'pairofcleats-index-cache-')); +const indexDir = path.join(tempRoot, 'index'); +await fs.mkdir(indexDir, { recursive: true }); + +const writeMeta = async (value) => { + await fs.writeFile(path.join(indexDir, 'chunk_meta.json'), JSON.stringify(value)); +}; + +const cache = new Map(); +let loads = 0; +const loader = () => { + loads += 1; + return { loaded: loads }; +}; + +await writeMeta([{ id: 1 }]); +const first = loadIndexWithCache(cache, indexDir, { modelIdDefault: 'm', fileChargramN: 3 }, loader); +const second = loadIndexWithCache(cache, indexDir, { modelIdDefault: 'm', fileChargramN: 3 }, loader); +assert.equal(loads, 1, 'cache should prevent reloads'); +assert.equal(first.loaded, second.loaded, 'cached result should match'); + +await writeMeta([{ id: 2 }]); +const third = loadIndexWithCache(cache, indexDir, { modelIdDefault: 'm', fileChargramN: 3 }, loader); +assert.equal(loads, 2, 'cache should reload after signature change'); +assert.notEqual(third.loaded, first.loaded, 'reloaded result should differ'); + +console.log('index cache tests passed'); diff --git a/tests/index-lifecycle-contract.js b/tests/index-lifecycle-contract.js new file mode 100644 index 000000000..114db1caf --- /dev/null +++ b/tests/index-lifecycle-contract.js @@ -0,0 +1,75 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'index-lifecycle'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, 'alpha.js'), + 'export const alpha = () => "alpha";\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--mode', 'code', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); + +if (buildResult.status !== 0) { + console.error('Failed: index build for lifecycle contract'); + process.exit(buildResult.status ?? 1); +} + +const validateResult = spawnSync( + process.execPath, + [path.join(root, 'tools', 'index-validate.js'), '--json', '--mode', 'code', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); + +if (validateResult.status !== 0) { + console.error('Failed: index validate for lifecycle contract'); + if (validateResult.stderr) console.error(validateResult.stderr.trim()); + process.exit(validateResult.status ?? 
1); +} + +let payload = null; +try { + payload = JSON.parse(validateResult.stdout || '{}'); +} catch { + console.error('Failed: index validate returned invalid JSON'); + process.exit(1); +} + +if (!payload || typeof payload !== 'object') { + console.error('Failed: index validate payload missing'); + process.exit(1); +} + +if (!payload.ok) { + console.error('Failed: index validate reported issues'); + if (Array.isArray(payload.issues)) { + payload.issues.forEach((issue) => console.error(`- ${issue}`)); + } + process.exit(1); +} + +if (!payload.modes || !payload.modes.code) { + console.error('Failed: index validate missing code mode'); + process.exit(1); +} + +console.log('index lifecycle contract tests passed'); diff --git a/tests/index-lock.js b/tests/index-lock.js new file mode 100644 index 000000000..00a7ee7e0 --- /dev/null +++ b/tests/index-lock.js @@ -0,0 +1,45 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { acquireIndexLock } from '../src/index/build/lock.js'; + +const root = process.cwd(); +const baseDir = path.join(root, 'tests', '.cache', 'index-lock'); +const repoCacheRoot = path.join(baseDir, 'repo'); +const lockDir = path.join(repoCacheRoot, 'locks'); +const lockPath = path.join(lockDir, 'index.lock'); +const staleMs = 24 * 60 * 60 * 1000; + +await fsPromises.rm(baseDir, { recursive: true, force: true }); +await fsPromises.mkdir(lockDir, { recursive: true }); + +await fsPromises.writeFile( + lockPath, + JSON.stringify({ pid: 999999, startedAt: new Date().toISOString() }) +); + +const lock = await acquireIndexLock({ repoCacheRoot, staleMs, log: () => {} }); +if (!lock) { + console.error('index-lock test failed: dead pid lock was not cleared.'); + process.exit(1); +} +await lock.release(); + +await fsPromises.writeFile( + lockPath, + JSON.stringify({ pid: process.pid, startedAt: new Date().toISOString() }) +); + +const lockActive = await acquireIndexLock({ repoCacheRoot, staleMs, log: () => {} }); +if (lockActive) { + await lockActive.release(); + console.error('index-lock test failed: active lock should not be acquired.'); + process.exit(1); +} + +if (fs.existsSync(lockPath)) { + await fsPromises.rm(lockPath, { force: true }); +} + +console.log('index-lock test passed'); diff --git a/tests/index-validate.js b/tests/index-validate.js new file mode 100644 index 000000000..476a11caf --- /dev/null +++ b/tests/index-validate.js @@ -0,0 +1,82 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const cacheRoot = path.join(root, 'tests', '.cache', 'index-validate'); +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const validatorPath = path.join(root, 'tools', 'index-validate.js'); +const buildPath = path.join(root, 'build_index.js'); + +const missingResult = spawnSync( + process.execPath, + [validatorPath, '--repo', fixtureRoot, '--json'], + { env, encoding: 'utf8' } +); +if (missingResult.status === 0) { + console.error('Expected index-validate to fail when indexes are missing.'); + process.exit(1); +} + +const buildResult = spawnSync( + 
process.execPath, + [buildPath, '--stub-embeddings', '--repo', fixtureRoot], + { env, encoding: 'utf8' } +); +if (buildResult.status !== 0) { + console.error('Failed to build fixture index for index-validate test.'); + if (buildResult.stderr) console.error(buildResult.stderr.trim()); + process.exit(buildResult.status ?? 1); +} +const previousCacheRoot = process.env.PAIROFCLEATS_CACHE_ROOT; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(fixtureRoot); +const codeDir = getIndexDir(fixtureRoot, 'code', userConfig); +if (previousCacheRoot === undefined) { + delete process.env.PAIROFCLEATS_CACHE_ROOT; +} else { + process.env.PAIROFCLEATS_CACHE_ROOT = previousCacheRoot; +} +const piecesPath = path.join(codeDir, 'pieces', 'manifest.json'); +try { + await fsPromises.access(piecesPath); +} catch { + console.error('Expected pieces manifest to exist after build.'); + process.exit(1); +} + +const okResult = spawnSync( + process.execPath, + [validatorPath, '--repo', fixtureRoot, '--json'], + { env, encoding: 'utf8' } +); +if (okResult.status !== 0) { + console.error('Expected index-validate to pass after building index.'); + if (okResult.stderr) console.error(okResult.stderr.trim()); + process.exit(okResult.status ?? 1); +} + +let payload; +try { + payload = JSON.parse(okResult.stdout); +} catch { + console.error('index-validate did not return valid JSON.'); + process.exit(1); +} +if (!payload || payload.ok !== true) { + console.error('index-validate JSON payload missing ok=true.'); + process.exit(1); +} + +console.log('index-validate test passed'); diff --git a/tests/indexer-service.js b/tests/indexer-service.js new file mode 100644 index 000000000..149c44455 --- /dev/null +++ b/tests/indexer-service.js @@ -0,0 +1,49 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'indexer-service'); +const repoRoot = path.join(tempRoot, 'repo'); +const queueDir = path.join(tempRoot, 'queue'); +const configPath = path.join(tempRoot, 'service.json'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); + +const config = { + queueDir, + repos: [ + { id: 'repo', path: repoRoot, syncPolicy: 'none' } + ] +}; +await fsPromises.writeFile(configPath, JSON.stringify(config, null, 2)); + +const enqueue = spawnSync( + process.execPath, + [path.join(root, 'tools', 'indexer-service.js'), 'enqueue', '--config', configPath, '--repo', repoRoot, '--mode', 'code'], + { encoding: 'utf8' } +); +if (enqueue.status !== 0) { + console.error(enqueue.stderr || enqueue.stdout || 'indexer-service enqueue failed'); + process.exit(enqueue.status ?? 1); +} + +const status = spawnSync( + process.execPath, + [path.join(root, 'tools', 'indexer-service.js'), 'status', '--config', configPath], + { encoding: 'utf8' } +); +if (status.status !== 0) { + console.error(status.stderr || status.stdout || 'indexer-service status failed'); + process.exit(status.status ?? 
1); +} + +const payload = JSON.parse(status.stdout || '{}'); +assert.equal(payload.queue?.queued, 1); +assert.ok(fs.existsSync(path.join(queueDir, 'queue.json'))); + +console.log('indexer service test passed'); diff --git a/tests/indexer/incremental-plan.test.js b/tests/indexer/incremental-plan.test.js new file mode 100644 index 000000000..ba266cca7 --- /dev/null +++ b/tests/indexer/incremental-plan.test.js @@ -0,0 +1,54 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { shouldReuseIncrementalIndex } from '../../src/index/build/incremental.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const root = process.cwd(); +const baseDir = path.join(root, 'tests', '.cache', 'indexer-plan'); +const outDir = path.join(baseDir, 'out'); +const piecesDir = path.join(outDir, 'pieces'); +const fixtureFile = path.join(baseDir, 'src', 'a.js'); + +const setup = async () => { + await fs.rm(baseDir, { recursive: true, force: true }); + await fs.mkdir(path.dirname(fixtureFile), { recursive: true }); + await fs.writeFile(fixtureFile, 'const a = 1;\n'); + await fs.mkdir(piecesDir, { recursive: true }); + await fs.writeFile(path.join(outDir, 'index_state.json'), JSON.stringify({ stage: 'stage2' })); + await fs.writeFile(path.join(piecesDir, 'manifest.json'), JSON.stringify({ pieces: [{ id: 'piece-1' }] })); +}; + +const run = async () => { + await setup(); + const stat = await fs.stat(fixtureFile); + const entries = [{ rel: 'src/a.js', stat }]; + const manifest = { files: { 'src/a.js': { size: stat.size, mtimeMs: stat.mtimeMs } } }; + + const reuse = await shouldReuseIncrementalIndex({ outDir, entries, manifest, stage: 'stage2' }); + if (!reuse) { + fail('shouldReuseIncrementalIndex should return true for matching manifest entries.'); + } + + const stageMismatch = await shouldReuseIncrementalIndex({ outDir, entries, manifest, stage: 'stage3' }); + if (stageMismatch) { + fail('shouldReuseIncrementalIndex should fail when stage is not satisfied.'); + } + + const manifestMismatch = { files: { 'src/a.js': { size: stat.size + 1, mtimeMs: stat.mtimeMs } } }; + const reuseMismatch = await shouldReuseIncrementalIndex({ outDir, entries, manifest: manifestMismatch, stage: 'stage2' }); + if (reuseMismatch) { + fail('shouldReuseIncrementalIndex should fail when file sizes differ.'); + } +}; + +try { + await run(); + console.log('indexer incremental plan tests passed'); +} finally { + await fs.rm(baseDir, { recursive: true, force: true }); +} diff --git a/tests/indexer/signatures.test.js b/tests/indexer/signatures.test.js new file mode 100644 index 000000000..3d5dfb359 --- /dev/null +++ b/tests/indexer/signatures.test.js @@ -0,0 +1,86 @@ +#!/usr/bin/env node +import { buildIncrementalSignature, buildTokenizationKey } from '../../src/index/build/indexer/signatures.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const baseRuntime = { + commentsConfig: { + licensePattern: /MIT/, + generatedPattern: /@generated/, + linterPattern: /eslint/ + }, + dictConfig: { splitCase: true }, + postingsConfig: { enablePhraseNgrams: true }, + dictSignature: 'sig-a', + segmentsConfig: { enabled: true } +}; + +const tokenKeyA = buildTokenizationKey(baseRuntime, 'code'); +const tokenKeyB = buildTokenizationKey({ ...baseRuntime, dictSignature: 'sig-b' }, 'code'); +if (tokenKeyA === tokenKeyB) { + fail('buildTokenizationKey should reflect dictSignature changes.'); +} + +const runtimeA = { + astDataflowEnabled: true, + 
controlFlowEnabled: false, + lintEnabled: true, + complexityEnabled: true, + riskAnalysisEnabled: false, + riskAnalysisCrossFileEnabled: false, + typeInferenceEnabled: true, + typeInferenceCrossFileEnabled: false, + gitBlameEnabled: true, + indexingConfig: { + riskRules: { foo: 'bar' }, + riskCaps: { max: 1 }, + importScan: 'post' + }, + languageOptions: { + javascript: { parser: 'babel', flow: 'auto' }, + typescript: { parser: 'auto', importsOnly: false }, + treeSitter: { + enabled: true, + languages: { js: true }, + configChunking: true, + maxBytes: 100, + maxLines: 200, + maxParseMs: 300, + byLanguage: {} + }, + yamlChunking: { mode: 'root' }, + kotlin: { flowMaxBytes: 1 } + }, + embeddingEnabled: true, + embeddingService: false, + embeddingMode: 'inline', + embeddingBatchSize: 32, + fileCaps: { default: { maxBytes: 1, maxLines: 2 }, byExt: {}, byLanguage: {} }, + fileScan: { sampleBytes: 64 }, + incrementalBundleFormat: 'json' +}; + +const sigA = buildIncrementalSignature(runtimeA, 'code', tokenKeyA); +const sigB = buildIncrementalSignature({ + ...runtimeA, + languageOptions: { + ...runtimeA.languageOptions, + typescript: { parser: 'typescript', importsOnly: false } + } +}, 'code', tokenKeyA); +if (sigA === sigB) { + fail('buildIncrementalSignature should reflect parser changes.'); +} + +const sigC = buildIncrementalSignature({ + ...runtimeA, + embeddingBatchSize: 64 +}, 'code', tokenKeyA); +if (sigA === sigC) { + fail('buildIncrementalSignature should reflect embedding batch changes.'); +} + +console.log('indexer signatures tests passed'); diff --git a/tests/inline-embeddings-validation.js b/tests/inline-embeddings-validation.js new file mode 100644 index 000000000..ad89828cb --- /dev/null +++ b/tests/inline-embeddings-validation.js @@ -0,0 +1,47 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; + +import { attachEmbeddings } from '../src/index/build/file-processor/embeddings.js'; + +{ + const chunks = [{}, {}]; + await assert.rejects( + () => attachEmbeddings({ + chunks, + codeTexts: ['a', 'b'], + docTexts: ['', 'doc'], + embeddingEnabled: true, + embeddingMode: 'both', + embeddingBatchSize: 16, + runEmbedding: async (fn) => await fn(), + getChunkEmbedding: async () => [1, 2], + getChunkEmbeddings: async () => [[1, 2], [1, 2, 3]] + }), + /dims mismatch/i, + 'expected inline embedding attachment to fail fast on dims mismatch' + ); +} + +{ + const chunks = [{}, {}]; + const res = await attachEmbeddings({ + chunks, + codeTexts: ['a', 'b'], + docTexts: ['', 'doc'], + embeddingEnabled: true, + embeddingMode: 'both', + embeddingBatchSize: 16, + runEmbedding: async (fn) => await fn(), + getChunkEmbedding: async () => [9, 9, 9], + getChunkEmbeddings: async (texts) => texts.map((_, i) => new Float32Array([i, i + 1, i + 2])) + }); + + assert.ok(res && Number.isFinite(res.embeddingMs), 'expected timing result'); + assert.equal(chunks.length, 2); + assert.equal(chunks[0].embed_code.length, 3); + assert.equal(chunks[0].embed_doc.length, 3, 'expected zero doc vector when doc text is missing'); + assert.equal(chunks[0].embedding.length, 3); + assert.equal(chunks[1].embed_doc.length, 3, 'expected doc embedding vector'); +} + +console.log('inline embeddings validation test passed'); diff --git a/tests/io-concurrency-cap.js b/tests/io-concurrency-cap.js new file mode 100644 index 000000000..ed8bb7410 --- /dev/null +++ b/tests/io-concurrency-cap.js @@ -0,0 +1,23 @@ +#!/usr/bin/env node +import { resolveThreadLimits } from '../src/shared/threads.js'; + +const threadLimits = 
resolveThreadLimits({ + configConcurrency: 8, + importConcurrencyConfig: 8, + ioConcurrencyCapConfig: 16 +}); + +if (threadLimits.ioConcurrency !== 16) { + throw new Error(`io-concurrency-cap test failed: expected ioConcurrency=16, got ${threadLimits.ioConcurrency}`); +} + +// Verify cap is not increasing concurrency +const uncapped = resolveThreadLimits({ + configConcurrency: 8, + importConcurrencyConfig: 8 +}); +if (uncapped.ioConcurrency < threadLimits.ioConcurrency) { + throw new Error(`io-concurrency-cap test failed: uncapped ioConcurrency=${uncapped.ioConcurrency} should be >= capped ioConcurrency=${threadLimits.ioConcurrency}`); +} + +console.log('io-concurrency-cap test passed'); diff --git a/tests/json-stream.js b/tests/json-stream.js new file mode 100644 index 000000000..b2128088d --- /dev/null +++ b/tests/json-stream.js @@ -0,0 +1,47 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { writeJsonArrayFile, writeJsonObjectFile } from '../src/shared/json-stream.js'; + +const root = process.cwd(); +const outDir = path.join(root, 'tests', '.cache', 'json-stream'); +await fs.rm(outDir, { recursive: true, force: true }); +await fs.mkdir(outDir, { recursive: true }); + +const arrayPath = path.join(outDir, 'array.json'); +const arrayInput = [ + { id: 1, name: 'alpha' }, + { id: 2, name: 'beta' } +]; +await writeJsonArrayFile(arrayPath, arrayInput); +const arrayParsed = JSON.parse(await fs.readFile(arrayPath, 'utf8')); +if (JSON.stringify(arrayParsed) !== JSON.stringify(arrayInput)) { + console.error('json-stream array test failed: parsed output mismatch.'); + process.exit(1); +} + +const objPath = path.join(outDir, 'object.json'); +const fields = { model: 'test', dims: 2, scale: 1 }; +const arrays = { + vectors: [ + [1, 2], + [3, 4] + ], + vocab: ['foo', 'bar'] +}; +await writeJsonObjectFile(objPath, { fields, arrays }); +const objParsed = JSON.parse(await fs.readFile(objPath, 'utf8')); +if (objParsed.model !== fields.model || objParsed.dims !== fields.dims || objParsed.scale !== fields.scale) { + console.error('json-stream object test failed: fields mismatch.'); + process.exit(1); +} +if (!Array.isArray(objParsed.vectors) || objParsed.vectors.length !== arrays.vectors.length) { + console.error('json-stream object test failed: vectors mismatch.'); + process.exit(1); +} +if (!Array.isArray(objParsed.vocab) || objParsed.vocab.length !== arrays.vocab.length) { + console.error('json-stream object test failed: vocab mismatch.'); + process.exit(1); +} + +console.log('json-stream test passed'); diff --git a/tests/jsonrpc-parser.js b/tests/jsonrpc-parser.js new file mode 100644 index 000000000..60811baad --- /dev/null +++ b/tests/jsonrpc-parser.js @@ -0,0 +1,31 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { createFramedJsonRpcParser } from '../src/shared/jsonrpc.js'; + +const frame = (payload) => { + const body = JSON.stringify(payload); + return Buffer.from(`Content-Length: ${Buffer.byteLength(body)}\r\n\r\n${body}`); +}; + +const messages = []; +const errors = []; +const parser = createFramedJsonRpcParser({ + onMessage: (msg) => messages.push(msg), + onError: (err) => errors.push(err), + maxBufferBytes: 256, + maxHeaderBytes: 128, + maxMessageBytes: 64 +}); + +parser.push(frame({ jsonrpc: '2.0', id: 1, result: 'ok' })); +assert.equal(messages.length, 1, 'expected one message before overflow'); +assert.equal(errors.length, 0, 'did not expect errors for valid payload'); + +parser.push(frame({ jsonrpc: '2.0', id: 2, 
result: 'x'.repeat(200) })); +assert.equal(errors.length, 1, 'expected overflow error'); +assert.ok(errors[0]?.message?.includes('exceeded'), 'error message should mention size limit'); + +parser.push(frame({ jsonrpc: '2.0', id: 3, result: 'ok' })); +assert.equal(messages.length, 1, 'parser should stop after overflow'); + +console.log('jsonrpc parser tests passed'); diff --git a/tests/kotlin-perf-guard.js b/tests/kotlin-perf-guard.js new file mode 100644 index 000000000..a129e3eb0 --- /dev/null +++ b/tests/kotlin-perf-guard.js @@ -0,0 +1,46 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { buildKotlinChunks, buildKotlinRelations, computeKotlinFlow, getKotlinFileStats } from '../src/lang/kotlin.js'; + +const text = 'class Widget { fun render(a: Int): Int { if (a > 0) { foo() } return a } }\n'; +const chunks = buildKotlinChunks(text, {}) || []; +const target = chunks.find((chunk) => chunk.kind === 'MethodDeclaration' || chunk.kind === 'FunctionDeclaration'); +if (!target) { + console.error('Missing Kotlin function chunk for perf guard test.'); + process.exit(1); +} + +const stats = getKotlinFileStats(text); +const fullOptions = { + stats, + kotlin: { + flowMaxBytes: 10 * 1024, + flowMaxLines: 100, + relationsMaxBytes: 10 * 1024, + relationsMaxLines: 100 + } +}; +const skipOptions = { + stats, + kotlin: { + flowMaxBytes: 1, + flowMaxLines: 1, + relationsMaxBytes: 1, + relationsMaxLines: 1 + } +}; + +const flowFull = computeKotlinFlow(text, target, { ...fullOptions, dataflow: true, controlFlow: true }); +assert.ok(flowFull && flowFull.controlFlow, 'Expected flow metadata for Kotlin chunk.'); + +const flowSkipped = computeKotlinFlow(text, target, { ...skipOptions, dataflow: true, controlFlow: true }); +assert.equal(flowSkipped, null, 'Expected flow metadata to be skipped for large Kotlin file.'); + +const relationsFull = buildKotlinRelations(text, {}, chunks, fullOptions); +assert.ok(relationsFull.calls.some((entry) => entry[1] && entry[1].includes('foo')), + 'Expected Kotlin calls to include foo().'); + +const relationsSkipped = buildKotlinRelations(text, {}, chunks, skipOptions); +assert.equal(relationsSkipped.calls.length, 0, 'Expected Kotlin relations to be skipped.'); + +console.log('kotlin perf guard test passed'); diff --git a/tests/lancedb-ann.js b/tests/lancedb-ann.js new file mode 100644 index 000000000..bb2332221 --- /dev/null +++ b/tests/lancedb-ann.js @@ -0,0 +1,94 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { tryImport } from '../src/shared/optional-deps.js'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'lancedb-ann'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +const lanceAvailable = (await tryImport('@lancedb/lancedb')).ok; +if (!lanceAvailable) { + console.warn('lancedb missing; skipping lancedb-ann test.'); + process.exit(0); +} + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + +const config = { + cache: { root: cacheRoot } +}; + +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify(config, null, 2) + '\n' +); 
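+// The repo-local .pairofcleats.json written above pins this test's cache under the temporary cache root.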
+ +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const run = (args, label) => { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + stdio: 'inherit' + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } +}; + +run([path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], 'build index'); + +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const proseDir = getIndexDir(repoRoot, 'prose', userConfig); +const codeDb = path.join(codeDir, 'dense_vectors.lancedb'); +const proseDb = path.join(proseDir, 'dense_vectors.lancedb'); +const codeMeta = path.join(codeDir, 'dense_vectors.lancedb.meta.json'); +const proseMeta = path.join(proseDir, 'dense_vectors.lancedb.meta.json'); + +if (!fs.existsSync(codeDb) || !fs.existsSync(codeMeta)) { + console.error('LanceDB index missing for code mode.'); + process.exit(1); +} +if (!fs.existsSync(proseDb) || !fs.existsSync(proseMeta)) { + console.error('LanceDB index missing for prose mode.'); + process.exit(1); +} + +const searchResult = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'index', '--backend', 'memory', '--json', '--ann', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); +if (searchResult.status !== 0) { + console.error('search.js failed for LanceDB ANN test.'); + if (searchResult.stderr) console.error(searchResult.stderr.trim()); + process.exit(searchResult.status ?? 1); +} + +const payload = JSON.parse(searchResult.stdout || '{}'); +const stats = payload.stats || {}; +if (stats.annBackend !== 'lancedb') { + console.error(`Expected annBackend=lancedb, got ${stats.annBackend}`); + process.exit(1); +} +if (!stats.annLance?.available?.code || !stats.annLance?.available?.prose) { + console.error('Expected LanceDB availability for code and prose.'); + process.exit(1); +} + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +console.log('LanceDB ANN test passed'); diff --git a/tests/lang-filter.js b/tests/lang-filter.js new file mode 100644 index 000000000..35a9be944 --- /dev/null +++ b/tests/lang-filter.js @@ -0,0 +1,24 @@ +import assert from 'node:assert/strict'; +import { mergeExtFilters, normalizeLangFilter } from '../src/retrieval/filters.js'; + +const js = normalizeLangFilter('js'); +assert.ok(js && js.includes('.js'), 'expected js to include .js'); +assert.ok(js && js.includes('.jsx'), 'expected js to include .jsx'); + +const mixed = normalizeLangFilter('ts,python'); +assert.ok(mixed && mixed.includes('.ts'), 'expected mixed to include .ts'); +assert.ok(mixed && mixed.includes('.py'), 'expected mixed to include .py'); + +const extFilter = ['.ts', '.tsx']; +const langFilter = normalizeLangFilter('typescript'); +const merged = mergeExtFilters(extFilter, langFilter); +assert.ok(merged, 'expected merged to be non-null'); +assert.deepEqual(new Set(merged), new Set(extFilter)); + +const mergedEmpty = mergeExtFilters(['.ts'], normalizeLangFilter('python')); +assert.equal(mergedEmpty, null); + +const unknown = normalizeLangFilter('unknown'); +assert.equal(unknown, null); + +console.log('lang filter test passed'); diff --git a/tests/lang/js-chunking.test.js b/tests/lang/js-chunking.test.js new file mode 100644 index 000000000..69b191073 --- /dev/null +++ b/tests/lang/js-chunking.test.js @@ -0,0 +1,33 @@ +#!/usr/bin/env node +import { buildJsChunks } from '../../src/lang/javascript.js'; + 
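+// Sample source covering exported functions, class methods, static methods, arrow functions, a default export, and a CommonJS export.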
+const source = [
+  'export function alpha() {}',
+  'class Foo {',
+  ' method() {}',
+  ' static bar() {}',
+  '}',
+  'const beta = () => {};',
+  'export default function gamma() {}',
+  'exports.qux = function() {};'
+].join('\n');
+
+const chunks = buildJsChunks(source) || [];
+const names = new Set(chunks.map((chunk) => chunk.name));
+
+const expect = (condition, message) => {
+  if (!condition) {
+    console.error(message);
+    process.exit(1);
+  }
+};
+
+expect(names.has('alpha'), 'Missing exported function chunk (alpha).');
+expect(names.has('Foo'), 'Missing class chunk (Foo).');
+expect(names.has('Foo.method'), 'Missing class method chunk (Foo.method).');
+expect(names.has('Foo.bar'), 'Missing class method chunk (Foo.bar).');
+expect(names.has('beta'), 'Missing arrow function chunk (beta).');
+expect(names.has('gamma'), 'Missing default function chunk (gamma).');
+expect(names.has('exports.qux') || names.has('qux'), 'Missing assignment function chunk (exports.qux).');
+
+console.log('JS chunking test passed.');
diff --git a/tests/lang/js-imports.test.js b/tests/lang/js-imports.test.js
new file mode 100644
index 000000000..0cb7d9431
--- /dev/null
+++ b/tests/lang/js-imports.test.js
@@ -0,0 +1,22 @@
+#!/usr/bin/env node
+import { collectImports } from '../../src/lang/javascript.js';
+
+const source = [
+  "import fs from 'fs';",
+  "import { join as joinPath } from 'path';",
+  "export * from 'module-a';",
+  "export { foo } from 'module-b';",
+  "const mod = require('module-c');",
+  "async function load() { return import('module-d'); }"
+].join('\n');
+
+const imports = collectImports(source);
+const sorted = imports.slice().sort();
+const expected = ['fs', 'path', 'module-a', 'module-b', 'module-c', 'module-d'].sort();
+
+if (JSON.stringify(sorted) !== JSON.stringify(expected)) {
+  console.error(`JS imports mismatch: ${JSON.stringify(sorted)} !== ${JSON.stringify(expected)}`);
+  process.exit(1);
+}
+
+console.log('JS imports test passed.');
diff --git a/tests/lang/js-relations.test.js b/tests/lang/js-relations.test.js
new file mode 100644
index 000000000..c3379b96f
--- /dev/null
+++ b/tests/lang/js-relations.test.js
@@ -0,0 +1,34 @@
+#!/usr/bin/env node
+import { buildCodeRelations } from '../../src/lang/javascript.js';
+
+const source = [
+  "import { readFile } from 'fs';",
+  'export function run(path) {',
+  ' return readFile(path);',
+  '}',
+  'const local = () => run("x");',
+  'module.exports = { run };'
+].join('\n');
+
+const rel = buildCodeRelations(source, 'sample.js', { fs: ['fs.js'] }) || {};
+const calls = Array.isArray(rel.calls) ? rel.calls : [];
+const imports = Array.isArray(rel.imports) ? rel.imports : [];
+const exports = Array.isArray(rel.exports) ?
rel.exports : []; + +const hasCall = calls.some(([from, to]) => from === 'run' && to === 'readFile'); +if (!hasCall) { + console.error(`Missing call relation from run -> readFile: ${JSON.stringify(calls)}`); + process.exit(1); +} + +if (!imports.includes('fs')) { + console.error(`Missing import for fs: ${JSON.stringify(imports)}`); + process.exit(1); +} + +if (!exports.includes('run') || !exports.includes('default')) { + console.error(`Missing exports for run/default: ${JSON.stringify(exports)}`); + process.exit(1); +} + +console.log('JS relations test passed.'); diff --git a/tests/lang/python-heuristic-chunking.test.js b/tests/lang/python-heuristic-chunking.test.js new file mode 100644 index 000000000..08ab79189 --- /dev/null +++ b/tests/lang/python-heuristic-chunking.test.js @@ -0,0 +1,40 @@ +#!/usr/bin/env node +import { buildPythonHeuristicChunks } from '../../src/lang/python.js'; + +const sample = [ + 'class Foo:', + ' def method(self):', + ' pass', + '', + 'def top():', + ' pass', + '', + 'async def later():', + ' pass' +].join('\n'); + +const chunks = buildPythonHeuristicChunks(sample) || []; +const byName = Object.fromEntries(chunks.map((chunk) => [chunk.name, chunk])); + +const expect = (condition, message) => { + if (!condition) { + console.error(message); + process.exit(1); + } +}; + +expect(byName.Foo, 'Missing class chunk for Foo.'); +expect(byName['Foo.method'], 'Missing method chunk for Foo.method.'); +expect(byName.top, 'Missing function chunk for top.'); +expect(byName.later, 'Missing function chunk for later.'); + +expect(byName.Foo.meta.startLine === 1, 'Foo startLine mismatch.'); +expect(byName.Foo.meta.endLine === 5, 'Foo endLine mismatch.'); +expect(byName['Foo.method'].meta.startLine === 2, 'Foo.method startLine mismatch.'); +expect(byName['Foo.method'].meta.endLine === 5, 'Foo.method endLine mismatch.'); +expect(byName.top.meta.startLine === 5, 'top startLine mismatch.'); +expect(byName.top.meta.endLine === 8, 'top endLine mismatch.'); +expect(byName.later.meta.startLine === 8, 'later startLine mismatch.'); +expect(byName.later.meta.endLine === 9, 'later endLine mismatch.'); + +console.log('Python heuristic chunking test passed.'); diff --git a/tests/lang/python-imports.test.js b/tests/lang/python-imports.test.js new file mode 100644 index 000000000..8c94eb682 --- /dev/null +++ b/tests/lang/python-imports.test.js @@ -0,0 +1,37 @@ +#!/usr/bin/env node +import { collectPythonImports } from '../../src/lang/python.js'; + +const source = [ + 'import os, sys as system', + 'import json', + 'from collections import defaultdict, namedtuple as nt', + 'from foo.bar import Baz as Qux, Quux', + '# from ignored import nope' +].join('\n'); + +const { imports, usages } = collectPythonImports(source); +const sorted = (items) => items.slice().sort(); + +const expectSet = (label, actual, expected) => { + const actualSorted = sorted(actual); + const expectedSorted = sorted(expected); + const actualText = JSON.stringify(actualSorted); + const expectedText = JSON.stringify(expectedSorted); + if (actualText !== expectedText) { + console.error(`${label} mismatch: ${actualText} !== ${expectedText}`); + process.exit(1); + } +}; + +expectSet('imports', imports, ['os', 'sys', 'json', 'collections', 'foo.bar']); +expectSet('usages', usages, [ + 'system', + 'defaultdict', + 'namedtuple', + 'nt', + 'Baz', + 'Qux', + 'Quux' +]); + +console.log('Python imports test passed.'); diff --git a/tests/lang/python-pool.test.js b/tests/lang/python-pool.test.js new file mode 100644 index 000000000..c2a72491f 
--- /dev/null
+++ b/tests/lang/python-pool.test.js
@@ -0,0 +1,32 @@
+#!/usr/bin/env node
+import { getPythonAst, shutdownPythonAstPool } from '../../src/lang/python.js';
+import { findPythonExecutable } from '../../src/lang/python/executable.js';
+
+const sample = 'def add(a: int, b: int) -> int:\n return a + b\n';
+const originalPath = process.env.PATH;
+process.env.PATH = '';
+
+const pythonBin = await findPythonExecutable();
+if (pythonBin) {
+  const ast = await getPythonAst(sample, null, {
+    pythonAst: { workerCount: 1, maxWorkers: 1, taskTimeoutMs: 5000 }
+  });
+  if (!ast || !Array.isArray(ast.defs)) {
+    console.error('Expected AST payload when python is available.');
+    process.exit(1);
+  }
+} else {
+  const ast = await getPythonAst(sample, null, {
+    pythonAst: { workerCount: 1, maxWorkers: 1, taskTimeoutMs: 5000 }
+  });
+  if (ast !== null) {
+    console.error('Expected null AST when python is not available.');
+    process.exit(1);
+  }
+}
+
+shutdownPythonAstPool();
+shutdownPythonAstPool();
+process.env.PATH = originalPath;
+
+console.log('Python pool test passed.');
diff --git a/tests/language-fidelity.js b/tests/language-fidelity.js
index 2e3c70e7f..74ce5564c 100644
--- a/tests/language-fidelity.js
+++ b/tests/language-fidelity.js
@@ -16,10 +16,12 @@ await fsPromises.mkdir(cacheRoot, { recursive: true });
 const env = {
   ...process.env,
   PAIROFCLEATS_CACHE_ROOT: cacheRoot,
-  PAIROFCLEATS_EMBEDDINGS: 'stub'
+  PAIROFCLEATS_EMBEDDINGS: 'stub',
+  PAIROFCLEATS_WORKER_POOL: 'off'
 };
 process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot;
 process.env.PAIROFCLEATS_EMBEDDINGS = 'stub';
+process.env.PAIROFCLEATS_WORKER_POOL = 'off';
 const repoArgs = ['--repo', fixtureRoot];
 
 function run(args, label) {
@@ -69,19 +71,103 @@ if (!fs.existsSync(chunkMetaPath)) {
 }
 const chunkMeta = JSON.parse(fs.readFileSync(chunkMetaPath, 'utf8'));
+const fileMetaPath = path.join(codeDir, 'file_meta.json');
+const fileMeta = fs.existsSync(fileMetaPath)
+  ? JSON.parse(fs.readFileSync(fileMetaPath, 'utf8'))
+  : [];
+const failures = [];
+const extractPostings = (payload) => {
+  if (!payload || typeof payload !== 'object') return [];
+  if (Array.isArray(payload.postings)) return payload.postings;
+  if (Array.isArray(payload.arrays?.postings)) return payload.arrays.postings;
+  return [];
+};
+const validateTokenPostings = (payload, label) => {
+  const postings = extractPostings(payload);
+  let badEntry = null;
+  for (let i = 0; i < postings.length; i += 1) {
+    const list = postings[i];
+    if (!Array.isArray(list)) continue;
+    for (let j = 0; j < list.length; j += 1) {
+      const entry = list[j];
+      if (!Array.isArray(entry)) continue;
+      const count = entry[1];
+      if (!Number.isInteger(count)) {
+        badEntry = { i, j, count };
+        break;
+      }
+    }
+    if (badEntry) break;
+  }
+  if (badEntry) {
+    const labelSuffix = label ?
` (${label})` : ''; + failures.push(`Token postings contain non-integer counts${labelSuffix} at ${badEntry.i}/${badEntry.j}: ${badEntry.count}`); + } +}; +const tokenPostingsPath = path.join(codeDir, 'token_postings.json'); +const tokenPostingsMetaPath = path.join(codeDir, 'token_postings.meta.json'); +if (fs.existsSync(tokenPostingsPath)) { + try { + const tokenPostings = JSON.parse(fs.readFileSync(tokenPostingsPath, 'utf8')); + validateTokenPostings(tokenPostings, 'token_postings.json'); + } catch { + failures.push('Token postings check failed: invalid JSON payload.'); + } +} else if (fs.existsSync(tokenPostingsMetaPath)) { + try { + const tokenMeta = JSON.parse(fs.readFileSync(tokenPostingsMetaPath, 'utf8')); + const parts = Array.isArray(tokenMeta?.fields?.parts) ? tokenMeta.fields.parts : []; + if (!parts.length) { + failures.push('Token postings check failed: sharded metadata missing parts list.'); + } else { + for (const part of parts) { + const partPath = path.join(codeDir, part); + if (!fs.existsSync(partPath)) { + failures.push(`Token postings shard missing: ${partPath}`); + continue; + } + try { + const shard = JSON.parse(fs.readFileSync(partPath, 'utf8')); + validateTokenPostings(shard, part); + } catch { + failures.push(`Token postings shard check failed: invalid JSON payload in ${part}.`); + } + } + } + } catch { + failures.push('Token postings check failed: invalid sharded metadata payload.'); + } +} +const fileById = new Map( + (Array.isArray(fileMeta) ? fileMeta : []).map((entry) => [entry.id, entry.file]) +); +const resolveChunkFile = (chunk) => chunk?.file || fileById.get(chunk?.fileId) || null; +const fileRelationsPath = path.join(codeDir, 'file_relations.json'); +let fileRelations = null; +if (fs.existsSync(fileRelationsPath)) { + try { + const raw = JSON.parse(fs.readFileSync(fileRelationsPath, 'utf8')); + if (Array.isArray(raw)) { + fileRelations = new Map(); + raw.forEach((entry) => { + if (entry?.file) fileRelations.set(entry.file, entry.relations || null); + }); + } + } catch {} +} +const getFileRelations = (file) => (fileRelations?.get(file) || null); function findChunk(match) { return chunkMeta.find((chunk) => { - if (!chunk || !chunk.file) return false; - if (match.file && chunk.file !== match.file) return false; + const file = resolveChunkFile(chunk); + if (!chunk || !file) return false; + if (match.file && file !== match.file) return false; if (match.kind && chunk.kind !== match.kind) return false; if (match.nameIncludes && !String(chunk.name || '').includes(match.nameIncludes)) return false; return true; }); } -const failures = []; - const branchSearch = runSearch( [searchPath, 'load', '--json', '--mode', 'code', '--branches', '1', '--no-ann'], 'search (branches filter)' @@ -172,6 +258,30 @@ if (asyncPayload) { } } +const fileRegexSearch = runSearch( + [searchPath, 'buildAliases', '--json', '--mode', 'code', '--file', '/javascript_advanced\\.js$/', '--no-ann'], + 'search (file regex filter)' +); +let fileRegexPayload = null; +try { + fileRegexPayload = JSON.parse(fileRegexSearch); +} catch { + failures.push('Search file regex filter failed: invalid JSON output.'); +} +if (fileRegexPayload) { + const fileRegexHits = fileRegexPayload.code || []; + if (!fileRegexHits.length) { + failures.push('Search file regex filter failed: no results for javascript_advanced.js.'); + } else { + const matches = fileRegexHits.every((hit) => + String(hit.file || '').includes('javascript_advanced.js') + ); + if (!matches) { + failures.push('Search file regex filter failed: 
returned non-matching files.'); + } + } +} + const aliasChunk = findChunk({ file: 'src/javascript_advanced.js', nameIncludes: 'buildAliases' }); if (!aliasChunk) { failures.push('Missing JavaScript alias chunk (buildAliases).'); @@ -186,6 +296,27 @@ if (!aliasChunk) { } } +const goDocChunk = findChunk({ file: 'src/go_advanced.go', kind: 'StructDeclaration', nameIncludes: 'Widget' }); +if (!goDocChunk) { + failures.push('Missing Go struct chunk (Widget).'); +} else if (!String(goDocChunk.docmeta?.doc || '').includes('Widget holds a name')) { + failures.push('Go docstring missing for Widget struct.'); +} + +const perlDocChunk = findChunk({ file: 'src/perl_advanced.pl', kind: 'FunctionDeclaration', nameIncludes: 'greet' }); +if (!perlDocChunk) { + failures.push('Missing Perl function chunk (greet).'); +} else if (!String(perlDocChunk.docmeta?.doc || '').includes('Greets a caller')) { + failures.push('Perl docstring missing for greet.'); +} + +const sqlDocChunk = findChunk({ file: 'src/sql_advanced.sql', kind: 'TableDeclaration', nameIncludes: 'widgets' }); +if (!sqlDocChunk) { + failures.push('Missing SQL table chunk (widgets).'); +} else if (!String(sqlDocChunk.docmeta?.doc || '').includes('Widget table')) { + failures.push('SQL docstring missing for widgets.'); +} + const riskChunk = findChunk({ file: 'src/javascript_risk.js', nameIncludes: 'runCommand' }); if (!riskChunk) { failures.push('Missing JavaScript risk chunk (runCommand).'); @@ -311,7 +442,7 @@ if (pythonAvailable) { } const jsWidgetClass = chunkMeta.find((chunk) => { - if (!chunk || chunk.file !== 'src/javascript_advanced.js') return false; + if (!chunk || resolveChunkFile(chunk) !== 'src/javascript_advanced.js') return false; if (chunk.name !== 'Widget') return false; return chunk.kind === 'ClassDeclaration' || chunk.kind === 'ExportedClass' || @@ -470,7 +601,8 @@ const javaMethod = findChunk({ file: 'src/java_advanced.java', kind: 'MethodDecl if (!javaMethod) { failures.push('Missing Java method chunk (Box.add).'); } else { - const imports = javaMethod.codeRelations?.imports || []; + const javaFile = resolveChunkFile(javaMethod); + const imports = javaMethod.codeRelations?.imports || getFileRelations(javaFile)?.imports || []; if (!imports.some((imp) => imp === 'java.util.List')) { failures.push('Java import capture missing java.util.List.'); } @@ -510,7 +642,7 @@ if (!shellFunc) { } const tsClass = chunkMeta.find((chunk) => - chunk.file === 'src/typescript_advanced.ts' && + resolveChunkFile(chunk) === 'src/typescript_advanced.ts' && chunk.kind === 'ClassDeclaration' && chunk.name === 'Widget' ); @@ -537,6 +669,10 @@ const tsAlias = findChunk({ file: 'src/typescript_advanced.ts', kind: 'FunctionD if (!tsAlias) { failures.push('Missing TypeScript alias chunk (buildWidgetAliases).'); } else { + const tsAliases = tsAlias.docmeta?.dataflow?.aliases || []; + if (!tsAliases.includes('name=label') || !tsAliases.includes('copy=items')) { + failures.push('TypeScript alias tracking missing expected aliases for buildWidgetAliases.'); + } const inferredParams = tsAlias.docmeta?.inferredTypes?.params?.label || []; if (!inferredParams.some((entry) => entry.type === 'string')) { failures.push('TypeScript inferredTypes missing string for label param.'); @@ -568,6 +704,10 @@ const rubyMethod = findChunk({ file: 'src/ruby_advanced.rb', kind: 'MethodDeclar if (!rubyMethod) { failures.push('Missing Ruby method chunk (Widget.render).'); } +const gemfileChunk = findChunk({ file: 'src/Gemfile', kind: 'MethodDeclaration', nameIncludes: 'build_widget' 
}); +if (!gemfileChunk) { + failures.push('Missing Gemfile Ruby chunk (build_widget).'); +} const phpMethod = findChunk({ file: 'src/php_advanced.php', kind: 'MethodDeclaration', nameIncludes: 'Widget.render' }); if (!phpMethod) { @@ -582,6 +722,13 @@ if (!luaMethod) { const sqlTable = findChunk({ file: 'src/sql_advanced.sql', kind: 'TableDeclaration', nameIncludes: 'widgets' }); if (!sqlTable) { failures.push('Missing SQL table chunk (widgets).'); +} else { + if (!Array.isArray(sqlTable.docmeta?.dataflow?.reads)) { + failures.push('SQL dataflow missing for widgets.'); + } + if (typeof sqlTable.docmeta?.controlFlow?.branches !== 'number') { + failures.push('SQL control flow missing for widgets.'); + } } const pgTable = findChunk({ file: 'src/sql_postgres.psql', kind: 'TableDeclaration', nameIncludes: 'pg_widgets' }); @@ -605,6 +752,101 @@ if (!sqliteTable) { failures.push('SQLite dialect metadata missing for sqlite_widgets.'); } +const dockerChunk = findChunk({ file: 'src/Dockerfile', nameIncludes: 'FROM' }); +if (!dockerChunk) { + failures.push('Missing Dockerfile chunk (FROM).'); +} + +const makeChunk = findChunk({ file: 'src/Makefile', nameIncludes: 'build' }); +if (!makeChunk) { + failures.push('Missing Makefile chunk (build).'); +} + +const protoChunk = findChunk({ file: 'src/schema.proto', nameIncludes: 'Widget' }); +if (!protoChunk) { + failures.push('Missing Protobuf chunk (Widget).'); +} + +const graphqlChunk = findChunk({ file: 'src/schema.graphql', nameIncludes: 'Widget' }); +if (!graphqlChunk) { + failures.push('Missing GraphQL chunk (Widget).'); +} + +const cmakeChunk = findChunk({ file: 'src/CMakeLists.txt', nameIncludes: 'add_executable' }); +if (!cmakeChunk) { + failures.push('Missing CMake chunk (add_executable).'); +} + +const bazelChunk = findChunk({ file: 'src/BUILD', nameIncludes: 'widget_lib' }); +if (!bazelChunk) { + failures.push('Missing Bazel chunk (widget_lib).'); +} + +const workspaceChunk = findChunk({ file: 'src/WORKSPACE', nameIncludes: 'workspace' }); +if (!workspaceChunk) { + failures.push('Missing Bazel WORKSPACE chunk (workspace).'); +} + +const starlarkChunk = findChunk({ file: 'src/defs.bzl', nameIncludes: 'widget_rule' }); +if (!starlarkChunk) { + failures.push('Missing Starlark chunk (widget_rule).'); +} + +const nixChunk = findChunk({ file: 'src/default.nix', nameIncludes: 'widget' }); +if (!nixChunk) { + failures.push('Missing Nix chunk (widget).'); +} + +const dartChunk = findChunk({ file: 'src/widget.dart', nameIncludes: 'Widget' }); +if (!dartChunk) { + failures.push('Missing Dart chunk (Widget).'); +} + +const scalaChunk = findChunk({ file: 'src/Widget.scala', nameIncludes: 'WidgetFactory' }); +if (!scalaChunk) { + failures.push('Missing Scala chunk (WidgetFactory).'); +} + +const groovyChunk = findChunk({ file: 'src/Widget.groovy', nameIncludes: 'buildWidget' }); +if (!groovyChunk) { + failures.push('Missing Groovy chunk (buildWidget).'); +} + +const rChunk = findChunk({ file: 'src/widget.r', nameIncludes: 'build_widget' }); +if (!rChunk) { + failures.push('Missing R chunk (build_widget).'); +} + +const juliaChunk = findChunk({ file: 'src/widget.jl', nameIncludes: 'build_widget' }); +if (!juliaChunk) { + failures.push('Missing Julia chunk (build_widget).'); +} + +const handlebarsChunk = findChunk({ file: 'src/widget.hbs', nameIncludes: 'widgets' }); +if (!handlebarsChunk) { + failures.push('Missing Handlebars chunk (widgets).'); +} + +const mustacheChunk = findChunk({ file: 'src/widget.mustache', nameIncludes: 'widget' }); +if (!mustacheChunk) 
{
+  failures.push('Missing Mustache chunk (widget).');
+}
+
+const jinjaChunk = findChunk({ file: 'src/widget.jinja2', nameIncludes: 'content' });
+if (!jinjaChunk) {
+  failures.push('Missing Jinja chunk (content).');
+}
+
+const djangoChunk = findChunk({ file: 'src/widget.djhtml', nameIncludes: 'body' });
+if (!djangoChunk) {
+  failures.push('Missing Django template chunk (body).');
+}
+
+const razorChunk = findChunk({ file: 'src/widget.razor', nameIncludes: 'page' });
+if (!razorChunk) {
+  failures.push('Missing Razor chunk (page).');
+}
+
 if (failures.length) {
   failures.forEach((msg) => console.error(msg));
   process.exit(1);
diff --git a/tests/language-registry/collectors.test.js b/tests/language-registry/collectors.test.js
new file mode 100644
index 000000000..0ce762ddc
--- /dev/null
+++ b/tests/language-registry/collectors.test.js
@@ -0,0 +1,133 @@
+#!/usr/bin/env node
+import { collectCmakeImports } from '../../src/index/language-registry/import-collectors/cmake.js';
+import { collectDartImports } from '../../src/index/language-registry/import-collectors/dart.js';
+import { collectDockerfileImports } from '../../src/index/language-registry/import-collectors/dockerfile.js';
+import { collectGraphqlImports } from '../../src/index/language-registry/import-collectors/graphql.js';
+import { collectGroovyImports } from '../../src/index/language-registry/import-collectors/groovy.js';
+import { collectHandlebarsImports } from '../../src/index/language-registry/import-collectors/handlebars.js';
+import { collectJinjaImports } from '../../src/index/language-registry/import-collectors/jinja.js';
+import { collectJuliaImports } from '../../src/index/language-registry/import-collectors/julia.js';
+import { collectMakefileImports } from '../../src/index/language-registry/import-collectors/makefile.js';
+import { collectMustacheImports } from '../../src/index/language-registry/import-collectors/mustache.js';
+import { collectNixImports } from '../../src/index/language-registry/import-collectors/nix.js';
+import { collectProtoImports } from '../../src/index/language-registry/import-collectors/proto.js';
+import { collectRazorImports } from '../../src/index/language-registry/import-collectors/razor.js';
+import { collectRImports } from '../../src/index/language-registry/import-collectors/r.js';
+import { collectScalaImports } from '../../src/index/language-registry/import-collectors/scala.js';
+import { collectStarlarkImports } from '../../src/index/language-registry/import-collectors/starlark.js';
+
+const sort = (list) => list.slice().sort();
+const expectSet = (label, actual, expected) => {
+  const actualSorted = sort(actual);
+  const expectedSorted = sort(expected);
+  if (JSON.stringify(actualSorted) !== JSON.stringify(expectedSorted)) {
+    console.error(`${label} mismatch: ${JSON.stringify(actualSorted)} !== ${JSON.stringify(expectedSorted)}`);
+    process.exit(1);
+  }
+};
+
+const cases = [
+  {
+    label: 'dockerfile',
+    fn: collectDockerfileImports,
+    text: 'FROM node:18 AS base\nCOPY --from=base /src /dst',
+    expected: ['node:18', 'base']
+  },
+  {
+    label: 'makefile',
+    fn: collectMakefileImports,
+    text: 'include shared.mk\n-include local.mk',
+    expected: ['shared.mk', 'local.mk']
+  },
+  {
+    label: 'proto',
+    fn: collectProtoImports,
+    text: 'import \"foo.proto\";\nimport public \"bar.proto\";',
+    expected: ['foo.proto', 'bar.proto']
+  },
+  {
+    label: 'graphql',
+    fn: collectGraphqlImports,
+    text: '#import \"common.graphql\"',
+    expected: ['common.graphql']
+  },
+  {
+    label: 'cmake',
+    fn: collectCmakeImports,
+    text: 'include(foo)\nadd_subdirectory(bar)\nfind_package(Baz)',
+    expected: ['foo', 'bar', 'Baz']
+  },
+  {
+    label: 'starlark',
+    fn: collectStarlarkImports,
+    text: 'load(\"//path:target\", \"x\")',
+    expected: ['//path:target']
+  },
+  {
+    label: 'nix',
+    fn: collectNixImports,
+    text: 'import ./module.nix\ncallPackage ../pkg.nix {}',
+    expected: ['./module.nix', '../pkg.nix']
+  },
+  {
+    label: 'dart',
+    fn: collectDartImports,
+    text: "import 'package:foo/bar.dart';",
+    expected: ['package:foo/bar.dart']
+  },
+  {
+    label: 'scala',
+    fn: collectScalaImports,
+    text: 'import foo.bar.Baz',
+    expected: ['foo.bar.Baz']
+  },
+  {
+    label: 'groovy',
+    fn: collectGroovyImports,
+    text: 'import foo.bar.Baz',
+    expected: ['foo.bar.Baz']
+  },
+  {
+    label: 'r',
+    fn: collectRImports,
+    text: 'library(ggplot2)\nrequire(\"dplyr\")',
+    expected: ['ggplot2', 'dplyr']
+  },
+  {
+    label: 'julia',
+    fn: collectJuliaImports,
+    text: 'using Foo.Bar',
+    expected: ['Foo.Bar']
+  },
+  {
+    label: 'handlebars',
+    fn: collectHandlebarsImports,
+    text: '{{> partial-name}}',
+    expected: ['partial-name']
+  },
+  {
+    label: 'mustache',
+    fn: collectMustacheImports,
+    text: '{{> other}}',
+    expected: ['other']
+  },
+  {
+    label: 'jinja',
+    fn: collectJinjaImports,
+    text: '{% extends \"base.html\" %}',
+    expected: ['base.html']
+  },
+  {
+    label: 'razor',
+    fn: collectRazorImports,
+    text: '@using System.Text',
+    expected: ['System.Text']
+  }
+];
+
+for (const testCase of cases) {
+  const actual = testCase.fn(testCase.text);
+  expectSet(testCase.label, actual, testCase.expected);
+}
+
+console.log('Language registry collectors test passed.');
diff --git a/tests/language-registry/selection.test.js b/tests/language-registry/selection.test.js
new file mode 100644
index 000000000..552c9d6ef
--- /dev/null
+++ b/tests/language-registry/selection.test.js
@@ -0,0 +1,22 @@
+#!/usr/bin/env node
+import { getLanguageForFile } from '../../src/index/language-registry.js';
+
+const expectId = (ext, relPath, expected) => {
+  const lang = getLanguageForFile(ext, relPath);
+  const actual = lang ?
lang.id : null; + if (actual !== expected) { + console.error(`Language mismatch for ${relPath || ext}: ${actual} !== ${expected}`); + process.exit(1); + } +}; + +expectId('.js', 'src/app.js', 'javascript'); +expectId('.ts', 'src/app.ts', 'typescript'); +expectId('.tsx', 'src/App.tsx', 'typescript'); +expectId('.py', 'src/app.py', 'python'); +expectId('.rs', 'src/lib.rs', 'rust'); +expectId('.go', 'src/main.go', 'go'); +expectId('.hbs', 'templates/view.hbs', 'handlebars'); +expectId('.dockerfile', 'Dockerfile.dockerfile', 'dockerfile'); + +console.log('Language registry selection test passed.'); diff --git a/tests/lmdb-backend.js b/tests/lmdb-backend.js new file mode 100644 index 000000000..1555a97c4 --- /dev/null +++ b/tests/lmdb-backend.js @@ -0,0 +1,103 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { Unpackr } from 'msgpackr'; +import { LMDB_META_KEYS, LMDB_SCHEMA_VERSION } from '../src/storage/lmdb/schema.js'; +import { resolveLmdbPaths } from '../tools/dict-utils.js'; + +let open = null; +try { + ({ open } = await import('lmdb')); +} catch (err) { + console.error(`lmdb missing: ${err?.message || err}`); + process.exit(1); +} + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'lmdb-backend'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile(path.join(repoRoot, 'alpha.js'), 'const alpha = 1;\\n'); +await fsPromises.writeFile(path.join(repoRoot, 'beta.js'), 'const beta = 2;\\n'); +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ indexing: { treeSitter: { enabled: false } } }, null, 2) +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const runNode = (label, args) => { + const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, stdio: 'inherit' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } +}; + +runNode('build_index', [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot]); +runNode('build_lmdb_index', [path.join(root, 'tools', 'build-lmdb-index.js'), '--mode', 'code', '--repo', repoRoot]); + +const lmdbPaths = resolveLmdbPaths(repoRoot, {}); +const dbPath = lmdbPaths.codePath; +const dataPath = path.join(dbPath, 'data.mdb'); +if (!fs.existsSync(dataPath)) { + console.error(`Expected LMDB data file to exist at ${dataPath}`); + process.exit(1); +} + +const db = open({ path: dbPath, readOnly: true }); +const unpackr = new Unpackr(); +const decode = (value) => (value == null ? 
null : unpackr.unpack(value)); +const version = decode(db.get(LMDB_META_KEYS.schemaVersion)); +if (version !== LMDB_SCHEMA_VERSION) { + console.error(`Expected LMDB schema version ${LMDB_SCHEMA_VERSION}, got ${version}`); + process.exit(1); +} +const mode = decode(db.get(LMDB_META_KEYS.mode)); +if (mode !== 'code') { + console.error(`Expected LMDB mode code, got ${mode}`); + process.exit(1); +} +const chunkCount = Number(decode(db.get(LMDB_META_KEYS.chunkCount)) || 0); +if (!Number.isFinite(chunkCount) || chunkCount <= 0) { + console.error('Expected LMDB chunkCount to be positive.'); + process.exit(1); +} +db.close(); + +const searchResult = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'alpha', '--json', '--backend', 'lmdb', '--no-ann', '--repo', repoRoot], + { encoding: 'utf8', env } +); +if (searchResult.status !== 0) { + console.error('search.js failed for LMDB backend test.'); + process.exit(searchResult.status ?? 1); +} +const output = String(searchResult.stdout || '').trim(); +let payload = null; +try { + payload = JSON.parse(output); +} catch { + console.error('Failed to parse LMDB search JSON output.'); + process.exit(1); +} +if (payload.backend !== 'lmdb') { + console.error(`Expected backend=lmdb, got ${payload.backend}`); + process.exit(1); +} + +console.log('lmdb backend test passed'); diff --git a/tests/lmdb-corruption.js b/tests/lmdb-corruption.js new file mode 100644 index 000000000..ec6d54164 --- /dev/null +++ b/tests/lmdb-corruption.js @@ -0,0 +1,102 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { LMDB_META_KEYS } from '../src/storage/lmdb/schema.js'; +import { loadUserConfig, resolveLmdbPaths } from '../tools/dict-utils.js'; + +let open = null; +try { + ({ open } = await import('lmdb')); +} catch (err) { + console.error(`lmdb missing: ${err?.message || err}`); + process.exit(1); +} + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'lmdb-corruption'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ sqlite: { use: false } }, null, 2) +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const run = (args, label, options = {}) => { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + ...options + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
1); + } + return result; +}; + +run( + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + 'build index', + { stdio: 'inherit' } +); +run( + [path.join(root, 'tools', 'build-lmdb-index.js'), '--mode', 'all', '--repo', repoRoot], + 'build lmdb index', + { stdio: 'inherit' } +); + +const userConfig = loadUserConfig(repoRoot); +const lmdbPaths = resolveLmdbPaths(repoRoot, userConfig); +const db = open({ path: lmdbPaths.codePath, readOnly: false }); +if (typeof db.removeSync === 'function') { + db.removeSync(LMDB_META_KEYS.schemaVersion); +} else { + db.remove(LMDB_META_KEYS.schemaVersion); +} +db.close(); + +const report = run( + [path.join(root, 'tools', 'report-artifacts.js'), '--json', '--repo', repoRoot], + 'report artifacts', + { encoding: 'utf8' } +); + +let payload = null; +try { + payload = JSON.parse(report.stdout || '{}'); +} catch { + console.error('Failed to parse report-artifacts JSON output.'); + process.exit(1); +} + +if (payload?.corruption?.ok !== false) { + console.error('Expected corruption report ok=false after LMDB tamper.'); + process.exit(1); +} +if (payload?.corruption?.lmdb?.ok !== false) { + console.error('Expected LMDB corruption report ok=false.'); + process.exit(1); +} +const issues = Array.isArray(payload?.corruption?.issues) ? payload.corruption.issues : []; +if (!issues.some((issue) => issue.includes('lmdb/code'))) { + console.error('Expected LMDB corruption issues for code db.'); + process.exit(1); +} + +console.log('lmdb corruption test passed'); diff --git a/tests/lmdb-report-artifacts.js b/tests/lmdb-report-artifacts.js new file mode 100644 index 000000000..50187fc7d --- /dev/null +++ b/tests/lmdb-report-artifacts.js @@ -0,0 +1,83 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'lmdb-report-artifacts'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ sqlite: { use: false } }, null, 2) +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const run = (args, label, options = {}) => { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + ...options + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
1); + } + return result; +}; + +run( + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + 'build index', + { stdio: 'inherit' } +); + +run( + [path.join(root, 'tools', 'build-lmdb-index.js'), '--mode', 'all', '--repo', repoRoot], + 'build lmdb index', + { stdio: 'inherit' } +); + +const report = run( + [path.join(root, 'tools', 'report-artifacts.js'), '--json', '--repo', repoRoot], + 'report artifacts', + { encoding: 'utf8' } +); + +let payload = null; +try { + payload = JSON.parse(report.stdout || '{}'); +} catch { + console.error('Failed to parse report-artifacts JSON output.'); + process.exit(1); +} + +const lmdbThroughput = payload?.throughput?.lmdb; +if (!lmdbThroughput?.code || !Number.isFinite(lmdbThroughput.code.chunksPerSec)) { + console.error('LMDB code throughput missing or invalid in report-artifacts.'); + process.exit(1); +} +if (!lmdbThroughput?.prose || !Number.isFinite(lmdbThroughput.prose.chunksPerSec)) { + console.error('LMDB prose throughput missing or invalid in report-artifacts.'); + process.exit(1); +} +if (payload?.corruption?.lmdb?.ok !== true) { + console.error('LMDB corruption report expected ok=true.'); + process.exit(1); +} + +console.log('lmdb report artifacts test passed'); diff --git a/tests/lsif-ingest.js b/tests/lsif-ingest.js new file mode 100644 index 000000000..16629e606 --- /dev/null +++ b/tests/lsif-ingest.js @@ -0,0 +1,48 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'lsif-ingest'); +const repoRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const inputPath = path.join(root, 'tests', 'fixtures', 'lsif', 'dump.lsif'); +const outPath = path.join(tempRoot, 'lsif.jsonl'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); + +const result = spawnSync( + process.execPath, + [path.join(root, 'tools', 'lsif-ingest.js'), '--repo', repoRoot, '--input', inputPath, '--out', outPath, '--json'], + { encoding: 'utf8' } +); +if (result.status !== 0) { + console.error(result.stderr || result.stdout || 'lsif-ingest failed'); + process.exit(result.status ?? 
1); +} + +if (!fs.existsSync(outPath)) { + console.error('lsif output not found'); + process.exit(1); +} + +const lines = fs.readFileSync(outPath, 'utf8').trim().split(/\r?\n/).filter(Boolean); +assert.ok(lines.length >= 1, 'expected lsif output lines'); + +const first = JSON.parse(lines[0]); +assert.equal(first.file, 'src/sample.ts'); +assert.equal(first.role, 'definition'); +assert.equal(first.startLine, 2); +assert.equal(first.language, 'typescript'); + +const metaPath = `${outPath}.meta.json`; +const meta = JSON.parse(fs.readFileSync(metaPath, 'utf8')); +assert.ok(meta.stats.vertices >= 4); +assert.ok(meta.stats.edges >= 2); +assert.ok(meta.stats.definitions >= 1); +assert.ok(meta.stats.references >= 1); + +console.log('lsif ingest test passed'); diff --git a/tests/lsp-shutdown.js b/tests/lsp-shutdown.js new file mode 100644 index 000000000..0009ba2fa --- /dev/null +++ b/tests/lsp-shutdown.js @@ -0,0 +1,24 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; +import { createLspClient } from '../src/integrations/tooling/lsp/client.js'; + +const root = process.cwd(); +const serverPath = path.join(root, 'tests', 'fixtures', 'lsp', 'stub-lsp-server.js'); +const logs = []; +const client = createLspClient({ + cmd: process.execPath, + args: [serverPath, '--exit-on-shutdown'], + log: (message) => logs.push(message) +}); + +await client.initialize({ rootUri: pathToFileURL(root).href }); +await client.shutdownAndExit(); +await new Promise((resolve) => setTimeout(resolve, 200)); +client.kill(); + +if (logs.some((line) => line.includes('ERR_STREAM_DESTROYED'))) { + throw new Error('LSP shutdown emitted ERR_STREAM_DESTROYED.'); +} + +console.log('LSP shutdown test passed'); diff --git a/tests/mcp-robustness.js b/tests/mcp-robustness.js new file mode 100644 index 000000000..b98ce06c0 --- /dev/null +++ b/tests/mcp-robustness.js @@ -0,0 +1,187 @@ +#!/usr/bin/env node +import { spawn } from 'node:child_process'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; + +const root = process.cwd(); +const serverPath = path.join(root, 'tools', 'mcp-server.js'); +const tempRoot = path.join(root, 'tests', '.cache', 'mcp-robustness'); +const queueCache = path.join(tempRoot, 'queue-cache'); +const timeoutCache = path.join(tempRoot, 'timeout-cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(queueCache, { recursive: true }); +await fsPromises.mkdir(timeoutCache, { recursive: true }); + +function encodeMessage(payload) { + const json = JSON.stringify(payload); + return `Content-Length: ${Buffer.byteLength(json, 'utf8')}\r\n\r\n${json}`; +} + +function createReader(stream) { + let buffer = Buffer.alloc(0); + const tryRead = () => { + const headerEnd = buffer.indexOf('\r\n\r\n'); + if (headerEnd === -1) return null; + const header = buffer.slice(0, headerEnd).toString('utf8'); + const match = header.match(/Content-Length:\s*(\d+)/i); + if (!match) { + buffer = buffer.slice(headerEnd + 4); + return null; + } + const length = parseInt(match[1], 10); + const total = headerEnd + 4 + length; + if (buffer.length < total) return null; + const body = buffer.slice(headerEnd + 4, total).toString('utf8'); + buffer = buffer.slice(total); + return JSON.parse(body); + }; + const notifications = []; + const readRaw = async () => { + const existing = tryRead(); + if (existing) return existing; + return new Promise((resolve) => { + const onData = (chunk) => { + buffer = Buffer.concat([buffer, chunk]); + const parsed 
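+          // attempt to parse a complete Content-Length frame after each chunk; keep listening until one arrives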
= tryRead(); + if (!parsed) return; + stream.off('data', onData); + resolve(parsed); + }; + stream.on('data', onData); + }); + }; + const readMessage = async () => { + while (true) { + const parsed = await readRaw(); + if (parsed && parsed.method && parsed.id === undefined) { + notifications.push(parsed); + continue; + } + return parsed; + } + }; + return { readMessage, notifications }; +} + +async function runQueueTest() { + const server = spawn(process.execPath, [serverPath], { + stdio: ['pipe', 'pipe', 'inherit'], + env: { + ...process.env, + PAIROFCLEATS_HOME: queueCache, + PAIROFCLEATS_CACHE_ROOT: queueCache, + PAIROFCLEATS_MCP_QUEUE_MAX: '1' + } + }); + const { readMessage } = createReader(server.stdout); + const timeout = setTimeout(() => { + console.error('MCP queue test timed out.'); + server.kill('SIGKILL'); + process.exit(1); + }, 30000); + const send = (payload) => server.stdin.write(encodeMessage(payload)); + + try { + send({ + jsonrpc: '2.0', + id: 1, + method: 'initialize', + params: { protocolVersion: '2024-11-05', capabilities: {} } + }); + await readMessage(); + + send({ + jsonrpc: '2.0', + id: 2, + method: 'tools/call', + params: { name: 'index_status', arguments: { repoPath: root } } + }); + send({ + jsonrpc: '2.0', + id: 3, + method: 'tools/call', + params: { name: 'index_status', arguments: { repoPath: root } } + }); + + const first = await readMessage(); + const second = await readMessage(); + const responses = [first, second]; + const overload = responses.find((msg) => msg?.error?.code === -32001); + if (!overload || overload.error?.data?.code !== 'QUEUE_OVERLOADED') { + throw new Error('Expected queue overload error response.'); + } + + send({ jsonrpc: '2.0', id: 4, method: 'shutdown' }); + await readMessage(); + send({ jsonrpc: '2.0', method: 'exit' }); + } catch (err) { + server.kill('SIGKILL'); + throw err; + } finally { + clearTimeout(timeout); + server.stdin.end(); + } +} + +async function runTimeoutTest() { + const server = spawn(process.execPath, [serverPath], { + stdio: ['pipe', 'pipe', 'inherit'], + env: { + ...process.env, + PAIROFCLEATS_HOME: timeoutCache, + PAIROFCLEATS_CACHE_ROOT: timeoutCache, + PAIROFCLEATS_MCP_TOOL_TIMEOUT_MS: '1' + } + }); + const { readMessage } = createReader(server.stdout); + const timeout = setTimeout(() => { + console.error('MCP timeout test timed out.'); + server.kill('SIGKILL'); + process.exit(1); + }, 30000); + const send = (payload) => server.stdin.write(encodeMessage(payload)); + + try { + send({ + jsonrpc: '2.0', + id: 10, + method: 'initialize', + params: { protocolVersion: '2024-11-05', capabilities: {} } + }); + await readMessage(); + + send({ + jsonrpc: '2.0', + id: 11, + method: 'tools/call', + params: { name: 'index_status', arguments: { repoPath: root } } + }); + const response = await readMessage(); + const payloadText = response.result?.content?.[0]?.text || ''; + const payload = JSON.parse(payloadText || '{}'); + if (!response.result?.isError || payload.code !== 'TOOL_TIMEOUT') { + throw new Error('Expected tool timeout error response.'); + } + + send({ jsonrpc: '2.0', id: 12, method: 'shutdown' }); + await readMessage(); + send({ jsonrpc: '2.0', method: 'exit' }); + } catch (err) { + server.kill('SIGKILL'); + throw err; + } finally { + clearTimeout(timeout); + server.stdin.end(); + } +} + +runQueueTest() + .then(runTimeoutTest) + .then(() => { + console.log('MCP robustness tests passed'); + }) + .catch((err) => { + console.error(err?.message || err); + process.exit(1); + }); diff --git a/tests/mcp-schema.js 
b/tests/mcp-schema.js new file mode 100644 index 000000000..6ca216122 --- /dev/null +++ b/tests/mcp-schema.js @@ -0,0 +1,180 @@ +#!/usr/bin/env node +import { spawn } from 'node:child_process'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { getToolDefs } from '../src/integrations/mcp/defs.js'; +import { stableStringify } from '../src/shared/stable-json.js'; +import { DEFAULT_MODEL_ID } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const serverPath = path.join(root, 'tools', 'mcp-server.js'); +const sampleRepo = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'mcp-schema'); +const cacheRoot = path.join(tempRoot, 'cache'); +const emptyRepo = path.join(tempRoot, 'empty'); +const snapshotPath = path.join(root, 'tests', 'fixtures', 'mcp', 'schema-snapshot.json'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +await fsPromises.mkdir(emptyRepo, { recursive: true }); + +function encodeMessage(payload) { + const json = JSON.stringify(payload); + return `Content-Length: ${Buffer.byteLength(json, 'utf8')}\r\n\r\n${json}`; +} + +function createReader(stream) { + let buffer = Buffer.alloc(0); + const tryRead = () => { + const headerEnd = buffer.indexOf('\r\n\r\n'); + if (headerEnd === -1) return null; + const header = buffer.slice(0, headerEnd).toString('utf8'); + const match = header.match(/Content-Length:\s*(\d+)/i); + if (!match) { + buffer = buffer.slice(headerEnd + 4); + return null; + } + const length = parseInt(match[1], 10); + const total = headerEnd + 4 + length; + if (buffer.length < total) return null; + const body = buffer.slice(headerEnd + 4, total).toString('utf8'); + buffer = buffer.slice(total); + return JSON.parse(body); + }; + const notifications = []; + const readRaw = async () => { + const existing = tryRead(); + if (existing) return existing; + return new Promise((resolve) => { + const onData = (chunk) => { + buffer = Buffer.concat([buffer, chunk]); + const parsed = tryRead(); + if (!parsed) return; + stream.off('data', onData); + resolve(parsed); + }; + stream.on('data', onData); + }); + }; + const readMessage = async () => { + while (true) { + const parsed = await readRaw(); + if (parsed && parsed.method && parsed.id === undefined) { + notifications.push(parsed); + continue; + } + return parsed; + } + }; + return { readMessage, notifications }; +} + +const server = spawn(process.execPath, [serverPath], { + stdio: ['pipe', 'pipe', 'inherit'], + env: { + ...process.env, + PAIROFCLEATS_HOME: cacheRoot, + PAIROFCLEATS_CACHE_ROOT: cacheRoot + } +}); + +const { readMessage } = createReader(server.stdout); +const timeout = setTimeout(() => { + console.error('MCP schema test timed out.'); + server.kill('SIGKILL'); + process.exit(1); +}, 30000); + +function send(payload) { + server.stdin.write(encodeMessage(payload)); +} + +const shapeValue = (value) => { + if (Array.isArray(value)) { + return value.map((entry) => shapeValue(entry)); + } + if (value && typeof value === 'object') { + const out = {}; + for (const key of Object.keys(value).sort()) { + out[key] = shapeValue(value[key]); + } + return out; + } + if (value === null) return ''; + return `<${typeof value}>`; +}; + +const toolSchemaSnapshot = getToolDefs(DEFAULT_MODEL_ID).map((tool) => ({ + name: tool.name, + required: Array.isArray(tool.inputSchema?.required) + ? 
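+    // sort a copy of the required list so the snapshot is stable across declaration order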
[...tool.inputSchema.required].sort() + : [], + properties: Object.keys(tool.inputSchema?.properties || {}).sort() +})); + +async function run() { + send({ + jsonrpc: '2.0', + id: 1, + method: 'initialize', + params: { protocolVersion: '2024-11-05', capabilities: {} } + }); + await readMessage(); + + send({ + jsonrpc: '2.0', + id: 2, + method: 'tools/call', + params: { + name: 'index_status', + arguments: { repoPath: sampleRepo } + } + }); + const status = await readMessage(); + const statusText = status.result?.content?.[0]?.text || ''; + const statusPayload = JSON.parse(statusText || '{}'); + + send({ + jsonrpc: '2.0', + id: 3, + method: 'tools/call', + params: { + name: 'config_status', + arguments: { repoPath: emptyRepo } + } + }); + const configStatus = await readMessage(); + const configText = configStatus.result?.content?.[0]?.text || ''; + const configPayload = JSON.parse(configText || '{}'); + + send({ jsonrpc: '2.0', id: 4, method: 'shutdown' }); + await readMessage(); + send({ jsonrpc: '2.0', method: 'exit' }); + + return { + tools: toolSchemaSnapshot, + responses: { + index_status: shapeValue(statusPayload), + config_status: shapeValue(configPayload) + } + }; +} + +run() + .then(async (actual) => { + clearTimeout(timeout); + server.stdin.end(); + const expectedRaw = await fsPromises.readFile(snapshotPath, 'utf8'); + const expected = JSON.parse(expectedRaw); + if (stableStringify(actual) !== stableStringify(expected)) { + console.error('MCP schema snapshot mismatch.'); + process.exit(1); + } + console.log('MCP schema snapshot test passed'); + }) + .catch((err) => { + clearTimeout(timeout); + console.error(err?.message || err); + server.kill('SIGKILL'); + process.exit(1); + }); diff --git a/tests/metadata-v2.js b/tests/metadata-v2.js new file mode 100644 index 000000000..0a6886b49 --- /dev/null +++ b/tests/metadata-v2.js @@ -0,0 +1,53 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { buildMetaV2 } from '../src/index/metadata-v2.js'; + +const chunk = { + file: 'src/example.js', + ext: '.js', + start: 10, + end: 42, + startLine: 2, + endLine: 4, + kind: 'FunctionDeclaration', + name: 'makeWidget', + segment: { + segmentId: 'seg-1', + type: 'code', + languageId: 'javascript', + parentSegmentId: null, + embeddingContext: 'code' + } +}; + +const docmeta = { + signature: 'makeWidget(opts)', + params: ['opts'], + returnType: 'Widget', + inferredTypes: { + returns: [{ type: 'Widget', source: 'tooling', confidence: 0.9 }] + }, + risk: { + tags: ['command-exec'], + sources: [{ name: 'req.body' }], + sinks: [{ name: 'exec' }], + flows: [{ source: 'req.body', sink: 'exec', scope: 'local' }] + } +}; + +const meta = buildMetaV2({ + chunk, + docmeta, + toolInfo: { tool: 'pairofcleats', version: '0.0.0-test', configHash: 'deadbeef' } +}); + +assert.ok(meta, 'expected metaV2 output'); +assert.ok(meta.chunkId, 'expected metaV2 chunkId'); +assert.equal(meta.file, 'src/example.js'); +assert.equal(meta.segment?.segmentId, 'seg-1'); +assert.equal(meta.signature, 'makeWidget(opts)'); +assert.equal(meta.returns, 'Widget'); +assert.equal(meta.types?.tooling?.returns?.[0]?.type, 'Widget'); +assert.equal(meta.risk?.flows?.[0]?.sink, 'exec'); + +console.log('metadata v2 test passed'); diff --git a/tests/minhash-parity.js b/tests/minhash-parity.js new file mode 100644 index 000000000..74b7506e2 --- /dev/null +++ b/tests/minhash-parity.js @@ -0,0 +1,21 @@ +#!/usr/bin/env node +import { SimpleMinHash } from '../src/index/minhash.js'; +import { rankMinhash } from 
'../src/retrieval/rankers.js'; + +const tokens = ['alpha', 'beta', 'gamma', 'delta']; +const mh = new SimpleMinHash(); +tokens.forEach((token) => mh.update(token)); +const idx = { + minhash: { signatures: [mh.hashValues] }, + chunkMeta: [{ weight: 1 }] +}; +const results = rankMinhash(idx, tokens, 1); +if (!results.length || results[0].idx !== 0) { + console.error('minhash parity test failed: expected top hit for id 0'); + process.exit(1); +} +if (results[0].sim < 0.99) { + console.error(`minhash parity test failed: expected sim≈1, got ${results[0].sim}`); + process.exit(1); +} +console.log('minhash parity test passed'); diff --git a/tests/parity.js b/tests/parity.js index dccfa0da9..1c923b128 100644 --- a/tests/parity.js +++ b/tests/parity.js @@ -4,15 +4,29 @@ import fsSync from 'node:fs'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; import { performance } from 'node:perf_hooks'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { getIndexDir, loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['ann', 'write-report', 'enforce'], - string: ['queries', 'out', 'search', 'sqlite-backend'], - alias: { n: 'top', q: 'queries' }, - default: { top: 5, limit: 0, 'sqlite-backend': 'sqlite' } -}); +const argv = createCli({ + scriptName: 'parity', + options: { + ann: { type: 'boolean', default: true }, + 'write-report': { type: 'boolean', default: false }, + enforce: { type: 'boolean', default: false }, + 'enforce-fts': { type: 'boolean', default: false }, + 'min-overlap': { type: 'number' }, + 'min-rank-corr': { type: 'number' }, + 'max-delta': { type: 'number' }, + 'min-overlap-single': { type: 'number' }, + queries: { type: 'string' }, + out: { type: 'string' }, + search: { type: 'string' }, + 'sqlite-backend': { type: 'string', default: 'sqlite' }, + top: { type: 'number', default: 5 }, + limit: { type: 'number', default: 0 } + }, + aliases: { n: 'top', q: 'queries' } +}).parse(); const root = process.cwd(); const repoArgs = ['--repo', root]; @@ -156,6 +170,18 @@ function hitScore(hit) { function summarizeMatch(memoryHits, sqliteHits) { const mem = memoryHits.slice(0, topN); const sql = sqliteHits.slice(0, topN); + if (!mem.length && !sql.length) { + return { + overlap: 1, + avgDelta: 0, + missingFromSqlite: [], + missingFromMemory: [], + rankCorr: null, + topMemory: [], + topSqlite: [], + zeroHits: true + }; + } const memKeys = mem.map(hitKey); const sqlKeys = sql.map(hitKey); const memRanks = new Map(memKeys.map((key, idx) => [key, idx + 1])); @@ -300,11 +326,39 @@ if (argv['write-report']) { } if (argv.enforce) { - const minOverlap = typeof argv['min-overlap'] === 'number' - ? argv['min-overlap'] - : (parseFloat(argv['min-overlap']) || 0.6); - if (summary.overlapAvg < minOverlap) { - console.error(`Overlap below threshold (${summary.overlapAvg.toFixed(3)} < ${minOverlap}).`); - process.exit(1); + const isFts = sqliteBackend === 'sqlite-fts'; + const defaults = isFts + ? { minOverlap: 0.7, minRankCorr: 0.55, maxDelta: 0.5, minSingleOverlap: 0.6 } + : { minOverlap: 0.95, minRankCorr: 0.9, maxDelta: 0.1, minSingleOverlap: 0.6 }; + const thresholds = { + minOverlap: Number.isFinite(argv['min-overlap']) ? argv['min-overlap'] : defaults.minOverlap, + minRankCorr: Number.isFinite(argv['min-rank-corr']) ? argv['min-rank-corr'] : defaults.minRankCorr, + maxDelta: Number.isFinite(argv['max-delta']) ? 
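+    // explicit CLI flags override the per-backend defaults only when they parse as finite numbers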
argv['max-delta'] : defaults.maxDelta, + minSingleOverlap: Number.isFinite(argv['min-overlap-single']) + ? argv['min-overlap-single'] + : defaults.minSingleOverlap + }; + const minOverlapSingle = overlapValues.length ? Math.min(...overlapValues) : 1; + const failures = []; + if (summary.overlapAvg < thresholds.minOverlap) { + failures.push(`overlapAvg ${summary.overlapAvg.toFixed(3)} < ${thresholds.minOverlap}`); + } + if (summary.rankCorrAvg !== null && summary.rankCorrAvg < thresholds.minRankCorr) { + failures.push(`rankCorrAvg ${summary.rankCorrAvg.toFixed(3)} < ${thresholds.minRankCorr}`); + } + if (summary.scoreDeltaAvg > thresholds.maxDelta) { + failures.push(`avgDelta ${summary.scoreDeltaAvg.toFixed(3)} > ${thresholds.maxDelta}`); + } + if (minOverlapSingle < thresholds.minSingleOverlap) { + failures.push(`minOverlap@K ${minOverlapSingle.toFixed(3)} < ${thresholds.minSingleOverlap}`); + } + if (failures.length) { + const label = failures.join('; '); + if (isFts && argv['enforce-fts'] !== true) { + console.warn(`SQLite FTS parity warning: ${label}`); + } else { + console.error(`Parity thresholds failed: ${label}`); + process.exit(1); + } } } diff --git a/tests/piece-assembly.js b/tests/piece-assembly.js new file mode 100644 index 000000000..c6dbc4f3e --- /dev/null +++ b/tests/piece-assembly.js @@ -0,0 +1,125 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; +import { loadChunkMeta, loadTokenPostings } from '../src/shared/artifact-io.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const buildIndexPath = path.join(root, 'build_index.js'); +const assemblePath = path.join(root, 'tools', 'assemble-pieces.js'); + +if (!fs.existsSync(fixtureRoot)) { + console.error(`Missing fixture: ${fixtureRoot}`); + process.exit(1); +} + +const cacheRoot = path.join(root, 'tests', '.cache', 'piece-assembly'); +const cacheA = path.join(cacheRoot, 'a'); +const cacheB = path.join(cacheRoot, 'b'); +const outputDir = path.join(cacheRoot, 'assembled', 'index-code'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const baseEnv = { + ...process.env, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const run = (label, args, env) => { + const result = spawnSync(process.execPath, args, { + cwd: fixtureRoot, + env, + stdio: 'inherit' + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 
1); + } +}; + +run('build_index (A)', [buildIndexPath, '--stub-embeddings', '--mode', 'code', '--repo', fixtureRoot], { + ...baseEnv, + PAIROFCLEATS_CACHE_ROOT: cacheA +}); +run('build_index (B)', [buildIndexPath, '--stub-embeddings', '--mode', 'code', '--repo', fixtureRoot], { + ...baseEnv, + PAIROFCLEATS_CACHE_ROOT: cacheB +}); + +const userConfig = loadUserConfig(fixtureRoot); +process.env.PAIROFCLEATS_CACHE_ROOT = cacheA; +const indexA = getIndexDir(fixtureRoot, 'code', userConfig); +process.env.PAIROFCLEATS_CACHE_ROOT = cacheB; +const indexB = getIndexDir(fixtureRoot, 'code', userConfig); + +run('assemble-pieces', [ + assemblePath, + '--repo', + fixtureRoot, + '--mode', + 'code', + '--out', + outputDir, + '--input', + indexA, + '--input', + indexB, + '--force' +], { + ...baseEnv, + PAIROFCLEATS_CACHE_ROOT: cacheRoot +}); + +const chunksA = loadChunkMeta(indexA).length; +const chunksB = loadChunkMeta(indexB).length; +const chunksOut = loadChunkMeta(outputDir).length; +if (chunksOut !== chunksA + chunksB) { + console.error(`Expected merged chunk count ${chunksA + chunksB}, got ${chunksOut}`); + process.exit(1); +} + +const tokenIndex = loadTokenPostings(outputDir); +if (!Array.isArray(tokenIndex?.docLengths) || tokenIndex.docLengths.length !== chunksOut) { + console.error('Merged token_postings docLengths mismatch.'); + process.exit(1); +} +if (!Array.isArray(tokenIndex?.vocab) || !Array.isArray(tokenIndex?.postings)) { + console.error('Merged token_postings missing vocab/postings.'); + process.exit(1); +} +if (tokenIndex.vocab.length !== tokenIndex.postings.length) { + console.error('Merged token_postings vocab/postings length mismatch.'); + process.exit(1); +} +let minDocId = Number.POSITIVE_INFINITY; +let maxDocId = -1; +for (const posting of tokenIndex.postings) { + if (!Array.isArray(posting)) continue; + for (const entry of posting) { + if (!Array.isArray(entry)) continue; + const docId = entry[0]; + if (!Number.isFinite(docId)) continue; + if (docId < minDocId) minDocId = docId; + if (docId > maxDocId) maxDocId = docId; + } +} +if (maxDocId < chunksA || maxDocId >= chunksOut) { + console.error('Merged token_postings docIds not offset correctly.'); + process.exit(1); +} +if (minDocId < 0) { + console.error('Merged token_postings docIds should be non-negative.'); + process.exit(1); +} + +const manifestPath = path.join(outputDir, 'pieces', 'manifest.json'); +if (!fs.existsSync(manifestPath)) { + console.error(`Missing pieces manifest: ${manifestPath}`); + process.exit(1); +} + +console.log('Piece assembly test passed'); diff --git a/tests/postings-quantize.js b/tests/postings-quantize.js new file mode 100644 index 000000000..68e96353e --- /dev/null +++ b/tests/postings-quantize.js @@ -0,0 +1,57 @@ +#!/usr/bin/env node +import { buildPostings } from '../src/index/build/postings.js'; +import { quantizeVec } from '../src/index/embedding.js'; + +const chunks = [ + { + tokens: ['a'], + embedding: [0.1, 0.2], + embed_doc: [0.3, 0.4], + embed_code: [0.5, 0.6], + minhashSig: [1, 2] + }, + { + tokens: ['b'], + embedding: [0.7, 0.8], + minhashSig: [3, 4] + } +]; + +const tokenPostings = new Map([ + ['a', [[0, 1]]], + ['b', [[1, 1]]] +]); + +const postings = await buildPostings({ + chunks, + df: new Map(), + tokenPostings, + docLengths: [1, 1], + fieldPostings: null, + fieldDocLengths: null, + phrasePost: new Map(), + triPost: new Map(), + postingsConfig: {}, + modelId: 'test', + useStubEmbeddings: true, + log: () => {}, + workerPool: null, + embeddingsEnabled: true +}); + +const 
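+// expected vectors per mode: doc and code fall back to the merged embedding when a chunk omits them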
expectedMerged = chunks.map((chunk) => quantizeVec(chunk.embedding)); +const expectedDoc = chunks.map((chunk) => quantizeVec(chunk.embed_doc || chunk.embedding)); +const expectedCode = chunks.map((chunk) => quantizeVec(chunk.embed_code || chunk.embedding)); + +const equal = (label, actual, expected) => { + if (JSON.stringify(actual) !== JSON.stringify(expected)) { + console.error(`postings quantize test failed: ${label}`); + process.exit(1); + } +}; + +equal('dense', postings.quantizedVectors, expectedMerged); +equal('doc', postings.quantizedDocVectors, expectedDoc); +equal('code', postings.quantizedCodeVectors, expectedCode); + +console.log('postings quantize test passed'); diff --git a/tests/preprocess-files.js b/tests/preprocess-files.js new file mode 100644 index 000000000..7a0dce5fc --- /dev/null +++ b/tests/preprocess-files.js @@ -0,0 +1,62 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { preprocessFiles } from '../src/index/build/preprocess.js'; +import { buildIgnoreMatcher } from '../src/index/build/ignore.js'; + +const root = process.cwd(); +const cacheRoot = path.join(root, 'tests', '.cache', 'preprocess'); +await fs.rm(cacheRoot, { recursive: true, force: true }); +await fs.mkdir(path.join(cacheRoot, 'src'), { recursive: true }); +await fs.mkdir(path.join(cacheRoot, 'docs'), { recursive: true }); + +await fs.writeFile(path.join(cacheRoot, 'src', 'app.js'), 'const a = 1;\nconst b = 2;\n'); +await fs.writeFile(path.join(cacheRoot, 'src', 'app.min.js'), 'var x=1;'); +await fs.writeFile( + path.join(cacheRoot, 'src', 'minified.js'), + 'const x=' + 'a'.repeat(200) +); +await fs.copyFile( + path.join(root, 'tests', 'fixtures', 'binary', 'sample.png'), + path.join(cacheRoot, 'src', 'binary.png') +); +await fs.writeFile(path.join(cacheRoot, 'docs', 'readme.md'), '# title\n'); + +const { ignoreMatcher } = await buildIgnoreMatcher({ root: cacheRoot, userConfig: {} }); +const fileScan = { + sampleBytes: 256, + minified: { + sampleMinBytes: 1, + minChars: 20, + avgLineThreshold: 10, + maxLineThreshold: 10, + maxWhitespaceRatio: 0.2 + }, + binary: { + sampleMinBytes: 1, + maxNonTextRatio: 0.1 + } +}; + +const result = await preprocessFiles({ + root: cacheRoot, + modes: ['code', 'prose'], + ignoreMatcher, + maxFileBytes: null, + fileCaps: {}, + fileScan, + lineCounts: true, + concurrency: 4 +}); + +const codeEntries = result.entriesByMode.code.map((entry) => entry.rel).sort(); +const proseEntries = result.entriesByMode.prose.map((entry) => entry.rel).sort(); +assert.deepEqual(codeEntries, ['src/app.js']); +assert.deepEqual(proseEntries, ['docs/readme.md']); +const codeSkips = result.skippedByMode.code.map((skip) => skip.reason); +assert.ok(codeSkips.includes('minified')); +assert.ok(codeSkips.includes('binary')); +assert.ok(result.lineCountsByMode.code.get('src/app.js') > 0); + +console.log('preprocess-files test passed.'); diff --git a/tests/profile-config.js b/tests/profile-config.js new file mode 100644 index 000000000..7094566e2 --- /dev/null +++ b/tests/profile-config.js @@ -0,0 +1,36 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fsPromises from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import { loadUserConfig } from '../tools/dict-utils.js'; + +const tempRoot = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'poc-profile-')); +const configPath = path.join(tempRoot, '.pairofcleats.json'); + +try { + await fsPromises.writeFile( + 
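+    // minimal config on purpose: only the profile selector, so the remaining settings come from profile defaults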
configPath, + JSON.stringify({ profile: 'lite' }, null, 2), + 'utf8' + ); + + const loaded = loadUserConfig(tempRoot); + assert.equal(loaded.profile, 'lite'); + assert.equal(loaded.indexing?.gitBlame, false); + + const previousProfile = process.env.PAIROFCLEATS_PROFILE; + process.env.PAIROFCLEATS_PROFILE = 'full'; + const loadedEnv = loadUserConfig(tempRoot); + assert.equal(loadedEnv.profile, 'full'); + assert.equal(loadedEnv.indexing?.gitBlame, true); + if (previousProfile) { + process.env.PAIROFCLEATS_PROFILE = previousProfile; + } else { + delete process.env.PAIROFCLEATS_PROFILE; + } +} finally { + await fsPromises.rm(tempRoot, { recursive: true, force: true }); +} + +console.log('profile-config test passed'); diff --git a/tests/prose-skip-imports.js b/tests/prose-skip-imports.js new file mode 100644 index 000000000..f0819b7b5 --- /dev/null +++ b/tests/prose-skip-imports.js @@ -0,0 +1,37 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const cacheRoot = path.join(root, 'tests', '.cache', 'prose-skip-imports'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const result = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--mode', 'prose', '--repo', fixtureRoot], + { cwd: fixtureRoot, env, encoding: 'utf8' } +); + +if (result.status !== 0) { + console.error('Failed: build_index prose mode'); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
1); +} + +const stderr = result.stderr || ''; +if (stderr.includes('Scanning for imports')) { + console.error('Prose mode should skip import scanning, but imports log was present.'); + process.exit(1); +} + +console.log('Prose import scan skip test passed'); diff --git a/tests/python-ast-worker.js b/tests/python-ast-worker.js new file mode 100644 index 000000000..c5946cfb0 --- /dev/null +++ b/tests/python-ast-worker.js @@ -0,0 +1,41 @@ +#!/usr/bin/env node +import { spawnSync } from 'node:child_process'; +import { getPythonAst, shutdownPythonAstPool } from '../src/lang/python.js'; + +function hasPython() { + const candidates = ['python', 'python3']; + for (const cmd of candidates) { + const result = spawnSync(cmd, ['-c', 'import sys; sys.stdout.write("ok")'], { encoding: 'utf8' }); + if (result.status === 0 && result.stdout.trim() === 'ok') return true; + } + return false; +} + +if (!hasPython()) { + console.log('Python AST worker test skipped (python not available).'); + process.exit(0); +} + +const sample = ` +def add(a: int, b: int) -> int: + return a + b +`; + +const ast = await getPythonAst(sample, null, { + dataflow: true, + controlFlow: true, + pythonAst: { workerCount: 1, maxWorkers: 1, taskTimeoutMs: 5000 } +}); + +if (!ast || !Array.isArray(ast.defs)) { + console.error('Python AST worker returned no defs.'); + process.exit(1); +} +const hasAdd = ast.defs.some((entry) => entry?.name === 'add'); +if (!hasAdd) { + console.error('Python AST worker missing add() definition.'); + process.exit(1); +} + +console.log('Python AST worker test passed'); +shutdownPythonAstPool(); diff --git a/tests/python-fallback.js b/tests/python-fallback.js index dfd24d77c..b1ad6bf7f 100644 --- a/tests/python-fallback.js +++ b/tests/python-fallback.js @@ -17,9 +17,10 @@ const chunks = buildPythonHeuristicChunks(text) || []; const hasPoint = chunks.some((chunk) => chunk.name === 'Point'); const hasDistance = chunks.some((chunk) => chunk.name === 'Point.distance'); const hasOuter = chunks.some((chunk) => chunk.name === 'outer'); +const hasFetch = chunks.some((chunk) => chunk.name === 'fetch_data'); -if (!hasPoint || !hasDistance || !hasOuter) { - console.error('Python heuristic fallback missing expected chunks (Point, Point.distance, outer).'); +if (!hasPoint || !hasDistance || !hasOuter || !hasFetch) { + console.error('Python heuristic fallback missing expected chunks (Point, Point.distance, outer, fetch_data).'); process.exit(1); } diff --git a/tests/query-intent.js b/tests/query-intent.js new file mode 100644 index 000000000..cbd21d2a1 --- /dev/null +++ b/tests/query-intent.js @@ -0,0 +1,40 @@ +#!/usr/bin/env node +import { classifyQuery, resolveIntentFieldWeights, resolveIntentVectorMode } from '../src/retrieval/query-intent.js'; + +const cases = [ + { query: 'src/utils/file.ts', tokens: ['src/utils/file.ts'], phrases: [], expect: 'path' }, + { query: 'renderToString', tokens: ['renderToString'], phrases: [], expect: 'code' }, + { query: 'how to configure proxy headers', tokens: ['how', 'to', 'configure', 'proxy', 'headers'], phrases: [], expect: 'prose' }, + { query: 'parse json', tokens: ['parse', 'json'], phrases: ['parse json'], expect: 'mixed' } +]; + +for (const sample of cases) { + const info = classifyQuery({ + query: sample.query, + tokens: sample.tokens, + phrases: sample.phrases + }); + if (info.type !== sample.expect) { + console.error(`Expected intent ${sample.expect} for "${sample.query}", got ${info.type}`); + process.exit(1); + } +} + +const proseIntent = classifyQuery({ + query: 'how to 
configure proxy headers', + tokens: ['how', 'to', 'configure', 'proxy', 'headers'], + phrases: [] +}); +const weights = resolveIntentFieldWeights(null, proseIntent); +if (!weights || !(weights.doc > weights.name)) { + console.error('Expected prose intent to emphasize doc weights.'); + process.exit(1); +} + +const vectorMode = resolveIntentVectorMode('auto', proseIntent); +if (vectorMode !== 'doc') { + console.error(`Expected auto vector mode to resolve to doc for prose, got ${vectorMode}`); + process.exit(1); +} + +console.log('query intent test passed'); diff --git a/tests/read-failure-skip.js b/tests/read-failure-skip.js new file mode 100644 index 000000000..94612ac64 --- /dev/null +++ b/tests/read-failure-skip.js @@ -0,0 +1,81 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { createFileProcessor } from '../src/index/build/file-processor.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'read-failure-skip'); +const repoRoot = path.join(tempRoot, 'repo'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); + +const targetPath = path.join(repoRoot, 'missing.js'); +await fsPromises.writeFile(targetPath, 'console.log("hello");\n'); +const stat = await fsPromises.stat(targetPath); +await fsPromises.unlink(targetPath); + +const skippedFiles = []; +const { processFile } = createFileProcessor({ + root: repoRoot, + mode: 'code', + dictConfig: {}, + dictWords: new Set(), + languageOptions: { astDataflowEnabled: false, controlFlowEnabled: false }, + postingsConfig: {}, + segmentsConfig: {}, + commentsConfig: {}, + allImports: {}, + contextWin: 0, + incrementalState: { + enabled: false, + manifest: { files: {} }, + bundleDir: '', + bundleFormat: 'json' + }, + getChunkEmbedding: async () => null, + getChunkEmbeddings: async () => null, + typeInferenceEnabled: false, + riskAnalysisEnabled: false, + riskConfig: {}, + relationsEnabled: false, + seenFiles: new Set(), + gitBlameEnabled: false, + lintEnabled: false, + complexityEnabled: false, + structuralMatches: null, + cacheConfig: {}, + cacheReporter: null, + queues: null, + workerPool: null, + crashLogger: null, + skippedFiles, + embeddingEnabled: false, + toolInfo: null, + tokenizationStats: null +}); + +const fileEntry = { + abs: targetPath, + rel: 'missing.js', + stat, + lines: 1, + scan: { checkedBinary: true, checkedMinified: true } +}; + +const result = await processFile(fileEntry, 0); +if (result !== null) { + console.error('Expected null result for read failure.'); + process.exit(1); +} +const skip = skippedFiles.find((entry) => entry?.file === targetPath && entry?.reason === 'read-failure'); +if (!skip) { + console.error('Expected read-failure skip entry.'); + process.exit(1); +} +if (!skip.code && !skip.message) { + console.error('Expected read-failure to include error details.'); + process.exit(1); +} + +console.log('read-failure skip test passed'); diff --git a/tests/retrieval-backend-policy.js b/tests/retrieval-backend-policy.js new file mode 100644 index 000000000..73c25b936 --- /dev/null +++ b/tests/retrieval-backend-policy.js @@ -0,0 +1,65 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { resolveBackendSelection } from '../src/retrieval/cli/policy.js'; + +const base = { + sqliteScoreModeConfig: false, + sqliteConfigured: true, + sqliteAvailable: true, + sqliteCodeAvailable: true, + sqliteProseAvailable: true, + sqliteCodePath: 'code.db', + 
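+  // placeholder artifact paths; the forced-backend assertions below check that these strings surface in error messages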
sqliteProsePath: 'prose.db', + lmdbConfigured: true, + lmdbAvailable: true, + lmdbCodeAvailable: true, + lmdbProseAvailable: true, + lmdbCodePath: 'lmdb-code', + lmdbProsePath: 'lmdb-prose', + sqliteAutoChunkThreshold: 0, + sqliteAutoArtifactBytes: 0, + needsSqlite: true, + needsCode: true, + needsProse: false, + root: process.cwd(), + userConfig: {} +}; + +const autoResult = await resolveBackendSelection({ + ...base, + backendArg: '' +}); +assert.equal(autoResult.useSqlite, true, 'expected auto backend to select sqlite'); +assert.equal(autoResult.useLmdb, false, 'expected auto backend to avoid lmdb'); + +const lmdbFallback = await resolveBackendSelection({ + ...base, + backendArg: '', + sqliteAvailable: false, + sqliteCodeAvailable: false, + lmdbAvailable: true +}); +assert.equal(lmdbFallback.useSqlite, false, 'expected sqlite to be skipped when unavailable'); +assert.equal(lmdbFallback.useLmdb, true, 'expected lmdb to be selected when available'); + +const forcedSqlite = await resolveBackendSelection({ + ...base, + backendArg: 'sqlite', + sqliteAvailable: false, + sqliteCodeAvailable: false +}); +assert.ok(forcedSqlite.error, 'expected sqlite error when forced and missing'); +assert.ok(forcedSqlite.error.message.includes('SQLite backend requested'), 'expected sqlite error message'); +assert.ok(forcedSqlite.error.message.includes('code=code.db'), 'expected sqlite missing path in message'); + +const forcedLmdb = await resolveBackendSelection({ + ...base, + backendArg: 'lmdb', + lmdbAvailable: false, + lmdbCodeAvailable: false +}); +assert.ok(forcedLmdb.error, 'expected lmdb error when forced and missing'); +assert.ok(forcedLmdb.error.message.includes('LMDB backend requested'), 'expected lmdb error message'); +assert.ok(forcedLmdb.error.message.includes('code=lmdb-code'), 'expected lmdb missing path in message'); + +console.log('retrieval backend policy test passed'); diff --git a/tests/retrieval-branch-filter.js b/tests/retrieval-branch-filter.js new file mode 100644 index 000000000..1ed03ea13 --- /dev/null +++ b/tests/retrieval-branch-filter.js @@ -0,0 +1,32 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { applyBranchFilter } from '../src/retrieval/cli/branch-filter.js'; + +let recorded = null; +const backendPolicy = { reason: 'auto', backendLabel: 'sqlite' }; +const result = await applyBranchFilter({ + branchFilter: 'main', + caseSensitive: false, + repoBranch: 'dev', + backendLabel: 'sqlite', + backendPolicy, + emitOutput: false, + jsonOutput: true, + recordSearchMetrics: (status) => { + recorded = status; + } +}); + +assert.equal(result.matched, false, 'expected branch mismatch to be reported'); +assert.equal(recorded, 'ok', 'expected search metrics to be recorded'); +assert.ok(result.payload, 'expected payload for branch mismatch'); +assert.equal(result.payload.backend, 'sqlite'); +assert.deepEqual(result.payload.prose, []); +assert.deepEqual(result.payload.code, []); +assert.deepEqual(result.payload.records, []); +assert.equal(result.payload.stats.branch, 'dev'); +assert.equal(result.payload.stats.branchFilter, 'main'); +assert.equal(result.payload.stats.branchMatch, false); +assert.deepEqual(result.payload.stats.backendPolicy, backendPolicy); + +console.log('retrieval branch filter test passed'); diff --git a/tests/safe-regex-engine.js b/tests/safe-regex-engine.js new file mode 100644 index 000000000..01873afc3 --- /dev/null +++ b/tests/safe-regex-engine.js @@ -0,0 +1,88 @@ +import assert from 'node:assert/strict'; +import { createSafeRegex, 
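+// exercises the RE2JS fallback, plus native RE2 when the binding is available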
isNativeRe2Available } from '../src/shared/safe-regex.js'; + +const basic = createSafeRegex('foo(\\d+)', '', { engine: 're2js' }); +assert.ok(basic, 'expected basic safe regex to compile'); +const match = basic.exec('xxfoo123yy'); +assert.ok(match, 'expected match'); +assert.equal(match[0], 'foo123'); +assert.equal(match[1], '123'); +assert.equal(match.index, 2); +assert.equal(match.input, 'xxfoo123yy'); + +const g = createSafeRegex('a', 'g', { engine: 're2js' }); +assert.ok(g, 'expected global regex to compile'); +const m1 = g.exec('a a'); +assert.ok(m1); +assert.equal(m1.index, 0); +assert.equal(g.lastIndex, 1); + +const m2 = g.exec('a a'); +assert.ok(m2); +assert.equal(m2.index, 2); +assert.equal(g.lastIndex, 3); + +const m3 = g.exec('a a'); +assert.equal(m3, null); +assert.equal(g.lastIndex, 0, 'expected lastIndex reset after global miss'); + +const t = createSafeRegex('a', 'g', { engine: 're2js' }); +assert.ok(t); +assert.equal(t.test('a a'), true); +assert.equal(t.lastIndex, 1); +assert.equal(t.test('a a'), true); +assert.equal(t.lastIndex, 3); +assert.equal(t.test('a a'), false); +assert.equal(t.lastIndex, 0); + +const sticky = createSafeRegex('a', 'y', { engine: 're2js' }); +assert.ok(sticky); +sticky.lastIndex = 1; +const sm1 = sticky.exec('ba'); +assert.ok(sm1); +assert.equal(sm1.index, 1); +assert.equal(sticky.lastIndex, 2); +const sm2 = sticky.exec('ba'); +assert.equal(sm2, null); +assert.equal(sticky.lastIndex, 0, 'expected lastIndex reset after sticky miss'); + +const tooLongPattern = createSafeRegex('a'.repeat(20), '', { maxPatternLength: 5, engine: 're2js' }); +assert.equal(tooLongPattern, null, 'expected maxPatternLength to reject pattern'); + +const inputLimit = createSafeRegex('a', 'g', { maxInputLength: 2, engine: 're2js' }); +assert.ok(inputLimit); +assert.equal(inputLimit.exec('aaa'), null); +assert.equal(inputLimit.lastIndex, 0); + +const flagNorm = createSafeRegex('a', 'g', { flags: 'imzzz', engine: 're2js' }); +assert.ok(flagNorm); +assert.ok(flagNorm.flags.includes('i')); +assert.ok(flagNorm.flags.includes('m')); +assert.ok(!flagNorm.flags.includes('z')); + +const forcedRe2js = createSafeRegex('a', '', { engine: 're2js' }); +assert.ok(forcedRe2js); +assert.equal(forcedRe2js.engine, 're2js'); + +const auto = createSafeRegex('a', '', { engine: 'auto' }); +assert.ok(auto); +assert.ok(['re2', 're2js'].includes(auto.engine)); + +const nativeAvailable = isNativeRe2Available(); +let sawWarn = false; +const originalWarn = console.warn; +console.warn = () => { + sawWarn = true; +}; +const forcedRe2 = createSafeRegex('a', '', { engine: 're2' }); +console.warn = originalWarn; +assert.ok(forcedRe2); +if (nativeAvailable) { + assert.equal(forcedRe2.engine, 're2'); + assert.equal(sawWarn, false); +} else { + assert.equal(forcedRe2.engine, 're2js'); + assert.equal(sawWarn, true); +} + +console.log('safe regex engine test passed'); diff --git a/tests/scip-ingest.js b/tests/scip-ingest.js new file mode 100644 index 000000000..3c6bc5252 --- /dev/null +++ b/tests/scip-ingest.js @@ -0,0 +1,47 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'scip-ingest'); +const repoRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const inputPath = path.join(root, 'tests', 'fixtures', 'scip', 'index.json'); +const outPath = path.join(tempRoot, 
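+// the ingest tool writes JSONL occurrences to this path and a .meta.json sidecar with counts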
'scip.jsonl'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); + +const result = spawnSync( + process.execPath, + [path.join(root, 'tools', 'scip-ingest.js'), '--repo', repoRoot, '--input', inputPath, '--out', outPath, '--json'], + { encoding: 'utf8' } +); +if (result.status !== 0) { + console.error(result.stderr || result.stdout || 'scip-ingest failed'); + process.exit(result.status ?? 1); +} + +if (!fs.existsSync(outPath)) { + console.error('scip output not found'); + process.exit(1); +} + +const lines = fs.readFileSync(outPath, 'utf8').trim().split(/\r?\n/).filter(Boolean); +assert.ok(lines.length >= 2, 'expected scip output lines'); + +const first = JSON.parse(lines[0]); +assert.equal(first.file, 'src/example.js'); +assert.equal(first.name, 'doThing'); +assert.equal(first.role, 'definition'); +assert.equal(first.startLine, 2); + +const metaPath = `${outPath}.meta.json`; +const meta = JSON.parse(fs.readFileSync(metaPath, 'utf8')); +assert.equal(meta.stats.occurrences, lines.length); +assert.equal(meta.stats.definitions, 1); +assert.equal(meta.stats.references, 1); + +console.log('scip ingest test passed'); diff --git a/tests/script-coverage-harness.js b/tests/script-coverage-harness.js new file mode 100644 index 000000000..d86910c2b --- /dev/null +++ b/tests/script-coverage-harness.js @@ -0,0 +1,22 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { applyActionCoverage, createCoverageState, finalizeCoverage, reportCoverage } from './script-coverage/report.js'; + +const unknownState = createCoverageState({ scriptNames: ['build-index'] }); +applyActionCoverage(unknownState, { label: 'unknown', covers: ['missing-script'] }); +const unknownSummary = finalizeCoverage(unknownState); +assert.deepEqual(unknownSummary.unknownCovers, ['missing-script']); +assert.equal(reportCoverage(unknownSummary), false, 'expected unknown covers to fail report'); + +const tierMissingState = createCoverageState({ scriptNames: ['build-index'] }); +applyActionCoverage(tierMissingState, { label: 'tier-missing', covers: ['build-index'] }); +const tierMissingSummary = finalizeCoverage(tierMissingState); +assert.equal(tierMissingSummary.missingTierB.length, 1, 'expected tier B to remain missing without override'); + +const tierOverrideState = createCoverageState({ scriptNames: ['build-index'] }); +applyActionCoverage(tierOverrideState, { label: 'tier-override', coversTierB: ['build-index'] }); +const tierOverrideSummary = finalizeCoverage(tierOverrideState); +assert.equal(tierOverrideSummary.missingTierB.length, 0, 'expected tier B override to satisfy coverage'); +assert.equal(tierOverrideSummary.coveredTierB.length, 1, 'expected tier B override to mark covered'); + +console.log('script coverage harness test passed'); diff --git a/tests/script-coverage.js b/tests/script-coverage.js index 20b94417e..46ab0cda1 100644 --- a/tests/script-coverage.js +++ b/tests/script-coverage.js @@ -1,410 +1,69 @@ #!/usr/bin/env node -import fs from 'node:fs'; -import fsPromises from 'node:fs/promises'; -import path from 'node:path'; -import { spawnSync } from 'node:child_process'; +import { createCli } from '../src/shared/cli.js'; +import { buildActions } from './script-coverage/actions.js'; +import { loadPackageScripts, resolveScriptCoveragePaths } from './script-coverage/paths.js'; +import { applyActionCoverage, applyDefaultSkips, createCoverageState, finalizeCoverage, reportCoverage } from './script-coverage/report.js'; +import { 
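+// orchestration only; the action list, coverage state, and runners moved into the tests/script-coverage/ modules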
createCommandRunner, prepareCoverageDirs, resolveRetries, runShellScripts } from './script-coverage/runner.js'; const root = process.cwd(); -const pkg = JSON.parse(fs.readFileSync(path.join(root, 'package.json'), 'utf8')); -const scripts = pkg.scripts || {}; +const argv = createCli({ + scriptName: 'script-coverage', + options: { + retries: { type: 'number', default: 2 }, + 'log-dir': { type: 'string', default: '' } + } +}).parse(); +const envRetries = Number.parseInt( + process.env.PAIROFCLEATS_TEST_RETRIES ?? process.env.npm_config_test_retries ?? '', + 10 +); +const retries = resolveRetries({ argvRetries: argv.retries, envRetries, defaultRetries: 2 }); +const logDirOverride = argv['log-dir'] + || process.env.PAIROFCLEATS_TEST_LOG_DIR + || process.env.npm_config_test_log_dir + || ''; + +const { + baseCacheRoot, + repoCacheRoot, + fixtureRoot, + failureLogRoot, + ciOutDir, + mergeDir +} = resolveScriptCoveragePaths({ root, logDirOverride }); + +const scripts = loadPackageScripts(root); const scriptNames = Object.keys(scripts); -const coverage = new Map(scriptNames.map((name) => [name, { status: 'pending', via: null, reason: null }])); +const coverageState = createCoverageState({ scriptNames }); -if (coverage.has('script-coverage-test')) { - coverage.set('script-coverage-test', { status: 'covered', via: 'self', reason: null }); -} -if (coverage.has('test-all')) { - markSkipped('test-all', 'aggregates script-coverage-test and bench'); -} - -const baseCacheRoot = path.join(root, 'tests', '.cache', 'script-coverage'); -const repoCacheRoot = path.join(baseCacheRoot, 'repo'); -const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); const repoEnv = { ...process.env, PAIROFCLEATS_CACHE_ROOT: repoCacheRoot, PAIROFCLEATS_EMBEDDINGS: 'stub' }; -await fsPromises.rm(baseCacheRoot, { recursive: true, force: true }); -await fsPromises.mkdir(repoCacheRoot, { recursive: true }); - -function markCovered(name, via) { - if (!coverage.has(name)) return; - const entry = coverage.get(name); - if (entry.status === 'pending') { - coverage.set(name, { status: 'covered', via, reason: null }); - } -} - -function markSkipped(name, reason) { - if (!coverage.has(name)) return; - coverage.set(name, { status: 'skipped', via: null, reason }); -} - -function run(label, cmd, args, options = {}) { - const result = spawnSync(cmd, args, { stdio: 'inherit', ...options }); - if (result.status !== 0) { - console.error(`Failed: ${label}`); - process.exit(result.status ?? 
1); - } -} - -function runNode(label, scriptPath, args = [], options = {}) { - run(label, process.execPath, [scriptPath, ...args], options); -} - -const ciOutDir = path.join(baseCacheRoot, 'ci-artifacts'); - -const actions = [ - { - label: 'download-dicts-test', - run: () => runNode('download-dicts-test', path.join(root, 'tests', 'download-dicts.js')), - covers: ['download-dicts', 'download-dicts-test'] - }, - { - label: 'download-extensions-test', - run: () => runNode('download-extensions-test', path.join(root, 'tests', 'download-extensions.js')), - covers: ['download-extensions', 'verify-extensions', 'download-extensions-test'] - }, - { - label: 'tooling-detect-test', - run: () => runNode('tooling-detect-test', path.join(root, 'tests', 'tooling-detect.js')), - covers: ['tooling-detect', 'tooling-detect-test'] - }, - { - label: 'tooling-install-test', - run: () => runNode('tooling-install-test', path.join(root, 'tests', 'tooling-install.js')), - covers: ['tooling-install', 'tooling-install-test'] - }, - { - label: 'clean-artifacts-test', - run: () => runNode('clean-artifacts-test', path.join(root, 'tests', 'clean-artifacts.js')), - covers: ['clean-artifacts', 'clean-artifacts-test'] - }, - { - label: 'uninstall-test', - run: () => runNode('uninstall-test', path.join(root, 'tests', 'uninstall.js')), - covers: ['uninstall', 'uninstall-test'] - }, - { - label: 'sqlite-incremental-test', - run: () => runNode('sqlite-incremental-test', path.join(root, 'tests', 'sqlite-incremental.js')), - covers: ['sqlite-incremental-test'] - }, - { - label: 'sqlite-compact-test', - run: () => runNode('sqlite-compact-test', path.join(root, 'tests', 'sqlite-compact.js')), - covers: ['sqlite-compact-test', 'compact-sqlite-index'] - }, - { - label: 'sqlite-ann-extension-test', - run: () => runNode('sqlite-ann-extension-test', path.join(root, 'tests', 'sqlite-ann-extension.js')), - covers: ['sqlite-ann-extension-test'] - }, - { - label: 'language-fidelity-test', - run: () => runNode('language-fidelity-test', path.join(root, 'tests', 'language-fidelity.js')), - covers: ['language-fidelity-test'] - }, - { - label: 'type-inference-crossfile-test', - run: () => runNode('type-inference-crossfile-test', path.join(root, 'tests', 'type-inference-crossfile.js')), - covers: ['type-inference-crossfile-test'] - }, - { - label: 'type-inference-crossfile-go', - run: () => runNode('type-inference-crossfile-go', path.join(root, 'tests', 'type-inference-crossfile-go.js')), - covers: [] - }, - { - label: 'format-fidelity-test', - run: () => runNode('format-fidelity-test', path.join(root, 'tests', 'format-fidelity.js')), - covers: ['format-fidelity-test'] - }, - { - label: 'compare-models-test', - run: () => runNode('compare-models-test', path.join(root, 'tests', 'compare-models.js')), - covers: ['compare-models-test', 'compare-models'] - }, - { - label: 'summary-report-test', - run: () => runNode('summary-report-test', path.join(root, 'tests', 'summary-report.js')), - covers: ['summary-report-test', 'summary-report'] - }, - { - label: 'docs-consistency-test', - run: () => runNode('docs-consistency-test', path.join(root, 'tests', 'docs-consistency.js')), - covers: ['docs-consistency-test'] - }, - { - label: 'repometrics-dashboard-test', - run: () => runNode('repometrics-dashboard-test', path.join(root, 'tests', 'repometrics-dashboard.js')), - covers: ['repometrics-dashboard-test', 'repometrics-dashboard'] - }, - { - label: 'triage-test', - run: () => runNode('triage-test', path.join(root, 'tests', 'triage-records.js')), - covers: 
['triage-test'] - }, - { - label: 'mcp-server-test', - run: () => runNode('mcp-server-test', path.join(root, 'tests', 'mcp-server.js')), - covers: ['mcp-server-test', 'mcp-server'] - }, - { - label: 'git-hooks-test', - run: () => runNode('git-hooks-test', path.join(root, 'tests', 'git-hooks.js')), - covers: ['git-hooks-test', 'git-hooks'] - }, - { - label: 'git-meta-test', - run: () => runNode('git-meta-test', path.join(root, 'tests', 'git-meta.js')), - covers: [] - }, - { - label: 'churn-filter-test', - run: () => runNode('churn-filter-test', path.join(root, 'tests', 'churn-filter.js')), - covers: [] - }, - { - label: 'search-filters-test', - run: () => runNode('search-filters-test', path.join(root, 'tests', 'search-filters.js')), - covers: ['search-filters-test'] - }, - { - label: 'unicode-offset-test', - run: () => runNode('unicode-offset-test', path.join(root, 'tests', 'unicode-offset.js')), - covers: [] - }, - { - label: 'repo-root-test', - run: () => runNode('repo-root-test', path.join(root, 'tests', 'repo-root.js')), - covers: [] - }, - { - label: 'file-size-guard-test', - run: () => runNode('file-size-guard-test', path.join(root, 'tests', 'file-size-guard.js')), - covers: [] - }, - { - label: 'ts-jsx-fixtures', - run: () => runNode('ts-jsx-fixtures', path.join(root, 'tests', 'ts-jsx-fixtures.js')), - covers: [] - }, - { - label: 'python-fallback-test', - run: () => runNode('python-fallback-test', path.join(root, 'tests', 'python-fallback.js')), - covers: [] - }, - { - label: 'verify', - run: () => runNode('verify', path.join(root, 'tests', 'smoke.js')), - covers: ['verify'] - }, - { - label: 'fixture-smoke', - run: () => runNode('fixture-smoke', path.join(root, 'tests', 'fixture-smoke.js')), - covers: ['fixture-smoke', 'build-index', 'build-sqlite-index', 'search'] - }, - { - label: 'fixture-empty', - run: () => runNode('fixture-empty', path.join(root, 'tests', 'fixture-empty.js')), - covers: [] - }, - { - label: 'fixture-parity', - run: () => runNode('fixture-parity', path.join(root, 'tests', 'fixture-parity.js')), - covers: ['fixture-parity'] - }, - { - label: 'fixture-eval', - run: () => runNode('fixture-eval', path.join(root, 'tests', 'fixture-eval.js')), - covers: ['fixture-eval'] - }, - { - label: 'query-cache-test', - run: () => runNode('query-cache-test', path.join(root, 'tests', 'query-cache.js')), - covers: ['query-cache-test'] - }, - { - label: 'repo-build-index', - run: () => runNode('build-index', path.join(root, 'build_index.js'), ['--stub-embeddings', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['build-index'] - }, - { - label: 'repo-build-sqlite-index', - run: () => runNode('build-sqlite-index', path.join(root, 'tools', 'build-sqlite-index.js'), ['--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['build-sqlite-index'] - }, - { - label: 'parity', - run: () => runNode( - 'parity', - path.join(root, 'tests', 'parity.js'), - ['--search', path.join(root, 'search.js'), '--no-ann'], - { cwd: fixtureRoot, env: repoEnv } - ), - covers: ['parity'] - }, - { - label: 'repo-search', - run: () => runNode('search', path.join(root, 'search.js'), ['message', '--json', '--no-ann', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['search'] - }, - { - label: 'search-sqlite', - run: () => runNode('search-sqlite', path.join(root, 'tools', 'search-sqlite.js'), ['message', '--json', '--no-ann', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['search-sqlite'] - }, - { - label: 'report-artifacts', 
- run: () => runNode('report-artifacts', path.join(root, 'tools', 'report-artifacts.js'), ['--json', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['report-artifacts'] - }, - { - label: 'cache-gc-test', - run: () => runNode('cache-gc-test', path.join(root, 'tests', 'cache-gc.js')), - covers: ['cache-gc', 'cache-gc-test'] - }, - { - label: 'generate-repo-dict', - run: () => runNode('generate-repo-dict', path.join(root, 'tools', 'generate-repo-dict.js'), ['--min-count', '1', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['generate-repo-dict'] - }, - { - label: 'ci-build', - run: () => runNode('ci-build', path.join(root, 'tools', 'ci-build-artifacts.js'), ['--out', ciOutDir, '--skip-build', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['ci-build'] - }, - { - label: 'ci-restore', - run: () => runNode('ci-restore', path.join(root, 'tools', 'ci-restore-artifacts.js'), ['--from', ciOutDir, '--force', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), - covers: ['ci-restore'] - }, - { - label: 'bootstrap', - run: () => runNode( - 'bootstrap', - path.join(root, 'tools', 'bootstrap.js'), - ['--skip-install', '--skip-dicts', '--skip-index', '--skip-artifacts', '--skip-tooling', '--repo', fixtureRoot], - { cwd: fixtureRoot, env: repoEnv } - ), - covers: ['bootstrap'] - }, - { - label: 'setup-test', - run: () => runNode('setup-test', path.join(root, 'tests', 'setup.js')), - covers: ['setup', 'setup-test'] - }, - { - label: 'config-validate-test', - run: () => runNode('config-validate-test', path.join(root, 'tests', 'config-validate.js')), - covers: ['config-validate', 'config-validate-test'] - }, - { - label: 'cli-test', - run: () => runNode('cli-test', path.join(root, 'tests', 'cli.js')), - covers: ['cli-test'] - } -]; - -const mergeDir = path.join(baseCacheRoot, 'merge'); -await fsPromises.mkdir(mergeDir, { recursive: true }); -const mergeBase = path.join(mergeDir, 'base.txt'); -const mergeTarget = path.join(mergeDir, 'target.txt'); -await fsPromises.writeFile(mergeBase, 'alpha\nbeta\n'); -await fsPromises.writeFile(mergeTarget, 'beta\ngamma\n'); - -actions.push({ - label: 'merge-history', - run: () => runNode('merge-history', path.join(root, 'tools', 'mergeSearchHistory.js'), [mergeBase, mergeTarget]), - covers: ['merge-history'] -}); -actions.push({ - label: 'merge-no-results', - run: () => runNode('merge-no-results', path.join(root, 'tools', 'mergeNoResultQueries.js'), [mergeBase, mergeTarget]), - covers: ['merge-no-results'] +await prepareCoverageDirs({ baseCacheRoot, repoCacheRoot, failureLogRoot }); +const { run, runNode } = createCommandRunner({ retries, failureLogRoot }); + +const actions = await buildActions({ + root, + fixtureRoot, + repoEnv, + baseCacheRoot, + ciOutDir, + mergeDir, + runNode }); for (const action of actions) { console.log(`[script-coverage] ${action.label}`); action.run(); - for (const name of action.covers) { - markCovered(name, action.label); - } -} - -markSkipped('download-models', 'requires network model download'); -markSkipped('bench', 'benchmarks are long-running'); -markSkipped('bench-ann', 'benchmarks are long-running'); -markSkipped('watch-index', 'watch mode runs until interrupted'); -markSkipped('format', 'modifies working tree'); -markSkipped('lint', 'requires npm install and project lint config'); - -const shellScripts = [ - path.join(root, 'merge-history.sh'), - path.join(root, 'merge-no-results.sh'), - path.join(root, 'merge-metrics.sh'), - path.join(root, 'tools', 
'merge-history.sh'), - path.join(root, 'tools', 'merge-no-results.sh'), - path.join(root, 'tools', 'merge-metrics.sh'), - path.join(root, 'tools', 'merge-agentinfo-notes.sh'), - path.join(root, 'tools', 'merge-agentinfo-index.sh') -]; - -const bashCheck = spawnSync('bash', ['-c', 'echo ok'], { encoding: 'utf8' }); -const bashAvailable = bashCheck.status === 0; -const jqCheck = bashAvailable ? spawnSync('bash', ['-c', 'command -v jq'], { encoding: 'utf8' }) : null; -const jqAvailable = jqCheck && jqCheck.status === 0; -const toPosixPath = (value) => (process.platform === 'win32' ? value.replace(/\\/g, '/') : value); -const bashPathCheck = bashAvailable - ? spawnSync('bash', ['-c', `cd "${toPosixPath(root)}"`], { encoding: 'utf8' }) - : null; -const bashAccessible = bashPathCheck && bashPathCheck.status === 0; - -if (bashAvailable && bashAccessible) { - const shellWorkDir = path.join(baseCacheRoot, 'shell'); - await fsPromises.mkdir(shellWorkDir, { recursive: true }); - const base = path.join(shellWorkDir, 'base.json'); - const ours = path.join(shellWorkDir, 'ours.json'); - const theirs = path.join(shellWorkDir, 'theirs.json'); - await fsPromises.writeFile(base, JSON.stringify({ file: { md: 1, code: 1 } }, null, 2)); - await fsPromises.writeFile(ours, JSON.stringify({ file: { md: 2, code: 0 } }, null, 2)); - await fsPromises.writeFile(theirs, JSON.stringify({ file: { md: 3, code: 2 } }, null, 2)); - - for (const scriptPath of shellScripts) { - if (!fs.existsSync(scriptPath)) continue; - if (scriptPath.endsWith('merge-metrics.sh') && !jqAvailable) { - console.log(`[skip] ${scriptPath} (jq not available)`); - continue; - } - const args = [scriptPath, base, ours, theirs].map(toPosixPath); - run('shell-script', 'bash', args, { cwd: root }); - } -} else if (!bashAvailable) { - console.log('[skip] shell scripts (bash not available)'); -} else { - console.log('[skip] shell scripts (bash cannot access workspace path)'); -} - -const missing = []; -const skipped = []; -const covered = []; -for (const [name, entry] of coverage.entries()) { - if (entry.status === 'pending') missing.push(name); - if (entry.status === 'skipped') skipped.push({ name, reason: entry.reason }); - if (entry.status === 'covered') covered.push({ name, via: entry.via }); + applyActionCoverage(coverageState, action); } -if (missing.length) { - console.error(`Missing coverage for: ${missing.join(', ')}`); - process.exit(1); -} +await runShellScripts({ root, baseCacheRoot, run }); -console.log(`script coverage: ${covered.length} covered, ${skipped.length} skipped`); -if (skipped.length) { - for (const entry of skipped) { - console.log(`- skipped ${entry.name}: ${entry.reason}`); - } -} +applyDefaultSkips(coverageState); +const summary = finalizeCoverage(coverageState); +const ok = reportCoverage(summary); +if (!ok) process.exit(1); diff --git a/tests/script-coverage/actions.js b/tests/script-coverage/actions.js new file mode 100644 index 000000000..aa92096d5 --- /dev/null +++ b/tests/script-coverage/actions.js @@ -0,0 +1,1096 @@ +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; + +export const buildActions = async (context) => { + const { root, fixtureRoot, repoEnv, baseCacheRoot, runNode } = context; + const ciOutDir = context.ciOutDir || path.join(baseCacheRoot, 'ci-artifacts'); + +const actions = [ + { + label: 'download-dicts-test', + run: () => runNode('download-dicts-test', path.join(root, 'tests', 'download-dicts.js')), + covers: ['download-dicts', 'download-dicts-test'] + }, + { + label: 
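+    // Each entry follows the same contract: { label, run, covers, coversTierB? }.
+    // `covers` lists the package.json script names the action marks as
+    // exercised (see report.js); `coversTierB` additionally satisfies the
+    // Tier B build-script requirement.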
'download-extensions-test', + run: () => runNode('download-extensions-test', path.join(root, 'tests', 'download-extensions.js')), + covers: ['download-extensions', 'verify-extensions', 'download-extensions-test'] + }, + { + label: 'vector-extension-sanitize-test', + run: () => runNode('vector-extension-sanitize-test', path.join(root, 'tests', 'vector-extension-sanitize.js')), + covers: ['vector-extension-sanitize-test'] + }, + { + label: 'tooling-detect-test', + run: () => runNode('tooling-detect-test', path.join(root, 'tests', 'tooling-detect.js')), + covers: ['tooling-detect', 'tooling-detect-test'] + }, + { + label: 'tooling-install-test', + run: () => runNode('tooling-install-test', path.join(root, 'tests', 'tooling-install.js')), + covers: ['tooling-install', 'tooling-install-test'] + }, + { + label: 'clean-artifacts-test', + run: () => runNode('clean-artifacts-test', path.join(root, 'tests', 'clean-artifacts.js')), + covers: ['clean-artifacts', 'clean-artifacts-test'] + }, + { + label: 'uninstall-test', + run: () => runNode('uninstall-test', path.join(root, 'tests', 'uninstall.js')), + covers: ['uninstall', 'uninstall-test'] + }, + { + label: 'sqlite-incremental-test', + run: () => runNode('sqlite-incremental-test', path.join(root, 'tests', 'sqlite-incremental.js')), + covers: ['sqlite-incremental-test'] + }, + { + label: 'sqlite-incremental-no-change-test', + run: () => runNode('sqlite-incremental-no-change-test', path.join(root, 'tests', 'sqlite-incremental-no-change.js')), + covers: ['sqlite-incremental-no-change-test'] + }, + { + label: 'sqlite-bundle-missing-test', + run: () => runNode('sqlite-bundle-missing-test', path.join(root, 'tests', 'sqlite-bundle-missing.js')), + covers: ['sqlite-bundle-missing-test'] + }, + { + label: 'sqlite-index-state-fail-closed-test', + run: () => runNode('sqlite-index-state-fail-closed-test', path.join(root, 'tests', 'sqlite-index-state-fail-closed.js')), + covers: ['sqlite-index-state-fail-closed-test'] + }, + { + label: 'artifact-size-guardrails-test', + run: () => runNode('artifact-size-guardrails-test', path.join(root, 'tests', 'artifact-size-guardrails.js')), + covers: ['artifact-size-guardrails-test'] + }, + { + label: 'chunk-meta-jsonl-cleanup-test', + run: () => runNode('chunk-meta-jsonl-cleanup-test', path.join(root, 'tests', 'chunk-meta-jsonl-cleanup.js')), + covers: ['chunk-meta-jsonl-cleanup-test'] + }, + { + label: 'safe-regex-engine-test', + run: () => runNode('safe-regex-engine-test', path.join(root, 'tests', 'safe-regex-engine.js')), + covers: ['safe-regex-engine-test'] + }, + + { + label: 'incremental-manifest-test', + run: () => runNode('incremental-manifest-test', path.join(root, 'tests', 'incremental-manifest.js')), + covers: ['incremental-manifest-test'] + }, + { + label: 'index-lock-test', + run: () => runNode('index-lock-test', path.join(root, 'tests', 'index-lock.js')), + covers: ['index-lock-test'] + }, + { + label: 'sqlite-compact-test', + run: () => runNode('sqlite-compact-test', path.join(root, 'tests', 'sqlite-compact.js')), + covers: ['sqlite-compact-test', 'compact-sqlite-index'], + coversTierB: ['compact-sqlite-index'] + }, + { + label: 'sqlite-sidecar-cleanup-test', + run: () => runNode('sqlite-sidecar-cleanup-test', path.join(root, 'tests', 'sqlite-sidecar-cleanup.js')), + covers: ['sqlite-sidecar-cleanup-test'] + }, + { + label: 'sqlite-ann-extension-test', + run: () => runNode('sqlite-ann-extension-test', path.join(root, 'tests', 'sqlite-ann-extension.js')), + covers: ['sqlite-ann-extension-test'] + }, + { + 
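+    // sqlite ANN lane: the extension test sits above; candidate-set and
+    // build-manifest/vocab/delete checks follow.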
label: 'sqlite-vec-candidate-set-test', + run: () => runNode('sqlite-vec-candidate-set-test', path.join(root, 'tests', 'sqlite-vec-candidate-set.js')), + covers: ['sqlite-vec-candidate-set-test'] + }, + { + label: 'sqlite-build-manifest-test', + run: () => runNode('sqlite-build-manifest-test', path.join(root, 'tests', 'sqlite-build-manifest.js')), + covers: ['sqlite-build-manifest-test'] + }, + { + label: 'sqlite-build-vocab-test', + run: () => runNode('sqlite-build-vocab-test', path.join(root, 'tests', 'sqlite-build-vocab.js')), + covers: ['sqlite-build-vocab-test'] + }, + { + label: 'sqlite-build-delete-test', + run: () => runNode('sqlite-build-delete-test', path.join(root, 'tests', 'sqlite-build-delete.js')), + covers: ['sqlite-build-delete-test'] + }, + { + label: 'hnsw-ann-test', + run: () => runNode('hnsw-ann-test', path.join(root, 'tests', 'hnsw-ann.js')), + covers: ['hnsw-ann-test'] + }, + { + label: 'hnsw-atomic-test', + run: () => runNode('hnsw-atomic-test', path.join(root, 'tests', 'hnsw-atomic.js')), + covers: ['hnsw-atomic-test'] + }, + { + label: 'minhash-parity-test', + run: () => runNode('minhash-parity-test', path.join(root, 'tests', 'minhash-parity.js')), + covers: ['minhash-parity-test'] + }, + { + label: 'language-fidelity-test', + run: () => runNode('language-fidelity-test', path.join(root, 'tests', 'language-fidelity.js')), + covers: ['language-fidelity-test'] + }, + { + label: 'metadata-v2-test', + run: () => runNode('metadata-v2-test', path.join(root, 'tests', 'metadata-v2.js')), + covers: ['metadata-v2-test'] + }, + { + label: 'chunking-limits-test', + run: () => runNode('chunking-limits-test', path.join(root, 'tests', 'chunking-limits.js')), + covers: ['chunking-limits-test'] + }, + { + label: 'graph-chunk-id-test', + run: () => runNode('graph-chunk-id-test', path.join(root, 'tests', 'graph-chunk-id.js')), + covers: ['graph-chunk-id-test'] + }, + { + label: 'sqlite-chunk-id-test', + run: () => runNode('sqlite-chunk-id-test', path.join(root, 'tests', 'sqlite-chunk-id.js')), + covers: ['sqlite-chunk-id-test'] + }, + { + label: 'kotlin-perf-guard-test', + run: () => runNode('kotlin-perf-guard-test', path.join(root, 'tests', 'kotlin-perf-guard.js')), + covers: ['kotlin-perf-guard-test'] + }, + { + label: 'tree-sitter-chunks-test', + run: () => runNode('tree-sitter-chunks-test', path.join(root, 'tests', 'tree-sitter-chunks.js')), + covers: ['tree-sitter-chunks-test'] + }, + { + label: 'type-inference-crossfile-go', + run: () => runNode('type-inference-crossfile-go', path.join(root, 'tests', 'type-inference-crossfile-go.js')), + covers: [] + }, + { + label: 'type-inference-crossfile-test', + run: () => runNode('type-inference-crossfile-test', path.join(root, 'tests', 'type-inference-crossfile.js')), + covers: ['type-inference-crossfile-test'] + }, + { + label: 'type-inference-lsp-enrichment-test', + run: () => runNode('type-inference-lsp-enrichment-test', path.join(root, 'tests', 'type-inference-lsp-enrichment.js')), + covers: ['type-inference-lsp-enrichment-test'] + }, + { + label: 'type-inference-typescript-provider-no-ts', + run: () => runNode('type-inference-typescript-provider-no-ts', path.join(root, 'tests', 'type-inference-typescript-provider-no-ts.js')), + covers: [] + }, + { + label: 'type-inference-clangd-provider-no-clangd', + run: () => runNode('type-inference-clangd-provider-no-clangd', path.join(root, 'tests', 'type-inference-clangd-provider-no-clangd.js')), + covers: [] + }, + { + label: 'type-inference-sourcekit-provider-no-sourcekit', + run: () => 
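+      // As with the no-ts and no-clangd lanes above, this presumably asserts
+      // a graceful fallback when the sourcekit binary is absent.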
runNode('type-inference-sourcekit-provider-no-sourcekit', path.join(root, 'tests', 'type-inference-sourcekit-provider-no-sourcekit.js')), + covers: [] + }, + { + label: 'format-fidelity-test', + run: () => runNode('format-fidelity-test', path.join(root, 'tests', 'format-fidelity.js')), + covers: ['format-fidelity-test'] + }, + { + label: 'chunking-yaml-test', + run: () => runNode('chunking-yaml-test', path.join(root, 'tests', 'chunking-yaml.js')), + covers: [] + }, + { + label: 'chunking-sql-lua-test', + run: () => runNode('chunking-sql-lua-test', path.join(root, 'tests', 'chunking-sql-lua.js')), + covers: [] + }, + { + label: 'segment-pipeline-test', + run: () => runNode('segment-pipeline-test', path.join(root, 'tests', 'segment-pipeline.js')), + covers: [] + }, + { + label: 'prose-skip-imports-test', + run: () => runNode('prose-skip-imports-test', path.join(root, 'tests', 'prose-skip-imports.js')), + covers: ['prose-skip-imports-test'] + }, + { + label: 'extracted-prose-test', + run: () => runNode('extracted-prose-test', path.join(root, 'tests', 'extracted-prose.js')), + covers: [] + }, + { + label: 'tokenize-dictionary-test', + run: () => runNode('tokenize-dictionary-test', path.join(root, 'tests', 'tokenize-dictionary.js')), + covers: [] + }, + { + label: 'import-links-test', + run: () => runNode('import-links-test', path.join(root, 'tests', 'import-links.js')), + covers: ['import-links-test'] + }, + { + label: 'git-blame-range-test', + run: () => runNode('git-blame-range-test', path.join(root, 'tests', 'git-blame-range.js')), + covers: ['git-blame-range-test'] + }, + { + label: 'external-docs-test', + run: () => runNode('external-docs-test', path.join(root, 'tests', 'external-docs.js')), + covers: ['external-docs-test'] + }, + { + label: 'tooling-lsp-test', + run: () => runNode('tooling-lsp-test', path.join(root, 'tests', 'tooling-lsp.js')), + covers: [] + }, + { + label: 'lsp-shutdown-test', + run: () => runNode('lsp-shutdown-test', path.join(root, 'tests', 'lsp-shutdown.js')), + covers: ['lsp-shutdown-test'] + }, + { + label: 'bench-language-repos-test', + run: () => runNode('bench-language-repos-test', path.join(root, 'tests', 'bench-language-repos.js')), + covers: ['bench-language-test'] + }, + { + label: 'bench-language-lock-test', + run: () => runNode('bench-language-lock-test', path.join(root, 'tests', 'bench-language-lock.js')), + covers: ['bench-language-lock-test'] + }, + { + label: 'bench-language-progress-parse-test', + run: () => runNode('bench-language-progress-parse-test', path.join(root, 'tests', 'bench-language-progress-parse.js')), + covers: ['bench-language-progress-parse-test'] + }, + { + label: 'bench-language-lock-semantics-test', + run: () => runNode('bench-language-lock-semantics-test', path.join(root, 'tests', 'bench-language-lock-semantics.js')), + covers: ['bench-language-lock-semantics-test'] + }, + { + label: 'retrieval-branch-filter-test', + run: () => runNode('retrieval-branch-filter-test', path.join(root, 'tests', 'retrieval-branch-filter.js')), + covers: ['retrieval-branch-filter-test'] + }, + { + label: 'retrieval-backend-policy-test', + run: () => runNode('retrieval-backend-policy-test', path.join(root, 'tests', 'retrieval-backend-policy.js')), + covers: ['retrieval-backend-policy-test'] + }, + { + label: 'summary-report-test', + run: () => runNode('summary-report-test', path.join(root, 'tests', 'summary-report.js')), + covers: ['summary-report-test', 'summary-report'] + }, + { + label: 'repometrics-dashboard-test', + run: () => 
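+      // Lanes whose covers list both 'x-test' and 'x' (summary-report above,
+      // this one) mark the npm test lane and the tool it drives in one run.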
runNode('repometrics-dashboard-test', path.join(root, 'tests', 'repometrics-dashboard.js')), + covers: ['repometrics-dashboard-test', 'repometrics-dashboard'] + }, + { + label: 'index-validate-test', + run: () => runNode('index-validate-test', path.join(root, 'tests', 'index-validate.js')), + covers: ['index-validate-test', 'index-validate'] + }, + { + label: 'embeddings-validate-test', + run: () => runNode('embeddings-validate-test', path.join(root, 'tests', 'embeddings-validate.js')), + covers: ['embeddings-validate-test'] + }, + { + label: 'triage-test', + run: () => runNode('triage-test', path.join(root, 'tests', 'triage-records.js')), + covers: ['triage-test'] + }, + { + label: 'mcp-server-test', + run: () => runNode('mcp-server-test', path.join(root, 'tests', 'mcp-server.js')), + covers: ['mcp-server-test', 'mcp-server'] + }, + { + label: 'mcp-schema-test', + run: () => runNode('mcp-schema-test', path.join(root, 'tests', 'mcp-schema.js')), + covers: ['mcp-schema-test'] + }, + { + label: 'mcp-robustness-test', + run: () => runNode('mcp-robustness-test', path.join(root, 'tests', 'mcp-robustness.js')), + covers: ['mcp-robustness-test'] + }, + { + label: 'api-server-test', + run: () => runNode('api-server-test', path.join(root, 'tests', 'api-server.js')), + covers: ['api-server-test', 'api-server'] + }, + { + label: 'api-server-stream-test', + run: () => runNode('api-server-stream-test', path.join(root, 'tests', 'api-server-stream.js')), + covers: ['api-server-stream-test'] + }, + { + label: 'indexer-service-test', + run: () => runNode('indexer-service-test', path.join(root, 'tests', 'indexer-service.js')), + covers: ['indexer-service', 'indexer-service-test'] + }, + { + label: 'piece-assembly-test', + run: () => runNode('piece-assembly-test', path.join(root, 'tests', 'piece-assembly.js')), + covers: ['piece-assembly-test'] + }, + { + label: 'compact-pieces-test', + run: () => runNode('compact-pieces-test', path.join(root, 'tests', 'compact-pieces.js')), + covers: ['compact-pieces-test'] + }, + { + label: 'git-hooks-test', + run: () => runNode('git-hooks-test', path.join(root, 'tests', 'git-hooks.js')), + covers: ['git-hooks-test', 'git-hooks'] + }, + { + label: 'git-meta-test', + run: () => runNode('git-meta-test', path.join(root, 'tests', 'git-meta.js')), + covers: [] + }, + { + label: 'churn-filter-test', + run: () => runNode('churn-filter-test', path.join(root, 'tests', 'churn-filter.js')), + covers: [] + }, + { + label: 'search-filters-test', + run: () => runNode('search-filters-test', path.join(root, 'tests', 'search-filters.js')), + covers: ['search-filters-test'] + }, + { + label: 'ctags-ingest-test', + run: () => runNode('ctags-ingest-test', path.join(root, 'tests', 'ctags-ingest.js')), + covers: ['ctags-ingest', 'ctags-ingest-test'] + }, + { + label: 'scip-ingest-test', + run: () => runNode('scip-ingest-test', path.join(root, 'tests', 'scip-ingest.js')), + covers: ['scip-ingest', 'scip-ingest-test'] + }, + { + label: 'lsif-ingest-test', + run: () => runNode('lsif-ingest-test', path.join(root, 'tests', 'lsif-ingest.js')), + covers: ['lsif-ingest', 'lsif-ingest-test'] + }, + { + label: 'gtags-ingest-test', + run: () => runNode('gtags-ingest-test', path.join(root, 'tests', 'gtags-ingest.js')), + covers: ['gtags-ingest', 'gtags-ingest-test'] + }, + { + label: 'structural-search-test', + run: () => runNode('structural-search-test', path.join(root, 'tests', 'structural-search.js')), + covers: ['structural-search', 'structural-search-test'] + }, + { + label: 'structural-filters-test', 
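+    // Companion to structural-search above, presumably scoped to its
+    // filter flags.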
+ run: () => runNode('structural-filters-test', path.join(root, 'tests', 'structural-filters.js')), + covers: ['structural-filters-test'] + }, + { + label: 'lang-filter-test', + run: () => runNode('lang-filter-test', path.join(root, 'tests', 'lang-filter.js')), + covers: ['lang-filter-test'] + }, + { + label: 'sqlite-auto-backend-test', + run: () => runNode('sqlite-auto-backend-test', path.join(root, 'tests', 'sqlite-auto-backend.js')), + covers: ['sqlite-auto-backend-test'] + }, + { + label: 'sqlite-missing-dep-test', + run: () => runNode('sqlite-missing-dep-test', path.join(root, 'tests', 'sqlite-missing-dep.js')), + covers: ['sqlite-missing-dep-test'] + }, + { + label: 'search-explain-test', + run: () => runNode('search-explain-test', path.join(root, 'tests', 'search-explain.js')), + covers: ['search-explain-test'] + }, + { + label: 'search-rrf-test', + run: () => runNode('search-rrf-test', path.join(root, 'tests', 'search-rrf.js')), + covers: ['search-rrf-test'] + }, + { + label: 'artifact-bak-recovery-test', + run: () => runNode('artifact-bak-recovery-test', path.join(root, 'tests', 'artifact-bak-recovery.js')), + covers: ['artifact-bak-recovery-test'] + }, + { + label: 'encoding-hash-test', + run: () => runNode('encoding-hash-test', path.join(root, 'tests', 'encoding-hash.js')), + covers: ['encoding-hash-test'] + }, + { + label: 'embeddings-cache-identity-test', + run: () => runNode('embeddings-cache-identity-test', path.join(root, 'tests', 'embeddings-cache-identity.js')), + covers: ['embeddings-cache-identity-test'] + }, + { + label: 'embeddings-cache-invalidation-test', + run: () => runNode('embeddings-cache-invalidation-test', path.join(root, 'tests', 'embeddings-cache-invalidation.js')), + covers: ['embeddings-cache-invalidation-test'] + }, + { + label: 'embeddings-dims-mismatch-test', + run: () => runNode('embeddings-dims-mismatch-test', path.join(root, 'tests', 'embeddings-dims-mismatch.js')), + covers: ['embeddings-dims-mismatch-test'] + }, + { + label: 'embeddings-dims-validation-test', + run: () => runNode('embeddings-dims-validation-test', path.join(root, 'tests', 'embeddings-dims-validation.js')), + covers: ['embeddings-dims-validation-test'] + }, + { + label: 'embeddings-sqlite-dense-test', + run: () => runNode('embeddings-sqlite-dense-test', path.join(root, 'tests', 'embeddings-sqlite-dense.js')), + covers: ['embeddings-sqlite-dense-test'] + }, + { + label: 'search-topn-filters-test', + run: () => runNode('search-topn-filters-test', path.join(root, 'tests', 'search-topn-filters.js')), + covers: ['search-topn-filters-test'] + }, + { + label: 'search-determinism-test', + run: () => runNode('search-determinism-test', path.join(root, 'tests', 'search-determinism.js')), + covers: ['search-determinism-test'] + }, + { + label: 'filter-index-artifact-test', + run: () => runNode('filter-index-artifact-test', path.join(root, 'tests', 'filter-index-artifact.js')), + covers: ['filter-index-artifact-test'] + }, + { + label: 'search-symbol-boost-test', + run: () => runNode('search-symbol-boost-test', path.join(root, 'tests', 'search-symbol-boost.js')), + covers: ['search-symbol-boost-test'] + }, + { + label: 'vscode-extension-test', + run: () => runNode('vscode-extension-test', path.join(root, 'tests', 'vscode-extension.js')), + covers: ['vscode-extension-test'] + }, + { + label: 'ext-filter-test', + run: () => runNode('ext-filter-test', path.join(root, 'tests', 'ext-filter.js')), + covers: ['ext-filter-test'] + }, + { + label: 'filter-strictness-test', + run: () => 
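+      // Many lanes from here on use covers: [], i.e. regression tests with no
+      // one-to-one package.json script to mark covered.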
runNode('filter-strictness-test', path.join(root, 'tests', 'filter-strictness.js')), + covers: ['filter-strictness-test'] + }, + { + label: 'filter-index-test', + run: () => runNode('filter-index-test', path.join(root, 'tests', 'filter-index.js')), + covers: ['filter-index-test'] + }, + { + label: 'search-missing-index-test', + run: () => runNode('search-missing-index-test', path.join(root, 'tests', 'search-missing-index.js')), + covers: ['search-missing-index-test'] + }, + { + label: 'search-help-test', + run: () => runNode('search-help-test', path.join(root, 'tests', 'search-help.js')), + covers: ['search-help-test'] + }, + { + label: 'search-removed-flags-test', + run: () => runNode('search-removed-flags-test', path.join(root, 'tests', 'search-removed-flags.js')), + covers: [] + }, + { + label: 'search-missing-flag-values-test', + run: () => runNode('search-missing-flag-values-test', path.join(root, 'tests', 'search-missing-flag-values.js')), + covers: [] + }, + { + label: 'search-windows-path-filter-test', + run: () => runNode('search-windows-path-filter-test', path.join(root, 'tests', 'search-windows-path-filter.js')), + covers: [] + }, + { + label: 'search-explain-symbol-test', + run: () => runNode('search-explain-symbol-test', path.join(root, 'tests', 'search-explain-symbol.js')), + covers: [] + }, + { + label: 'unicode-offset-test', + run: () => runNode('unicode-offset-test', path.join(root, 'tests', 'unicode-offset.js')), + covers: [] + }, + { + label: 'repo-root-test', + run: () => runNode('repo-root-test', path.join(root, 'tests', 'repo-root.js')), + covers: [] + }, + { + label: 'tool-root-test', + run: () => runNode('tool-root-test', path.join(root, 'tests', 'tool-root.js')), + covers: [] + }, + { + label: 'file-size-guard-test', + run: () => runNode('file-size-guard-test', path.join(root, 'tests', 'file-size-guard.js')), + covers: [] + }, + { + label: 'file-line-guard-test', + run: () => runNode('file-line-guard-test', path.join(root, 'tests', 'file-line-guard.js')), + covers: [] + }, + { + label: 'skip-minified-binary-test', + run: () => runNode('skip-minified-binary-test', path.join(root, 'tests', 'skip-minified-binary.js')), + covers: [] + }, + { + label: 'read-failure-skip-test', + run: () => runNode('read-failure-skip-test', path.join(root, 'tests', 'read-failure-skip.js')), + covers: [] + }, + { + label: 'encoding-fallback-test', + run: () => runNode('encoding-fallback-test', path.join(root, 'tests', 'encoding-fallback.js')), + covers: [] + }, + { + label: 'incremental-tokenization-cache-test', + run: () => runNode('incremental-tokenization-cache-test', path.join(root, 'tests', 'incremental-tokenization-cache.js')), + covers: [] + }, + { + label: 'tokenization-buffering-test', + run: () => runNode('tokenization-buffering-test', path.join(root, 'tests', 'tokenization-buffering.js')), + covers: [] + }, + { + label: 'postings-quantize-test', + run: () => runNode('postings-quantize-test', path.join(root, 'tests', 'postings-quantize.js')), + covers: [] + }, + { + label: 'embedding-batch-multipliers-test', + run: () => runNode('embedding-batch-multipliers-test', path.join(root, 'tests', 'embedding-batch-multipliers.js')), + covers: [] + }, + { + label: 'typescript-imports-only-test', + run: () => runNode('typescript-imports-only-test', path.join(root, 'tests', 'typescript-imports-only.js')), + covers: [] + }, + { + label: 'import-priority-test', + run: () => runNode('import-priority-test', path.join(root, 'tests', 'import-priority.js')), + covers: [] + }, + { + label: 
'ignore-overrides-test', + run: () => runNode('ignore-overrides-test', path.join(root, 'tests', 'ignore-overrides.js')), + covers: [] + }, + { + label: 'incremental-cache-signature-test', + run: () => runNode('incremental-cache-signature-test', path.join(root, 'tests', 'incremental-cache-signature.js')), + covers: [] + }, + { + label: 'incremental-reuse-test', + run: () => runNode('incremental-reuse-test', path.join(root, 'tests', 'incremental-reuse.js')), + covers: [] + }, + { + label: 'thread-limits-test', + run: () => runNode('thread-limits-test', path.join(root, 'tests', 'thread-limits.js')), + covers: [] + }, + { + label: 'bench-progress-format-test', + run: () => runNode('bench-progress-format-test', path.join(root, 'tests', 'bench-progress-format.js')), + covers: [] + }, + { + label: 'shard-merge-test', + run: () => runNode('shard-merge-test', path.join(root, 'tests', 'shard-merge.js')), + covers: [] + }, + { + label: 'shard-plan-test', + run: () => runNode('shard-plan-test', path.join(root, 'tests', 'shard-plan.js')), + covers: [] + }, + { + label: 'preprocess-files-test', + run: () => runNode('preprocess-files-test', path.join(root, 'tests', 'preprocess-files.js')), + covers: [] + }, + { + label: 'service-queue-test', + run: () => runNode('service-queue-test', path.join(root, 'tests', 'service-queue.js')), + covers: [] + }, + { + label: 'build-embeddings-cache-test', + run: () => runNode('build-embeddings-cache-test', path.join(root, 'tests', 'build-embeddings-cache.js')), + covers: [] + }, + { + label: 'embedding-batch-autotune-test', + run: () => runNode('embedding-batch-autotune-test', path.join(root, 'tests', 'embedding-batch-autotune.js')), + covers: [] + }, + { + label: 'sqlite-build-indexes-test', + run: () => runNode('sqlite-build-indexes-test', path.join(root, 'tests', 'sqlite-build-indexes.js')), + covers: [] + }, + { + label: 'lmdb-backend-test', + run: () => runNode('lmdb-backend-test', path.join(root, 'tests', 'lmdb-backend.js')), + covers: ['build-lmdb-index', 'lmdb-backend-test'], + coversTierB: ['build-lmdb-index'] + }, + { + label: 'two-stage-state-test', + run: () => runNode('two-stage-state-test', path.join(root, 'tests', 'two-stage-state.js')), + covers: [] + }, + { + label: 'ts-jsx-fixtures', + run: () => runNode('ts-jsx-fixtures', path.join(root, 'tests', 'ts-jsx-fixtures.js')), + covers: [] + }, + { + label: 'python-heuristic-chunking-test', + run: () => runNode( + 'python-heuristic-chunking-test', + path.join(root, 'tests', 'lang', 'python-heuristic-chunking.test.js') + ), + covers: [] + }, + { + label: 'python-imports-test', + run: () => runNode( + 'python-imports-test', + path.join(root, 'tests', 'lang', 'python-imports.test.js') + ), + covers: [] + }, + { + label: 'python-pool-test', + run: () => runNode( + 'python-pool-test', + path.join(root, 'tests', 'lang', 'python-pool.test.js') + ), + covers: [] + }, + { + label: 'js-imports-test', + run: () => runNode('js-imports-test', path.join(root, 'tests', 'lang', 'js-imports.test.js')), + covers: [] + }, + { + label: 'js-chunking-test', + run: () => runNode('js-chunking-test', path.join(root, 'tests', 'lang', 'js-chunking.test.js')), + covers: [] + }, + { + label: 'js-relations-test', + run: () => runNode('js-relations-test', path.join(root, 'tests', 'lang', 'js-relations.test.js')), + covers: [] + }, + { + label: 'chunking-limits-unit-test', + run: () => runNode( + 'chunking-limits-unit-test', + path.join(root, 'tests', 'chunking', 'limits.test.js') + ), + covers: [] + }, + { + label: 
'chunking-yaml-unit-test', + run: () => runNode( + 'chunking-yaml-unit-test', + path.join(root, 'tests', 'chunking', 'yaml.test.js') + ), + covers: [] + }, + { + label: 'chunking-json-unit-test', + run: () => runNode( + 'chunking-json-unit-test', + path.join(root, 'tests', 'chunking', 'json.test.js') + ), + covers: [] + }, + { + label: 'build-runtime-stage-overrides-test', + run: () => runNode( + 'build-runtime-stage-overrides-test', + path.join(root, 'tests', 'build-runtime', 'stage-overrides.test.js') + ), + covers: [] + }, + { + label: 'build-runtime-content-hash-test', + run: () => runNode( + 'build-runtime-content-hash-test', + path.join(root, 'tests', 'build-runtime', 'content-hash.test.js') + ), + covers: [] + }, + { + label: 'indexer-signatures-test', + run: () => runNode( + 'indexer-signatures-test', + path.join(root, 'tests', 'indexer', 'signatures.test.js') + ), + covers: [] + }, + { + label: 'indexer-incremental-plan-test', + run: () => runNode( + 'indexer-incremental-plan-test', + path.join(root, 'tests', 'indexer', 'incremental-plan.test.js') + ), + covers: [] + }, + { + label: 'file-processor-skip-test', + run: () => runNode( + 'file-processor-skip-test', + path.join(root, 'tests', 'file-processor', 'skip.test.js') + ), + covers: [] + }, + { + label: 'file-processor-cached-bundle-test', + run: () => runNode( + 'file-processor-cached-bundle-test', + path.join(root, 'tests', 'file-processor', 'cached-bundle.test.js') + ), + covers: [] + }, + { + label: 'artifacts-token-mode-test', + run: () => runNode( + 'artifacts-token-mode-test', + path.join(root, 'tests', 'artifacts', 'token-mode.test.js') + ), + covers: [] + }, + { + label: 'artifacts-file-meta-test', + run: () => runNode( + 'artifacts-file-meta-test', + path.join(root, 'tests', 'artifacts', 'file-meta.test.js') + ), + covers: [] + }, + { + label: 'language-registry-collectors-test', + run: () => runNode( + 'language-registry-collectors-test', + path.join(root, 'tests', 'language-registry', 'collectors.test.js') + ), + covers: [] + }, + { + label: 'language-registry-selection-test', + run: () => runNode( + 'language-registry-selection-test', + path.join(root, 'tests', 'language-registry', 'selection.test.js') + ), + covers: [] + }, + { + label: 'python-fallback-test', + run: () => runNode('python-fallback-test', path.join(root, 'tests', 'python-fallback.js')), + covers: [] + }, + { + label: 'python-ast-worker-test', + run: () => runNode('python-ast-worker-test', path.join(root, 'tests', 'python-ast-worker.js')), + covers: [] + }, + { + label: 'verify', + run: () => runNode('verify', path.join(root, 'tests', 'smoke.js')), + covers: ['verify'] + }, + { + label: 'fixture-smoke', + run: () => runNode('fixture-smoke', path.join(root, 'tests', 'fixture-smoke.js')), + covers: ['fixture-smoke', 'build-index', 'build-sqlite-index', 'search'], + coversTierB: ['build-index', 'build-sqlite-index'] + }, + { + label: 'fixture-parity', + run: () => runNode('fixture-parity', path.join(root, 'tests', 'fixture-parity.js'), ['--fixtures', 'sample']), + covers: ['fixture-parity'] + }, + { + label: 'fixture-empty', + run: () => runNode('fixture-empty', path.join(root, 'tests', 'fixture-empty.js')), + covers: [] + }, + { + label: 'fixture-eval', + run: () => runNode('fixture-eval', path.join(root, 'tests', 'fixture-eval.js')), + covers: ['fixture-eval'] + }, + { + label: 'eval-quality-test', + run: () => runNode('eval-quality-test', path.join(root, 'tests', 'eval-quality.js')), + covers: ['eval-quality-test', 'eval-run'] + }, + { + label: 
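+    // The *.test.js lanes above live in nested suites (tests/chunking,
+    // tests/indexer, tests/artifacts, ...) and are invoked directly by path.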
'fielded-bm25-test', + run: () => runNode('fielded-bm25-test', path.join(root, 'tests', 'fielded-bm25.js')), + covers: ['fielded-bm25-test'] + }, + { + label: 'artifact-formats-test', + run: () => runNode('artifact-formats-test', path.join(root, 'tests', 'artifact-formats.js')), + covers: ['artifact-formats-test'] + }, + { + label: 'query-intent-test', + run: () => runNode('query-intent-test', path.join(root, 'tests', 'query-intent.js')), + covers: ['query-intent-test'] + }, + { + label: 'context-expansion-test', + run: () => runNode('context-expansion-test', path.join(root, 'tests', 'context-expansion.js')), + covers: ['context-expansion-test'] + }, + { + label: 'query-cache-test', + run: () => runNode('query-cache-test', path.join(root, 'tests', 'query-cache.js')), + covers: ['query-cache-test'] + }, + { + label: 'json-stream-test', + run: () => runNode('json-stream-test', path.join(root, 'tests', 'json-stream.js')), + covers: ['json-stream-test'] + }, + { + label: 'index-cache-test', + run: () => runNode('index-cache-test', path.join(root, 'tests', 'index-cache.js')), + covers: ['index-cache-test'] + }, + { + label: 'sqlite-cache-test', + run: () => runNode('sqlite-cache-test', path.join(root, 'tests', 'sqlite-cache.js')), + covers: ['sqlite-cache-test'] + }, + { + label: 'worker-pool-test', + run: () => runNode('worker-pool-test', path.join(root, 'tests', 'worker-pool.js')), + covers: ['worker-pool-test'] + }, + { + label: 'worker-pool-windows-test', + run: () => runNode('worker-pool-windows-test', path.join(root, 'tests', 'worker-pool-windows.js')), + covers: ['worker-pool-windows-test'] + }, + { + label: 'repo-build-index', + run: () => runNode('build-index', path.join(root, 'build_index.js'), ['--stub-embeddings', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), + covers: ['build-index'] + }, + { + label: 'build-index-all-test', + run: () => runNode('build-index-all-test', path.join(root, 'tests', 'build-index-all.js')), + covers: ['build-index-all-test'] + }, + { + label: 'repo-build-sqlite-index', + run: () => runNode('build-sqlite-index', path.join(root, 'tools', 'build-sqlite-index.js'), ['--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), + covers: ['build-sqlite-index'] + }, + { + label: 'parity', + run: () => runNode( + 'parity', + path.join(root, 'tests', 'parity.js'), + ['--search', path.join(root, 'search.js'), '--no-ann'], + { cwd: fixtureRoot, env: repoEnv } + ), + covers: ['parity'] + }, + { + label: 'repo-search', + run: () => runNode('search', path.join(root, 'search.js'), ['message', '--json', '--no-ann', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), + covers: ['search'] + }, + { + label: 'report-artifacts', + run: () => runNode('report-artifacts', path.join(root, 'tools', 'report-artifacts.js'), ['--json', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), + covers: ['report-artifacts'] + }, + { + label: 'cache-gc-test', + run: () => runNode('cache-gc-test', path.join(root, 'tests', 'cache-gc.js')), + covers: ['cache-gc', 'cache-gc-test'] + }, + { + label: 'cache-lru-test', + run: () => runNode('cache-lru-test', path.join(root, 'tests', 'cache-lru.js')), + covers: ['cache-lru-test'] + }, + { + label: 'discover-test', + run: () => runNode('discover-test', path.join(root, 'tests', 'discover.js')), + covers: ['discover-test'] + }, + { + label: 'watch-debounce-test', + run: () => runNode('watch-debounce-test', path.join(root, 'tests', 'watch-debounce.js')), + covers: ['watch-debounce-test'] + }, + { + label: 
'watch-filter-test', + run: () => runNode('watch-filter-test', path.join(root, 'tests', 'watch-filter.js')), + covers: ['watch-filter-test'] + }, + { + label: 'generate-repo-dict', + run: () => runNode('generate-repo-dict', path.join(root, 'tools', 'generate-repo-dict.js'), ['--min-count', '1', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), + covers: ['generate-repo-dict'] + }, + { + label: 'ci-build', + run: () => runNode('ci-build', path.join(root, 'tools', 'ci-build-artifacts.js'), ['--out', ciOutDir, '--skip-build', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), + covers: ['ci-build'] + }, + { + label: 'ci-restore', + run: () => runNode('ci-restore', path.join(root, 'tools', 'ci-restore-artifacts.js'), ['--from', ciOutDir, '--force', '--repo', fixtureRoot], { cwd: fixtureRoot, env: repoEnv }), + covers: ['ci-restore'] + }, + { + label: 'bootstrap', + run: () => runNode( + 'bootstrap', + path.join(root, 'tools', 'bootstrap.js'), + ['--skip-install', '--skip-dicts', '--skip-index', '--skip-artifacts', '--skip-tooling', '--repo', fixtureRoot], + { cwd: fixtureRoot, env: repoEnv } + ), + covers: ['bootstrap'] + }, + { + label: 'setup-test', + run: () => runNode('setup-test', path.join(root, 'tests', 'setup.js')), + covers: ['setup', 'setup-test'] + }, + { + label: 'setup-index-detection-test', + run: () => runNode('setup-index-detection-test', path.join(root, 'tests', 'setup-index-detection.js')), + covers: ['setup-index-detection-test'] + }, + { + label: 'config-validate-test', + run: () => runNode('config-validate-test', path.join(root, 'tests', 'config-validate.js')), + covers: ['config-validate', 'config-validate-test'] + }, + { + label: 'config-dump-test', + run: () => runNode('config-dump-test', path.join(root, 'tests', 'config-dump.js')), + covers: ['config-dump-test'] + }, + { + label: 'uv-threadpool-env-test', + run: () => runNode('uv-threadpool-env-test', path.join(root, 'tests', 'uv-threadpool-env.js')), + covers: ['uv-threadpool-env-test'] + }, + { + label: 'uv-threadpool-no-override-test', + run: () => runNode('uv-threadpool-no-override-test', path.join(root, 'tests', 'uv-threadpool-no-override.js')), + covers: ['uv-threadpool-no-override-test'] + }, + { + label: 'io-concurrency-cap-test', + run: () => runNode('io-concurrency-cap-test', path.join(root, 'tests', 'io-concurrency-cap.js')), + covers: ['io-concurrency-cap-test'] + }, + { + label: 'profile-config-test', + run: () => runNode('profile-config-test', path.join(root, 'tests', 'profile-config.js')), + covers: ['profile-config-test'] + }, + { + label: 'backend-policy-test', + run: () => runNode('backend-policy-test', path.join(root, 'tests', 'backend-policy.js')), + covers: ['backend-policy-test'] + }, + { + label: 'dict-adaptive-test', + run: () => runNode('dict-adaptive-test', path.join(root, 'tests', 'dict-adaptive.js')), + covers: ['dict-adaptive-test'] + }, + { + label: 'chargram-guardrails-test', + run: () => runNode('chargram-guardrails-test', path.join(root, 'tests', 'chargram-guardrails.js')), + covers: ['chargram-guardrails-test'] + }, + { + label: 'core-api-test', + run: () => runNode('core-api-test', path.join(root, 'tests', 'core-api.js')), + covers: ['core-api-test'] + }, + { + label: 'typescript-parser-selection-test', + run: () => runNode('typescript-parser-selection-test', path.join(root, 'tests', 'typescript-parser-selection.js')), + covers: ['typescript-parser-selection-test'] + }, + { + label: 'script-coverage-harness-test', + run: () => 
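+      // Self-check lane: presumably exercises the tests/script-coverage
+      // modules this harness itself is built from.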
runNode('script-coverage-harness-test', path.join(root, 'tests', 'script-coverage-harness.js')), + covers: ['script-coverage-harness-test'] + }, + { + label: 'cli-test', + run: () => runNode('cli-test', path.join(root, 'tests', 'cli.js')), + covers: ['cli-test'] + } +]; + const mergeDir = context.mergeDir || path.join(baseCacheRoot, 'merge'); + await fsPromises.mkdir(mergeDir, { recursive: true }); + const mergeBase = path.join(mergeDir, 'base.txt'); + const mergeTarget = path.join(mergeDir, 'target.txt'); + await fsPromises.writeFile(mergeBase, 'alpha\nbeta\n'); + await fsPromises.writeFile(mergeTarget, 'beta\ngamma\n'); + + actions.push({ + label: 'merge-append', + run: () => runNode('merge-append', path.join(root, 'tools', 'mergeAppendOnly.js'), [mergeBase, mergeTarget]), + covers: ['merge-append'] + }); + + return actions; +}; diff --git a/tests/script-coverage/paths.js b/tests/script-coverage/paths.js new file mode 100644 index 000000000..f017d7342 --- /dev/null +++ b/tests/script-coverage/paths.js @@ -0,0 +1,30 @@ +import fs from 'node:fs'; +import path from 'node:path'; + +export const loadPackageScripts = (root) => { + const pkg = JSON.parse(fs.readFileSync(path.join(root, 'package.json'), 'utf8')); + return pkg.scripts || {}; +}; + +export const resolveFailureLogRoot = ({ root, logDirOverride }) => { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + return logDirOverride + ? path.resolve(logDirOverride) + : path.join(root, 'tests', '.logs', timestamp); +}; + +export const resolveScriptCoveragePaths = ({ root, logDirOverride }) => { + const baseCacheRoot = path.join(root, 'tests', '.cache', 'script-coverage'); + const repoCacheRoot = path.join(baseCacheRoot, 'repo'); + const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); + const failureLogRoot = resolveFailureLogRoot({ root, logDirOverride }); + return { + baseCacheRoot, + repoCacheRoot, + fixtureRoot, + failureLogRoot, + ciOutDir: path.join(baseCacheRoot, 'ci-artifacts'), + mergeDir: path.join(baseCacheRoot, 'merge'), + shellWorkDir: path.join(baseCacheRoot, 'shell') + }; +}; diff --git a/tests/script-coverage/report.js b/tests/script-coverage/report.js new file mode 100644 index 000000000..e7ddd10c1 --- /dev/null +++ b/tests/script-coverage/report.js @@ -0,0 +1,148 @@ +const TIER_B_DEFAULT = [ + 'build-index', + 'build-sqlite-index', + 'build-lmdb-index', + 'compact-sqlite-index' +]; + +const createCoverageEntry = () => ({ status: 'pending', via: null, reason: null }); + +export const createCoverageState = ({ scriptNames }) => { + const coverage = new Map(scriptNames.map((name) => [name, createCoverageEntry()])); + const tierBRequired = new Set(TIER_B_DEFAULT.filter((name) => coverage.has(name))); + const tierBCoverage = new Map( + Array.from(tierBRequired, (name) => [name, createCoverageEntry()]) + ); + const unknownCovers = new Set(); + + const markCovered = (name, via) => { + if (!coverage.has(name)) { + unknownCovers.add(name); + return; + } + const entry = coverage.get(name); + if (entry.status === 'pending') { + coverage.set(name, { status: 'covered', via, reason: null }); + } + }; + + const markSkipped = (name, reason) => { + if (!coverage.has(name)) return; + coverage.set(name, { status: 'skipped', via: null, reason }); + }; + + const markTierBCovered = (name, via) => { + if (!tierBCoverage.has(name)) { + unknownCovers.add(name); + return; + } + const entry = tierBCoverage.get(name); + if (entry.status === 'pending') { + tierBCoverage.set(name, { status: 'covered', via, reason: null }); 
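+      // First cover wins, mirroring markCovered: a later action listing the
+      // same Tier B script leaves the original `via` label intact.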
+ } + }; + + return { + coverage, + tierBCoverage, + tierBRequired, + unknownCovers, + markCovered, + markSkipped, + markTierBCovered + }; +}; + +export const applyActionCoverage = (state, action) => { + const covers = Array.isArray(action.covers) ? action.covers : []; + for (const name of covers) { + state.markCovered(name, action.label); + } + const tierCovers = Array.isArray(action.coversTierB) ? action.coversTierB : []; + for (const name of tierCovers) { + state.markTierBCovered(name, action.label); + } +}; + +export const applyDefaultSkips = (state) => { + if (state.coverage.has('script-coverage-test')) { + state.markCovered('script-coverage-test', 'self'); + } + state.markSkipped('test-all', 'aggregates script-coverage-test and bench'); + state.markSkipped('test-all-no-bench', 'aggregates script-coverage-test without bench'); + state.markSkipped('download-models', 'requires network model download'); + state.markSkipped('bench', 'benchmarks are long-running'); + state.markSkipped('bench-ann', 'benchmarks are long-running'); + state.markSkipped('bench-dict-seg', 'benchmarks are long-running'); + state.markSkipped('bench-score-strategy', 'benchmarks are long-running'); + state.markSkipped('bench-micro', 'benchmarks are long-running'); + state.markSkipped('compare-models', 'benchmark/perf evaluation'); + state.markSkipped('bench-language', 'benchmarks are long-running'); + state.markSkipped('smoke:section1', 'smoke lanes are run manually'); + state.markSkipped('smoke:retrieval', 'smoke lanes are run manually'); + state.markSkipped('smoke:services', 'smoke lanes are run manually'); + state.markSkipped('smoke:workers', 'smoke lanes are run manually'); + state.markSkipped('smoke:embeddings', 'smoke lanes are run manually'); + state.markSkipped('smoke:sqlite', 'smoke lanes are run manually'); + state.markSkipped('watch-index', 'watch mode runs until interrupted'); + state.markSkipped('format', 'modifies working tree'); + state.markSkipped('lint', 'requires npm install and project lint config'); + + for (const name of state.coverage.keys()) { + if (name.startsWith('bench-language:')) { + state.markSkipped(name, 'bench-language variants are long-running'); + } + } +}; + +export const finalizeCoverage = (state) => { + const missing = []; + const skipped = []; + const covered = []; + for (const [name, entry] of state.coverage.entries()) { + if (entry.status === 'pending') missing.push(name); + if (entry.status === 'skipped') skipped.push({ name, reason: entry.reason }); + if (entry.status === 'covered') covered.push({ name, via: entry.via }); + } + + const missingTierB = []; + const coveredTierB = []; + for (const [name, entry] of state.tierBCoverage.entries()) { + if (entry.status === 'pending') missingTierB.push(name); + if (entry.status === 'covered') coveredTierB.push({ name, via: entry.via }); + } + + return { + missing, + missingTierB, + skipped, + covered, + coveredTierB, + unknownCovers: Array.from(state.unknownCovers) + }; +}; + +export const reportCoverage = (summary) => { + if (summary.unknownCovers.length) { + console.error(`Unknown coverage script names: ${summary.unknownCovers.join(', ')}`); + return false; + } + if (summary.missing.length || summary.missingTierB.length) { + if (summary.missing.length) { + console.error(`Missing coverage for: ${summary.missing.join(', ')}`); + } + if (summary.missingTierB.length) { + console.error(`Missing Tier B coverage for: ${summary.missingTierB.join(', ')}`); + } + return false; + } + + console.log(`script coverage: ${summary.covered.length} 
covered, ${summary.skipped.length} skipped`); + console.log(`tier B coverage: ${summary.coveredTierB.length} covered, ${summary.missingTierB.length} missing`); + if (summary.skipped.length) { + for (const entry of summary.skipped) { + console.log(`- skipped ${entry.name}: ${entry.reason}`); + } + } + return true; +}; diff --git a/tests/script-coverage/runner.js b/tests/script-coverage/runner.js new file mode 100644 index 000000000..fc108c06e --- /dev/null +++ b/tests/script-coverage/runner.js @@ -0,0 +1,130 @@ +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +export const resolveRetries = ({ argvRetries, envRetries, defaultRetries = 2 }) => { + if (Number.isFinite(argvRetries)) return Math.max(0, argvRetries); + if (Number.isFinite(envRetries)) return Math.max(0, envRetries); + return defaultRetries; +}; + +export const prepareCoverageDirs = async ({ baseCacheRoot, repoCacheRoot, failureLogRoot }) => { + await fsPromises.rm(baseCacheRoot, { recursive: true, force: true }); + await fsPromises.mkdir(repoCacheRoot, { recursive: true }); + await fsPromises.mkdir(failureLogRoot, { recursive: true }); +}; + +const sanitizeLabel = (label) => label.replace(/[^a-z0-9-_]+/gi, '_').slice(0, 120); + +const writeFailureLog = (failureLogRoot, label, attempt, cmd, args, options, result) => { + const safeLabel = sanitizeLabel(label); + const logPath = path.join(failureLogRoot, `${safeLabel}.attempt-${attempt}.log`); + const lines = [ + `label: ${label}`, + `attempt: ${attempt}`, + `command: ${[cmd, ...args].join(' ')}`, + `cwd: ${options.cwd || process.cwd()}`, + `exit: ${result.status ?? 'null'}`, + '' + ]; + if (result.stdout) { + lines.push('--- stdout ---', String(result.stdout)); + } + if (result.stderr) { + lines.push('--- stderr ---', String(result.stderr)); + } + fs.writeFileSync(logPath, lines.join('\n'), 'utf8'); + return logPath; +}; + +export const createCommandRunner = ({ retries, failureLogRoot }) => { + const run = (label, cmd, args, options = {}) => { + const maxAttempts = retries + 1; + const normalizeOutput = (value) => { + if (!value) return ''; + let text = String(value); + text = text.replace(/\r\n/g, '\n'); + text = text.replace(/\n{3,}/g, '\n\n'); + text = text.replace(/^\n+/, '\n'); + return text; + }; + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + const { env: optionEnv, ...spawnOptions } = options; + const env = { ...process.env, ...optionEnv }; + if (!env.PAIROFCLEATS_TEST_LOG_DIR) { + env.PAIROFCLEATS_TEST_LOG_DIR = failureLogRoot; + } + const result = spawnSync(cmd, args, { + encoding: 'utf8', + maxBuffer: 50 * 1024 * 1024, + stdio: 'pipe', + env, + ...spawnOptions + }); + if (result.stdout) process.stdout.write(normalizeOutput(result.stdout)); + if (result.stderr) process.stderr.write(normalizeOutput(result.stderr)); + if (result.status === 0) return; + const logPath = writeFailureLog(failureLogRoot, label, attempt, cmd, args, options, result); + console.error(`Failed: ${label} (attempt ${attempt}/${maxAttempts}). 
Log: ${logPath}`); + if (attempt < maxAttempts) { + console.error(`Retrying: ${label}`); + } + } + process.exit(1); + }; + + const runNode = (label, scriptPath, args = [], options = {}) => { + run(label, process.execPath, [scriptPath, ...args], options); + }; + + return { run, runNode }; +}; + +export const runShellScripts = async ({ root, baseCacheRoot, run }) => { + const shellScripts = [ + path.join(root, 'merge-history.sh'), + path.join(root, 'merge-no-results.sh'), + path.join(root, 'merge-metrics.sh'), + path.join(root, 'tools', 'merge-history.sh'), + path.join(root, 'tools', 'merge-no-results.sh'), + path.join(root, 'tools', 'merge-metrics.sh'), + path.join(root, 'tools', 'merge-agentinfo-notes.sh'), + path.join(root, 'tools', 'merge-agentinfo-index.sh') + ]; + + const bashCheck = spawnSync('bash', ['-c', 'echo ok'], { encoding: 'utf8' }); + const bashAvailable = bashCheck.status === 0; + const jqCheck = bashAvailable ? spawnSync('bash', ['-c', 'command -v jq'], { encoding: 'utf8' }) : null; + const jqAvailable = jqCheck && jqCheck.status === 0; + const toPosixPath = (value) => (process.platform === 'win32' ? value.replace(/\\/g, '/') : value); + const bashPathCheck = bashAvailable + ? spawnSync('bash', ['-c', `cd "${toPosixPath(root)}"`], { encoding: 'utf8' }) + : null; + const bashAccessible = bashPathCheck && bashPathCheck.status === 0; + + if (bashAvailable && bashAccessible) { + const shellWorkDir = path.join(baseCacheRoot, 'shell'); + await fsPromises.mkdir(shellWorkDir, { recursive: true }); + const base = path.join(shellWorkDir, 'base.json'); + const ours = path.join(shellWorkDir, 'ours.json'); + const theirs = path.join(shellWorkDir, 'theirs.json'); + await fsPromises.writeFile(base, JSON.stringify({ file: { md: 1, code: 1 } }, null, 2)); + await fsPromises.writeFile(ours, JSON.stringify({ file: { md: 2, code: 0 } }, null, 2)); + await fsPromises.writeFile(theirs, JSON.stringify({ file: { md: 3, code: 2 } }, null, 2)); + + for (const scriptPath of shellScripts) { + if (!fs.existsSync(scriptPath)) continue; + if (scriptPath.endsWith('merge-metrics.sh') && !jqAvailable) { + console.log(`[skip] ${scriptPath} (jq not available)`); + continue; + } + const args = [scriptPath, base, ours, theirs].map(toPosixPath); + run('shell-script', 'bash', args, { cwd: root }); + } + } else if (!bashAvailable) { + console.log('[skip] shell scripts (bash not available)'); + } else { + console.log('[skip] shell scripts (bash cannot access workspace path)'); + } +}; diff --git a/tests/search-contract.js b/tests/search-contract.js new file mode 100644 index 000000000..1607d4082 --- /dev/null +++ b/tests/search-contract.js @@ -0,0 +1,85 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'search-contract'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, 'README.md'), + '# Sample\n\nalpha bravo\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { 
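+    // stdio is inherited so index-build output streams straight to the test
+    // log; only the exit status is asserted below.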
cwd: repoRoot, env, stdio: 'inherit' } +); + +if (buildResult.status !== 0) { + console.error('Failed: build index for search contract'); + process.exit(buildResult.status ?? 1); +} + +const searchPath = path.join(root, 'search.js'); +const result = spawnSync( + process.execPath, + [searchPath, 'alpha', '--mode', 'prose', '--json', '--backend', 'memory', '--no-ann', '--repo', repoRoot], + { cwd: repoRoot, env, encoding: 'utf8' } +); + +if (result.status !== 0) { + console.error('Failed: search contract run'); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 1); +} + +let payload = null; +try { + payload = JSON.parse(result.stdout || '{}'); +} catch { + console.error('Failed: search contract returned invalid JSON'); + process.exit(1); +} + +if (!payload || typeof payload !== 'object') { + console.error('Failed: search contract payload missing'); + process.exit(1); +} + +for (const key of ['backend', 'code', 'prose', 'records', 'stats']) { + if (!(key in payload)) { + console.error(`Failed: search contract missing ${key}`); + process.exit(1); + } +} + +if (!Array.isArray(payload.prose) || payload.prose.length === 0) { + console.error('Failed: search contract expected prose hits'); + process.exit(1); +} + +const hit = payload.prose[0]; +if (!hit || !hit.file) { + console.error('Failed: search contract hit missing file'); + process.exit(1); +} +if (!Number.isFinite(hit.startLine)) { + console.error('Failed: search contract hit missing startLine'); + process.exit(1); +} + +console.log('search contract tests passed'); diff --git a/tests/search-determinism.js b/tests/search-determinism.js new file mode 100644 index 000000000..0b093b890 --- /dev/null +++ b/tests/search-determinism.js @@ -0,0 +1,96 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'search-determinism'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const content = 'alpha beta gamma\nalpha beta gamma\n'; +const files = ['alpha-1.txt', 'alpha-2.txt', 'alpha-3.txt']; +for (const file of files) { + await fsPromises.writeFile(path.join(repoRoot, file), content); +} + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('Failed: build index'); + process.exit(buildResult.status ?? 1); +} + +const searchPath = path.join(root, 'search.js'); +const searchArgs = [ + searchPath, + 'alpha', + '--mode', + 'prose', + '--top', + '3', + '--ann', + '--explain', + '--json', + '--backend', + 'memory', + '--repo', + repoRoot +]; + +function runSearch(label) { + const result = spawnSync(process.execPath, searchArgs, { + cwd: repoRoot, + env, + encoding: 'utf8' + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
1); + } + let payload = null; + try { + payload = JSON.parse(result.stdout || '{}'); + } catch { + console.error(`Failed: ${label} returned invalid JSON`); + process.exit(1); + } + return payload; +} + +const first = runSearch('search first'); +const second = runSearch('search second'); + +const firstHits = first.prose || []; +const secondHits = second.prose || []; +if (!firstHits.length || !secondHits.length) { + console.error('Expected prose hits for determinism test.'); + process.exit(1); +} +for (const hit of firstHits) { + if (!hit.scoreBreakdown) { + console.error('Expected score breakdown for determinism test.'); + process.exit(1); + } +} + +if (JSON.stringify(firstHits) !== JSON.stringify(secondHits)) { + console.error('Determinism test failed: search results differ between runs.'); + process.exit(1); +} + +console.log('search determinism tests passed'); diff --git a/tests/search-explain-symbol.js b/tests/search-explain-symbol.js new file mode 100644 index 000000000..7f3896097 --- /dev/null +++ b/tests/search-explain-symbol.js @@ -0,0 +1,63 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'explain-symbol'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, 'symbol.js'), + 'export function boostExample() { return "symbol boost test"; }\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub', + PAIROFCLEATS_WORKER_POOL: 'off' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('Failed: build_index'); + process.exit(buildResult.status ?? 1); +} + +const searchResult = spawnSync( + process.execPath, + [ + path.join(root, 'search.js'), + 'boostExample', + '--mode', + 'code', + '--explain', + '--no-ann', + '--repo', + repoRoot + ], + { encoding: 'utf8', env } +); +if (searchResult.status !== 0) { + console.error('Search failed.'); + if (searchResult.stderr) console.error(searchResult.stderr.trim()); + process.exit(searchResult.status ?? 
1); +} + +const output = searchResult.stdout || ''; +if (!output.includes('Symbol')) { + console.error('Expected explain output to include symbol boost details.'); + process.exit(1); +} + +console.log('explain symbol test passed'); diff --git a/tests/search-explain.js b/tests/search-explain.js new file mode 100644 index 000000000..2ffcd08a9 --- /dev/null +++ b/tests/search-explain.js @@ -0,0 +1,62 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'search-explain'); +const cacheRoot = path.join(tempRoot, 'cache'); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot], + { env, stdio: 'inherit' } ); +if (buildResult.status !== 0) { + console.error('search explain test failed: build_index failed'); + process.exit(buildResult.status ?? 1); +} + +const stripAnsi = (value) => value.replace(/\u001b\[[0-9;]*m/g, ''); + +const runSearch = (args, label) => { + const result = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'return', '--mode', 'code', '--no-ann', '--repo', fixtureRoot, ...args], + { env, encoding: 'utf8' } + ); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 1); + } + return stripAnsi(`${result.stdout || ''}${result.stderr || ''}`); +}; + +const explainOutput = runSearch(['--explain'], 'explain'); +if (!explainOutput.includes('Score:')) { + console.error('Explain output missing Score breakdown.'); + process.exit(1); +} +if (!explainOutput.includes('Sparse:')) { + console.error('Explain output missing Sparse breakdown.'); + process.exit(1); +} + +const whyOutput = runSearch(['--why'], 'why'); +if (!whyOutput.includes('Score:')) { + console.error('Why output missing Score breakdown.'); + process.exit(1); +} + +console.log('search explain tests passed'); diff --git a/tests/search-filters.js b/tests/search-filters.js index 84e13c366..cc46ceb55 100644 --- a/tests/search-filters.js +++ b/tests/search-filters.js @@ -56,6 +56,25 @@ runGit( { GIT_AUTHOR_DATE: dateNew, GIT_COMMITTER_DATE: dateNew } ); +await fsPromises.writeFile(path.join(repoRoot, 'CaseFile.TXT'), 'AlphaCase alpha\n'); +runGit(['add', '.'], 'git add CaseFile'); +runGit( + ['commit', '-m', 'add case file', '--author', 'Casey <casey@example.com>', '--date', dateNew], + 'git commit CaseFile', + { GIT_AUTHOR_DATE: dateNew, GIT_COMMITTER_DATE: dateNew } +); + +await fsPromises.writeFile( + path.join(repoRoot, 'sample.js'), + 'const equal = (a, b) => a && b;\nfunction check(a, b) {\n return a && b;\n}\n' +); +runGit(['add', '.'], 'git add sample.js'); +runGit( + ['commit', '-m', 'add sample.js', '--author', 'Dana <dana@example.com>', '--date', dateNew], + 'git commit sample.js', + { GIT_AUTHOR_DATE: dateNew, GIT_COMMITTER_DATE: dateNew } +); + const env = { ...process.env, PAIROFCLEATS_CACHE_ROOT: cacheRoot, @@ -73,11 +92,15 @@ if (buildResult.status !== 0) { } const searchPath = path.join(root, 'search.js'); +const branchName = (() => { + const result = spawnSync('git', ['rev-parse', 
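+  // `git rev-parse --abbrev-ref HEAD` prints the current branch name; returning
+  // null on failure lets the branch-filter assertions below be skipped safely.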
'--abbrev-ref', 'HEAD'], { cwd: repoRoot, encoding: 'utf8' }); + return result.status === 0 ? result.stdout.trim() : null; +})(); -function runSearch(query, args, label) { +function runSearch(query, args, label, mode = 'prose') { const result = spawnSync( process.execPath, - [searchPath, query, '--mode', 'prose', '--json', '--no-ann', '--repo', repoRoot, ...args], + [searchPath, query, '--mode', mode, '--json', '--no-ann', '--repo', repoRoot, ...args], { cwd: repoRoot, env, encoding: 'utf8' } ); if (result.status !== 0) { @@ -88,7 +111,8 @@ function runSearch(query, args, label) { return JSON.parse(result.stdout || '{}'); } -const extractFiles = (payload) => new Set((payload.prose || []).map((hit) => path.basename(hit.file || ''))); +const extractFiles = (payload, key = 'prose') => + new Set((payload[key] || []).map((hit) => path.basename(hit.file || ''))); const negativeToken = runSearch('alpha -gamma', [], 'negative token'); const negativeTokenFiles = extractFiles(negativeToken); @@ -145,4 +169,48 @@ if (!modifiedSinceFiles.has('beta.txt') || modifiedSinceFiles.has('alpha.txt')) process.exit(1); } +if (branchName) { + const branchMatch = runSearch('alpha', ['--branch', branchName], 'branch filter'); + if (!(branchMatch.prose || []).length) { + console.error('branch filter returned no results for current branch.'); + process.exit(1); + } + const branchMiss = runSearch('alpha', ['--branch', 'no-such-branch'], 'branch mismatch'); + if ((branchMiss.prose || []).length) { + console.error('branch mismatch should return no results.'); + process.exit(1); + } +} + +const caseInsensitiveFile = runSearch('alpha', ['--file', 'casefile.txt'], 'case-insensitive file'); +if (!extractFiles(caseInsensitiveFile).has('CaseFile.TXT')) { + console.error('case-insensitive file filter failed.'); + process.exit(1); +} +const caseSensitiveFile = runSearch('alpha', ['--file', 'casefile.txt', '--case-file'], 'case-sensitive file'); +if (extractFiles(caseSensitiveFile).has('CaseFile.TXT')) { + console.error('case-sensitive file filter should not match.'); + process.exit(1); +} +const regexFile = runSearch('alpha', ['--file', '/casefile\\.txt/'], 'regex file filter'); +if (!extractFiles(regexFile).has('CaseFile.TXT')) { + console.error('regex file filter failed.'); + process.exit(1); +} +const caseInsensitiveToken = runSearch('AlphaCase', [], 'case-insensitive token'); +if (!extractFiles(caseInsensitiveToken).has('CaseFile.TXT')) { + console.error('case-insensitive token match failed.'); + process.exit(1); +} +const caseSensitiveToken = runSearch('AlphaCase', ['--case-tokens'], 'case-sensitive token'); +if (extractFiles(caseSensitiveToken).has('CaseFile.TXT')) { + console.error('case-sensitive token match should not match.'); + process.exit(1); +} +const punctuationSearch = runSearch('&&', [], 'punctuation token', 'code'); +if (!extractFiles(punctuationSearch, 'code').has('sample.js')) { + console.error('punctuation token match failed.'); + process.exit(1); +} + console.log('Search filter tests passed'); diff --git a/tests/search-help.js b/tests/search-help.js new file mode 100644 index 000000000..653454c34 --- /dev/null +++ b/tests/search-help.js @@ -0,0 +1,21 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const result = spawnSync(process.execPath, [path.join(root, 'search.js')], { encoding: 'utf8' }); +if (result.status === 0) { + console.error('Expected search help to exit non-zero with no query.'); + process.exit(1); +} + 
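+// Usage/help text can land on stdout or stderr depending on the CLI's error
+// path, so both streams are concatenated before scanning for documented flags.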
+const output = `${result.stdout || ''}${result.stderr || ''}`; +const requiredFlags = ['--calls', '--uses', '--author', '--import', '--explain']; +for (const flag of requiredFlags) { + if (!output.includes(flag)) { + console.error(`Help output missing flag: ${flag}`); + process.exit(1); + } +} + +console.log('search help test passed'); diff --git a/tests/search-missing-flag-values.js b/tests/search-missing-flag-values.js new file mode 100644 index 000000000..8240cb71d --- /dev/null +++ b/tests/search-missing-flag-values.js @@ -0,0 +1,35 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const searchPath = path.join(root, 'search.js'); + +function runFlag(flag) { + return spawnSync( + process.execPath, + [searchPath, 'test', flag], + { encoding: 'utf8' } + ); +} + +const cases = [ + { flag: '--type', name: '--type' }, + { flag: '--author', name: '--author' }, + { flag: '--import', name: '--import' } +]; + +for (const entry of cases) { + const result = runFlag(entry.flag); + if (result.status === 0) { + console.error(`Expected non-zero exit for ${entry.name}.`); + process.exit(1); + } + const output = `${result.stderr || ''}${result.stdout || ''}`; + if (!output.includes(`Missing value for ${entry.name}`)) { + console.error(`Expected missing value message for ${entry.name}.`); + process.exit(1); + } +} + +console.log('missing flag values test passed'); diff --git a/tests/search-missing-index.js b/tests/search-missing-index.js new file mode 100644 index 000000000..27d477871 --- /dev/null +++ b/tests/search-missing-index.js @@ -0,0 +1,38 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'search-missing-index'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const result = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'alpha', '--mode', 'code', '--no-ann', '--repo', repoRoot], + { encoding: 'utf8', env } +); + +if (result.status === 0) { + console.error('Expected search to fail when index is missing.'); + process.exit(1); +} + +const output = `${result.stdout || ''}${result.stderr || ''}`; +if (!output.includes('build-index')) { + console.error('Expected missing index message to include build-index hint.'); + process.exit(1); +} + +console.log('missing index test passed'); diff --git a/tests/search-removed-flags.js b/tests/search-removed-flags.js new file mode 100644 index 000000000..1c8b78d7d --- /dev/null +++ b/tests/search-removed-flags.js @@ -0,0 +1,34 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const searchPath = path.join(root, 'search.js'); + +function runFlag(flag) { + return spawnSync( + process.execPath, + [searchPath, 'test', flag], + { encoding: 'utf8' } + ); +} + +const cases = [ + { flag: '--human', label: 'human' }, + { flag: '--headline', label: 'headline' } +]; + +for (const entry of cases) { + const result = runFlag(entry.flag); + if (result.status === 0) { + 
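+    // A removed flag must fail fast rather than being silently ignored; the
+    // checks below require the word "removed" plus the offending flag name.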
console.error(`Expected non-zero exit for ${entry.flag}.`); + process.exit(1); + } + const output = `${result.stderr || ''}${result.stdout || ''}`; + if (!output.toLowerCase().includes('removed') || !output.includes(entry.flag)) { + console.error(`Expected actionable error for ${entry.flag}.`); + process.exit(1); + } +} + +console.log('removed flags test passed'); diff --git a/tests/search-rrf.js b/tests/search-rrf.js new file mode 100644 index 000000000..8ddbdc4c4 --- /dev/null +++ b/tests/search-rrf.js @@ -0,0 +1,73 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'search-rrf'); +const cacheRoot = path.join(tempRoot, 'cache'); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', fixtureRoot], + { env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('search rrf test failed: build_index failed'); + process.exit(buildResult.status ?? 1); +} + +const result = spawnSync( + process.execPath, + [ + path.join(root, 'search.js'), + 'return', + '--mode', + 'code', + '--ann', + '--json', + '--repo', + fixtureRoot + ], + { env, encoding: 'utf8' } +); + +if (result.status !== 0) { + console.error('search rrf test failed: search returned error'); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
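+  // Reciprocal rank fusion typically scores a hit as the sum over rankings of
+  // 1 / (k + rank_i), with k a smoothing constant (60 in the original paper).
+  // The assertions below only check that the fused ("rrf") path was taken,
+  // not any particular k, so they stay robust to tuning.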
1); +} + +let payload = null; +try { + payload = JSON.parse(result.stdout || '{}'); +} catch (err) { + console.error('search rrf test failed: invalid JSON output'); + process.exit(1); +} + +const hit = payload?.code?.[0]; +if (!payload?.stats?.annActive) { + console.error('search rrf test failed: annActive was false'); + process.exit(1); +} +if (!hit?.scoreBreakdown?.rrf) { + console.error('search rrf test failed: scoreBreakdown.rrf missing'); + process.exit(1); +} +if (hit.scoreType !== 'rrf') { + console.error(`search rrf test failed: expected scoreType rrf, got ${hit.scoreType}`); + process.exit(1); +} + +console.log('search rrf tests passed'); diff --git a/tests/search-symbol-boost.js b/tests/search-symbol-boost.js new file mode 100644 index 000000000..46df0ecff --- /dev/null +++ b/tests/search-symbol-boost.js @@ -0,0 +1,77 @@ +import assert from 'node:assert/strict'; +import { createSearchPipeline } from '../src/retrieval/pipeline.js'; + +const idx = { + chunkMeta: [ + { + id: 0, + file: 'a.js', + start: 0, + end: 10, + kind: 'FunctionDeclaration', + name: 'foo', + tokens: ['alpha'] + }, + { + id: 1, + file: 'b.js', + start: 0, + end: 10, + kind: 'FunctionDeclaration', + name: 'bar', + tokens: ['alpha'] + } + ], + fileRelations: new Map([ + ['a.js', { exports: ['foo'] }], + ['b.js', { exports: [] }] + ]) +}; + +const searchPipeline = createSearchPipeline({ + useSqlite: false, + sqliteFtsRequested: false, + sqliteFtsNormalize: false, + sqliteFtsProfile: 'balanced', + sqliteFtsWeights: null, + bm25K1: 1.2, + bm25B: 0.75, + postingsConfig: { + enablePhraseNgrams: false, + enableChargrams: false, + phraseMinN: 2, + phraseMaxN: 3, + chargramMinN: 3, + chargramMaxN: 3 + }, + queryTokens: ['alpha'], + phraseNgramSet: null, + phraseRange: null, + symbolBoost: { + enabled: true, + definitionWeight: 1.4, + exportWeight: 1.2 + }, + filters: {}, + filtersActive: false, + topN: 2, + annEnabled: false, + scoreBlend: { enabled: false }, + minhashMaxDocs: 0, + vectorAnnState: null, + vectorAnnUsed: {}, + buildCandidateSetSqlite: () => null, + getTokenIndexForQuery: () => null, + rankSqliteFts: () => [], + rankVectorAnnSqlite: () => [] +}); + +const results = searchPipeline(idx, 'code', null); +assert.equal(results.length, 2, 'expected two results'); +assert.equal(results[0].name, 'foo', 'expected exported definition to rank first'); +assert.ok(results[0].score > results[1].score, 'expected boosted score to win'); +assert.ok(results[0].scoreBreakdown?.symbol?.definition, 'expected definition flag'); +assert.ok(results[0].scoreBreakdown?.symbol?.export, 'expected export flag'); +assert.ok(results[0].scoreBreakdown?.symbol?.factor > 1, 'expected symbol boost factor'); + +console.log('symbol boost test passed'); diff --git a/tests/search-topn-filters.js b/tests/search-topn-filters.js new file mode 100644 index 000000000..16648a33e --- /dev/null +++ b/tests/search-topn-filters.js @@ -0,0 +1,102 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'search-topn-filters'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const allowedFiles = ['allowed-1.txt', 'allowed-2.txt']; +const blockedCount = 12; 
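+// Fixture design: each blocked file repeats "alpha" 200 times, so it would beat
+// the allowed files on raw term frequency. If --file filtering ran after the
+// top-2 cutoff, no allowed file could surface; the assertions below therefore
+// prove filtering happens before truncation on both backends.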
+const allowedContent = 'alpha beta gamma\nalpha beta\n'; +const blockedContent = `${Array.from({ length: 200 }, () => 'alpha').join(' ')}\n`; + +for (const file of allowedFiles) { + await fsPromises.writeFile(path.join(repoRoot, file), allowedContent); +} +for (let i = 0; i < blockedCount; i += 1) { + await fsPromises.writeFile(path.join(repoRoot, `blocked-${i + 1}.txt`), blockedContent); +} + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +function run(args, label, options = {}) { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + stdio: 'inherit', + ...options + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } + return result; +} + +run([path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], 'build index'); +run([path.join(root, 'tools', 'build-sqlite-index.js'), '--repo', repoRoot], 'build sqlite index'); + +const searchPath = path.join(root, 'search.js'); + +function runSearch(backend) { + const result = spawnSync( + process.execPath, + [ + searchPath, + 'alpha', + '--mode', + 'prose', + '--top', + '2', + '--file', + 'allowed', + '--json', + '--backend', + backend, + '--no-ann', + '--repo', + repoRoot + ], + { cwd: repoRoot, env, encoding: 'utf8' } + ); + if (result.status !== 0) { + console.error(`Failed: search (${backend})`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 1); + } + let payload = null; + try { + payload = JSON.parse(result.stdout || '{}'); + } catch { + console.error(`Failed: search (${backend}) returned invalid JSON`); + process.exit(1); + } + const hits = payload.prose || []; + if (hits.length !== 2) { + console.error(`Expected 2 results for ${backend}, got ${hits.length}`); + process.exit(1); + } + for (const hit of hits) { + const fileBase = path.basename(hit.file || ''); + if (!fileBase.startsWith('allowed-')) { + console.error(`Unexpected file in ${backend} results: ${fileBase}`); + process.exit(1); + } + } +} + +runSearch('memory'); +runSearch('sqlite-fts'); + +console.log('search top-N filter tests passed'); diff --git a/tests/search-windows-path-filter.js b/tests/search-windows-path-filter.js new file mode 100644 index 000000000..dfa91a344 --- /dev/null +++ b/tests/search-windows-path-filter.js @@ -0,0 +1,78 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'windows-path-filter'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(path.join(repoRoot, 'src', 'nested'), { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'nested', 'util.js'), + 'export function winPathFilter() { return "windows path filter"; }\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub', + PAIROFCLEATS_WORKER_POOL: 'off' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('Failed: build_index'); + process.exit(buildResult.status ?? 
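+  // The queries below pass Windows-style separators (src\nested\util.js); the
+  // --file and --path filters are expected to normalize them against the
+  // forward-slash relative paths stored in the index.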
1); +} + +function runSearch(extraArgs) { + const result = spawnSync( + process.execPath, + [ + path.join(root, 'search.js'), + 'windows path filter', + '--json', + '--mode', + 'code', + '--no-ann', + '--repo', + repoRoot, + ...extraArgs + ], + { encoding: 'utf8', env } + ); + if (result.status !== 0) { + console.error('Search failed.'); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 1); + } + try { + return JSON.parse(result.stdout || '{}'); + } catch { + console.error('Search output was not valid JSON.'); + process.exit(1); + } +} + +const filePayload = runSearch(['--file', 'src\\nested\\util.js']); +if (!Array.isArray(filePayload.code) || filePayload.code.length === 0) { + console.error('Expected results for Windows-style --file filter.'); + process.exit(1); +} + +const pathPayload = runSearch(['--path', 'src\\nested']); +if (!Array.isArray(pathPayload.code) || pathPayload.code.length === 0) { + console.error('Expected results for Windows-style --path filter.'); + process.exit(1); +} + +console.log('windows path filter test passed'); diff --git a/tests/segment-pipeline.js b/tests/segment-pipeline.js new file mode 100644 index 000000000..cebcdb556 --- /dev/null +++ b/tests/segment-pipeline.js @@ -0,0 +1,104 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; +import { discoverSegments, chunkSegments } from '../src/index/segments.js'; +import { extractComments, normalizeCommentConfig } from '../src/index/comments.js'; +import { buildLineIndex } from '../src/shared/lines.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'segments'); + +const assert = (condition, message) => { + if (condition) return; + console.error(message); + process.exit(1); +}; + +const mdPath = path.join(fixtureRoot, 'docs', 'guide.md'); +const mdText = fs.readFileSync(mdPath, 'utf8'); +const mdSegments = discoverSegments({ + text: mdText, + ext: '.md', + relPath: 'docs/guide.md', + mode: 'prose', + segmentsConfig: { inlineCodeSpans: true } +}); +assert(mdSegments.some((seg) => seg.type === 'config' && seg.meta?.frontmatter), 'Expected markdown frontmatter segment.'); +const fencedSegments = mdSegments.filter((seg) => seg.type === 'embedded' && seg.meta?.fenceInfo); +assert(fencedSegments.some((seg) => String(seg.meta.fenceInfo).includes('js')), 'Expected markdown JS fenced segment.'); +assert(fencedSegments.some((seg) => String(seg.meta.fenceInfo).includes('json')), 'Expected markdown JSON fenced segment.'); +const inlineSegments = mdSegments.filter((seg) => seg.meta?.inlineCode); +assert(inlineSegments.length === 2, `Expected 2 inline code segments, got ${inlineSegments.length}.`); + +const mdChunks = chunkSegments({ + text: mdText, + ext: '.md', + relPath: 'docs/guide.md', + mode: 'prose', + segments: mdSegments, + lineIndex: buildLineIndex(mdText), + context: {} +}); +let lastStart = -1; +for (const chunk of mdChunks) { + assert(chunk.start >= 0 && chunk.end <= mdText.length, 'Markdown chunk range invalid.'); + assert(chunk.segment, 'Markdown chunk missing segment metadata.'); + assert(chunk.start >= lastStart, 'Markdown chunks are out of order.'); + lastStart = chunk.start; +} + +const vuePath = path.join(fixtureRoot, 'src', 'component.vue'); +const vueText = fs.readFileSync(vuePath, 'utf8'); +const vueSegments = discoverSegments({ + text: vueText, + ext: '.vue', + relPath: 'src/component.vue', + mode: 'code' +}); +assert(vueSegments.some((seg) => seg.meta?.block === 'template'), 'Expected Vue 
template segment.'); +assert(vueSegments.some((seg) => seg.meta?.block === 'script' || seg.meta?.block === 'scriptSetup'), 'Expected Vue script segment.'); +assert(vueSegments.some((seg) => seg.meta?.block === 'style'), 'Expected Vue style segment.'); + +const sveltePath = path.join(fixtureRoot, 'src', 'widget.svelte'); +const svelteText = fs.readFileSync(sveltePath, 'utf8'); +const svelteSegments = discoverSegments({ + text: svelteText, + ext: '.svelte', + relPath: 'src/widget.svelte', + mode: 'code' +}); +assert(svelteSegments.some((seg) => seg.meta?.block === 'script'), 'Expected Svelte script segment.'); +assert(svelteSegments.some((seg) => seg.meta?.block === 'style'), 'Expected Svelte style segment.'); +assert(svelteSegments.some((seg) => seg.meta?.block === 'template'), 'Expected Svelte template segment.'); + +const astroPath = path.join(fixtureRoot, 'src', 'page.astro'); +const astroText = fs.readFileSync(astroPath, 'utf8'); +const astroSegments = discoverSegments({ + text: astroText, + ext: '.astro', + relPath: 'src/page.astro', + mode: 'code' +}); +assert(astroSegments.some((seg) => seg.meta?.block === 'frontmatter'), 'Expected Astro frontmatter segment.'); +assert(astroSegments.some((seg) => seg.meta?.block === 'template'), 'Expected Astro template segment.'); +assert(astroSegments.some((seg) => seg.meta?.block === 'style'), 'Expected Astro style segment.'); + +const commentPath = path.join(fixtureRoot, 'src', 'comments.js'); +const commentText = fs.readFileSync(commentPath, 'utf8'); +const commentConfig = normalizeCommentConfig({ extract: 'all', includeLicense: false }); +const commentData = extractComments({ + text: commentText, + ext: '.js', + languageId: 'javascript', + lineIndex: buildLineIndex(commentText), + config: commentConfig +}); +assert(commentData.comments.some((comment) => comment.type === 'doc'), 'Expected doc comment extracted.'); +assert(commentData.comments.some((comment) => comment.type === 'inline'), 'Expected inline comment extracted.'); +assert(commentData.comments.some((comment) => comment.type === 'block'), 'Expected block comment extracted.'); +assert(commentData.comments.some((comment) => comment.type === 'license'), 'Expected license comment extracted.'); +assert(!commentData.comments.some((comment) => comment.text.includes('eslint-disable')), 'Expected linter comment to be skipped.'); +assert(!commentData.comments.some((comment) => comment.text.includes('generated by')), 'Expected generated comment to be skipped.'); +assert(commentData.configSegments.some((segment) => segment.languageId === 'json' && segment.meta?.source === 'comment'), 'Expected JSON config segment from comment.'); + +console.log('segment pipeline tests passed'); diff --git a/tests/service-queue.js b/tests/service-queue.js new file mode 100644 index 000000000..bfe90e7da --- /dev/null +++ b/tests/service-queue.js @@ -0,0 +1,49 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { + ensureQueueDir, + enqueueJob, + claimNextJob, + completeJob, + queueSummary +} from '../tools/service/queue.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'service-queue'); +const queueDir = path.join(tempRoot, 'queue'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await ensureQueueDir(queueDir); + +const baseJob = { + createdAt: new Date().toISOString(), + repo: '/tmp/repo', + mode: 'all', + reason: 'test' +}; + +await enqueueJob(queueDir, { ...baseJob, id: 'job-index' }, null, 
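+  // Assumed queue API shape: enqueueJob(queueDir, job, options, lane). The null
+  // third argument appears to be an unused options slot, and the trailing lane
+  // name keeps the 'index' and 'embeddings' queues independent (see the
+  // per-lane queueSummary calls below).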
'index'); +await enqueueJob(queueDir, { ...baseJob, id: 'job-embed' }, null, 'embeddings'); + +const summaryIndex = await queueSummary(queueDir, 'index'); +const summaryEmbed = await queueSummary(queueDir, 'embeddings'); +if (summaryIndex.total !== 1 || summaryEmbed.total !== 1) { + console.error('Queue summary counts mismatch'); + process.exit(1); +} + +const job = await claimNextJob(queueDir, 'index'); +if (!job || job.status !== 'running') { + console.error('Expected queued job to transition to running'); + process.exit(1); +} +await completeJob(queueDir, job.id, 'failed', { exitCode: 1 }, 'index'); + +const summaryAfter = await queueSummary(queueDir, 'index'); +if (summaryAfter.failed !== 1) { + console.error('Expected failed job count to be 1'); + process.exit(1); +} + +console.log('service queue test passed'); diff --git a/tests/setup-index-detection.js b/tests/setup-index-detection.js new file mode 100644 index 000000000..3ba5a69c6 --- /dev/null +++ b/tests/setup-index-detection.js @@ -0,0 +1,122 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'setup-index-detection'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; + +await fsPromises.writeFile(path.join(repoRoot, '.pairofcleats.json'), '{}'); +await fsPromises.writeFile(path.join(repoRoot, 'README.md'), 'setup detection fixture\n'); + +const userConfig = loadUserConfig(repoRoot); +const codeIndexDir = getIndexDir(repoRoot, 'code', userConfig); + +async function resetIndexDir() { + await fsPromises.rm(codeIndexDir, { recursive: true, force: true }); + await fsPromises.mkdir(codeIndexDir, { recursive: true }); +} + +function runSetup(label) { + const result = spawnSync( + process.execPath, + [ + path.join(root, 'tools', 'setup.js'), + '--repo', + repoRoot, + '--non-interactive', + '--json', + '--skip-install', + '--skip-dicts', + '--skip-models', + '--skip-extensions', + '--skip-tooling', + '--skip-index', + '--skip-sqlite', + '--skip-artifacts' + ], + { + cwd: repoRoot, + encoding: 'utf8', + env: { ...process.env, PAIROFCLEATS_CACHE_ROOT: cacheRoot } + } + ); + if (result.status !== 0) { + console.error(`setup index detection failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
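+    // The scenarios below fabricate each chunk_meta layout by hand (plain JSON,
+    // JSONL, meta.json plus part shards, and an empty parts list that must read
+    // as not ready) so readiness detection is exercised without a real build.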
1); + } + let payload = null; + try { + payload = JSON.parse(result.stdout || '{}'); + } catch { + console.error(`setup index detection failed: ${label} (invalid JSON output)`); + process.exit(1); + } + return payload; +} + +const scenarios = [ + { + label: 'chunk_meta.json', + build: async () => { + await fsPromises.writeFile(path.join(codeIndexDir, 'chunk_meta.json'), '[]'); + }, + expectReady: true + }, + { + label: 'chunk_meta.jsonl', + build: async () => { + await fsPromises.writeFile(path.join(codeIndexDir, 'chunk_meta.jsonl'), '{}\n'); + }, + expectReady: true + }, + { + label: 'chunk_meta.meta.json + parts', + build: async () => { + const partsDir = path.join(codeIndexDir, 'chunk_meta.parts'); + await fsPromises.mkdir(partsDir, { recursive: true }); + const partName = 'chunk_meta.part-00000.jsonl'; + await fsPromises.writeFile(path.join(partsDir, partName), '{}\n'); + const meta = { parts: [path.join('chunk_meta.parts', partName)], count: 1 }; + await fsPromises.writeFile( + path.join(codeIndexDir, 'chunk_meta.meta.json'), + JSON.stringify(meta, null, 2) + ); + }, + expectReady: true + }, + { + label: 'chunk_meta.meta.json without parts', + build: async () => { + await fsPromises.writeFile( + path.join(codeIndexDir, 'chunk_meta.meta.json'), + JSON.stringify({ parts: [], count: 0 }, null, 2) + ); + }, + expectReady: false + } +]; + +for (const scenario of scenarios) { + await resetIndexDir(); + await scenario.build(); + const payload = runSetup(scenario.label); + const ready = payload?.steps?.index?.ready === true; + if (ready !== scenario.expectReady) { + console.error( + `setup index detection failed: ${scenario.label} expected ready=${scenario.expectReady}, got ${ready}` + ); + process.exit(1); + } +} + +console.log('setup index detection tests passed'); diff --git a/tests/shard-merge.js b/tests/shard-merge.js new file mode 100644 index 000000000..0a8e52333 --- /dev/null +++ b/tests/shard-merge.js @@ -0,0 +1,98 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; +import { MAX_JSON_BYTES, loadChunkMeta, loadTokenPostings } from '../src/shared/artifact-io.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'shard-merge'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRootA = path.join(tempRoot, 'cache-a'); +const cacheRootB = path.join(tempRoot, 'cache-b'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); +await fsPromises.mkdir(path.join(repoRoot, 'lib'), { recursive: true }); +await fsPromises.mkdir(cacheRootA, { recursive: true }); +await fsPromises.mkdir(cacheRootB, { recursive: true }); + +await fsPromises.writeFile(path.join(repoRoot, 'src', 'alpha.js'), 'export const alpha = 1;\n'); +await fsPromises.writeFile(path.join(repoRoot, 'lib', 'beta.py'), 'def beta():\n return 2\n'); + +const configPath = path.join(repoRoot, '.pairofcleats.json'); +const writeConfig = async (shardsEnabled) => { + await fsPromises.writeFile( + configPath, + JSON.stringify({ + indexing: { + fileListSampleSize: 10, + shards: { + enabled: shardsEnabled, + maxWorkers: 1, + minFiles: 1 + }, + treeSitter: { enabled: false } + } + }, null, 2) + ); +}; + +const runBuild = (cacheRoot, label) => { + const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' + 
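+    // Build A (shards disabled) and build B (shards enabled) must emit
+    // byte-identical chunk metadata and token postings; the JSON string
+    // comparisons below treat any ordering drift in the shard merge as failure.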
}; + const result = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } + ); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } +}; + +const readIndex = (cacheRoot) => { + const previousCacheRoot = process.env.PAIROFCLEATS_CACHE_ROOT; + process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; + const userConfig = loadUserConfig(repoRoot); + const codeDir = getIndexDir(repoRoot, 'code', userConfig); + const chunks = loadChunkMeta(codeDir, { maxBytes: MAX_JSON_BYTES }); + const tokenIndex = loadTokenPostings(codeDir, { maxBytes: MAX_JSON_BYTES }); + if (previousCacheRoot === undefined) { + delete process.env.PAIROFCLEATS_CACHE_ROOT; + } else { + process.env.PAIROFCLEATS_CACHE_ROOT = previousCacheRoot; + } + return { chunks, tokenIndex }; +}; + +await writeConfig(false); +runBuild(cacheRootA, 'baseline build'); +const baseline = readIndex(cacheRootA); + +await writeConfig(true); +runBuild(cacheRootB, 'sharded build'); +const sharded = readIndex(cacheRootB); + +if (baseline.chunks.length !== sharded.chunks.length) { + console.error('Shard merge mismatch: chunk counts differ'); + process.exit(1); +} +if (JSON.stringify(baseline.chunks) !== JSON.stringify(sharded.chunks)) { + console.error('Shard merge mismatch: chunk metadata differs'); + process.exit(1); +} +if (JSON.stringify(baseline.tokenIndex.vocab) !== JSON.stringify(sharded.tokenIndex.vocab)) { + console.error('Shard merge mismatch: token vocab differs'); + process.exit(1); +} +if (JSON.stringify(baseline.tokenIndex.postings) !== JSON.stringify(sharded.tokenIndex.postings)) { + console.error('Shard merge mismatch: token postings differ'); + process.exit(1); +} + +console.log('shard merge test passed'); diff --git a/tests/shard-plan.js b/tests/shard-plan.js new file mode 100644 index 000000000..20b0174be --- /dev/null +++ b/tests/shard-plan.js @@ -0,0 +1,73 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import path from 'node:path'; +import { planShards } from '../src/index/build/shards.js'; + +const makeEntry = (rel) => ({ + rel, + abs: path.join('C:\\repo', rel) +}); + +const entriesA = [ + makeEntry('src/sub/a.js'), + makeEntry('src/sub/b.js'), + makeEntry('src/root.js') +]; +const shardsA = planShards(entriesA, { + mode: 'code', + dirDepth: 2, + lineCounts: new Map() +}); +const labelsA = new Set(shardsA.map((shard) => shard.label)); +assert.ok(labelsA.has('src/javascript'), 'parent shard missing'); +assert.ok(labelsA.has('./javascript'), 'root shard missing'); +assert.ok(!Array.from(labelsA).some((label) => label.startsWith('src/sub/'))); +const srcShardA = shardsA.find((shard) => shard.label === 'src/javascript'); +assert.equal(srcShardA.entries.length, 2); + +const entriesB = []; +const lineCountsB = new Map(); +for (let i = 0; i < 10; i += 1) { + const rel = `src/large${i}/file.js`; + entriesB.push(makeEntry(rel)); + lineCountsB.set(rel, 100); +} +entriesB.push(makeEntry('src/huge/file.js')); +lineCountsB.set('src/huge/file.js', 60); +entriesB.push(makeEntry('src/small/file.js')); +lineCountsB.set('src/small/file.js', 1); +const shardsB = planShards(entriesB, { + mode: 'code', + dirDepth: 2, + lineCounts: lineCountsB +}); +const labelsB = new Set(shardsB.map((shard) => shard.label)); +assert.ok(labelsB.has('src/huge/javascript'), 'huge-file shard should stay separate'); +assert.ok(!labelsB.has('src/small/javascript'), 'small shard should merge to parent'); +const 
parentB = shardsB.find((shard) => shard.label === 'src/javascript'); +assert.ok(parentB, 'parent shard should exist for merged subdirs'); +assert.ok(parentB.entries.some((entry) => entry.rel === 'src/small/file.js')); + +const entriesC = []; +const lineCountsC = new Map(); +const extensions = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']; +for (const ext of extensions) { + const rel = `file.${ext}`; + entriesC.push(makeEntry(rel)); + lineCountsC.set(rel, 10); +} +for (let i = 0; i < 10; i += 1) { + const rel = `big${i}.js`; + entriesC.push(makeEntry(rel)); + lineCountsC.set(rel, 10); +} +const shardsC = planShards(entriesC, { + mode: 'code', + dirDepth: 1, + lineCounts: lineCountsC +}); +const splitParts = shardsC.filter((shard) => shard.label.startsWith('./javascript#')); +assert.equal(splitParts.length, 10, 'expected split shards for large group'); +assert.ok(splitParts.every((shard) => shard.splitFrom === './javascript')); + +console.log('shard-plan test passed.'); diff --git a/tests/skip-minified-binary.js b/tests/skip-minified-binary.js new file mode 100644 index 000000000..aad6a6ae2 --- /dev/null +++ b/tests/skip-minified-binary.js @@ -0,0 +1,94 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, getMetricsDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'skip-minified-binary'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +const configPath = path.join(repoRoot, '.pairofcleats.json'); +await fsPromises.writeFile( + configPath, + JSON.stringify({ + indexing: { + maxFileBytes: 200000, + fileListSampleSize: 20, + treeSitter: { enabled: false } + } + }, null, 2) +); + +const minifiedPath = path.join(repoRoot, 'app.min.js'); +const binaryPath = path.join(repoRoot, 'binary.png'); +const normalPath = path.join(repoRoot, 'normal.js'); +await fsPromises.writeFile(minifiedPath, 'function minified(){return 42;}'); +await fsPromises.writeFile(normalPath, 'function ok() { return 1; }\n'); +await fsPromises.copyFile( + path.join(root, 'tests', 'fixtures', 'binary', 'sample.png'), + binaryPath +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: repoRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('Failed: build_index'); + process.exit(buildResult.status ?? 
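+  // The fixture leans on the indexer's skip heuristics: app.min.js (a
+  // single-line .min.js file) should be skipped as "minified" and sample.png as
+  // "binary", with both reasons surfaced in .filelists.json and the metrics.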
1); +} + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(repoRoot); +const codeDir = getIndexDir(repoRoot, 'code', userConfig); +const fileListsPath = path.join(codeDir, '.filelists.json'); +if (!fs.existsSync(fileListsPath)) { + console.error('Missing .filelists.json'); + process.exit(1); +} +const fileLists = JSON.parse(await fsPromises.readFile(fileListsPath, 'utf8')); +const skippedSample = fileLists?.skipped?.sample; +if (!Array.isArray(skippedSample)) { + console.error('Skipped sample payload is not an array'); + process.exit(1); +} +const minifiedSkip = skippedSample.find((entry) => entry?.file && entry.file.endsWith('app.min.js')); +if (!minifiedSkip || minifiedSkip.reason !== 'minified') { + console.error('Expected minified skip entry for app.min.js'); + process.exit(1); +} +const binarySkip = skippedSample.find((entry) => entry?.file && entry.file.endsWith('binary.png')); +if (!binarySkip || binarySkip.reason !== 'binary') { + console.error('Expected binary skip entry for binary.png'); + process.exit(1); +} + +const metricsDir = getMetricsDir(repoRoot, userConfig); +const metricsPath = path.join(metricsDir, 'index-code.json'); +if (!fs.existsSync(metricsPath)) { + console.error('Missing index-code metrics'); + process.exit(1); +} +const metrics = JSON.parse(await fsPromises.readFile(metricsPath, 'utf8')); +const minifiedCount = metrics?.files?.skippedByReason?.minified || 0; +const binaryCount = metrics?.files?.skippedByReason?.binary || 0; +if (minifiedCount < 1 || binaryCount < 1) { + console.error('Expected skippedByReason.minified and skippedByReason.binary to be >= 1'); + process.exit(1); +} + +console.log('minified/binary skip test passed'); diff --git a/tests/smoke-embeddings.js b/tests/smoke-embeddings.js new file mode 100644 index 000000000..ab60e474b --- /dev/null +++ b/tests/smoke-embeddings.js @@ -0,0 +1,24 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { cleanup, runNode, root } from './smoke-utils.js'; + +const cacheRoots = [ + path.join(root, 'tests', '.cache', 'build-embeddings-cache'), + path.join(root, 'tests', '.cache', 'embeddings-dims-mismatch') +]; + +let failure = null; +try { + await cleanup(cacheRoots); + runNode('embeddings-cache', path.join(root, 'tests', 'build-embeddings-cache.js')); + runNode('embeddings-dims-mismatch', path.join(root, 'tests', 'embeddings-dims-mismatch.js')); +} catch (err) { + console.error(err?.message || err); + failure = err; +} +await cleanup(cacheRoots); + +if (failure) { + process.exit(failure.exitCode ?? 
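+  // Smoke lanes share one pattern: clean cache dirs before and after, run each
+  // child test via runNode (which attaches `exitCode` to the thrown error, see
+  // tests/smoke-utils.js), and exit with the child's real code.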
1); +} +console.log('smoke embeddings passed'); diff --git a/tests/smoke-retrieval.js b/tests/smoke-retrieval.js new file mode 100644 index 000000000..6f4f7bcc4 --- /dev/null +++ b/tests/smoke-retrieval.js @@ -0,0 +1,156 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { cleanup, root } from './smoke-utils.js'; + +const tempRoot = path.join(root, 'tests', '.cache', 'smoke-retrieval'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const searchPath = path.join(root, 'search.js'); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const fail = (message, exitCode = 1) => { + const error = new Error(message); + error.exitCode = exitCode; + throw error; +}; + +const runNode = (label, args, options = {}) => { + const result = spawnSync(process.execPath, args, { env, encoding: 'utf8', ...options }); + if (result.status !== 0) { + const stderr = result.stderr ? result.stderr.trim() : ''; + if (stderr) console.error(stderr); + fail(`Failed: ${label}`, result.status ?? 1); + } + return result; +}; + +let failure = null; +try { + await cleanup([tempRoot]); + await fsPromises.mkdir(cacheRoot, { recursive: true }); + await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + + const build = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { env, stdio: 'inherit' } + ); + if (build.status !== 0) { + fail('smoke retrieval failed: build_index failed', build.status ?? 1); + } + + const helpResult = spawnSync(process.execPath, [searchPath], { encoding: 'utf8' }); + if (helpResult.status === 0) { + fail('Expected search help to exit non-zero with no query.'); + } + const helpOutput = `${helpResult.stdout || ''}${helpResult.stderr || ''}`; + const requiredFlags = ['--calls', '--uses', '--author', '--import', '--explain']; + for (const flag of requiredFlags) { + if (!helpOutput.includes(flag)) { + fail(`Help output missing flag: ${flag}`); + } + } + + const rrfResult = runNode( + 'search rrf', + [searchPath, 'return', '--mode', 'code', '--ann', '--json', '--repo', repoRoot] + ); + let rrfPayload = null; + try { + rrfPayload = JSON.parse(rrfResult.stdout || '{}'); + } catch { + fail('search rrf test failed: invalid JSON output'); + } + const rrfHit = rrfPayload?.code?.[0]; + if (!rrfPayload?.stats?.annActive) { + fail('search rrf test failed: annActive was false'); + } + if (!rrfHit?.scoreBreakdown?.rrf) { + fail('search rrf test failed: scoreBreakdown.rrf missing'); + } + if (rrfHit.scoreType !== 'rrf') { + fail(`search rrf test failed: expected scoreType rrf, got ${rrfHit.scoreType}`); + } + + const filterResult = runNode( + 'search filters', + [ + searchPath, + 'return', + '--mode', + 'code', + '--json', + '--no-ann', + '--repo', + repoRoot, + '--file', + 'src/index.js' + ] + ); + const filterPayload = JSON.parse(filterResult.stdout || '{}'); + const filterHits = filterPayload?.code || []; + if (!filterHits.length) { + fail('search filter test failed: no results returned'); + } + const badFilterHit = filterHits.find((hit) => !(hit.file || '').replace(/\\/g, '/').endsWith('src/index.js')); + if (badFilterHit) { + fail('search filter test failed: file filter mismatch'); + } + + const stripAnsi = (value) => value.replace(/\u001b\[[0-9;]*m/g, ''); + const 
explainResult = runNode( + 'search explain', + [searchPath, 'return', '--mode', 'code', '--no-ann', '--repo', repoRoot, '--explain'] + ); + const explainOutput = stripAnsi(`${explainResult.stdout || ''}${explainResult.stderr || ''}`); + if (!explainOutput.includes('Score:')) { + fail('Explain output missing Score breakdown.'); + } + if (!explainOutput.includes('Sparse:')) { + fail('Explain output missing Sparse breakdown.'); + } + + const blendConfig = { + search: { + scoreBlend: { + enabled: true, + sparseWeight: 0.6, + annWeight: 0.4 + } + } + }; + await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + `${JSON.stringify(blendConfig, null, 2)}\n` + ); + + const blendResult = runNode( + 'search blend', + [searchPath, 'return', '--mode', 'code', '--ann', '--json', '--repo', repoRoot] + ); + const blendPayload = JSON.parse(blendResult.stdout || '{}'); + const blendHit = blendPayload?.code?.[0]; + if (!blendHit?.scoreBreakdown?.blend) { + fail('search blend test failed: scoreBreakdown.blend missing'); + } + if (blendHit.scoreType !== 'blend') { + fail(`search blend test failed: expected scoreType blend, got ${blendHit.scoreType}`); + } +} catch (err) { + console.error(err?.message || err); + failure = err; +} + +await cleanup([tempRoot]); +if (failure) { + process.exit(failure.exitCode ?? 1); +} +console.log('smoke retrieval passed'); diff --git a/tests/smoke-section1.js b/tests/smoke-section1.js new file mode 100644 index 000000000..1b0094a6c --- /dev/null +++ b/tests/smoke-section1.js @@ -0,0 +1,24 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { cleanup, runNode, root } from './smoke-utils.js'; + +const cacheRoots = [ + path.join(root, 'tests', '.cache', 'core-api'), + path.join(root, 'tests', '.cache', 'api-server') +]; + +let failure = null; +try { + await cleanup(cacheRoots); + runNode('core-api', path.join(root, 'tests', 'core-api.js')); + runNode('api-server', path.join(root, 'tests', 'api-server.js')); +} catch (err) { + console.error(err?.message || err); + failure = err; +} +await cleanup(cacheRoots); + +if (failure) { + process.exit(failure.exitCode ?? 1); +} +console.log('smoke section1 passed'); diff --git a/tests/smoke-services.js b/tests/smoke-services.js new file mode 100644 index 000000000..a9e5ebea5 --- /dev/null +++ b/tests/smoke-services.js @@ -0,0 +1,20 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { cleanup, runNode, root } from './smoke-utils.js'; + +const cacheRoots = [path.join(root, 'tests', '.cache', 'mcp-server')]; + +let failure = null; +try { + await cleanup(cacheRoots); + runNode('mcp-server', path.join(root, 'tests', 'mcp-server.js')); +} catch (err) { + console.error(err?.message || err); + failure = err; +} +await cleanup(cacheRoots); + +if (failure) { + process.exit(failure.exitCode ?? 
1); +} +console.log('smoke services passed'); diff --git a/tests/smoke-sqlite.js b/tests/smoke-sqlite.js new file mode 100644 index 000000000..04b373a91 --- /dev/null +++ b/tests/smoke-sqlite.js @@ -0,0 +1,24 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { cleanup, runNode, root } from './smoke-utils.js'; + +const cacheRoots = [ + path.join(root, 'tests', '.cache', 'sqlite-incremental'), + path.join(root, 'tests', '.cache', 'sqlite-ann-fallback') +]; + +let failure = null; +try { + await cleanup(cacheRoots); + runNode('sqlite-incremental', path.join(root, 'tests', 'sqlite-incremental.js')); + runNode('sqlite-ann-fallback', path.join(root, 'tests', 'sqlite-ann-fallback.js')); +} catch (err) { + console.error(err?.message || err); + failure = err; +} +await cleanup(cacheRoots); + +if (failure) { + process.exit(failure.exitCode ?? 1); +} +console.log('smoke sqlite passed'); diff --git a/tests/smoke-utils.js b/tests/smoke-utils.js new file mode 100644 index 000000000..a75e38d1b --- /dev/null +++ b/tests/smoke-utils.js @@ -0,0 +1,23 @@ +import fsPromises from 'node:fs/promises'; +import { spawnSync } from 'node:child_process'; + +export const root = process.cwd(); + +export async function cleanup(paths) { + for (const dir of paths) { + await fsPromises.rm(dir, { recursive: true, force: true }); + } +} + +export function runNode(label, scriptPath, args = [], options = {}) { + const result = spawnSync(process.execPath, [scriptPath, ...args], { + stdio: 'inherit', + ...options + }); + if (result.status !== 0) { + const error = new Error(`Failed: ${label}`); + error.exitCode = result.status ?? 1; + throw error; + } + return result; +} diff --git a/tests/smoke-workers.js b/tests/smoke-workers.js new file mode 100644 index 000000000..f0f959293 --- /dev/null +++ b/tests/smoke-workers.js @@ -0,0 +1,21 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { cleanup, runNode, root } from './smoke-utils.js'; + +const cacheRoots = [path.join(root, 'tests', '.cache', 'language-fidelity')]; + +let failure = null; +try { + await cleanup(cacheRoots); + runNode('worker-pool', path.join(root, 'tests', 'worker-pool.js')); + runNode('language-fidelity', path.join(root, 'tests', 'language-fidelity.js')); +} catch (err) { + console.error(err?.message || err); + failure = err; +} +await cleanup(cacheRoots); + +if (failure) { + process.exit(failure.exitCode ?? 
1); +} +console.log('smoke workers passed'); diff --git a/tests/smoke.js b/tests/smoke.js index d57bde131..b3f47cf10 100644 --- a/tests/smoke.js +++ b/tests/smoke.js @@ -1,19 +1,19 @@ #!/usr/bin/env node import fs from 'node:fs'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { getDictionaryPaths, getDictConfig, getIndexDir, loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; import { normalizePostingsConfig } from '../src/shared/postings-config.js'; import { getVectorExtensionConfig, resolveVectorExtensionPath } from '../tools/vector-extension.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['require-index', 'require-sqlite', 'require-dicts'], - default: { - 'require-index': false, - 'require-sqlite': false, - 'require-dicts': false +const argv = createCli({ + scriptName: 'verify', + options: { + 'require-index': { type: 'boolean', default: false }, + 'require-sqlite': { type: 'boolean', default: false }, + 'require-dicts': { type: 'boolean', default: false } } -}); +}).parse(); const root = process.cwd(); let failures = 0; diff --git a/tests/sqlite-ann-extension.js b/tests/sqlite-ann-extension.js index 7144022db..78febfcdf 100644 --- a/tests/sqlite-ann-extension.js +++ b/tests/sqlite-ann-extension.js @@ -40,8 +40,8 @@ const config = { cache: { root: cacheRoot }, sqlite: { use: true, - annMode: 'extension', vectorExtension: { + annMode: 'extension', path: extensionPath } }, @@ -144,7 +144,8 @@ await fsPromises.rm(deletableFile, { force: true }); run([path.join(root, 'build_index.js'), '--incremental', '--stub-embeddings', '--repo', repoRoot], 'build index (incremental)'); run([path.join(root, 'tools', 'build-sqlite-index.js'), '--incremental', '--mode', 'code', '--repo', repoRoot], 'build sqlite index (incremental)'); -const dbAfter = new Database(sqlitePaths.codePath, { readonly: true }); +const sqlitePathsAfter = resolveSqlitePaths(repoRoot, userConfig); +const dbAfter = new Database(sqlitePathsAfter.codePath, { readonly: true }); try { dbAfter.loadExtension(extensionPath); } catch (err) { diff --git a/tests/sqlite-ann-fallback.js b/tests/sqlite-ann-fallback.js new file mode 100644 index 000000000..7f5adca3d --- /dev/null +++ b/tests/sqlite-ann-fallback.js @@ -0,0 +1,88 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-ann-fallback'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); +const missingExtensionPath = path.join(tempRoot, 'missing', 'vec0-missing.node'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, 'src', 'alpha.js'), + 'export const alpha = () => "ann_fallback_token";\n' +); + +const config = { + cache: { root: cacheRoot }, + dictionary: { languages: ['en'] }, + sqlite: { + use: true, + vectorExtension: { + annMode: 'extension', + path: missingExtensionPath + } + } +}; +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify(config, null, 2) + '\n' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const runNode = (label, args) => { + 
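+  // vectorExtension.path points at a file that never exists, so loading the
+  // sqlite ANN extension must fail; the expectation is a quiet fallback to a
+  // non-extension ANN backend that still returns hits (asserted below).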
const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, stdio: 'inherit' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } +}; + +runNode('build_index', [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot]); +runNode('build_sqlite', [path.join(root, 'tools', 'build-sqlite-index.js'), '--repo', repoRoot]); + +const searchResult = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'ann_fallback_token', '--backend', 'sqlite', '--ann', '--json', '--repo', repoRoot], + { env, encoding: 'utf8' } +); +if (searchResult.status !== 0) { + console.error('sqlite ann fallback test failed: search returned error'); + if (searchResult.stderr) console.error(searchResult.stderr.trim()); + process.exit(searchResult.status ?? 1); +} + +let payload = null; +try { + payload = JSON.parse(searchResult.stdout || '{}'); +} catch { + console.error('sqlite ann fallback test failed: invalid JSON output'); + process.exit(1); +} + +const hits = payload?.code || []; +if (!hits.length) { + console.error('sqlite ann fallback test failed: no results returned'); + process.exit(1); +} +if (payload?.stats?.annBackend === 'sqlite-extension') { + console.error('sqlite ann fallback test failed: ann backend should not be sqlite-extension'); + process.exit(1); +} +if (payload?.stats?.annExtension?.available?.code) { + console.error('sqlite ann fallback test failed: ann extension should be unavailable'); + process.exit(1); +} + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +console.log('sqlite ann fallback test passed'); diff --git a/tests/sqlite-auto-backend.js b/tests/sqlite-auto-backend.js new file mode 100644 index 000000000..dc34086da --- /dev/null +++ b/tests/sqlite-auto-backend.js @@ -0,0 +1,69 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-auto'); +const cacheRoot = path.join(tempRoot, '.cache'); +const searchPath = path.join(root, 'search.js'); +const buildIndexPath = path.join(root, 'build_index.js'); +const buildSqlitePath = path.join(root, 'tools', 'build-sqlite-index.js'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); + +const sampleCode = ` +export function greet(name) { + return "hello " + name; +} +`; +await fsPromises.writeFile(path.join(tempRoot, 'sample.js'), sampleCode); + +const writeConfig = async (threshold) => { + const config = { + sqlite: { use: true }, + search: { sqliteAutoChunkThreshold: threshold, annDefault: false } + }; + await fsPromises.writeFile( + path.join(tempRoot, '.pairofcleats.json'), + JSON.stringify(config, null, 2) + ); +}; + +await writeConfig(1); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const run = (args, label) => { + const result = spawnSync(process.execPath, args, { cwd: tempRoot, env, encoding: 'utf8' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
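+    // sqliteAutoChunkThreshold drives backend auto-selection: threshold 1 lets
+    // this tiny index qualify for sqlite, while 999999 should fall back to the
+    // in-memory backend; both outcomes are asserted via payload.backend below.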
1); + } + return result.stdout || ''; +}; + +run([buildIndexPath, '--stub-embeddings', '--repo', tempRoot], 'build index'); +run([buildSqlitePath, '--repo', tempRoot], 'build sqlite'); + +const backendA = JSON.parse(run([searchPath, 'greet', '--json', '--repo', tempRoot], 'search auto sqlite')).backend; +if (backendA !== 'sqlite') { + console.error(`Expected sqlite backend for threshold=1, got ${backendA}`); + process.exit(1); +} + +await writeConfig(999999); +const backendB = JSON.parse(run([searchPath, 'greet', '--json', '--repo', tempRoot], 'search auto memory')).backend; +if (backendB !== 'memory') { + console.error(`Expected memory backend for threshold=999999, got ${backendB}`); + process.exit(1); +} + +console.log('SQLite auto backend test passed'); diff --git a/tests/sqlite-build-delete.js b/tests/sqlite-build-delete.js new file mode 100644 index 000000000..0b8e9fb2f --- /dev/null +++ b/tests/sqlite-build-delete.js @@ -0,0 +1,67 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { deleteDocIds } from '../src/storage/sqlite/build/delete.js'; + +let Database; +try { + ({ default: Database } = await import('better-sqlite3')); +} catch (err) { + console.error('better-sqlite3 is required for sqlite build delete test.'); + process.exit(1); +} + +const db = new Database(':memory:'); +db.exec(` + CREATE TABLE chunks (id INTEGER, mode TEXT); + CREATE TABLE chunks_fts (rowid INTEGER, mode TEXT); + CREATE TABLE token_postings (doc_id INTEGER, mode TEXT); + CREATE TABLE phrase_postings (doc_id INTEGER, mode TEXT); + CREATE TABLE chargram_postings (doc_id INTEGER, mode TEXT); + CREATE TABLE minhash_signatures (doc_id INTEGER, mode TEXT); + CREATE TABLE dense_vectors (doc_id INTEGER, mode TEXT); + CREATE TABLE doc_lengths (doc_id INTEGER, mode TEXT); + CREATE TABLE dense_vectors_ann (id INTEGER PRIMARY KEY, embedding BLOB); +`); + +const insertChunk = db.prepare('INSERT INTO chunks (id, mode) VALUES (?, ?)'); +const insertChunkFts = db.prepare('INSERT INTO chunks_fts (rowid, mode) VALUES (?, ?)'); +const insertDoc = (table) => db.prepare(`INSERT INTO ${table} (doc_id, mode) VALUES (?, ?)`); +const insertAnn = db.prepare('INSERT INTO dense_vectors_ann (id, embedding) VALUES (?, ?)'); + +for (const id of [1, 2]) { + insertChunk.run(id, 'code'); + insertChunkFts.run(id, 'code'); + insertDoc('token_postings').run(id, 'code'); + insertDoc('phrase_postings').run(id, 'code'); + insertDoc('chargram_postings').run(id, 'code'); + insertDoc('minhash_signatures').run(id, 'code'); + insertDoc('dense_vectors').run(id, 'code'); + insertDoc('doc_lengths').run(id, 'code'); + insertAnn.run(id, Buffer.from('x')); +} + +insertChunk.run(1, 'prose'); +insertChunkFts.run(1, 'prose'); +insertDoc('token_postings').run(1, 'prose'); +insertDoc('doc_lengths').run(1, 'prose'); +insertAnn.run(3, Buffer.from('y')); + +deleteDocIds(db, 'code', [1, 2], [{ table: 'dense_vectors_ann', column: 'id', withMode: false }]); + +const remainingCodeChunks = db.prepare('SELECT COUNT(*) AS total FROM chunks WHERE mode = ?').get('code').total; +assert.equal(remainingCodeChunks, 0, 'expected code chunks to be removed'); +const remainingProseChunks = db.prepare('SELECT COUNT(*) AS total FROM chunks WHERE mode = ?').get('prose').total; +assert.equal(remainingProseChunks, 1, 'expected prose chunks to remain'); + +const remainingTokens = db.prepare('SELECT COUNT(*) AS total FROM token_postings WHERE mode = ?').get('code').total; +assert.equal(remainingTokens, 0, 'expected code token postings to be removed'); + +const 
remainingAnn = db.prepare('SELECT COUNT(*) AS total FROM dense_vectors_ann').get().total; +assert.equal(remainingAnn, 1, 'expected ANN rows to be removed for deleted ids'); + +const remainingAnnRow = db.prepare('SELECT id FROM dense_vectors_ann').get(); +assert.equal(remainingAnnRow.id, 3, 'expected ANN row for other ids to remain'); + +db.close(); + +console.log('sqlite build delete test passed'); diff --git a/tests/sqlite-build-indexes.js b/tests/sqlite-build-indexes.js new file mode 100644 index 000000000..d37d6922c --- /dev/null +++ b/tests/sqlite-build-indexes.js @@ -0,0 +1,93 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; + +let Database = null; +try { + ({ default: Database } = await import('better-sqlite3')); +} catch (err) { + console.error(`better-sqlite3 missing: ${err?.message || err}`); + process.exit(1); +} + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-build-indexes'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile(path.join(repoRoot, 'alpha.js'), 'const alpha = 1;\n'); +await fsPromises.writeFile(path.join(repoRoot, 'beta.js'), 'const beta = 2;\n'); +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ + indexing: { + treeSitter: { enabled: false }, + artifacts: { + chunkMetaFormat: 'jsonl', + chunkMetaShardSize: 1, + tokenPostingsFormat: 'sharded', + tokenPostingsShardSize: 1 + } + } + }, null, 2) +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const runNode = (label, args) => { + const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, stdio: 'inherit' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 
1); + } +}; + +runNode('build_index', [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot]); +runNode('build_index_stage4', [path.join(root, 'build_index.js'), '--stub-embeddings', '--stage', 'stage4', '--repo', repoRoot]); + +const previousCacheRoot = process.env.PAIROFCLEATS_CACHE_ROOT; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(repoRoot); +const indexDir = getIndexDir(repoRoot, 'code', userConfig); +const chunkMetaPartsDir = path.join(indexDir, 'chunk_meta.parts'); +const tokenPostingsShardsDir = path.join(indexDir, 'token_postings.shards'); +if (!fs.existsSync(chunkMetaPartsDir)) { + console.error(`Expected chunk_meta.parts to exist at ${chunkMetaPartsDir}`); + process.exit(1); +} +if (!fs.existsSync(tokenPostingsShardsDir)) { + console.error(`Expected token_postings.shards to exist at ${tokenPostingsShardsDir}`); + process.exit(1); +} +const chunkMetaJson = path.join(indexDir, 'chunk_meta.json'); +if (fs.existsSync(chunkMetaJson)) { + console.error(`Expected chunk_meta.json to be absent at ${chunkMetaJson}`); + process.exit(1); +} +const sqlitePaths = resolveSqlitePaths(repoRoot, {}); +if (previousCacheRoot === undefined) { + delete process.env.PAIROFCLEATS_CACHE_ROOT; +} else { + process.env.PAIROFCLEATS_CACHE_ROOT = previousCacheRoot; +} +const db = new Database(sqlitePaths.codePath); +const indexList = db.prepare("PRAGMA index_list('token_postings')").all(); +const indexNames = new Set(indexList.map((row) => row.name)); +if (!indexNames.has('idx_token_postings_token')) { + console.error('Expected idx_token_postings_token to exist'); + process.exit(1); +} +db.close(); + +console.log('sqlite build indexes test passed'); diff --git a/tests/sqlite-build-manifest.js b/tests/sqlite-build-manifest.js new file mode 100644 index 000000000..5000cf185 --- /dev/null +++ b/tests/sqlite-build-manifest.js @@ -0,0 +1,32 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { diffFileManifests, isManifestMatch, normalizeManifestFiles } from '../src/storage/sqlite/build/manifest.js'; + +const manifestFiles = { + 'src/conflict.js': { hash: 'aaa', mtimeMs: 1, size: 10 }, + 'src\\conflict.js': { hash: 'bbb', mtimeMs: 2, size: 20 }, + 'src/unchanged.js': { hash: 'keep', mtimeMs: 5, size: 50 }, + 'src/changed.js': { mtimeMs: 9, size: 90 } +}; + +const normalized = normalizeManifestFiles(manifestFiles); +assert.ok(normalized.conflicts.includes('src/conflict.js'), 'expected conflict to be recorded'); +assert.equal(normalized.entries.length, 3, 'expected normalized entries to dedupe conflicts'); + +const dbFiles = new Map(); +dbFiles.set('src/unchanged.js', { hash: 'keep', mtimeMs: 5, size: 50 }); +dbFiles.set('src/changed.js', { hash: 'old', mtimeMs: 8, size: 90 }); +dbFiles.set('src/deleted.js', { hash: 'gone', mtimeMs: 1, size: 10 }); + +const { changed, deleted } = diffFileManifests(normalized.entries, dbFiles); + +assert.ok(changed.some((record) => record.normalized === 'src/changed.js'), 'expected changed file to be detected'); +assert.ok(!changed.some((record) => record.normalized === 'src/unchanged.js'), 'expected unchanged file to be skipped'); +assert.deepEqual(deleted, ['src/deleted.js'], 'expected deleted file list'); + +const matchByHash = isManifestMatch({ hash: 'abc' }, { hash: 'abc', mtimeMs: 1, size: 1 }); +assert.equal(matchByHash, true, 'expected hash match to win'); +const matchByMeta = isManifestMatch({ mtimeMs: 5, size: 50 }, { mtimeMs: 5, size: 50 }); +assert.equal(matchByMeta, true, 'expected 
mtime+size match'); + +console.log('sqlite build manifest test passed'); diff --git a/tests/sqlite-build-vocab.js b/tests/sqlite-build-vocab.js new file mode 100644 index 000000000..d3f32ef41 --- /dev/null +++ b/tests/sqlite-build-vocab.js @@ -0,0 +1,56 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { ensureVocabIds } from '../src/storage/sqlite/build/vocab.js'; + +let Database; +try { + ({ default: Database } = await import('better-sqlite3')); +} catch (err) { + console.error('better-sqlite3 is required for sqlite build vocab test.'); + process.exit(1); +} + +const db = new Database(':memory:'); +db.exec('CREATE TABLE token_vocab (mode TEXT, token_id INTEGER, token TEXT, PRIMARY KEY (mode, token_id))'); + +const insertSeed = db.prepare('INSERT INTO token_vocab (mode, token_id, token) VALUES (?, ?, ?)'); +insertSeed.run('code', 0, 'alpha'); +insertSeed.run('code', 1, 'beta'); +insertSeed.run('prose', 0, 'beta'); + +const insertStmt = db.prepare('INSERT OR REPLACE INTO token_vocab (mode, token_id, token) VALUES (?, ?, ?)'); + +let result = ensureVocabIds( + db, + 'code', + 'token_vocab', + 'token_id', + 'token', + ['beta', 'gamma', 'beta'], + insertStmt +); +assert.equal(result.inserted, 1, 'expected one new token'); +assert.equal(result.map.get('beta'), 1, 'expected existing token id'); +assert.equal(result.map.get('gamma'), 2, 'expected new token id'); + +const rowCount = db.prepare('SELECT COUNT(*) AS total FROM token_vocab WHERE mode = ?').get('code').total; +assert.equal(rowCount, 3, 'expected vocab size to grow by one'); + +const beforeCount = rowCount; +result = ensureVocabIds( + db, + 'code', + 'token_vocab', + 'token_id', + 'token', + ['delta', 'epsilon'], + insertStmt, + { limits: { ratio: 0.4, absolute: 1 } } +); +assert.equal(result.skip, true, 'expected vocab growth to be skipped'); +const afterCount = db.prepare('SELECT COUNT(*) AS total FROM token_vocab WHERE mode = ?').get('code').total; +assert.equal(afterCount, beforeCount, 'expected vocab size to remain unchanged'); + +db.close(); + +console.log('sqlite build vocab test passed'); diff --git a/tests/sqlite-bundle-missing.js b/tests/sqlite-bundle-missing.js new file mode 100644 index 000000000..874c9871a --- /dev/null +++ b/tests/sqlite-bundle-missing.js @@ -0,0 +1,120 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getRepoCacheRoot, loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-bundle-missing'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const run = (args, label, options = {}) => { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + encoding: 'utf8', + ...options + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + 
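+ // Surface the child's stderr before propagating its exit code.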
process.exit(result.status ?? 1); + } + return result; +}; + +run([ + path.join(root, 'build_index.js'), + '--incremental', + '--stub-embeddings', + '--mode', + 'code', + '--repo', + repoRoot +], 'build index'); + +const userConfig = loadUserConfig(repoRoot); +const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const manifestPath = path.join(repoCacheRoot, 'incremental', 'code', 'manifest.json'); +const bundleDir = path.join(repoCacheRoot, 'incremental', 'code', 'files'); +if (!fs.existsSync(manifestPath)) { + console.error('Missing incremental manifest for sqlite bundle test.'); + process.exit(1); +} +const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf8')); +const manifestFiles = Object.values(manifest.files || {}); +if (!manifestFiles.length) { + console.error('Incremental manifest contains no files.'); + process.exit(1); +} +const bundleName = manifestFiles[0]?.bundle; +if (!bundleName) { + console.error('Manifest entry missing bundle name.'); + process.exit(1); +} +const bundlePath = path.join(bundleDir, bundleName); +if (!fs.existsSync(bundlePath)) { + console.error(`Expected bundle file missing: ${bundlePath}`); + process.exit(1); +} +await fsPromises.rm(bundlePath, { force: true }); + +const sqliteBuild = spawnSync( + process.execPath, + [ + path.join(root, 'tools', 'build-sqlite-index.js'), + '--mode', + 'code', + '--repo', + repoRoot + ], + { cwd: repoRoot, env, encoding: 'utf8' } +); +if (sqliteBuild.status !== 0) { + console.error('build-sqlite-index failed for missing bundle test.'); + if (sqliteBuild.stderr) console.error(sqliteBuild.stderr.trim()); + process.exit(sqliteBuild.status ?? 1); +} +const output = `${sqliteBuild.stdout || ''}\n${sqliteBuild.stderr || ''}`; +if (!output.includes('falling back to file-backed artifacts')) { + console.error('Expected bundle fallback warning not found in output.'); + process.exit(1); +} + +const sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); +if (!fs.existsSync(sqlitePaths.codePath)) { + console.error(`Missing sqlite db after fallback: ${sqlitePaths.codePath}`); + process.exit(1); +} + +let Database; +try { + ({ default: Database } = await import('better-sqlite3')); +} catch { + console.error('better-sqlite3 is required for sqlite bundle test.'); + process.exit(1); +} +const db = new Database(sqlitePaths.codePath, { readonly: true }); +const row = db.prepare('SELECT COUNT(*) AS total FROM chunks WHERE mode = ?').get('code'); +db.close(); +if (!Number(row?.total)) { + console.error('Expected sqlite index to contain chunks after fallback rebuild.'); + process.exit(1); +} + +console.log('sqlite bundle missing fallback test passed'); diff --git a/tests/sqlite-cache.js b/tests/sqlite-cache.js new file mode 100644 index 000000000..172961b47 --- /dev/null +++ b/tests/sqlite-cache.js @@ -0,0 +1,25 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import { createSqliteDbCache } from '../src/retrieval/sqlite-cache.js'; + +const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'pairofcleats-sqlite-cache-')); +const dbPath = path.join(tempRoot, 'index.db'); +await fs.writeFile(dbPath, 'initial'); + +const cache = createSqliteDbCache(); +let closed = false; +const db = { close: () => { closed = true; } }; +cache.set(dbPath, db); + +const first = cache.get(dbPath); +assert.equal(first, db, 'should return cached db'); + +await fs.writeFile(dbPath, 'changed'); +const second = cache.get(dbPath); 
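+// The writeFile above changes the db file's signature (assumed to be derived from mtime/size), so this get() should evict and close the stale handle.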
+assert.equal(second, null, 'should invalidate on signature change'); +assert.equal(closed, true, 'should close invalidated db'); + +console.log('sqlite cache tests passed'); diff --git a/tests/sqlite-chunk-id.js b/tests/sqlite-chunk-id.js new file mode 100644 index 000000000..d5786c405 --- /dev/null +++ b/tests/sqlite-chunk-id.js @@ -0,0 +1,16 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { buildChunkRow } from '../src/storage/sqlite/build-helpers.js'; +import { CREATE_TABLES_BASE_SQL } from '../src/storage/sqlite/schema.js'; + +const chunk = { + file: 'src/example.js', + start: 0, + end: 12, + metaV2: { chunkId: 'chunk_sqlite_1' } +}; +const row = buildChunkRow(chunk, 'code', 0); +assert.equal(row.chunk_id, 'chunk_sqlite_1', 'expected chunk_id in sqlite row'); +assert.ok(CREATE_TABLES_BASE_SQL.includes('chunk_id'), 'expected chunk_id column in sqlite schema'); + +console.log('sqlite chunk id test passed'); diff --git a/tests/sqlite-incremental-no-change.js b/tests/sqlite-incremental-no-change.js new file mode 100644 index 000000000..f67350fb9 --- /dev/null +++ b/tests/sqlite-incremental-no-change.js @@ -0,0 +1,144 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-incremental-no-change'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +const stripMaxOldSpaceFlag = (options) => { + if (!options) return ''; + return options + .replace(/--max-old-space-size=\d+/g, '') + .replace(/--max-old-space-size\s+\d+/g, '') + .replace(/\s+/g, ' ') + .trim(); +}; + +const nodeOptions = stripMaxOldSpaceFlag(process.env.NODE_OPTIONS || ''); + +const rmWithRetries = async (target, { retries = 8, delayMs = 150 } = {}) => { + for (let attempt = 0; attempt <= retries; attempt += 1) { + try { + await fsPromises.rm(target, { recursive: true, force: true }); + return; + } catch (err) { + if (!err || attempt >= retries) throw err; + if (!['EBUSY', 'EPERM', 'ENOTEMPTY'].includes(err.code)) throw err; + await new Promise((resolve) => setTimeout(resolve, delayMs * (attempt + 1))); + } + } +}; + +await rmWithRetries(tempRoot); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub', + PAIROFCLEATS_WORKER_POOL: 'off', + PAIROFCLEATS_MAX_OLD_SPACE_MB: '4096' +}; +if (nodeOptions) { + env.NODE_OPTIONS = nodeOptions; +} else { + delete env.NODE_OPTIONS; +} +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; +process.env.PAIROFCLEATS_WORKER_POOL = 'off'; +process.env.PAIROFCLEATS_MAX_OLD_SPACE_MB = '4096'; + +function run(args, label) { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + stdio: 'inherit' + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 
1); + } +} + +function runCapture(args, label) { + const result = spawnSync(process.execPath, args, { + cwd: repoRoot, + env, + encoding: 'utf8' + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 1); + } + return result; +} + +run([path.join(root, 'build_index.js'), '--incremental', '--stub-embeddings', '--repo', repoRoot], 'build index'); +const initialSqlite = runCapture( + [path.join(root, 'tools', 'build-sqlite-index.js'), '--repo', repoRoot], + 'build sqlite index' +); +const initialOutput = `${initialSqlite.stdout || ''}\n${initialSqlite.stderr || ''}`; +if (!initialOutput.includes('Validation (smoke) ok for code')) { + console.error('Expected sqlite smoke validation for code build.'); + process.exit(1); +} + +const userConfig = loadUserConfig(repoRoot); +let sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); + +let Database; +try { + ({ default: Database } = await import('better-sqlite3')); +} catch (err) { + console.error('better-sqlite3 is required for sqlite incremental no-change test.'); + process.exit(1); +} + +const dbBefore = new Database(sqlitePaths.codePath, { readonly: true }); +const beforeCounts = { + chunks: dbBefore.prepare('SELECT COUNT(*) AS total FROM chunks WHERE mode = ?').get('code').total, + files: dbBefore.prepare('SELECT COUNT(*) AS total FROM file_manifest WHERE mode = ?').get('code').total, + hash: (dbBefore.prepare('SELECT hash FROM file_manifest WHERE mode = ? AND file = ?') + .get('code', 'src/index.js') || {}).hash || null +}; +dbBefore.close(); + +const noChangeResult = runCapture( + [path.join(root, 'tools', 'build-sqlite-index.js'), '--incremental', '--repo', repoRoot], + 'build sqlite index (no change)' +); +const noChangeOutput = `${noChangeResult.stdout || ''}\n${noChangeResult.stderr || ''}`; +if (!noChangeOutput.includes('SQLite indexes updated')) { + console.error('Expected incremental sqlite update output for no-change run.'); + process.exit(1); +} +if (noChangeOutput.includes('rebuilding full index')) { + console.error('Expected no full rebuild for no-change run.'); + process.exit(1); +} + +sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); +const dbAfter = new Database(sqlitePaths.codePath, { readonly: true }); +const afterCounts = { + chunks: dbAfter.prepare('SELECT COUNT(*) AS total FROM chunks WHERE mode = ?').get('code').total, + files: dbAfter.prepare('SELECT COUNT(*) AS total FROM file_manifest WHERE mode = ?').get('code').total, + hash: (dbAfter.prepare('SELECT hash FROM file_manifest WHERE mode = ? 
AND file = ?') + .get('code', 'src/index.js') || {}).hash || null +}; +dbAfter.close(); + +assert.equal(afterCounts.chunks, beforeCounts.chunks, 'expected chunk counts to remain stable'); +assert.equal(afterCounts.files, beforeCounts.files, 'expected file manifest counts to remain stable'); +assert.equal(afterCounts.hash, beforeCounts.hash, 'expected file manifest hash to remain stable'); + +console.log('sqlite incremental no-change test passed'); diff --git a/tests/sqlite-incremental.js b/tests/sqlite-incremental.js index ad345e018..b1c62bd69 100644 --- a/tests/sqlite-incremental.js +++ b/tests/sqlite-incremental.js @@ -3,8 +3,8 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import { loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; -import { SCHEMA_VERSION } from '../src/sqlite/schema.js'; +import { getRepoCacheRoot, loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; +import { SCHEMA_VERSION } from '../src/storage/sqlite/schema.js'; const root = process.cwd(); const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); @@ -12,17 +12,50 @@ const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-incremental'); const repoRoot = path.join(tempRoot, 'repo'); const cacheRoot = path.join(tempRoot, 'cache'); -await fsPromises.rm(tempRoot, { recursive: true, force: true }); +const stripMaxOldSpaceFlag = (options) => { + if (!options) return ''; + return options + .replace(/--max-old-space-size=\d+/g, '') + .replace(/--max-old-space-size\s+\d+/g, '') + .replace(/\s+/g, ' ') + .trim(); +}; + +const nodeOptions = stripMaxOldSpaceFlag(process.env.NODE_OPTIONS || ''); + +const rmWithRetries = async (target, { retries = 8, delayMs = 150 } = {}) => { + for (let attempt = 0; attempt <= retries; attempt += 1) { + try { + await fsPromises.rm(target, { recursive: true, force: true }); + return; + } catch (err) { + if (!err || attempt >= retries) throw err; + if (!['EBUSY', 'EPERM', 'ENOTEMPTY'].includes(err.code)) throw err; + await new Promise((resolve) => setTimeout(resolve, delayMs * (attempt + 1))); + } + } +}; + +await rmWithRetries(tempRoot); await fsPromises.mkdir(tempRoot, { recursive: true }); await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); const env = { ...process.env, PAIROFCLEATS_CACHE_ROOT: cacheRoot, - PAIROFCLEATS_EMBEDDINGS: 'stub' + PAIROFCLEATS_EMBEDDINGS: 'stub', + PAIROFCLEATS_WORKER_POOL: 'off', + PAIROFCLEATS_MAX_OLD_SPACE_MB: '8192' }; +if (nodeOptions) { + env.NODE_OPTIONS = nodeOptions; +} else { + delete env.NODE_OPTIONS; +} process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; +process.env.PAIROFCLEATS_WORKER_POOL = 'off'; +process.env.PAIROFCLEATS_MAX_OLD_SPACE_MB = '8192'; function run(args, label) { const result = spawnSync(process.execPath, args, { @@ -51,10 +84,22 @@ function runCapture(args, label) { } run([path.join(root, 'build_index.js'), '--incremental', '--stub-embeddings', '--repo', repoRoot], 'build index'); -run([path.join(root, 'tools', 'build-sqlite-index.js'), '--repo', repoRoot], 'build sqlite index'); +const initialSqlite = runCapture( + [path.join(root, 'tools', 'build-sqlite-index.js'), '--repo', repoRoot], + 'build sqlite index' +); +const initialOutput = `${initialSqlite.stdout || ''}\n${initialSqlite.stderr || ''}`; +if (!initialOutput.includes('Validation (smoke) ok for code')) { + console.error('Expected sqlite smoke validation for code build.'); + 
process.exit(1); +} +if (!initialOutput.includes('Validation (smoke) ok for prose')) { + console.error('Expected sqlite smoke validation for prose build.'); + process.exit(1); +} const userConfig = loadUserConfig(repoRoot); -const sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); +let sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); let Database; try { @@ -82,6 +127,7 @@ await fsPromises.writeFile(targetFile, updated); run([path.join(root, 'build_index.js'), '--incremental', '--stub-embeddings', '--repo', repoRoot], 'build index (incremental)'); run([path.join(root, 'tools', 'build-sqlite-index.js'), '--incremental', '--repo', repoRoot], 'build sqlite index (incremental)'); +sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); const dbAfter = new Database(sqlitePaths.codePath, { readonly: true }); const afterRow = dbAfter .prepare('SELECT hash, chunk_count FROM file_manifest WHERE mode = ? AND file = ?') @@ -116,6 +162,33 @@ if (!payload.code?.length && !payload.prose?.length) { process.exit(1); } +const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const manifestPath = path.join(repoCacheRoot, 'incremental', 'code', 'manifest.json'); +let manifest = null; +try { + manifest = JSON.parse(await fsPromises.readFile(manifestPath, 'utf8')); +} catch { + console.error('Failed to load incremental manifest for normalization test.'); + process.exit(1); +} +if (!manifest?.files?.['src/index.js']) { + console.error('Expected manifest entry for src/index.js.'); + process.exit(1); +} +manifest.files['src\\index.js'] = manifest.files['src/index.js']; +delete manifest.files['src/index.js']; +await fsPromises.writeFile(manifestPath, JSON.stringify(manifest, null, 2)); + +const normalizedResult = runCapture( + [path.join(root, 'tools', 'build-sqlite-index.js'), '--incremental', '--repo', repoRoot], + 'build sqlite index (normalized manifest)' +); +const normalizedOutput = `${normalizedResult.stdout || ''}\n${normalizedResult.stderr || ''}`; +if (!normalizedOutput.includes('SQLite indexes updated')) { + console.error('Expected incremental sqlite update with normalized manifest.'); + process.exit(1); +} + const downgradeVersion = Math.max(0, SCHEMA_VERSION - 1); const dbDowngrade = new Database(sqlitePaths.codePath); dbDowngrade.pragma(`user_version = ${downgradeVersion}`); diff --git a/tests/sqlite-index-state-fail-closed.js b/tests/sqlite-index-state-fail-closed.js new file mode 100644 index 000000000..2e359ee2a --- /dev/null +++ b/tests/sqlite-index-state-fail-closed.js @@ -0,0 +1,90 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, getRepoCacheRoot, loadUserConfig, resolveIndexRoot } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-index-state-fail'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const run = (args, label) => { + const result = 
spawnSync(process.execPath, args, { cwd: repoRoot, env, stdio: 'inherit' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 1); + } +}; + +run([ + path.join(root, 'build_index.js'), + '--stub-embeddings', + '--mode', + 'code', + '--repo', + repoRoot +], 'build index'); + +const userConfig = loadUserConfig(repoRoot); +const indexRoot = resolveIndexRoot(repoRoot, userConfig); +const codeDir = getIndexDir(repoRoot, 'code', userConfig, { indexRoot }); +const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const statePath = path.join(codeDir, 'index_state.json'); +if (!fs.existsSync(statePath)) { + console.error('Expected index_state.json after initial build.'); + process.exit(1); +} + +const chunkMetaJson = path.join(codeDir, 'chunk_meta.json'); +const chunkMetaJsonl = path.join(codeDir, 'chunk_meta.jsonl'); +const chunkMetaMeta = path.join(codeDir, 'chunk_meta.meta.json'); +const chunkMetaParts = path.join(codeDir, 'chunk_meta.parts'); +await fsPromises.rm(chunkMetaJson, { force: true }); +await fsPromises.rm(chunkMetaJsonl, { force: true }); +await fsPromises.rm(chunkMetaMeta, { force: true }); +await fsPromises.rm(chunkMetaParts, { recursive: true, force: true }); +const manifestPath = path.join(repoCacheRoot, 'incremental', 'code', 'manifest.json'); +await fsPromises.rm(manifestPath, { force: true }); + +const sqliteBuild = spawnSync( + process.execPath, + [ + path.join(root, 'tools', 'build-sqlite-index.js'), + '--mode', + 'code', + '--repo', + repoRoot + ], + { cwd: repoRoot, env, encoding: 'utf8' } +); +if (sqliteBuild.status === 0) { + console.error('Expected build-sqlite-index to fail with missing artifacts.'); + process.exit(1); +} + +const state = JSON.parse(fs.readFileSync(statePath, 'utf8')); +if (!state?.sqlite) { + console.error('index_state.json missing sqlite section after failure.'); + process.exit(1); +} +if (state.sqlite.pending !== true || state.sqlite.ready !== false) { + console.error(`Expected sqlite pending=true and ready=false, got pending=${state.sqlite.pending} ready=${state.sqlite.ready}`); + process.exit(1); +} + +console.log('sqlite index state fail-closed test passed'); diff --git a/tests/sqlite-missing-dep.js b/tests/sqlite-missing-dep.js new file mode 100644 index 000000000..a4c63af31 --- /dev/null +++ b/tests/sqlite-missing-dep.js @@ -0,0 +1,92 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-missing-dep'); +const cacheRoot = path.join(tempRoot, '.cache'); +const searchPath = path.join(root, 'search.js'); +const buildIndexPath = path.join(root, 'build_index.js'); +const buildSqlitePath = path.join(root, 'tools', 'build-sqlite-index.js'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); + +const sampleCode = ` +export function greet(name) { + return "hello " + name; +} +`; +await fsPromises.writeFile(path.join(tempRoot, 'sample.js'), sampleCode); + +const config = { + sqlite: { use: true }, + search: { sqliteAutoChunkThreshold: 1, annDefault: false } +}; +await fsPromises.writeFile( + path.join(tempRoot, '.pairofcleats.json'), + JSON.stringify(config, null, 2) +); + +const envBase = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const run = (args, label, envOverride = {}) => { + const 
result = spawnSync(process.execPath, args, { + cwd: tempRoot, + env: { ...envBase, ...envOverride }, + encoding: 'utf8' + }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 1); + } + return result.stdout || ''; +}; + +run([buildIndexPath, '--stub-embeddings', '--repo', tempRoot], 'build index'); +run([buildSqlitePath, '--repo', tempRoot], 'build sqlite'); + +const autoOutput = run( + [searchPath, 'greet', '--json', '--repo', tempRoot], + 'search auto with sqlite disabled', + { PAIROFCLEATS_SQLITE_DISABLED: '1' } +); +let autoBackend = null; +try { + autoBackend = JSON.parse(autoOutput).backend; +} catch { + console.error('Failed to parse JSON output for auto sqlite fallback.'); + process.exit(1); +} +if (autoBackend !== 'memory') { + console.error(`Expected memory backend with sqlite disabled, got ${autoBackend}`); + process.exit(1); +} + +const forcedResult = spawnSync( + process.execPath, + [searchPath, 'greet', '--json', '--backend', 'sqlite', '--repo', tempRoot], + { + cwd: tempRoot, + env: { ...envBase, PAIROFCLEATS_SQLITE_DISABLED: '1' }, + encoding: 'utf8' + } +); +if (forcedResult.status === 0) { + console.error('Expected forced sqlite search to fail when sqlite is disabled.'); + process.exit(1); +} +const forcedStderr = forcedResult.stderr || ''; +if (!forcedStderr.includes('better-sqlite3 is required')) { + console.error('Expected missing dependency message for forced sqlite backend.'); + if (forcedStderr) console.error(forcedStderr.trim()); + process.exit(1); +} + +console.log('SQLite missing dependency test passed'); diff --git a/tests/sqlite-sidecar-cleanup.js b/tests/sqlite-sidecar-cleanup.js new file mode 100644 index 000000000..1ba5b8780 --- /dev/null +++ b/tests/sqlite-sidecar-cleanup.js @@ -0,0 +1,57 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { loadUserConfig, resolveSqlitePaths } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const tempRoot = path.join(root, 'tests', '.cache', 'sqlite-sidecar-cleanup'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(tempRoot, { recursive: true }); +await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const run = (args, label) => { + const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, stdio: 'inherit' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 
1); + } +}; + +run([path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], 'build index'); +run([path.join(root, 'tools', 'build-sqlite-index.js'), '--mode', 'code', '--repo', repoRoot], 'build sqlite'); + +const userConfig = loadUserConfig(repoRoot); +const sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); +const walPath = `${sqlitePaths.codePath}-wal`; +const shmPath = `${sqlitePaths.codePath}-shm`; +await fsPromises.writeFile(walPath, 'stale-wal'); +await fsPromises.writeFile(shmPath, 'stale-shm'); + +run([path.join(root, 'tools', 'build-sqlite-index.js'), '--mode', 'code', '--repo', repoRoot], 'rebuild sqlite'); + +const staleWal = fs.existsSync(walPath) ? fs.readFileSync(walPath) : null; +const staleShm = fs.existsSync(shmPath) ? fs.readFileSync(shmPath) : null; +if (staleWal && staleWal.toString('utf8') === 'stale-wal') { + console.error('Stale WAL sidecar was not cleaned up.'); + process.exit(1); +} +if (staleShm && staleShm.toString('utf8') === 'stale-shm') { + console.error('Stale SHM sidecar was not cleaned up.'); + process.exit(1); +} + +console.log('sqlite sidecar cleanup test passed'); diff --git a/tests/sqlite-vec-candidate-set.js b/tests/sqlite-vec-candidate-set.js new file mode 100644 index 000000000..073270638 --- /dev/null +++ b/tests/sqlite-vec-candidate-set.js @@ -0,0 +1,53 @@ +#!/usr/bin/env node +import assert from 'node:assert'; +import { queryVectorAnn } from '../tools/vector-extension.js'; + +const config = { + enabled: true, + table: 'dense_vectors_ann', + column: 'embedding', + encoding: 'float32' +}; + +let currentRows = []; +let lastSql = null; +let lastParams = null; + +const db = { + prepare: (sql) => { + lastSql = sql; + return { + all: (...params) => { + lastParams = params; + return currentRows; + } + }; + } +}; + +currentRows = [ + { rowid: 2, distance: 0.5 }, + { rowid: 3, distance: 0.1 }, + { rowid: 1, distance: 0.1 } +]; +const smallCandidates = new Set([1, 2, 3]); +const smallHits = queryVectorAnn(db, config, [0, 1], 2, smallCandidates); +assert.ok(lastSql.includes('rowid IN'), 'expected candidate pushdown for small set'); +assert.ok(lastSql.includes('ORDER BY distance'), 'expected distance ordering'); +assert.equal(smallHits[0].idx, 1, 'expected rowid tie-break on distance'); +assert.equal(smallHits[1].idx, 3, 'expected rowid tie-break on distance'); + +const largeCandidates = new Set(Array.from({ length: 901 }, (_, i) => i)); +currentRows = [ + { rowid: 2000, distance: 0.05 }, + { rowid: 10, distance: 0.1 } +]; +lastSql = null; +lastParams = null; +const largeHits = queryVectorAnn(db, config, [0, 1], 2, largeCandidates); +assert.ok(!lastSql.includes('rowid IN'), 'expected fallback query for large set'); +assert.equal(largeHits.length, 1, 'expected candidate filtering for large set'); +assert.equal(largeHits[0].idx, 10, 'expected candidate filtering for large set'); +assert.ok(Array.isArray(lastParams), 'expected SQL parameters for ANN query'); + +console.log('sqlite vec candidate set test passed'); diff --git a/tests/structural-filters.js b/tests/structural-filters.js new file mode 100644 index 000000000..962c2a4e5 --- /dev/null +++ b/tests/structural-filters.js @@ -0,0 +1,76 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, getRepoCacheRoot, loadUserConfig } from '../tools/dict-utils.js'; +import { loadChunkMeta, readJsonFile } from 
'../src/shared/artifact-io.js'; +import { filterChunks } from '../src/retrieval/output.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'structural-filters'); +const repoRoot = path.join(tempRoot, 'repo'); +const srcDir = path.join(repoRoot, 'src'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(srcDir, { recursive: true }); +await fsPromises.writeFile(path.join(srcDir, 'example.js'), 'eval("x");\n', 'utf8'); + +const userConfig = loadUserConfig(repoRoot); +const cacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const structuralDir = path.join(cacheRoot, 'structural'); +await fsPromises.mkdir(structuralDir, { recursive: true }); +const match = { + engine: 'semgrep', + pack: 'test-pack', + ruleId: 'no-eval', + tags: ['security'], + path: 'src/example.js', + startLine: 1, + endLine: 1, + snippet: 'eval("x")' +}; +await fsPromises.writeFile( + path.join(structuralDir, 'structural.jsonl'), + `${JSON.stringify(match)}\n`, + 'utf8' +); + +const buildResult = spawnSync(process.execPath, [ + path.join(root, 'build_index.js'), + '--stub-embeddings', + '--repo', + repoRoot +], { encoding: 'utf8' }); +if (buildResult.status !== 0) { + console.error(buildResult.stderr || buildResult.stdout || 'build_index failed'); + process.exit(buildResult.status ?? 1); +} + +const indexDir = getIndexDir(repoRoot, 'code', userConfig); +const chunkMeta = loadChunkMeta(indexDir); +const fileMeta = readJsonFile(path.join(indexDir, 'file_meta.json')); +const fileMetaById = new Map( + Array.isArray(fileMeta) ? fileMeta.map((entry) => [entry.id, entry]) : [] +); +for (const chunk of chunkMeta) { + if (!chunk || chunk.file || chunk.fileId == null) continue; + const meta = fileMetaById.get(chunk.fileId); + if (meta?.file) chunk.file = meta.file; +} +const target = chunkMeta.find((chunk) => chunk.file === 'src/example.js'); +assert.ok(target, 'expected example.js chunk to exist'); +assert.ok(Array.isArray(target.docmeta?.structural), 'expected structural metadata on chunk'); +assert.equal(target.docmeta.structural[0]?.pack, 'test-pack'); +assert.equal(target.docmeta.structural[0]?.ruleId, 'no-eval'); + +const packFiltered = filterChunks(chunkMeta, { structPack: 'test-pack' }); +assert.ok(packFiltered.find((chunk) => chunk.file === 'src/example.js'), 'expected struct-pack filter to match'); + +const ruleFiltered = filterChunks(chunkMeta, { structRule: 'no-eval' }); +assert.ok(ruleFiltered.find((chunk) => chunk.file === 'src/example.js'), 'expected struct-rule filter to match'); + +const tagFiltered = filterChunks(chunkMeta, { structTag: 'security' }); +assert.ok(tagFiltered.find((chunk) => chunk.file === 'src/example.js'), 'expected struct-tag filter to match'); + +console.log('structural filters test passed'); diff --git a/tests/structural-search.js b/tests/structural-search.js new file mode 100644 index 000000000..7b89d35d0 --- /dev/null +++ b/tests/structural-search.js @@ -0,0 +1,64 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'structural-search'); +const repoRoot = path.join(tempRoot, 'repo'); +const srcDir = path.join(repoRoot, 'src'); +const docsDir = path.join(repoRoot, 'docs'); +const binRoot = path.join(root, 'tests', 'fixtures', 'structural', 'bin'); + +await 
fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(srcDir, { recursive: true }); +await fsPromises.mkdir(docsDir, { recursive: true }); +await fsPromises.writeFile(path.join(srcDir, 'example.js'), 'eval(\"x\");\n'); +await fsPromises.writeFile(path.join(srcDir, 'example.ts'), 'eval(x);\n'); +await fsPromises.writeFile(path.join(docsDir, 'notes.md'), 'TODO: update\n'); + +for (const binName of ['semgrep', 'sg', 'comby']) { + try { + await fsPromises.chmod(path.join(binRoot, binName), 0o755); + } catch {} +} + +const env = { + ...process.env, + PATH: `${binRoot}${path.delimiter}${process.env.PATH || ''}`, + PAIROFCLEATS_PROFILE: 'full' +}; + +const result = spawnSync( + process.execPath, + [ + path.join(root, 'tools', 'structural-search.js'), + '--repo', repoRoot, + '--pack', 'semgrep-security', + '--pack', 'astgrep-js-safety', + '--pack', 'comby-docs', + '--format', 'json' + ], + { encoding: 'utf8', env } +); + +if (result.status !== 0) { + console.error(result.stderr || result.stdout || 'structural-search failed'); + process.exit(result.status ?? 1); +} + +const payload = JSON.parse(result.stdout || '{}'); +assert.ok(Array.isArray(payload.results), 'expected results array'); +assert.ok(payload.results.length >= 3, 'expected at least 3 results'); + +const engines = new Set(payload.results.map((entry) => entry.engine)); +assert.ok(engines.has('semgrep'), 'expected semgrep result'); +assert.ok(engines.has('ast-grep'), 'expected ast-grep result'); +assert.ok(engines.has('comby'), 'expected comby result'); + +const comby = payload.results.find((entry) => entry.engine === 'comby'); +assert.equal(comby.path, 'docs/notes.md'); + +console.log('structural search test passed'); diff --git a/tests/sublime-pycompile.js b/tests/sublime-pycompile.js new file mode 100644 index 000000000..fca759990 --- /dev/null +++ b/tests/sublime-pycompile.js @@ -0,0 +1,48 @@ +#!/usr/bin/env node +import { spawnSync } from 'node:child_process'; +import fs from 'node:fs'; +import path from 'node:path'; + +const root = process.cwd(); +const pkgDir = path.join(root, 'sublime', 'PairOfCleats'); + +const collectPyFiles = (dir) => { + const out = []; + const stack = [dir]; + while (stack.length) { + const current = stack.pop(); + const entries = fs.readdirSync(current, { withFileTypes: true }); + for (const entry of entries) { + const full = path.join(current, entry.name); + if (entry.isDirectory()) { + stack.push(full); + } else if (entry.isFile() && entry.name.endsWith('.py')) { + out.push(full); + } + } + } + out.sort(); + return out; +}; + +const pyFiles = collectPyFiles(pkgDir); +if (!pyFiles.length) { + console.error('sublime-pycompile: no python files found under', pkgDir); + process.exit(1); +} + +const python = process.env.PYTHON || 'python'; +const result = spawnSync( + python, + ['-m', 'py_compile', ...pyFiles], + { encoding: 'utf8' } +); + +if (result.status !== 0) { + console.error('sublime-pycompile: python -m py_compile failed'); + if (result.stdout) console.error(result.stdout); + if (result.stderr) console.error(result.stderr); + process.exit(result.status || 1); +} + +console.log(`sublime-pycompile: ok (compiled ${pyFiles.length} files)`); diff --git a/tests/sublime/test_api_client.py b/tests/sublime/test_api_client.py new file mode 100644 index 000000000..58a46923a --- /dev/null +++ b/tests/sublime/test_api_client.py @@ -0,0 +1,139 @@ +import importlib +import json +import os +import sys +import tempfile +import threading +import unittest +from http.server import 
BaseHTTPRequestHandler +from socketserver import TCPServer + +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +PACKAGE_ROOT = os.path.join(REPO_ROOT, 'sublime') +if PACKAGE_ROOT not in sys.path: + sys.path.insert(0, PACKAGE_ROOT) + +api_client = importlib.import_module('PairOfCleats.lib.api_client') + + +class _Handler(BaseHTTPRequestHandler): + def do_GET(self): + path = self.path.split('?', 1)[0] + query = {} + if '?' in self.path: + try: + from urllib.parse import parse_qs + query = {k: v[0] for k, v in parse_qs(self.path.split('?', 1)[1]).items()} + except Exception: + query = {} + + if path == '/map': + fmt = query.get('format') or 'json' + self.send_response(200) + self.send_header('Access-Control-Allow-Origin', '*') + self.send_header('X-PairofCleats-Map-CacheKey', 'test-cache-key') + if fmt == 'json': + payload = { + 'root': {'path': query.get('repo') or '/repo', 'id': 'repo-id'}, + 'summary': {'counts': {'files': 1, 'members': 1, 'edges': 0}}, + 'warnings': [] + } + body = json.dumps(payload).encode('utf-8') + self.send_header('Content-Type', 'application/json') + self.send_header('Content-Length', str(len(body))) + self.end_headers() + self.wfile.write(body) + return + + if fmt == 'dot': + body = b'digraph G {}\n' + self.send_header('Content-Type', 'text/plain') + self.send_header('Content-Length', str(len(body))) + self.end_headers() + self.wfile.write(body) + return + + body = b'' + self.send_header('Content-Type', 'text/html') + self.send_header('Content-Length', str(len(body))) + self.end_headers() + self.wfile.write(body) + return + + if path == '/map/nodes': + self.send_response(200) + self.send_header('Access-Control-Allow-Origin', '*') + payload = { + 'generatedAt': 'now', + 'root': query.get('repo') or '/repo', + 'nodes': [{'id': 'n1', 'label': 'node 1', 'file': 'src/a.js'}] + } + body = json.dumps(payload).encode('utf-8') + self.send_header('Content-Type', 'application/json') + self.send_header('Content-Length', str(len(body))) + self.end_headers() + self.wfile.write(body) + return + + self.send_response(404) + self.end_headers() + + def log_message(self, _format, *_args): + return + + +class ApiClientTests(unittest.TestCase): + def test_generate_map_report_writes_artifacts(self): + server = TCPServer(('127.0.0.1', 0), _Handler) + port = server.server_address[1] + + thread = threading.Thread(target=server.serve_forever) + thread.daemon = True + thread.start() + + try: + with tempfile.TemporaryDirectory() as tmp: + output_path = os.path.join(tmp, 'out.dot') + model_path = os.path.join(tmp, 'model.json') + nodes_path = os.path.join(tmp, 'nodes.json') + + settings = { + 'api_timeout_ms': 2000, + 'map_index_mode': 'code', + 'map_collapse_default': 'none' + } + + report = api_client.generate_map_report( + 'http://127.0.0.1:{0}'.format(port), + '/repo', + settings, + 'repo', + '', + 'imports', + 'dot', + output_path, + model_path, + nodes_path + ) + + self.assertTrue(report.get('ok')) + self.assertEqual(report.get('format'), 'dot') + self.assertEqual(report.get('cacheKey'), 'test-cache-key') + + self.assertTrue(os.path.exists(output_path)) + self.assertTrue(os.path.exists(model_path)) + self.assertTrue(os.path.exists(nodes_path)) + + finally: + try: + server.shutdown() + except Exception: + pass + try: + server.server_close() + except Exception: + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/sublime/test_plugin.py b/tests/sublime/test_plugin.py new file mode 100644 index 000000000..d736657c5 --- /dev/null +++ 
b/tests/sublime/test_plugin.py @@ -0,0 +1,294 @@ +import importlib +import os +import sys +import tempfile +import types +import unittest + +BASE_SETTINGS = {} + + +def install_sublime_stubs(): + sublime = types.ModuleType('sublime') + + class DummySettings(object): + def __init__(self, values): + self._values = values + + def get(self, key, default=None): + return self._values.get(key, default) + + def load_settings(_name): + return DummySettings(BASE_SETTINGS) + + sublime.load_settings = load_settings + sublime.set_timeout = lambda fn, _delay=0: fn() + sublime.error_message = lambda _message: None + sublime.status_message = lambda _message: None + sublime.active_window = lambda: None + sublime.ENCODED_POSITION = 1 + + class Region(object): + def __init__(self, a, b): + self.a = a + self.b = b + + sublime.Region = Region + + sublime_plugin = types.ModuleType('sublime_plugin') + + class WindowCommand(object): + def __init__(self, window=None): + self.window = window + + sublime_plugin.WindowCommand = WindowCommand + + class TextCommand(object): + def __init__(self, view=None): + self.view = view + + sublime_plugin.TextCommand = TextCommand + + sys.modules['sublime'] = sublime + sys.modules['sublime_plugin'] = sublime_plugin + + +install_sublime_stubs() + +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +PACKAGE_ROOT = os.path.join(REPO_ROOT, 'sublime') +if PACKAGE_ROOT not in sys.path: + sys.path.insert(0, PACKAGE_ROOT) + +config = importlib.import_module('PairOfCleats.lib.config') +index_state = importlib.import_module('PairOfCleats.lib.index_state') +indexing = importlib.import_module('PairOfCleats.lib.indexing') +map_lib = importlib.import_module('PairOfCleats.lib.map') +map_state = importlib.import_module('PairOfCleats.lib.map_state') +paths = importlib.import_module('PairOfCleats.lib.paths') +search = importlib.import_module('PairOfCleats.lib.search') +results = importlib.import_module('PairOfCleats.lib.results') +watch = importlib.import_module('PairOfCleats.lib.watch') + + +class MockView(object): + def __init__(self, filename=None): + self._filename = filename + + def file_name(self): + return self._filename + + +class MockWindow(object): + def __init__(self, project_data=None, folders=None, view=None): + self._project_data = project_data or {} + self._folders = folders or [] + self._view = view + + def project_data(self): + return self._project_data + + def set_project_data(self, data): + self._project_data = data + + def folders(self): + return list(self._folders) + + def active_view(self): + return self._view + + def id(self): + return 1 + + +class DummyProcess(object): + def __init__(self, running=True): + self._running = running + + def poll(self): + return None if self._running else 0 + + +class DummyHandle(object): + def __init__(self, process): + self.process = process + self.cancelled = False + + def cancel(self): + self.cancelled = True + + +class SublimePluginTests(unittest.TestCase): + def setUp(self): + BASE_SETTINGS.clear() + BASE_SETTINGS.update(config.DEFAULT_SETTINGS) + + def test_find_repo_root_prefers_pairofcleats_json(self): + with tempfile.TemporaryDirectory() as root: + os.makedirs(os.path.join(root, 'src')) + open(os.path.join(root, '.pairofcleats.json'), 'w').close() + target = os.path.join(root, 'src', 'file.txt') + open(target, 'w').close() + + resolved = paths.find_repo_root(target) + self.assertEqual(resolved, root) + + def test_find_repo_root_git_fallback(self): + with tempfile.TemporaryDirectory() as root: + 
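+ # A bare .git directory (with no .pairofcleats.json marker) should still resolve to the repo root.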
os.makedirs(os.path.join(root, 'src')) + git_dir = os.path.join(root, '.git') + os.makedirs(git_dir) + target = os.path.join(root, 'src', 'file.txt') + open(target, 'w').close() + + resolved = paths.find_repo_root(target) + self.assertEqual(resolved, root) + + def test_resolve_cli_prefers_configured_path(self): + with tempfile.TemporaryDirectory() as root: + os.makedirs(os.path.join(root, 'bin')) + cli_path = os.path.join(root, 'bin', 'pairofcleats.js') + open(cli_path, 'w').close() + settings = dict(config.DEFAULT_SETTINGS) + settings['pairofcleats_path'] = 'bin/pairofcleats.js' + settings['node_path'] = '/usr/bin/node' + + resolved = paths.resolve_cli(settings, root) + self.assertEqual(resolved['command'], '/usr/bin/node') + self.assertEqual(resolved['args_prefix'], [cli_path]) + self.assertEqual(resolved['source'], 'settings') + + def test_resolve_cli_local_bin(self): + with tempfile.TemporaryDirectory() as root: + bin_dir = os.path.join(root, 'node_modules', '.bin') + os.makedirs(bin_dir) + local_cli = os.path.join(bin_dir, 'pairofcleats.cmd') + open(local_cli, 'w').close() + settings = dict(config.DEFAULT_SETTINGS) + + resolved = paths.resolve_cli(settings, root) + self.assertEqual(resolved['command'], local_cli) + self.assertEqual(resolved['args_prefix'], []) + self.assertEqual(resolved['source'], 'node_modules') + + def test_settings_merge_project_overrides(self): + BASE_SETTINGS['open_results_in'] = 'quick_panel' + BASE_SETTINGS['env'] = {'PAIROFCLEATS_CACHE_ROOT': 'A'} + project_data = { + 'settings': { + 'pairofcleats': { + 'open_results_in': 'output_panel', + 'env': { + 'PAIROFCLEATS_CACHE_ROOT': 'B' + } + } + } + } + window = MockWindow(project_data=project_data) + settings = config.get_settings(window) + + self.assertEqual(settings['open_results_in'], 'output_panel') + self.assertEqual(settings['env']['PAIROFCLEATS_CACHE_ROOT'], 'B') + + def test_validate_settings_reports_invalid_values(self): + settings = dict(config.DEFAULT_SETTINGS) + settings['index_mode_default'] = 'invalid' + settings['open_results_in'] = 'nowhere' + errors = config.validate_settings(settings) + self.assertTrue(errors) + + def test_build_search_args(self): + args = search.build_search_args( + 'alpha', + repo_root='/repo', + mode='code', + backend='memory', + limit=5, + explain=True + ) + self.assertIn('--json', args) + self.assertIn('--mode', args) + self.assertIn('--backend', args) + self.assertIn('--top', args) + self.assertIn('--explain', args) + self.assertIn('/repo', args) + + def test_map_output_dir_default(self): + with tempfile.TemporaryDirectory() as root: + settings = dict(config.DEFAULT_SETTINGS) + output_dir = map_lib.resolve_output_dir(root, settings) + expected = os.path.join(root, '.pairofcleats', 'maps') + self.assertEqual(output_dir, expected) + + def test_build_map_args(self): + settings = dict(config.DEFAULT_SETTINGS) + args = map_lib.build_map_args( + '/repo', + settings, + 'file', + 'src/app.js', + 'calls', + 'dot', + '/out.dot', + '/out.model.json', + '/out.nodes.json' + ) + self.assertIn('report', args) + self.assertIn('map', args) + self.assertIn('--scope', args) + self.assertIn('file', args) + self.assertIn('--include', args) + self.assertIn('calls', args) + + def test_record_last_map(self): + window = MockWindow() + payload = {'outPath': '/tmp/map.dot', 'format': 'dot'} + map_state.record_last_map(window, payload) + stored = map_state.get_last_map(window) + self.assertEqual(stored.get('format'), 'dot') + + def test_collect_hits_tolerates_partial_payload(self): + payload = { + 
'code': [{'file': 'src/a.py'}], + 'prose': None, + 'records': 'bad', + 'extractedProse': [{'file': 'docs/readme.md'}] + } + hits = results.collect_hits(payload) + files = [hit.get('file') for hit in hits] + self.assertIn('src/a.py', files) + self.assertIn('docs/readme.md', files) + + def test_record_last_build(self): + window = MockWindow(project_data={}) + state = index_state.record_last_build(window, 'code') + self.assertEqual(state.get('last_mode'), 'code') + stored = index_state.get_last_build(window) + self.assertEqual(stored.get('last_mode'), 'code') + + def test_build_index_args(self): + args = indexing.build_index_args('code', repo_root='/repo') + self.assertEqual(args[0:2], ['index', 'build']) + self.assertIn('--mode', args) + self.assertIn('--repo', args) + + def test_resolve_watch_root_folder_scope(self): + settings = dict(config.DEFAULT_SETTINGS) + settings['index_watch_scope'] = 'folder' + window = MockWindow(folders=['/workspace/sub']) + resolved = paths.resolve_watch_root(window, settings) + self.assertEqual(resolved, '/workspace/sub') + + def test_watch_gating(self): + window = MockWindow() + process = DummyProcess(running=True) + handle = DummyHandle(process) + watch.register(window, handle, '/repo') + self.assertTrue(watch.is_running(window)) + stopped = watch.stop(window) + self.assertTrue(stopped) + self.assertTrue(handle.cancelled) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/subprocess-quoting.js b/tests/subprocess-quoting.js new file mode 100644 index 000000000..ba722b7c6 --- /dev/null +++ b/tests/subprocess-quoting.js @@ -0,0 +1,107 @@ +#!/usr/bin/env node +import http from 'node:http'; +import os from 'node:os'; +import path from 'node:path'; +import readline from 'node:readline'; +import fsPromises from 'node:fs/promises'; +import { spawn, spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const fixtureRoot = path.join(root, 'tests', 'fixtures', 'sample'); +const cacheRoot = path.join(root, 'tests', '.cache', 'subprocess-quoting'); +const serverPath = path.join(root, 'tools', 'api-server.js'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +// Create a repo path containing spaces to catch quoting/arg-parsing bugs. 
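+// Both the mkdtemp parent below and the copied repo directory contain literal spaces,
+// so the build and api-server child processes spawned later would mis-resolve the repo
+// path if arguments were joined or re-parsed without proper quoting.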
+const repoParent = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'pairofcleats repo with spaces ')); +const repoPath = path.join(repoParent, 'sample repo'); +await fsPromises.cp(fixtureRoot, repoPath, { recursive: true }); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const build = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoPath], + { env, stdio: 'inherit' } +); +if (build.status !== 0) { + console.error('subprocess-quoting test failed: build_index failed'); + process.exit(1); +} + +const server = spawn( + process.execPath, + [serverPath, '--repo', repoPath, '--host', '127.0.0.1', '--port', '0', '--json'], + { env, stdio: ['ignore', 'pipe', 'pipe'] } +); + +let stderr = ''; +server.stderr.on('data', (chunk) => { + stderr += chunk.toString(); +}); + +const rl = readline.createInterface({ input: server.stdout }); +const readStartup = () => new Promise((resolve, reject) => { + const timeout = setTimeout(() => reject(new Error('timeout waiting for api-server startup')), 15000); + rl.once('line', (line) => { + clearTimeout(timeout); + resolve(line); + }); +}); + +const requestJson = (baseUrl, pathname) => new Promise((resolve, reject) => { + const req = http.get(baseUrl + pathname, (res) => { + let data = ''; + res.on('data', (chunk) => { + data += chunk.toString(); + }); + res.on('end', () => { + try { + resolve({ status: res.statusCode || 0, body: JSON.parse(data || '{}') }); + } catch (err) { + reject(err); + } + }); + }); + req.on('error', reject); +}); + +let serverInfo = null; +try { + const line = await readStartup(); + serverInfo = JSON.parse(line || '{}'); + if (!serverInfo?.baseUrl) { + throw new Error('api-server did not report a baseUrl'); + } + + const health = await requestJson(serverInfo.baseUrl, '/health'); + if (!health.body?.ok) { + throw new Error('api-server /health failed'); + } + + const map = await requestJson(serverInfo.baseUrl, '/map?format=json'); + if (!map.body?.root?.path) { + throw new Error('api-server /map did not return a map model'); + } +} catch (err) { + console.error(err?.message || err); + if (stderr.trim()) { + console.error(stderr.trim()); + } + server.kill('SIGKILL'); + process.exit(1); +} finally { + try { + server.kill('SIGKILL'); + } catch (e) { + // ignore + } +} + +console.log('subprocess-quoting: ok'); diff --git a/tests/summary-report.js b/tests/summary-report.js index 920997b27..dc0452f8d 100644 --- a/tests/summary-report.js +++ b/tests/summary-report.js @@ -18,15 +18,22 @@ await fsPromises.cp(fixtureRoot, repoRoot, { recursive: true }); const env = { ...process.env, PAIROFCLEATS_CACHE_ROOT: cacheRoot, - PAIROFCLEATS_EMBEDDINGS: 'stub' + PAIROFCLEATS_EMBEDDINGS: 'stub', + PAIROFCLEATS_PROFILE: 'full' }; const result = spawnSync( process.execPath, [ path.join(root, 'tools', 'combined-summary.js'), + '--repo', + repoRoot, '--models', 'Xenova/all-MiniLM-L12-v2,Xenova/all-MiniLM-L6-v2', + '--limit', + '5', + '--top', + '3', '--no-ann', '--out', outPath diff --git a/tests/thread-limits.js b/tests/thread-limits.js new file mode 100644 index 000000000..e8dc72e97 --- /dev/null +++ b/tests/thread-limits.js @@ -0,0 +1,37 @@ +#!/usr/bin/env node +import { resolveThreadLimits } from '../src/shared/threads.js'; +import { planShardBatches } from '../src/index/build/shards.js'; + +const argv = { threads: 4 }; +const rawArgv = ['--threads', '4']; +const envConfig = {}; +const limits = resolveThreadLimits({ argv, rawArgv, envConfig, 
configConcurrency: null, importConcurrencyConfig: null }); + +if (limits.fileConcurrency !== 4) { + console.error(`thread limits test failed: fileConcurrency ${limits.fileConcurrency} !== 4`); + process.exit(1); +} +if (limits.cpuConcurrency !== limits.fileConcurrency) { + console.error('thread limits test failed: cpuConcurrency not equal fileConcurrency'); + process.exit(1); +} + +const items = [ + { id: 'a', weight: 8 }, + { id: 'b', weight: 7 }, + { id: 'c', weight: 6 }, + { id: 'd', weight: 5 } +]; +const batches = planShardBatches(items, 2, { resolveWeight: (item) => item.weight }); +if (batches.length !== 2) { + console.error(`thread limits test failed: expected 2 batches, got ${batches.length}`); + process.exit(1); +} +const sums = batches.map((batch) => batch.reduce((sum, item) => sum + item.weight, 0)); +const sorted = sums.slice().sort((a, b) => b - a); +if (sorted[0] !== 13 || sorted[1] !== 13) { + console.error(`thread limits test failed: batch sums ${sorted.join(',')} expected 13,13`); + process.exit(1); +} + +console.log('thread limits test passed'); diff --git a/tests/tokenization-buffering.js b/tests/tokenization-buffering.js new file mode 100644 index 000000000..bf0d74285 --- /dev/null +++ b/tests/tokenization-buffering.js @@ -0,0 +1,50 @@ +#!/usr/bin/env node +import { + createTokenizationBuffers, + createTokenizationContext, + tokenizeChunkText +} from '../src/index/build/tokenization.js'; + +const context = createTokenizationContext({ + dictWords: new Set(['alpha', 'beta']), + dictConfig: {}, + postingsConfig: {} +}); + +const input = { + text: 'function alphaBeta() { return alpha + beta; }', + mode: 'code', + ext: '.js', + context +}; + +const baseline = tokenizeChunkText(input); +const buffers = createTokenizationBuffers(); +const buffered = tokenizeChunkText({ ...input, buffers }); +const mutated = tokenizeChunkText({ + ...input, + text: 'const gamma = alpha + beta;', + buffers +}); +const bufferedAgain = tokenizeChunkText({ ...input, buffers }); + +const compare = (label, a, b) => { + if (JSON.stringify(a) !== JSON.stringify(b)) { + console.error(`Tokenization mismatch for ${label}`); + process.exit(1); + } +}; + +compare('tokens', baseline.tokens, buffered.tokens); +compare('seq', baseline.seq, buffered.seq); +compare('ngrams', baseline.ngrams, buffered.ngrams); +compare('chargrams', baseline.chargrams, buffered.chargrams); +compare('minhash', baseline.minhashSig, buffered.minhashSig); +if (JSON.stringify(mutated.tokens) === JSON.stringify(baseline.tokens)) { + console.error('Expected buffer reuse to handle different content.'); + process.exit(1); +} +compare('tokens (reuse)', baseline.tokens, bufferedAgain.tokens); +compare('minhash (reuse)', baseline.minhashSig, bufferedAgain.minhashSig); + +console.log('tokenization buffering test passed'); diff --git a/tests/tokenize-dictionary.js b/tests/tokenize-dictionary.js new file mode 100644 index 000000000..0c6f6c2af --- /dev/null +++ b/tests/tokenize-dictionary.js @@ -0,0 +1,24 @@ +#!/usr/bin/env node +import { splitWordsWithDict } from '../src/shared/tokenize.js'; + +const dict = new Set(['alpha', 'beta']); +const unknown = splitWordsWithDict('alphazzzbeta', dict, { segmentation: 'greedy' }); +if (unknown.join('|') !== 'alpha|zzz|beta') { + console.error(`Unexpected unknown span split: ${unknown.join('|')}`); + process.exit(1); +} + +const dpDict = new Set(['abc', 'ab', 'cd']); +const autoSegments = splitWordsWithDict('abcd', dpDict, { segmentation: 'auto', dpMaxTokenLength: 8 }); +if (autoSegments.join('|') !== 
'ab|cd') { + console.error(`Unexpected DP fallback split: ${autoSegments.join('|')}`); + process.exit(1); +} + +const ahoSegments = splitWordsWithDict('alphabeta', dict, { segmentation: 'aho' }); +if (ahoSegments.join('|') !== 'alpha|beta') { + console.error(`Unexpected Aho split: ${ahoSegments.join('|')}`); + process.exit(1); +} + +console.log('dictionary tokenization test passed'); diff --git a/tests/tool-root.js b/tests/tool-root.js new file mode 100644 index 000000000..90f916510 --- /dev/null +++ b/tests/tool-root.js @@ -0,0 +1,65 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'tool-root'); +const repoRoot = path.join(tempRoot, 'repo'); +const outsideRoot = path.join(tempRoot, 'outside'); +const cacheRoot = path.join(tempRoot, 'cache'); +const srcDir = path.join(repoRoot, 'src'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(srcDir, { recursive: true }); +await fsPromises.mkdir(outsideRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(srcDir, 'index.js'), + 'export function greet(name) {\n return `hi ${name}`;\n}\n', + 'utf8' +); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], + { cwd: outsideRoot, env, stdio: 'inherit' } +); +if (buildResult.status !== 0) { + console.error('Failed: build_index from outside repo root'); + process.exit(buildResult.status ?? 1); +} + +const searchResult = spawnSync( + process.execPath, + [path.join(root, 'search.js'), 'greet', '--json', '--no-ann', '--repo', repoRoot], + { cwd: outsideRoot, env, encoding: 'utf8' } +); +if (searchResult.status !== 0) { + console.error('Failed: search from outside repo root'); + console.error(searchResult.stderr || searchResult.stdout || ''); + process.exit(searchResult.status ?? 
1); +} + +let payload = null; +try { + payload = JSON.parse(searchResult.stdout || '{}'); +} catch { + console.error('Failed: search output was not JSON'); + process.exit(1); +} + +const hits = payload.code || []; +if (!hits.length) { + console.error('Failed: search returned no results'); + process.exit(1); +} + +console.log('Tool root outside-repo test passed'); diff --git a/tests/tooling-detect.js b/tests/tooling-detect.js index eed5e9bce..825672617 100644 --- a/tests/tooling-detect.js +++ b/tests/tooling-detect.js @@ -24,7 +24,7 @@ try { } const languages = payload.languages || {}; -const required = ['python', 'rust', 'go', 'java', 'cpp', 'objc']; +const required = ['python', 'rust', 'go', 'java', 'cpp', 'objc', 'swift']; for (const lang of required) { if (!languages[lang]) { console.error(`Missing detected language: ${lang}`); @@ -33,7 +33,7 @@ for (const lang of required) { } const toolIds = (payload.tools || []).map((tool) => tool.id); -const toolRequired = ['clangd', 'gopls', 'rust-analyzer', 'jdtls']; +const toolRequired = ['clangd', 'gopls', 'rust-analyzer', 'jdtls', 'sourcekit-lsp']; for (const tool of toolRequired) { if (!toolIds.includes(tool)) { console.error(`Missing tooling entry: ${tool}`); diff --git a/tests/tooling-lsp.js b/tests/tooling-lsp.js new file mode 100644 index 000000000..b9b76ab4d --- /dev/null +++ b/tests/tooling-lsp.js @@ -0,0 +1,126 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { PassThrough } from 'node:stream'; +import { buildLineIndex } from '../src/shared/lines.js'; +import { createFramedJsonRpcParser, writeFramedJsonRpc } from '../src/shared/jsonrpc.js'; +import { flattenSymbols } from '../src/integrations/tooling/lsp/symbols.js'; +import { rangeToOffsets } from '../src/integrations/tooling/lsp/positions.js'; + +const messages = []; +const errors = []; +const parser = createFramedJsonRpcParser({ + onMessage: (msg) => messages.push(msg), + onError: (err) => errors.push(err) +}); + +const waitFor = async (count) => { + for (let i = 0; i < 50; i += 1) { + if (messages.length >= count) return; + await new Promise((resolve) => setTimeout(resolve, 0)); + } + throw new Error(`Timed out waiting for ${count} messages.`); +}; + +const msgOne = { jsonrpc: '2.0', id: 1, result: 'ok' }; +const msgTwo = { jsonrpc: '2.0', method: 'notify', params: { ok: true } }; + +const frame = (payload) => { + const body = Buffer.from(JSON.stringify(payload), 'utf8'); + const header = `Content-Length: ${body.length}\r\n\r\n`; + return Buffer.concat([Buffer.from(header, 'utf8'), body]); +}; + +const combined = Buffer.concat([frame(msgOne), frame(msgTwo)]); +parser.push(combined.slice(0, 12)); +parser.push(combined.slice(12)); + +await waitFor(2); +assert.equal(errors.length, 0); +assert.equal(messages.length, 2); +assert.deepEqual(messages[0], msgOne); +assert.deepEqual(messages[1], msgTwo); + +const capture = new PassThrough(); +const capturedChunks = []; +capture.on('data', (chunk) => capturedChunks.push(chunk)); +await writeFramedJsonRpc(capture, msgOne); +const parserTwo = createFramedJsonRpcParser({ + onMessage: (msg) => messages.push(msg), + onError: (err) => errors.push(err) +}); +parserTwo.push(Buffer.concat(capturedChunks)); +await waitFor(3); +assert.deepEqual(messages[messages.length - 1], msgOne); + +const largeMessages = []; +const largeErrors = []; +const parserLarge = createFramedJsonRpcParser({ + onMessage: (msg) => largeMessages.push(msg), + onError: (err) => largeErrors.push(err) +}); +const largePayload = { + jsonrpc: '2.0', + id: 99, + 
result: 'x'.repeat(512 * 1024) +}; +const largeFrame = frame(largePayload); +for (let i = 0; i < largeFrame.length; i += 1024) { + parserLarge.push(largeFrame.slice(i, i + 1024)); +} +for (let i = 0; i < 50; i += 1) { + if (largeMessages.length) break; + await new Promise((resolve) => setTimeout(resolve, 0)); +} +assert.equal(largeErrors.length, 0); +assert.equal(largeMessages.length, 1); +assert.equal(largeMessages[0].id, 99); + +const docSymbols = [ + { + name: 'Widget', + kind: 5, + detail: 'class Widget', + range: { start: { line: 0, character: 0 }, end: { line: 4, character: 0 } }, + selectionRange: { start: { line: 0, character: 6 }, end: { line: 0, character: 12 } }, + children: [ + { + name: 'render', + kind: 6, + detail: 'func render()', + range: { start: { line: 1, character: 2 }, end: { line: 2, character: 0 } }, + selectionRange: { start: { line: 1, character: 2 }, end: { line: 1, character: 8 } } + } + ] + } +]; + +const flattenedDoc = flattenSymbols(docSymbols); +assert.equal(flattenedDoc.length, 2); +assert.equal(flattenedDoc[1].fullName, 'Widget.render'); + +const infoSymbols = [ + { + name: 'makeWidget', + kind: 12, + containerName: 'Factory', + location: { + uri: 'file:///tmp/example.swift', + range: { start: { line: 5, character: 0 }, end: { line: 7, character: 0 } } + } + } +]; + +const flattenedInfo = flattenSymbols(infoSymbols); +assert.equal(flattenedInfo.length, 1); +assert.equal(flattenedInfo[0].fullName, 'Factory.makeWidget'); + +const text = 'alpha\nbeta\ngamma'; +const lineIndex = buildLineIndex(text); +const offsets = rangeToOffsets(lineIndex, { + start: { line: 0, character: 1 }, + end: { line: 1, character: 2 } +}); +assert.equal(offsets.start, 1); +assert.equal(offsets.end, lineIndex[1] + 2); + +console.log('tooling LSP utils test passed'); diff --git a/tests/tree-sitter-chunks.js b/tests/tree-sitter-chunks.js new file mode 100644 index 000000000..bf52d007c --- /dev/null +++ b/tests/tree-sitter-chunks.js @@ -0,0 +1,89 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { buildTreeSitterChunks, preloadTreeSitterLanguages } from '../src/lang/tree-sitter.js'; + +const root = path.resolve('tests', 'fixtures', 'tree-sitter'); +const fixtures = [ + { id: 'swift', file: 'swift.swift', languageId: 'swift', expect: ['Widget', 'Widget.greet'] }, + { id: 'kotlin', file: 'kotlin.kt', languageId: 'kotlin', expect: ['Widget', 'Widget.greet'] }, + { id: 'csharp', file: 'csharp.cs', languageId: 'csharp', expect: ['Widget', 'Widget.Greet'] }, + { id: 'clike', file: 'clike.c', ext: '.c', expect: ['Widget', 'greet'] }, + { id: 'cpp', file: 'cpp.cpp', ext: '.cpp', expect: ['Widget', 'Widget.greet'] }, + { id: 'objc', file: 'objc.m', ext: '.m', expect: ['Widget', 'greet'] }, + { id: 'go', file: 'go.go', languageId: 'go', expect: ['Widget', 'Widget.Greet'] }, + { id: 'rust', file: 'rust.rs', languageId: 'rust', expect: ['Widget', 'Widget.greet'] }, + { id: 'java', file: 'java.java', languageId: 'java', expect: ['Widget', 'Widget.greet'] } +]; + +const preloadIds = fixtures + .map((fixture) => fixture.languageId + || (fixture.ext === '.c' ? 'clike' : null) + || (fixture.ext === '.cpp' ? 'cpp' : null) + || (fixture.ext === '.m' ? 
'objc' : null)) + .filter(Boolean); + +await preloadTreeSitterLanguages(preloadIds); + +const options = { treeSitter: { enabled: true }, log: () => {} }; + +const first = fixtures[0]; +const firstText = fs.readFileSync(path.join(root, first.file), 'utf8'); +const firstChunks = buildTreeSitterChunks({ + text: firstText, + languageId: first.languageId, + ext: first.ext, + options +}); + +if (!firstChunks || !firstChunks.length) { + console.log('tree-sitter not available; skipping tree-sitter chunk tests.'); + process.exit(0); +} + +const limitedByBytes = buildTreeSitterChunks({ + text: firstText, + languageId: first.languageId, + ext: first.ext, + options: { treeSitter: { enabled: true, maxBytes: 1 }, log: () => {} } +}); + +if (limitedByBytes !== null) { + throw new Error('expected tree-sitter to skip oversized file by maxBytes'); +} + +const limitedByLines = buildTreeSitterChunks({ + text: firstText, + languageId: first.languageId, + ext: first.ext, + options: { treeSitter: { enabled: true, maxLines: 1 }, log: () => {} } +}); + +if (limitedByLines !== null) { + throw new Error('expected tree-sitter to skip oversized file by maxLines'); +} + +const toNameSet = (chunks) => new Set(chunks.map((c) => c.name)); +const assertHas = (set, expected, label) => { + for (const name of expected) { + if (!set.has(name)) { + throw new Error(`${label} missing expected chunk name: ${name}`); + } + } +}; + +for (const fixture of fixtures) { + const text = fs.readFileSync(path.join(root, fixture.file), 'utf8'); + const chunks = buildTreeSitterChunks({ + text, + languageId: fixture.languageId, + ext: fixture.ext, + options + }) || []; + if (!chunks.length) { + throw new Error(`${fixture.id} tree-sitter chunks not found`); + } + const names = toNameSet(chunks); + assertHas(names, fixture.expect, fixture.id); +} + +console.log('tree-sitter chunk fixtures passed.'); diff --git a/tests/triage-records.js b/tests/triage-records.js index dfa0f07b0..2f5a83bd6 100644 --- a/tests/triage-records.js +++ b/tests/triage-records.js @@ -8,10 +8,25 @@ const root = process.cwd(); const repoRoot = path.join(root, 'tests', 'fixtures', 'sample'); const triageFixtureRoot = path.join(root, 'tests', 'fixtures', 'triage'); const cacheRoot = path.join(root, 'tests', '.cache', 'triage-records'); +const testLogRoot = process.env.PAIROFCLEATS_TEST_LOG_DIR + || process.env.npm_config_test_log_dir + || ''; +const resolvedTestLogRoot = testLogRoot ? 
path.resolve(testLogRoot) : ''; await fsPromises.rm(cacheRoot, { recursive: true, force: true }); await fsPromises.mkdir(cacheRoot, { recursive: true }); +async function writeTestLog(name, payload) { + if (!resolvedTestLogRoot) return; + const outPath = path.join(resolvedTestLogRoot, name); + try { + await fsPromises.mkdir(resolvedTestLogRoot, { recursive: true }); + await fsPromises.writeFile(outPath, JSON.stringify(payload, null, 2)); + } catch (err) { + console.warn(`Failed to write test log ${outPath}: ${err?.message || err}`); + } +} + const env = { ...process.env, PAIROFCLEATS_CACHE_ROOT: cacheRoot, @@ -123,6 +138,8 @@ const recordSearch = runJson('search-records', [ '--repo', repoRoot ], { cwd: repoRoot, env }); +await writeTestLog('triage-record-search.json', recordSearch); + if (!Array.isArray(recordSearch.records) || recordSearch.records.length === 0) { console.error('Record search returned no results.'); process.exit(1); @@ -150,6 +167,10 @@ if (!fs.existsSync(contextOut)) { } const pack = JSON.parse(await fsPromises.readFile(contextOut, 'utf8')); +await writeTestLog('triage-context-pack.json', pack); +await writeTestLog('triage-context-pack-evidence.json', pack.repoEvidence || {}); +await writeTestLog('triage-context-pack-history.json', { history: pack.history || [] }); + if (!pack.recordId || !pack.finding || !pack.repoEvidence) { console.error('Context pack missing required fields.'); process.exit(1); diff --git a/tests/truth-table.js b/tests/truth-table.js new file mode 100644 index 000000000..25a58366c --- /dev/null +++ b/tests/truth-table.js @@ -0,0 +1,81 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; + +const root = process.cwd(); +const tablePath = path.join(root, 'docs', 'truth-table.md'); +let raw = ''; +try { + raw = fs.readFileSync(tablePath, 'utf8'); +} catch (err) { + console.error(`Failed to read truth table at ${tablePath}: ${err?.message || err}`); + process.exit(1); +} + +const lines = raw.split(/\r?\n/); +const claims = []; +let current = null; + +for (let i = 0; i < lines.length; i += 1) { + const line = lines[i]; + const trimmed = line.trim(); + if (trimmed.startsWith('- Claim:')) { + if (current) claims.push(current); + current = { line: i + 1, lines: [line] }; + continue; + } + if (current) { + if (trimmed.startsWith('## ') || trimmed.startsWith('# ')) { + claims.push(current); + current = null; + continue; + } + current.lines.push(line); + } +} +if (current) claims.push(current); + +if (!claims.length) { + console.error('Truth table validation failed: no claims found.'); + process.exit(1); +} + +const requiredLabels = ['Implementation:', 'Config:', 'Tests:', 'Limitations:']; +const issues = []; + +const findLabelLine = (blockLines, label) => { + for (const line of blockLines) { + if (line.includes(label)) return line; + } + return null; +}; + +for (const claim of claims) { + const blockText = claim.lines.join('\n'); + for (const label of requiredLabels) { + const line = findLabelLine(claim.lines, label); + if (!line) { + issues.push(`Claim at line ${claim.line} missing ${label}`); + continue; + } + const content = line.split(label)[1]; + if (!content || !content.trim()) { + issues.push(`Claim at line ${claim.line} has empty ${label}`); + } + } + const testsLine = findLabelLine(claim.lines, 'Tests:'); + if (testsLine && !/tests\//.test(testsLine)) { + issues.push(`Claim at line ${claim.line} Tests line missing tests/ reference`); + } + if (!testsLine && /Tests:/.test(blockText)) { + issues.push(`Claim at line 
${claim.line} has malformed Tests line`); + } +} + +if (issues.length) { + console.error('Truth table validation failed:'); + issues.forEach((issue) => console.error(`- ${issue}`)); + process.exit(1); +} + +console.log(`Truth table validation passed (${claims.length} claims).`); diff --git a/tests/ts-jsx-fixtures.js b/tests/ts-jsx-fixtures.js index f42c4dff8..e811ff8ee 100644 --- a/tests/ts-jsx-fixtures.js +++ b/tests/ts-jsx-fixtures.js @@ -1,7 +1,7 @@ #!/usr/bin/env node import fs from 'node:fs'; import path from 'node:path'; -import { buildJsChunks } from '../src/lang/javascript.js'; +import { buildJsChunks, collectImports } from '../src/lang/javascript.js'; import { buildTypeScriptChunks, collectTypeScriptImports } from '../src/lang/typescript.js'; const root = process.cwd(); @@ -50,4 +50,25 @@ if (!jsxHasApp || !jsxHasButton) { process.exit(1); } -console.log('TS/JSX fixture parsing tests passed'); +const flowText = readFixture('javascript_flow.js'); +const flowChunks = buildJsChunks(flowText, { + ext: '.js', + javascript: { parser: 'babel', flow: 'auto' }, + flowMode: 'auto' +}) || []; +const flowHasGreet = flowChunks.some((chunk) => chunk.name === 'greet'); +if (!flowHasGreet) { + console.error('Expected Flow chunks for greet.'); + process.exit(1); +} +const flowImports = collectImports(flowText, { + ext: '.js', + javascript: { parser: 'babel', flow: 'auto' }, + flowMode: 'auto' +}); +if (!flowImports.includes('flow-parser') || !flowImports.includes('./types')) { + console.error('Missing Flow imports in JS parsing.'); + process.exit(1); +} + +console.log('TS/JSX/Flow fixture parsing tests passed'); diff --git a/tests/two-stage-state.js b/tests/two-stage-state.js new file mode 100644 index 000000000..a951b6620 --- /dev/null +++ b/tests/two-stage-state.js @@ -0,0 +1,114 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, getRepoCacheRoot, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'two-stage-state'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); + +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ + indexing: { + twoStage: { + enabled: true, + stage2: { + embeddings: { enabled: false, mode: 'off' } + } + }, + treeSitter: { enabled: false } + } + }, null, 2) +); +await fsPromises.writeFile(path.join(repoRoot, 'alpha.js'), 'const alpha = 1;\n'); + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub' +}; + +const runBuild = (label, args) => { + const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, stdio: 'inherit' }); + if (result.status !== 0) { + console.error(`Failed: ${label}`); + process.exit(result.status ?? 
1); + } +}; + +runBuild('stage1', [path.join(root, 'build_index.js'), '--stub-embeddings', '--stage', 'stage1', '--repo', repoRoot]); + +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +const userConfig = loadUserConfig(repoRoot); +const resolveStagePaths = () => { + const codeDir = getIndexDir(repoRoot, 'code', userConfig); + return { + codeDir, + statePath: path.join(codeDir, 'index_state.json'), + relationsPath: path.join(codeDir, 'file_relations.json'), + densePath: path.join(codeDir, 'dense_vectors_uint8.json') + }; +}; +let { codeDir, statePath, relationsPath, densePath } = resolveStagePaths(); +if (!fs.existsSync(statePath)) { + console.error('Missing index_state.json after stage1'); + process.exit(1); +} +const stateStage1 = JSON.parse(await fsPromises.readFile(statePath, 'utf8')); +if (stateStage1.stage !== 'stage1' || stateStage1.enrichment?.pending !== true) { + console.error('Expected stage1 index_state to show pending enrichment'); + process.exit(1); +} +if (fs.existsSync(relationsPath)) { + console.error('Did not expect file_relations.json after stage1'); + process.exit(1); +} + +const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const enrichmentPath = path.join(repoCacheRoot, 'enrichment_state.json'); +const enrichmentStage1 = JSON.parse(await fsPromises.readFile(enrichmentPath, 'utf8')); +if (enrichmentStage1.status !== 'pending') { + console.error('Expected enrichment_state pending after stage1'); + process.exit(1); +} + +runBuild('stage2', [path.join(root, 'build_index.js'), '--stub-embeddings', '--stage', 'stage2', '--repo', repoRoot]); + +({ codeDir, statePath, relationsPath, densePath } = resolveStagePaths()); +const stateStage2 = JSON.parse(await fsPromises.readFile(statePath, 'utf8')); +if (stateStage2.stage !== 'stage2' || stateStage2.enrichment?.pending === true) { + console.error('Expected stage2 index_state to clear pending enrichment'); + process.exit(1); +} +if (!fs.existsSync(relationsPath)) { + console.error('Expected file_relations.json after stage2'); + process.exit(1); +} +const enrichmentStage2 = JSON.parse(await fsPromises.readFile(enrichmentPath, 'utf8')); +if (enrichmentStage2.status !== 'done') { + console.error('Expected enrichment_state done after stage2'); + process.exit(1); +} + +runBuild('stage3', [path.join(root, 'build_index.js'), '--stub-embeddings', '--stage', 'stage3', '--repo', repoRoot]); + +({ codeDir, statePath, relationsPath, densePath } = resolveStagePaths()); +const stateStage3 = JSON.parse(await fsPromises.readFile(statePath, 'utf8')); +if (stateStage3.embeddings?.ready !== true) { + console.error('Expected stage3 to mark embeddings ready'); + process.exit(1); +} +if (!fs.existsSync(densePath)) { + console.error('Expected dense_vectors_uint8.json after stage3'); + process.exit(1); +} + +console.log('two-stage state test passed'); diff --git a/tests/type-inference-clangd-provider-no-clangd.js b/tests/type-inference-clangd-provider-no-clangd.js new file mode 100644 index 000000000..23eb36f09 --- /dev/null +++ b/tests/type-inference-clangd-provider-no-clangd.js @@ -0,0 +1,47 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { collectClangdTypes } from '../src/index/tooling/clangd-provider.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'clangd-provider-no-clangd'); +const repoRoot = path.join(tempRoot, 'repo'); +const srcDir = path.join(repoRoot, 'src'); + +await fs.rm(tempRoot, { recursive: true, force: true }); +await 
fs.mkdir(srcDir, { recursive: true }); +await fs.writeFile( + path.join(srcDir, 'sample.c'), + 'int add(int a, int b) { return a + b; }\n' +); + +const chunksByFile = new Map([ + ['src/sample.c', [{ file: 'src/sample.c', name: 'add', start: 0, end: 10, docmeta: {} }]] +]); + +const logs = []; +const log = (msg) => logs.push(String(msg)); + +const result = await collectClangdTypes({ + rootDir: repoRoot, + chunksByFile, + log, + cmd: 'clangd-does-not-exist' +}); + +if (!result || !(result.typesByChunk instanceof Map)) { + console.error('clangd provider did not return a types map.'); + process.exit(1); +} + +if (result.typesByChunk.size !== 0) { + console.error('clangd provider should return empty map when clangd is missing.'); + process.exit(1); +} + +if (!logs.some((entry) => entry.includes('clangd not detected'))) { + console.error('clangd provider missing expected fallback log message.'); + process.exit(1); +} + +console.log('clangd provider fallback test passed'); diff --git a/tests/type-inference-crossfile-go.js b/tests/type-inference-crossfile-go.js index 76e57877f..87b165315 100644 --- a/tests/type-inference-crossfile-go.js +++ b/tests/type-inference-crossfile-go.js @@ -8,6 +8,19 @@ import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; const root = process.cwd(); const tempRoot = path.join(root, 'tests', '.cache', 'type-inference-crossfile-go'); const repoRoot = path.join(tempRoot, 'repo'); +const hasPython = () => { + const candidates = ['python', 'python3']; + for (const candidate of candidates) { + try { + const result = spawnSync(candidate, ['-c', 'import sys; sys.stdout.write("ok")'], { + encoding: 'utf8' + }); + if (result.status === 0 && String(result.stdout || '').trim() === 'ok') return true; + } catch {} + } + return false; +}; +const pythonAvailable = hasPython(); await fsPromises.rm(tempRoot, { recursive: true, force: true }); await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); @@ -98,6 +111,27 @@ public class JavaWidgetBuilder { ` ); +if (pythonAvailable) { + await fsPromises.writeFile( + path.join(repoRoot, 'src', 'py_widget.py'), + `class PyWidget: + def __init__(self): + self.id = 1 + +def make_py_widget() -> PyWidget: + return PyWidget() +` + ); + + await fsPromises.writeFile( + path.join(repoRoot, 'src', 'py_builder.py'), + `from py_widget import make_py_widget, PyWidget + +def build_py_widget() -> PyWidget: + return make_py_widget() +` + ); +} const env = { ...process.env, PAIROFCLEATS_CACHE_ROOT: path.join(tempRoot, 'cache'), @@ -125,9 +159,17 @@ if (!fs.existsSync(chunkMetaPath)) { } const chunkMeta = JSON.parse(fs.readFileSync(chunkMetaPath, 'utf8')); +const fileMetaPath = path.join(codeDir, 'file_meta.json'); +const fileMeta = fs.existsSync(fileMetaPath) + ? JSON.parse(fs.readFileSync(fileMetaPath, 'utf8')) + : []; +const fileById = new Map( + (Array.isArray(fileMeta) ? 
fileMeta : []).map((entry) => [entry.id, entry.file]) +); +const resolveChunkFile = (chunk) => chunk?.file || fileById.get(chunk?.fileId) || null; const buildGo = chunkMeta.find((chunk) => - chunk.file === 'src/builder.go' && + resolveChunkFile(chunk) === 'src/builder.go' && chunk.name === 'BuildGoWidget' ); if (!buildGo) { @@ -142,7 +184,7 @@ if (!inferredGo.some((entry) => entry.type === 'GoWidget' && entry.source === 'f } const buildRust = chunkMeta.find((chunk) => - chunk.file === 'src/lib.rs' && + resolveChunkFile(chunk) === 'src/lib.rs' && chunk.name === 'build_rust_widget' ); if (!buildRust) { @@ -157,7 +199,7 @@ if (!inferredRust.some((entry) => entry.type === 'RustWidget' && entry.source == } const buildJava = chunkMeta.find((chunk) => - chunk.file === 'src/JavaWidgetBuilder.java' && + resolveChunkFile(chunk) === 'src/JavaWidgetBuilder.java' && chunk.name === 'JavaWidgetBuilder.buildWidget' ); if (!buildJava) { @@ -171,4 +213,22 @@ if (!inferredJava.some((entry) => entry.type === 'JavaWidget' && entry.source == process.exit(1); } -console.log('Cross-file inference tests passed (Go/Rust/Java).'); +if (pythonAvailable) { + const buildPy = chunkMeta.find((chunk) => + resolveChunkFile(chunk) === 'src/py_builder.py' && + chunk.name === 'build_py_widget' + ); + if (!buildPy) { + console.error('Missing build_py_widget chunk in py_builder.py.'); + process.exit(1); + } + const inferredPy = buildPy.docmeta?.inferredTypes?.returns || []; + if (!inferredPy.some((entry) => entry.type === 'PyWidget' && entry.source === 'flow')) { + console.error('Python cross-file inference missing return type PyWidget for build_py_widget.'); + process.exit(1); + } +} else { + console.log('Skipping Python cross-file inference (python not available).'); +} + +console.log('Cross-file inference tests passed (Go/Rust/Java/Python).'); diff --git a/tests/type-inference-crossfile.js b/tests/type-inference-crossfile.js index 2c1092ca6..108e94dca 100644 --- a/tests/type-inference-crossfile.js +++ b/tests/type-inference-crossfile.js @@ -4,6 +4,7 @@ import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; +import { applyCrossFileInference } from '../src/index/type-inference-crossfile.js'; const root = process.cwd(); const tempRoot = path.join(root, 'tests', '.cache', 'type-inference-crossfile'); @@ -11,12 +12,224 @@ const repoRoot = path.join(tempRoot, 'repo'); await fsPromises.rm(tempRoot, { recursive: true, force: true }); await fsPromises.mkdir(path.join(repoRoot, 'src'), { recursive: true }); +const graphsFixtureRoot = path.join(root, 'tests', 'fixtures', 'graphs', 'simple'); +const graphsTargetRoot = path.join(repoRoot, 'src', 'graphs'); +await fsPromises.mkdir(graphsTargetRoot, { recursive: true }); +await fsPromises.copyFile( + path.join(graphsFixtureRoot, 'producer.js'), + path.join(graphsTargetRoot, 'producer.js') +); +await fsPromises.copyFile( + path.join(graphsFixtureRoot, 'consumer.js'), + path.join(graphsTargetRoot, 'consumer.js') +); + +const statsRoot = path.join(tempRoot, 'stats'); +await fsPromises.mkdir(statsRoot, { recursive: true }); + +const writeScenarioFile = async (rootDir, relPath, contents) => { + const absPath = path.join(rootDir, relPath); + await fsPromises.mkdir(path.dirname(absPath), { recursive: true }); + await fsPromises.writeFile(absPath, contents); + return absPath; +}; + +const runStatsScenario = async (name, { files, chunks, expect }) => { + const 
scenarioRoot = path.join(statsRoot, name); + await fsPromises.rm(scenarioRoot, { recursive: true, force: true }); + await fsPromises.mkdir(scenarioRoot, { recursive: true }); + for (const [relPath, contents] of Object.entries(files)) { + await writeScenarioFile(scenarioRoot, relPath, contents); + } + const stats = await applyCrossFileInference({ + rootDir: scenarioRoot, + chunks, + enabled: true, + log: () => {}, + useTooling: false, + enableTypeInference: true, + enableRiskCorrelation: true, + fileRelations: null + }); + const entries = [ + ['linkedCalls', stats.linkedCalls, expect.linkedCalls], + ['linkedUsages', stats.linkedUsages, expect.linkedUsages], + ['inferredReturns', stats.inferredReturns, expect.inferredReturns], + ['riskFlows', stats.riskFlows, expect.riskFlows] + ]; + for (const [label, actual, expected] of entries) { + if (actual !== expected) { + console.error( + `Cross-file inference stats mismatch (${name}): ${label}=${actual}, expected ${expected}.` + ); + process.exit(1); + } + } +}; + +const zeroContent = 'export function noop() { const x = 1; }\n'; +await runStatsScenario('zero', { + files: { + 'src/zero.js': zeroContent + }, + chunks: [ + { + file: 'src/zero.js', + name: 'noop', + kind: 'function', + start: 0, + end: zeroContent.length, + docmeta: { returnsValue: false }, + codeRelations: {} + } + ], + expect: { + linkedCalls: 0, + linkedUsages: 0, + inferredReturns: 0, + riskFlows: 0 + } +}); + +const creatorContent = [ + 'export function makeWidget() { return {}; }', + 'export class Widget {}', + '' +].join('\n'); +const oneConsumerContent = 'export function buildWidget() { return makeWidget(); }\n'; +await runStatsScenario('one-each', { + files: { + 'src/creator.js': creatorContent, + 'src/consumer.js': oneConsumerContent + }, + chunks: [ + { + file: 'src/consumer.js', + name: 'buildWidget', + kind: 'function', + start: 0, + end: oneConsumerContent.length, + docmeta: { + returnsValue: true, + risk: { sources: [{ name: 'source', ruleId: 'rule-source', confidence: 0.8 }] } + }, + codeRelations: { + calls: [['buildWidget', 'makeWidget']], + usages: ['Widget'] + } + }, + { + file: 'src/creator.js', + name: 'makeWidget', + kind: 'function', + start: 0, + end: creatorContent.length, + docmeta: { + returnType: 'Widget', + returnsValue: false, + risk: { + sinks: [{ name: 'sink', ruleId: 'rule-sink', category: 'test', severity: 'high', tags: ['taint'] }] + } + }, + codeRelations: {} + }, + { + file: 'src/creator.js', + name: 'Widget', + kind: 'class', + start: 0, + end: creatorContent.length, + docmeta: {}, + codeRelations: {} + } + ], + expect: { + linkedCalls: 1, + linkedUsages: 1, + inferredReturns: 1, + riskFlows: 1 + } +}); + +const secondConsumerContent = 'export function buildWidgetTwo() { return makeWidget(); }\n'; +await runStatsScenario('couple-each', { + files: { + 'src/creator.js': creatorContent, + 'src/consumer-one.js': oneConsumerContent, + 'src/consumer-two.js': secondConsumerContent + }, + chunks: [ + { + file: 'src/consumer-one.js', + name: 'buildWidget', + kind: 'function', + start: 0, + end: oneConsumerContent.length, + docmeta: { + returnsValue: true, + risk: { sources: [{ name: 'source', ruleId: 'rule-source', confidence: 0.8 }] } + }, + codeRelations: { + calls: [['buildWidget', 'makeWidget']], + usages: ['Widget'] + } + }, + { + file: 'src/consumer-two.js', + name: 'buildWidgetTwo', + kind: 'function', + start: 0, + end: secondConsumerContent.length, + docmeta: { + returnsValue: true, + risk: { sources: [{ name: 'source', ruleId: 'rule-source', 
confidence: 0.8 }] } + }, + codeRelations: { + calls: [['buildWidgetTwo', 'makeWidget']], + usages: ['Widget'] + } + }, + { + file: 'src/creator.js', + name: 'makeWidget', + kind: 'function', + start: 0, + end: creatorContent.length, + docmeta: { + returnType: 'Widget', + returnsValue: false, + risk: { + sinks: [{ name: 'sink', ruleId: 'rule-sink', category: 'test', severity: 'high', tags: ['taint'] }] + } + }, + codeRelations: {} + }, + { + file: 'src/creator.js', + name: 'Widget', + kind: 'class', + start: 0, + end: creatorContent.length, + docmeta: {}, + codeRelations: {} + } + ], + expect: { + linkedCalls: 2, + linkedUsages: 2, + inferredReturns: 2, + riskFlows: 2 + } +}); const config = { indexing: { typeInference: true, typeInferenceCrossFile: true }, + tooling: { + autoEnableOnDetect: false + }, sqlite: { use: false } }; await fsPromises.writeFile( @@ -63,9 +276,19 @@ process.env.PAIROFCLEATS_EMBEDDINGS = env.PAIROFCLEATS_EMBEDDINGS; const result = spawnSync(process.execPath, [path.join(root, 'build_index.js'), '--stub-embeddings', '--repo', repoRoot], { cwd: repoRoot, env, + timeout: Number.isFinite(Number(process.env.PAIROFCLEATS_TEST_TIMEOUT_MS)) + ? Math.max(1000, Number(process.env.PAIROFCLEATS_TEST_TIMEOUT_MS)) + : 120000, + killSignal: 'SIGTERM', stdio: 'inherit' }); if (result.status !== 0) { + if (result.signal) { + console.error(`Cross-file inference test failed: build_index terminated by ${result.signal}.`); + } + if (result.error) { + console.error(`Cross-file inference test failed: ${result.error.message || result.error}.`); + } console.error('Cross-file inference test failed: build_index failed.'); process.exit(result.status ?? 1); } @@ -79,8 +302,17 @@ if (!fs.existsSync(chunkMetaPath)) { } const chunkMeta = JSON.parse(fs.readFileSync(chunkMetaPath, 'utf8')); +const fileMetaPath = path.join(codeDir, 'file_meta.json'); +const fileMeta = fs.existsSync(fileMetaPath) + ? JSON.parse(fs.readFileSync(fileMetaPath, 'utf8')) + : []; +const fileById = new Map( + (Array.isArray(fileMeta) ? 
fileMeta : []).map((entry) => [entry.id, entry.file]) +); +const resolveChunkFile = (chunk) => chunk?.file || fileById.get(chunk?.fileId) || null; + const buildWidget = chunkMeta.find((chunk) => - chunk.file === 'src/consumer.js' && + resolveChunkFile(chunk) === 'src/consumer.js' && chunk.name === 'buildWidget' ); if (!buildWidget) { @@ -118,4 +350,25 @@ if (!usageLinks.some((link) => link.target === 'Widget' && link.file === 'src/cr process.exit(1); } +const graphPath = path.join(codeDir, 'graph_relations.json'); +if (!fs.existsSync(graphPath)) { + console.error(`Missing graph relations at ${graphPath}`); + process.exit(1); +} +const graphRelations = JSON.parse(fs.readFileSync(graphPath, 'utf8')); +const findNode = (graph, id) => (graph?.nodes || []).find((node) => node.id === id); +const graphConsumer = 'src/graphs/consumer.js::buildGraphWidget'; +const graphProducerFn = 'src/graphs/producer.js::createGraphWidget'; +const graphProducerType = 'src/graphs/producer.js::GraphWidget'; +const callNode = findNode(graphRelations.callGraph, graphConsumer); +if (!callNode || !Array.isArray(callNode.out) || !callNode.out.includes(graphProducerFn)) { + console.error('Graph relations missing call link for fixture consumer.'); + process.exit(1); +} +const usageNode = findNode(graphRelations.usageGraph, graphConsumer); +if (!usageNode || !Array.isArray(usageNode.out) || !usageNode.out.includes(graphProducerType)) { + console.error('Graph relations missing usage link for fixture consumer.'); + process.exit(1); +} + console.log('Cross-file inference test passed'); diff --git a/tests/type-inference-crossfile/apply.test.js b/tests/type-inference-crossfile/apply.test.js new file mode 100644 index 000000000..b3f0baf92 --- /dev/null +++ b/tests/type-inference-crossfile/apply.test.js @@ -0,0 +1,48 @@ +#!/usr/bin/env node +import { addInferredParam, addInferredReturn, mergeDiagnostics } from '../../src/index/type-inference-crossfile/apply.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const docmeta = {}; +addInferredReturn(docmeta, 'Widget', 'flow', 0.4); +addInferredReturn(docmeta, 'Widget', 'flow', 0.8); +addInferredReturn(docmeta, 'Widget', 'tooling', 0.2); +const returns = docmeta.inferredTypes?.returns || []; +if (returns.length !== 2) { + fail('addInferredReturn should dedupe entries by type/source.'); +} +const flowEntry = returns.find((entry) => entry.source === 'flow'); +if (!flowEntry || flowEntry.confidence !== 0.8) { + fail('addInferredReturn should keep max confidence for repeated entries.'); +} + +const paramMeta = {}; +if (!addInferredParam(paramMeta, 'arg', 'string', 'flow', 0.6)) { + fail('addInferredParam should accept first param type.'); +} +addInferredParam(paramMeta, 'arg', 'string', 'flow', 0.2); +const params = paramMeta.inferredTypes?.params?.arg || []; +if (params.length !== 1) { + fail('addInferredParam should dedupe entries by type/source.'); +} +if (params[0].confidence !== 0.6) { + fail('addInferredParam should keep max confidence for repeated entries.'); +} +if (addInferredParam(paramMeta, 'arg', 'number', 'flow', 0.5, 1)) { + fail('addInferredParam should respect maxCandidates limit.'); +} + +const target = new Map([['a', [{ message: 'one' }]]]); +const incoming = new Map([ + ['a', [{ message: 'two' }]], + ['b', [{ message: 'three' }]] +]); +mergeDiagnostics(target, incoming); +if (target.get('a')?.length !== 2 || target.get('b')?.length !== 1) { + fail('mergeDiagnostics should append incoming diagnostics.'); +} + 
+console.log('type-inference-crossfile apply tests passed'); diff --git a/tests/type-inference-crossfile/extract.test.js b/tests/type-inference-crossfile/extract.test.js new file mode 100644 index 000000000..e10995fc0 --- /dev/null +++ b/tests/type-inference-crossfile/extract.test.js @@ -0,0 +1,77 @@ +#!/usr/bin/env node +import { extractParamTypes, extractReturnCalls, extractReturnTypes, inferArgType } from '../../src/index/type-inference-crossfile/extract.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +const chunk = { + name: 'Widget', + kind: 'class', + docmeta: { + returnType: 'Widget', + returns: ['Widget', 'Gadget'], + inferredTypes: { + returns: [{ type: 'Thing' }, { type: 'Widget', source: 'flow' }], + params: { + b: [{ type: 'number' }, { type: 'number' }], + a: [{ type: 'boolean' }] + } + }, + params: ['a', 'b'], + paramTypes: { a: 'string' } + } +}; + +const returnTypes = extractReturnTypes(chunk); +const returnSet = new Set(returnTypes); +if (!returnSet.has('Widget') || !returnSet.has('Gadget') || !returnSet.has('Thing')) { + fail('extractReturnTypes should collect explicit and inferred return types.'); +} +if (returnTypes.length !== returnSet.size) { + fail('extractReturnTypes should dedupe return types.'); +} + +const { paramNames, paramTypes } = extractParamTypes(chunk); +if (paramNames.join(',') !== 'a,b') { + fail('extractParamTypes should preserve param name order.'); +} +const paramA = new Set(paramTypes.a || []); +const paramB = new Set(paramTypes.b || []); +if (!paramA.has('string') || !paramA.has('boolean')) { + fail('extractParamTypes should merge declared and inferred param types.'); +} +if (!paramB.has('number') || paramB.size !== 1) { + fail('extractParamTypes should dedupe inferred param types.'); +} + +const callText = [ + 'return createWidget();', + 'return await ns.Factory.build();', + 'return new Widget();' +].join('\n'); +const { calls, news } = extractReturnCalls(callText); +if (!calls.has('createWidget') || !calls.has('ns.Factory.build')) { + fail('extractReturnCalls should collect return call targets.'); +} +if (!news.has('Widget') || news.size !== 1) { + fail('extractReturnCalls should collect return new targets.'); +} + +const argChecks = [ + ['123', 'number'], + ['true', 'boolean'], + ['"hello"', 'string'], + ['[1, 2]', 'array'], + ['{ a: 1 }', 'object'], + ['new Gadget()', 'Gadget'], + ['fn(...)', 'function'] +]; +for (const [value, expected] of argChecks) { + if (inferArgType(value) !== expected) { + fail(`inferArgType should infer ${expected} from ${value}.`); + } +} + +console.log('type-inference-crossfile extract tests passed'); diff --git a/tests/type-inference-crossfile/symbols.test.js b/tests/type-inference-crossfile/symbols.test.js new file mode 100644 index 000000000..58408d475 --- /dev/null +++ b/tests/type-inference-crossfile/symbols.test.js @@ -0,0 +1,42 @@ +#!/usr/bin/env node +import { addSymbol, leafName, resolveUniqueSymbol, isTypeDeclaration } from '../../src/index/type-inference-crossfile/symbols.js'; + +const fail = (message) => { + console.error(message); + process.exit(1); +}; + +if (leafName('Alpha::Beta.Gamma') !== 'Gamma') { + fail('leafName should return the last segment.'); +} + +if (!isTypeDeclaration('Class')) { + fail('isTypeDeclaration should match class-like kinds.'); +} + +if (isTypeDeclaration('function')) { + fail('isTypeDeclaration should ignore non-type kinds.'); +} + +const directIndex = new Map(); +const directEntry = { name: 'Widget', file: 'src/widget.js', kind: 'class' }; 
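+// A symbol registered once under its exact name should resolve directly to that entry.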
+addSymbol(directIndex, directEntry.name, directEntry); +if (resolveUniqueSymbol(directIndex, 'Widget') !== directEntry) { + fail('resolveUniqueSymbol should resolve direct unique matches.'); +} + +const leafIndex = new Map(); +const leafEntry = { name: 'Namespace.Widget', file: 'src/ns.js', kind: 'class' }; +addSymbol(leafIndex, 'Widget', leafEntry); +if (resolveUniqueSymbol(leafIndex, 'Namespace.Widget') !== leafEntry) { + fail('resolveUniqueSymbol should resolve unique leaf matches.'); +} + +const dupeIndex = new Map(); +addSymbol(dupeIndex, 'Dup', { name: 'Dup', file: 'src/one.js' }); +addSymbol(dupeIndex, 'Dup', { name: 'Dup', file: 'src/two.js' }); +if (resolveUniqueSymbol(dupeIndex, 'Dup') !== null) { + fail('resolveUniqueSymbol should return null for ambiguous matches.'); +} + +console.log('type-inference-crossfile symbols tests passed'); diff --git a/tests/type-inference-lsp-enrichment.js b/tests/type-inference-lsp-enrichment.js new file mode 100644 index 000000000..ec3adfe50 --- /dev/null +++ b/tests/type-inference-lsp-enrichment.js @@ -0,0 +1,144 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { getIndexDir, loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'lsp-enrichment'); +const repoRoot = path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); +const srcDir = path.join(repoRoot, 'src'); +const binRoot = path.join(root, 'tests', 'fixtures', 'lsp', 'bin'); + +await fsPromises.rm(tempRoot, { recursive: true, force: true }); +await fsPromises.mkdir(srcDir, { recursive: true }); + +const cppSource = 'int add(int a, int b) { return a + b; }\n'; +const swiftSource = 'func greet(name: String, count: Int) -> String { return "hi" }\n'; +const pythonSource = 'def greet(name: str) -> str:\n return "hi"\n'; +await fsPromises.writeFile(path.join(srcDir, 'sample.cpp'), cppSource); +await fsPromises.writeFile(path.join(srcDir, 'sample.swift'), swiftSource); +await fsPromises.writeFile(path.join(srcDir, 'sample.py'), pythonSource); + +const config = { + indexing: { + typeInference: true, + typeInferenceCrossFile: true + }, + sqlite: { + use: false + }, + tooling: { + autoEnableOnDetect: true + } +}; +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify(config, null, 2) +); + +for (const binName of ['clangd', 'sourcekit-lsp', 'pyright-langserver']) { + try { + await fsPromises.chmod(path.join(binRoot, binName), 0o755); + } catch {} +} + +const env = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_EMBEDDINGS: 'stub', + PATH: `${binRoot}${path.delimiter}${process.env.PATH || ''}` +}; +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const buildResult = spawnSync( + process.execPath, + [path.join(root, 'build_index.js'), '--repo', repoRoot, '--stub-embeddings'], + { env, encoding: 'utf8' } +); + +if (buildResult.status !== 0) { + console.error('LSP enrichment test failed: build_index error.'); + if (buildResult.stderr) console.error(buildResult.stderr.trim()); + process.exit(buildResult.status ?? 
1); +} + +const userConfig = loadUserConfig(repoRoot); +const indexDir = getIndexDir(repoRoot, 'code', userConfig); +const metaPath = path.join(indexDir, 'chunk_meta.json'); +if (!fs.existsSync(metaPath)) { + console.error('LSP enrichment test failed: chunk_meta.json missing.'); + process.exit(1); +} + +const chunks = JSON.parse(fs.readFileSync(metaPath, 'utf8')); +const fileMetaPath = path.join(indexDir, 'file_meta.json'); +const fileMeta = fs.existsSync(fileMetaPath) + ? JSON.parse(fs.readFileSync(fileMetaPath, 'utf8')) + : []; +const fileById = new Map( + (Array.isArray(fileMeta) ? fileMeta : []).map((entry) => [entry.id, entry.file]) +); +const resolveChunkFile = (chunk) => chunk?.file || fileById.get(chunk?.fileId) || null; + +const cppChunk = chunks.find((chunk) => resolveChunkFile(chunk) === 'src/sample.cpp' && chunk.name === 'add'); +const swiftChunk = chunks.find((chunk) => resolveChunkFile(chunk) === 'src/sample.swift' && chunk.name === 'greet'); +const pythonChunk = chunks.find((chunk) => resolveChunkFile(chunk) === 'src/sample.py' && chunk.name === 'greet'); + +const hasToolingReturn = (chunk, type) => { + const returns = chunk?.docmeta?.inferredTypes?.returns || []; + return returns.some((entry) => entry?.source === 'tooling' && (!type || entry?.type === type)); +}; +const hasToolingParam = (chunk, name, type) => { + const params = chunk?.docmeta?.inferredTypes?.params || {}; + const entries = params[name] || []; + return entries.some((entry) => entry?.source === 'tooling' && (!type || entry?.type === type)); +}; + +if (!cppChunk) { + console.error('LSP enrichment test failed: missing C++ chunk.'); + process.exit(1); +} +if (!swiftChunk) { + console.error('LSP enrichment test failed: missing Swift chunk.'); + process.exit(1); +} +if (!pythonChunk) { + console.error('LSP enrichment test failed: missing Python chunk.'); + process.exit(1); +} + +if (!hasToolingReturn(cppChunk, 'int')) { + console.error('LSP enrichment test failed: missing tooling return type for C++.'); + process.exit(1); +} +if (!hasToolingParam(cppChunk, 'a', 'int') || !hasToolingParam(cppChunk, 'b', 'int')) { + console.error('LSP enrichment test failed: missing tooling param types for C++.'); + process.exit(1); +} +if (!hasToolingReturn(swiftChunk, 'String')) { + console.error('LSP enrichment test failed: missing tooling return type for Swift.'); + process.exit(1); +} +if (!hasToolingParam(swiftChunk, 'name', 'String') || !hasToolingParam(swiftChunk, 'count', 'Int')) { + console.error('LSP enrichment test failed: missing tooling param types for Swift.'); + process.exit(1); +} +if (!hasToolingReturn(pythonChunk, 'str')) { + console.error('LSP enrichment test failed: missing tooling return type for Python.'); + process.exit(1); +} +if (!hasToolingParam(pythonChunk, 'name', 'str')) { + console.error('LSP enrichment test failed: missing tooling param types for Python.'); + process.exit(1); +} +const pyDiagnostics = pythonChunk.docmeta?.tooling?.diagnostics || []; +if (!pyDiagnostics.some((diag) => diag?.source === 'pyright')) { + console.error('LSP enrichment test failed: missing pyright diagnostics for Python.'); + process.exit(1); +} + +console.log('LSP enrichment test passed'); diff --git a/tests/type-inference-sourcekit-provider-no-sourcekit.js b/tests/type-inference-sourcekit-provider-no-sourcekit.js new file mode 100644 index 000000000..2df087527 --- /dev/null +++ b/tests/type-inference-sourcekit-provider-no-sourcekit.js @@ -0,0 +1,47 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path 
from 'node:path'; +import { collectSourcekitTypes } from '../src/index/tooling/sourcekit-provider.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'sourcekit-provider-no-sourcekit'); +const repoRoot = path.join(tempRoot, 'repo'); +const srcDir = path.join(repoRoot, 'src'); + +await fs.rm(tempRoot, { recursive: true, force: true }); +await fs.mkdir(srcDir, { recursive: true }); +await fs.writeFile( + path.join(srcDir, 'sample.swift'), + 'func greet(name: String) -> String { return "hi \\(name)" }\n' +); + +const chunksByFile = new Map([ + ['src/sample.swift', [{ file: 'src/sample.swift', name: 'greet', start: 0, end: 10, docmeta: {} }]] +]); + +const logs = []; +const log = (msg) => logs.push(String(msg)); + +const result = await collectSourcekitTypes({ + rootDir: repoRoot, + chunksByFile, + log, + cmd: 'sourcekit-lsp-does-not-exist' +}); + +if (!result || !(result.typesByChunk instanceof Map)) { + console.error('sourcekit provider did not return a types map.'); + process.exit(1); +} + +if (result.typesByChunk.size !== 0) { + console.error('sourcekit provider should return empty map when sourcekit-lsp is missing.'); + process.exit(1); +} + +if (!logs.some((entry) => entry.includes('sourcekit-lsp not detected'))) { + console.error('sourcekit provider missing expected fallback log message.'); + process.exit(1); +} + +console.log('sourcekit provider fallback test passed'); diff --git a/tests/type-inference-typescript-provider-no-ts.js b/tests/type-inference-typescript-provider-no-ts.js new file mode 100644 index 000000000..30c5d7b34 --- /dev/null +++ b/tests/type-inference-typescript-provider-no-ts.js @@ -0,0 +1,55 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { collectTypeScriptTypes } from '../src/index/tooling/typescript-provider.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'typescript-provider-no-ts'); +const repoRoot = path.join(tempRoot, 'repo'); +const srcDir = path.join(repoRoot, 'src'); + +await fs.rm(tempRoot, { recursive: true, force: true }); +await fs.mkdir(srcDir, { recursive: true }); +await fs.writeFile( + path.join(srcDir, 'sample.ts'), + 'export function greet(name: string) { return `hi ${name}`; }\n' +); + +const chunksByFile = new Map([ + ['src/sample.ts', [{ file: 'src/sample.ts', name: 'greet', start: 0, end: 10, docmeta: {} }]] +]); + +const logs = []; +const log = (msg) => logs.push(String(msg)); +const toolingConfig = { + dir: path.join(repoRoot, '.tooling'), + typescript: { + enabled: true, + resolveOrder: ['repo'], + useTsconfig: true + } +}; + +const result = await collectTypeScriptTypes({ + rootDir: repoRoot, + chunksByFile, + log, + toolingConfig +}); + +if (!result || !(result.typesByChunk instanceof Map)) { + console.error('TypeScript provider did not return a types map.'); + process.exit(1); +} + +if (result.typesByChunk.size !== 0) { + console.error('TypeScript provider should return empty map when module is missing.'); + process.exit(1); +} + +if (!logs.some((entry) => entry.includes('TypeScript tooling not detected'))) { + console.error('TypeScript provider missing expected fallback log message.'); + process.exit(1); +} + +console.log('TypeScript provider fallback test passed'); diff --git a/tests/typescript-imports-only.js b/tests/typescript-imports-only.js new file mode 100644 index 000000000..eca3da0c1 --- /dev/null +++ b/tests/typescript-imports-only.js @@ -0,0 +1,35 @@ +#!/usr/bin/env node +import { 
collectTypeScriptImports } from '../src/lang/typescript.js'; +import { smartChunk } from '../src/index/chunking.js'; + +const text = "import type { Foo } from 'foo';\nexport = ???"; +let imports = []; +try { + imports = collectTypeScriptImports(text, { + parser: 'babel', + typescript: { importsOnly: true } + }); +} catch (err) { + console.error(`typescript imports-only test failed: ${err?.message || err}`); + process.exit(1); +} + +if (!imports.includes('foo')) { + console.error('typescript imports-only test failed: missing import'); + process.exit(1); +} + +const chunks = smartChunk({ + text: 'export interface Foo { bar: string }', + ext: '.ts', + relPath: 'foo.ts', + mode: 'code', + context: { typescript: { importsOnly: true } } +}); + +if (!Array.isArray(chunks) || chunks.length === 0) { + console.error('typescript imports-only test failed: chunker returned empty.'); + process.exit(1); +} + +console.log('typescript imports-only test passed'); diff --git a/tests/typescript-parser-selection.js b/tests/typescript-parser-selection.js new file mode 100644 index 000000000..d7afd16fd --- /dev/null +++ b/tests/typescript-parser-selection.js @@ -0,0 +1,16 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { buildTypeScriptChunks } from '../src/lang/typescript.js'; + +const sample = 'export function foo(a: number): string { return String(a); }'; + +const heuristicChunks = buildTypeScriptChunks(sample, { parser: 'heuristic' }); +assert.ok(Array.isArray(heuristicChunks) && heuristicChunks.length > 0); + +const babelChunks = buildTypeScriptChunks(sample, { parser: 'babel' }); +assert.ok(Array.isArray(babelChunks) && babelChunks.length > 0); + +const tsChunks = buildTypeScriptChunks(sample, { parser: 'typescript', rootDir: process.cwd() }); +assert.ok(Array.isArray(tsChunks) && tsChunks.length > 0); + +console.log('typescript parser selection test passed'); diff --git a/tests/uv-threadpool-env.js b/tests/uv-threadpool-env.js new file mode 100644 index 000000000..976c2c13d --- /dev/null +++ b/tests/uv-threadpool-env.js @@ -0,0 +1,62 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import fs from 'node:fs'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const cacheRoot = path.join(root, 'tests', '.cache', 'uv-threadpool-env'); +const repoRoot = path.join(cacheRoot, 'repo'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ runtime: { uvThreadpoolSize: 12 } }, null, 2) +); + +const binPath = path.join(root, 'bin', 'pairofcleats.js'); +if (!fs.existsSync(binPath)) { + console.error(`Missing CLI wrapper: ${binPath}`); + process.exit(1); +} + +const env = { ...process.env }; +delete env.UV_THREADPOOL_SIZE; + +const result = spawnSync(process.execPath, [binPath, 'config', 'dump', '--repo', repoRoot, '--json'], { + encoding: 'utf8', + env +}); + +if (result.status !== 0) { + console.error('config dump failed'); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
1); +} + +let payload; +try { + payload = JSON.parse(result.stdout || '{}'); +} catch { + console.error('config dump did not output valid JSON'); + process.exit(1); +} + +const runtime = payload?.derived?.runtime; +if (!runtime || typeof runtime !== 'object') { + console.error('config dump payload missing derived.runtime'); + process.exit(1); +} + +if (runtime.uvThreadpoolSize !== 12) { + console.error(`expected runtime.uvThreadpoolSize=12 but got ${runtime.uvThreadpoolSize}`); + process.exit(1); +} + +if (runtime.effectiveUvThreadpoolSize !== 12) { + console.error(`expected runtime.effectiveUvThreadpoolSize=12 but got ${runtime.effectiveUvThreadpoolSize}`); + process.exit(1); +} + +console.log('uv threadpool env test passed'); diff --git a/tests/uv-threadpool-no-override.js b/tests/uv-threadpool-no-override.js new file mode 100644 index 000000000..dc1dae0c2 --- /dev/null +++ b/tests/uv-threadpool-no-override.js @@ -0,0 +1,61 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import fs from 'node:fs'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const cacheRoot = path.join(root, 'tests', '.cache', 'uv-threadpool-no-override'); +const repoRoot = path.join(cacheRoot, 'repo'); + +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(repoRoot, { recursive: true }); +await fsPromises.writeFile( + path.join(repoRoot, '.pairofcleats.json'), + JSON.stringify({ runtime: { uvThreadpoolSize: 64 } }, null, 2) +); + +const binPath = path.join(root, 'bin', 'pairofcleats.js'); +if (!fs.existsSync(binPath)) { + console.error(`Missing CLI wrapper: ${binPath}`); + process.exit(1); +} + +const env = { ...process.env, UV_THREADPOOL_SIZE: '5' }; + +const result = spawnSync(process.execPath, [binPath, 'config', 'dump', '--repo', repoRoot, '--json'], { + encoding: 'utf8', + env +}); + +if (result.status !== 0) { + console.error('config dump failed'); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.status ?? 
1); +} + +let payload; +try { + payload = JSON.parse(result.stdout || '{}'); +} catch { + console.error('config dump did not output valid JSON'); + process.exit(1); +} + +const runtime = payload?.derived?.runtime; +if (!runtime || typeof runtime !== 'object') { + console.error('config dump payload missing derived.runtime'); + process.exit(1); +} + +if (runtime.uvThreadpoolSize !== 64) { + console.error(`expected runtime.uvThreadpoolSize=64 but got ${runtime.uvThreadpoolSize}`); + process.exit(1); +} + +if (runtime.effectiveUvThreadpoolSize !== 5) { + console.error(`expected runtime.effectiveUvThreadpoolSize=5 but got ${runtime.effectiveUvThreadpoolSize}`); + process.exit(1); +} + +console.log('uv threadpool no-override test passed'); diff --git a/tests/vector-extension-sanitize.js b/tests/vector-extension-sanitize.js new file mode 100644 index 000000000..6ab3fc06b --- /dev/null +++ b/tests/vector-extension-sanitize.js @@ -0,0 +1,33 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { getVectorExtensionConfig } from '../tools/vector-extension.js'; +import { loadUserConfig } from '../tools/dict-utils.js'; + +const root = process.cwd(); +const tempRoot = path.join(root, 'tests', '.cache', 'vector-extension-sanitize'); +await fs.rm(tempRoot, { recursive: true, force: true }); +await fs.mkdir(tempRoot, { recursive: true }); + +const configPath = path.join(tempRoot, '.pairofcleats.json'); +await fs.writeFile(configPath, JSON.stringify({ + sqlite: { + vectorExtension: { + enabled: true, + table: 'dense_vectors_ann; DROP TABLE chunks; --' + } + } +}, null, 2)); + +const userConfig = loadUserConfig(tempRoot); +const config = getVectorExtensionConfig(tempRoot, userConfig); +if (config.enabled) { + console.error('Expected vector extension to be disabled for invalid table name.'); + process.exit(1); +} +if (!config.disabledReason) { + console.error('Expected vector extension disabled reason to be set.'); + process.exit(1); +} + +console.log('vector extension sanitize test passed'); diff --git a/tests/vscode-extension.js b/tests/vscode-extension.js new file mode 100644 index 000000000..ef95b01fa --- /dev/null +++ b/tests/vscode-extension.js @@ -0,0 +1,49 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; + +const root = process.cwd(); +const extensionDir = path.join(root, 'extensions', 'vscode'); +const manifestPath = path.join(extensionDir, 'package.json'); +const entryPath = path.join(extensionDir, 'extension.js'); + +if (!fs.existsSync(manifestPath)) { + console.error('VS Code extension manifest missing.'); + process.exit(1); +} +if (!fs.existsSync(entryPath)) { + console.error('VS Code extension entrypoint missing.'); + process.exit(1); +} + +const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf8')); +const activationEvents = new Set(manifest.activationEvents || []); +if (!activationEvents.has('onCommand:pairofcleats.search')) { + console.error('VS Code extension activation event missing.'); + process.exit(1); +} + +const commands = manifest.contributes?.commands || []; +const commandIds = new Set(commands.map((cmd) => cmd.command)); +if (!commandIds.has('pairofcleats.search')) { + console.error('VS Code extension command missing.'); + process.exit(1); +} + +const configProps = manifest.contributes?.configuration?.properties || {}; +const requiredProps = [ + 'pairofcleats.cliPath', + 'pairofcleats.cliArgs', + 'pairofcleats.searchMode', + 'pairofcleats.searchBackend', + 'pairofcleats.searchAnn', + 
'pairofcleats.maxResults' +]; +for (const prop of requiredProps) { + if (!configProps[prop]) { + console.error(`VS Code extension config missing ${prop}.`); + process.exit(1); + } +} + +console.log('VS Code extension tests passed'); diff --git a/tests/watch-backend-selection.js b/tests/watch-backend-selection.js new file mode 100644 index 000000000..4b382be30 --- /dev/null +++ b/tests/watch-backend-selection.js @@ -0,0 +1,27 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import { getCapabilities } from '../src/shared/capabilities.js'; +import { resolveWatcherBackend } from '../src/index/build/watch.js'; + +const runtime = { userConfig: {}, argv: {} }; +const caps = getCapabilities({ refresh: true }); + +process.env.PAIROFCLEATS_WATCHER_BACKEND = 'chokidar'; +const forcedChokidar = resolveWatcherBackend({ runtime, pollMs: 0 }); +assert.equal(forcedChokidar.resolved, 'chokidar', 'forced chokidar should resolve to chokidar'); + +process.env.PAIROFCLEATS_WATCHER_BACKEND = 'parcel'; +const forcedParcel = resolveWatcherBackend({ runtime, pollMs: 0 }); +if (caps.watcher.parcel) { + assert.equal(forcedParcel.resolved, 'parcel', 'parcel should resolve when available'); +} else { + assert.equal(forcedParcel.resolved, 'chokidar', 'parcel should fall back when unavailable'); + assert.ok(forcedParcel.warning, 'fallback should include warning'); +} + +const pollFallback = resolveWatcherBackend({ runtime, pollMs: 500 }); +assert.equal(pollFallback.resolved, 'chokidar', 'polling forces chokidar'); + +delete process.env.PAIROFCLEATS_WATCHER_BACKEND; + +console.log('watch backend selection tests passed'); diff --git a/tests/watch-debounce.js b/tests/watch-debounce.js new file mode 100644 index 000000000..80eb87e3c --- /dev/null +++ b/tests/watch-debounce.js @@ -0,0 +1,53 @@ +import assert from 'node:assert/strict'; +import { createDebouncedScheduler } from '../src/index/build/watch.js'; + +const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + +let calls = 0; +const scheduler = createDebouncedScheduler({ + debounceMs: 30, + onRun: () => { + calls += 1; + } +}); + +scheduler.schedule(); +scheduler.schedule(); +await wait(10); +scheduler.schedule(); +await wait(60); +assert.equal(calls, 1, 'expected single debounced run'); + +scheduler.schedule(); +await wait(50); +assert.equal(calls, 2, 'expected second run after debounce'); + + + +let errorCalls = 0; +let unhandledRejections = 0; +const onUnhandled = () => { + unhandledRejections += 1; +}; +process.on('unhandledRejection', onUnhandled); + +const asyncScheduler = createDebouncedScheduler({ + debounceMs: 10, + onRun: async () => { + throw new Error('boom'); + }, + onError: () => { + errorCalls += 1; + } +}); + +asyncScheduler.schedule(); +await wait(40); +asyncScheduler.cancel(); + +process.removeListener('unhandledRejection', onUnhandled); + +assert.equal(errorCalls, 1, 'expected onError to be invoked for async rejection'); +assert.equal(unhandledRejections, 0, 'expected no unhandledRejection events'); + +console.log('watch debounce test passed'); diff --git a/tests/watch-filter.js b/tests/watch-filter.js new file mode 100644 index 000000000..9d7e70061 --- /dev/null +++ b/tests/watch-filter.js @@ -0,0 +1,78 @@ +import assert from 'node:assert/strict'; +import path from 'node:path'; +import ignore from 'ignore'; +import { isIndexablePath } from '../src/index/build/watch.js'; + +const root = path.join(process.cwd(), 'tests', 'fixtures', 'sample'); +const ignoreMatcher = ignore().add(['ignored/']); + +const jsPath = 
path.join(root, 'src', 'app.js'); +assert.equal( + isIndexablePath({ absPath: jsPath, root, ignoreMatcher, modes: ['code'] }), + true, + 'expected code extension to be indexable for code mode' +); + +const mdPath = path.join(root, 'docs', 'readme.md'); +assert.equal( + isIndexablePath({ absPath: mdPath, root, ignoreMatcher, modes: ['prose'] }), + true, + 'expected prose extension to be indexable for prose mode' +); + +assert.equal( + isIndexablePath({ absPath: mdPath, root, ignoreMatcher, modes: ['code'] }), + false, + 'expected prose extension to be excluded for code-only mode' +); + +const dockerfilePath = path.join(root, 'Dockerfile'); +assert.equal( + isIndexablePath({ absPath: dockerfilePath, root, ignoreMatcher, modes: ['code'] }), + true, + 'expected special code filename to be indexable for code mode' +); + +const dockerfileVariantPath = path.join(root, 'Dockerfile.dev'); +assert.equal( + isIndexablePath({ absPath: dockerfileVariantPath, root, ignoreMatcher, modes: ['code'] }), + true, + 'expected dockerfile variants to be indexable for code mode' +); + +const makefileVariantPath = path.join(root, 'Makefile.in'); +assert.equal( + isIndexablePath({ absPath: makefileVariantPath, root, ignoreMatcher, modes: ['code'] }), + true, + 'expected makefile variants to be indexable for code mode' +); + +const gnuMakefilePath = path.join(root, 'GNUmakefile'); +assert.equal( + isIndexablePath({ absPath: gnuMakefilePath, root, ignoreMatcher, modes: ['code'] }), + true, + 'expected GNUmakefile to be indexable for code mode' +); + +const ignoredPath = path.join(root, 'ignored', 'app.js'); +assert.equal( + isIndexablePath({ absPath: ignoredPath, root, ignoreMatcher, modes: ['code'] }), + false, + 'expected ignored path to be excluded' +); + +const outsidePath = path.join(root, '..', 'outside', 'file.js'); +assert.equal( + isIndexablePath({ absPath: outsidePath, root, ignoreMatcher, modes: ['code'] }), + false, + 'expected path outside root to be excluded' +); + +const mixedModesPath = path.join(root, 'content', 'story.md'); +assert.equal( + isIndexablePath({ absPath: mixedModesPath, root, ignoreMatcher, modes: ['code', 'prose'] }), + true, + 'expected prose extension to be indexable when prose mode is enabled' +); + +console.log('watch filter test passed'); diff --git a/tests/watch-stability-guard.js b/tests/watch-stability-guard.js new file mode 100644 index 000000000..e2bd66bfb --- /dev/null +++ b/tests/watch-stability-guard.js @@ -0,0 +1,27 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import { waitForStableFile } from '../src/index/build/watch.js'; + +const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'pairofcleats-watch-')); +const filePath = path.join(tempRoot, 'partial.txt'); + +await fs.writeFile(filePath, 'start'); + +const appendPromise = new Promise((resolve) => { + setTimeout(() => { + void fs.appendFile(filePath, 'more').then(resolve); + }, 50); +}); + +const started = Date.now(); +const stable = await waitForStableFile(filePath, { checks: 3, intervalMs: 100 }); +const elapsed = Date.now() - started; +await appendPromise; + +assert.equal(stable, true, 'stability guard should eventually resolve true'); +assert.ok(elapsed >= 150, 'stability guard should wait for file to settle'); + +console.log('watch stability guard tests passed'); diff --git a/tests/worker-pool-windows.js b/tests/worker-pool-windows.js new file mode 100644 index 000000000..e4fab604a --- /dev/null +++ 
b/tests/worker-pool-windows.js @@ -0,0 +1,102 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { normalizePostingsConfig } from '../src/shared/postings-config.js'; +import { createTokenizationContext, tokenizeChunkText } from '../src/index/build/tokenization.js'; +import { createIndexerWorkerPool, normalizeWorkerPoolConfig } from '../src/index/build/worker-pool.js'; + +if (process.platform !== 'win32') { + console.log('worker pool windows test skipped (non-windows).'); + process.exit(0); +} + +const root = path.resolve('tests', '.cache', 'worker-pool-windows'); +const deepDir = path.join(root, 'space dir', 'unicode-é', 'deep', 'path', 'more'); +await fs.mkdir(deepDir, { recursive: true }); + +const originalCwd = process.cwd(); +try { + process.chdir(deepDir); + const postingsConfig = normalizePostingsConfig({ + enablePhraseNgrams: true, + phraseMinN: 2, + phraseMaxN: 3, + enableChargrams: true, + chargramMinN: 3, + chargramMaxN: 3 + }); + const dictWords = new Set(['hello', 'world', 'foo', 'bar']); + const dictConfig = { segmentation: 'greedy' }; + const workerConfig = normalizeWorkerPoolConfig({ + enabled: true, + maxWorkers: 1, + maxFileBytes: 4096, + quantizeBatchSize: 2, + taskTimeoutMs: 5000 + }, { cpuLimit: 1 }); + + const workerPool = await createIndexerWorkerPool({ + config: workerConfig, + dictWords, + dictConfig, + postingsConfig + }); + if (!workerPool) { + console.log('worker pool windows test skipped (worker pool unavailable).'); + process.exit(0); + } + + const context = createTokenizationContext({ dictWords, dictConfig, postingsConfig }); + const sample = 'helloWorld fooBar'; + const syncTokens = tokenizeChunkText({ text: sample, mode: 'code', ext: '.js', context }); + + const runs = []; + for (let i = 0; i < 50; i += 1) { + runs.push(workerPool.runTokenize({ + text: sample, + mode: 'code', + ext: '.js', + file: `task-${i}`, + size: sample.length + })); + } + const results = await Promise.all(runs); + for (const result of results) { + if (!result) { + console.error('worker pool windows test failed: missing token result.'); + process.exit(1); + } + if (JSON.stringify(syncTokens.tokens) !== JSON.stringify(result.tokens)) { + console.error('worker pool windows test failed: tokens mismatch.'); + process.exit(1); + } + } + + if (workerPool.pool?.destroy) { + await workerPool.pool.destroy(); + await workerPool.runTokenize({ + text: sample, + mode: 'code', + ext: '.js', + file: 'restart', + size: sample.length + }); + await new Promise((resolve) => setTimeout(resolve, 1200)); + const restarted = await workerPool.runTokenize({ + text: sample, + mode: 'code', + ext: '.js', + file: 'restart-2', + size: sample.length + }); + if (!restarted) { + console.error('worker pool windows test failed: restart did not recover.'); + process.exit(1); + } + } + + await workerPool.destroy(); + console.log('worker pool windows test passed'); +} finally { + process.chdir(originalCwd); +} diff --git a/tests/worker-pool.js b/tests/worker-pool.js new file mode 100644 index 000000000..4a1873355 --- /dev/null +++ b/tests/worker-pool.js @@ -0,0 +1,74 @@ +#!/usr/bin/env node +import { normalizePostingsConfig } from '../src/shared/postings-config.js'; +import { quantizeVec } from '../src/index/embedding.js'; +import { createTokenizationContext, tokenizeChunkText } from '../src/index/build/tokenization.js'; +import { createIndexerWorkerPool, normalizeWorkerPoolConfig } from '../src/index/build/worker-pool.js'; + +const postingsConfig = normalizePostingsConfig({ 
+ enablePhraseNgrams: true, + phraseMinN: 2, + phraseMaxN: 3, + enableChargrams: true, + chargramMinN: 3, + chargramMaxN: 3 +}); +const dictWords = new Set(['hello', 'world', 'foo', 'bar']); +const dictConfig = { segmentation: 'greedy' }; +const workerConfig = normalizeWorkerPoolConfig({ + enabled: true, + maxWorkers: 1, + maxFileBytes: 4096, + quantizeBatchSize: 2, + taskTimeoutMs: 5000 +}, { cpuLimit: 1 }); + +const workerPool = await createIndexerWorkerPool({ + config: workerConfig, + dictWords, + dictConfig, + postingsConfig +}); +if (!workerPool) { + console.log('worker pool test skipped (worker pool unavailable).'); + process.exit(0); +} + +const context = createTokenizationContext({ dictWords, dictConfig, postingsConfig }); +const sample = 'helloWorld fooBar'; +const syncTokens = tokenizeChunkText({ text: sample, mode: 'code', ext: '.js', context }); +const workerTokens = await workerPool.runTokenize({ text: sample, mode: 'code', ext: '.js' }); + +if (JSON.stringify(syncTokens.tokens) !== JSON.stringify(workerTokens.tokens)) { + console.error('worker pool test failed: tokens mismatch.'); + process.exit(1); +} +if (JSON.stringify(syncTokens.seq) !== JSON.stringify(workerTokens.seq)) { + console.error('worker pool test failed: seq mismatch.'); + process.exit(1); +} +if (JSON.stringify(syncTokens.ngrams) !== JSON.stringify(workerTokens.ngrams)) { + console.error('worker pool test failed: ngrams mismatch.'); + process.exit(1); +} +if (JSON.stringify(syncTokens.chargrams) !== JSON.stringify(workerTokens.chargrams)) { + console.error('worker pool test failed: chargrams mismatch.'); + process.exit(1); +} +if (JSON.stringify(syncTokens.minhashSig) !== JSON.stringify(workerTokens.minhashSig)) { + console.error('worker pool test failed: minhash mismatch.'); + process.exit(1); +} + +const vectors = [ + [0, 0.5], + [1, -1] +]; +const syncQuant = vectors.map((vec) => quantizeVec(vec)); +const workerQuant = await workerPool.runQuantize({ vectors }); +if (JSON.stringify(syncQuant) !== JSON.stringify(workerQuant)) { + console.error('worker pool test failed: quantize mismatch.'); + process.exit(1); +} + +await workerPool.destroy(); +console.log('worker pool test passed'); diff --git a/tests/xxhash-backends.js b/tests/xxhash-backends.js new file mode 100644 index 000000000..8f5fd4db1 --- /dev/null +++ b/tests/xxhash-backends.js @@ -0,0 +1,30 @@ +#!/usr/bin/env node +import assert from 'node:assert/strict'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import { getCapabilities } from '../src/shared/capabilities.js'; +import { checksumFile, checksumString, setXxhashBackend } from '../src/shared/hash.js'; + +const baseline = '44bc2cf5ad770999'; + +setXxhashBackend('wasm'); +const wasmHash = await checksumString('abc'); +assert.equal(wasmHash.value, baseline, 'wasm checksumString should match baseline'); + +const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'pairofcleats-xxhash-')); +const filePath = path.join(tempRoot, 'sample.txt'); +await fs.writeFile(filePath, 'abc'); +const fileHash = await checksumFile(filePath); +assert.equal(fileHash.value, baseline, 'checksumFile should match checksumString'); + +const caps = getCapabilities(); +if (caps.hash.nodeRsXxhash) { + setXxhashBackend('native'); + const nativeHash = await checksumString('abc'); + assert.equal(nativeHash.value, baseline, 'native checksumString should match baseline'); +} + +setXxhashBackend(''); + +console.log('xxhash backend tests passed'); diff --git a/tools/api-server.js 
b/tools/api-server.js new file mode 100644 index 000000000..5abe7efba --- /dev/null +++ b/tools/api-server.js @@ -0,0 +1,72 @@ +#!/usr/bin/env node +import http from 'node:http'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { getRuntimeConfig, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; +import { getMetricsRegistry } from '../src/shared/metrics.js'; +import { createApiRouter } from './api/router.js'; +import { configureServiceLogger } from './service/logger.js'; + +const argv = createCli({ + scriptName: 'api-server', + options: { + host: { type: 'string', default: '127.0.0.1' }, + port: { type: 'string', default: '7345' }, + output: { type: 'string', default: 'compact' }, + json: { type: 'boolean', default: false }, + quiet: { type: 'boolean', default: false }, + repo: { type: 'string' } + } +}).parse(); + +const host = argv.host || '127.0.0.1'; +const port = Number.isFinite(Number(argv.port)) ? Number(argv.port) : 7345; +const defaultRepo = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); +const jsonOutput = argv.json === true; +const quiet = argv.quiet === true; +const userConfig = loadUserConfig(defaultRepo); +const runtimeConfig = getRuntimeConfig(defaultRepo, userConfig); +const parsedUv = Number(process.env.UV_THREADPOOL_SIZE); +const effectiveUvThreadpoolSize = Number.isFinite(parsedUv) && parsedUv > 0 ? Math.floor(parsedUv) : null; + +const metricsRegistry = getMetricsRegistry(); +const { logLine } = configureServiceLogger({ repoRoot: defaultRepo, service: 'api' }); + +const log = (message) => { + if (quiet) return; + logLine(message); +}; + +const router = createApiRouter({ + host, + defaultRepo, + defaultOutput: argv.output, + metricsRegistry +}); + +const server = http.createServer(router.handleRequest); + +server.listen({ port, host }, () => { + const address = server.address(); + const actualPort = typeof address === 'object' && address ? address.port : port; + const baseUrl = `http://${host}:${actualPort}`; + if (jsonOutput) { + console.log(JSON.stringify({ ok: true, host, port: actualPort, repo: defaultRepo, baseUrl })); + } else { + log(`[api] listening at ${baseUrl}`); + log(`[api] repo root: ${defaultRepo}`); + log(`[api] UV_THREADPOOL_SIZE: ${effectiveUvThreadpoolSize ?? 'default'} (config=${runtimeConfig.uvThreadpoolSize ?? 'none'})`); + } +}); + +const shutdown = (signal) => { + log(`[api] ${signal} received; shutting down...`); + server.close(() => { + router.close(); + log('[api] shutdown complete.'); + process.exit(0); + }); +}; + +process.on('SIGINT', () => shutdown('SIGINT')); +process.on('SIGTERM', () => shutdown('SIGTERM')); diff --git a/tools/api/response.js b/tools/api/response.js new file mode 100644 index 000000000..e102c8b00 --- /dev/null +++ b/tools/api/response.js @@ -0,0 +1,28 @@ +/** + * Write a JSON payload to the HTTP response. + * @param {import('node:http').ServerResponse} res + * @param {number} statusCode + * @param {any} payload + */ +export const sendJson = (res, statusCode, payload) => { + const body = JSON.stringify(payload); + res.writeHead(statusCode, { + 'Content-Type': 'application/json; charset=utf-8', + 'Content-Length': Buffer.byteLength(body), + 'Access-Control-Allow-Origin': '*' + }); + res.end(body); +}; + +/** + * Write an error payload to the HTTP response. 
+ * @param {import('node:http').ServerResponse} res
+ * @param {number} statusCode
+ * @param {string} code
+ * @param {string} message
+ * @param {object} [details]
+ */
+export const sendError = (res, statusCode, code, message, details = {}) => {
+  const { code: ignored, ...rest } = details || {}; // strip any caller-supplied "code" so the explicit argument wins
+  sendJson(res, statusCode, { ok: false, code, message, ...rest });
+};
diff --git a/tools/api/router.js b/tools/api/router.js
new file mode 100644
index 000000000..721dd9fc4
--- /dev/null
+++ b/tools/api/router.js
@@ -0,0 +1,438 @@
+import fs from 'node:fs';
+import path from 'node:path';
+import { resolveRepoRoot } from '../dict-utils.js';
+import { search, status } from '../../src/integrations/core/index.js';
+import { createSqliteDbCache } from '../../src/retrieval/sqlite-cache.js';
+import { createSearchValidator, normalizeMetaFilters } from './validation.js';
+import { sendError, sendJson } from './response.js';
+import { ERROR_CODES } from '../../src/shared/error-codes.js';
+import { createSseResponder } from './sse.js';
+
+/**
+ * Create an API router for the HTTP server.
+ * @param {{host:string,defaultRepo:string,defaultOutput:string,metricsRegistry:any}} config
+ */
+export const createApiRouter = ({ host, defaultRepo, defaultOutput, metricsRegistry }) => {
+  const validateSearchPayload = createSearchValidator();
+  const repoCaches = new Map();
+
+  const getRepoCaches = (repoPath) => {
+    const key = repoPath || defaultRepo;
+    const existing = repoCaches.get(key);
+    if (existing) {
+      existing.lastUsed = Date.now();
+      return existing;
+    }
+    const entry = {
+      indexCache: new Map(),
+      sqliteCache: createSqliteDbCache(),
+      lastUsed: Date.now()
+    };
+    repoCaches.set(key, entry);
+    return entry;
+  };
+
+  const closeRepoCaches = () => {
+    for (const entry of repoCaches.values()) {
+      entry.sqliteCache?.closeAll?.();
+    }
+    repoCaches.clear();
+  };
+
+  /**
+   * Parse a JSON request body.
+   * @param {import('node:http').IncomingMessage} req
+   * @returns {Promise<string>} Raw request body text.
+   */
+  const parseBody = (req) => new Promise((resolve, reject) => {
+    let data = '';
+    req.on('data', (chunk) => {
+      data += chunk;
+      if (data.length > 1_000_000) {
+        reject(new Error('Request body too large.'));
+        req.destroy();
+      }
+    });
+    req.on('aborted', () => reject(new Error('Request aborted.')));
+    req.on('end', () => resolve(data));
+    req.on('error', reject);
+  });
+
+  /**
+   * Resolve and validate a repo path.
+   * @param {string|null|undefined} value
+   * @returns {string}
+   */
+  const resolveRepo = (value) => {
+    const candidate = value ? path.resolve(value) : defaultRepo;
+    if (!fs.existsSync(candidate)) {
+      throw new Error(`Repo path not found: ${candidate}`);
+    }
+    if (!fs.statSync(candidate).isDirectory()) {
+      throw new Error(`Repo path is not a directory: ${candidate}`);
+    }
+    return value ? resolveRepoRoot(candidate) : candidate;
+  };
+
+  /**
+   * Build CLI search arguments from a request payload.
+   * @param {string} repoPath
+   * @param {any} payload
+   * @returns {{ok:boolean,message?:string,args?:string[],query?:string}}
+   */
+  const buildSearchParams = (repoPath, payload) => {
+    const query = payload?.query ? String(payload.query) : '';
+    if (!query) {
+      return { ok: false, message: 'Missing query.' };
+    }
+    const output = payload?.output || defaultOutput;
+    const useCompact = output !== 'full' && output !== 'json';
+    const searchArgs = [useCompact ? '--json-compact' : '--json', '--repo', repoPath];
+    const mode = payload?.mode ? String(payload.mode) : null;
+    const backend = payload?.backend ?
String(payload.backend) : null; + const ann = payload?.ann; + const top = Number.isFinite(Number(payload?.top)) ? Number(payload.top) : null; + const context = Number.isFinite(Number(payload?.context)) ? Number(payload.context) : null; + const typeFilter = payload?.type ? String(payload.type) : null; + const authorFilter = payload?.author ? String(payload.author) : null; + const importFilter = payload?.import ? String(payload.import) : null; + const callsFilter = payload?.calls ? String(payload.calls) : null; + const usesFilter = payload?.uses ? String(payload.uses) : null; + const signatureFilter = payload?.signature ? String(payload.signature) : null; + const paramFilter = payload?.param ? String(payload.param) : null; + const decoratorFilter = payload?.decorator ? String(payload.decorator) : null; + const inferredTypeFilter = payload?.inferredType ? String(payload.inferredType) : null; + const returnTypeFilter = payload?.returnType ? String(payload.returnType) : null; + const throwsFilter = payload?.throws ? String(payload.throws) : null; + const readsFilter = payload?.reads ? String(payload.reads) : null; + const writesFilter = payload?.writes ? String(payload.writes) : null; + const mutatesFilter = payload?.mutates ? String(payload.mutates) : null; + const aliasFilter = payload?.alias ? String(payload.alias) : null; + const awaitsFilter = payload?.awaits ? String(payload.awaits) : null; + const riskFilter = payload?.risk ? String(payload.risk) : null; + const riskTagFilter = payload?.riskTag ? String(payload.riskTag) : null; + const riskSourceFilter = payload?.riskSource ? String(payload.riskSource) : null; + const riskSinkFilter = payload?.riskSink ? String(payload.riskSink) : null; + const riskCategoryFilter = payload?.riskCategory ? String(payload.riskCategory) : null; + const riskFlowFilter = payload?.riskFlow ? String(payload.riskFlow) : null; + const branchesMin = Number.isFinite(Number(payload?.branchesMin)) ? Number(payload.branchesMin) : null; + const loopsMin = Number.isFinite(Number(payload?.loopsMin)) ? Number(payload.loopsMin) : null; + const breaksMin = Number.isFinite(Number(payload?.breaksMin)) ? Number(payload.breaksMin) : null; + const continuesMin = Number.isFinite(Number(payload?.continuesMin)) ? Number(payload.continuesMin) : null; + const churnMin = Number.isFinite(Number(payload?.churnMin)) ? Number(payload.churnMin) : null; + const chunkAuthorFilter = payload?.chunkAuthor ? String(payload.chunkAuthor) : null; + const modifiedAfter = payload?.modifiedAfter ? String(payload.modifiedAfter) : null; + const modifiedSince = Number.isFinite(Number(payload?.modifiedSince)) ? Number(payload.modifiedSince) : null; + const visibilityFilter = payload?.visibility ? String(payload.visibility) : null; + const extendsFilter = payload?.extends ? String(payload.extends) : null; + const lintFilter = payload?.lint === true; + const asyncFilter = payload?.async === true; + const generatorFilter = payload?.generator === true; + const returnsFilter = payload?.returns === true; + const branchFilter = payload?.branch ? String(payload.branch) : null; + const langFilter = payload?.lang ? String(payload.lang) : null; + const caseAll = payload?.case === true; + const caseFile = payload?.caseFile === true || caseAll; + const caseTokens = payload?.caseTokens === true || caseAll; + const fileFilters = []; + const toList = (value) => (Array.isArray(value) ? value : (value == null ? 
[] : [value])); + fileFilters.push(...toList(payload?.path)); + fileFilters.push(...toList(payload?.file)); + const extFilters = toList(payload?.ext); + const metaFilters = normalizeMetaFilters(payload?.meta); + const metaJson = payload?.metaJson || null; + + if (mode && mode !== 'both') searchArgs.push('--mode', mode); + if (backend) searchArgs.push('--backend', backend); + if (ann === true) searchArgs.push('--ann'); + if (ann === false) searchArgs.push('--no-ann'); + if (top) searchArgs.push('-n', String(top)); + if (context !== null) searchArgs.push('--context', String(context)); + if (typeFilter) searchArgs.push('--type', typeFilter); + if (authorFilter) searchArgs.push('--author', authorFilter); + if (importFilter) searchArgs.push('--import', importFilter); + if (callsFilter) searchArgs.push('--calls', callsFilter); + if (usesFilter) searchArgs.push('--uses', usesFilter); + if (signatureFilter) searchArgs.push('--signature', signatureFilter); + if (paramFilter) searchArgs.push('--param', paramFilter); + if (decoratorFilter) searchArgs.push('--decorator', decoratorFilter); + if (inferredTypeFilter) searchArgs.push('--inferred-type', inferredTypeFilter); + if (returnTypeFilter) searchArgs.push('--return-type', returnTypeFilter); + if (throwsFilter) searchArgs.push('--throws', throwsFilter); + if (readsFilter) searchArgs.push('--reads', readsFilter); + if (writesFilter) searchArgs.push('--writes', writesFilter); + if (mutatesFilter) searchArgs.push('--mutates', mutatesFilter); + if (aliasFilter) searchArgs.push('--alias', aliasFilter); + if (awaitsFilter) searchArgs.push('--awaits', awaitsFilter); + if (riskFilter) searchArgs.push('--risk', riskFilter); + if (riskTagFilter) searchArgs.push('--risk-tag', riskTagFilter); + if (riskSourceFilter) searchArgs.push('--risk-source', riskSourceFilter); + if (riskSinkFilter) searchArgs.push('--risk-sink', riskSinkFilter); + if (riskCategoryFilter) searchArgs.push('--risk-category', riskCategoryFilter); + if (riskFlowFilter) searchArgs.push('--risk-flow', riskFlowFilter); + if (branchesMin !== null) searchArgs.push('--branches', String(branchesMin)); + if (loopsMin !== null) searchArgs.push('--loops', String(loopsMin)); + if (breaksMin !== null) searchArgs.push('--breaks', String(breaksMin)); + if (continuesMin !== null) searchArgs.push('--continues', String(continuesMin)); + if (churnMin !== null) searchArgs.push('--churn', String(churnMin)); + if (chunkAuthorFilter) searchArgs.push('--chunk-author', chunkAuthorFilter); + if (modifiedAfter) searchArgs.push('--modified-after', modifiedAfter); + if (modifiedSince !== null) searchArgs.push('--modified-since', String(modifiedSince)); + if (visibilityFilter) searchArgs.push('--visibility', visibilityFilter); + if (extendsFilter) searchArgs.push('--extends', extendsFilter); + if (lintFilter) searchArgs.push('--lint'); + if (asyncFilter) searchArgs.push('--async'); + if (generatorFilter) searchArgs.push('--generator'); + if (returnsFilter) searchArgs.push('--returns'); + if (branchFilter) searchArgs.push('--branch', branchFilter); + if (langFilter) searchArgs.push('--lang', langFilter); + if (caseAll) searchArgs.push('--case'); + if (!caseAll && caseFile) searchArgs.push('--case-file'); + if (!caseAll && caseTokens) searchArgs.push('--case-tokens'); + for (const entry of fileFilters) { + if (entry == null || entry === '') continue; + searchArgs.push('--path', String(entry)); + } + for (const entry of extFilters) { + if (entry == null || entry === '') continue; + searchArgs.push('--ext', String(entry)); + 
} + if (Array.isArray(metaFilters)) { + metaFilters.forEach((entry) => searchArgs.push('--meta', entry)); + } + if (metaJson) { + const jsonValue = typeof metaJson === 'string' ? metaJson : JSON.stringify(metaJson); + searchArgs.push('--meta-json', jsonValue); + } + + return { ok: true, args: searchArgs, query }; + }; + + const isNoIndexError = (err) => { + if (err?.code === ERROR_CODES.NO_INDEX) return true; + const message = String(err?.message || err || '').toLowerCase(); + if (!message) return false; + return message.includes('index not found') + || message.includes('missing required tables') + || message.includes('missing sqlite index') + || message.includes('missing lmdb index') + || message.includes('sqlite backend requested but index not found') + || message.includes('lmdb backend requested but index not found'); + }; + + const handleRequest = async (req, res) => { + const requestUrl = new URL(req.url || '/', `http://${host}`); + res.setHeader('Access-Control-Allow-Origin', '*'); + res.setHeader('Access-Control-Allow-Methods', 'GET,POST,OPTIONS'); + res.setHeader('Access-Control-Allow-Headers', 'Content-Type'); + if (req.method === 'OPTIONS') { + res.writeHead(204); + res.end(); + return; + } + + if (requestUrl.pathname === '/health' && req.method === 'GET') { + sendJson(res, 200, { ok: true, uptimeMs: Math.round(process.uptime() * 1000) }); + return; + } + + if (requestUrl.pathname === '/metrics' && req.method === 'GET') { + try { + const body = await metricsRegistry.metrics(); + res.writeHead(200, { + 'Content-Type': metricsRegistry.contentType || 'text/plain; version=0.0.4; charset=utf-8', + 'Access-Control-Allow-Origin': '*' + }); + res.end(body); + } catch (err) { + sendError(res, 500, ERROR_CODES.INTERNAL, 'Failed to render metrics.', { + error: err?.message || String(err) + }); + } + return; + } + + if (requestUrl.pathname === '/status/stream' && req.method === 'GET') { + const sse = createSseResponder(req, res); + let repoPath = ''; + try { + repoPath = resolveRepo(requestUrl.searchParams.get('repo')); + } catch (err) { + await sse.sendHeaders(); + await sse.sendEvent('error', { + ok: false, + code: ERROR_CODES.INVALID_REQUEST, + message: err?.message || 'Invalid repo path.' + }); + await sse.sendEvent('done', { ok: false }); + sse.end(); + return; + } + await sse.sendHeaders(); + await sse.sendEvent('start', { ok: true, repo: repoPath }); + try { + const payload = await status(repoPath); + if (!sse.isClosed()) { + await sse.sendEvent('result', { ok: true, repo: repoPath, status: payload }); + await sse.sendEvent('done', { ok: true }); + } + } catch (err) { + await sse.sendEvent('error', { + ok: false, + code: ERROR_CODES.INTERNAL, + message: err?.message || 'Failed to collect status.' 
+ }); + await sse.sendEvent('done', { ok: false }); + } + sse.end(); + return; + } + + if (requestUrl.pathname === '/status' && req.method === 'GET') { + let repoPath = ''; + try { + repoPath = resolveRepo(requestUrl.searchParams.get('repo')); + } catch (err) { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, err?.message || 'Invalid repo path.'); + return; + } + try { + const payload = await status(repoPath); + sendJson(res, 200, { ok: true, repo: repoPath, status: payload }); + } catch (err) { + sendError(res, 500, ERROR_CODES.INTERNAL, 'Failed to collect status.', { + error: err?.message || String(err) + }); + } + return; + } + + if (requestUrl.pathname === '/search/stream' && req.method === 'POST') { + const sse = createSseResponder(req, res); + let raw; + try { + raw = await parseBody(req); + } catch (err) { + sendError(res, 413, ERROR_CODES.INVALID_REQUEST, err?.message || 'Request body too large.'); + return; + } + let payload = null; + try { + payload = raw ? JSON.parse(raw) : null; + } catch { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, 'Invalid JSON payload.'); + return; + } + const validation = validateSearchPayload(payload); + if (!validation.ok) { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, 'Invalid search payload.', { + errors: validation.errors + }); + return; + } + let repoPath = ''; + try { + repoPath = resolveRepo(payload?.repoPath || payload?.repo); + } catch (err) { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, err?.message || 'Invalid repo path.'); + return; + } + const searchParams = buildSearchParams(repoPath, payload || {}); + if (!searchParams.ok) { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, searchParams.message || 'Invalid search payload.'); + return; + } + await sse.sendHeaders(); + await sse.sendEvent('start', { ok: true }); + const caches = getRepoCaches(repoPath); + try { + const body = await search(repoPath, { + args: searchParams.args, + query: searchParams.query, + emitOutput: false, + exitOnError: false, + indexCache: caches.indexCache, + sqliteCache: caches.sqliteCache + }); + if (!sse.isClosed()) { + await sse.sendEvent('result', { ok: true, repo: repoPath, result: body }); + await sse.sendEvent('done', { ok: true }); + } + } catch (err) { + const isNoIndex = isNoIndexError(err); + await sse.sendEvent('error', { + ok: false, + code: isNoIndex ? ERROR_CODES.NO_INDEX : ERROR_CODES.INTERNAL, + message: err?.message || 'Search failed.' + }); + await sse.sendEvent('done', { ok: false }); + } + sse.end(); + return; + } + + if (requestUrl.pathname === '/search' && req.method === 'POST') { + let raw; + try { + raw = await parseBody(req); + } catch (err) { + sendError(res, 413, ERROR_CODES.INVALID_REQUEST, err?.message || 'Request body too large.'); + return; + } + let payload = null; + try { + payload = raw ? 
JSON.parse(raw) : null; + } catch { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, 'Invalid JSON payload.'); + return; + } + const validation = validateSearchPayload(payload); + if (!validation.ok) { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, 'Invalid search payload.', { + errors: validation.errors + }); + return; + } + let repoPath = ''; + try { + repoPath = resolveRepo(payload?.repoPath || payload?.repo); + } catch (err) { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, err?.message || 'Invalid repo path.'); + return; + } + const searchParams = buildSearchParams(repoPath, payload || {}); + if (!searchParams.ok) { + sendError(res, 400, ERROR_CODES.INVALID_REQUEST, searchParams.message || 'Invalid search payload.'); + return; + } + try { + const caches = getRepoCaches(repoPath); + const body = await search(repoPath, { + args: searchParams.args, + query: searchParams.query, + emitOutput: false, + exitOnError: false, + indexCache: caches.indexCache, + sqliteCache: caches.sqliteCache + }); + sendJson(res, 200, { ok: true, repo: repoPath, result: body }); + } catch (err) { + if (isNoIndexError(err)) { + sendError(res, 409, ERROR_CODES.NO_INDEX, err?.message || 'Index not found.', { + error: err?.message || String(err) + }); + return; + } + sendError(res, 500, ERROR_CODES.INTERNAL, 'Search failed.', { error: err?.message || String(err) }); + } + return; + } + + sendError(res, 404, ERROR_CODES.NOT_FOUND, 'Not found.'); + }; + + return { + handleRequest, + close: closeRepoCaches + }; +}; diff --git a/tools/api/sse.js b/tools/api/sse.js new file mode 100644 index 000000000..5bd1de716 --- /dev/null +++ b/tools/api/sse.js @@ -0,0 +1,49 @@ +/** + * Write SSE headers for streaming responses. + * @param {import('node:http').IncomingMessage} req + * @param {import('node:http').ServerResponse} res + */ +export const createSseResponder = (req, res) => { + let closed = false; + const markClosed = () => { + closed = true; + }; + req.on('aborted', markClosed); + res.on('close', markClosed); + res.on('finish', markClosed); + res.on('error', markClosed); + const writeChunk = async (chunk) => { + if (closed || res.writableEnded || res.destroyed) return false; + if (!res.write(chunk)) { + await new Promise((resolve) => res.once('drain', resolve)); + if (closed || res.writableEnded || res.destroyed) return false; + } + return true; + }; + return { + sendHeaders() { + if (closed || res.headersSent) return false; + res.writeHead(200, { + 'Content-Type': 'text/event-stream; charset=utf-8', + 'Cache-Control': 'no-cache', + Connection: 'keep-alive', + 'Access-Control-Allow-Origin': '*' + }); + return writeChunk('\n'); + }, + async sendEvent(event, payload) { + if (closed || res.writableEnded || res.destroyed) return false; + const ok = await writeChunk(`event: ${event}\n`); + if (!ok) return false; + return writeChunk(`data: ${JSON.stringify(payload)}\n\n`); + }, + end() { + if (closed || res.writableEnded || res.destroyed) return; + res.end(); + closed = true; + }, + isClosed() { + return closed || res.writableEnded || res.destroyed; + } + }; +}; diff --git a/tools/api/validation.js b/tools/api/validation.js new file mode 100644 index 000000000..b4386114a --- /dev/null +++ b/tools/api/validation.js @@ -0,0 +1,142 @@ +import Ajv from 'ajv'; + +const stringListSchema = { + anyOf: [ + { type: 'string' }, + { type: 'array', items: { type: 'string' } } + ] +}; + +const metaSchema = { + anyOf: [ + { type: 'string' }, + { + type: 'array', + items: { + anyOf: [ + { type: 'string' }, + { type: 'number' }, + { 
type: 'boolean' }, + { type: 'object' }, + { type: 'null' } + ] + } + }, + { type: 'object', additionalProperties: true } + ] +}; + +const searchRequestSchema = { + type: 'object', + additionalProperties: false, + required: ['query'], + properties: { + query: { type: 'string', minLength: 1 }, + repoPath: { type: 'string' }, + repo: { type: 'string' }, + output: { type: 'string', enum: ['compact', 'json', 'full'] }, + mode: { type: 'string', enum: ['code', 'prose', 'records', 'both', 'all', 'extracted-prose'] }, + backend: { type: 'string', enum: ['auto', 'memory', 'sqlite', 'sqlite-fts', 'lmdb'] }, + ann: { type: 'boolean' }, + top: { type: 'integer', minimum: 0 }, + context: { type: 'integer', minimum: 0 }, + type: { type: 'string' }, + author: { type: 'string' }, + import: { type: 'string' }, + calls: { type: 'string' }, + uses: { type: 'string' }, + signature: { type: 'string' }, + param: { type: 'string' }, + decorator: { type: 'string' }, + inferredType: { type: 'string' }, + returnType: { type: 'string' }, + throws: { type: 'string' }, + reads: { type: 'string' }, + writes: { type: 'string' }, + mutates: { type: 'string' }, + alias: { type: 'string' }, + awaits: { type: 'string' }, + risk: { type: 'string' }, + riskTag: { type: 'string' }, + riskSource: { type: 'string' }, + riskSink: { type: 'string' }, + riskCategory: { type: 'string' }, + riskFlow: { type: 'string' }, + branchesMin: { type: 'integer', minimum: 0 }, + loopsMin: { type: 'integer', minimum: 0 }, + breaksMin: { type: 'integer', minimum: 0 }, + continuesMin: { type: 'integer', minimum: 0 }, + churnMin: { type: 'integer', minimum: 0 }, + chunkAuthor: { type: 'string' }, + modifiedAfter: { type: 'string' }, + modifiedSince: { type: 'integer', minimum: 0 }, + visibility: { type: 'string' }, + extends: { type: 'string' }, + lint: { type: 'boolean' }, + async: { type: 'boolean' }, + generator: { type: 'boolean' }, + returns: { type: 'boolean' }, + branch: { type: 'string' }, + lang: { type: 'string' }, + case: { type: 'boolean' }, + caseFile: { type: 'boolean' }, + caseTokens: { type: 'boolean' }, + path: stringListSchema, + file: stringListSchema, + ext: stringListSchema, + meta: metaSchema, + metaJson: { + type: ['string', 'object', 'array', 'number', 'boolean', 'null'] + } + } +}; + +const formatValidationErrors = (errors = []) => errors.map((err) => { + const path = err.instancePath || '#'; + if (err.keyword === 'additionalProperties') { + return `${path} has unknown field "${err.params?.additionalProperty}"`; + } + if (err.keyword === 'required') { + return `${path} missing required field "${err.params?.missingProperty}"`; + } + return `${path} ${err.message}`.trim(); +}); + +/** + * Normalize meta filters into CLI-friendly key/value strings. + * @param {any} meta + * @returns {string[]|null} + */ +export const normalizeMetaFilters = (meta) => { + if (!meta) return null; + if (Array.isArray(meta)) { + const entries = meta.flatMap((entry) => { + if (entry == null) return []; + if (typeof entry === 'string') return [entry]; + if (typeof entry === 'object') { + return Object.entries(entry).map(([key, value]) => + value == null || value === '' ? String(key) : `${key}=${value}` + ); + } + return [String(entry)]; + }); + return entries.length ? entries : null; + } + if (typeof meta === 'object') { + const entries = Object.entries(meta).map(([key, value]) => + value == null || value === '' ? String(key) : `${key}=${value}` + ); + return entries.length ? 
entries : null; + } + return [String(meta)]; +}; + +export const createSearchValidator = () => { + const ajv = new Ajv({ allErrors: true, strict: false }); + const validateSearchRequest = ajv.compile(searchRequestSchema); + return (payload) => { + const valid = validateSearchRequest(payload); + if (valid) return { ok: true }; + return { ok: false, errors: formatValidationErrors(validateSearchRequest.errors || []) }; + }; +}; diff --git a/tools/assemble-pieces.js b/tools/assemble-pieces.js new file mode 100644 index 000000000..4d8ba5600 --- /dev/null +++ b/tools/assemble-pieces.js @@ -0,0 +1,72 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { assembleIndexPieces } from '../src/index/build/piece-assembly.js'; +import { loadUserConfig, resolveRepoRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'assemble-pieces', + argv: ['node', 'assemble-pieces.js', ...process.argv.slice(2)], + options: { + input: { type: 'string', array: true, describe: 'Input index directory (repeatable)' }, + inputs: { type: 'string', describe: 'Comma-separated input index directories' }, + out: { type: 'string', demandOption: true, describe: 'Output index directory' }, + mode: { type: 'string', default: 'code' }, + repo: { type: 'string' }, + stage: { type: 'string' }, + force: { type: 'boolean', default: false }, + sort: { + type: 'boolean', + default: true, + describe: 'Sort input directories for deterministic assembly (disable with --no-sort)' + } + } +}).parse(); + +const inputDirs = []; +if (Array.isArray(argv.input)) { + inputDirs.push(...argv.input.filter(Boolean)); +} +if (typeof argv.inputs === 'string') { + inputDirs.push(...argv.inputs.split(',').map((entry) => entry.trim()).filter(Boolean)); +} +if (!inputDirs.length) { + console.error('assemble-pieces requires at least one --input or --inputs entry.'); + process.exit(1); +} + +const outDir = path.resolve(argv.out); +if (fsSync.existsSync(outDir) && !argv.force) { + const entries = fsSync.readdirSync(outDir); + if (entries.length) { + console.error(`assemble-pieces output directory is not empty: ${outDir}`); + console.error('Pass --force to reuse the directory.'); + process.exit(1); + } +} +await fs.mkdir(outDir, { recursive: true }); + +const repoRoot = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(repoRoot); +const mode = argv.mode || 'code'; + +const resolvedInputs = inputDirs.map((dir) => path.resolve(dir)); +if (argv.sort !== false) { + resolvedInputs.sort((a, b) => (a < b ? -1 : (a > b ? 
1 : 0))); +} + +try { + await assembleIndexPieces({ + inputs: resolvedInputs, + outDir, + root: repoRoot, + mode, + userConfig, + stage: argv.stage + }); +} catch (err) { + console.error(err?.message || err); + process.exit(1); +} diff --git a/tools/bench-dict-seg.js b/tools/bench-dict-seg.js new file mode 100644 index 000000000..045b4b610 --- /dev/null +++ b/tools/bench-dict-seg.js @@ -0,0 +1,187 @@ +#!/usr/bin/env node +import fsSync from 'node:fs'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { splitWordsWithDict } from '../src/shared/tokenize.js'; + +const argv = createCli({ + scriptName: 'bench-dict-seg', + options: { + json: { type: 'boolean', default: false }, + dict: { type: 'string' }, + tokens: { type: 'string' }, + fixture: { type: 'string' }, + out: { type: 'string' }, + sample: { type: 'number' }, + 'dp-max': { type: 'number' } + } +}).parse(); + +const root = process.cwd(); +const fixtureArg = typeof argv.fixture === 'string' ? argv.fixture.trim() : ''; +const fixtureDir = fixtureArg + ? (path.isAbsolute(fixtureArg) + ? path.resolve(fixtureArg) + : path.join(root, 'tests', 'fixtures', fixtureArg)) + : null; +const dictPath = fixtureDir + ? path.join(fixtureDir, 'words.txt') + : path.resolve(argv.dict || path.join(root, 'tests', 'fixtures', 'dicts', 'words.txt')); +const tokensPath = fixtureDir + ? path.join(fixtureDir, 'tokens.txt') + : (argv.tokens ? path.resolve(argv.tokens) : null); +const fixtureLabel = fixtureDir ? path.basename(fixtureDir) : 'default'; +const sampleLimit = Number.isFinite(Number(argv.sample)) + ? Math.max(10, Number(argv.sample)) + : 300; +const dpMaxTokenLength = Number.isFinite(Number(argv['dp-max'])) + ? Math.max(4, Math.floor(Number(argv['dp-max']))) + : 32; + +function camelize(a, b) { + if (!a) return b || ''; + if (!b) return a; + return `${a}${b[0].toUpperCase()}${b.slice(1)}`; +} + +function buildTokenSamples(words, limit) { + const base = words.slice(0, Math.min(words.length, 120)); + const tokens = new Set(); + for (const word of base) tokens.add(word); + for (let i = 0; i < base.length; i += 1) { + const a = base[i]; + const b = base[(i + 1) % base.length]; + const c = base[(i + 2) % base.length]; + tokens.add(`${a}${b}`); + tokens.add(camelize(a, b)); + tokens.add(`${a}_${b}`); + tokens.add(`${a}-${c}`); + } + const extras = [ + 'HTTPRequest', + 'getUserProfile', + 'userIDLookup', + 'kubernetesClusterConfig', + 'postgresConnectionString', + 'lruCacheStats', + 'xkcdToken', + 'xyzzynotaword', + 'foo2bar', + 'ZalgoMode' + ]; + extras.forEach((token) => tokens.add(token)); + return Array.from(tokens).slice(0, limit); +} + +async function loadTokens(words) { + if (tokensPath) { + try { + const raw = await fs.readFile(tokensPath, 'utf8'); + return raw + .split(/\r?\n/) + .map((line) => line.trim()) + .filter(Boolean) + .slice(0, sampleLimit); + } catch { + // Fall back to generated samples when fixture tokens are missing. 
+ } + } + return buildTokenSamples(words, sampleLimit); +} + +function measure(tokens, dict, segmentation) { + const start = Date.now(); + let totalSegments = 0; + let totalChars = 0; + let dictChars = 0; + let unknownChars = 0; + let dictSegments = 0; + let unknownSegments = 0; + for (const token of tokens) { + if (!token) continue; + totalChars += token.length; + const segments = splitWordsWithDict(token.toLowerCase(), dict, { + segmentation, + dpMaxTokenLength + }); + totalSegments += segments.length; + for (const seg of segments) { + if (dict.has(seg)) { + dictChars += seg.length; + dictSegments += 1; + } else { + unknownChars += seg.length; + unknownSegments += 1; + } + } + } + const durationMs = Date.now() - start; + const coverage = totalChars > 0 ? dictChars / totalChars : 0; + return { + segments: totalSegments, + avgSegmentsPerToken: tokens.length ? totalSegments / tokens.length : 0, + dictSegments, + unknownSegments, + dictChars, + unknownChars, + coverage, + durationMs + }; +} + +let dictRaw = ''; +try { + dictRaw = await fs.readFile(dictPath, 'utf8'); +} catch (err) { + console.error(`Failed to read dictionary at ${dictPath}`); + if (err?.message) console.error(err.message); + process.exit(1); +} + +const dictWords = new Set( + dictRaw + .split(/\r?\n/) + .map((line) => line.trim().toLowerCase()) + .filter(Boolean) +); + +const tokens = await loadTokens(Array.from(dictWords)); +const greedy = measure(tokens, dictWords, 'greedy'); +const dp = measure(tokens, dictWords, 'dp'); +const aho = measure(tokens, dictWords, 'aho'); + +const summary = { + generatedAt: new Date().toISOString(), + dictPath, + tokensPath: tokensPath && fsSync.existsSync(tokensPath) ? tokensPath : null, + fixture: fixtureLabel, + dictWords: dictWords.size, + tokens: tokens.length, + dpMaxTokenLength, + strategies: { + greedy, + dp, + aho + } +}; + +if (argv.out) { + const outPath = path.resolve(argv.out); + await fs.writeFile(outPath, JSON.stringify(summary, null, 2)); +} + +if (argv.json) { + console.log(JSON.stringify(summary, null, 2)); +} else { + console.log('Dictionary segmentation benchmark'); + console.log(`- Dict: ${dictPath}`); + console.log(`- Words: ${dictWords.size}`); + console.log(`- Tokens: ${tokens.length}`); + console.log(`- dpMaxTokenLength: ${dpMaxTokenLength}`); + for (const [name, stats] of Object.entries(summary.strategies)) { + console.log(`- ${name} avg segments: ${stats.avgSegmentsPerToken.toFixed(2)}`); + console.log(`- ${name} coverage: ${(stats.coverage * 100).toFixed(1)}%`); + console.log(`- ${name} duration: ${stats.durationMs} ms`); + } +} diff --git a/tools/bench-language-matrix.js b/tools/bench-language-matrix.js new file mode 100644 index 000000000..56344a8bb --- /dev/null +++ b/tools/bench-language-matrix.js @@ -0,0 +1,224 @@ +#!/usr/bin/env node +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { execa } from 'execa'; +import { createCli } from '../src/shared/cli.js'; +import { BENCH_OPTIONS, mergeCliOptions, validateBenchArgs } from '../src/shared/cli-options.js'; +import { resolveToolRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'bench-language-matrix', + options: mergeCliOptions( + BENCH_OPTIONS, + { + tier: { type: 'string', default: 'typical' }, + backend: { type: 'string' }, + backends: { type: 'string' }, + 'ann-modes': { type: 'string' }, + 'fts-profiles': { type: 'string' }, + config: { type: 'string' }, + root: { type: 'string' }, + 'cache-root': { type: 'string' }, + 'cache-suffix': { type: 'string' }, 
+ results: { type: 'string' }, + 'log-dir': { type: 'string' }, + 'out-dir': { type: 'string' }, + language: { type: 'string' }, + languages: { type: 'string' }, + repos: { type: 'string' }, + only: { type: 'string' }, + 'fts-weights': { type: 'string' }, + 'dry-run': { type: 'boolean', default: false }, + 'fail-fast': { type: 'boolean', default: false }, + 'lock-mode': { type: 'string' }, + 'lock-wait-ms': { type: 'number' }, + 'lock-stale-ms': { type: 'number' } + } + ) +}).parse(); +validateBenchArgs(argv); + +const scriptRoot = resolveToolRoot(); +const benchScript = path.join(scriptRoot, 'tools', 'bench-language-repos.js'); +const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); +const resultsRoot = path.resolve(argv.results || path.join(scriptRoot, 'benchmarks', 'results')); +const runRoot = path.resolve(argv['out-dir'] || path.join(resultsRoot, 'matrix', timestamp)); +const logRoot = path.resolve(argv['log-dir'] || path.join(runRoot, 'logs')); +const outRoot = path.join(runRoot, 'runs'); + +const ALL_BACKENDS = ['sqlite-fts', 'sqlite', 'memory']; +const DEFAULT_ANN_MODES = ['auto', 'on', 'off']; +const DEFAULT_FTS_PROFILES = ['balanced', 'headline', 'name']; + +const parseList = (value) => { + if (!value) return []; + return String(value) + .split(',') + .map((entry) => entry.trim()) + .filter(Boolean); +}; + +const normalizeBackend = (raw) => { + const value = String(raw || '').toLowerCase(); + if (value === 'fts') return 'sqlite-fts'; + return value; +}; + +const resolveBackends = () => { + const raw = argv.backends || argv.backend || ''; + const list = parseList(raw).map(normalizeBackend).filter(Boolean); + if (!list.length || list.includes('all')) return ALL_BACKENDS.slice(); + return list; +}; + +const resolveAnnModes = () => { + const list = parseList(argv['ann-modes']).map((entry) => entry.toLowerCase()); + return list.length ? list : DEFAULT_ANN_MODES.slice(); +}; + +const resolveFtsProfiles = () => { + const list = parseList(argv['fts-profiles']).map((entry) => entry.toLowerCase()); + return list.length ? list : DEFAULT_FTS_PROFILES.slice(); +}; + +const toSafeName = (value) => String(value || '') + .replace(/[^a-z0-9-_]+/gi, '_') + .replace(/^_+|_+$/g, '') + .toLowerCase(); + +const buildConfigs = () => { + const configs = []; + const backends = resolveBackends(); + const annModes = resolveAnnModes(); + const ftsProfiles = resolveFtsProfiles(); + for (const backend of backends) { + const usesFts = backend === 'sqlite-fts' || backend === 'fts'; + const profiles = usesFts ? 
ftsProfiles : [null];
+    for (const annMode of annModes) {
+      for (const profile of profiles) {
+        const idParts = [backend, annMode];
+        if (profile) idParts.push(profile);
+        const id = toSafeName(idParts.join('-'));
+        configs.push({
+          id,
+          backend,
+          annMode,
+          ftsProfile: profile
+        });
+      }
+    }
+  }
+  return configs;
+};
+
+const appendArgs = (args, flag, value) => {
+  if (value === undefined || value === null || value === '') return;
+  args.push(flag, String(value));
+};
+
+const configToArgs = (config, outFile, logFile) => {
+  const args = [benchScript];
+  const tierArg = argv.tier || 'typical';
+  appendArgs(args, '--tier', tierArg);
+  appendArgs(args, '--backend', config.backend);
+  appendArgs(args, '--out', outFile);
+  appendArgs(args, '--log', logFile);
+
+  if (config.annMode === 'on') args.push('--ann');
+  if (config.annMode === 'off') args.push('--no-ann');
+  if (config.ftsProfile) appendArgs(args, '--fts-profile', config.ftsProfile);
+
+  if (argv.build) args.push('--build');
+  if (argv['build-index']) args.push('--build-index');
+  if (argv['build-sqlite']) args.push('--build-sqlite');
+  if (argv.incremental) args.push('--incremental');
+  if (argv['stub-embeddings']) args.push('--stub-embeddings');
+  if (argv['real-embeddings']) args.push('--real-embeddings');
+  if (argv['dry-run']) args.push('--dry-run');
+
+  appendArgs(args, '--config', argv.config);
+  appendArgs(args, '--root', argv.root);
+  appendArgs(args, '--cache-root', argv['cache-root']);
+  appendArgs(args, '--cache-suffix', argv['cache-suffix']);
+  appendArgs(args, '--results', argv.results);
+  appendArgs(args, '--index-profile', argv['index-profile']);
+  if (argv['no-index-profile']) args.push('--no-index-profile');
+  appendArgs(args, '--language', argv.language);
+  appendArgs(args, '--languages', argv.languages);
+  appendArgs(args, '--repos', argv.repos);
+  appendArgs(args, '--only', argv.only);
+  appendArgs(args, '--queries', argv.queries);
+  appendArgs(args, '--top', argv.top);
+  appendArgs(args, '--limit', argv.limit);
+  appendArgs(args, '--bm25-k1', argv['bm25-k1']);
+  appendArgs(args, '--bm25-b', argv['bm25-b']);
+  appendArgs(args, '--fts-weights', argv['fts-weights']);
+  appendArgs(args, '--threads', argv.threads);
+  appendArgs(args, '--heap-mb', argv['heap-mb']);
+  appendArgs(args, '--lock-mode', argv['lock-mode']);
+  appendArgs(args, '--lock-wait-ms', argv['lock-wait-ms']);
+  appendArgs(args, '--lock-stale-ms', argv['lock-stale-ms']);
+
+  return args;
+};
+
+async function main() {
+  await fsPromises.mkdir(logRoot, { recursive: true });
+  await fsPromises.mkdir(outRoot, { recursive: true });
+
+  const configs = buildConfigs();
+  if (!configs.length) {
+    console.error('No benchmark configurations resolved.');
+    process.exit(1);
+  }
+
+  const results = [];
+  for (const config of configs) {
+    const label = `${config.backend}/${config.annMode}${config.ftsProfile ? `/${config.ftsProfile}` : ''}`;
+    const outFile = path.join(outRoot, `${config.id}.json`);
+    const logFile = path.join(logRoot, `${config.id}.log`);
+    const args = configToArgs(config, outFile, logFile);
+
+    console.log(`\n[bench-matrix] ${label}`);
+    console.log(`node ${args.map((arg) => (arg.includes(' ') ? `"${arg}"` : arg)).join(' ')}`);
+
+    if (argv['dry-run']) {
+      results.push({ ...config, outFile, logFile, status: 'dry-run' });
+      continue;
+    }
+
+    try {
+      const child = execa(process.execPath, args, { stdio: 'inherit' });
+      await child;
+      results.push({ ...config, outFile, logFile, status: 'ok' });
+    } catch (err) {
+      results.push({
+        ...config,
+        outFile,
+        logFile,
+        status: 'failed',
+        exitCode: err?.exitCode ?? null,
+        error: err?.message || String(err)
+      });
+      if (argv['fail-fast']) break;
+    }
+  }
+
+  const summary = {
+    generatedAt: new Date().toISOString(),
+    runRoot,
+    outRoot,
+    logRoot,
+    tier: argv.tier,
+    results
+  };
+  const summaryPath = path.join(runRoot, 'matrix.json');
+  await fsPromises.writeFile(summaryPath, JSON.stringify(summary, null, 2));
+  console.log(`\n[bench-matrix] Summary written to ${summaryPath}`);
+}
+
+main().catch((err) => {
+  console.error(err?.message || err);
+  process.exit(1);
+});
diff --git a/tools/bench-language-repos.js b/tools/bench-language-repos.js
new file mode 100644
index 000000000..1184c9f89
--- /dev/null
+++ b/tools/bench-language-repos.js
@@ -0,0 +1,519 @@
+#!/usr/bin/env node
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { getEnvConfig } from '../src/shared/env.js';
+import { getRuntimeConfig, loadUserConfig, resolveRuntimeEnv } from './dict-utils.js';
+import { parseBenchLanguageArgs } from './bench/language/cli.js';
+import { loadBenchConfig } from './bench/language/config.js';
+import { checkIndexLock, formatLockDetail } from './bench/language/locks.js';
+import {
+  ensureLongPathsSupport,
+  needsIndexArtifacts,
+  needsSqliteArtifacts,
+  resolveCloneTool,
+  resolveRepoCacheRoot,
+  resolveRepoDir
+} from './bench/language/repos.js';
+import { createProcessRunner } from './bench/language/process.js';
+import {
+  buildLineStats,
+  formatDuration,
+  formatGb,
+  formatMetricSummary,
+  getRecommendedHeapMb,
+  stripMaxOldSpaceFlag,
+  validateEncodingFixtures
+} from './bench/language/metrics.js';
+import { buildReportOutput, printSummary } from './bench/language/report.js';
+import { createProgressState } from './bench/language/progress/state.js';
+import { createProgressRenderer } from './bench/language/progress/render.js';
+
+const parseList = (value) => {
+  if (!value) return [];
+  return String(value)
+    .split(',')
+    .map((entry) => entry.trim())
+    .filter(Boolean);
+};
+
+const {
+  argv,
+  scriptRoot,
+  configPath,
+  reposRoot,
+  cacheRoot,
+  resultsRoot,
+  logPath,
+  cloneEnabled,
+  dryRun,
+  quietMode,
+  interactive,
+  colorEnabled,
+  logWindowSize,
+  lockMode,
+  lockWaitMs,
+  lockStaleMs,
+  backendList,
+  wantsSqlite,
+  indexProfile,
+  suppressProfileEnv
+} = parseBenchLanguageArgs();
+
+const baseEnv = { ...process.env };
+const envConfig = getEnvConfig();
+const heapArgRaw = argv['heap-mb'];
+const heapArg = Number.isFinite(Number(heapArgRaw)) ?
Math.floor(Number(heapArgRaw)) : null; +const heapRecommendation = getRecommendedHeapMb(); +let heapLogged = false; + +let logStream = null; +const initLog = () => { + if (logStream) return; + fs.mkdirSync(path.dirname(logPath), { recursive: true }); + logStream = fs.createWriteStream(logPath, { flags: 'a' }); + logStream.write(`\n=== Bench run ${new Date().toISOString()} ===\n`); + logStream.write(`Config: ${configPath}\n`); + logStream.write(`Repos: ${reposRoot}\n`); + logStream.write(`Cache: ${cacheRoot}\n`); + logStream.write(`Results: ${resultsRoot}\n`); +}; + +const writeLog = (line) => { + if (!logStream) initLog(); + if (!logStream) return; + logStream.write(`${line}\n`); +}; + +const writeLogSync = (line) => { + try { + fs.mkdirSync(path.dirname(logPath), { recursive: true }); + fs.appendFileSync(logPath, `${line}\n`); + } catch {} +}; + +const progressState = createProgressState({ logWindowSize }); +let processRunner = null; +const progress = createProgressRenderer({ + state: progressState, + interactive, + quietMode, + colorEnabled, + writeLog, + getActiveLabel: () => (processRunner ? processRunner.getActiveLabel() : '') +}); +processRunner = createProcessRunner({ + appendLog: progress.appendLog, + writeLog, + writeLogSync, + logHistory: progressState.logHistory, + logPath +}); + +process.on('exit', (code) => { + processRunner.logExit('exit', code); + if (logStream) logStream.end(); +}); +process.on('SIGINT', () => { + writeLogSync('[signal] SIGINT received'); + const active = processRunner.getActiveChild(); + if (active) { + writeLogSync(`[signal] terminating ${processRunner.getActiveLabel()}`); + processRunner.killProcessTree(active.pid); + } + processRunner.logExit('SIGINT', 130); + process.exit(130); +}); +process.on('SIGTERM', () => { + writeLogSync('[signal] SIGTERM received'); + const active = processRunner.getActiveChild(); + if (active) { + writeLogSync(`[signal] terminating ${processRunner.getActiveLabel()}`); + processRunner.killProcessTree(active.pid); + } + processRunner.logExit('SIGTERM', 143); + process.exit(143); +}); +process.on('uncaughtException', (err) => { + writeLogSync(`[error] uncaughtException: ${err?.stack || err}`); + processRunner.logExit('uncaughtException', 1); + process.exit(1); +}); +process.on('unhandledRejection', (err) => { + writeLogSync(`[error] unhandledRejection: ${err?.stack || err}`); + processRunner.logExit('unhandledRejection', 1); + process.exit(1); +}); + +const config = loadBenchConfig(configPath); +await validateEncodingFixtures(scriptRoot); +const languageFilter = parseList(argv.languages || argv.language).map((entry) => entry.toLowerCase()); +let tierFilter = parseList(argv.tier).map((entry) => entry.toLowerCase()); +const repoFilter = parseList(argv.only || argv.repos).map((entry) => entry.toLowerCase()); +if (!tierFilter.length && Array.isArray(argv._) && argv._.length) { + const positionalTiers = argv._ + .map((entry) => String(entry).toLowerCase()) + .filter((entry) => entry === 'large' || entry === 'typical' || entry === 'small' || entry === 'tiny'); + if (positionalTiers.length) tierFilter = positionalTiers; +} + +const tasks = []; +for (const [language, entry] of Object.entries(config)) { + if (languageFilter.length && !languageFilter.includes(language.toLowerCase())) continue; + const queriesPath = argv.queries + ? 
path.resolve(argv.queries) + : path.resolve(scriptRoot, entry.queries || ''); + if (!fs.existsSync(queriesPath)) { + console.error(`Missing queries file: ${queriesPath}`); + process.exit(1); + } + const repoGroups = entry.repos || {}; + for (const [tier, repos] of Object.entries(repoGroups)) { + if (tierFilter.length && !tierFilter.includes(tier.toLowerCase())) continue; + for (const repo of repos) { + if (repoFilter.length && !repoFilter.includes(repo.toLowerCase())) continue; + tasks.push({ language, label: entry.label || language, tier, repo, queriesPath }); + } + } +} + +if (argv.list) { + const payload = { + config: configPath, + repoRoot: reposRoot, + cacheRoot, + resultsRoot, + languages: Object.keys(config), + tasks + }; + if (argv.json) { + console.log(JSON.stringify(payload, null, 2)); + } else { + console.log('Benchmark targets'); + console.log(`- config: ${configPath}`); + console.log(`- repos: ${reposRoot}`); + console.log(`- cache: ${cacheRoot}`); + console.log(`- results: ${resultsRoot}`); + for (const task of tasks) { + console.log(`- ${task.language} ${task.tier} ${task.repo}`); + } + } + process.exit(0); +} + +if (!tasks.length) { + console.error('No benchmark targets match the requested filters.'); + process.exit(1); +} + +let cloneTool = null; +if (cloneEnabled && !dryRun) { + ensureLongPathsSupport(); + cloneTool = resolveCloneTool(); + if (!quietMode) console.log(`Clone tool: ${cloneTool.label}`); +} +await fsPromises.mkdir(reposRoot, { recursive: true }); +await fsPromises.mkdir(resultsRoot, { recursive: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +initLog(); +writeLog(`Clone tool: ${cloneTool ? cloneTool.label : 'disabled'}`); + +const benchScript = path.join(scriptRoot, 'tests', 'bench.js'); +const results = []; +const startTime = Date.now(); +let completed = 0; + +progress.updateMetrics('Metrics: pending'); +progress.updateProgress(`Progress: 0/${tasks.length} | elapsed ${formatDuration(0)}`); + +for (const task of tasks) { + const repoPath = resolveRepoDir({ reposRoot, repo: task.repo, language: task.language }); + await fsPromises.mkdir(path.dirname(repoPath), { recursive: true }); + const repoLabel = `${task.language}/${task.repo}`; + const phaseLabel = `repo ${repoLabel} (${task.tier})`; + progressState.currentRepoLabel = repoLabel; + progress.resetBuildProgress(repoLabel); + + if (!fs.existsSync(repoPath)) { + if (!cloneEnabled && !dryRun) { + console.error(`Missing repo ${task.repo} at ${repoPath}. Re-run with --clone.`); + process.exit(1); + } + progress.updateProgress(`Progress: ${completed}/${tasks.length} | cloning ${phaseLabel} | elapsed ${formatDuration(Date.now() - startTime)}`); + if (!dryRun && cloneEnabled && cloneTool) { + const args = cloneTool.buildArgs(task.repo, repoPath); + const cloneResult = await processRunner.runProcess(`clone ${task.repo}`, cloneTool.label, args, { + env: { ...process.env, GIT_TERMINAL_PROMPT: '0' }, + continueOnError: true + }); + if (!cloneResult.ok) { + progress.appendLog(`[error] Clone failed for ${repoLabel}; continuing to next repo.`); + completed += 1; + progress.updateProgress(`Progress: ${completed}/${tasks.length} | failed ${phaseLabel} | elapsed ${formatDuration(Date.now() - startTime)}`); + progress.updateMetrics('Metrics: failed (clone)'); + results.push({ + ...task, + repoPath, + outFile: null, + summary: null, + failed: true, + failureReason: 'clone', + failureCode: cloneResult.code ?? 
null + }); + continue; + } + } + } + + const repoUserConfig = loadUserConfig( + repoPath, + indexProfile ? { profile: indexProfile } : {} + ); + const repoRuntimeConfig = getRuntimeConfig(repoPath, repoUserConfig); + let baseNodeOptions = baseEnv.NODE_OPTIONS || ''; + if (Number.isFinite(heapArg) && heapArg > 0) { + baseNodeOptions = stripMaxOldSpaceFlag(baseNodeOptions); + } + const hasHeapFlag = baseNodeOptions.includes('--max-old-space-size'); + let heapOverride = null; + if (Number.isFinite(heapArg) && heapArg > 0) { + heapOverride = heapArg; + if (!heapLogged) { + progress.appendLog(`[heap] Using ${formatGb(heapOverride)} (${heapOverride} MB) from --heap-mb.`); + heapLogged = true; + } + } else if ( + !Number.isFinite(repoRuntimeConfig.maxOldSpaceMb) + && !envConfig.maxOldSpaceMb + && !hasHeapFlag + ) { + heapOverride = heapRecommendation.recommendedMb; + if (!heapLogged) { + progress.appendLog( + `[auto-heap] Using ${formatGb(heapOverride)} (${heapOverride} MB) for Node heap. ` + + 'Override with --heap-mb or PAIROFCLEATS_MAX_OLD_SPACE_MB.' + ); + heapLogged = true; + } + } + const runtimeConfigForRun = heapOverride + ? { ...repoRuntimeConfig, maxOldSpaceMb: heapOverride } + : repoRuntimeConfig; + const baseEnvForRun = { ...baseEnv }; + if (baseNodeOptions) { + baseEnvForRun.NODE_OPTIONS = baseNodeOptions; + } else { + delete baseEnvForRun.NODE_OPTIONS; + } + const repoEnvBase = resolveRuntimeEnv(runtimeConfigForRun, baseEnvForRun); + if (suppressProfileEnv && repoEnvBase.PAIROFCLEATS_PROFILE) { + delete repoEnvBase.PAIROFCLEATS_PROFILE; + } + if (heapOverride) { + repoEnvBase.PAIROFCLEATS_MAX_OLD_SPACE_MB = String(heapOverride); + } + if (indexProfile) { + repoEnvBase.PAIROFCLEATS_PROFILE = indexProfile; + } + + const outDir = path.join(resultsRoot, task.language); + const outFile = path.join(outDir, `${task.repo.replace('/', '__')}.json`); + await fsPromises.mkdir(outDir, { recursive: true }); + + const repoCacheRoot = resolveRepoCacheRoot({ repoPath, cacheRoot }); + const wantsMemory = backendList.includes('memory'); + const missingIndex = needsIndexArtifacts(repoPath); + const missingSqlite = wantsSqlite && needsSqliteArtifacts(repoPath); + let autoBuildIndex = false; + let autoBuildSqlite = false; + const buildIndexRequested = argv.build || argv['build-index']; + const buildSqliteRequested = argv.build || argv['build-sqlite']; + if (buildSqliteRequested && !buildIndexRequested && missingIndex) { + autoBuildIndex = true; + progress.appendLog('[auto-build] sqlite build requires index artifacts; enabling build-index.'); + } + if (!argv.build && !argv['build-index'] && !argv['build-sqlite']) { + if (missingIndex && wantsMemory) autoBuildIndex = true; + if (missingSqlite) autoBuildSqlite = true; + if (autoBuildSqlite && missingIndex) autoBuildIndex = true; + if (autoBuildIndex || autoBuildSqlite) { + progress.appendLog( + `[auto-build] missing artifacts${autoBuildIndex ? ' index' : ''}${autoBuildSqlite ? 
' sqlite' : ''}; enabling build.` + ); + } + } + + const shouldBuildIndex = argv.build || argv['build-index'] || autoBuildIndex; + if (shouldBuildIndex && !dryRun) { + try { + progress.appendLog(`[metrics] Collecting line counts for ${repoLabel}...`); + const stats = await buildLineStats(repoPath, repoUserConfig); + progressState.build.lineTotals = stats.totals; + progressState.build.linesByFile = stats.linesByFile; + progress.appendLog( + `[metrics] Line totals: code=${stats.totals.code.toLocaleString()} prose=${stats.totals.prose.toLocaleString()}` + ); + } catch (err) { + progress.appendLog(`[metrics] Line counts unavailable: ${err?.message || err}`); + } + } + + const lockCheck = await checkIndexLock({ + repoCacheRoot, + repoLabel, + lockMode, + lockWaitMs, + lockStaleMs, + onLog: progress.appendLog + }); + if (!lockCheck.ok) { + const detail = formatLockDetail(lockCheck.detail); + const message = `Skipping ${repoLabel}: index lock held ${detail}`.trim(); + progress.appendLog(`[lock] ${message}`); + if (!quietMode) console.error(message); + completed += 1; + progress.updateProgress(`Progress: ${completed}/${tasks.length} | skipped ${phaseLabel} | elapsed ${formatDuration(Date.now() - startTime)}`); + progress.updateMetrics('Metrics: skipped (lock)'); + results.push({ + ...task, + repoPath, + outFile, + summary: null, + skipped: true, + skipReason: 'lock', + lock: lockCheck.detail || null + }); + continue; + } + + const benchArgs = [ + benchScript, + '--repo', + repoPath, + '--queries', + task.queriesPath, + '--write-report', + '--out', + outFile + ]; + if (indexProfile) benchArgs.push('--index-profile', indexProfile); + benchArgs.push('--real-embeddings'); + if (argv.build) { + benchArgs.push('--build'); + } else { + if (argv['build-index'] || autoBuildIndex) benchArgs.push('--build-index'); + if (argv['build-sqlite'] || autoBuildSqlite) benchArgs.push('--build-sqlite'); + } + if (argv.incremental) benchArgs.push('--incremental'); + if (argv['stub-embeddings']) { + progress.appendLog('[bench] Stub embeddings requested; ignored for heavy language benchmarks.'); + } + if (argv.ann) benchArgs.push('--ann'); + if (argv['no-ann']) benchArgs.push('--no-ann'); + if (argv.backend) benchArgs.push('--backend', String(argv.backend)); + if (argv.top) benchArgs.push('--top', String(argv.top)); + if (argv.limit) benchArgs.push('--limit', String(argv.limit)); + if (argv['bm25-k1']) benchArgs.push('--bm25-k1', String(argv['bm25-k1'])); + if (argv['bm25-b']) benchArgs.push('--bm25-b', String(argv['bm25-b'])); + if (argv['fts-profile']) benchArgs.push('--fts-profile', String(argv['fts-profile'])); + if (argv['fts-weights']) benchArgs.push('--fts-weights', String(argv['fts-weights'])); + if (argv.threads) benchArgs.push('--threads', String(argv.threads)); + if (argv['no-index-profile']) benchArgs.push('--no-index-profile'); + + progress.updateProgress(`Progress: ${completed}/${tasks.length} | bench ${phaseLabel} | elapsed ${formatDuration(Date.now() - startTime)}`); + + let summary = null; + if (dryRun) { + progress.appendLog(`[dry-run] node ${benchArgs.join(' ')}`); + } else { + const benchResult = await processRunner.runProcess(`bench ${repoLabel}`, process.execPath, benchArgs, { + cwd: scriptRoot, + env: { + ...repoEnvBase, + PAIROFCLEATS_CACHE_ROOT: cacheRoot, + PAIROFCLEATS_PROGRESS_FILES: '1', + PAIROFCLEATS_PROGRESS_LINES: '1', + ...(Number.isFinite(Number(argv.threads)) && Number(argv.threads) > 0 + ? 
{ PAIROFCLEATS_THREADS: String(argv.threads) } + : {}) + }, + continueOnError: true + }); + if (!benchResult.ok) { + progress.appendLog(`[error] Bench failed for ${repoLabel}; continuing to next repo.`); + completed += 1; + progress.updateProgress(`Progress: ${completed}/${tasks.length} | failed ${phaseLabel} | elapsed ${formatDuration(Date.now() - startTime)}`); + progress.updateMetrics('Metrics: failed (bench)'); + results.push({ + ...task, + repoPath, + outFile, + summary: null, + failed: true, + failureReason: 'bench', + failureCode: benchResult.code ?? null + }); + continue; + } + try { + const raw = await fsPromises.readFile(outFile, 'utf8'); + summary = JSON.parse(raw).summary || null; + } catch (err) { + progress.appendLog(`[error] Failed to read bench report for ${repoLabel}; continuing.`); + if (err && err.message) console.error(err.message); + completed += 1; + progress.updateProgress(`Progress: ${completed}/${tasks.length} | failed ${phaseLabel} | elapsed ${formatDuration(Date.now() - startTime)}`); + progress.updateMetrics('Metrics: failed (report)'); + results.push({ + ...task, + repoPath, + outFile, + summary: null, + failed: true, + failureReason: 'report', + failureCode: null + }); + continue; + } + } + + completed += 1; + progress.updateProgress(`Progress: ${completed}/${tasks.length} | finished ${phaseLabel} | elapsed ${formatDuration(Date.now() - startTime)}`); + progress.updateMetrics(formatMetricSummary(summary)); + + results.push({ ...task, repoPath, outFile, summary }); +} + +const output = buildReportOutput({ + configPath, + cacheRoot, + resultsRoot, + results, + config +}); + +if (!quietMode) { + if (interactive) { + progress.renderStatus(); + process.stdout.write('\n'); + } + console.log('\nGrouped summary'); + for (const [language, payload] of Object.entries(output.groupedSummary)) { + if (!payload.summary) continue; + printSummary(payload.label, payload.summary, payload.count, quietMode); + } + printSummary('Overall', output.overallSummary, results.length, quietMode); +} + +if (argv.out) { + const outPath = path.resolve(argv.out); + await fsPromises.mkdir(path.dirname(outPath), { recursive: true }); + await fsPromises.writeFile(outPath, JSON.stringify(output, null, 2)); +} + +if (argv.json) { + console.log(JSON.stringify(output, null, 2)); +} else { + console.log(`\nCompleted ${results.length} benchmark runs.`); + if (argv.out) console.log(`Summary written to ${path.resolve(argv.out)}`); +} diff --git a/tools/bench-query-generator.js b/tools/bench-query-generator.js new file mode 100644 index 000000000..c25dbe329 --- /dev/null +++ b/tools/bench-query-generator.js @@ -0,0 +1,101 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import seedrandom from 'seedrandom'; +import { createCli } from '../src/shared/cli.js'; +import { loadChunkMeta } from '../src/shared/artifact-io.js'; +import { sha1 } from '../src/shared/hash.js'; +import { getIndexDir, loadUserConfig } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'bench-query-generator', + options: { + repo: { type: 'string' }, + mode: { type: 'string', default: 'code' }, + count: { type: 'number', default: 50 }, + out: { type: 'string' }, + seed: { type: 'string' }, + json: { type: 'boolean', default: false }, + 'index-root': { type: 'string' } + } +}).parse(); + +const root = argv.repo ? 
path.resolve(argv.repo) : process.cwd(); +const userConfig = loadUserConfig(root); +const mode = String(argv.mode || 'code').toLowerCase(); +const indexRoot = argv['index-root'] ? path.resolve(argv['index-root']) : null; +const indexDir = getIndexDir(root, mode, userConfig, indexRoot ? { indexRoot } : {}); +const chunks = loadChunkMeta(indexDir); +if (!Array.isArray(chunks) || !chunks.length) { + console.error(`No chunk metadata found at ${indexDir}`); + process.exit(1); +} + +const count = Math.max(10, Math.min(200, Number(argv.count) || 50)); +const defaultSeed = sha1(`${indexDir}:${mode}:${chunks.length}`); +const seed = argv.seed || defaultSeed; +const rng = seedrandom(seed); + +const pick = (list) => list[Math.floor(rng() * list.length)]; +const uniq = (list) => Array.from(new Set(list.filter(Boolean))); +const tokensFromDoc = (text) => { + if (!text) return []; + return text.split(/\s+/).map((t) => t.replace(/[^\w-]/g, '')).filter((t) => t.length >= 4); +}; + +const names = uniq(chunks.map((c) => c.name)); +const signatures = uniq(chunks.map((c) => c.docmeta?.signature || c.metaV2?.signature)); +const kinds = uniq(chunks.map((c) => c.kind || c.metaV2?.kind)); +const returnTypes = uniq(chunks.map((c) => c.docmeta?.returnType || c.metaV2?.returns)); +const docs = uniq(chunks.flatMap((c) => tokensFromDoc(c.docmeta?.doc || c.metaV2?.doc))); +const riskTags = uniq(chunks.flatMap((c) => c.docmeta?.risk?.tags || c.metaV2?.risk?.tags || [])); + +const strategies = [ + () => (names.length ? `${pick(names)}` : null), + () => (signatures.length ? `${pick(signatures)} --signature` : null), + () => (names.length && kinds.length ? `${pick(names)} --kind ${pick(kinds)}` : null), + () => (returnTypes.length ? `widget --return-type ${pick(returnTypes)}` : null), + () => (docs.length ? `${pick(docs)}` : null), + () => (riskTags.length ? `exec --risk-tag ${pick(riskTags)}` : null) +]; + +const seen = new Set(); +const queries = []; +let attempts = 0; +while (queries.length < count && attempts < count * 20) { + attempts += 1; + const query = pick(strategies)(); + if (!query) continue; + if (seen.has(query)) continue; + seen.add(query); + queries.push(query); +} + +const payload = { + generatedAt: new Date().toISOString(), + seed, + indexDir, + mode, + count: queries.length, + queries +}; + +if (argv.json) { + const outPath = argv.out ? path.resolve(argv.out) : path.join(root, 'docs', 'benchmarks-queries.json'); + await fs.writeFile(outPath, JSON.stringify(payload, null, 2)); + console.log(`Wrote ${queries.length} queries to ${outPath}`); + process.exit(0); +} + +const outPath = argv.out + ? 
path.resolve(argv.out) + : path.join(root, 'benchmarks', 'queries', `generated-${mode}.txt`); +const lines = [ + '# Generated by bench-query-generator', + `# seed: ${seed}`, + `# mode: ${mode}`, + ...queries +]; +await fs.mkdir(path.dirname(outPath), { recursive: true }); +await fs.writeFile(outPath, lines.join('\n')); +console.log(`Wrote ${queries.length} queries to ${outPath}`); diff --git a/tools/bench-score-strategy.js b/tools/bench-score-strategy.js new file mode 100644 index 000000000..7a4c0ac02 --- /dev/null +++ b/tools/bench-score-strategy.js @@ -0,0 +1,233 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { execaSync } from 'execa'; +import { getIndexDir, loadUserConfig, resolveToolRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'bench-score-strategy', + options: { + build: { type: 'boolean', default: false }, + 'build-index': { type: 'boolean', default: false }, + json: { type: 'boolean', default: false }, + 'stub-embeddings': { type: 'boolean', default: false }, + 'in-place': { type: 'boolean', default: false }, + repo: { type: 'string' }, + queries: { type: 'string' }, + out: { type: 'string' }, + backend: { type: 'string' }, + top: { type: 'number' }, + limit: { type: 'number' } + } +}).parse(); + +const toolRoot = resolveToolRoot(); +const root = process.cwd(); +const repoSource = path.resolve( + argv.repo || path.join(root, 'tests', 'fixtures', 'sample') +); +const useInPlace = argv['in-place'] === true; +const tempRoot = path.join(root, 'tests', '.cache', 'bench-score-strategy'); +const workRoot = useInPlace ? repoSource : path.join(tempRoot, 'repo'); +const cacheRoot = path.join(tempRoot, 'cache'); +const queriesPath = path.resolve( + argv.queries || path.join(repoSource, 'queries.txt') +); +const backend = argv.backend ? String(argv.backend) : 'memory'; +const topN = Number.isFinite(Number(argv.top)) ? Math.max(1, Number(argv.top)) : 5; +const limit = Number.isFinite(Number(argv.limit)) ? Math.max(0, Number(argv.limit)) : 0; +const buildRequested = argv.build === true || argv['build-index'] === true; +const useStubEmbeddings = argv['stub-embeddings'] === true; + +function runCommand(label, args, env) { + const result = execaSync(process.execPath, args, { encoding: 'utf8', env, reject: false }); + if (result.exitCode !== 0) { + console.error(`Failed: ${label}`); + if (result.stderr) console.error(result.stderr.trim()); + process.exit(result.exitCode ?? 
1); + } + return result.stdout || ''; +} + +async function loadQueries(filePath) { + try { + const raw = await fsPromises.readFile(filePath, 'utf8'); + return raw + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line && !line.startsWith('#')); + } catch { + return []; + } +} + +async function ensureWorkRoot() { + if (useInPlace) return; + await fsPromises.rm(tempRoot, { recursive: true, force: true }); + await fsPromises.mkdir(workRoot, { recursive: true }); + await fsPromises.cp(repoSource, workRoot, { recursive: true }); +} + +function hasIndexArtifacts(repoRoot, userConfig) { + const codeDir = getIndexDir(repoRoot, 'code', userConfig); + const proseDir = getIndexDir(repoRoot, 'prose', userConfig); + const codeMeta = path.join(codeDir, 'chunk_meta.json'); + const proseMeta = path.join(proseDir, 'chunk_meta.json'); + return fs.existsSync(codeMeta) && fs.existsSync(proseMeta); +} + +async function writeBlendConfig(repoRoot, baseConfig, enabled) { + const next = { ...(baseConfig || {}) }; + const search = { ...(next.search || {}) }; + const existingBlend = search.scoreBlend || {}; + search.scoreBlend = { + ...existingBlend, + enabled, + sparseWeight: Number.isFinite(Number(existingBlend.sparseWeight)) + ? Number(existingBlend.sparseWeight) + : 1, + annWeight: Number.isFinite(Number(existingBlend.annWeight)) + ? Number(existingBlend.annWeight) + : 1 + }; + next.search = search; + const configPath = path.join(repoRoot, '.pairofcleats.json'); + await fsPromises.writeFile(configPath, JSON.stringify(next, null, 2)); + return configPath; +} + +async function restoreConfig(repoRoot, originalConfig, configExisted) { + const configPath = path.join(repoRoot, '.pairofcleats.json'); + if (configExisted) { + await fsPromises.writeFile(configPath, originalConfig); + } else if (fs.existsSync(configPath)) { + await fsPromises.rm(configPath, { force: true }); + } +} + +await ensureWorkRoot(); +await fsPromises.rm(cacheRoot, { recursive: true, force: true }); +await fsPromises.mkdir(cacheRoot, { recursive: true }); +process.env.PAIROFCLEATS_CACHE_ROOT = cacheRoot; +if (useStubEmbeddings) process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const envBase = { + ...process.env, + PAIROFCLEATS_CACHE_ROOT: cacheRoot +}; +if (useStubEmbeddings) envBase.PAIROFCLEATS_EMBEDDINGS = 'stub'; + +const queries = await loadQueries(queriesPath); +if (!queries.length) { + console.error(`No queries found at ${queriesPath}`); + process.exit(1); +} +const selectedQueries = limit > 0 ? queries.slice(0, limit) : queries; + +const configPath = path.join(workRoot, '.pairofcleats.json'); +const configExisted = fs.existsSync(configPath); +const originalConfig = configExisted ? 
await fsPromises.readFile(configPath, 'utf8') : null; +const userConfig = loadUserConfig(workRoot); +const indexExists = hasIndexArtifacts(workRoot, userConfig); +if (!indexExists || buildRequested) { + const buildArgs = [path.join(toolRoot, 'build_index.js'), '--repo', workRoot]; + if (useStubEmbeddings) buildArgs.push('--stub-embeddings'); + runCommand('build index', buildArgs, envBase); +} + +const strategies = [ + { id: 'sparse', annFlag: '--no-ann', blend: false }, + { id: 'ann-fallback', annFlag: '--ann', blend: false }, + { id: 'blend', annFlag: '--ann', blend: true } +]; + +function mean(values) { + if (!values.length) return 0; + return values.reduce((a, b) => a + b, 0) / values.length; +} + +function runSearch(query, annFlag) { + const args = [ + path.join(toolRoot, 'search.js'), + query, + '--repo', + workRoot, + '--json', + '--json-compact', + '--stats', + '--backend', + backend, + '--top', + String(topN), + annFlag + ]; + const output = runCommand('search', args, envBase); + return JSON.parse(output || '{}'); +} + +const summaries = {}; +for (const strategy of strategies) { + await writeBlendConfig(workRoot, userConfig, strategy.blend); + const latencies = []; + const resultCounts = []; + const topScores = []; + const scoreTypeCounts = {}; + let hits = 0; + for (const query of selectedQueries) { + const payload = runSearch(query, strategy.annFlag); + const stats = payload.stats || {}; + if (Number.isFinite(stats.elapsedMs)) latencies.push(stats.elapsedMs); + const results = [ + ...(Array.isArray(payload.code) ? payload.code : []), + ...(Array.isArray(payload.prose) ? payload.prose : []) + ]; + resultCounts.push(results.length); + if (results.length) hits += 1; + if (results.length && Number.isFinite(results[0].score)) { + topScores.push(results[0].score); + } + for (const item of results) { + const type = item.scoreType || 'none'; + scoreTypeCounts[type] = (scoreTypeCounts[type] || 0) + 1; + } + } + summaries[strategy.id] = { + queries: selectedQueries.length, + hitRate: selectedQueries.length ? 
hits / selectedQueries.length : 0, + resultCountAvg: mean(resultCounts), + latencyMsAvg: mean(latencies), + topScoreAvg: mean(topScores), + scoreTypes: scoreTypeCounts + }; +} + +await restoreConfig(workRoot, originalConfig, configExisted); + +const output = { + generatedAt: new Date().toISOString(), + repo: { source: repoSource, root: workRoot }, + backend, + topN, + queries: selectedQueries.length, + strategies: summaries +}; + +if (argv.out) { + const outPath = path.resolve(argv.out); + await fsPromises.writeFile(outPath, JSON.stringify(output, null, 2)); +} + +if (argv.json) { + console.log(JSON.stringify(output, null, 2)); +} else { + console.log('Score strategy benchmark'); + console.log(`- Repo: ${workRoot}`); + console.log(`- Queries: ${selectedQueries.length}`); + for (const [name, stats] of Object.entries(summaries)) { + console.log(`- ${name} hit rate: ${(stats.hitRate * 100).toFixed(1)}%`); + console.log(`- ${name} avg results: ${stats.resultCountAvg.toFixed(1)}`); + console.log(`- ${name} avg latency: ${stats.latencyMsAvg.toFixed(1)} ms`); + } +} diff --git a/tools/bench/language/cli.js b/tools/bench/language/cli.js new file mode 100644 index 000000000..700e64e2b --- /dev/null +++ b/tools/bench/language/cli.js @@ -0,0 +1,155 @@ +import path from 'node:path'; +import { createCli } from '../../../src/shared/cli.js'; +import { BENCH_OPTIONS, mergeCliOptions, validateBenchArgs } from '../../../src/shared/cli-options.js'; +import { resolveToolRoot } from '../../dict-utils.js'; + +const parseMs = (value, fallback) => { + const parsed = Number(value); + if (Number.isFinite(parsed) && parsed >= 0) return Math.floor(parsed); + return fallback; +}; + +const normalizeLockMode = (value) => { + if (!value) return 'fail-fast'; + const raw = String(value).trim().toLowerCase(); + if (raw === 'wait' || raw === 'retry') return 'wait'; + if (raw === 'stale-clear' || raw === 'stale') return 'stale-clear'; + return 'fail-fast'; +}; + +const resolveBackendList = (value) => { + if (!value) return ['memory', 'sqlite']; + const trimmed = String(value).trim().toLowerCase(); + if (!trimmed) return ['memory', 'sqlite']; + if (trimmed === 'all') return ['memory', 'sqlite', 'sqlite-fts']; + return trimmed + .split(',') + .map((entry) => entry.trim()) + .filter(Boolean); +}; + +const isBenchProfile = (value) => { + if (!value) return false; + const normalized = String(value).trim().toLowerCase(); + if (!normalized) return false; + return normalized === 'bench' || normalized.startsWith('bench-'); +}; + +const buildRunSuffix = () => { + const now = new Date(); + const stamp = [ + now.getFullYear(), + String(now.getMonth() + 1).padStart(2, '0'), + String(now.getDate()).padStart(2, '0') + ].join(''); + const time = [ + String(now.getHours()).padStart(2, '0'), + String(now.getMinutes()).padStart(2, '0'), + String(now.getSeconds()).padStart(2, '0') + ].join(''); + return `run-${stamp}-${time}`; +}; + +export const parseBenchLanguageArgs = (rawArgs = process.argv.slice(2)) => { + const argv = createCli({ + scriptName: 'bench-language', + options: mergeCliOptions( + BENCH_OPTIONS, + { + list: { type: 'boolean', default: false }, + clone: { type: 'boolean', default: true }, + 'no-clone': { type: 'boolean', default: false }, + 'dry-run': { type: 'boolean', default: false }, + 'cache-run': { type: 'boolean', default: false }, + config: { type: 'string' }, + root: { type: 'string' }, + 'cache-root': { type: 'string' }, + 'cache-suffix': { type: 'string' }, + results: { type: 'string' }, + log: { type: 'string' }, + 
language: { type: 'string' }, + languages: { type: 'string' }, + tier: { type: 'string' }, + repos: { type: 'string' }, + only: { type: 'string' }, + 'log-lines': { type: 'number' }, + 'lock-mode': { type: 'string' }, + 'lock-wait-ms': { type: 'number' }, + 'lock-stale-ms': { type: 'number' } + } + ), + argv: ['node', 'bench-language-repos.js', ...(rawArgs || [])] + }).parse(); + validateBenchArgs(argv); + + const scriptRoot = resolveToolRoot(); + const configPath = path.resolve(argv.config || path.join(scriptRoot, 'benchmarks', 'repos.json')); + const reposRoot = path.resolve(argv.root || path.join(scriptRoot, 'benchmarks', 'repos')); + const cacheRootBase = path.resolve(argv['cache-root'] || path.join(scriptRoot, 'benchmarks', 'cache')); + const cacheSuffixRaw = typeof argv['cache-suffix'] === 'string' ? argv['cache-suffix'].trim() : ''; + const cacheRun = argv['cache-run'] === true; + const cacheSuffix = cacheSuffixRaw || (cacheRun ? buildRunSuffix() : ''); + const cacheRoot = cacheSuffix ? path.resolve(cacheRootBase, cacheSuffix) : cacheRootBase; + const resultsRoot = path.resolve(argv.results || path.join(scriptRoot, 'benchmarks', 'results')); + const logRoot = path.join(resultsRoot, 'logs', 'bench-language'); + const logPath = argv.log + ? path.resolve(argv.log) + : path.join(logRoot, `${buildRunSuffix()}.log`); + + const cloneEnabled = argv['no-clone'] ? false : argv.clone !== false; + const dryRun = argv['dry-run'] === true; + const quietMode = argv.json === true; + const interactive = !quietMode && process.stdout.isTTY; + const colorEnabled = interactive && !process.env.NO_COLOR; + + const logLineArg = Number.parseInt(argv['log-lines'], 10); + const logWindowSize = Number.isFinite(logLineArg) + ? Math.max(3, Math.min(50, logLineArg)) + : 20; + + const lockMode = normalizeLockMode( + argv['lock-mode'] + || ((argv.build || argv['build-index'] || argv['build-sqlite']) ? 'stale-clear' : '') + ); + const lockWaitMs = parseMs(argv['lock-wait-ms'], 5 * 60 * 1000); + const lockStaleMs = parseMs(argv['lock-stale-ms'], 30 * 60 * 1000); + + const backendList = resolveBackendList(argv.backend); + const wantsSqlite = backendList.includes('sqlite') + || backendList.includes('sqlite-fts') + || backendList.includes('fts'); + + const indexProfileRaw = typeof argv['index-profile'] === 'string' + ? argv['index-profile'].trim() + : ''; + const defaultHeavyProfile = 'full'; + const resolvedProfile = indexProfileRaw && !isBenchProfile(indexProfileRaw) + ? indexProfileRaw + : defaultHeavyProfile; + const indexProfile = argv['no-index-profile'] === true ? 
'' : resolvedProfile; + const suppressProfileEnv = argv['no-index-profile'] === true; + + return { + argv, + scriptRoot, + configPath, + reposRoot, + cacheRoot, + resultsRoot, + logRoot, + logPath, + cloneEnabled, + dryRun, + quietMode, + interactive, + colorEnabled, + logWindowSize, + lockMode, + lockWaitMs, + lockStaleMs, + backendList, + wantsSqlite, + indexProfile, + suppressProfileEnv + }; +}; diff --git a/tools/bench/language/config.js b/tools/bench/language/config.js new file mode 100644 index 000000000..b95853ed1 --- /dev/null +++ b/tools/bench/language/config.js @@ -0,0 +1,15 @@ +import { readJsoncFile } from '../../../src/shared/jsonc.js'; + +export const loadBenchConfig = (configPath) => { + try { + const config = readJsoncFile(configPath); + if (!config || typeof config !== 'object') { + throw new Error('Bench config must be a JSON object.'); + } + return config; + } catch (err) { + console.error(`Failed to read ${configPath}`); + if (err && err.message) console.error(err.message); + process.exit(1); + } +}; diff --git a/tools/bench/language/locks.js b/tools/bench/language/locks.js new file mode 100644 index 000000000..a81a34f9b --- /dev/null +++ b/tools/bench/language/locks.js @@ -0,0 +1,107 @@ +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { formatDuration } from './metrics.js'; + +const isProcessAlive = (pid) => { + if (!Number.isFinite(pid) || pid <= 0) return false; + try { + process.kill(pid, 0); + return true; + } catch (err) { + return err?.code === 'EPERM'; + } +}; + +const readLockInfo = async (lockPath) => { + try { + const raw = await fsPromises.readFile(lockPath, 'utf8'); + const parsed = JSON.parse(raw); + return parsed && typeof parsed === 'object' ? parsed : null; + } catch { + return null; + } +}; + +const getLockAgeMs = async (lockPath, info) => { + if (info?.startedAt) { + const started = Date.parse(info.startedAt); + if (Number.isFinite(started)) return Math.max(0, Date.now() - started); + } + try { + const stat = await fsPromises.stat(lockPath); + return Math.max(0, Date.now() - stat.mtimeMs); + } catch { + return null; + } +}; + +export const formatLockDetail = (detail) => { + if (!detail) return ''; + const parts = []; + if (Number.isFinite(detail.ageMs)) { + parts.push(`age ${formatDuration(detail.ageMs)}`); + } + if (Number.isFinite(detail.pid)) { + parts.push(`pid ${detail.pid}`); + } + return parts.length ? `(${parts.join(', ')})` : ''; +}; + +export const checkIndexLock = async ({ + repoCacheRoot, + repoLabel, + lockMode, + lockWaitMs, + lockStaleMs, + onLog +}) => { + const lockPath = path.join(repoCacheRoot, 'locks', 'index.lock'); + if (!fs.existsSync(lockPath)) return { ok: true }; + + const readDetail = async () => { + const info = await readLockInfo(lockPath); + const ageMs = await getLockAgeMs(lockPath, info); + const pid = Number.isFinite(Number(info?.pid)) ? Number(info.pid) : null; + const alive = pid ? 
isProcessAlive(pid) : null; + const detail = { lockPath, ageMs, pid, alive }; + const isStale = (Number.isFinite(ageMs) && ageMs > lockStaleMs) || (pid && !alive); + return { detail, isStale }; + }; + + const clearIfStale = async (detail) => { + try { + await fsPromises.rm(lockPath, { force: true }); + if (onLog) { + onLog(`[lock] cleared stale lock for ${repoLabel} ${formatLockDetail(detail)}`); + } + return true; + } catch (err) { + if (onLog) { + onLog(`[lock] failed to clear stale lock for ${repoLabel}: ${err?.message || err}`); + } + return false; + } + }; + + const initial = await readDetail(); + if (initial.isStale) { + const cleared = await clearIfStale(initial.detail); + if (cleared) return { ok: true, cleared: true, detail: initial.detail }; + } + + if (lockMode === 'wait') { + const deadline = Date.now() + lockWaitMs; + while (Date.now() < deadline) { + if (!fs.existsSync(lockPath)) return { ok: true }; + const current = await readDetail(); + if (current.isStale) { + const cleared = await clearIfStale(current.detail); + if (cleared) return { ok: true, cleared: true, detail: current.detail }; + } + await new Promise((resolve) => setTimeout(resolve, 1000)); + } + } + + return { ok: false, detail: initial.detail }; +}; diff --git a/tools/bench/language/metrics.js b/tools/bench/language/metrics.js new file mode 100644 index 000000000..36591d88a --- /dev/null +++ b/tools/bench/language/metrics.js @@ -0,0 +1,110 @@ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { buildIgnoreMatcher } from '../../../src/index/build/ignore.js'; +import { discoverFilesForModes } from '../../../src/index/build/discover.js'; +import { readTextFile } from '../../../src/shared/encoding.js'; +import { countLinesForEntries } from '../../../src/shared/file-stats.js'; + +export const formatDuration = (ms) => { + const total = Math.max(0, Math.floor(ms / 1000)); + const hours = Math.floor(total / 3600); + const minutes = Math.floor((total % 3600) / 60); + const seconds = total % 60; + if (hours > 0) return `${hours}h ${minutes}m ${seconds}s`; + if (minutes > 0) return `${minutes}m ${seconds}s`; + return `${seconds}s`; +}; + +export const formatGb = (mb) => `${(mb / 1024).toFixed(1)} GB`; + +export const formatLoc = (value) => { + if (!Number.isFinite(value)) return 'n/a'; + if (value >= 1_000_000) return `${(value / 1_000_000).toFixed(2)}M`; + if (value >= 1_000) return `${(value / 1_000).toFixed(1)}k`; + return `${Math.floor(value)}`; +}; + +export const stripMaxOldSpaceFlag = (options) => { + if (!options) return ''; + return options + .replace(/--max-old-space-size=\d+/g, '') + .replace(/\s+/g, ' ') + .trim(); +}; + +export const getRecommendedHeapMb = () => { + const totalMb = Math.floor(os.totalmem() / (1024 * 1024)); + const recommended = Math.max(4096, Math.floor(totalMb * 0.75)); + const rounded = Math.floor(recommended / 256) * 256; + return { + totalMb, + recommendedMb: Math.max(4096, rounded) + }; +}; + +export const formatMetricSummary = (summary) => { + if (!summary) return 'Metrics: pending'; + const backends = summary.backends || Object.keys(summary.latencyMsAvg || {}); + const parts = []; + for (const backend of backends) { + const latency = summary.latencyMsAvg?.[backend]; + const hitRate = summary.hitRate?.[backend]; + const latencyText = Number.isFinite(latency) ? `${latency.toFixed(1)}ms` : 'n/a'; + const hitText = Number.isFinite(hitRate) ? 
`${(hitRate * 100).toFixed(1)}%` : 'n/a'; + parts.push(`${backend} ${latencyText} hit ${hitText}`); + } + if (summary.embeddingProvider) { + parts.push(`embed ${summary.embeddingProvider}`); + } + return parts.length ? `Metrics: ${parts.join(' | ')}` : 'Metrics: pending'; +}; + +const resolveMaxFileBytes = (userConfig) => { + const raw = userConfig?.indexing?.maxFileBytes; + const parsed = Number(raw); + if (raw === false || raw === 0) return null; + if (Number.isFinite(parsed) && parsed > 0) return Math.floor(parsed); + return 5 * 1024 * 1024; +}; + +export const buildLineStats = async (repoPath, userConfig) => { + const modes = ['code', 'prose']; + const { ignoreMatcher } = await buildIgnoreMatcher({ root: repoPath, userConfig }); + const skippedByMode = { code: [], prose: [] }; + const maxFileBytes = resolveMaxFileBytes(userConfig); + const entriesByMode = await discoverFilesForModes({ + root: repoPath, + modes, + ignoreMatcher, + skippedByMode, + maxFileBytes + }); + const linesByFile = { code: new Map(), prose: new Map() }; + const totals = { code: 0, prose: 0 }; + const concurrency = Math.max(1, Math.min(32, os.cpus().length * 2)); + for (const mode of modes) { + const entries = entriesByMode[mode] || []; + if (!entries.length) continue; + const lineCounts = await countLinesForEntries(entries, { concurrency }); + for (const [rel, lines] of lineCounts) { + linesByFile[mode].set(rel, lines); + totals[mode] += lines; + } + } + return { totals, linesByFile }; +}; + +export const validateEncodingFixtures = async (scriptRoot) => { + const fixturePath = path.join(scriptRoot, 'tests', 'fixtures', 'encoding', 'latin1.js'); + if (!fs.existsSync(fixturePath)) return; + try { + const { text, usedFallback } = await readTextFile(fixturePath); + const expected = 'caf\u00e9'; + if (!text.includes(expected) || !usedFallback) { + console.warn(`[bench] Encoding fixture did not decode as expected: ${fixturePath}`); + } + } catch (err) { + console.warn(`[bench] Encoding fixture read failed: ${err?.message || err}`); + } +}; diff --git a/tools/bench/language/process.js b/tools/bench/language/process.js new file mode 100644 index 000000000..d8f25ae78 --- /dev/null +++ b/tools/bench/language/process.js @@ -0,0 +1,117 @@ +import { execa, execaSync } from 'execa'; + +export const createProcessRunner = ({ + appendLog, + writeLog, + writeLogSync, + logHistory, + logPath +}) => { + let activeChild = null; + let activeLabel = ''; + let exitLogged = false; + + const setActiveChild = (child, label) => { + activeChild = child; + activeLabel = label; + }; + + const clearActiveChild = (child) => { + if (activeChild === child) { + activeChild = null; + activeLabel = ''; + } + }; + + const killProcessTree = (pid) => { + if (!Number.isFinite(pid)) return; + try { + if (process.platform === 'win32') { + execaSync('taskkill', ['/PID', String(pid), '/T', '/F'], { stdio: 'ignore', reject: false }); + return; + } + process.kill(pid, 'SIGTERM'); + } catch {} + }; + + const logExit = (reason, code) => { + if (exitLogged) return; + writeLogSync(`[exit] ${reason}${Number.isFinite(code) ? 
` code=${code}` : ''}`); + exitLogged = true; + }; + + const runProcess = async (label, cmd, args, options = {}) => { + const { continueOnError = false, ...spawnOptionsRest } = options; + const spawnOptions = { + ...spawnOptionsRest, + stdio: ['ignore', 'pipe', 'pipe'], + reject: false + }; + const child = execa(cmd, args, spawnOptions); + setActiveChild(child, label); + writeLog(`[start] ${label}`); + const carry = { stdout: '', stderr: '' }; + const handleChunk = (chunk, key) => { + const text = carry[key] + chunk.toString('utf8'); + const normalized = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); + const parts = normalized.split('\n'); + carry[key] = parts.pop() || ''; + for (const line of parts) appendLog(line); + }; + child.stdout?.on('data', (chunk) => handleChunk(chunk, 'stdout')); + child.stderr?.on('data', (chunk) => handleChunk(chunk, 'stderr')); + try { + const result = await child; + if (carry.stdout) appendLog(carry.stdout); + if (carry.stderr) appendLog(carry.stderr); + const code = result.exitCode; + writeLog(`[finish] ${label} code=${code}`); + clearActiveChild(child); + if (code === 0) { + return { ok: true }; + } + console.error(`Failed: ${label}`); + console.error(`Log: ${logPath}`); + writeLog(`[error] Failed: ${label}`); + writeLog(`[error] Log: ${logPath}`); + if (logHistory.length) { + console.error('Last log lines:'); + logHistory.slice(-10).forEach((line) => console.error(`- ${line}`)); + logHistory.slice(-10).forEach((line) => writeLog(`[error] ${line}`)); + } + if (logHistory.some((line) => line.toLowerCase().includes('filename too long'))) { + console.error('Hint: On Windows, enable long paths and set `git config --global core.longpaths true` or use a shorter --root path.'); + writeLog('[hint] Enable Windows long paths and set `git config --global core.longpaths true` or use a shorter --root path.'); + } + if (!continueOnError) { + logExit('failure', code ?? 1); + process.exit(code ?? 1); + } + return { ok: false, code: code ?? 1 }; + } catch (err) { + const message = err?.shortMessage || err?.message || err; + writeLog(`[error] ${label} spawn failed: ${message}`); + clearActiveChild(child); + console.error(`Failed: ${label}`); + console.error(`Log: ${logPath}`); + if (logHistory.length) { + console.error('Last log lines:'); + logHistory.slice(-10).forEach((line) => console.error(`- ${line}`)); + logHistory.slice(-10).forEach((line) => writeLog(`[error] ${line}`)); + } + if (!continueOnError) { + logExit('failure', err?.exitCode ?? 1); + process.exit(err?.exitCode ?? 1); + } + return { ok: false, code: err?.exitCode ?? 
1 }; + } + }; + + return { + runProcess, + killProcessTree, + logExit, + getActiveChild: () => activeChild, + getActiveLabel: () => activeLabel + }; +}; diff --git a/tools/bench/language/progress/parse.js b/tools/bench/language/progress/parse.js new file mode 100644 index 000000000..2e209ed1a --- /dev/null +++ b/tools/bench/language/progress/parse.js @@ -0,0 +1,87 @@ +const buildProgressRegex = /^\s*(Files|Imports)\s+(\d+)\/(\d+)\s+\((\d+(?:\.\d+)?)%\)/i; +const buildCombinedFileRegex = /^\s*Files\s+(\d+)\/(\d+)\s+\((\d+(?:\.\d+)?)%\)\s+(?:\[(.+?)\]\s+)?(?:File\s+)?(\d+)\/(\d+)(?:\s+lines\s+[0-9,\.]+)?\s+(.+)$/i; +const buildFileOnlyRegex = /^\s*(?:\[(.+?)\]\s+)?(?:File\s+)?(\d+)\/(\d+)(?:\s+lines\s+[0-9,\.]+)?\s+(.+)$/i; +const buildShardRegex = /^\s*(?:\u2192|->)\s+Shard\s+(\d+)\/(\d+):\s+([^\r\n\[]+?)(?:\s+\[[^\]]+\])?\s+\((\d+)\s+files\)/i; +const buildImportStatsRegex = /^\s*\u2192\s*Imports:\s+modules=(\d+),\s*edges=(\d+),\s*files=(\d+)/i; +const buildScanRegex = /Scanning\s+(code|prose)/i; +const buildLineRegex = /^\s*Line\s+(\d+)\s*\/\s*(\d+)/i; + +export const normalizeShardLabel = (raw) => { + if (!raw) return ''; + const trimmed = raw.trim(); + if (!trimmed || /^shard$/i.test(trimmed)) return ''; + return trimmed.replace(/^shard\s+/i, '').trim(); +}; + +export const parseShardLine = (line) => { + const match = buildShardRegex.exec(line); + if (!match) return null; + return { + index: Number.parseInt(match[1], 10), + total: Number.parseInt(match[2], 10), + shardLabel: match[3] ? match[3].trim() : '', + fileCount: Number.parseInt(match[4], 10) + }; +}; + +export const parseImportStatsLine = (line) => { + const match = buildImportStatsRegex.exec(line); + if (!match) return null; + return { + modules: Number.parseInt(match[1], 10), + edges: Number.parseInt(match[2], 10), + files: Number.parseInt(match[3], 10) + }; +}; + +export const parseFileProgressLine = (line) => { + const combined = buildCombinedFileRegex.exec(line); + if (combined) { + return { + count: Number.parseInt(combined[1], 10), + total: Number.parseInt(combined[2], 10), + pct: Number.parseFloat(combined[3]), + shardLabel: normalizeShardLabel(combined[4]), + fileIndex: Number.parseInt(combined[5], 10), + fileTotal: Number.parseInt(combined[6], 10), + file: combined[7] ? combined[7].trim() : '' + }; + } + const solo = buildFileOnlyRegex.exec(line); + if (!solo) return null; + return { + count: null, + total: null, + pct: null, + shardLabel: normalizeShardLabel(solo[1]), + fileIndex: Number.parseInt(solo[2], 10), + fileTotal: Number.parseInt(solo[3], 10), + file: solo[4] ? 
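+    // Trim the captured path; a missing capture yields an empty string.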
solo[4].trim() : '' + }; +}; + +export const parseProgressLine = (line) => { + const match = buildProgressRegex.exec(line); + if (!match) return null; + return { + step: match[1], + count: Number.parseInt(match[2], 10), + total: Number.parseInt(match[3], 10), + pct: Number.parseFloat(match[4]) + }; +}; + +export const parseLineProgress = (line) => { + const match = buildLineRegex.exec(line); + if (!match) return null; + return { + current: Number.parseInt(match[1], 10), + total: Number.parseInt(match[2], 10) + }; +}; + +export const parseScanMode = (line) => { + const match = buildScanRegex.exec(line); + if (!match) return null; + return match[1].toLowerCase(); +}; diff --git a/tools/bench/language/progress/render.js b/tools/bench/language/progress/render.js new file mode 100644 index 000000000..2e1b91141 --- /dev/null +++ b/tools/bench/language/progress/render.js @@ -0,0 +1,530 @@ +import readline from 'node:readline'; +import { formatShardFileProgress } from '../../../../src/shared/bench-progress.js'; +import { toPosix } from '../../../../src/shared/files.js'; +import { formatDuration, formatLoc } from '../metrics.js'; +import { + parseFileProgressLine, + parseImportStatsLine, + parseLineProgress, + parseProgressLine, + parseScanMode, + parseShardLine +} from './parse.js'; +import { resetBuildProgressState } from './state.js'; + +const ansi = { + reset: '\x1b[0m', + fgDim: '\x1b[90m', + fgLight: '\x1b[37m', + fgBright: '\x1b[97m', + bgBlack: '\x1b[40m' +}; + +export const createProgressRenderer = ({ + state, + interactive, + quietMode, + colorEnabled, + writeLog, + getActiveLabel +}) => { + const pushHistory = (line) => { + if (!line) return; + state.logHistory.push(line); + if (state.logHistory.length > state.logHistorySize) state.logHistory.shift(); + }; + + const truncateDisplay = (line) => { + if (!line) return ''; + const width = Number.isFinite(process.stdout.columns) ? process.stdout.columns : 120; + if (line.length <= width) return line; + return `${line.slice(0, Math.max(0, width - 1))}\u2026`; + }; + + const extractLogTag = (line) => { + if (!line) return ''; + const match = /^\s*\[([^\]]+)\]\s*/.exec(line); + return match ? match[1].trim().toLowerCase() : ''; + }; + + const resolveLogTag = (line, tagOverride) => { + if (tagOverride) return String(tagOverride).trim().toLowerCase(); + return extractLogTag(line); + }; + + const shouldUpdateLogWindowLine = (line, tag) => { + if (!tag) return true; + const now = Date.now(); + const last = state.logUpdateByTag.get(tag); + if (last) { + if (last.line === line) return false; + if (now - last.at < state.logUpdateDebounceMs) return false; + } + state.logUpdateByTag.set(tag, { line, at: now }); + return true; + }; + + const upsertLogWindowLine = (line, tagOverride) => { + const tag = resolveLogTag(line, tagOverride); + if (!tag) return false; + for (let i = state.logLines.length - 1; i >= 0; i -= 1) { + const existingTag = state.logLineTags[i] || extractLogTag(state.logLines[i]); + if (existingTag && existingTag === tag) { + state.logLines[i] = line; + state.logLineTags[i] = tag; + return true; + } + } + return false; + }; + + const pushLogWindowLine = (line, options = {}) => { + if (!interactive) return; + const tag = resolveLogTag(line, options.tag); + if (!shouldUpdateLogWindowLine(line, tag)) return; + const replaced = tag ? 
upsertLogWindowLine(line, tag) : false; + if (!replaced) { + state.logLines.push(line); + state.logLineTags.push(tag || ''); + if (state.logLines.length > state.logWindowSize) state.logLines.shift(); + if (state.logLineTags.length > state.logWindowSize) state.logLineTags.shift(); + } + renderStatus(); + }; + + const styleText = (text, prefix) => { + if (!colorEnabled || !text) return text; + return `${prefix}${text}${ansi.reset}`; + }; + + const formatBarLine = (line, width) => { + const content = line || ''; + const truncated = content.length > width + ? `${content.slice(0, Math.max(0, width - 1))}\u2026` + : content; + if (!colorEnabled) return truncated; + const padded = truncated.padEnd(width, ' '); + return `${ansi.bgBlack}${ansi.fgLight}${padded}${ansi.reset}`; + }; + + const formatLogLine = (line) => { + const content = line || ''; + if (!colorEnabled) return content; + if (/^\s*(?:\u2192|->)\s*Shard\s+/i.test(content)) { + return styleText(content, ansi.fgBright); + } + if (/^\s*\[shard\s+/i.test(content) + || /^\s*Files\s+\d+\/\d+/i.test(content) + || /^\s*File\s+\d+\/\d+/i.test(content)) { + return styleText(content, ansi.fgDim); + } + return content; + }; + + const renderStatus = () => { + if (!interactive) return; + if (!state.statusRendered) { + process.stdout.write('\n'.repeat(state.logWindowSize + 3)); + state.statusRendered = true; + } + readline.moveCursor(process.stdout, 0, -(state.logWindowSize + 3)); + const lines = [...state.logLines]; + const width = Number.isFinite(process.stdout.columns) ? process.stdout.columns : 120; + while (lines.length < state.logWindowSize) lines.push(''); + lines.push(state.metricsLine); + lines.push(state.fileProgressLine); + lines.push(state.progressLine); + for (let i = 0; i < lines.length; i += 1) { + const line = lines[i]; + const isBar = i >= state.logWindowSize; + readline.clearLine(process.stdout, 0); + const output = isBar + ? formatBarLine(line || '', width) + : formatLogLine(truncateDisplay(line || '')); + process.stdout.write(output); + process.stdout.write('\n'); + } + }; + + const parseDurationText = (text) => { + if (!text) return null; + const hours = /(\d+)\s*h/i.exec(text); + const minutes = /(\d+)\s*m/i.exec(text); + const seconds = /(\d+)\s*s/i.exec(text); + const totalSeconds = (hours ? Number(hours[1]) * 3600 : 0) + + (minutes ? Number(minutes[1]) * 60 : 0) + + (seconds ? Number(seconds[1]) : 0); + return Number.isFinite(totalSeconds) ? 
totalSeconds * 1000 : null; + }; + + const setProgressBase = (message) => { + state.progressLineBase = message || ''; + state.progressLinePrefix = ''; + state.progressLineSuffix = ''; + state.progressElapsedStartMs = null; + if (!message) return; + const match = message.match(/^(.*\| elapsed )([^|]+)(.*)$/); + if (!match) return; + const parsedMs = parseDurationText(match[2].trim()); + if (!Number.isFinite(parsedMs)) return; + state.progressLinePrefix = match[1]; + state.progressLineSuffix = match[3] || ''; + state.progressElapsedStartMs = Date.now() - parsedMs; + }; + + const getActiveShardList = (now = Date.now()) => { + const active = []; + for (const [index, lastSeen] of state.activeShards.entries()) { + if (now - lastSeen <= state.activeShardWindowMs) { + active.push(index); + } else { + state.activeShards.delete(index); + } + } + active.sort((a, b) => a - b); + return active; + }; + + const formatImportStats = (stats) => { + if (!stats) return ''; + const parts = []; + if (Number.isFinite(stats.modules)) parts.push(`${stats.modules} mods`); + if (Number.isFinite(stats.edges)) parts.push(`${stats.edges} edges`); + if (Number.isFinite(stats.files)) parts.push(`${stats.files} files`); + if (!parts.length) return ''; + return `imports ${parts.join(', ')}`; + }; + + const buildProgressLineExtras = (now = Date.now()) => { + const segments = []; + const shardList = getActiveShardList(now); + if (shardList.length) { + segments.push(`shards ${shardList.join(',')}`); + } + if (state.build.step?.toLowerCase() === 'imports') { + const importText = formatImportStats(state.build.importStats); + if (importText) segments.push(importText); + } + return segments.length ? ` | ${segments.join(' | ')}` : ''; + }; + + const buildProgressLineBase = (now = Date.now()) => { + if (state.progressLinePrefix && Number.isFinite(state.progressElapsedStartMs)) { + return `${state.progressLinePrefix}${formatDuration(now - state.progressElapsedStartMs)}${state.progressLineSuffix}`; + } + return state.progressLineBase; + }; + + const renderProgressLine = ({ now = Date.now(), log = false, force = false } = {}) => { + const baseLine = buildProgressLineBase(now); + const extra = buildProgressLineExtras(now); + let line = baseLine || ''; + if (extra) { + line = baseLine ? `${baseLine}${extra}` : extra.replace(/^\s*\|\s*/, ''); + } + if (!force && line === state.progressLine) return; + state.progressLine = line; + renderStatus(); + if (log && line && line !== state.lastProgressLogged) { + writeLog(`[progress] ${line}`); + state.lastProgressLogged = line; + } + if (log && !interactive && !quietMode && line !== state.lastProgressMessage) { + console.log(line); + state.lastProgressMessage = line; + } + }; + + const updateProgress = (message) => { + setProgressBase(message); + renderProgressLine({ log: true, force: true }); + }; + + const updateMetrics = (message) => { + state.metricsLine = message; + renderStatus(); + if (message && message !== state.lastMetricsLogged) { + writeLog(`[metrics] ${message}`); + state.lastMetricsLogged = message; + } + if (!interactive && !quietMode && message) { + console.log(message); + } + }; + + const updateFileProgressLine = () => { + const file = state.build.currentFile; + const current = state.build.currentLine; + const total = state.build.currentLineTotal; + if (!file) { + state.fileProgressLine = ''; + renderStatus(); + return; + } + const lineSegment = total > 0 ? 
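+    // Show a [current/total] counter only once the file's line total is known.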
` [${current}/${total}]` : ''; + const shardIndex = state.build.currentShardIndex; + const shardTotal = state.build.currentShardTotal; + const shardLabel = (Number.isFinite(shardIndex) && Number.isFinite(shardTotal)) + ? `${shardIndex}/${shardTotal}` + : ''; + const shardSegment = shardLabel ? `[shard ${shardLabel}] ` : '[shard] '; + state.fileProgressLine = `${shardSegment}${file}${lineSegment}`; + renderStatus(); + }; + + const refreshProgressLine = (now = Date.now(), force = false) => { + if (!interactive) return; + if (!force && now - state.lastProgressRefreshMs < state.progressRefreshMs) return; + state.lastProgressRefreshMs = now; + renderProgressLine({ now, force }); + }; + + const handleShardLine = (line) => { + const entry = parseShardLine(line); + if (!entry) return false; + if (entry.shardLabel && Number.isFinite(entry.index) && Number.isFinite(entry.total)) { + state.shardByLabel.set(entry.shardLabel, { index: entry.index, total: entry.total }); + } + return true; + }; + + const handleImportStatsLine = (line) => { + const stats = parseImportStatsLine(line); + if (!stats) return false; + state.build.importStats = stats; + return true; + }; + + const handleBuildMode = (line) => { + const mode = parseScanMode(line); + if (!mode) return; + if (mode === 'code' || mode === 'prose') { + state.build.mode = mode; + } + }; + + const resolveModeForFile = (rel) => { + if (!rel) return null; + if (state.build.linesByFile.code?.has(rel)) return 'code'; + if (state.build.linesByFile.prose?.has(rel)) return 'prose'; + return null; + }; + + const handleBuildFileLine = (lineOrEntry) => { + const entry = typeof lineOrEntry === 'string' ? parseFileProgressLine(lineOrEntry) : lineOrEntry; + if (!entry || !entry.file) return; + const rawPath = entry.file.trim(); + if (!rawPath) return; + const rel = toPosix(rawPath); + const inferredMode = resolveModeForFile(rel); + if (inferredMode && inferredMode !== state.build.mode) { + state.build.mode = inferredMode; + } + const mode = state.build.mode; + if (!mode || !state.build.linesByFile[mode]) return; + state.build.currentFile = rel; + state.build.currentLineTotal = state.build.linesByFile[mode].get(rel) || 0; + state.build.currentLine = 0; + const shardLabel = entry.shardLabel; + const shardInfo = shardLabel ? state.shardByLabel.get(shardLabel) : null; + state.build.currentShard = shardLabel || null; + state.build.currentShardIndex = shardInfo?.index ?? null; + state.build.currentShardTotal = shardInfo?.total ?? 
null; + if (Number.isFinite(state.build.currentShardIndex)) { + state.activeShards.set(state.build.currentShardIndex, Date.now()); + } + updateFileProgressLine(); + const seen = state.build.filesSeen[mode]; + if (seen.has(rel)) return; + const lineCount = state.build.linesByFile[mode].get(rel); + if (!Number.isFinite(lineCount)) return; + seen.add(rel); + state.build.linesProcessed[mode] += lineCount; + }; + + const handleBuildLineProgress = (line) => { + const progress = parseLineProgress(line); + if (!progress) return; + const { current, total } = progress; + if (!Number.isFinite(current) || !Number.isFinite(total) || total <= 0) return; + state.build.currentLine = current; + state.build.currentLineTotal = total; + updateFileProgressLine(); + }; + + const handleBuildProgress = (line) => { + const parsed = parseProgressLine(line); + if (!parsed) return false; + const { step, count, total, pct } = parsed; + if (!Number.isFinite(count) || !Number.isFinite(total) || !Number.isFinite(pct) || total <= 0) { + return true; + } + const label = state.currentRepoLabel || (getActiveLabel ? getActiveLabel() : '') || ''; + const now = Date.now(); + if ( + state.build.step !== step + || state.build.total !== total + || count < state.build.lastCount + || state.build.label !== label + ) { + state.build.step = step; + state.build.total = total; + state.build.startMs = now; + state.build.lastLoggedMs = 0; + state.build.lastCount = 0; + state.build.lastPct = 0; + state.build.label = label; + } + if (!state.build.startMs) state.build.startMs = now; + const elapsedMs = now - state.build.startMs; + const rate = elapsedMs > 0 ? count / (elapsedMs / 1000) : 0; + const remaining = total - count; + let etaMs = rate > 0 && remaining > 0 ? (remaining / rate) * 1000 : 0; + let lineRate = 0; + let remainingLines = 0; + let totalLines = 0; + if (step.toLowerCase() === 'files' && !state.build.mode) { + const fallbackMode = resolveModeForFile(state.build.currentFile); + if (fallbackMode) { + state.build.mode = fallbackMode; + } + } + if (step.toLowerCase() === 'files' && state.build.mode) { + const mode = state.build.mode; + totalLines = state.build.lineTotals[mode] || 0; + const processedLines = state.build.linesProcessed[mode] || 0; + if (elapsedMs > 0 && processedLines > 0) { + lineRate = processedLines / (elapsedMs / 1000); + } + remainingLines = totalLines - processedLines; + if (lineRate > 0 && remainingLines > 0) { + etaMs = (remainingLines / lineRate) * 1000; + } + } + const pctDelta = pct - state.build.lastPct; + const countDelta = count - state.build.lastCount; + const shouldLog = + count === total + || now - state.build.lastLoggedMs >= 5000 + || pctDelta >= 1 + || countDelta >= 500; + if (shouldLog) { + const rateText = rate > 0 ? `${rate.toFixed(1)}/s` : 'n/a'; + const lineRateText = lineRate > 0 ? `${Math.round(lineRate).toLocaleString()}/s` : null; + const etaText = etaMs > 0 ? formatDuration(etaMs) : 'n/a'; + const labelText = label ? ` ${label}` : ''; + const lineRateSegment = lineRateText ? ` | lines ${lineRateText}` : ''; + const totalLinesText = totalLines > 0 ? `${formatLoc(totalLines)}` : null; + const processedLinesText = totalLines > 0 + ? `${formatLoc(totalLines - remainingLines)}/${totalLinesText}` + : null; + const linesElapsedSegment = processedLinesText ? ` (${processedLinesText})` : ''; + const remainingLinesText = remainingLines > 0 ? formatLoc(remainingLines) : null; + const etaSegment = remainingLinesText ? 
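+      // Annotate the ETA with the remaining line count when line totals are tracked.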
`${etaText} (${remainingLinesText} rem)` : etaText; + const currentLineSegment = state.build.currentLineTotal > 0 + ? ` [${state.build.currentLine}/${state.build.currentLineTotal}]` + : ''; + const message = `Indexing${labelText} ${step} ${count}/${total} (${pct.toFixed(1)}%)${currentLineSegment} | rate ${rateText}${lineRateSegment} | elapsed ${formatDuration(elapsedMs)}${linesElapsedSegment} | eta ${etaSegment}`; + updateMetrics(message); + state.build.lastLoggedMs = now; + state.build.lastCount = count; + state.build.lastPct = pct; + } + refreshProgressLine(now); + return true; + }; + + const formatProgressLine = (line) => { + const parsed = parseProgressLine(line); + if (!parsed) return null; + const { step, count, total, pct } = parsed; + if (!Number.isFinite(count) || !Number.isFinite(total)) return null; + const pctText = Number.isFinite(pct) ? `${pct.toFixed(1)}%` : null; + const lineText = `${step} ${count}/${total}${pctText ? ` (${pctText})` : ''}`; + return { + line: lineText, + tag: `progress:${step.toLowerCase()}` + }; + }; + + const appendLog = (line) => { + const cleaned = line.replace(/\r/g, '').trimEnd(); + if (!cleaned) return; + if (handleImportStatsLine(cleaned)) { + refreshProgressLine(Date.now(), true); + } + if (handleShardLine(cleaned)) { + pushHistory(cleaned); + if (interactive) { + pushLogWindowLine(cleaned); + } else if (!quietMode) { + console.log(cleaned); + } + return; + } + if (parseLineProgress(cleaned)) { + handleBuildLineProgress(cleaned); + handleBuildProgress(cleaned); + return; + } + const fileProgress = parseFileProgressLine(cleaned); + if (fileProgress && fileProgress.file) { + pushHistory(cleaned); + handleBuildMode(cleaned); + handleBuildFileLine(fileProgress); + handleBuildLineProgress(cleaned); + handleBuildProgress(cleaned); + const formatted = formatShardFileProgress(fileProgress, { + shardByLabel: state.shardByLabel, + lineTotal: state.build.currentLineTotal + }); + if (formatted) { + if (interactive) { + pushLogWindowLine(formatted); + } else if (!quietMode) { + console.log(formatted); + } + } + return; + } + const formattedProgress = formatProgressLine(cleaned); + if (formattedProgress) { + const { line: formattedLine, tag } = formattedProgress; + pushHistory(cleaned); + handleBuildMode(cleaned); + handleBuildLineProgress(cleaned); + handleBuildProgress(cleaned); + if (interactive) { + pushLogWindowLine(formattedLine, { tag }); + } else if (!quietMode) { + console.log(formattedLine); + } + return; + } + pushHistory(cleaned); + writeLog(cleaned); + handleBuildMode(cleaned); + handleBuildFileLine(cleaned); + handleBuildLineProgress(cleaned); + handleBuildProgress(cleaned); + if (interactive) { + pushLogWindowLine(cleaned); + } else if (!quietMode) { + console.log(cleaned); + } + }; + + const resetBuildProgress = (label = '') => { + resetBuildProgressState(state, label); + updateFileProgressLine(); + }; + + return { + appendLog, + updateProgress, + updateMetrics, + updateFileProgressLine, + resetBuildProgress, + renderStatus + }; +}; diff --git a/tools/bench/language/progress/state.js b/tools/bench/language/progress/state.js new file mode 100644 index 000000000..b3a3efb22 --- /dev/null +++ b/tools/bench/language/progress/state.js @@ -0,0 +1,59 @@ +const createBuildProgressState = () => ({ + step: null, + total: 0, + startMs: 0, + lastLoggedMs: 0, + lastCount: 0, + lastPct: 0, + label: '', + mode: null, + lineTotals: { code: 0, prose: 0 }, + linesProcessed: { code: 0, prose: 0 }, + linesByFile: { code: new Map(), prose: new Map() }, + filesSeen: 
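+  // Files already counted toward linesProcessed, tracked per mode: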
{ code: new Set(), prose: new Set() }, + currentFile: null, + currentLine: 0, + currentLineTotal: 0, + currentShard: null, + currentShardIndex: null, + currentShardTotal: null, + importStats: null +}); + +export const createProgressState = ({ logWindowSize = 20, logHistorySize = 50 } = {}) => { + return { + logWindowSize, + logHistorySize, + logLines: Array(logWindowSize).fill(''), + logLineTags: Array(logWindowSize).fill(''), + logHistory: [], + logUpdateByTag: new Map(), + logUpdateDebounceMs: 250, + metricsLine: '', + progressLine: '', + fileProgressLine: '', + progressLineBase: '', + progressLinePrefix: '', + progressLineSuffix: '', + progressElapsedStartMs: null, + lastProgressRefreshMs: 0, + progressRefreshMs: 1000, + statusRendered: false, + lastProgressLogged: '', + lastProgressMessage: '', + lastMetricsLogged: '', + shardByLabel: new Map(), + activeShards: new Map(), + activeShardWindowMs: 5000, + build: createBuildProgressState(), + currentRepoLabel: '' + }; +}; + +export const resetBuildProgressState = (state, label = '') => { + state.build = createBuildProgressState(); + state.build.label = label; + state.shardByLabel.clear(); + state.activeShards.clear(); + state.logUpdateByTag.clear(); +}; diff --git a/tools/bench/language/report.js b/tools/bench/language/report.js new file mode 100644 index 000000000..69e361e04 --- /dev/null +++ b/tools/bench/language/report.js @@ -0,0 +1,100 @@ +export const summarizeResults = (items) => { + const valid = items.filter((entry) => entry.summary); + if (!valid.length) return null; + const backendSet = new Set(); + for (const entry of valid) { + const summary = entry.summary; + const backends = summary.backends || Object.keys(summary.latencyMsAvg || {}); + for (const backend of backends) backendSet.add(backend); + } + const backends = Array.from(backendSet); + const latencyMsAvg = {}; + const hitRate = {}; + const resultCountAvg = {}; + const memoryRssAvgMb = {}; + const buildMsAvg = {}; + for (const backend of backends) { + const latencies = valid.map((entry) => entry.summary?.latencyMsAvg?.[backend]).filter(Number.isFinite); + const hits = valid.map((entry) => entry.summary?.hitRate?.[backend]).filter(Number.isFinite); + const results = valid.map((entry) => entry.summary?.resultCountAvg?.[backend]).filter(Number.isFinite); + const mem = valid + .map((entry) => entry.summary?.memoryRss?.[backend]?.mean) + .filter(Number.isFinite) + .map((value) => value / (1024 * 1024)); + if (latencies.length) latencyMsAvg[backend] = latencies.reduce((a, b) => a + b, 0) / latencies.length; + if (hits.length) hitRate[backend] = hits.reduce((a, b) => a + b, 0) / hits.length; + if (results.length) resultCountAvg[backend] = results.reduce((a, b) => a + b, 0) / results.length; + if (mem.length) memoryRssAvgMb[backend] = mem.reduce((a, b) => a + b, 0) / mem.length; + } + for (const entry of valid) { + const build = entry.summary?.buildMs; + if (!build) continue; + for (const [key, value] of Object.entries(build)) { + if (!Number.isFinite(value)) continue; + if (!buildMsAvg[key]) buildMsAvg[key] = []; + buildMsAvg[key].push(value); + } + } + const buildMs = Object.fromEntries( + Object.entries(buildMsAvg).map(([key, values]) => [ + key, + values.reduce((a, b) => a + b, 0) / values.length + ]) + ); + return { + backends, + latencyMsAvg, + hitRate, + resultCountAvg, + memoryRssAvgMb, + buildMs: Object.keys(buildMs).length ? 
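+    // Null when no repo reported build timings.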
buildMs : null + }; +}; + +export const printSummary = (label, summary, count, quietMode) => { + if (!summary || quietMode) return; + console.log(`\n${label} summary (${count} repos)`); + for (const backend of summary.backends) { + const latency = summary.latencyMsAvg?.[backend]; + const hit = summary.hitRate?.[backend]; + const results = summary.resultCountAvg?.[backend]; + const mem = summary.memoryRssAvgMb?.[backend]; + const latencyText = Number.isFinite(latency) ? `${latency.toFixed(1)}ms` : 'n/a'; + const hitText = Number.isFinite(hit) ? `${(hit * 100).toFixed(1)}%` : 'n/a'; + const resultText = Number.isFinite(results) ? results.toFixed(1) : 'n/a'; + const memText = Number.isFinite(mem) ? `${mem.toFixed(1)} MB` : 'n/a'; + console.log(`- ${backend} avg ${latencyText} | hit ${hitText} | avg hits ${resultText} | rss ${memText}`); + } + if (summary.buildMs) { + for (const [key, value] of Object.entries(summary.buildMs)) { + if (!Number.isFinite(value)) continue; + console.log(`- build ${key} avg ${(value / 1000).toFixed(1)}s`); + } + } +}; + +export const buildReportOutput = ({ configPath, cacheRoot, resultsRoot, results, config }) => { + const groupedResults = new Map(); + for (const entry of results) { + if (!groupedResults.has(entry.language)) groupedResults.set(entry.language, []); + groupedResults.get(entry.language).push(entry); + } + const groupedSummary = {}; + for (const [language, items] of groupedResults.entries()) { + groupedSummary[language] = { + label: config[language]?.label || language, + count: items.length, + summary: summarizeResults(items) + }; + } + const overallSummary = summarizeResults(results); + return { + generatedAt: new Date().toISOString(), + config: configPath, + cacheRoot, + resultsRoot, + tasks: results, + groupedSummary, + overallSummary + }; +}; diff --git a/tools/bench/language/repos.js b/tools/bench/language/repos.js new file mode 100644 index 000000000..7e1c81c11 --- /dev/null +++ b/tools/bench/language/repos.js @@ -0,0 +1,114 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { execaSync } from 'execa'; +import { getIndexDir, getRepoCacheRoot, loadUserConfig, resolveSqlitePaths } from '../../dict-utils.js'; + +const canRun = (cmd, args) => { + try { + const result = execaSync(cmd, args, { encoding: 'utf8', reject: false }); + return result.exitCode === 0; + } catch { + return false; + } +}; + +export const resolveCloneTool = () => { + const gitAvailable = canRun('git', ['--version']); + const ghAvailable = canRun('gh', ['--version']); + const preferGit = process.platform === 'win32' && gitAvailable; + if (preferGit) { + return { + label: 'git', + buildArgs: (repo, repoPath) => [ + '-c', + 'core.longpaths=true', + '-c', + 'checkout.workers=0', + '-c', + 'checkout.thresholdForParallelism=0', + 'clone', + `https://github.com/${repo}.git`, + repoPath + ] + }; + } + if (ghAvailable) { + return { + label: 'gh', + buildArgs: (repo, repoPath) => ['repo', 'clone', repo, repoPath] + }; + } + if (gitAvailable) { + return { + label: 'git', + buildArgs: (repo, repoPath) => [ + '-c', + 'checkout.workers=0', + '-c', + 'checkout.thresholdForParallelism=0', + 'clone', + `https://github.com/${repo}.git`, + repoPath + ] + }; + } + console.error('GitHub CLI (gh) or git is required to clone benchmark repos.'); + process.exit(1); +}; + +export const ensureLongPathsSupport = () => { + if (process.platform !== 'win32') return; + if (canRun('git', ['--version'])) { + try { + execaSync('git', ['config', '--global', 'core.longpaths', 'true'], { stdio: 
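+        // Best-effort global config write; output is discarded and failures are ignored.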
'ignore', reject: false }); + } catch {} + } + let regResult; + try { + regResult = execaSync( + 'reg', + ['query', 'HKLM\\SYSTEM\\CurrentControlSet\\Control\\FileSystem', '/v', 'LongPathsEnabled'], + { encoding: 'utf8', reject: false } + ); + } catch { + regResult = null; + } + if (!regResult || regResult.exitCode !== 0) { + console.warn('Warning: Unable to confirm Windows long path setting. Enable LongPathsEnabled=1 if clones fail.'); + return; + } + const match = String(regResult.stdout || '').match(/LongPathsEnabled\s+REG_DWORD\s+0x([0-9a-f]+)/i); + if (!match) return; + const value = Number.parseInt(match[1], 16); + if (value === 0) { + console.warn('Warning: Windows long paths are disabled. Enable LongPathsEnabled=1 to avoid clone failures.'); + } +}; + +export const resolveRepoDir = ({ reposRoot, repo, language }) => { + const safeName = repo.replace('/', '__'); + return path.join(reposRoot, language, safeName); +}; + +export const resolveRepoCacheRoot = ({ repoPath, cacheRoot }) => { + return getRepoCacheRoot(repoPath, { cache: { root: cacheRoot } }); +}; + +export const needsIndexArtifacts = (repoRoot) => { + const userConfig = loadUserConfig(repoRoot); + const codeDir = getIndexDir(repoRoot, 'code', userConfig); + const proseDir = getIndexDir(repoRoot, 'prose', userConfig); + const hasChunkMeta = (dir) => ( + fs.existsSync(path.join(dir, 'chunk_meta.json')) + || fs.existsSync(path.join(dir, 'chunk_meta.jsonl')) + || fs.existsSync(path.join(dir, 'chunk_meta.meta.json')) + || fs.existsSync(path.join(dir, 'chunk_meta.parts')) + ); + return !hasChunkMeta(codeDir) || !hasChunkMeta(proseDir); +}; + +export const needsSqliteArtifacts = (repoRoot) => { + const userConfig = loadUserConfig(repoRoot); + const sqlitePaths = resolveSqlitePaths(repoRoot, userConfig); + return !fs.existsSync(sqlitePaths.codePath) || !fs.existsSync(sqlitePaths.prosePath); +}; diff --git a/tools/bench/micro/index-build.js b/tools/bench/micro/index-build.js new file mode 100644 index 000000000..488a2cb50 --- /dev/null +++ b/tools/bench/micro/index-build.js @@ -0,0 +1,50 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { buildIndex } from '../../../src/integrations/core/index.js'; +import { getRepoCacheRoot } from '../../dict-utils.js'; +import { hrtimeMs, summarizeDurations } from './utils.js'; + +export async function runIndexBuildBenchmark({ + repoRoot, + mode, + threads, + sqlite, + stubEmbeddings, + warmRuns, + cleanCache +}) { + const cacheRoot = getRepoCacheRoot(repoRoot); + if (cleanCache) { + await fs.rm(cacheRoot, { recursive: true, force: true }); + } + + const coldStart = process.hrtime.bigint(); + await buildIndex(repoRoot, { + mode, + threads, + incremental: false, + sqlite, + stubEmbeddings + }); + const coldMs = hrtimeMs(coldStart); + + const warmTimes = []; + for (let i = 0; i < warmRuns; i += 1) { + const start = process.hrtime.bigint(); + await buildIndex(repoRoot, { + mode, + threads, + incremental: true, + sqlite, + stubEmbeddings + }); + warmTimes.push(hrtimeMs(start)); + } + + return { + repoRoot, + cacheRoot: path.resolve(cacheRoot), + coldMs, + warm: summarizeDurations(warmTimes) + }; +} diff --git a/tools/bench/micro/run.js b/tools/bench/micro/run.js new file mode 100644 index 000000000..5f8807e1a --- /dev/null +++ b/tools/bench/micro/run.js @@ -0,0 +1,228 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import yargs from 'yargs/yargs'; +import { hideBin } from 'yargs/helpers'; +import { buildIndex } from '../../../src/integrations/core/index.js'; 
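+// Example invocations (illustrative, not part of this change; the flags match
+// the yargs options declared below, and the repo path is a placeholder):
+//   node tools/bench/micro/run.js --repo ../my-repo --components sparse --json
+//   node tools/bench/micro/run.js --components index-build,dense --runs 10 --out results.json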
+import { getIndexDir, resolveRepoRoot, resolveToolRoot } from '../../dict-utils.js'; +import { formatMs, formatStats } from './utils.js'; +import { runIndexBuildBenchmark } from './index-build.js'; +import { runSearchBenchmark } from './search.js'; + +const toolRoot = resolveToolRoot(); +const defaultRepo = path.resolve(toolRoot, 'tests', 'fixtures', 'sample'); + +const argv = yargs(hideBin(process.argv)) + .option('repo', { + type: 'string', + describe: 'Repo root to benchmark', + default: defaultRepo + }) + .option('mode', { + type: 'string', + describe: 'Index/search mode (code|prose)', + default: 'code' + }) + .option('query', { + type: 'string', + describe: 'Search query for microbench runs', + default: 'function' + }) + .option('backend', { + type: 'string', + describe: 'Search backend (memory|sqlite|sqlite-fts)', + default: 'memory' + }) + .option('runs', { + type: 'number', + describe: 'Warm run count per component', + default: 5 + }) + .option('warmup', { + type: 'number', + describe: 'Warmup runs discarded before measuring warm stats', + default: 1 + }) + .option('threads', { + type: 'number', + describe: 'Index build worker threads', + default: 0 + }) + .option('build', { + type: 'boolean', + describe: 'Build indexes before search benchmarks', + default: true + }) + .option('clean', { + type: 'boolean', + describe: 'Clean repo cache before cold index build', + default: true + }) + .option('sqlite', { + type: 'boolean', + describe: 'Enable SQLite builds during index benchmark', + default: false + }) + .option('stub-embeddings', { + type: 'boolean', + describe: 'Use stub embeddings for index build', + default: true + }) + .option('components', { + type: 'string', + describe: 'Comma-separated component list: index-build,sparse,dense,hybrid', + default: 'index-build,sparse,dense,hybrid' + }) + .option('json', { + type: 'boolean', + describe: 'Emit JSON output only', + default: false + }) + .option('out', { + type: 'string', + describe: 'Write JSON results to a file' + }) + .help() + .argv; + +const repoRoot = path.resolve(argv.repo || resolveRepoRoot(process.cwd())); +const warmRuns = Math.max(0, Math.floor(argv.runs)); +const warmupRuns = Math.max(0, Math.floor(argv.warmup)); +const threads = Number(argv.threads) > 0 ? Math.floor(argv.threads) : undefined; +const mode = argv.mode === 'prose' ? 'prose' : 'code'; +const components = parseComponents(argv.components); + +const results = { + repoRoot, + mode, + query: argv.query, + backend: argv.backend, + components: {} +}; + +const log = argv.json ? 
() => {} : console.log; + +await maybeBuildIndexes(); + +if (components.includes('index-build')) { + log('\n[index-build]'); + const bench = await runIndexBuildBenchmark({ + repoRoot, + mode, + threads, + sqlite: argv.sqlite === true, + stubEmbeddings: argv['stub-embeddings'] !== false, + warmRuns, + cleanCache: argv.clean === true + }); + results.components['index-build'] = bench; + if (!argv.json) { + log(`cold: ${formatMs(bench.coldMs)}`); + log(`warm: ${formatStats(bench.warm)}`); + } +} + +const indexCache = new Map(); +const sqliteCache = null; + +if (components.includes('sparse')) { + log('\n[search-sparse]'); + const bench = await runSearchBenchmark({ + repoRoot, + query: argv.query, + mode, + backend: argv.backend, + ann: false, + profile: null, + warmRuns, + warmupRuns, + indexCache, + sqliteCache + }); + results.components['search-sparse'] = bench; + if (!argv.json) { + log(`cold: ${formatMs(bench.coldMs)}`); + log(`warm: ${formatStats(bench.warm)}`); + } +} + +if (components.includes('dense')) { + log('\n[search-dense]'); + const bench = await runSearchBenchmark({ + repoRoot, + query: argv.query, + mode, + backend: argv.backend, + ann: true, + profile: null, + warmRuns, + warmupRuns, + indexCache, + sqliteCache + }); + results.components['search-dense'] = bench; + if (!argv.json) { + log(`cold: ${formatMs(bench.coldMs)}`); + log(`warm: ${formatStats(bench.warm)}`); + } +} + +if (components.includes('hybrid')) { + log('\n[search-hybrid]'); + const bench = await runSearchBenchmark({ + repoRoot, + query: argv.query, + mode, + backend: argv.backend, + ann: true, + profile: null, + warmRuns, + warmupRuns, + indexCache, + sqliteCache + }); + results.components['search-hybrid'] = bench; + if (!argv.json) { + log(`cold: ${formatMs(bench.coldMs)}`); + log(`warm: ${formatStats(bench.warm)}`); + } +} + +if (argv.out) { + const outPath = path.resolve(argv.out); + fs.writeFileSync(outPath, `${JSON.stringify(results, null, 2)}\n`); + log(`\nSaved results to ${outPath}`); +} + +if (argv.json) { + console.log(JSON.stringify(results, null, 2)); +} + +async function maybeBuildIndexes() { + if (!argv.build) return; + const indexDir = getIndexDir(repoRoot, mode); + const metaExists = hasChunkMeta(indexDir); + if (metaExists) return; + log('[setup] building indexes before search benchmarks'); + await buildIndex(repoRoot, { + mode, + threads, + incremental: true, + sqlite: argv.sqlite === true, + stubEmbeddings: argv['stub-embeddings'] !== false + }); +} + +function hasChunkMeta(indexDir) { + const json = path.join(indexDir, 'chunk_meta.json'); + const jsonl = path.join(indexDir, 'chunk_meta.jsonl'); + const meta = path.join(indexDir, 'chunk_meta.meta.json'); + return fs.existsSync(json) || fs.existsSync(jsonl) || fs.existsSync(meta); +} + +function parseComponents(value) { + if (!value) return []; + return value + .split(',') + .map((entry) => entry.trim().toLowerCase()) + .filter(Boolean); +} diff --git a/tools/bench/micro/search.js b/tools/bench/micro/search.js new file mode 100644 index 000000000..0478731f8 --- /dev/null +++ b/tools/bench/micro/search.js @@ -0,0 +1,66 @@ +import { search } from '../../../src/integrations/core/index.js'; +import { hrtimeMs, summarizeDurations } from './utils.js'; + +export async function runSearchBenchmark({ + repoRoot, + query, + mode, + backend, + ann, + profile, + warmRuns, + warmupRuns, + indexCache, + sqliteCache +}) { + const previousProfile = process.env.PAIROFCLEATS_PROFILE; + if (profile) { + process.env.PAIROFCLEATS_PROFILE = profile; + } else { + 
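+    // No profile requested: clear any inherited value so runs stay comparable.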
delete process.env.PAIROFCLEATS_PROFILE; + } + + const executeSearch = async () => { + const start = process.hrtime.bigint(); + await search(repoRoot, { + query, + mode, + backend, + ann, + json: true, + jsonCompact: true, + emitOutput: false, + indexCache, + sqliteCache + }); + return hrtimeMs(start); + }; + + try { + if (indexCache?.clear) indexCache.clear(); + if (sqliteCache?.clearAll) sqliteCache.clearAll(); + + const coldMs = await executeSearch(); + + for (let i = 0; i < warmupRuns; i += 1) { + await executeSearch(); + } + + const warmTimes = []; + for (let i = 0; i < warmRuns; i += 1) { + warmTimes.push(await executeSearch()); + } + + return { + repoRoot, + coldMs, + warm: summarizeDurations(warmTimes) + }; + } finally { + if (previousProfile !== undefined) { + process.env.PAIROFCLEATS_PROFILE = previousProfile; + } else { + delete process.env.PAIROFCLEATS_PROFILE; + } + } +} diff --git a/tools/bench/micro/tinybench.js b/tools/bench/micro/tinybench.js new file mode 100644 index 000000000..5044a7ecd --- /dev/null +++ b/tools/bench/micro/tinybench.js @@ -0,0 +1,342 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import yargs from 'yargs/yargs'; +import { hideBin } from 'yargs/helpers'; +import { Bench } from 'tinybench'; +import { build as buildHistogram } from 'hdr-histogram-js'; +import { buildIndex, search } from '../../../src/integrations/core/index.js'; +import { getIndexDir, resolveRepoRoot, resolveToolRoot } from '../../dict-utils.js'; + +const toolRoot = resolveToolRoot(); +const defaultRepo = path.resolve(toolRoot, 'tests', 'fixtures', 'sample'); + +const argv = yargs(hideBin(process.argv)) + .option('repo', { + type: 'string', + describe: 'Repo root to benchmark', + default: defaultRepo + }) + .option('mode', { + type: 'string', + describe: 'Index/search mode (code|prose)', + default: 'code' + }) + .option('backend', { + type: 'string', + describe: 'Search backend (memory|sqlite|sqlite-fts)', + default: 'memory' + }) + .option('query', { + type: 'string', + describe: 'Query used for search benchmarks', + default: 'function' + }) + .option('iterations', { + type: 'number', + describe: 'Iterations per task', + default: 64 + }) + .option('warmup-iterations', { + type: 'number', + describe: 'Warmup iterations per task', + default: 8 + }) + .option('time', { + type: 'number', + describe: 'Target runtime per task in ms', + default: 1000 + }) + .option('warmup-time', { + type: 'number', + describe: 'Warmup time per task in ms', + default: 250 + }) + .option('components', { + type: 'string', + describe: 'Comma-separated components (search-sparse,search-ann,search-dense,search-hybrid)', + default: 'search-sparse,search-ann' + }) + .option('build', { + type: 'boolean', + describe: 'Build indexes before running the bench', + default: true + }) + .option('stub-embeddings', { + type: 'boolean', + describe: 'Use stub embeddings when building indexes', + default: true + }) + .option('baseline', { + type: 'string', + describe: 'Baseline file for comparisons' + }) + .option('write-baseline', { + type: 'boolean', + describe: 'Write results to the baseline file', + default: false + }) + .option('compare', { + type: 'boolean', + describe: 'Compare results against the baseline file', + default: true + }) + .option('json', { + type: 'boolean', + describe: 'Emit JSON output only', + default: false + }) + .option('out', { + type: 'string', + describe: 'Write JSON results to a file' + }) + .help() + .argv; + +const repoRoot = 
path.resolve(argv.repo || resolveRepoRoot(process.cwd())); +const mode = argv.mode === 'prose' ? 'prose' : 'code'; +const backend = String(argv.backend || 'memory').toLowerCase(); +const components = parseComponents(argv.components); +const baselinePath = path.resolve( + argv.baseline || path.join(toolRoot, 'benchmarks', 'baselines', 'microbench.json') +); + +if (argv['stub-embeddings'] !== false) { + process.env.PAIROFCLEATS_EMBEDDINGS = 'stub'; +} else { + delete process.env.PAIROFCLEATS_EMBEDDINGS; +} + +await maybeBuildIndexes(); + +const bench = new Bench({ + name: 'pairofcleats-microbench', + iterations: Math.max(1, Math.floor(argv.iterations)), + warmupIterations: Math.max(0, Math.floor(argv['warmup-iterations'])), + time: Math.max(0, Math.floor(argv.time)), + warmupTime: Math.max(0, Math.floor(argv['warmup-time'])), + throws: true, + retainSamples: true +}); + +const indexCache = new Map(); +const sqliteCache = null; +const annConfig = { + sparse: false, + ann: true, + dense: true, + hybrid: true +}; + +for (const component of components) { + const normalized = component.toLowerCase(); + if (normalized === 'search-sparse') { + bench.add('search-sparse', () => runSearch(false)); + } else if (normalized === 'search-ann' || normalized === 'search-dense') { + bench.add(normalized, () => runSearch(annConfig.ann)); + } else if (normalized === 'search-hybrid') { + bench.add('search-hybrid', () => runSearch(annConfig.hybrid)); + } +} + +if (!bench.tasks.length) { + console.error('[tinybench] No tasks defined. Check --components.'); + process.exit(1); +} + +await bench.run(); + +const results = { + generatedAt: new Date().toISOString(), + repoRoot, + mode, + backend, + bench: { + iterations: bench.iterations, + warmupIterations: bench.warmupIterations, + timeMs: bench.time, + warmupTimeMs: bench.warmupTime + }, + env: buildEnvSnapshot(), + components: summarizeBenchTasks(bench.tasks) +}; + +const comparison = argv.compare ? 
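+// Comparison is on by default; pass --no-compare to skip baseline deltas.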
compareBaseline(results, baselinePath) : null; +if (comparison) { + results.baseline = comparison; +} + +if (argv['write-baseline']) { + ensureDir(path.dirname(baselinePath)); + fs.writeFileSync(baselinePath, `${JSON.stringify(results, null, 2)}\n`); +} + +if (argv.out) { + const outPath = path.resolve(argv.out); + ensureDir(path.dirname(outPath)); + fs.writeFileSync(outPath, `${JSON.stringify(results, null, 2)}\n`); +} + +if (argv.json) { + console.log(JSON.stringify(results, null, 2)); +} else { + printSummary(results, comparison); +} + +async function runSearch(ann) { + await search(repoRoot, { + query: argv.query, + mode, + backend, + ann, + json: true, + jsonCompact: true, + emitOutput: false, + indexCache, + sqliteCache + }); +} + +async function maybeBuildIndexes() { + if (!argv.build) return; + const indexDir = getIndexDir(repoRoot, mode); + const metaExists = hasChunkMeta(indexDir); + if (metaExists) return; + await buildIndex(repoRoot, { + mode, + incremental: true, + sqlite: backend !== 'memory', + stubEmbeddings: argv['stub-embeddings'] !== false + }); +} + +function hasChunkMeta(indexDir) { + const json = path.join(indexDir, 'chunk_meta.json'); + const jsonl = path.join(indexDir, 'chunk_meta.jsonl'); + const meta = path.join(indexDir, 'chunk_meta.meta.json'); + return fs.existsSync(json) || fs.existsSync(jsonl) || fs.existsSync(meta); +} + +function parseComponents(value) { + if (!value) return []; + return value + .split(',') + .map((entry) => entry.trim()) + .filter(Boolean); +} + +function buildEnvSnapshot() { + const cpu = os.cpus(); + return { + node: process.version, + platform: process.platform, + arch: process.arch, + cpuModel: cpu[0]?.model || 'unknown', + cpuCount: cpu.length + }; +} + +function summarizeBenchTasks(tasks) { + const entries = {}; + for (const task of tasks) { + entries[task.name] = summarizeTask(task); + } + return entries; +} + +function summarizeTask(task) { + const latency = task.result?.latency || {}; + const samples = Array.isArray(latency.samples) ? 
latency.samples : []; + const percentiles = summarizeSamples(samples); + return { + samples: latency.samplesCount || samples.length || 0, + meanMs: latency.mean || 0, + minMs: latency.min || 0, + maxMs: latency.max || 0, + p50Ms: percentiles.p50, + p95Ms: percentiles.p95, + p99Ms: percentiles.p99, + totalTimeMs: task.result?.totalTime || 0 + }; +} + +function summarizeSamples(samples) { + if (!samples.length) return { p50: 0, p95: 0, p99: 0 }; + const scaled = samples.map((value) => Math.max(1, Math.round(value * 1000))); + const maxValue = Math.max(...scaled, 1); + const histogram = buildHistogram({ + lowestDiscernibleValue: 1, + highestTrackableValue: maxValue, + numberOfSignificantValueDigits: 3 + }); + scaled.forEach((value) => histogram.recordValue(value)); + return { + p50: histogram.getValueAtPercentile(50) / 1000, + p95: histogram.getValueAtPercentile(95) / 1000, + p99: histogram.getValueAtPercentile(99) / 1000 + }; +} + +function compareBaseline(current, baselineFile) { + if (!fs.existsSync(baselineFile)) return null; + let baseline = null; + try { + baseline = JSON.parse(fs.readFileSync(baselineFile, 'utf8')); + } catch { + return null; + } + if (!baseline?.components) return null; + const deltas = {}; + for (const [name, stats] of Object.entries(current.components || {})) { + const base = baseline.components?.[name]; + if (!base) continue; + deltas[name] = { + meanPct: deltaPct(stats.meanMs, base.meanMs), + p50Pct: deltaPct(stats.p50Ms, base.p50Ms), + p95Pct: deltaPct(stats.p95Ms, base.p95Ms), + p99Pct: deltaPct(stats.p99Ms, base.p99Ms) + }; + } + return { + path: baselineFile, + deltas + }; +} + +function deltaPct(current, baseline) { + if (!Number.isFinite(current) || !Number.isFinite(baseline) || baseline === 0) return null; + return ((current - baseline) / baseline) * 100; +} + +function formatMs(value) { + if (!Number.isFinite(value)) return 'n/a'; + return `${value.toFixed(1)}ms`; +} + +function formatDelta(value) { + if (!Number.isFinite(value)) return 'n/a'; + const sign = value >= 0 ? 
'+' : ''; + return `${sign}${value.toFixed(1)}%`; +} + +function printSummary(results, comparison) { + console.log('[tinybench] Results'); + for (const [name, stats] of Object.entries(results.components || {})) { + console.log(`- ${name}: mean ${formatMs(stats.meanMs)} | p50 ${formatMs(stats.p50Ms)} | p95 ${formatMs(stats.p95Ms)} | p99 ${formatMs(stats.p99Ms)} | n=${stats.samples}`); + if (comparison?.deltas?.[name]) { + const delta = comparison.deltas[name]; + console.log(` delta: mean ${formatDelta(delta.meanPct)} | p50 ${formatDelta(delta.p50Pct)} | p95 ${formatDelta(delta.p95Pct)} | p99 ${formatDelta(delta.p99Pct)}`); + } + } + if (argv['write-baseline']) { + console.log(`- baseline saved: ${baselinePath}`); + } else if (comparison?.path) { + console.log(`- baseline: ${comparison.path}`); + } +} + +function ensureDir(dir) { + if (!dir) return; + fs.mkdirSync(dir, { recursive: true }); +} diff --git a/tools/bench/micro/utils.js b/tools/bench/micro/utils.js new file mode 100644 index 000000000..f0d6fb6f0 --- /dev/null +++ b/tools/bench/micro/utils.js @@ -0,0 +1,47 @@ +import { build as buildHistogram } from 'hdr-histogram-js'; + +const buildLatencyHistogram = (values) => { + if (!values.length) return null; + const scaled = values.map((value) => Math.max(1, Math.round(value * 1000))); + const maxValue = Math.max(...scaled, 1); + const histogram = buildHistogram({ + lowestDiscernibleValue: 1, + highestTrackableValue: maxValue, + numberOfSignificantValueDigits: 3 + }); + scaled.forEach((value) => histogram.recordValue(value)); + return histogram; +}; + +export function summarizeDurations(values) { + if (!values.length) { + return { count: 0, mean: 0, min: 0, max: 0, p50: 0, p95: 0, p99: 0 }; + } + const total = values.reduce((sum, value) => sum + value, 0); + const min = Math.min(...values); + const max = Math.max(...values); + const histogram = buildLatencyHistogram(values); + const pct = (p) => (histogram ? 
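+  // Percentiles come from the HDR histogram, rescaled from 1/1000 ms units back to ms.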
histogram.getValueAtPercentile(p) / 1000 : 0); + return { + count: values.length, + mean: total / values.length, + min, + max, + p50: pct(50), + p95: pct(95), + p99: pct(99) + }; +} + +export function formatMs(value) { + if (!Number.isFinite(value)) return 'n/a'; + return `${value.toFixed(1)}ms`; +} + +export function formatStats(stats) { + return `mean ${formatMs(stats.mean)} | p50 ${formatMs(stats.p50)} | p95 ${formatMs(stats.p95)} | p99 ${formatMs(stats.p99)} | min ${formatMs(stats.min)} | max ${formatMs(stats.max)} | n=${stats.count}`; +} + +export function hrtimeMs(start) { + return Number(process.hrtime.bigint() - start) / 1e6; +} diff --git a/tools/bootstrap.js b/tools/bootstrap.js index 32155368b..c870d13c7 100644 --- a/tools/bootstrap.js +++ b/tools/bootstrap.js @@ -1,34 +1,35 @@ #!/usr/bin/env node import fs from 'node:fs'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { runCommand, runCommandOrExit } from './cli-utils.js'; -import { getDictionaryPaths, getDictConfig, getRepoCacheRoot, getToolingConfig, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; +import { getDictionaryPaths, getDictConfig, getRepoCacheRoot, getRuntimeConfig, getToolingConfig, loadUserConfig, resolveRepoRoot, resolveRuntimeEnv, resolveToolRoot } from './dict-utils.js'; import { getVectorExtensionConfig, resolveVectorExtensionPath } from './vector-extension.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['skip-install', 'skip-dicts', 'skip-index', 'with-sqlite', 'incremental', 'skip-artifacts', 'skip-tooling', 'validate-config'], - string: ['repo'], - alias: { s: 'with-sqlite', i: 'incremental' }, - default: { - 'skip-install': false, - 'skip-dicts': false, - 'skip-index': false, - 'with-sqlite': false, - 'incremental': false, - 'skip-artifacts': false, - 'skip-tooling': false, - 'validate-config': false - } -}); +const argv = createCli({ + scriptName: 'bootstrap', + options: { + 'skip-install': { type: 'boolean', default: false }, + 'skip-dicts': { type: 'boolean', default: false }, + 'skip-index': { type: 'boolean', default: false }, + 'with-sqlite': { type: 'boolean', default: false }, + incremental: { type: 'boolean', default: false }, + 'skip-artifacts': { type: 'boolean', default: false }, + 'skip-tooling': { type: 'boolean', default: false }, + 'validate-config': { type: 'boolean', default: false }, + repo: { type: 'string' } + }, + aliases: { s: 'with-sqlite', i: 'incremental' } +}).parse(); const rootArg = argv.repo ? 
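+// An explicit --repo overrides cwd-based repo detection.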
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); +const toolRoot = resolveToolRoot(); const configPath = path.join(root, '.pairofcleats.json'); if (argv['validate-config'] && fs.existsSync(configPath)) { const result = runCommand( process.execPath, - [path.join('tools', 'validate-config.js'), '--config', configPath], + [path.join(toolRoot, 'tools', 'validate-config.js'), '--config', configPath], { cwd: root, stdio: 'inherit' } ); if (!result.ok) { @@ -37,6 +38,8 @@ if (argv['validate-config'] && fs.existsSync(configPath)) { } const userConfig = loadUserConfig(root); +const runtimeConfig = getRuntimeConfig(root, userConfig); +const baseEnv = resolveRuntimeEnv(runtimeConfig, process.env); const vectorExtension = getVectorExtensionConfig(root, userConfig); const repoCacheRoot = getRepoCacheRoot(root, userConfig); const incrementalCacheRoot = path.join(repoCacheRoot, 'incremental'); @@ -54,7 +57,7 @@ let restoredArtifacts = false; * @param {string} label */ function run(cmd, args, label) { - runCommandOrExit(label || cmd, cmd, args, { cwd: root, stdio: 'inherit' }); + runCommandOrExit(label || cmd, cmd, args, { cwd: root, stdio: 'inherit', env: baseEnv }); } if (!argv['skip-install']) { @@ -68,7 +71,7 @@ if (!argv['skip-dicts']) { const dictConfig = getDictConfig(root, userConfig); const englishPath = path.join(dictConfig.dir, 'en.txt'); if (!fs.existsSync(englishPath)) { - run(process.execPath, [path.join('tools', 'download-dicts.js'), '--lang', 'en'], 'download English dictionary'); + run(process.execPath, [path.join(toolRoot, 'tools', 'download-dicts.js'), '--lang', 'en'], 'download English dictionary'); } const dictionaryPaths = await getDictionaryPaths(root, dictConfig); if (dictionaryPaths.length) { @@ -91,8 +94,8 @@ if (!argv['skip-tooling']) { const toolingConfig = getToolingConfig(root, userConfig); const detectResult = runCommand( process.execPath, - [path.join('tools', 'tooling-detect.js'), '--root', root, '--json'], - { cwd: root, encoding: 'utf8', stdio: 'pipe' } + [path.join(toolRoot, 'tools', 'tooling-detect.js'), '--root', root, '--json'], + { cwd: root, encoding: 'utf8', stdio: 'pipe', env: baseEnv } ); if (detectResult.status === 0 && detectResult.stdout) { try { @@ -101,7 +104,7 @@ if (!argv['skip-tooling']) { ? 
report.tools.filter((tool) => tool && tool.found === false) : []; if (toolingConfig.autoInstallOnDetect && missingTools.length) { - const installArgs = [path.join('tools', 'tooling-install.js'), '--root', root, '--scope', toolingConfig.installScope]; + const installArgs = [path.join(toolRoot, 'tools', 'tooling-install.js'), '--root', root, '--scope', toolingConfig.installScope]; if (!toolingConfig.allowGlobalFallback) installArgs.push('--no-fallback'); run(process.execPath, installArgs, 'install tooling'); } else if (missingTools.length) { @@ -118,22 +121,23 @@ if (!argv['skip-tooling']) { if (!argv['skip-artifacts'] && fs.existsSync(path.join(artifactsDir, 'manifest.json'))) { const result = runCommand( process.execPath, - [path.join('tools', 'ci-restore-artifacts.js'), '--from', artifactsDir], - { cwd: root, stdio: 'inherit' } + [path.join(toolRoot, 'tools', 'ci-restore-artifacts.js'), '--from', artifactsDir], + { cwd: root, stdio: 'inherit', env: baseEnv } ); restoredArtifacts = result.ok; } if (!argv['skip-index'] && !restoredArtifacts) { - const indexArgs = ['build_index.js']; + const indexArgs = [path.join(toolRoot, 'build_index.js')]; if (useIncremental) indexArgs.push('--incremental'); run(process.execPath, indexArgs, 'build index'); } if (argv['with-sqlite']) { - const sqliteArgs = [path.join('tools', 'build-sqlite-index.js')]; + const sqliteArgs = [path.join(toolRoot, 'tools', 'build-sqlite-index.js')]; if (useIncremental) sqliteArgs.push('--incremental'); run(process.execPath, sqliteArgs, 'build sqlite index'); } +console.log('[bootstrap] Tip: run npm run index-validate to verify index artifacts.'); console.log('\nBootstrap complete.'); diff --git a/tools/build-embeddings.js b/tools/build-embeddings.js new file mode 100644 index 000000000..99dfa8ad0 --- /dev/null +++ b/tools/build-embeddings.js @@ -0,0 +1,12 @@ +#!/usr/bin/env node +import { fileURLToPath } from 'node:url'; +import { runBuildEmbeddings } from './build-embeddings/run.js'; + +export { runBuildEmbeddings }; + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + runBuildEmbeddings().catch((err) => { + console.error(err?.message || err); + process.exit(1); + }); +} diff --git a/tools/build-embeddings/atomic.js b/tools/build-embeddings/atomic.js new file mode 100644 index 000000000..254e293a2 --- /dev/null +++ b/tools/build-embeddings/atomic.js @@ -0,0 +1,89 @@ +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; + +export const createTempPath = (filePath) => { + const suffix = `.tmp-${process.pid}-${Date.now()}-${Math.random().toString(16).slice(2, 8)}`; + const tempPath = `${filePath}${suffix}`; + if (process.platform !== 'win32' || tempPath.length <= 240) { + return tempPath; + } + const dir = path.dirname(filePath); + const ext = path.extname(filePath) || '.bin'; + const shortName = `.tmp-${Math.random().toString(16).slice(2, 10)}${ext}`; + return path.join(dir, shortName); +}; + +export const replaceFile = async (tempPath, finalPath) => { + const bakPath = `${finalPath}.bak`; + const finalExists = fsSync.existsSync(finalPath); + let backupAvailable = fsSync.existsSync(bakPath); + if (finalExists && !backupAvailable) { + try { + await fs.rename(finalPath, bakPath); + backupAvailable = true; + } catch (err) { + if (err?.code !== 'ENOENT') { + backupAvailable = fsSync.existsSync(bakPath); + } + } + } + try { + await fs.rename(tempPath, finalPath); + } catch (err) { + if (err?.code !== 'EEXIST' && err?.code !== 'EPERM' && err?.code !== 'ENOTEMPTY') { + throw err; + 
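+      // Windows-style rename collisions (EEXIST/EPERM/ENOTEMPTY) fall through
+      // to the remove-and-retry path below.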
} + if (!backupAvailable) { + throw err; + } + try { + await fs.rm(finalPath, { force: true }); + } catch {} + await fs.rename(tempPath, finalPath); + } +}; + +/** + * Replace a file atomically without creating a .bak. This is intended for + * ephemeral cache entries where retaining backups would create excessive churn. + */ +export const replaceFileNoBak = async (tempPath, finalPath) => { + const copyFallback = async () => { + try { + await fs.copyFile(tempPath, finalPath); + await fs.rm(tempPath, { force: true }); + return true; + } catch { + return false; + } + }; + + try { + await fs.rename(tempPath, finalPath); + return; + } catch (err) { + if (err?.code === 'EXDEV') { + if (await copyFallback()) return; + throw err; + } + if (err?.code !== 'EEXIST' + && err?.code !== 'EPERM' + && err?.code !== 'ENOTEMPTY' + && err?.code !== 'EACCES') { + throw err; + } + } + + try { + await fs.rm(finalPath, { force: true }); + } catch {} + try { + await fs.rename(tempPath, finalPath); + } catch (err) { + if (err?.code === 'EXDEV') { + if (await copyFallback()) return; + } + throw err; + } +}; diff --git a/tools/build-embeddings/cache.js b/tools/build-embeddings/cache.js new file mode 100644 index 000000000..d53909fcf --- /dev/null +++ b/tools/build-embeddings/cache.js @@ -0,0 +1,79 @@ +import path from 'node:path'; +import { sha1 } from '../../src/shared/hash.js'; + +// Keep in sync with src/index/embedding.js defaults. +const DEFAULT_POOLING = 'mean'; +const DEFAULT_NORMALIZE = true; +const DEFAULT_TRUNCATION = true; +const DEFAULT_QUANT_MIN = -1; +const DEFAULT_QUANT_MAX = 1; +const DEFAULT_QUANT_LEVELS = 256; + +export const buildCacheIdentity = ({ + modelId, + provider, + mode, + stub, + dims, + scale, + onnx, + preprocess, + quantization +} = {}) => { + const providerValue = provider || null; + const resolvedPreprocess = preprocess && typeof preprocess === 'object' ? preprocess : {}; + const resolvedQuant = quantization && typeof quantization === 'object' ? quantization : {}; + const resolvedOnnx = onnx && typeof onnx === 'object' ? onnx : null; + + const identity = { + // Bump to invalidate caches when embedding semantics change. + version: 2, + modelId: modelId || null, + provider: providerValue, + mode: mode || null, + stub: stub === true, + dims: dims ?? null, + scale, + preprocess: { + pooling: resolvedPreprocess.pooling ?? DEFAULT_POOLING, + normalize: resolvedPreprocess.normalize ?? DEFAULT_NORMALIZE, + truncation: resolvedPreprocess.truncation ?? DEFAULT_TRUNCATION, + // Reserved for future use (explicit max_length / tokenizer policy). + maxLength: resolvedPreprocess.maxLength ?? null + }, + quantization: { + // Allows future changes (e.g., asymmetric / per-channel / float16) to invalidate caches. + version: resolvedQuant.version ?? 1, + minVal: resolvedQuant.minVal ?? DEFAULT_QUANT_MIN, + maxVal: resolvedQuant.maxVal ?? DEFAULT_QUANT_MAX, + levels: resolvedQuant.levels ?? DEFAULT_QUANT_LEVELS + }, + onnx: providerValue === 'onnx' && resolvedOnnx ? { + modelPath: resolvedOnnx.modelPath ?? null, + tokenizerId: resolvedOnnx.tokenizerId ?? null, + executionProviders: resolvedOnnx.executionProviders ?? null, + intraOpNumThreads: resolvedOnnx.intraOpNumThreads ?? null, + interOpNumThreads: resolvedOnnx.interOpNumThreads ?? null, + graphOptimizationLevel: resolvedOnnx.graphOptimizationLevel ?? 
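+      // Hedged note: every ONNX knob above feeds the sha1 identity key
+      // computed below, so flipping threads, execution providers, or the
+      // graph level forces a re-embed instead of reusing cached vectors.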
null + } : null + }; + const key = sha1(JSON.stringify(identity)); + return { identity, key }; +}; + +export const resolveCacheRoot = ({ repoCacheRoot, cacheDirConfig }) => { + if (cacheDirConfig) return path.resolve(cacheDirConfig); + return path.join(repoCacheRoot, 'embeddings'); +}; + +export const resolveCacheDir = (cacheRoot, mode) => path.join(cacheRoot, mode, 'files'); + +export const buildCacheKey = ({ file, hash, signature, identityKey }) => { + if (!hash) return null; + return sha1(`${file}:${hash}:${signature}:${identityKey}`); +}; + +export const isCacheValid = ({ cached, signature, identityKey }) => { + if (!cached || cached.chunkSignature !== signature) return false; + return cached.cacheMeta?.identityKey === identityKey; +}; diff --git a/tools/build-embeddings/chunks.js b/tools/build-embeddings/chunks.js new file mode 100644 index 000000000..eb41aa079 --- /dev/null +++ b/tools/build-embeddings/chunks.js @@ -0,0 +1,76 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import { + normalizeBundleFormat, + readBundleFile, + resolveBundleFilename, + resolveBundleFormatFromName +} from '../../src/shared/bundle-io.js'; +import { sha1 } from '../../src/shared/hash.js'; + +export const buildChunkSignature = (items) => sha1( + items.map(({ chunk }) => `${chunk.start}:${chunk.end}`).join('|') +); + +export const buildChunksFromBundles = async (bundleDir, manifestFiles, bundleFormat) => { + const resolvedBundleFormat = normalizeBundleFormat(bundleFormat); + const chunksByFile = new Map(); + let maxChunkId = -1; + let total = 0; + // Ensure deterministic chunk ordering regardless of JSON object insertion order. + const manifestEntries = Object.entries(manifestFiles || {}).sort(([a], [b]) => ( + a < b ? -1 : (a > b ? 1 : 0) + )); + for (const [relPath, entry] of manifestEntries) { + const bundleName = entry?.bundle || resolveBundleFilename(relPath, resolvedBundleFormat); + const bundlePath = path.join(bundleDir, bundleName); + if (!fsSync.existsSync(bundlePath)) continue; + let bundle; + try { + const result = await readBundleFile(bundlePath, { + format: resolveBundleFormatFromName(bundleName, resolvedBundleFormat) + }); + if (!result.ok) continue; + bundle = result.bundle; + } catch { + continue; + } + const filePath = bundle?.file || relPath; + const chunks = Array.isArray(bundle?.chunks) ? bundle.chunks : []; + if (!chunks.length) continue; + const list = chunksByFile.get(filePath) || []; + for (const chunk of chunks) { + if (!chunk) continue; + const id = Number.isFinite(chunk.id) ? chunk.id : null; + if (Number.isFinite(id) && id > maxChunkId) maxChunkId = id; + list.push({ index: Number.isFinite(id) ? id : null, chunk }); + total += 1; + } + chunksByFile.set(filePath, list); + } + if (!chunksByFile.size) { + return { chunksByFile, totalChunks: 0 }; + } + let totalChunks = maxChunkId >= 0 ? 
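+  // Bundle-assigned chunk ids win when present; otherwise fall back to the
+  // raw count and assign sequential ids below.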
maxChunkId + 1 : total; + if (maxChunkId < 0) { + let next = 0; + for (const list of chunksByFile.values()) { + for (const item of list) { + item.index = next; + next += 1; + } + } + totalChunks = next; + } else { + let next = maxChunkId + 1; + for (const list of chunksByFile.values()) { + for (const item of list) { + if (Number.isFinite(item.index)) continue; + item.index = next; + next += 1; + } + } + totalChunks = Math.max(totalChunks, next); + } + return { chunksByFile, totalChunks }; +}; diff --git a/tools/build-embeddings/cli.js b/tools/build-embeddings/cli.js new file mode 100644 index 000000000..5e7d29bb2 --- /dev/null +++ b/tools/build-embeddings/cli.js @@ -0,0 +1,91 @@ +import os from 'node:os'; +import path from 'node:path'; +import { createCli } from '../../src/shared/cli.js'; +import { getEnvConfig } from '../../src/shared/env.js'; +import { normalizeEmbeddingProvider, normalizeOnnxConfig } from '../../src/shared/onnx-embeddings.js'; +import { normalizeHnswConfig } from '../../src/shared/hnsw.js'; +import { getModelConfig, loadUserConfig, resolveIndexRoot, resolveRepoRoot } from '../dict-utils.js'; + +export const parseBuildEmbeddingsArgs = (rawArgs = process.argv.slice(2)) => { + const argv = createCli({ + scriptName: 'build-embeddings', + argv: ['node', 'build-embeddings.js', ...(rawArgs || [])], + options: { + mode: { type: 'string', default: 'all' }, + repo: { type: 'string' }, + dims: { type: 'number' }, + batch: { type: 'number' }, + 'stub-embeddings': { type: 'boolean', default: false }, + 'index-root': { type: 'string' } + } + }).parse(); + + const root = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); + const userConfig = loadUserConfig(root); + const envConfig = getEnvConfig(); + const indexingConfig = userConfig.indexing || {}; + const embeddingsConfig = indexingConfig.embeddings || {}; + const embeddingProvider = normalizeEmbeddingProvider(embeddingsConfig.provider); + const embeddingOnnx = normalizeOnnxConfig(embeddingsConfig.onnx || {}); + const hnswConfig = normalizeHnswConfig(embeddingsConfig.hnsw || {}); + + const embeddingModeRaw = typeof embeddingsConfig.mode === 'string' + ? embeddingsConfig.mode.trim().toLowerCase() + : 'auto'; + const baseStubEmbeddings = argv['stub-embeddings'] === true + || envConfig.embeddings === 'stub'; + const normalizedEmbeddingMode = ['auto', 'inline', 'service', 'stub', 'off'].includes(embeddingModeRaw) + ? embeddingModeRaw + : 'auto'; + const resolvedEmbeddingMode = normalizedEmbeddingMode === 'auto' + ? (baseStubEmbeddings ? 'stub' : 'inline') + : (normalizedEmbeddingMode === 'service' + ? (baseStubEmbeddings ? 'stub' : 'inline') + : normalizedEmbeddingMode); + + const embeddingBatchRaw = Number(argv.batch ?? indexingConfig.embeddingBatchSize); + let embeddingBatchSize = Number.isFinite(embeddingBatchRaw) + ? Math.max(0, Math.floor(embeddingBatchRaw)) + : 0; + if (!embeddingBatchSize) { + const totalGb = os.totalmem() / (1024 ** 3); + const autoBatch = Math.floor(totalGb * 32); + embeddingBatchSize = Math.min(128, Math.max(32, autoBatch)); + } + + const useStubEmbeddings = resolvedEmbeddingMode === 'stub' || baseStubEmbeddings; + const configuredDims = Number.isFinite(Number(argv.dims)) + ? Math.max(1, Math.floor(Number(argv.dims))) + : null; + + const modelConfig = getModelConfig(root, userConfig); + const indexRoot = argv['index-root'] + ? 
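+    // An explicit --index-root always wins over the config-derived location.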
path.resolve(argv['index-root'])
+    : resolveIndexRoot(root, userConfig);
+
+  const embedModeRaw = (argv.mode || 'all').toLowerCase();
+  const embedMode = embedModeRaw === 'both' ? 'all' : embedModeRaw;
+  const modes = embedMode === 'all' ? ['code', 'prose'] : [embedMode];
+
+  return {
+    argv,
+    root,
+    userConfig,
+    envConfig,
+    indexingConfig,
+    embeddingsConfig,
+    embeddingProvider,
+    embeddingOnnx,
+    hnswConfig,
+    normalizedEmbeddingMode,
+    resolvedEmbeddingMode,
+    useStubEmbeddings,
+    embeddingBatchSize,
+    configuredDims,
+    modelConfig,
+    modelId: modelConfig.id,
+    modelsDir: modelConfig.dir || null,
+    indexRoot,
+    modes
+  };
+};
diff --git a/tools/build-embeddings/embed.js b/tools/build-embeddings/embed.js
new file mode 100644
index 000000000..5949a3097
--- /dev/null
+++ b/tools/build-embeddings/embed.js
@@ -0,0 +1,90 @@
+import { normalizeVec, quantizeVec } from '../../src/index/embedding.js';
+
+export const runBatched = async ({ texts, batchSize, embed }) => {
+  if (!texts.length) return [];
+  if (!batchSize || texts.length <= batchSize) {
+    return embed(texts);
+  }
+  const out = [];
+  for (let i = 0; i < texts.length; i += batchSize) {
+    const slice = texts.slice(i, i + batchSize);
+    const batch = await embed(slice);
+    out.push(...batch);
+  }
+  return out;
+};
+
+export const ensureVectorArrays = (vectors, count) => {
+  if (Array.isArray(vectors) && vectors.length === count) return vectors;
+  const out = [];
+  for (let i = 0; i < count; i += 1) {
+    out.push(Array.isArray(vectors?.[i]) ? vectors[i] : []);
+  }
+  return out;
+};
+
+export const createDimsValidator = ({ mode, configuredDims }) => {
+  let dims = 0;
+  const assertDims = (length) => {
+    if (!length) return;
+    if (configuredDims && configuredDims !== length) {
+      throw new Error(
+        `[embeddings] ${mode} embedding dims mismatch (configured=${configuredDims}, observed=${length}).`
+      );
+    }
+    if (dims && dims !== length) {
+      throw new Error(
+        `[embeddings] ${mode} embedding dims mismatch (expected=${dims} from earlier chunks, observed=${length}).`
+      );
+    }
+    if (!dims) dims = length;
+  };
+  const getDims = () => dims;
+  return { assertDims, getDims };
+};
+
+export const isDimsMismatch = (err) =>
+  err?.message?.includes('embedding dims mismatch');
+
+export const validateCachedDims = ({ vectors, expectedDims, mode }) => {
+  if (!expectedDims || !Array.isArray(vectors)) return;
+  for (const vec of vectors) {
+    if (!Array.isArray(vec) || !vec.length) continue;
+    if (vec.length !== expectedDims) {
+      throw new Error(
+        `[embeddings] ${mode} embedding dims mismatch (configured=${expectedDims}, observed=${vec.length}).`
+      );
+    }
+  }
+};
+
+export const buildQuantizedVectors = ({
+  chunkIndex,
+  codeVector,
+  docVector,
+  zeroVector,
+  addHnswVector
+}) => {
+  const embedCode = Array.isArray(codeVector) ? codeVector : [];
+  const embedDoc = Array.isArray(docVector) ? docVector : zeroVector;
+  const merged = embedCode.length
+    ? embedCode.map((value, idx) => (value + (embedDoc[idx] ?? 0)) / 2)
+    : embedDoc;
+  const normalized = normalizeVec(merged);
+  if (addHnswVector && normalized.length) {
+    addHnswVector(chunkIndex, normalized);
+  }
+  const quantizedCode = embedCode.length ? quantizeVec(embedCode) : [];
+  const quantizedDoc = embedDoc.length ? quantizeVec(embedDoc) : [];
+  const quantizedMerged = normalized.length ? quantizeVec(normalized) : [];
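+  // Sketch of the merge math above (assuming the quantization defaults
+  // mirrored in cache.js still hold): merged is the element-wise mean of the
+  // code and doc embeddings, re-normalized so cosine ordering survives the
+  // uint8 round trip, then quantizeVec maps [-1, 1] onto 256 levels, e.g.
+  //   mean([0.2, -0.4], [0.6, 0.0]) -> [0.4, -0.2] -> normalize -> uint8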
+  return { quantizedCode, quantizedDoc, quantizedMerged };
+};
+
+export const fillMissingVectors = (vectorList, dims) => {
+  // Missing slots intentionally alias one shared zero-filled array; downstream
+  // code only serializes these vectors, so the aliasing is safe.
+  const fallback = new Array(dims).fill(0);
+  for (let i = 0; i < vectorList.length; i += 1) {
+    if (!Array.isArray(vectorList[i]) || vectorList[i].length !== dims) {
+      vectorList[i] = fallback;
+    }
+  }
+};
diff --git a/tools/build-embeddings/hnsw.js b/tools/build-embeddings/hnsw.js
new file mode 100644
index 000000000..a27331b8b
--- /dev/null
+++ b/tools/build-embeddings/hnsw.js
@@ -0,0 +1,78 @@
+import fs from 'node:fs/promises';
+import hnswlib from 'hnswlib-node';
+import { writeJsonObjectFile } from '../../src/shared/json-stream.js';
+import { createTempPath, replaceFile } from './atomic.js';
+
+const { HierarchicalNSW } = hnswlib?.default || hnswlib || {};
+
+export const createHnswBuilder = ({ enabled, config, totalChunks, mode }) => {
+  let index = null;
+  let added = 0;
+  let expected = 0;
+
+  const initHnsw = (vector) => {
+    if (!enabled || index || !Array.isArray(vector) || !vector.length) return;
+    if (!HierarchicalNSW) return;
+    index = new HierarchicalNSW(config.space, vector.length);
+    index.initIndex({
+      maxElements: totalChunks,
+      m: config.m,
+      efConstruction: config.efConstruction,
+      randomSeed: config.randomSeed,
+      allowReplaceDeleted: config.allowReplaceDeleted
+    });
+  };
+
+  const addVector = (chunkIndex, vector) => {
+    if (!enabled || !vector || !vector.length) return;
+    const data = Array.isArray(vector) ? vector : Array.from(vector);
+    initHnsw(data);
+    if (!index) return;
+    expected += 1;
+    try {
+      index.addPoint(data, chunkIndex);
+      added += 1;
+    } catch {
+      // Swallow the insert error here; writeIndex reconciles added vs expected
+      // and fails the build on any mismatch.
+    }
+  };
+
+  const writeIndex = async ({ indexPath, metaPath, modelId, dims }) => {
+    if (!enabled || !index || !expected) return { skipped: true };
+    if (expected !== added) {
+      throw new Error(`HNSW insert count mismatch (${added} of ${expected}).`);
+    }
+    const tempHnswPath = createTempPath(indexPath);
+    try {
+      index.writeIndexSync(tempHnswPath);
+      await replaceFile(tempHnswPath, indexPath);
+    } catch (err) {
+      try {
+        await fs.rm(tempHnswPath, { force: true });
+      } catch {}
+      throw err;
+    }
+    const hnswMeta = {
+      version: 1,
+      generatedAt: new Date().toISOString(),
+      model: modelId || null,
+      dims,
+      count: added,
+      expectedCount: expected,
+      space: config.space,
+      m: config.m,
+      efConstruction: config.efConstruction,
+      efSearch: config.efSearch
+    };
+    await writeJsonObjectFile(metaPath, { fields: hnswMeta, atomic: true });
+    return { skipped: false, count: added };
+  };
+
+  const getStats = () => ({ added, expected, ready: !!index });
+
+  return {
+    addVector,
+    writeIndex,
+    getStats
+  };
+};
diff --git a/tools/build-embeddings/lancedb.js b/tools/build-embeddings/lancedb.js
new file mode 100644
index 000000000..72dc367cc
--- /dev/null
+++ b/tools/build-embeddings/lancedb.js
@@ -0,0 +1,142 @@
+import fs from 'node:fs/promises';
+import fsSync from 'node:fs';
+import { tryImport } from '../../src/shared/optional-deps.js';
+import { writeJsonObjectFile } from '../../src/shared/json-stream.js';
+import { dequantizeUint8ToFloat32 } from '../../src/storage/sqlite/vector.js';
+import { normalizeLanceDbConfig, resolveLanceDbPaths } from '../../src/shared/lancedb.js';
+
+let warnedMissing = false;
+
+const loadLanceDb = async () => {
+  const result = await tryImport('@lancedb/lancedb');
+  if (!result.ok) {
+    if (!warnedMissing) {
+      warnedMissing = true;
+      console.warn('[embeddings] LanceDB unavailable; skipping LanceDB build.');
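+      // Warn only once per process; later callers just receive null.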
+ } + return null; + } + return result.mod?.default || result.mod; +}; + +const createTable = async (db, tableName, rows) => { + if (!db || typeof db.createTable !== 'function') return null; + return db.createTable(tableName, rows, { mode: 'overwrite' }); +}; + +const addRows = async (table, rows) => { + if (!table) return; + if (typeof table.add === 'function') { + await table.add(rows); + return; + } + if (typeof table.insert === 'function') { + await table.insert(rows); + return; + } + if (typeof table.append === 'function') { + await table.append(rows); + } +}; + +const buildBatch = (vectors, start, end, idColumn, embeddingColumn) => { + const rows = []; + for (let i = start; i < end; i += 1) { + const vec = vectors[i]; + if (!vec || typeof vec.length !== 'number') continue; + const floatVec = dequantizeUint8ToFloat32(vec); + if (!floatVec) continue; + rows.push({ + [idColumn]: i, + [embeddingColumn]: floatVec + }); + } + return rows; +}; + +export async function writeLanceDbIndex({ + indexDir, + variant, + vectors, + dims, + modelId, + config, + emitOutput = true, + label = null +}) { + const resolvedConfig = normalizeLanceDbConfig(config); + if (!resolvedConfig.enabled) return { skipped: true, reason: 'disabled' }; + if (!Array.isArray(vectors) || !vectors.length) { + return { skipped: true, reason: 'empty' }; + } + const lancedb = await loadLanceDb(); + if (!lancedb) return { skipped: true, reason: 'missing dependency' }; + + const paths = resolveLanceDbPaths(indexDir); + const target = paths[variant]; + if (!target) return { skipped: true, reason: 'unknown variant' }; + const dir = target.dir; + const metaPath = target.metaPath; + + try { + if (fsSync.existsSync(dir)) { + await fs.rm(dir, { recursive: true, force: true }); + } + } catch {} + + const connect = lancedb.connect || lancedb.default?.connect; + if (typeof connect !== 'function') { + return { skipped: true, reason: 'invalid module' }; + } + + const db = await connect(dir); + const tableName = resolvedConfig.table; + const idColumn = resolvedConfig.idColumn; + const embeddingColumn = resolvedConfig.embeddingColumn; + const batchSize = Math.max(1, Math.floor(resolvedConfig.batchSize || 1024)); + + let table = null; + try { + const firstBatch = buildBatch(vectors, 0, Math.min(batchSize, vectors.length), idColumn, embeddingColumn); + table = await createTable(db, tableName, firstBatch); + if (!table && typeof db.openTable === 'function') { + table = await db.openTable(tableName); + if (firstBatch.length) await addRows(table, firstBatch); + } + for (let start = batchSize; start < vectors.length; start += batchSize) { + const rows = buildBatch( + vectors, + start, + Math.min(start + batchSize, vectors.length), + idColumn, + embeddingColumn + ); + if (rows.length) { + await addRows(table, rows); + } + } + } finally { + if (db?.close) { + await db.close(); + } + } + + const meta = { + version: 1, + generatedAt: new Date().toISOString(), + model: modelId || null, + dims: Number.isFinite(Number(dims)) ? 
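+    // The meta sidecar lets readers sanity-check table shape and provenance
+    // without opening the LanceDB dataset itself.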
Number(dims) : null, + count: vectors.length, + metric: resolvedConfig.metric, + table: tableName, + embeddingColumn, + idColumn + }; + await writeJsonObjectFile(metaPath, { fields: meta, atomic: true }); + + if (emitOutput) { + const targetLabel = label || variant; + console.log(`[embeddings] ${targetLabel}: wrote LanceDB table (${vectors.length} vectors).`); + } + return { skipped: false, count: vectors.length }; +} diff --git a/tools/build-embeddings/manifest.js b/tools/build-embeddings/manifest.js new file mode 100644 index 000000000..b3bbf1a81 --- /dev/null +++ b/tools/build-embeddings/manifest.js @@ -0,0 +1,82 @@ +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import { MAX_JSON_BYTES, readJsonFile } from '../../src/shared/artifact-io.js'; +import { writeJsonObjectFile } from '../../src/shared/json-stream.js'; +import { checksumFile } from '../../src/shared/hash.js'; + +export const updatePieceManifest = async ({ indexDir, mode, totalChunks, dims }) => { + const piecesDir = path.join(indexDir, 'pieces'); + const manifestPath = path.join(piecesDir, 'manifest.json'); + let existing = {}; + if (fsSync.existsSync(manifestPath)) { + try { + existing = readJsonFile(manifestPath, { maxBytes: MAX_JSON_BYTES }) || {}; + } catch { + existing = {}; + } + } + const priorPieces = Array.isArray(existing.pieces) ? existing.pieces : []; + const retained = []; + for (const entry of priorPieces) { + if (!entry || entry.type === 'embeddings') continue; + if (entry.path === 'index_state.json') { + const absPath = path.join(indexDir, entry.path.split('/').join(path.sep)); + let bytes = null; + let checksum = null; + let checksumAlgo = null; + try { + const stat = await fs.stat(absPath); + bytes = stat.size; + const result = await checksumFile(absPath); + checksum = result?.value || null; + checksumAlgo = result?.algo || null; + } catch {} + retained.push({ + ...entry, + bytes, + checksum: checksum && checksumAlgo ? `${checksumAlgo}:${checksum}` : null + }); + continue; + } + retained.push(entry); + } + const embeddingPieces = [ + { type: 'embeddings', name: 'dense_vectors', format: 'json', path: 'dense_vectors_uint8.json', count: totalChunks, dims }, + { type: 'embeddings', name: 'dense_vectors_doc', format: 'json', path: 'dense_vectors_doc_uint8.json', count: totalChunks, dims }, + { type: 'embeddings', name: 'dense_vectors_code', format: 'json', path: 'dense_vectors_code_uint8.json', count: totalChunks, dims }, + { type: 'embeddings', name: 'dense_vectors_hnsw', format: 'bin', path: 'dense_vectors_hnsw.bin', count: totalChunks, dims }, + { type: 'embeddings', name: 'dense_vectors_hnsw_meta', format: 'json', path: 'dense_vectors_hnsw.meta.json', count: totalChunks, dims } + ]; + const enriched = []; + for (const entry of embeddingPieces) { + const absPath = path.join(indexDir, entry.path); + if (!fsSync.existsSync(absPath)) continue; + let bytes = null; + let checksum = null; + let checksumAlgo = null; + try { + const stat = await fs.stat(absPath); + bytes = stat.size; + const result = await checksumFile(absPath); + checksum = result?.value || null; + checksumAlgo = result?.algo || null; + } catch {} + enriched.push({ + ...entry, + bytes, + checksum: checksum && checksumAlgo ? 
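+        // Stored as 'algo:hex' so verifiers never have to guess the digest.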
`${checksumAlgo}:${checksum}` : null + }); + } + const now = new Date().toISOString(); + const manifest = { + version: existing.version || 2, + generatedAt: existing.generatedAt || now, + updatedAt: now, + mode, + stage: existing.stage || 'stage3', + pieces: [...retained, ...enriched] + }; + await fs.mkdir(piecesDir, { recursive: true }); + await writeJsonObjectFile(manifestPath, { fields: manifest, atomic: true }); +}; diff --git a/tools/build-embeddings/run.js b/tools/build-embeddings/run.js new file mode 100644 index 000000000..dd76fdab7 --- /dev/null +++ b/tools/build-embeddings/run.js @@ -0,0 +1,593 @@ +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import { createEmbedder } from '../../src/index/embedding.js'; +import { validateIndexArtifacts } from '../../src/index/validate.js'; +import { markBuildPhase, resolveBuildStatePath, startBuildHeartbeat } from '../../src/index/build/build-state.js'; +import { loadIncrementalManifest } from '../../src/storage/sqlite/incremental.js'; +import { dequantizeUint8ToFloat32 } from '../../src/storage/sqlite/vector.js'; +import { loadChunkMeta, readJsonFile, MAX_JSON_BYTES } from '../../src/shared/artifact-io.js'; +import { readTextFileWithHash } from '../../src/shared/encoding.js'; +import { writeJsonObjectFile } from '../../src/shared/json-stream.js'; +import { resolveHnswPaths } from '../../src/shared/hnsw.js'; +import { resolveOnnxModelPath } from '../../src/shared/onnx-embeddings.js'; +import { getIndexDir, getRepoCacheRoot } from '../dict-utils.js'; +import { buildCacheIdentity, buildCacheKey, isCacheValid, resolveCacheDir, resolveCacheRoot } from './cache.js'; +import { createTempPath, replaceFileNoBak } from './atomic.js'; +import { buildChunkSignature, buildChunksFromBundles } from './chunks.js'; +import { + buildQuantizedVectors, + createDimsValidator, + ensureVectorArrays, + fillMissingVectors, + isDimsMismatch, + runBatched, + validateCachedDims +} from './embed.js'; +import { createHnswBuilder } from './hnsw.js'; +import { updatePieceManifest } from './manifest.js'; +import { updateSqliteDense } from './sqlite-dense.js'; +import { parseBuildEmbeddingsArgs } from './cli.js'; + +let Database = null; +try { + ({ default: Database } = await import('better-sqlite3')); +} catch {} + +const loadIndexState = (statePath) => { + if (!fsSync.existsSync(statePath)) return {}; + try { + return readJsonFile(statePath, { maxBytes: MAX_JSON_BYTES }) || {}; + } catch { + return {}; + } +}; + +const writeIndexState = async (statePath, state) => { + await writeJsonObjectFile(statePath, { fields: state, atomic: true }); +}; + +export async function runBuildEmbeddings(rawArgs = process.argv.slice(2), _options = {}) { + const config = parseBuildEmbeddingsArgs(rawArgs, _options); + const { + argv, + root, + userConfig, + embeddingsConfig, + embeddingProvider, + embeddingOnnx, + hnswConfig, + normalizedEmbeddingMode, + resolvedEmbeddingMode, + useStubEmbeddings, + embeddingBatchSize, + configuredDims, + modelId, + modelsDir, + indexRoot, + modes + } = config; + + if (embeddingsConfig.enabled === false || resolvedEmbeddingMode === 'off') { + console.error('Embeddings disabled; skipping build-embeddings.'); + return { skipped: true }; + } + + const denseScale = 2 / 255; + const cacheDims = useStubEmbeddings ? (configuredDims || 384) : configuredDims; + const cacheOnnx = embeddingProvider === 'onnx' ? 
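+  // ONNX runtime knobs join the cache identity on purpose: changing threads
+  // or execution providers forces a re-embed rather than trusting old vectors.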
{ + modelPath: resolveOnnxModelPath({ + rootDir: root, + modelPath: embeddingOnnx?.modelPath, + modelsDir, + modelId + }), + tokenizerId: embeddingOnnx?.tokenizerId || modelId || null, + executionProviders: embeddingOnnx?.executionProviders || null, + intraOpNumThreads: embeddingOnnx?.intraOpNumThreads || null, + interOpNumThreads: embeddingOnnx?.interOpNumThreads || null, + graphOptimizationLevel: embeddingOnnx?.graphOptimizationLevel || null + } : null; + const { identity: cacheIdentity, key: cacheIdentityKey } = buildCacheIdentity({ + modelId, + provider: embeddingProvider, + mode: resolvedEmbeddingMode, + stub: useStubEmbeddings, + dims: cacheDims, + scale: denseScale, + preprocess: { + pooling: 'mean', + normalize: true, + truncation: true + }, + quantization: { + version: 1, + minVal: -1, + maxVal: 1, + levels: 256 + }, + onnx: cacheOnnx + }); + + const embedder = createEmbedder({ + rootDir: root, + useStubEmbeddings, + modelId, + dims: argv.dims, + modelsDir, + provider: embeddingProvider, + onnx: embeddingOnnx + }); + const getChunkEmbeddings = embedder.getChunkEmbeddings; + + const repoCacheRoot = getRepoCacheRoot(root, userConfig); + const buildStatePath = resolveBuildStatePath(indexRoot); + const hasBuildState = buildStatePath && fsSync.existsSync(buildStatePath); + const stopHeartbeat = hasBuildState ? startBuildHeartbeat(indexRoot, 'stage3') : () => {}; + + const cacheRoot = resolveCacheRoot({ + repoCacheRoot, + cacheDirConfig: embeddingsConfig.cache?.dir + }); + + if (hasBuildState) { + await markBuildPhase(indexRoot, 'stage3', 'running'); + } + + for (const mode of modes) { + if (!['code', 'prose'].includes(mode)) { + console.error(`Invalid mode: ${mode}`); + process.exit(1); + } + const indexDir = getIndexDir(root, mode, userConfig, { indexRoot }); + const statePath = path.join(indexDir, 'index_state.json'); + const stateNow = new Date().toISOString(); + let indexState = loadIndexState(statePath); + indexState.generatedAt = indexState.generatedAt || stateNow; + indexState.updatedAt = stateNow; + indexState.mode = indexState.mode || mode; + indexState.embeddings = { + ...(indexState.embeddings || {}), + enabled: true, + ready: false, + pending: true, + mode: indexState.embeddings?.mode || resolvedEmbeddingMode, + service: indexState.embeddings?.service ?? (normalizedEmbeddingMode === 'service'), + updatedAt: stateNow + }; + try { + await writeIndexState(statePath, indexState); + } catch { + // Ignore index state write failures. 
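+      // Two-phase handshake (shape sketch, not a schema): the write above
+      // records { embeddings: { enabled: true, ready: false, pending: true } }
+      // and the post-build write flips ready/pending, so readers can treat
+      // pending === true as "dense vectors may be missing or stale".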
+ } + + const chunkMetaPath = path.join(indexDir, 'chunk_meta.json'); + const chunkMetaJsonlPath = path.join(indexDir, 'chunk_meta.jsonl'); + const chunkMetaMetaPath = path.join(indexDir, 'chunk_meta.meta.json'); + const incremental = loadIncrementalManifest(repoCacheRoot, mode); + const manifestFiles = incremental?.manifest?.files || {}; + const hasChunkMeta = fsSync.existsSync(chunkMetaPath) + || fsSync.existsSync(chunkMetaJsonlPath) + || fsSync.existsSync(chunkMetaMetaPath); + + let chunkMeta; + try { + if (hasChunkMeta) { + chunkMeta = loadChunkMeta(indexDir, { maxBytes: MAX_JSON_BYTES }); + } + } catch (err) { + if (err?.code === 'ERR_JSON_TOO_LARGE') { + console.warn(`[embeddings] chunk_meta too large for ${mode}; using incremental bundles if available.`); + } else { + console.warn(`[embeddings] Failed to load chunk_meta for ${mode}: ${err?.message || err}`); + } + chunkMeta = null; + } + + let chunksByFile = new Map(); + let totalChunks = 0; + if (Array.isArray(chunkMeta)) { + const fileMetaPath = path.join(indexDir, 'file_meta.json'); + let fileMeta = []; + if (fsSync.existsSync(fileMetaPath)) { + try { + fileMeta = readJsonFile(fileMetaPath, { maxBytes: MAX_JSON_BYTES }); + } catch (err) { + console.warn(`[embeddings] Failed to read file_meta for ${mode}: ${err?.message || err}`); + fileMeta = []; + } + } + const fileMetaById = new Map(); + if (Array.isArray(fileMeta)) { + for (const entry of fileMeta) { + if (!entry || !Number.isFinite(entry.id)) continue; + fileMetaById.set(entry.id, entry); + } + } + for (let i = 0; i < chunkMeta.length; i += 1) { + const chunk = chunkMeta[i]; + if (!chunk) continue; + const filePath = chunk.file || fileMetaById.get(chunk.fileId)?.file; + if (!filePath) continue; + const list = chunksByFile.get(filePath) || []; + list.push({ index: i, chunk }); + chunksByFile.set(filePath, list); + } + totalChunks = chunkMeta.length; + } else { + if (!manifestFiles || !Object.keys(manifestFiles).length) { + console.warn(`[embeddings] Missing chunk_meta and no incremental bundles for ${mode}; skipping.`); + continue; + } + const bundleResult = await buildChunksFromBundles( + incremental.bundleDir, + manifestFiles, + incremental?.manifest?.bundleFormat + ); + chunksByFile = bundleResult.chunksByFile; + totalChunks = bundleResult.totalChunks; + if (!chunksByFile.size || !totalChunks) { + console.warn(`[embeddings] Incremental bundles empty for ${mode}; skipping.`); + continue; + } + console.log(`[embeddings] ${mode}: using incremental bundles (${chunksByFile.size} files).`); + } + + const codeVectors = new Array(totalChunks).fill(null); + const docVectors = new Array(totalChunks).fill(null); + const mergedVectors = new Array(totalChunks).fill(null); + const { indexPath: hnswIndexPath, metaPath: hnswMetaPath } = resolveHnswPaths(indexDir); + const hnswBuilder = createHnswBuilder({ enabled: hnswConfig.enabled, config: hnswConfig, totalChunks, mode }); + + const cacheDir = resolveCacheDir(cacheRoot, mode); + await fs.mkdir(cacheDir, { recursive: true }); + + const dimsValidator = createDimsValidator({ mode, configuredDims }); + const assertDims = dimsValidator.assertDims; + + if (configuredDims) { + try { + const entries = await fs.readdir(cacheDir); + for (const entry of entries) { + if (!entry.endsWith('.json')) continue; + const cached = JSON.parse(await fs.readFile(path.join(cacheDir, entry), 'utf8')); + if (cached.cacheMeta?.identityKey !== cacheIdentityKey) continue; + const expectedDims = configuredDims || cached.cacheMeta?.identity?.dims || null; + 
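+          // Preflight on purpose: surface dims drift across cached entries
+          // before paying for any embedding work.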
validateCachedDims({ vectors: cached.codeVectors, expectedDims, mode }); + validateCachedDims({ vectors: cached.docVectors, expectedDims, mode }); + validateCachedDims({ vectors: cached.mergedVectors, expectedDims, mode }); + } + } catch (err) { + if (isDimsMismatch(err)) throw err; + // Ignore cache preflight errors. + } + } + + let processedFiles = 0; + for (const [relPath, itemsRaw] of chunksByFile.entries()) { + // Ensure stable mapping between chunkSignature, cache vectors, and HNSW insertion. + const items = Array.isArray(itemsRaw) + ? [...itemsRaw].sort((a, b) => (a.index ?? 0) - (b.index ?? 0)) + : []; + const normalizedRel = relPath.replace(/\\/g, '/'); + const chunkSignature = buildChunkSignature(items); + const manifestEntry = manifestFiles[normalizedRel] || null; + const manifestHash = typeof manifestEntry?.hash === 'string' ? manifestEntry.hash : null; + let fileHash = manifestHash; + let cacheKey = buildCacheKey({ + file: normalizedRel, + hash: fileHash, + signature: chunkSignature, + identityKey: cacheIdentityKey + }); + let cachePath = cacheKey ? path.join(cacheDir, `${cacheKey}.json`) : null; + + if (cachePath && fsSync.existsSync(cachePath)) { + try { + const cached = JSON.parse(await fs.readFile(cachePath, 'utf8')); + const cacheIdentityMatches = cached.cacheMeta?.identityKey === cacheIdentityKey; + if (cacheIdentityMatches) { + const expectedDims = configuredDims || cached.cacheMeta?.identity?.dims || null; + validateCachedDims({ vectors: cached.codeVectors, expectedDims, mode }); + validateCachedDims({ vectors: cached.docVectors, expectedDims, mode }); + validateCachedDims({ vectors: cached.mergedVectors, expectedDims, mode }); + } + if (isCacheValid({ cached, signature: chunkSignature, identityKey: cacheIdentityKey })) { + const cachedCode = ensureVectorArrays(cached.codeVectors, items.length); + const cachedDoc = ensureVectorArrays(cached.docVectors, items.length); + const cachedMerged = ensureVectorArrays(cached.mergedVectors, items.length); + for (let i = 0; i < items.length; i += 1) { + const chunkIndex = items[i].index; + const codeVec = cachedCode[i] || []; + const docVec = cachedDoc[i] || []; + const mergedVec = cachedMerged[i] || []; + if (codeVec.length) assertDims(codeVec.length); + if (docVec.length) assertDims(docVec.length); + if (mergedVec.length) assertDims(mergedVec.length); + codeVectors[chunkIndex] = codeVec; + docVectors[chunkIndex] = docVec; + mergedVectors[chunkIndex] = mergedVec; + if (hnswConfig.enabled && mergedVec.length) { + const floatVec = dequantizeUint8ToFloat32(mergedVec); + if (floatVec) hnswBuilder.addVector(chunkIndex, floatVec); + } + } + processedFiles += 1; + continue; + } + } catch (err) { + if (isDimsMismatch(err)) throw err; + // Ignore cache parse errors. + } + } + + const absPath = path.resolve(root, normalizedRel.split('/').join(path.sep)); + let textInfo; + try { + textInfo = await readTextFileWithHash(absPath); + } catch { + console.warn(`[embeddings] Failed to read ${normalizedRel}; skipping.`); + continue; + } + const text = textInfo.text; + if (!fileHash) { + fileHash = textInfo.hash; + cacheKey = buildCacheKey({ + file: normalizedRel, + hash: fileHash, + signature: chunkSignature, + identityKey: cacheIdentityKey + }); + cachePath = cacheKey ? 
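+      // The cache location only becomes derivable once the on-disk hash is
+      // known, so re-check for a hit before embedding.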
path.join(cacheDir, `${cacheKey}.json`) : null; + if (cachePath && fsSync.existsSync(cachePath)) { + try { + const cached = JSON.parse(await fs.readFile(cachePath, 'utf8')); + const cacheIdentityMatches = cached.cacheMeta?.identityKey === cacheIdentityKey; + if (cacheIdentityMatches) { + const expectedDims = configuredDims || cached.cacheMeta?.identity?.dims || null; + validateCachedDims({ vectors: cached.codeVectors, expectedDims, mode }); + validateCachedDims({ vectors: cached.docVectors, expectedDims, mode }); + validateCachedDims({ vectors: cached.mergedVectors, expectedDims, mode }); + } + if (isCacheValid({ cached, signature: chunkSignature, identityKey: cacheIdentityKey })) { + const cachedCode = ensureVectorArrays(cached.codeVectors, items.length); + const cachedDoc = ensureVectorArrays(cached.docVectors, items.length); + const cachedMerged = ensureVectorArrays(cached.mergedVectors, items.length); + for (let i = 0; i < items.length; i += 1) { + const chunkIndex = items[i].index; + const codeVec = cachedCode[i] || []; + const docVec = cachedDoc[i] || []; + const mergedVec = cachedMerged[i] || []; + if (codeVec.length) assertDims(codeVec.length); + if (docVec.length) assertDims(docVec.length); + if (mergedVec.length) assertDims(mergedVec.length); + codeVectors[chunkIndex] = codeVec; + docVectors[chunkIndex] = docVec; + mergedVectors[chunkIndex] = mergedVec; + if (hnswConfig.enabled && mergedVec.length) { + const floatVec = dequantizeUint8ToFloat32(mergedVec); + if (floatVec) hnswBuilder.addVector(chunkIndex, floatVec); + } + } + processedFiles += 1; + continue; + } + } catch (err) { + if (isDimsMismatch(err)) throw err; + // Ignore cache parse errors. + } + } + } + + const codeTexts = []; + const docTexts = []; + for (const { chunk } of items) { + const start = Number(chunk.start) || 0; + const end = Number(chunk.end) || start; + codeTexts.push(text.slice(start, end)); + const docText = typeof chunk.docmeta?.doc === 'string' ? chunk.docmeta.doc : ''; + docTexts.push(docText.trim() ? docText : ''); + } + + let codeEmbeds = await runBatched({ + texts: codeTexts, + batchSize: embeddingBatchSize, + embed: getChunkEmbeddings + }); + codeEmbeds = ensureVectorArrays(codeEmbeds, codeTexts.length); + for (const vec of codeEmbeds) { + if (Array.isArray(vec) && vec.length) assertDims(vec.length); + } + + const docVectorsRaw = new Array(items.length).fill(null); + const docIndexes = []; + const docPayloads = []; + for (let i = 0; i < docTexts.length; i += 1) { + if (docTexts[i]) { + docIndexes.push(i); + docPayloads.push(docTexts[i]); + } + } + if (docPayloads.length) { + const embeddedDocs = await runBatched({ + texts: docPayloads, + batchSize: embeddingBatchSize, + embed: getChunkEmbeddings + }); + for (let i = 0; i < docIndexes.length; i += 1) { + docVectorsRaw[docIndexes[i]] = embeddedDocs[i] || null; + } + } + for (const vec of docVectorsRaw) { + if (Array.isArray(vec) && vec.length) assertDims(vec.length); + } + + const dims = dimsValidator.getDims(); + const zeroVec = dims ? Array.from({ length: dims }, () => 0) : []; + + const cachedCodeVectors = []; + const cachedDocVectors = []; + const cachedMergedVectors = []; + for (let i = 0; i < items.length; i += 1) { + const chunkIndex = items[i].index; + const embedCode = Array.isArray(codeEmbeds[i]) ? codeEmbeds[i] : []; + const embedDoc = Array.isArray(docVectorsRaw[i]) ? 
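+      // Chunks without doc text reuse the zero vector so the merged mean
+      // stays well-defined.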
docVectorsRaw[i] : zeroVec; + const quantized = buildQuantizedVectors({ + chunkIndex, + codeVector: embedCode, + docVector: embedDoc, + zeroVector: zeroVec, + addHnswVector: hnswConfig.enabled ? hnswBuilder.addVector : null + }); + codeVectors[chunkIndex] = quantized.quantizedCode; + docVectors[chunkIndex] = quantized.quantizedDoc; + mergedVectors[chunkIndex] = quantized.quantizedMerged; + cachedCodeVectors.push(quantized.quantizedCode); + cachedDocVectors.push(quantized.quantizedDoc); + cachedMergedVectors.push(quantized.quantizedMerged); + } + + if (cachePath) { + const payload = JSON.stringify({ + key: cacheKey, + file: normalizedRel, + hash: fileHash, + chunkSignature, + cacheMeta: { + identityKey: cacheIdentityKey, + identity: cacheIdentity, + createdAt: new Date().toISOString() + }, + codeVectors: cachedCodeVectors, + docVectors: cachedDocVectors, + mergedVectors: cachedMergedVectors + }); + let tempPath; + try { + tempPath = createTempPath(cachePath); + await fs.writeFile(tempPath, payload); + await replaceFileNoBak(tempPath, cachePath); + } catch { + // Ignore cache write failures. + if (tempPath) { + try { + await fs.rm(tempPath, { force: true }); + } catch {} + } + } + } + + processedFiles += 1; + if (processedFiles % 50 === 0) { + console.log(`[embeddings] ${mode}: processed ${processedFiles}/${chunksByFile.size} files`); + } + } + + const observedDims = dimsValidator.getDims(); + if (configuredDims && observedDims && configuredDims !== observedDims) { + throw new Error( + `[embeddings] ${mode} embedding dims mismatch (configured=${configuredDims}, observed=${observedDims}).` + ); + } + const finalDims = observedDims || configuredDims || 384; + fillMissingVectors(codeVectors, finalDims); + fillMissingVectors(docVectors, finalDims); + fillMissingVectors(mergedVectors, finalDims); + + await writeJsonObjectFile(path.join(indexDir, 'dense_vectors_uint8.json'), { + fields: { model: modelId, dims: finalDims, scale: denseScale }, + arrays: { vectors: mergedVectors }, + atomic: true + }); + await writeJsonObjectFile(path.join(indexDir, 'dense_vectors_doc_uint8.json'), { + fields: { model: modelId, dims: finalDims, scale: denseScale }, + arrays: { vectors: docVectors }, + atomic: true + }); + await writeJsonObjectFile(path.join(indexDir, 'dense_vectors_code_uint8.json'), { + fields: { model: modelId, dims: finalDims, scale: denseScale }, + arrays: { vectors: codeVectors }, + atomic: true + }); + + if (hnswConfig.enabled) { + try { + const result = await hnswBuilder.writeIndex({ + indexPath: hnswIndexPath, + metaPath: hnswMetaPath, + modelId, + dims: finalDims + }); + if (!result.skipped) { + console.log(`[embeddings] ${mode}: wrote HNSW index (${result.count} vectors).`); + } + } catch (err) { + console.warn(`[embeddings] ${mode}: failed to write HNSW index: ${err?.message || err}`); + } + } + + const now = new Date().toISOString(); + indexState.generatedAt = indexState.generatedAt || now; + indexState.updatedAt = now; + indexState.mode = indexState.mode || mode; + indexState.embeddings = { + ...(indexState.embeddings || {}), + enabled: true, + ready: true, + pending: false, + mode: indexState.embeddings?.mode || resolvedEmbeddingMode, + service: indexState.embeddings?.service ?? 
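+      // Preserve a service flag recorded by an earlier run; derive it from
+      // config only on first write.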
(normalizedEmbeddingMode === 'service'), + updatedAt: now + }; + if (indexState.enrichment && indexState.enrichment.enabled) { + indexState.enrichment = { + ...indexState.enrichment, + pending: false, + stage: indexState.enrichment.stage || indexState.stage || 'stage2' + }; + } + try { + await writeIndexState(statePath, indexState); + } catch { + // Ignore index state write failures. + } + + try { + await updatePieceManifest({ indexDir, mode, totalChunks, dims: finalDims }); + } catch { + // Ignore piece manifest write failures. + } + + updateSqliteDense({ + Database, + root, + userConfig, + indexRoot, + mode, + vectors: mergedVectors, + dims: finalDims, + scale: denseScale, + modelId, + emitOutput: true + }); + + const validation = await validateIndexArtifacts({ + root, + indexRoot, + modes: [mode], + userConfig, + sqliteEnabled: false + }); + if (!validation.ok) { + throw new Error(`[embeddings] ${mode} index validation failed; see index-validate output for details.`); + } + + console.log(`[embeddings] ${mode}: wrote ${totalChunks} vectors (dims=${finalDims}).`); + } + + if (hasBuildState) { + await markBuildPhase(indexRoot, 'stage3', 'done'); + } + stopHeartbeat(); + return { modes }; +} diff --git a/tools/build-embeddings/sqlite-dense.js b/tools/build-embeddings/sqlite-dense.js new file mode 100644 index 000000000..43fe8d65d --- /dev/null +++ b/tools/build-embeddings/sqlite-dense.js @@ -0,0 +1,133 @@ +import fsSync from 'node:fs'; +import path from 'node:path'; +import { + encodeVector, + ensureVectorTable, + getVectorExtensionConfig, + hasVectorTable, + loadVectorExtension +} from '../vector-extension.js'; +import { resolveSqlitePaths } from '../dict-utils.js'; +import { dequantizeUint8ToFloat32, packUint8, toVectorId } from '../../src/storage/sqlite/vector.js'; + +const hasTable = (db, table) => { + try { + const row = db.prepare( + "SELECT name FROM sqlite_master WHERE type='table' AND name = ?" + ).get(table); + return !!row; + } catch { + return false; + } +}; + +export const updateSqliteDense = ({ + Database, + root, + userConfig, + indexRoot, + mode, + vectors, + dims, + scale, + modelId, + dbPath, + emitOutput = true +}) => { + if (userConfig?.sqlite?.use === false) { + return { skipped: true, reason: 'sqlite disabled' }; + } + if (!Database) { + if (emitOutput) { + console.warn(`[embeddings] better-sqlite3 not available; skipping SQLite update for ${mode}.`); + } + return { skipped: true, reason: 'sqlite unavailable' }; + } + const resolvedDbPath = dbPath || (() => { + const sqlitePaths = resolveSqlitePaths(root, userConfig, indexRoot ? { indexRoot } : {}); + return mode === 'code' ? 
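+    // Fall back to the per-mode database path resolved from repo config.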
sqlitePaths.codePath : sqlitePaths.prosePath; + })(); + if (!resolvedDbPath || !fsSync.existsSync(resolvedDbPath)) { + if (emitOutput) { + console.warn(`[embeddings] SQLite ${mode} index missing; skipping.`); + } + return { skipped: true, reason: 'sqlite missing' }; + } + + const db = new Database(resolvedDbPath); + try { + if (!hasTable(db, 'dense_vectors') || !hasTable(db, 'dense_meta')) { + if (emitOutput) { + console.warn(`[embeddings] SQLite ${mode} index missing dense tables; skipping.`); + } + return { skipped: true, reason: 'missing dense tables' }; + } + try { + db.pragma('journal_mode = WAL'); + db.pragma('synchronous = NORMAL'); + } catch {} + + const vectorExtension = getVectorExtensionConfig(root, userConfig); + let vectorAnnReady = false; + let vectorAnnTable = vectorExtension.table || 'dense_vectors_ann'; + let vectorAnnColumn = vectorExtension.column || 'embedding'; + let insertVectorAnn = null; + if (vectorExtension.enabled) { + const loadResult = loadVectorExtension(db, vectorExtension, `embeddings ${mode}`); + if (loadResult.ok) { + if (hasVectorTable(db, vectorAnnTable)) { + vectorAnnReady = true; + } else { + const created = ensureVectorTable(db, vectorExtension, dims); + if (created.ok) { + vectorAnnReady = true; + vectorAnnTable = created.tableName; + vectorAnnColumn = created.column; + } else if (emitOutput) { + console.warn(`[embeddings] Failed to create vector table for ${mode}: ${created.reason}`); + } + } + if (vectorAnnReady) { + insertVectorAnn = db.prepare( + `INSERT OR REPLACE INTO ${vectorAnnTable} (rowid, ${vectorAnnColumn}) VALUES (?, ?)` + ); + } + } else if (emitOutput) { + console.warn(`[embeddings] Vector extension unavailable for ${mode}: ${loadResult.reason}`); + } + } + + const deleteDense = db.prepare('DELETE FROM dense_vectors WHERE mode = ?'); + const deleteMeta = db.prepare('DELETE FROM dense_meta WHERE mode = ?'); + const insertDense = db.prepare( + 'INSERT OR REPLACE INTO dense_vectors (mode, doc_id, vector) VALUES (?, ?, ?)' + ); + const insertMeta = db.prepare( + 'INSERT OR REPLACE INTO dense_meta (mode, dims, scale, model) VALUES (?, ?, ?, ?)' + ); + const run = db.transaction(() => { + deleteDense.run(mode); + deleteMeta.run(mode); + if (vectorAnnReady) { + db.exec(`DELETE FROM ${vectorAnnTable}`); + } + insertMeta.run(mode, dims, scale, modelId || null); + for (let docId = 0; docId < vectors.length; docId += 1) { + const vec = vectors[docId]; + insertDense.run(mode, docId, packUint8(vec)); + if (vectorAnnReady && insertVectorAnn) { + const floatVec = dequantizeUint8ToFloat32(vec); + const encoded = encodeVector(floatVec, vectorExtension); + if (encoded) insertVectorAnn.run(toVectorId(docId), encoded); + } + } + }); + run(); + if (emitOutput) { + console.log(`[embeddings] ${mode}: SQLite dense vectors updated (${resolvedDbPath}).`); + } + return { skipped: false, count: vectors.length }; + } finally { + db.close(); + } +}; diff --git a/tools/build-lmdb-index.js b/tools/build-lmdb-index.js new file mode 100644 index 000000000..73904c967 --- /dev/null +++ b/tools/build-lmdb-index.js @@ -0,0 +1,270 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { loadChunkMeta, loadTokenPostings, readJsonFile, MAX_JSON_BYTES } from '../src/shared/artifact-io.js'; +import { writeJsonObjectFile } from '../src/shared/json-stream.js'; +import { checksumFile } from '../src/shared/hash.js'; +import { LMDB_ARTIFACT_KEYS, 
LMDB_META_KEYS, LMDB_SCHEMA_VERSION } from '../src/storage/lmdb/schema.js'; +import { getIndexDir, getMetricsDir, loadUserConfig, resolveIndexRoot, resolveLmdbPaths, resolveRepoRoot } from './dict-utils.js'; +import { Packr } from 'msgpackr'; + +let open = null; +try { + ({ open } = await import('lmdb')); +} catch {} + +const argv = createCli({ + scriptName: 'build-lmdb-index', + options: { + mode: { type: 'string', default: 'all' }, + repo: { type: 'string' }, + 'index-root': { type: 'string' } + } +}).parse(); + +if (!open) { + console.error('lmdb is required. Run npm install first.'); + process.exit(1); +} + +const rootArg = argv.repo ? path.resolve(argv.repo) : null; +const root = rootArg || resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(root); +const indexRoot = argv['index-root'] + ? path.resolve(argv['index-root']) + : resolveIndexRoot(root, userConfig); +const lmdbPaths = resolveLmdbPaths(root, userConfig, { indexRoot }); +const metricsDir = getMetricsDir(root, userConfig); + +const readJsonOptional = (filePath) => { + if (!filePath || !fsSync.existsSync(filePath)) return null; + return readJsonFile(filePath, { maxBytes: MAX_JSON_BYTES }); +}; + +const sumDocLengths = (docLengths) => { + if (!Array.isArray(docLengths)) return null; + let total = 0; + for (const entry of docLengths) { + const value = Number(entry); + if (Number.isFinite(value)) total += value; + } + return total; +}; + +const updateIndexStateManifest = async (indexDir) => { + const manifestPath = path.join(indexDir, 'pieces', 'manifest.json'); + if (!fsSync.existsSync(manifestPath)) return; + let manifest = null; + try { + manifest = readJsonFile(manifestPath) || null; + } catch { + return; + } + if (!manifest || !Array.isArray(manifest.pieces)) return; + const statePath = path.join(indexDir, 'index_state.json'); + if (!fsSync.existsSync(statePath)) return; + let bytes = null; + let checksum = null; + let checksumAlgo = null; + try { + const stat = await fs.stat(statePath); + bytes = stat.size; + const result = await checksumFile(statePath); + checksum = result?.value || null; + checksumAlgo = result?.algo || null; + } catch {} + if (!bytes || !checksum) return; + const pieces = manifest.pieces.map((piece) => { + if (piece?.name !== 'index_state' || piece?.path !== 'index_state.json') { + return piece; + } + return { + ...piece, + bytes, + checksum: checksum && checksumAlgo ? `${checksumAlgo}:${checksum}` : piece.checksum + }; + }); + const next = { + ...manifest, + updatedAt: new Date().toISOString(), + pieces + }; + try { + await writeJsonObjectFile(manifestPath, { fields: next, atomic: true }); + } catch { + // Ignore manifest write failures. + } +}; + +const updateLmdbState = async (indexDir, patch) => { + if (!indexDir) return null; + const statePath = path.join(indexDir, 'index_state.json'); + let state = {}; + if (fsSync.existsSync(statePath)) { + try { + state = readJsonFile(statePath, { maxBytes: MAX_JSON_BYTES }) || {}; + } catch { + state = {}; + } + } + const now = new Date().toISOString(); + state.generatedAt = state.generatedAt || now; + state.updatedAt = now; + state.lmdb = { + ...(state.lmdb || {}), + ...patch, + updatedAt: now + }; + try { + await writeJsonObjectFile(statePath, { fields: state, atomic: true }); + } catch { + // Ignore index state write failures. + } + await updateIndexStateManifest(indexDir); + return state; +}; + +const buildModeRaw = String(argv.mode || 'all').trim().toLowerCase(); +const buildMode = buildModeRaw === 'both' ? 
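+// Accept the legacy '--mode both' spelling as an alias for 'all'.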
'all' : buildModeRaw; +const modes = buildMode === 'all' ? ['code', 'prose'] : [buildMode]; + +const packr = new Packr(); + +const storeValue = (db, key, value) => { + if (value == null) return false; + db.putSync(key, packr.pack(value)); + return true; +}; + +const storeArtifacts = (db, meta, artifacts) => { + db.clearSync(); + db.transactionSync(() => { + storeValue(db, LMDB_META_KEYS.schemaVersion, LMDB_SCHEMA_VERSION); + storeValue(db, LMDB_META_KEYS.createdAt, meta.createdAt); + storeValue(db, LMDB_META_KEYS.mode, meta.mode); + storeValue(db, LMDB_META_KEYS.sourceIndex, meta.sourceIndex); + storeValue(db, LMDB_META_KEYS.chunkCount, meta.chunkCount); + storeValue(db, LMDB_META_KEYS.artifacts, meta.artifacts); + for (const [key, value] of Object.entries(artifacts)) { + storeValue(db, key, value); + } + }); +}; + +const loadArtifactsForMode = (indexDir, mode) => { + const chunkMeta = loadChunkMeta(indexDir, { maxBytes: MAX_JSON_BYTES }); + const tokenPostings = loadTokenPostings(indexDir, { maxBytes: MAX_JSON_BYTES }); + const fileMeta = readJsonOptional(path.join(indexDir, 'file_meta.json')); + const fileRelations = readJsonOptional(path.join(indexDir, 'file_relations.json')); + const repoMap = readJsonOptional(path.join(indexDir, 'repo_map.json')); + const filterIndex = readJsonOptional(path.join(indexDir, 'filter_index.json')); + const fieldPostings = readJsonOptional(path.join(indexDir, 'field_postings.json')); + const fieldTokens = readJsonOptional(path.join(indexDir, 'field_tokens.json')); + const phraseNgrams = readJsonOptional(path.join(indexDir, 'phrase_ngrams.json')); + const chargramPostings = readJsonOptional(path.join(indexDir, 'chargram_postings.json')); + const minhashSignatures = readJsonOptional(path.join(indexDir, 'minhash_signatures.json')); + const denseVectors = readJsonOptional(path.join(indexDir, 'dense_vectors_uint8.json')); + const denseVectorsDoc = readJsonOptional(path.join(indexDir, 'dense_vectors_doc_uint8.json')); + const denseVectorsCode = readJsonOptional(path.join(indexDir, 'dense_vectors_code_uint8.json')); + const denseHnswMeta = readJsonOptional(path.join(indexDir, 'dense_vectors_hnsw.meta.json')); + const indexState = readJsonOptional(path.join(indexDir, 'index_state.json')); + const artifacts = { + [LMDB_ARTIFACT_KEYS.chunkMeta]: chunkMeta, + [LMDB_ARTIFACT_KEYS.tokenPostings]: tokenPostings, + [LMDB_ARTIFACT_KEYS.fileMeta]: fileMeta, + [LMDB_ARTIFACT_KEYS.fileRelations]: fileRelations, + [LMDB_ARTIFACT_KEYS.repoMap]: repoMap, + [LMDB_ARTIFACT_KEYS.filterIndex]: filterIndex, + [LMDB_ARTIFACT_KEYS.fieldPostings]: fieldPostings, + [LMDB_ARTIFACT_KEYS.fieldTokens]: fieldTokens, + [LMDB_ARTIFACT_KEYS.phraseNgrams]: phraseNgrams, + [LMDB_ARTIFACT_KEYS.chargramPostings]: chargramPostings, + [LMDB_ARTIFACT_KEYS.minhashSignatures]: minhashSignatures, + [LMDB_ARTIFACT_KEYS.denseVectors]: denseVectors, + [LMDB_ARTIFACT_KEYS.denseVectorsDoc]: denseVectorsDoc, + [LMDB_ARTIFACT_KEYS.denseVectorsCode]: denseVectorsCode, + [LMDB_ARTIFACT_KEYS.denseHnswMeta]: denseHnswMeta, + [LMDB_ARTIFACT_KEYS.indexState]: indexState + }; + const artifactKeys = Object.entries(artifacts) + .filter(([, value]) => value != null) + .map(([key]) => key); + const meta = { + createdAt: new Date().toISOString(), + mode, + sourceIndex: indexDir, + chunkCount: Array.isArray(chunkMeta) ? chunkMeta.length : 0, + artifacts: artifactKeys + }; + const stats = { + chunkCount: meta.chunkCount, + fileCount: Array.isArray(fileMeta) ? 
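+    // Best-effort stats: absent artifacts report null rather than zero.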
fileMeta.length : null, + tokenCount: sumDocLengths(tokenPostings?.docLengths) + }; + return { meta, artifacts, stats }; +}; + +for (const mode of modes) { + if (!['code', 'prose'].includes(mode)) { + console.error(`Invalid mode: ${mode}`); + process.exit(1); + } + const indexDir = getIndexDir(root, mode, userConfig, { indexRoot }); + const targetPath = mode === 'code' ? lmdbPaths.codePath : lmdbPaths.prosePath; + const buildStart = Date.now(); + await fs.mkdir(targetPath, { recursive: true }); + await updateLmdbState(indexDir, { + enabled: true, + ready: false, + pending: true, + schemaVersion: LMDB_SCHEMA_VERSION + }); + + const readStart = Date.now(); + const { meta, artifacts, stats } = loadArtifactsForMode(indexDir, mode); + const readMs = Date.now() - readStart; + const writeStart = Date.now(); + const db = open({ path: targetPath, readOnly: false }); + storeArtifacts(db, meta, artifacts); + db.close(); + const writeMs = Date.now() - writeStart; + + const finalState = await updateLmdbState(indexDir, { + enabled: true, + ready: true, + pending: false, + schemaVersion: LMDB_SCHEMA_VERSION, + path: targetPath + }); + const finalDb = open({ path: targetPath, readOnly: false }); + storeValue(finalDb, LMDB_ARTIFACT_KEYS.indexState, finalState); + finalDb.close(); + + const totalMs = Date.now() - buildStart; + const metrics = { + generatedAt: new Date().toISOString(), + mode, + sourceIndex: meta.sourceIndex, + artifacts: meta.artifacts, + files: { candidates: stats.fileCount }, + chunks: { total: stats.chunkCount }, + tokens: { total: stats.tokenCount }, + lmdb: { path: targetPath }, + timings: { + totalMs, + readMs, + writeMs + } + }; + try { + await fs.mkdir(metricsDir, { recursive: true }); + await writeJsonObjectFile( + path.join(metricsDir, `lmdb-${mode}.json`), + { fields: metrics, atomic: true } + ); + } catch {} + + console.log(`[lmdb] ${mode} index built at ${targetPath}.`); +} diff --git a/tools/build-sqlite-index.js b/tools/build-sqlite-index.js index 757a03711..f98ee163c 100644 --- a/tools/build-sqlite-index.js +++ b/tools/build-sqlite-index.js @@ -1,1001 +1,12 @@ #!/usr/bin/env node -import fs from 'node:fs/promises'; -import fsSync from 'node:fs'; -import path from 'node:path'; -import minimist from 'minimist'; -import { getIndexDir, getModelConfig, getRepoCacheRoot, loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; -import { encodeVector, ensureVectorTable, getVectorExtensionConfig, hasVectorTable, loadVectorExtension } from './vector-extension.js'; -import { compactDatabase } from './compact-sqlite-index.js'; -import { CREATE_TABLES_SQL, REQUIRED_TABLES, SCHEMA_VERSION } from '../src/sqlite/schema.js'; -import { buildChunkRow, buildTokenFrequency, prepareVectorAnnTable } from '../src/sqlite/build-helpers.js'; -import { loadIncrementalManifest } from '../src/sqlite/incremental.js'; -import { chunkArray, hasRequiredTables, loadIndex, normalizeFilePath, readJson } from '../src/sqlite/utils.js'; -import { dequantizeUint8ToFloat32, packUint32, packUint8, quantizeVec, toVectorId } from '../src/sqlite/vector.js'; +import { fileURLToPath } from 'node:url'; +import { runBuildSqliteIndex } from './build-sqlite-index/run.js'; -let Database; -try { - ({ default: Database } = await import('better-sqlite3')); -} catch (err) { - console.error('better-sqlite3 is required. 
Run npm install first.'); - process.exit(1); -} - -const argv = minimist(process.argv.slice(2), { - string: ['code-dir', 'prose-dir', 'out', 'mode', 'repo'], - boolean: ['incremental', 'compact'], - default: { mode: 'all', incremental: false, compact: false } -}); - -const rootArg = argv.repo ? path.resolve(argv.repo) : null; -const root = rootArg || resolveRepoRoot(process.cwd()); -const userConfig = loadUserConfig(root); -const modelConfig = getModelConfig(root, userConfig); -const vectorExtension = getVectorExtensionConfig(root, userConfig); -const vectorAnnEnabled = vectorExtension.enabled; -const vectorConfig = { - enabled: vectorAnnEnabled, - extension: vectorExtension, - loadVectorExtension, - ensureVectorTable -}; -const repoCacheRoot = getRepoCacheRoot(root, userConfig); -const compactFlag = argv.compact; -const compactOnIncremental = compactFlag === true - || (compactFlag !== false && userConfig?.sqlite?.compactOnIncremental === true); -const codeDir = argv['code-dir'] ? path.resolve(argv['code-dir']) : getIndexDir(root, 'code', userConfig); -const proseDir = argv['prose-dir'] ? path.resolve(argv['prose-dir']) : getIndexDir(root, 'prose', userConfig); -const sqlitePaths = resolveSqlitePaths(root, userConfig); -const incrementalRequested = argv.incremental === true; - -const modeArg = (argv.mode || 'all').toLowerCase(); -if (!['all', 'code', 'prose'].includes(modeArg)) { - console.error('Invalid mode. Use --mode all|code|prose'); - process.exit(1); -} - -const outArg = argv.out ? path.resolve(argv.out) : null; -let outPath = null; -let codeOutPath = sqlitePaths.codePath; -let proseOutPath = sqlitePaths.prosePath; -if (outArg) { - if (modeArg === 'all') { - const outDir = outArg.endsWith('.db') ? path.dirname(outArg) : outArg; - codeOutPath = path.join(outDir, 'index-code.db'); - proseOutPath = path.join(outDir, 'index-prose.db'); - } else { - const targetName = modeArg === 'code' ? 'index-code.db' : 'index-prose.db'; - outPath = outArg.endsWith('.db') ? outArg : path.join(outArg, targetName); - } -} -if (!outPath && modeArg !== 'all') { - outPath = modeArg === 'code' ? codeOutPath : proseOutPath; -} - -if (modeArg === 'all') { - await fs.mkdir(path.dirname(codeOutPath), { recursive: true }); - await fs.mkdir(path.dirname(proseOutPath), { recursive: true }); -} else if (outPath) { - await fs.mkdir(path.dirname(outPath), { recursive: true }); -} - - - -const codeIndex = loadIndex(codeDir, modelConfig.id); -const proseIndex = loadIndex(proseDir, modelConfig.id); -const incrementalCode = loadIncrementalManifest(repoCacheRoot, 'code'); -const incrementalProse = loadIncrementalManifest(repoCacheRoot, 'prose'); -if (!codeIndex && !proseIndex) { - console.error('No index found. 
Build index-code/index-prose first.'); - process.exit(1); -} - -if (sqlitePaths.legacyExists) { - try { - await fs.rm(sqlitePaths.legacyPath, { force: true }); - console.warn(`Removed legacy SQLite index at ${sqlitePaths.legacyPath}`); - } catch (err) { - console.warn(`Failed to remove legacy SQLite index at ${sqlitePaths.legacyPath}: ${err?.message || err}`); - } -} - -const canIncrementalCode = incrementalRequested && incrementalCode?.manifest; -const canIncrementalProse = incrementalRequested && incrementalProse?.manifest; -if (modeArg === 'code' && !codeIndex && !canIncrementalCode) { - console.error('Code index missing; build index-code first.'); - process.exit(1); -} -if (modeArg === 'prose' && !proseIndex && !canIncrementalProse) { - console.error('Prose index missing; build index-prose first.'); - process.exit(1); -} - - -/** - * Build a full SQLite index from file-backed artifacts. - * @param {string} outPath - * @param {object} index - * @param {'code'|'prose'} mode - * @param {object|null} manifestFiles - * @returns {number} - */ -function buildDatabase(outPath, index, mode, manifestFiles) { - if (!index) return 0; - const db = new Database(outPath); - try { - db.pragma('journal_mode = WAL'); - db.pragma('synchronous = NORMAL'); - } catch {} - - db.exec(CREATE_TABLES_SQL); - db.pragma(`user_version = ${SCHEMA_VERSION}`); - const vectorAnn = prepareVectorAnnTable({ db, indexData: index, mode, vectorConfig }); - - const insertChunk = db.prepare(` - INSERT OR REPLACE INTO chunks ( - id, mode, file, start, end, startLine, endLine, ext, kind, name, headline, - preContext, postContext, weight, tokens, ngrams, codeRelations, docmeta, - stats, complexity, lint, externalDocs, last_modified, last_author, churn, - chunk_authors - ) VALUES ( - @id, @mode, @file, @start, @end, @startLine, @endLine, @ext, @kind, @name, @headline, - @preContext, @postContext, @weight, @tokens, @ngrams, @codeRelations, @docmeta, - @stats, @complexity, @lint, @externalDocs, @last_modified, @last_author, @churn, - @chunk_authors - ); - `); - - const insertFts = db.prepare(` - INSERT OR REPLACE INTO chunks_fts (rowid, mode, file, name, kind, headline, tokens) - VALUES (@id, @mode, @file, @name, @kind, @headline, @tokensText); - `); - - const insertTokenVocab = db.prepare( - 'INSERT OR REPLACE INTO token_vocab (mode, token_id, token) VALUES (?, ?, ?)' - ); - const insertTokenPosting = db.prepare( - 'INSERT OR REPLACE INTO token_postings (mode, token_id, doc_id, tf) VALUES (?, ?, ?, ?)' - ); - const insertDocLength = db.prepare( - 'INSERT OR REPLACE INTO doc_lengths (mode, doc_id, len) VALUES (?, ?, ?)' - ); - const insertTokenStats = db.prepare( - 'INSERT OR REPLACE INTO token_stats (mode, avg_doc_len, total_docs) VALUES (?, ?, ?)' - ); - const insertPhraseVocab = db.prepare( - 'INSERT OR REPLACE INTO phrase_vocab (mode, phrase_id, ngram) VALUES (?, ?, ?)' - ); - const insertPhrasePosting = db.prepare( - 'INSERT OR REPLACE INTO phrase_postings (mode, phrase_id, doc_id) VALUES (?, ?, ?)' - ); - const insertChargramVocab = db.prepare( - 'INSERT OR REPLACE INTO chargram_vocab (mode, gram_id, gram) VALUES (?, ?, ?)' - ); - const insertChargramPosting = db.prepare( - 'INSERT OR REPLACE INTO chargram_postings (mode, gram_id, doc_id) VALUES (?, ?, ?)' - ); - const insertMinhash = db.prepare( - 'INSERT OR REPLACE INTO minhash_signatures (mode, doc_id, sig) VALUES (?, ?, ?)' - ); - const insertDense = db.prepare( - 'INSERT OR REPLACE INTO dense_vectors (mode, doc_id, vector) VALUES (?, ?, ?)' - ); - const insertDenseMeta = 
db.prepare( - 'INSERT OR REPLACE INTO dense_meta (mode, dims, scale, model) VALUES (?, ?, ?, ?)' - ); - const insertFileManifest = db.prepare( - 'INSERT OR REPLACE INTO file_manifest (mode, file, hash, mtimeMs, size, chunk_count) VALUES (?, ?, ?, ?, ?, ?)' - ); - - /** - * Ingest token postings into SQLite. - * @param {object} tokenIndex - * @param {'code'|'prose'} targetMode - */ - function ingestTokenIndex(tokenIndex, targetMode) { - if (!tokenIndex?.vocab || !tokenIndex?.postings) return; - const vocab = tokenIndex.vocab; - const postings = tokenIndex.postings; - const docLengths = Array.isArray(tokenIndex.docLengths) ? tokenIndex.docLengths : []; - const avgDocLen = typeof tokenIndex.avgDocLen === 'number' ? tokenIndex.avgDocLen : null; - const totalDocs = typeof tokenIndex.totalDocs === 'number' ? tokenIndex.totalDocs : docLengths.length; - - const insertVocabTx = db.transaction(() => { - for (let i = 0; i < vocab.length; i++) { - insertTokenVocab.run(targetMode, i, vocab[i]); - } - }); - insertVocabTx(); - - const insertPostingsTx = db.transaction(() => { - for (let tokenId = 0; tokenId < postings.length; tokenId++) { - const posting = postings[tokenId] || []; - for (const entry of posting) { - if (!entry) continue; - const docId = entry[0]; - const tf = entry[1]; - insertTokenPosting.run(targetMode, tokenId, docId, tf); - } - } - }); - insertPostingsTx(); - - const insertLengthsTx = db.transaction(() => { - for (let docId = 0; docId < docLengths.length; docId++) { - insertDocLength.run(targetMode, docId, docLengths[docId]); - } - }); - insertLengthsTx(); - - insertTokenStats.run(targetMode, avgDocLen, totalDocs); - } - - /** - * Ingest a generic postings index (phrase/chargram). - * @param {object} indexData - * @param {'code'|'prose'} targetMode - * @param {import('better-sqlite3').Statement} insertVocabStmt - * @param {import('better-sqlite3').Statement} insertPostingStmt - */ - function ingestPostingIndex(indexData, targetMode, insertVocabStmt, insertPostingStmt) { - if (!indexData?.vocab || !indexData?.postings) return; - const vocab = indexData.vocab; - const postings = indexData.postings; - - const insertVocabTx = db.transaction(() => { - for (let i = 0; i < vocab.length; i++) { - insertVocabStmt.run(targetMode, i, vocab[i]); - } - }); - insertVocabTx(); - - const insertPostingsTx = db.transaction(() => { - for (let tokenId = 0; tokenId < postings.length; tokenId++) { - const posting = postings[tokenId] || []; - for (const docId of posting) { - insertPostingStmt.run(targetMode, tokenId, docId); - } - } - }); - insertPostingsTx(); - } - - /** - * Ingest minhash signatures into SQLite. - * @param {object} minhash - * @param {'code'|'prose'} targetMode - */ - function ingestMinhash(minhash, targetMode) { - if (!minhash?.signatures || !minhash.signatures.length) return; - const insertTx = db.transaction(() => { - for (let docId = 0; docId < minhash.signatures.length; docId++) { - const sig = minhash.signatures[docId]; - if (!sig) continue; - insertMinhash.run(targetMode, docId, packUint32(sig)); - } - }); - insertTx(); - } - - /** - * Ingest dense vectors into SQLite. - * @param {object} dense - * @param {'code'|'prose'} targetMode - */ - function ingestDense(dense, targetMode) { - if (!dense?.vectors || !dense.vectors.length) return; - insertDenseMeta.run( - targetMode, - dense.dims || null, - typeof dense.scale === 'number' ? 
dense.scale : 1.0, - dense.model || modelConfig.id || null - ); - const insertTx = db.transaction(() => { - for (let docId = 0; docId < dense.vectors.length; docId++) { - const vec = dense.vectors[docId]; - if (!vec) continue; - insertDense.run(targetMode, docId, packUint8(vec)); - if (vectorAnn?.insert) { - const floatVec = dequantizeUint8ToFloat32(vec); - const encoded = encodeVector(floatVec, vectorExtension); - if (encoded) vectorAnn.insert.run(toVectorId(docId), encoded); - } - } - }); - insertTx(); - } - - /** - * Ingest all index components for a mode. - * @param {object} indexData - * @param {'code'|'prose'} targetMode - */ - function ingestIndex(indexData, targetMode) { - if (!indexData) return 0; - const { chunkMeta } = indexData; - let count = 0; - - const insert = db.transaction((rows) => { - for (const row of rows) { - insertChunk.run(row); - insertFts.run(row); - } - }); - - const rows = []; - for (const chunk of chunkMeta) { - const id = chunk.id; - const tokensArray = Array.isArray(chunk.tokens) ? chunk.tokens : []; - const tokensText = tokensArray.join(' '); - rows.push({ - id, - mode: targetMode, - file: normalizeFilePath(chunk.file), - start: chunk.start, - end: chunk.end, - startLine: chunk.startLine || null, - endLine: chunk.endLine || null, - ext: chunk.ext || null, - kind: chunk.kind || null, - name: chunk.name || null, - headline: chunk.headline || null, - preContext: chunk.preContext ? JSON.stringify(chunk.preContext) : null, - postContext: chunk.postContext ? JSON.stringify(chunk.postContext) : null, - weight: typeof chunk.weight === 'number' ? chunk.weight : 1, - tokens: tokensArray.length ? JSON.stringify(tokensArray) : null, - tokensText, - ngrams: chunk.ngrams ? JSON.stringify(chunk.ngrams) : null, - codeRelations: chunk.codeRelations ? JSON.stringify(chunk.codeRelations) : null, - docmeta: chunk.docmeta ? JSON.stringify(chunk.docmeta) : null, - stats: chunk.stats ? JSON.stringify(chunk.stats) : null, - complexity: chunk.complexity ? JSON.stringify(chunk.complexity) : null, - lint: chunk.lint ? JSON.stringify(chunk.lint) : null, - externalDocs: chunk.externalDocs ? JSON.stringify(chunk.externalDocs) : null, - last_modified: chunk.last_modified || null, - last_author: chunk.last_author || null, - churn: typeof chunk.churn === 'number' ? chunk.churn : null, - chunk_authors: chunk.chunk_authors ? JSON.stringify(chunk.chunk_authors) : null - }); - count++; - } - - insert(rows); - ingestTokenIndex(indexData.tokenPostings, targetMode); - ingestPostingIndex(indexData.phraseNgrams, targetMode, insertPhraseVocab, insertPhrasePosting); - ingestPostingIndex(indexData.chargrams, targetMode, insertChargramVocab, insertChargramPosting); - ingestMinhash(indexData.minhash, targetMode); - ingestDense(indexData.denseVec, targetMode); - - return count; - } - - /** - * Ingest file manifest metadata if available. - * @param {object} indexData - * @param {'code'|'prose'} targetMode - */ - function ingestFileManifest(indexData, targetMode) { - if (!indexData?.chunkMeta) return; - const fileCounts = new Map(); - for (const chunk of indexData.chunkMeta) { - if (!chunk?.file) continue; - const normalizedFile = normalizeFilePath(chunk.file); - fileCounts.set(normalizedFile, (fileCounts.get(normalizedFile) || 0) + 1); - } - const insertTx = db.transaction(() => { - for (const [file, count] of fileCounts.entries()) { - const entry = manifestFiles && manifestFiles[file] ? 
manifestFiles[file] : null; - insertFileManifest.run( - targetMode, - file, - entry?.hash || null, - Number.isFinite(entry?.mtimeMs) ? entry.mtimeMs : null, - Number.isFinite(entry?.size) ? entry.size : null, - count - ); - } - }); - insertTx(); - } - - const count = ingestIndex(index, mode); - ingestFileManifest(index, mode); - db.close(); - return count; -} - -/** - * Read the SQLite schema version. - * @param {import('better-sqlite3').Database} db - * @returns {number|null} - */ -function getSchemaVersion(db) { - try { - const value = db.pragma('user_version', { simple: true }); - return Number.isFinite(value) ? value : null; - } catch { - return null; - } -} - -/** - * Load file manifest entries from SQLite. - * @param {import('better-sqlite3').Database} db - * @param {'code'|'prose'} mode - * @returns {object} - */ -function getFileManifest(db, mode) { - const rows = db.prepare('SELECT file, hash, mtimeMs, size FROM file_manifest WHERE mode = ?').all(mode); - const map = new Map(); - for (const row of rows) { - map.set(row.file, row); - } - return map; -} - -/** - * Check if a manifest entry matches the DB entry. - * @param {object} entry - * @param {object} dbEntry - * @returns {boolean} - */ -function isManifestMatch(entry, dbEntry) { - if (!dbEntry) return false; - if (entry?.hash && dbEntry.hash) return entry.hash === dbEntry.hash; - const mtimeMatch = Number.isFinite(entry?.mtimeMs) && Number.isFinite(dbEntry.mtimeMs) - ? entry.mtimeMs === dbEntry.mtimeMs - : false; - const sizeMatch = Number.isFinite(entry?.size) && Number.isFinite(dbEntry.size) - ? entry.size === dbEntry.size - : false; - return mtimeMatch && sizeMatch; -} +export { runBuildSqliteIndex }; -/** - * Diff file manifests into added/changed/deleted sets. - * @param {object} manifestFiles - * @param {object} dbFiles - * @returns {{added:string[],changed:string[],deleted:string[]}} - */ -function diffFileManifests(manifestFiles, dbFiles) { - const changed = []; - const deleted = []; - const manifestKeys = Object.keys(manifestFiles || {}); - const manifestSet = new Set(manifestKeys); - - for (const file of manifestKeys) { - const entry = manifestFiles[file]; - const dbEntry = dbFiles.get(file); - if (!isManifestMatch(entry, dbEntry)) { - changed.push(file); - } - } - - for (const [file] of dbFiles.entries()) { - if (!manifestSet.has(file)) deleted.push(file); - } - - return { changed, deleted }; -} - -/** - * Fetch vocab rows by value for a given mode/table. - * @param {import('better-sqlite3').Database} db - * @param {'code'|'prose'} mode - * @param {string} table - * @param {string} idColumn - * @param {string} valueColumn - * @param {string[]} values - * @returns {Array<{id:number,value:string}>} - */ -function fetchVocabRows(db, mode, table, idColumn, valueColumn, values) { - const unique = Array.from(new Set(values.filter(Boolean))); - if (!unique.length) return []; - const rows = []; - for (const chunk of chunkArray(unique)) { - const placeholders = chunk.map(() => '?').join(','); - const stmt = db.prepare( - `SELECT ${idColumn} AS id, ${valueColumn} AS value FROM ${table} WHERE mode = ? AND ${valueColumn} IN (${placeholders})` - ); - rows.push(...stmt.all(mode, ...chunk)); - } - return rows; -} - -/** - * Ensure vocab ids exist for a list of values. 
- * @param {import('better-sqlite3').Database} db - * @param {'code'|'prose'} mode - * @param {string} table - * @param {string} idColumn - * @param {string} valueColumn - * @param {string[]} values - * @param {import('better-sqlite3').Statement} insertStmt - * @returns {Map} - */ -function ensureVocabIds(db, mode, table, idColumn, valueColumn, values, insertStmt) { - const unique = Array.from(new Set(values.filter(Boolean))); - if (!unique.length) return new Map(); - const existing = fetchVocabRows(db, mode, table, idColumn, valueColumn, unique); - const map = new Map(existing.map((row) => [row.value, row.id])); - const missing = unique.filter((value) => !map.has(value)); - if (!missing.length) return map; - - missing.sort(); - const maxRow = db.prepare(`SELECT MAX(${idColumn}) AS maxId FROM ${table} WHERE mode = ?`).get(mode); - let nextId = Number.isFinite(maxRow?.maxId) ? maxRow.maxId + 1 : 0; - const insertTx = db.transaction(() => { - for (const value of missing) { - insertStmt.run(mode, nextId, value); - map.set(value, nextId); - nextId += 1; - } - }); - insertTx(); - - return map; -} - -/** - * Delete doc ids from all tables for a mode. - * @param {import('better-sqlite3').Database} db - * @param {'code'|'prose'} mode - * @param {number[]} docIds - * @param {Array<{table:string,column:string,withMode:boolean,transform?:(value:any)=>any}>} [extraTables] - */ -function deleteDocIds(db, mode, docIds, extraTables = []) { - if (!docIds.length) return; - const deleteTargets = [ - { table: 'chunks', column: 'id' }, - { table: 'chunks_fts', column: 'rowid' }, - { table: 'token_postings', column: 'doc_id' }, - { table: 'phrase_postings', column: 'doc_id' }, - { table: 'chargram_postings', column: 'doc_id' }, - { table: 'minhash_signatures', column: 'doc_id' }, - { table: 'dense_vectors', column: 'doc_id' }, - { table: 'doc_lengths', column: 'doc_id' } - ]; - for (const extra of extraTables) { - if (extra?.table && extra?.column) deleteTargets.push(extra); - } - for (const chunk of chunkArray(docIds)) { - const placeholders = chunk.map(() => '?').join(','); - for (const target of deleteTargets) { - const withMode = target.withMode !== false; - const values = target.transform ? chunk.map(target.transform) : chunk; - const where = withMode - ? `mode = ? AND ${target.column} IN (${placeholders})` - : `${target.column} IN (${placeholders})`; - const stmt = db.prepare( - `DELETE FROM ${target.table} WHERE ${where}` - ); - if (withMode) { - stmt.run(mode, ...values); - } else { - stmt.run(...values); - } - } - } -} - -/** - * Recompute and update token stats for a mode. - * @param {import('better-sqlite3').Database} db - * @param {'code'|'prose'} mode - * @param {import('better-sqlite3').Statement} insertTokenStats - */ -function updateTokenStats(db, mode, insertTokenStats) { - const row = db.prepare( - 'SELECT COUNT(*) AS total_docs, AVG(len) AS avg_doc_len FROM doc_lengths WHERE mode = ?' - ).get(mode) || {}; - insertTokenStats.run( - mode, - typeof row.avg_doc_len === 'number' ? row.avg_doc_len : 0, - typeof row.total_docs === 'number' ? row.total_docs : 0 - ); -} - -/** - * Apply incremental updates to a SQLite index using cached bundles. 
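Before any rows change, the incremental path diffs the cached manifest against what the database last recorded. A standalone sketch of that comparison, mirroring the isManifestMatch/diffFileManifests logic defined further down (the names here are illustrative):

// An entry is unchanged when hashes agree, or failing that, when both
// mtime and size agree; anything else is treated as changed.
const entryMatches = (entry, dbEntry) => {
  if (!dbEntry) return false;
  if (entry?.hash && dbEntry.hash) return entry.hash === dbEntry.hash;
  return Number.isFinite(entry?.mtimeMs) && entry.mtimeMs === dbEntry.mtimeMs
    && Number.isFinite(entry?.size) && entry.size === dbEntry.size;
};

const diffManifests = (manifestFiles, dbFiles /* Map<string, entry> */) => ({
  changed: Object.keys(manifestFiles).filter(
    (file) => !entryMatches(manifestFiles[file], dbFiles.get(file))
  ),
  deleted: [...dbFiles.keys()].filter((file) => !(file in manifestFiles))
});

// diffManifests({ 'a.js': { hash: 'h1' } },
//   new Map([['a.js', { hash: 'h1' }], ['b.js', { hash: 'h2' }]]))
// -> { changed: [], deleted: ['b.js'] }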
- * @param {string} outPath - * @param {'code'|'prose'} mode - * @param {object|null} incrementalData - * @param {{expectedDense?:{model?:string|null,dims?:number|null}}} [options] - * @returns {{used:boolean,reason?:string,changedFiles?:number,deletedFiles?:number,insertedChunks?:number}} - */ -function incrementalUpdateDatabase(outPath, mode, incrementalData, options = {}) { - if (!incrementalData?.manifest) { - return { used: false, reason: 'missing incremental manifest' }; - } - if (!fsSync.existsSync(outPath)) { - return { used: false, reason: 'sqlite db missing' }; - } - - const expectedDense = options.expectedDense || null; - const expectedModel = expectedDense?.model || modelConfig.id || null; - const expectedDims = Number.isFinite(expectedDense?.dims) ? expectedDense.dims : null; - - const db = new Database(outPath); - try { - db.pragma('journal_mode = WAL'); - db.pragma('synchronous = NORMAL'); - } catch {} - - const schemaVersion = getSchemaVersion(db); - if (schemaVersion !== SCHEMA_VERSION) { - db.close(); - return { - used: false, - reason: `schema mismatch (db=${schemaVersion ?? 'unknown'}, expected=${SCHEMA_VERSION})` - }; - } - - if (!hasRequiredTables(db, REQUIRED_TABLES)) { - db.close(); - return { used: false, reason: 'schema missing' }; - } - - const dbDenseMeta = db.prepare( - 'SELECT dims, scale, model FROM dense_meta WHERE mode = ?' - ).get(mode); - const dbDims = Number.isFinite(dbDenseMeta?.dims) ? dbDenseMeta.dims : null; - const dbModel = dbDenseMeta?.model || null; - if ((expectedModel || expectedDims !== null) && !dbDenseMeta) { - db.close(); - return { used: false, reason: 'dense metadata missing' }; - } - if (expectedModel) { - if (!dbModel) { - db.close(); - return { used: false, reason: 'dense metadata model missing' }; - } - if (dbModel !== expectedModel) { - db.close(); - return { used: false, reason: `model mismatch (db=${dbModel}, expected=${expectedModel})` }; - } - } - if (expectedDims !== null) { - if (dbDims === null) { - db.close(); - return { used: false, reason: 'dense metadata dims missing' }; - } - if (dbDims !== expectedDims) { - db.close(); - return { used: false, reason: `dense dims mismatch (db=${dbDims}, expected=${expectedDims})` }; - } - } - - const manifestFiles = incrementalData.manifest.files || {}; - const dbFiles = getFileManifest(db, mode); - const { changed, deleted } = diffFileManifests(manifestFiles, dbFiles); - if (!changed.length && !deleted.length) { - db.close(); - return { used: true, changedFiles: 0, deletedFiles: 0, insertedChunks: 0 }; - } - - const bundles = new Map(); - for (const file of changed) { - const entry = manifestFiles[file]; - const bundleName = entry?.bundle; - if (!bundleName) { - db.close(); - return { used: false, reason: `missing bundle for ${file}` }; - } - const bundlePath = path.join(incrementalData.bundleDir, bundleName); - if (!fsSync.existsSync(bundlePath)) { - db.close(); - return { used: false, reason: `bundle missing for ${file}` }; - } - const bundle = readJson(bundlePath); - if (!bundle || !Array.isArray(bundle.chunks)) { - db.close(); - return { used: false, reason: `invalid bundle for ${file}` }; - } - bundles.set(file, bundle); - } - - const tokenValues = []; - const phraseValues = []; - const chargramValues = []; - const incomingDimsSet = new Set(); - for (const bundle of bundles.values()) { - for (const chunk of bundle.chunks || []) { - const tokensArray = Array.isArray(chunk.tokens) ? 
chunk.tokens : []; - if (tokensArray.length) tokenValues.push(...tokensArray); - if (Array.isArray(chunk.ngrams)) phraseValues.push(...chunk.ngrams); - if (Array.isArray(chunk.chargrams)) chargramValues.push(...chunk.chargrams); - if (Array.isArray(chunk.embedding) && chunk.embedding.length) { - incomingDimsSet.add(chunk.embedding.length); - } - } - } - if (incomingDimsSet.size > 1) { - db.close(); - return { used: false, reason: 'embedding dims mismatch across bundles' }; - } - const incomingDims = incomingDimsSet.size ? [...incomingDimsSet][0] : null; - if (incomingDims !== null && dbDims !== null && incomingDims !== dbDims) { - db.close(); - return { used: false, reason: `embedding dims mismatch (db=${dbDims}, incoming=${incomingDims})` }; - } - if (incomingDims !== null && expectedDims !== null && incomingDims !== expectedDims) { - db.close(); - return { used: false, reason: `embedding dims mismatch (expected=${expectedDims}, incoming=${incomingDims})` }; - } - - const insertChunk = db.prepare(` - INSERT OR REPLACE INTO chunks ( - id, mode, file, start, end, startLine, endLine, ext, kind, name, headline, - preContext, postContext, weight, tokens, ngrams, codeRelations, docmeta, - stats, complexity, lint, externalDocs, last_modified, last_author, churn, - chunk_authors - ) VALUES ( - @id, @mode, @file, @start, @end, @startLine, @endLine, @ext, @kind, @name, @headline, - @preContext, @postContext, @weight, @tokens, @ngrams, @codeRelations, @docmeta, - @stats, @complexity, @lint, @externalDocs, @last_modified, @last_author, @churn, - @chunk_authors - ); - `); - - const insertFts = db.prepare(` - INSERT OR REPLACE INTO chunks_fts (rowid, mode, file, name, kind, headline, tokens) - VALUES (@id, @mode, @file, @name, @kind, @headline, @tokensText); - `); - - const insertTokenVocab = db.prepare( - 'INSERT OR REPLACE INTO token_vocab (mode, token_id, token) VALUES (?, ?, ?)' - ); - const insertTokenPosting = db.prepare( - 'INSERT OR REPLACE INTO token_postings (mode, token_id, doc_id, tf) VALUES (?, ?, ?, ?)' - ); - const insertDocLength = db.prepare( - 'INSERT OR REPLACE INTO doc_lengths (mode, doc_id, len) VALUES (?, ?, ?)' - ); - const insertTokenStats = db.prepare( - 'INSERT OR REPLACE INTO token_stats (mode, avg_doc_len, total_docs) VALUES (?, ?, ?)' - ); - const insertPhraseVocab = db.prepare( - 'INSERT OR REPLACE INTO phrase_vocab (mode, phrase_id, ngram) VALUES (?, ?, ?)' - ); - const insertPhrasePosting = db.prepare( - 'INSERT OR REPLACE INTO phrase_postings (mode, phrase_id, doc_id) VALUES (?, ?, ?)' - ); - const insertChargramVocab = db.prepare( - 'INSERT OR REPLACE INTO chargram_vocab (mode, gram_id, gram) VALUES (?, ?, ?)' - ); - const insertChargramPosting = db.prepare( - 'INSERT OR REPLACE INTO chargram_postings (mode, gram_id, doc_id) VALUES (?, ?, ?)' - ); - const insertMinhash = db.prepare( - 'INSERT OR REPLACE INTO minhash_signatures (mode, doc_id, sig) VALUES (?, ?, ?)' - ); - const insertDense = db.prepare( - 'INSERT OR REPLACE INTO dense_vectors (mode, doc_id, vector) VALUES (?, ?, ?)' - ); - const insertDenseMeta = db.prepare( - 'INSERT OR REPLACE INTO dense_meta (mode, dims, scale, model) VALUES (?, ?, ?, ?)' - ); - const insertFileManifest = db.prepare( - 'INSERT OR REPLACE INTO file_manifest (mode, file, hash, mtimeMs, size, chunk_count) VALUES (?, ?, ?, ?, ?, ?)' - ); - - const tokenIdMap = ensureVocabIds(db, mode, 'token_vocab', 'token_id', 'token', tokenValues, insertTokenVocab); - const phraseIdMap = ensureVocabIds(db, mode, 'phrase_vocab', 'phrase_id', 'ngram', 
phraseValues, insertPhraseVocab); - const chargramIdMap = ensureVocabIds(db, mode, 'chargram_vocab', 'gram_id', 'gram', chargramValues, insertChargramVocab); - - const maxRow = db.prepare('SELECT MAX(id) AS maxId FROM chunks WHERE mode = ?').get(mode); - let nextDocId = Number.isFinite(maxRow?.maxId) ? maxRow.maxId + 1 : 0; - const denseMetaRow = dbDenseMeta; - let denseMetaSet = !!denseMetaRow; - let denseDims = typeof denseMetaRow?.dims === 'number' ? denseMetaRow.dims : null; - let denseWarned = false; - let insertedChunks = 0; - let vectorAnnLoaded = false; - let vectorAnnReady = false; - let vectorAnnTable = vectorExtension.table || 'dense_vectors_ann'; - let vectorAnnColumn = vectorExtension.column || 'embedding'; - let insertVectorAnn = null; - if (vectorAnnEnabled) { - const loadResult = loadVectorExtension(db, vectorExtension, `sqlite ${mode}`); - if (loadResult.ok) { - vectorAnnLoaded = true; - if (hasVectorTable(db, vectorAnnTable)) { - vectorAnnReady = true; - } else if (denseDims) { - const created = ensureVectorTable(db, vectorExtension, denseDims); - if (created.ok) { - vectorAnnReady = true; - vectorAnnTable = created.tableName; - vectorAnnColumn = created.column; - } else { - console.warn(`[sqlite] Failed to create vector table for ${mode}: ${created.reason}`); - } - } - if (vectorAnnReady) { - insertVectorAnn = db.prepare( - `INSERT OR REPLACE INTO ${vectorAnnTable} (rowid, ${vectorAnnColumn}) VALUES (?, ?)` - ); - } - } else { - console.warn(`[sqlite] Vector extension unavailable for ${mode}: ${loadResult.reason}`); - } - } - const vectorDeleteTargets = vectorAnnLoaded && vectorAnnReady - ? [{ table: vectorAnnTable, column: 'rowid', withMode: false, transform: toVectorId }] - : []; - - const applyChanges = db.transaction(() => { - for (const file of deleted) { - const normalizedFile = normalizeFilePath(file); - const docRows = db.prepare('SELECT id FROM chunks WHERE mode = ? AND file = ?').all(mode, normalizedFile); - const docIds = docRows.map((row) => row.id); - deleteDocIds(db, mode, docIds, vectorDeleteTargets); - db.prepare('DELETE FROM file_manifest WHERE mode = ? AND file = ?').run(mode, normalizedFile); - } - - for (const file of changed) { - const normalizedFile = normalizeFilePath(file); - const docRows = db.prepare('SELECT id FROM chunks WHERE mode = ? AND file = ?').all(mode, normalizedFile); - const docIds = docRows.map((row) => row.id); - deleteDocIds(db, mode, docIds, vectorDeleteTargets); - - const bundle = bundles.get(file); - let chunkCount = 0; - for (const chunk of bundle.chunks || []) { - const docId = nextDocId; - nextDocId += 1; - const row = buildChunkRow(chunk, mode, docId); - insertChunk.run(row); - insertFts.run(row); - - const tokensArray = Array.isArray(chunk.tokens) ? 
chunk.tokens : []; - insertDocLength.run(mode, docId, tokensArray.length); - const freq = buildTokenFrequency(tokensArray); - for (const [token, tf] of freq.entries()) { - const tokenId = tokenIdMap.get(token); - if (tokenId === undefined) continue; - insertTokenPosting.run(mode, tokenId, docId, tf); - } - - if (Array.isArray(chunk.ngrams)) { - const unique = new Set(chunk.ngrams); - for (const ng of unique) { - const phraseId = phraseIdMap.get(ng); - if (phraseId === undefined) continue; - insertPhrasePosting.run(mode, phraseId, docId); - } - } - - if (Array.isArray(chunk.chargrams)) { - const unique = new Set(chunk.chargrams); - for (const gram of unique) { - const gramId = chargramIdMap.get(gram); - if (gramId === undefined) continue; - insertChargramPosting.run(mode, gramId, docId); - } - } - - if (Array.isArray(chunk.minhashSig) && chunk.minhashSig.length) { - insertMinhash.run(mode, docId, packUint32(chunk.minhashSig)); - } - - if (Array.isArray(chunk.embedding) && chunk.embedding.length) { - const dims = chunk.embedding.length; - if (!denseMetaSet) { - insertDenseMeta.run(mode, dims, 1.0, modelConfig.id || null); - denseMetaSet = true; - denseDims = dims; - } else if (denseDims !== null && dims !== denseDims && !denseWarned) { - console.warn(`Dense vector dims mismatch for ${mode}: expected ${denseDims}, got ${dims}`); - denseWarned = true; - } - insertDense.run(mode, docId, packUint8(quantizeVec(chunk.embedding))); - if (vectorAnnLoaded) { - if (!vectorAnnReady) { - const created = ensureVectorTable(db, vectorExtension, dims); - if (created.ok) { - vectorAnnReady = true; - vectorAnnTable = created.tableName; - vectorAnnColumn = created.column; - insertVectorAnn = db.prepare( - `INSERT OR REPLACE INTO ${vectorAnnTable} (rowid, ${vectorAnnColumn}) VALUES (?, ?)` - ); - } - } - if (vectorAnnReady && insertVectorAnn) { - const encoded = encodeVector(chunk.embedding, vectorExtension); - if (encoded) insertVectorAnn.run(toVectorId(docId), encoded); - } - } - } - - chunkCount += 1; - insertedChunks += 1; - } - - const entry = manifestFiles[file] || {}; - insertFileManifest.run( - mode, - normalizedFile, - entry?.hash || null, - Number.isFinite(entry?.mtimeMs) ? entry.mtimeMs : null, - Number.isFinite(entry?.size) ? entry.size : null, - chunkCount - ); - } - - updateTokenStats(db, mode, insertTokenStats); +if (process.argv[1] === fileURLToPath(import.meta.url)) { + runBuildSqliteIndex().catch((err) => { + console.error(err?.message || err); + process.exit(1); }); - - applyChanges(); - db.close(); - return { - used: true, - changedFiles: changed.length, - deletedFiles: deleted.length, - insertedChunks - }; -} - -/** - * Build or incrementally update an index for a mode. - * @param {'code'|'prose'} mode - * @param {object|null} index - * @param {string} targetPath - * @param {object|null} incrementalData - * @returns {{count?:number,incremental:boolean,changedFiles?:number,deletedFiles?:number,insertedChunks?:number}} - */ -async function runMode(mode, index, targetPath, incrementalData) { - if (incrementalRequested) { - const expectedDense = index?.denseVec - ? 
{ model: index.denseVec.model, dims: index.denseVec.dims } - : null; - const result = incrementalUpdateDatabase(targetPath, mode, incrementalData, { - expectedDense - }); - if (result.used) { - if (compactOnIncremental && (result.changedFiles || result.deletedFiles)) { - console.log(`[sqlite] Compaction requested for ${mode} index...`); - await compactDatabase({ - dbPath: targetPath, - mode, - vectorExtension, - dryRun: false, - keepBackup: false - }); - } - return { ...result, incremental: true }; - } - if (result.reason) { - console.warn(`[sqlite] Incremental ${mode} update skipped (${result.reason}); rebuilding full index.`); - } - } - const count = buildDatabase(targetPath, index, mode, incrementalData?.manifest?.files); - return { count, incremental: false, changedFiles: null, deletedFiles: null, insertedChunks: count }; -} - -const results = {}; -if (modeArg === 'all' || modeArg === 'code') { - const targetPath = modeArg === 'all' ? codeOutPath : outPath; - results.code = await runMode('code', codeIndex, targetPath, incrementalCode); -} -if (modeArg === 'all' || modeArg === 'prose') { - const targetPath = modeArg === 'all' ? proseOutPath : outPath; - results.prose = await runMode('prose', proseIndex, targetPath, incrementalProse); -} - -if (modeArg === 'all') { - const codeResult = results.code || {}; - const proseResult = results.prose || {}; - if (codeResult.incremental || proseResult.incremental) { - console.log(`SQLite indexes updated at code=${codeOutPath} prose=${proseOutPath}. code+${codeResult.insertedChunks || 0} prose+${proseResult.insertedChunks || 0}`); - } else { - console.log(`SQLite indexes built at code=${codeOutPath} prose=${proseOutPath}. code=${codeResult.count || 0} prose=${proseResult.count || 0}`); - } -} else { - const result = modeArg === 'code' ? results.code : results.prose; - if (result?.incremental) { - console.log(`SQLite ${modeArg} index updated at ${outPath}. +${result.insertedChunks || 0} chunks`); - } else { - console.log(`SQLite ${modeArg} index built at ${outPath}. 
${modeArg}=${result?.count || 0}`); - } } diff --git a/tools/build-sqlite-index/cli.js b/tools/build-sqlite-index/cli.js new file mode 100644 index 000000000..3bb8ac7ef --- /dev/null +++ b/tools/build-sqlite-index/cli.js @@ -0,0 +1,40 @@ +import { createCli } from '../../src/shared/cli.js'; + +export const normalizeValidateMode = (value) => { + if (value === false || value == null) return 'off'; + const normalized = String(value).trim().toLowerCase(); + if (!normalized || normalized === 'true') return 'smoke'; + if (['off', 'false', '0', 'no'].includes(normalized)) return 'off'; + if (['full', 'integrity'].includes(normalized)) return 'full'; + return 'smoke'; +}; + +export const parseBuildSqliteArgs = (rawArgs, options = {}) => { + const emitOutput = options.emitOutput !== false; + const exitOnError = options.exitOnError !== false; + const argv = createCli({ + scriptName: 'build-sqlite-index', + argv: ['node', 'build-sqlite-index.js', ...(rawArgs || [])], + options: { + 'code-dir': { type: 'string' }, + 'prose-dir': { type: 'string' }, + out: { type: 'string' }, + mode: { type: 'string', default: 'all' }, + repo: { type: 'string' }, + incremental: { type: 'boolean', default: false }, + compact: { type: 'boolean', default: false }, + validate: { type: 'string', default: 'smoke' }, + 'index-root': { type: 'string' } + } + }).parse(); + const validateMode = normalizeValidateMode(argv.validate); + const modeArg = (argv.mode || 'all').toLowerCase(); + return { + argv, + emitOutput, + exitOnError, + validateMode, + modeArg, + rawArgs: rawArgs || [] + }; +}; diff --git a/tools/build-sqlite-index/index-state.js b/tools/build-sqlite-index/index-state.js new file mode 100644 index 000000000..b483b4e81 --- /dev/null +++ b/tools/build-sqlite-index/index-state.js @@ -0,0 +1,78 @@ +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import { readJson } from '../../src/storage/sqlite/utils.js'; +import { writeJsonObjectFile } from '../../src/shared/json-stream.js'; +import { checksumFile } from '../../src/shared/hash.js'; + +export const updateIndexStateManifest = async (indexDir) => { + const manifestPath = path.join(indexDir, 'pieces', 'manifest.json'); + if (!fsSync.existsSync(manifestPath)) return; + let manifest = null; + try { + manifest = readJson(manifestPath) || null; + } catch { + return; + } + if (!manifest || !Array.isArray(manifest.pieces)) return; + const statePath = path.join(indexDir, 'index_state.json'); + if (!fsSync.existsSync(statePath)) return; + let bytes = null; + let checksum = null; + let checksumAlgo = null; + try { + const stat = await fs.stat(statePath); + bytes = stat.size; + const result = await checksumFile(statePath); + checksum = result?.value || null; + checksumAlgo = result?.algo || null; + } catch {} + if (!bytes || !checksum) return; + const pieces = manifest.pieces.map((piece) => { + if (piece?.name !== 'index_state' || piece?.path !== 'index_state.json') { + return piece; + } + return { + ...piece, + bytes, + checksum: checksum && checksumAlgo ? `${checksumAlgo}:${checksum}` : piece.checksum + }; + }); + const next = { + ...manifest, + updatedAt: new Date().toISOString(), + pieces + }; + try { + await writeJsonObjectFile(manifestPath, { fields: next, atomic: true }); + } catch { + // Ignore manifest write failures. 
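The helpers in this new module give SQLite builds a two-phase readiness flag: state flips to pending before a build and back to ready only on success, and the pieces manifest is refreshed so the recorded checksum for index_state.json stays accurate. A hedged usage sketch of that lifecycle (withSqliteState is illustrative; the real call sites are in run.js below):

import { updateSqliteState } from './index-state.js';

const withSqliteState = async (indexDir, build) => {
  // Phase 1: mark the sqlite artifacts pending before touching them.
  await updateSqliteState(indexDir, { enabled: true, ready: false, pending: true });
  await build();
  // Phase 2: reached only on success; a thrown build leaves pending=true,
  // which readers can interpret as "keep using the previous artifacts".
  await updateSqliteState(indexDir, { enabled: true, ready: true, pending: false });
};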
+ } +}; + +export const updateSqliteState = async (indexDir, patch) => { + if (!indexDir) return; + const statePath = path.join(indexDir, 'index_state.json'); + let state = {}; + if (fsSync.existsSync(statePath)) { + try { + state = readJson(statePath) || {}; + } catch { + state = {}; + } + } + const now = new Date().toISOString(); + state.generatedAt = state.generatedAt || now; + state.updatedAt = now; + state.sqlite = { + ...(state.sqlite || {}), + ...patch, + updatedAt: now + }; + try { + await writeJsonObjectFile(statePath, { fields: state, atomic: true }); + } catch { + // Ignore index state write failures. + } + await updateIndexStateManifest(indexDir); +}; diff --git a/tools/build-sqlite-index/run.js b/tools/build-sqlite-index/run.js new file mode 100644 index 000000000..68f36da48 --- /dev/null +++ b/tools/build-sqlite-index/run.js @@ -0,0 +1,345 @@ +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { parseBuildSqliteArgs } from './cli.js'; +import { createTempPath } from './temp-path.js'; +import { updateSqliteState } from './index-state.js'; +import { getEnvConfig } from '../../src/shared/env.js'; +import { resolveThreadLimits } from '../../src/shared/threads.js'; +import { markBuildPhase, resolveBuildStatePath, startBuildHeartbeat } from '../../src/index/build/build-state.js'; +import { + getIndexDir, + getModelConfig, + getRepoCacheRoot, + loadUserConfig, + resolveIndexRoot, + resolveRepoRoot, + resolveSqlitePaths +} from '../dict-utils.js'; +import { + encodeVector, + ensureVectorTable, + getVectorExtensionConfig, + hasVectorTable, + loadVectorExtension +} from '../vector-extension.js'; +import { compactDatabase } from '../compact-sqlite-index.js'; +import { loadIncrementalManifest } from '../../src/storage/sqlite/incremental.js'; +import { loadIndex, replaceSqliteDatabase } from '../../src/storage/sqlite/utils.js'; +import { buildDatabaseFromArtifacts, loadIndexPieces } from '../../src/storage/sqlite/build/from-artifacts.js'; +import { buildDatabaseFromBundles } from '../../src/storage/sqlite/build/from-bundles.js'; +import { incrementalUpdateDatabase } from '../../src/storage/sqlite/build/incremental-update.js'; + +let Database = null; +try { + ({ default: Database } = await import('better-sqlite3')); +} catch {} + +const resolveOutputPaths = ({ modeArg, outArg, sqlitePaths }) => { + let outPath = null; + let codeOutPath = sqlitePaths.codePath; + let proseOutPath = sqlitePaths.prosePath; + if (outArg) { + if (modeArg === 'all') { + const outDir = outArg.endsWith('.db') ? path.dirname(outArg) : outArg; + codeOutPath = path.join(outDir, 'index-code.db'); + proseOutPath = path.join(outDir, 'index-prose.db'); + } else { + const targetName = modeArg === 'code' ? 'index-code.db' : 'index-prose.db'; + outPath = outArg.endsWith('.db') ? outArg : path.join(outArg, targetName); + } + } + if (!outPath && modeArg !== 'all') { + outPath = modeArg === 'code' ? 
codeOutPath : proseOutPath; + } + return { outPath, codeOutPath, proseOutPath }; +}; + +export async function runBuildSqliteIndex(rawArgs = process.argv.slice(2), options = {}) { + const { + argv, + emitOutput, + exitOnError, + validateMode, + modeArg, + rawArgs: parsedRawArgs + } = parseBuildSqliteArgs(rawArgs, options); + const bail = (message, code = 1) => { + if (emitOutput && message) console.error(message); + if (exitOnError) process.exit(code); + throw new Error(message || 'SQLite index build failed.'); + }; + if (!Database) return bail('better-sqlite3 is required. Run npm install first.'); + + const rootArg = options.root ? path.resolve(options.root) : (argv.repo ? path.resolve(argv.repo) : null); + const root = rootArg || resolveRepoRoot(process.cwd()); + const envConfig = getEnvConfig(); + const userConfig = loadUserConfig(root); + const indexRoot = argv['index-root'] + ? path.resolve(argv['index-root']) + : resolveIndexRoot(root, userConfig); + const buildStatePath = resolveBuildStatePath(indexRoot); + const hasBuildState = buildStatePath && fsSync.existsSync(buildStatePath); + const stopHeartbeat = hasBuildState ? startBuildHeartbeat(indexRoot, 'stage4') : () => {}; + const threadLimits = resolveThreadLimits({ + argv, + rawArgv: parsedRawArgs, + envConfig, + configConcurrency: userConfig?.indexing?.concurrency, + importConcurrencyConfig: userConfig?.indexing?.importConcurrency + }); + if (emitOutput && envConfig.verbose === true) { + console.log( + `[sqlite] Thread limits (${threadLimits.source}): ` + + `cpu=${threadLimits.cpuCount}, cap=${threadLimits.maxConcurrencyCap}, ` + + `files=${threadLimits.fileConcurrency}, imports=${threadLimits.importConcurrency}, ` + + `io=${threadLimits.ioConcurrency}, cpuWork=${threadLimits.cpuConcurrency}.` + ); + } + const modelConfig = getModelConfig(root, userConfig); + const vectorExtension = getVectorExtensionConfig(root, userConfig); + const vectorAnnEnabled = vectorExtension.enabled; + const vectorConfig = { + enabled: vectorAnnEnabled, + extension: vectorExtension, + encodeVector, + hasVectorTable, + loadVectorExtension, + ensureVectorTable + }; + const repoCacheRoot = getRepoCacheRoot(root, userConfig); + const compactFlag = argv.compact; + const compactOnIncremental = compactFlag === true + || (compactFlag !== false && userConfig?.sqlite?.compactOnIncremental === true); + const codeDir = argv['code-dir'] + ? path.resolve(argv['code-dir']) + : getIndexDir(root, 'code', userConfig, { indexRoot }); + const proseDir = argv['prose-dir'] + ? path.resolve(argv['prose-dir']) + : getIndexDir(root, 'prose', userConfig, { indexRoot }); + const sqlitePaths = resolveSqlitePaths(root, userConfig, indexRoot ? { indexRoot } : {}); + const incrementalRequested = argv.incremental === true; + + if (!['all', 'code', 'prose'].includes(modeArg)) { + return bail('Invalid mode. Use --mode all|code|prose'); + } + + const sqliteStateTargets = []; + if (modeArg === 'all' || modeArg === 'code') sqliteStateTargets.push(codeDir); + if (modeArg === 'all' || modeArg === 'prose') sqliteStateTargets.push(proseDir); + if (hasBuildState) { + await markBuildPhase(indexRoot, 'stage4', 'running'); + } + await Promise.all(sqliteStateTargets.map((dir) => updateSqliteState(dir, { + enabled: true, + ready: false, + pending: true + }))); + + const outArg = argv.out ? 
path.resolve(argv.out) : null; + const { outPath, codeOutPath, proseOutPath } = resolveOutputPaths({ + modeArg, + outArg, + sqlitePaths + }); + + if (modeArg === 'all') { + await fs.mkdir(path.dirname(codeOutPath), { recursive: true }); + await fs.mkdir(path.dirname(proseOutPath), { recursive: true }); + } else if (outPath) { + await fs.mkdir(path.dirname(outPath), { recursive: true }); + } + + const loadIndexSafe = (dir, label) => { + try { + const index = loadIndex(dir, modelConfig.id); + if (index) return { index, tooLarge: false, pieces: null }; + return { index: null, tooLarge: false, pieces: loadIndexPieces(dir, modelConfig.id) }; + } catch (err) { + if (err?.code === 'ERR_JSON_TOO_LARGE') { + console.warn(`[sqlite] ${label} chunk_meta too large; will use pieces if available.`); + return { index: null, tooLarge: true, pieces: loadIndexPieces(dir, modelConfig.id) }; + } + throw err; + } + }; + + const { index: codeIndex, pieces: codePieces } = loadIndexSafe(codeDir, 'code'); + const { index: proseIndex, pieces: prosePieces } = loadIndexSafe(proseDir, 'prose'); + const incrementalCode = loadIncrementalManifest(repoCacheRoot, 'code'); + const incrementalProse = loadIncrementalManifest(repoCacheRoot, 'prose'); + if (!codeIndex && !codePieces && !proseIndex && !prosePieces + && !incrementalCode?.manifest && !incrementalProse?.manifest) { + return bail('No index found. Build index-code/index-prose first.'); + } + + if (sqlitePaths.legacyExists) { + try { + await fs.rm(sqlitePaths.legacyPath, { force: true }); + console.warn(`Removed legacy SQLite index at ${sqlitePaths.legacyPath}`); + } catch (err) { + console.warn(`Failed to remove legacy SQLite index at ${sqlitePaths.legacyPath}: ${err?.message || err}`); + } + } + + if (modeArg === 'code' && !codeIndex && !codePieces && !incrementalCode?.manifest) { + return bail('Code index missing; build index-code first.'); + } + if (modeArg === 'prose' && !proseIndex && !prosePieces && !incrementalProse?.manifest) { + return bail('Prose index missing; build index-prose first.'); + } + + const workerPath = fileURLToPath(new URL('../workers/bundle-reader.js', import.meta.url)); + + const runMode = async (mode, index, indexDir, targetPath, incrementalData) => { + const hasBundles = incrementalData?.manifest?.files + ? Object.keys(incrementalData.manifest.files).length > 0 + : false; + + if (incrementalRequested) { + const expectedDense = index?.denseVec + ? 
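One detail worth calling out in loadIndexSafe above: the ERR_JSON_TOO_LARGE error code is treated as a routine signal to fall back to the piece-backed loader rather than as a failure. The shape of that pattern, reduced to a sketch with illustrative names:

const loadWithFallback = (loadWhole, loadPieces) => {
  try {
    return { index: loadWhole(), pieces: null };
  } catch (err) {
    if (err?.code === 'ERR_JSON_TOO_LARGE') {
      // Oversized chunk_meta: fall back to pieces instead of aborting.
      return { index: null, pieces: loadPieces() };
    }
    throw err; // anything else is a real failure
  }
};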
{ model: index.denseVec.model, dims: index.denseVec.dims } + : null; + const result = await incrementalUpdateDatabase({ + Database, + outPath: targetPath, + mode, + incrementalData, + modelConfig, + vectorConfig, + emitOutput, + validateMode, + expectedDense + }); + if (result.used) { + if (compactOnIncremental && (result.changedFiles || result.deletedFiles)) { + console.log(`[sqlite] Compaction requested for ${mode} index...`); + await compactDatabase({ + dbPath: targetPath, + mode, + vectorExtension, + dryRun: false, + keepBackup: false + }); + } + return { ...result, incremental: true }; + } + if (result.reason) { + console.warn(`[sqlite] Incremental ${mode} update skipped (${result.reason}); rebuilding full index.`); + } + } + if (hasBundles) { + console.log(`[sqlite] Using incremental bundles for ${mode} full rebuild.`); + const tempPath = createTempPath(targetPath); + let bundleResult = { count: 0 }; + try { + bundleResult = await buildDatabaseFromBundles({ + Database, + outPath: tempPath, + mode, + incrementalData, + envConfig, + threadLimits, + emitOutput, + validateMode, + vectorConfig, + modelConfig, + workerPath + }); + if (bundleResult.count) { + await replaceSqliteDatabase(tempPath, targetPath, { keepBackup: true }); + } else { + await fs.rm(tempPath, { force: true }); + } + } catch (err) { + try { await fs.rm(tempPath, { force: true }); } catch {} + throw err; + } + if (bundleResult.count) { + return { + count: bundleResult.count, + incremental: false, + changedFiles: null, + deletedFiles: null, + insertedChunks: bundleResult.count + }; + } + if (bundleResult.reason) { + console.warn(`[sqlite] Bundle build skipped (${bundleResult.reason}); falling back to file-backed artifacts.`); + } + } + const tempPath = createTempPath(targetPath); + let count = 0; + try { + count = await buildDatabaseFromArtifacts({ + Database, + outPath: tempPath, + index, + indexDir, + mode, + manifestFiles: incrementalData?.manifest?.files, + emitOutput, + validateMode, + vectorConfig, + modelConfig + }); + await replaceSqliteDatabase(tempPath, targetPath, { keepBackup: true }); + } catch (err) { + try { await fs.rm(tempPath, { force: true }); } catch {} + throw err; + } + return { count, incremental: false, changedFiles: null, deletedFiles: null, insertedChunks: count }; + }; + + const results = {}; + if (modeArg === 'all' || modeArg === 'code') { + const targetPath = modeArg === 'all' ? codeOutPath : outPath; + const codeInput = codeIndex || codePieces; + results.code = await runMode('code', codeInput, codeDir, targetPath, incrementalCode); + } + if (modeArg === 'all' || modeArg === 'prose') { + const targetPath = modeArg === 'all' ? proseOutPath : outPath; + const proseInput = proseIndex || prosePieces; + results.prose = await runMode('prose', proseInput, proseDir, targetPath, incrementalProse); + } + + if (modeArg === 'all') { + const codeResult = results.code || {}; + const proseResult = results.prose || {}; + if (codeResult.incremental || proseResult.incremental) { + console.log(`SQLite indexes updated at code=${codeOutPath} prose=${proseOutPath}. ` + + `code+${codeResult.insertedChunks || 0} prose+${proseResult.insertedChunks || 0}`); + } else { + console.log(`SQLite indexes built at code=${codeOutPath} prose=${proseOutPath}. ` + + `code=${codeResult.count || 0} prose=${proseResult.count || 0}`); + } + } else { + const result = modeArg === 'code' ? results.code : results.prose; + if (result?.incremental) { + console.log(`SQLite ${modeArg} index updated at ${outPath}. 
+${result.insertedChunks || 0} chunks`); + } else { + console.log(`SQLite ${modeArg} index built at ${outPath}. ${modeArg}=${result?.count || 0}`); + } + } + + await Promise.all(sqliteStateTargets.map((dir) => updateSqliteState(dir, { + enabled: true, + ready: true, + pending: false + }))); + if (hasBuildState) { + await markBuildPhase(indexRoot, 'stage4', 'done'); + } + stopHeartbeat(); + + return { + mode: modeArg, + results, + paths: { + code: codeOutPath, + prose: proseOutPath, + out: outPath + } + }; +} diff --git a/tools/build-sqlite-index/temp-path.js b/tools/build-sqlite-index/temp-path.js new file mode 100644 index 000000000..5849e0fd1 --- /dev/null +++ b/tools/build-sqlite-index/temp-path.js @@ -0,0 +1 @@ +export { createTempPath } from '../build-embeddings/atomic.js'; diff --git a/tools/cache-gc.js b/tools/cache-gc.js index fab4c8fda..380dd221d 100644 --- a/tools/cache-gc.js +++ b/tools/cache-gc.js @@ -2,20 +2,28 @@ import fs from 'node:fs/promises'; import fsSync from 'node:fs'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; +import { getEnvConfig } from '../src/shared/env.js'; import { getCacheRoot, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; import { isRootPath } from './path-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['dry-run', 'json'], - string: ['max-bytes', 'max-gb', 'max-age-days', 'repo'], - default: { 'dry-run': false, json: false } -}); +const argv = createCli({ + scriptName: 'cache-gc', + options: { + 'dry-run': { type: 'boolean', default: false }, + json: { type: 'boolean', default: false }, + 'max-bytes': { type: 'number' }, + 'max-gb': { type: 'number' }, + 'max-age-days': { type: 'number' }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? 
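runMode above never writes into a live database: both the bundle and artifact paths build into a temp file from createTempPath and only then swap it in via replaceSqliteDatabase (which also exposes a keepBackup option). A reduced sketch of that build-then-swap shape; fs.rename publishes in one step on a single POSIX filesystem, and backup handling lives in the real helper:

import fs from 'node:fs/promises';

const buildAtomically = async (targetPath, buildInto) => {
  const tempPath = `${targetPath}.tmp-${process.pid}`; // naming is illustrative; see createTempPath
  try {
    await buildInto(tempPath);
    await fs.rename(tempPath, targetPath); // publish the finished database in one step
  } catch (err) {
    await fs.rm(tempPath, { force: true }); // never leave a half-built db behind
    throw err;
  }
};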
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); const userConfig = loadUserConfig(root); -const cacheRoot = (userConfig.cache && userConfig.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); +const envConfig = getEnvConfig(); +const cacheRoot = (userConfig.cache && userConfig.cache.root) || envConfig.cacheRoot || getCacheRoot(); const gcConfig = userConfig.cache?.gc || {}; const parseNumber = (value) => { @@ -84,17 +92,21 @@ if (!fsSync.existsSync(repoRoot)) { const entries = await fs.readdir(repoRoot, { withFileTypes: true }); const repos = []; +const needsSizeScan = maxBytes != null; for (const entry of entries) { if (!entry.isDirectory()) continue; const repoPath = path.join(repoRoot, entry.name); const stat = await fs.stat(repoPath); - const bytes = await sizeOfPath(repoPath); - repos.push({ + const repo = { id: entry.name, path: repoPath, - bytes, + bytes: null, mtimeMs: stat.mtimeMs - }); + }; + if (needsSizeScan) { + repo.bytes = await sizeOfPath(repoPath); + } + repos.push(repo); } const removals = []; @@ -123,6 +135,14 @@ if (maxBytes != null) { } } +if (!needsSizeScan && removals.length) { + for (const repo of removals) { + if (!Number.isFinite(repo.bytes)) { + repo.bytes = await sizeOfPath(repo.path); + } + } +} + for (const repo of removals) { if (isRootPath(repo.path)) { console.error(`refusing to delete root path: ${repo.path}`); @@ -132,8 +152,11 @@ for (const repo of removals) { await fs.rm(repo.path, { recursive: true, force: true }); } -const totalBytes = repos.reduce((sum, repo) => sum + repo.bytes, 0); -const freedBytes = removals.reduce((sum, repo) => sum + repo.bytes, 0); +const hasSizeData = repos.some((repo) => Number.isFinite(repo.bytes)); +const totalBytes = hasSizeData + ? repos.reduce((sum, repo) => sum + (Number.isFinite(repo.bytes) ? repo.bytes : 0), 0) + : null; +const freedBytes = removals.reduce((sum, repo) => sum + (Number.isFinite(repo.bytes) ? repo.bytes : 0), 0); const payload = { ok: true, dryRun, diff --git a/tools/ci-build-artifacts.js b/tools/ci-build-artifacts.js index cbc726147..314cbcc7b 100644 --- a/tools/ci-build-artifacts.js +++ b/tools/ci-build-artifacts.js @@ -3,25 +3,27 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import simpleGit from 'simple-git'; -import { fileURLToPath } from 'node:url'; -import { getIndexDir, loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; +import { getIndexDir, getRuntimeConfig, loadUserConfig, resolveRepoRoot, resolveRuntimeEnv, resolveSqlitePaths, resolveToolRoot } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['skip-build', 'skip-sqlite', 'incremental'], - string: ['out', 'repo'], - default: { - 'skip-build': false, - 'skip-sqlite': false, - 'incremental': false +const argv = createCli({ + scriptName: 'ci-build', + options: { + 'skip-build': { type: 'boolean', default: false }, + 'skip-sqlite': { type: 'boolean', default: false }, + incremental: { type: 'boolean', default: false }, + out: { type: 'string' }, + repo: { type: 'string' } } -}); +}).parse(); const rootArg = argv.repo ? 
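The cache-gc hunk above makes directory sizing lazy: the recursive walk runs only when a byte budget (max-bytes/max-gb) is in play, and age-based removals backfill sizes afterwards just for the entries being deleted. A reduced sketch of that gating, with sizeOf standing in for the script's sizeOfPath:

const collectRepos = async (dirs, { maxBytes, sizeOf }) => {
  const needsSizeScan = maxBytes != null;
  const repos = [];
  for (const dir of dirs) {
    const repo = { path: dir, bytes: null };
    // Only pay for the recursive walk when a byte budget forces it.
    if (needsSizeScan) repo.bytes = await sizeOf(dir);
    repos.push(repo);
  }
  return repos;
};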
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); -const scriptRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); +const scriptRoot = resolveToolRoot(); const userConfig = loadUserConfig(root); +const runtimeConfig = getRuntimeConfig(root, userConfig); +const baseEnv = resolveRuntimeEnv(runtimeConfig, process.env); const outDir = argv.out ? path.resolve(argv.out) : path.join(root, 'ci-artifacts'); const codeDir = getIndexDir(root, 'code', userConfig); const proseDir = getIndexDir(root, 'prose', userConfig); @@ -34,7 +36,7 @@ const sqlitePaths = resolveSqlitePaths(root, userConfig); * @param {string} label */ function run(cmd, args, label) { - const result = spawnSync(cmd, args, { stdio: 'inherit' }); + const result = spawnSync(cmd, args, { stdio: 'inherit', env: baseEnv }); if (result.status !== 0) { console.error(`Failed: ${label || cmd}`); process.exit(result.status ?? 1); @@ -42,13 +44,13 @@ function run(cmd, args, label) { } if (!argv['skip-build']) { - const args = [path.join(scriptRoot, 'build_index.js')]; + const args = [path.join(scriptRoot, 'build_index.js'), '--repo', root]; if (argv.incremental) args.push('--incremental'); run(process.execPath, args, 'build index'); } if (!argv['skip-sqlite']) { - const args = [path.join(scriptRoot, 'tools', 'build-sqlite-index.js')]; + const args = [path.join(scriptRoot, 'tools', 'build-sqlite-index.js'), '--repo', root]; if (argv.incremental) args.push('--incremental'); run(process.execPath, args, 'build sqlite index'); } diff --git a/tools/ci-restore-artifacts.js b/tools/ci-restore-artifacts.js index 4767036fe..508333d85 100644 --- a/tools/ci-restore-artifacts.js +++ b/tools/ci-restore-artifacts.js @@ -2,15 +2,18 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import simpleGit from 'simple-git'; import { getIndexDir, loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['force'], - string: ['from', 'repo'], - default: { force: false } -}); +const argv = createCli({ + scriptName: 'ci-restore', + options: { + force: { type: 'boolean', default: false }, + from: { type: 'string' }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? 
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); diff --git a/tools/clean-artifacts.js b/tools/clean-artifacts.js index aabba5939..7be9054ca 100644 --- a/tools/clean-artifacts.js +++ b/tools/clean-artifacts.js @@ -2,20 +2,25 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import minimist from 'minimist'; -import { getCacheRoot, getRepoCacheRoot, loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; +import { createCli } from '../src/shared/cli.js'; +import { getEnvConfig } from '../src/shared/env.js'; +import { getCacheRoot, getRepoCacheRoot, loadUserConfig, resolveLmdbPaths, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; import { isInside, isRootPath } from './path-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['all', 'dry-run'], - string: ['repo'], - default: { all: false, 'dry-run': false } -}); +const argv = createCli({ + scriptName: 'clean-artifacts', + options: { + all: { type: 'boolean', default: false }, + 'dry-run': { type: 'boolean', default: false }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); const userConfig = loadUserConfig(root); -const cacheRoot = (userConfig.cache && userConfig.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); +const envConfig = getEnvConfig(); +const cacheRoot = (userConfig.cache && userConfig.cache.root) || envConfig.cacheRoot || getCacheRoot(); const repoCacheRoot = getRepoCacheRoot(root, userConfig); const defaultSqliteDir = path.join(repoCacheRoot, 'index-sqlite'); const legacyRepoSqliteDir = path.join(root, 'index-sqlite'); @@ -23,6 +28,7 @@ const defaultCodePath = path.join(defaultSqliteDir, 'index-code.db'); const defaultProsePath = path.join(defaultSqliteDir, 'index-prose.db'); const defaultLegacyPath = path.join(defaultSqliteDir, 'index.db'); const sqlitePaths = resolveSqlitePaths(root, userConfig); +const lmdbPaths = resolveLmdbPaths(root, userConfig); const targets = []; @@ -60,6 +66,14 @@ if (fs.existsSync(legacyRepoSqliteDir) && !isInside(base, path.resolve(legacyRep targets.push(legacyRepoSqliteDir); } +const lmdbDirs = [lmdbPaths.codePath, lmdbPaths.prosePath]; +for (const dir of lmdbDirs) { + if (!dir || !fs.existsSync(dir)) continue; + if (!isInside(base, path.resolve(dir))) { + targets.push(dir); + } +} + const uniqueTargets = Array.from(new Set(targets.map((target) => path.resolve(target)))); for (const target of uniqueTargets) { if (!fs.existsSync(target)) { diff --git a/tools/cli-utils.js b/tools/cli-utils.js index 4f7974ef0..f100c1437 100644 --- a/tools/cli-utils.js +++ b/tools/cli-utils.js @@ -1,4 +1,4 @@ -import { spawnSync } from 'node:child_process'; +import { execaSync } from 'execa'; /** * Run a command and return a normalized result. 
@@ -8,10 +8,10 @@ import { spawnSync } from 'node:child_process'; * @returns {{ok:boolean,status:number|null,stdout?:string,stderr?:string}} */ export function runCommand(cmd, args, options = {}) { - const result = spawnSync(cmd, args, options); + const result = execaSync(cmd, args, { reject: false, ...options }); return { - ok: result.status === 0, - status: result.status, + ok: result.exitCode === 0, + status: result.exitCode, stdout: result.stdout, stderr: result.stderr }; diff --git a/tools/combined-summary.js b/tools/combined-summary.js index 5d3365f29..b3acae9c8 100644 --- a/tools/combined-summary.js +++ b/tools/combined-summary.js @@ -3,27 +3,41 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; -import { fileURLToPath } from 'node:url'; -import { resolveAnnSetting, resolveBaseline, resolveCompareModels } from '../src/compare/config.js'; -import { DEFAULT_MODEL_ID, getIndexDir, loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; +import { createCli } from '../src/shared/cli.js'; +import { resolveAnnSetting, resolveBaseline, resolveCompareModels } from '../src/experimental/compare/config.js'; +import { DEFAULT_MODEL_ID, getIndexDir, getRuntimeConfig, loadUserConfig, resolveRepoRoot, resolveRuntimeEnv, resolveSqlitePaths, resolveToolRoot } from './dict-utils.js'; const rawArgs = process.argv.slice(2); -const argv = minimist(rawArgs, { - boolean: ['json', 'build', 'ann', 'no-ann', 'incremental'], - string: ['models', 'baseline', 'queries', 'out', 'top', 'limit', 'mode', 'repo'], - default: { - json: false, - build: true, - top: 5, - limit: 0 +const argv = createCli({ + scriptName: 'summary-report', + options: { + json: { type: 'boolean', default: false }, + build: { type: 'boolean', default: true }, + ann: { type: 'boolean' }, + 'no-ann': { type: 'boolean' }, + incremental: { type: 'boolean', default: false }, + models: { type: 'string' }, + baseline: { type: 'string' }, + queries: { type: 'string' }, + out: { type: 'string' }, + top: { type: 'number', default: 5 }, + limit: { type: 'number', default: 0 }, + mode: { type: 'string' }, + repo: { type: 'string' }, + profile: { type: 'string' } } -}); +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); -const userConfig = loadUserConfig(root); -const scriptRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); +const userConfig = loadUserConfig(root, { profile: argv.profile }); +if (userConfig.profile !== 'full') { + console.error('summary-report is experimental. Run with profile=full or set PAIROFCLEATS_PROFILE=full.'); + process.exit(1); +} +const runtimeConfig = getRuntimeConfig(root, userConfig); +const baseEnv = resolveRuntimeEnv(runtimeConfig, process.env); +const scriptRoot = resolveToolRoot(); const configCompare = Array.isArray(userConfig.models?.compare) ? userConfig.models.compare : []; const defaultModel = userConfig.models?.id || DEFAULT_MODEL_ID; @@ -61,7 +75,7 @@ const reportPaths = { * @returns {void} */ function runNode(args, label) { - const result = spawnSync(process.execPath, args, { stdio: 'inherit' }); + const result = spawnSync(process.execPath, args, { stdio: 'inherit', cwd: root, env: baseEnv }); if (result.status !== 0) { console.error(`Failed: ${label}`); process.exit(result.status ?? 
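+// Children run with cwd=root and the resolved runtime env, so --repo and profile settings reach every build step.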
1); @@ -95,7 +109,7 @@ function ensureParityIndexes() { console.error('Index missing for parity. Re-run with --build.'); process.exit(1); } - const args = [path.join(scriptRoot, 'build_index.js')]; + const args = [path.join(scriptRoot, 'build_index.js'), '--repo', root]; if (argv.incremental) args.push('--incremental'); runNode(args, 'build index'); } @@ -107,7 +121,7 @@ function ensureParityIndexes() { console.error('SQLite index missing for parity. Re-run with --build.'); process.exit(1); } - const args = [path.join(scriptRoot, 'tools', 'build-sqlite-index.js')]; + const args = [path.join(scriptRoot, 'tools', 'build-sqlite-index.js'), '--repo', root]; if (argv.incremental) args.push('--incremental'); runNode(args, 'build sqlite index'); } @@ -118,9 +132,11 @@ function ensureParityIndexes() { * @param {{backend?:string,outPath:string}} params * @returns {string[]} */ -function buildCompareArgs({ backend, outPath }) { +function buildCompareArgs({ backend, outPath, buildIndex, buildSqlite }) { const args = [ path.join(scriptRoot, 'tools', 'compare-models.js'), + '--repo', + root, '--models', models.join(','), '--baseline', @@ -134,8 +150,8 @@ function buildCompareArgs({ backend, outPath }) { if (argv.limit) args.push('--limit', String(argv.limit)); if (argv.mode) args.push('--mode', argv.mode); if (!annEnabled) args.push('--no-ann'); - if (buildEnabled) args.push('--build'); - if (buildEnabled && backend === 'sqlite') args.push('--build-sqlite'); + if (buildIndex) args.push('--build'); + if (buildSqlite) args.push('--build-sqlite'); if (argv.incremental) args.push('--incremental'); return args; } @@ -164,8 +180,14 @@ function buildParityArgs({ backend, outPath }) { return args; } -runNode(buildCompareArgs({ outPath: reportPaths.compareMemory }), 'compare models (memory)'); -runNode(buildCompareArgs({ outPath: reportPaths.compareSqlite, backend: 'sqlite' }), 'compare models (sqlite)'); +runNode( + buildCompareArgs({ outPath: reportPaths.compareMemory, buildIndex: buildEnabled, buildSqlite: false }), + 'compare models (memory)' +); +runNode( + buildCompareArgs({ outPath: reportPaths.compareSqlite, backend: 'sqlite', buildIndex: false, buildSqlite: buildEnabled }), + 'compare models (sqlite)' +); ensureParityIndexes(); runNode(buildParityArgs({ backend: 'sqlite', outPath: reportPaths.paritySqlite }), 'parity sqlite'); diff --git a/tools/compact-pieces.js b/tools/compact-pieces.js new file mode 100644 index 000000000..28de08f68 --- /dev/null +++ b/tools/compact-pieces.js @@ -0,0 +1,325 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import readline from 'node:readline'; +import { createCli } from '../src/shared/cli.js'; +import { writeJsonLinesFile, writeJsonObjectFile } from '../src/shared/json-stream.js'; +import { checksumFile } from '../src/shared/hash.js'; +import { getIndexDir, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'compact-pieces', + options: { + repo: { type: 'string' }, + mode: { type: 'string', default: 'code' }, + 'chunk-meta-size': { type: 'number' }, + 'token-postings-size': { type: 'number' }, + 'dry-run': { type: 'boolean', default: false } + } +}).parse(); + +const rootArg = argv.repo ? path.resolve(argv.repo) : null; +const root = rootArg || resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(root); +const modeArg = (argv.mode || 'code').toLowerCase(); +const modes = modeArg === 'all' ? 
['code', 'prose'] : [modeArg]; +const dryRun = argv['dry-run'] === true; + +const listShardFiles = (dir, prefix) => { + if (!fsSync.existsSync(dir)) return []; + return fsSync + .readdirSync(dir) + .filter((name) => name.startsWith(prefix) && name.endsWith('.jsonl')) + .sort() + .map((name) => path.join(dir, name)); +}; + +const readJsonLinesFile = async (filePath, onEntry) => { + const stream = fsSync.createReadStream(filePath, { encoding: 'utf8' }); + const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); + for await (const line of rl) { + const trimmed = line.trim(); + if (!trimmed) continue; + const result = onEntry(JSON.parse(trimmed)); + if (result && typeof result.then === 'function') { + await result; + } + } +}; + +const readJson = async (filePath) => JSON.parse(await fs.readFile(filePath, 'utf8')); + +const resolveChunkMetaParts = async (indexDir) => { + const metaPath = path.join(indexDir, 'chunk_meta.meta.json'); + const partsDir = path.join(indexDir, 'chunk_meta.parts'); + if (!fsSync.existsSync(metaPath) && !fsSync.existsSync(partsDir)) return null; + let parts = []; + let metaFields = null; + if (fsSync.existsSync(metaPath)) { + const meta = await readJson(metaPath); + metaFields = meta.fields || meta; + if (Array.isArray(metaFields.parts)) { + parts = metaFields.parts.map((name) => path.join(indexDir, name)); + } + } + if (!parts.length) { + parts = listShardFiles(partsDir, 'chunk_meta.part-'); + } + if (!parts.length) return null; + return { metaPath, partsDir, parts, metaFields }; +}; + +const resolveTokenPostingsParts = async (indexDir) => { + const metaPath = path.join(indexDir, 'token_postings.meta.json'); + const shardsDir = path.join(indexDir, 'token_postings.shards'); + if (!fsSync.existsSync(metaPath) && !fsSync.existsSync(shardsDir)) return null; + let parts = []; + let metaFields = null; + let metaArrays = null; + if (fsSync.existsSync(metaPath)) { + const meta = await readJson(metaPath); + metaFields = meta.fields || meta; + metaArrays = meta.arrays || meta; + if (Array.isArray(metaFields.parts)) { + parts = metaFields.parts.map((name) => path.join(indexDir, name)); + } + } + // The meta file can outlive the shards dir after a partial cleanup; guard the directory scan. + if (!parts.length && fsSync.existsSync(shardsDir)) { + parts = fsSync + .readdirSync(shardsDir) + .filter((name) => name.startsWith('token_postings.part-') && name.endsWith('.json')) + .sort() + .map((name) => path.join(shardsDir, name)); + } + if (!parts.length) return null; + return { metaPath, shardsDir, parts, metaFields, metaArrays }; +}; + +const appendAudit = async (indexDir, line) => { + if (dryRun) return; + const piecesDir = path.join(indexDir, 'pieces'); + await fs.mkdir(piecesDir, { recursive: true }); + const logPath = path.join(piecesDir, 'compaction.log'); + await fs.appendFile(logPath, `${line}\n`); +}; + +const compactChunkMeta = async (indexDir, targetSize) => { + const resolved = await resolveChunkMetaParts(indexDir); + if (!resolved) return null; + const { metaPath, partsDir, parts, metaFields } = resolved; + const totalChunks = Number.isFinite(metaFields?.totalChunks) ? metaFields.totalChunks : null; + const target = Number.isFinite(Number(targetSize)) && Number(targetSize) > 0 + ? Math.floor(Number(targetSize)) + : (Number.isFinite(metaFields?.shardSize) ? 
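+// Target precedence: explicit --chunk-meta-size, else the shardSize recorded in chunk_meta.meta.json, else 100000 entries per part.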
metaFields.shardSize : 100000); + if (parts.length <= 1 || target <= 0) return null; + + const tmpDir = path.join(indexDir, 'chunk_meta.parts.compact'); + if (!dryRun) { + await fs.rm(tmpDir, { recursive: true, force: true }); + await fs.mkdir(tmpDir, { recursive: true }); + } + const newParts = []; + const newCounts = []; + let buffer = []; + let partIndex = 0; + let total = 0; + const flush = async () => { + if (!buffer.length) return; + const name = `chunk_meta.part-${String(partIndex).padStart(5, '0')}.jsonl`; + const relPath = path.join('chunk_meta.parts', name).split(path.sep).join('/'); + const outPath = path.join(tmpDir, name); + if (!dryRun) { + await writeJsonLinesFile(outPath, buffer, { atomic: true }); + } + newParts.push(relPath); + newCounts.push(buffer.length); + total += buffer.length; + buffer = []; + partIndex += 1; + }; + + for (const partPath of parts) { + await readJsonLinesFile(partPath, async (entry) => { + buffer.push(entry); + if (buffer.length >= target) { + await flush(); + } + }); + } + await flush(); + if (!dryRun) { + await fs.rm(partsDir, { recursive: true, force: true }); + await fs.rename(tmpDir, partsDir); + await writeJsonObjectFile(metaPath, { + fields: { + format: 'jsonl', + shardSize: target, + totalChunks: totalChunks ?? total, + parts: newParts + }, + atomic: true + }); + } + return { type: 'chunks', name: 'chunk_meta', metaName: 'chunk_meta_meta', parts: newParts, counts: newCounts }; +}; + +const compactTokenPostings = async (indexDir, targetSize) => { + const resolved = await resolveTokenPostingsParts(indexDir); + if (!resolved) return null; + const { metaPath, shardsDir, parts, metaFields, metaArrays } = resolved; + const target = Number.isFinite(Number(targetSize)) && Number(targetSize) > 0 + ? Math.floor(Number(targetSize)) + : (Number.isFinite(metaFields?.shardSize) ? metaFields.shardSize : 50000); + if (parts.length <= 1 || target <= 0) return null; + + const tmpDir = path.join(indexDir, 'token_postings.shards.compact'); + if (!dryRun) { + await fs.rm(tmpDir, { recursive: true, force: true }); + await fs.mkdir(tmpDir, { recursive: true }); + } + const newParts = []; + const newCounts = []; + let vocabBuffer = []; + let postingsBuffer = []; + let partIndex = 0; + const flush = async () => { + if (!vocabBuffer.length) return; + const name = `token_postings.part-${String(partIndex).padStart(5, '0')}.json`; + const relPath = path.join('token_postings.shards', name).split(path.sep).join('/'); + const outPath = path.join(tmpDir, name); + if (!dryRun) { + await writeJsonObjectFile(outPath, { + arrays: { vocab: vocabBuffer, postings: postingsBuffer }, + atomic: true + }); + } + newParts.push(relPath); + newCounts.push(vocabBuffer.length); + vocabBuffer = []; + postingsBuffer = []; + partIndex += 1; + }; + + for (const partPath of parts) { + const shard = await readJson(partPath); + const vocab = Array.isArray(shard?.vocab) ? shard.vocab : (Array.isArray(shard?.arrays?.vocab) ? shard.arrays.vocab : []); + const postings = Array.isArray(shard?.postings) ? shard.postings : (Array.isArray(shard?.arrays?.postings) ? shard.arrays.postings : []); + for (let i = 0; i < vocab.length; i++) { + vocabBuffer.push(vocab[i]); + postingsBuffer.push(postings[i] || []); + if (vocabBuffer.length >= target) { + await flush(); + } + } + } + await flush(); + const docLengths = Array.isArray(metaArrays?.docLengths) ? metaArrays.docLengths : []; + const totalDocs = Number.isFinite(metaFields?.totalDocs) ? 
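+// Carry ranking stats through compaction: totalDocs and avgDocLen come from the old meta when present, otherwise they are recomputed from docLengths.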
metaFields.totalDocs : docLengths.length; + const avgDocLen = Number.isFinite(metaFields?.avgDocLen) + ? metaFields.avgDocLen + : (docLengths.length + ? docLengths.reduce((sum, len) => sum + (Number.isFinite(len) ? len : 0), 0) / docLengths.length + : 0); + const vocabCount = newCounts.reduce((sum, count) => sum + count, 0); + if (!dryRun) { + await fs.rm(shardsDir, { recursive: true, force: true }); + await fs.rename(tmpDir, shardsDir); + await writeJsonObjectFile(metaPath, { + fields: { + avgDocLen, + totalDocs, + format: 'sharded', + shardSize: target, + vocabCount, + parts: newParts + }, + arrays: { docLengths }, + atomic: true + }); + } + return { type: 'postings', name: 'token_postings', metaName: 'token_postings_meta', parts: newParts, counts: newCounts }; +}; + +const updateManifest = async (indexDir, updates) => { + if (!updates?.length) return; + const manifestPath = path.join(indexDir, 'pieces', 'manifest.json'); + if (!fsSync.existsSync(manifestPath)) return; + const manifestRaw = await readJson(manifestPath); + const fields = manifestRaw.fields || manifestRaw; + const pieces = Array.isArray(fields.pieces) ? fields.pieces : []; + const removeNames = new Set(); + updates.forEach((update) => { + removeNames.add(update.name); + removeNames.add(update.metaName); + }); + const retained = pieces.filter((piece) => !removeNames.has(piece?.name)); + const newPieces = [...retained]; + for (const update of updates) { + for (let i = 0; i < update.parts.length; i++) { + const relPath = update.parts[i]; + const absPath = path.join(indexDir, relPath.split('/').join(path.sep)); + const stat = await fs.stat(absPath); + const result = await checksumFile(absPath); + const checksum = result?.value || null; + const checksumAlgo = result?.algo || null; + newPieces.push({ + type: update.type, + name: update.name, + format: update.type === 'chunks' ? 'jsonl' : 'json', + count: update.counts[i], + path: relPath, + bytes: stat.size, + checksum: checksum && checksumAlgo ? `${checksumAlgo}:${checksum}` : null + }); + } + const metaRel = update.type === 'chunks' ? 'chunk_meta.meta.json' : 'token_postings.meta.json'; + const metaAbs = path.join(indexDir, metaRel); + if (fsSync.existsSync(metaAbs)) { + const stat = await fs.stat(metaAbs); + const result = await checksumFile(metaAbs); + const checksum = result?.value || null; + const checksumAlgo = result?.algo || null; + newPieces.push({ + type: update.type, + name: update.metaName, + format: 'json', + count: null, + path: metaRel, + bytes: stat.size, + checksum: checksum && checksumAlgo ? 
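+// Checksums are stored as `algo:value` strings so manifest readers can verify pieces without assuming a hash function.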
`${checksumAlgo}:${checksum}` : null + }); + } + } + fields.pieces = newPieces; + fields.generatedAt = new Date().toISOString(); + if (!dryRun) { + await fs.mkdir(path.join(indexDir, 'pieces'), { recursive: true }); + await writeJsonObjectFile(manifestPath, { fields, atomic: true }); + } +}; + +for (const mode of modes) { + const indexDir = getIndexDir(root, mode, userConfig); + const chunkMetaTarget = argv['chunk-meta-size']; + const tokenPostingsTarget = argv['token-postings-size']; + const updates = []; + const chunkUpdate = await compactChunkMeta(indexDir, chunkMetaTarget); + if (chunkUpdate) { + updates.push(chunkUpdate); + await appendAudit(indexDir, `${new Date().toISOString()} chunk_meta compacted: parts=${chunkUpdate.parts.length}`); + } + const tokenUpdate = await compactTokenPostings(indexDir, tokenPostingsTarget); + if (tokenUpdate) { + updates.push(tokenUpdate); + await appendAudit(indexDir, `${new Date().toISOString()} token_postings compacted: parts=${tokenUpdate.parts.length}`); + } + if (updates.length) { + // Dry runs never write the compacted part files, so skip the manifest stat/checksum pass. + if (!dryRun) { + await updateManifest(indexDir, updates); + } + console.log(`[pieces] ${mode}: compaction ${dryRun ? 'planned' : 'complete'}.`); + } else { + console.log(`[pieces] ${mode}: no compaction needed.`); + } +}
diff --git a/tools/compact-sqlite-index.js b/tools/compact-sqlite-index.js index 15bf3c90a..0ac762fd4 100644 --- a/tools/compact-sqlite-index.js +++ b/tools/compact-sqlite-index.js @@ -3,12 +3,12 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; import { encodeVector, ensureVectorTable, getVectorExtensionConfig, hasVectorTable, loadVectorExtension } from './vector-extension.js'; -import { CREATE_TABLES_SQL, REQUIRED_TABLES, SCHEMA_VERSION } from '../src/sqlite/schema.js'; -import { hasRequiredTables, normalizeFilePath } from '../src/sqlite/utils.js'; -import { dequantizeUint8ToFloat32, toVectorId } from '../src/sqlite/vector.js'; +import { CREATE_TABLES_SQL, REQUIRED_TABLES, SCHEMA_VERSION } from '../src/storage/sqlite/schema.js'; +import { hasRequiredTables, normalizeFilePath, replaceSqliteDatabase } from '../src/storage/sqlite/utils.js'; +import { dequantizeUint8ToFloat32, toVectorId } from '../src/storage/sqlite/vector.js'; let Database; try { @@ -111,21 +111,21 @@ export async function compactDatabase(input) { const insertChunk = outDb.prepare(` INSERT OR REPLACE INTO chunks ( - id, mode, file, start, end, startLine, endLine, ext, kind, name, headline, - preContext, postContext, weight, tokens, ngrams, codeRelations, docmeta, - stats, complexity, lint, externalDocs, last_modified, last_author, churn, - chunk_authors + id, chunk_id, mode, file, start, end, startLine, endLine, ext, kind, name, + headline, preContext, postContext, weight, tokens, ngrams, codeRelations, + docmeta, stats, complexity, lint, externalDocs, last_modified, last_author, + churn, chunk_authors ) VALUES ( - @id, @mode, @file, @start, @end, @startLine, @endLine, @ext, @kind, @name, @headline, - @preContext, @postContext, @weight, @tokens, @ngrams, @codeRelations, @docmeta, - @stats, @complexity, @lint, @externalDocs, @last_modified, @last_author, @churn, - @chunk_authors + @id, @chunk_id, @mode, @file, @start, @end, @startLine, @endLine, @ext, @kind, + @name, @headline, @preContext, @postContext, @weight, 
@tokens, @ngrams, + @codeRelations, @docmeta, @stats, @complexity, @lint, @externalDocs, + @last_modified, @last_author, @churn, @chunk_authors ); `); const insertFts = outDb.prepare(` - INSERT OR REPLACE INTO chunks_fts (rowid, mode, file, name, kind, headline, tokens) - VALUES (@id, @mode, @file, @name, @kind, @headline, @tokensText); + INSERT OR REPLACE INTO chunks_fts (rowid, mode, file, name, signature, kind, headline, doc, tokens) + VALUES (@id, @mode, @file, @name, @signature, @kind, @headline, @doc, @tokensText); `); const insertTokenVocab = outDb.prepare( @@ -196,13 +196,24 @@ export async function compactDatabase(input) { insertChunk.run(chunkRow); const tokensText = parseTokens(row.tokens).join(' '); + let signature = null; + let doc = null; + if (row.docmeta) { + try { + const meta = JSON.parse(row.docmeta); + signature = typeof meta?.signature === 'string' ? meta.signature : null; + doc = typeof meta?.doc === 'string' ? meta.doc : null; + } catch {} + } insertFts.run({ id: newId, mode, file: normalizedFile, name: row.name, + signature, kind: row.kind, headline: row.headline, + doc, tokensText }); @@ -390,28 +401,22 @@ export async function compactDatabase(input) { if (!keepBackup && fs.existsSync(backupPath)) { await fsPromises.rm(backupPath, { force: true }); } - - await fsPromises.rename(dbPath, backupPath); - await fsPromises.rename(tempPath, dbPath); - - if (!keepBackup) { - await fsPromises.rm(backupPath, { force: true }); - } + await replaceSqliteDatabase(tempPath, dbPath, { keepBackup, backupPath }); return { skipped: false }; } const isDirectRun = import.meta.url === pathToFileURL(process.argv[1]).href; if (isDirectRun) { - const argv = minimist(process.argv.slice(2), { - string: ['mode', 'repo'], - boolean: ['dry-run', 'keep-backup'], - default: { - mode: 'all', - 'dry-run': false, - 'keep-backup': false + const argv = createCli({ + scriptName: 'compact-sqlite-index', + options: { + mode: { type: 'string', default: 'all' }, + repo: { type: 'string' }, + 'dry-run': { type: 'boolean', default: false }, + 'keep-backup': { type: 'boolean', default: false } } - }); + }).parse(); const rootArg = argv.repo ? 
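+// Usage sketch: node tools/compact-sqlite-index.js --mode all --dry-run --keep-backup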
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); diff --git a/tools/compare-models.js b/tools/compare-models.js index a9c13e721..c934c5031 100644 --- a/tools/compare-models.js +++ b/tools/compare-models.js @@ -3,46 +3,75 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import crypto from 'node:crypto'; -import { spawnSync } from 'node:child_process'; -import { fileURLToPath } from 'node:url'; -import minimist from 'minimist'; -import { resolveAnnSetting, resolveBaseline, resolveCompareModels } from '../src/compare/config.js'; +import { execaSync } from 'execa'; +import { createCli } from '../src/shared/cli.js'; +import { getEnvConfig } from '../src/shared/env.js'; +import { resolveAnnSetting, resolveBaseline, resolveCompareModels } from '../src/experimental/compare/config.js'; import { DEFAULT_MODEL_ID, getCacheRoot, getDictConfig, getModelConfig, getRepoId, + getRuntimeConfig, loadUserConfig, resolveRepoRoot, - resolveSqlitePaths + resolveRuntimeEnv, + resolveSqlitePaths, + resolveToolRoot } from './dict-utils.js'; const rawArgs = process.argv.slice(2); -const argv = minimist(rawArgs, { - boolean: ['json', 'build', 'build-index', 'build-sqlite', 'incremental', 'stub-embeddings', 'ann', 'no-ann'], - string: ['models', 'baseline', 'queries', 'backend', 'out', 'mode', 'cache-root', 'repo'], - alias: { n: 'top', q: 'queries' }, - default: { top: 5, limit: 0 } -}); +const argv = createCli({ + scriptName: 'compare-models', + options: { + json: { type: 'boolean', default: false }, + build: { type: 'boolean', default: false }, + 'build-index': { type: 'boolean', default: false }, + 'build-sqlite': { type: 'boolean', default: false }, + incremental: { type: 'boolean', default: false }, + 'stub-embeddings': { type: 'boolean', default: false }, + ann: { type: 'boolean' }, + 'no-ann': { type: 'boolean' }, + models: { type: 'string' }, + baseline: { type: 'string' }, + queries: { type: 'string' }, + backend: { type: 'string' }, + out: { type: 'string' }, + mode: { type: 'string' }, + 'cache-root': { type: 'string' }, + repo: { type: 'string' }, + profile: { type: 'string' }, + top: { type: 'number', default: 5 }, + limit: { type: 'number', default: 0 } + }, + aliases: { n: 'top', q: 'queries' } +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); -const scriptRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); -const userConfig = loadUserConfig(root); +const scriptRoot = resolveToolRoot(); +const userConfig = loadUserConfig(root, { profile: argv.profile }); +if (userConfig.profile !== 'full') { + console.error('compare-models is experimental. Run with profile=full or set PAIROFCLEATS_PROFILE=full.'); + process.exit(1); +} +const envConfig = getEnvConfig(); +const runtimeConfig = getRuntimeConfig(root, userConfig); +const baseEnv = resolveRuntimeEnv(runtimeConfig, process.env); const configCacheRoot = typeof userConfig.cache?.root === 'string' && userConfig.cache.root.trim() ? path.resolve(userConfig.cache.root) : null; const cacheRootBase = argv['cache-root'] ? path.resolve(argv['cache-root']) - : (process.env.PAIROFCLEATS_CACHE_ROOT - ? path.resolve(process.env.PAIROFCLEATS_CACHE_ROOT) + : (envConfig.cacheRoot + ? 
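+// Cache-root precedence: the --cache-root flag, then the PAIROFCLEATS_CACHE_ROOT env override, then the built-in default.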
path.resolve(envConfig.cacheRoot) : getCacheRoot()); const repoId = getRepoId(root); const modelConfig = getModelConfig(root, userConfig); const dictConfig = getDictConfig(root, userConfig); -const sharedModelsDir = process.env.PAIROFCLEATS_MODELS_DIR || modelConfig.dir; -const sharedDictDir = process.env.PAIROFCLEATS_DICT_DIR || dictConfig.dir; +const sharedModelsDir = envConfig.modelsDir || modelConfig.dir; +const sharedDictDir = envConfig.dictDir || dictConfig.dir; const configCompareModels = Array.isArray(userConfig.models?.compare) ? userConfig.models.compare @@ -125,7 +154,7 @@ function getModelCacheRoot(modelId) { */ function buildEnv(modelId, modelCacheRoot) { const env = { - ...process.env, + ...baseEnv, PAIROFCLEATS_MODEL: modelId }; if (modelCacheRoot) env.PAIROFCLEATS_CACHE_ROOT = modelCacheRoot; @@ -142,7 +171,38 @@ function buildEnv(modelId, modelCacheRoot) { * @returns {boolean} */ function indexExists(modelCacheRoot, mode) { - const metaPath = path.join(modelCacheRoot, 'repos', repoId, `index-${mode}`, 'chunk_meta.json'); + const repoCacheRoot = path.join(modelCacheRoot, 'repos', repoId); + let indexRoot = repoCacheRoot; + const currentPath = path.join(repoCacheRoot, 'builds', 'current.json'); + if (fs.existsSync(currentPath)) { + try { + const data = JSON.parse(fs.readFileSync(currentPath, 'utf8')) || {}; + const resolveRoot = (value) => { + if (!value) return null; + return path.isAbsolute(value) ? value : path.join(repoCacheRoot, value); + }; + const buildId = typeof data.buildId === 'string' ? data.buildId : null; + const buildRootRaw = typeof data.buildRoot === 'string' ? data.buildRoot : null; + const buildRoot = buildRootRaw + ? resolveRoot(buildRootRaw) + : (buildId ? path.join(repoCacheRoot, 'builds', buildId) : null); + let modeRoot = null; + if (data.buildRoots && typeof data.buildRoots === 'object' && !Array.isArray(data.buildRoots)) { + const raw = data.buildRoots[mode]; + if (typeof raw === 'string') { + modeRoot = resolveRoot(raw); + } + } else if (buildRoot && Array.isArray(data.modes) && data.modes.includes(mode)) { + modeRoot = buildRoot; + } + if (modeRoot && fs.existsSync(modeRoot)) { + indexRoot = modeRoot; + } else if (buildRoot && fs.existsSync(buildRoot)) { + indexRoot = buildRoot; + } + } catch {} + } + const metaPath = path.join(indexRoot, `index-${mode}`, 'chunk_meta.json'); return fs.existsSync(metaPath); } @@ -166,11 +226,11 @@ function ensureIndex(modelCacheRoot) { * @param {string} label */ function runCommand(args, env, label) { - const stdio = argv.json ? ['ignore', process.stderr, process.stderr] : 'inherit'; - const result = spawnSync(process.execPath, args, { env, stdio }); - if (result.status !== 0) { + const stdio = argv.json ? ['ignore', 'ignore', 'ignore'] : 'inherit'; + const result = execaSync(process.execPath, args, { env, stdio, reject: false }); + if (result.exitCode !== 0) { console.error(`Failed: ${label}`); - process.exit(result.status ?? 1); + process.exit(result.exitCode ?? 
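+// Mirror the child's exit code, defaulting to 1 when execa reports none.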
1); } } @@ -190,18 +250,20 @@ function runSearch(query, env) { backend, '-n', String(topN), - annArg + annArg, + '--repo', + root ]; if (modeArg && modeArg !== 'both') { args.push('--mode', modeArg); } const start = Date.now(); - const result = spawnSync(process.execPath, args, { env, encoding: 'utf8' }); + const result = execaSync(process.execPath, args, { env, encoding: 'utf8', reject: false }); const wallMs = Date.now() - start; - if (result.status !== 0) { + if (result.exitCode !== 0) { console.error(`Search failed for query="${query}" (model=${env.PAIROFCLEATS_MODEL})`); if (result.stderr) console.error(result.stderr.trim()); - process.exit(result.status ?? 1); + process.exit(result.exitCode ?? 1); } const payload = JSON.parse(result.stdout || '{}'); return { payload, wallMs }; @@ -248,9 +310,8 @@ const limit = Math.max(0, parseInt(argv.limit, 10) || 0); const selectedQueries = limit > 0 ? queries.slice(0, limit) : queries; if (sqliteBackend && buildSqlite) { - const sqlitePaths = resolveSqlitePaths(root, userConfig); - if (!buildIndex && !fs.existsSync(sqlitePaths.codePath) && !fs.existsSync(sqlitePaths.prosePath)) { - console.error('SQLite index missing. Use --build or build the indexes first.'); + if (!buildIndex && !ensureIndex(getModelCacheRoot(models[0]))) { + console.error('Index missing. Use --build or build the index first.'); process.exit(1); } } @@ -266,7 +327,7 @@ for (const modelId of models) { } if (buildIndex) { - const args = [path.join(scriptRoot, 'build_index.js')]; + const args = [path.join(scriptRoot, 'build_index.js'), '--repo', root]; if (buildIncremental) args.push('--incremental'); if (stubEmbeddings) args.push('--stub-embeddings'); runCommand(args, env, `build index (${modelId})`); @@ -276,7 +337,7 @@ for (const modelId of models) { } if (buildSqlite) { - const args = [path.join(scriptRoot, 'tools', 'build-sqlite-index.js')]; + const args = [path.join(scriptRoot, 'tools', 'build-sqlite-index.js'), '--repo', root]; if (buildIncremental) args.push('--incremental'); runCommand(args, env, `build sqlite (${modelId})`); } else if (sqliteBackend) { diff --git a/tools/config-dump.js b/tools/config-dump.js new file mode 100644 index 000000000..9e45e42ca --- /dev/null +++ b/tools/config-dump.js @@ -0,0 +1,70 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { getEnvConfig } from '../src/shared/env.js'; +import { + getCacheRoot, + getCacheRuntimeConfig, + getModelConfig, + getRepoCacheRoot, + getRuntimeConfig, + getToolingConfig, + loadUserConfig, + resolveLmdbPaths, + resolveRepoRoot, + resolveSqlitePaths +} from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'config-dump', + options: { + repo: { type: 'string' }, + json: { type: 'boolean', default: false } + } +}).parse(); + +const rootArg = argv.repo ? path.resolve(argv.repo) : null; +const repoRoot = rootArg || resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(repoRoot); +const envConfig = getEnvConfig(); + +const runtimeConfig = getRuntimeConfig(repoRoot, userConfig); +const parsedUv = Number(process.env.UV_THREADPOOL_SIZE); +const effectiveUvThreadpoolSize = Number.isFinite(parsedUv) && parsedUv > 0 ? 
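+// Only a positive integer UV_THREADPOOL_SIZE counts as an override; anything else is reported as the libuv default.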
Math.floor(parsedUv) : null; + + +const cacheRoot = (userConfig.cache && userConfig.cache.root) || envConfig.cacheRoot || getCacheRoot(); +const payload = { + repoRoot, + profile: userConfig.profile || null, + env: envConfig, + userConfig, + derived: { + cacheRoot, + repoCacheRoot: getRepoCacheRoot(repoRoot, userConfig), + runtime: { ...runtimeConfig, effectiveUvThreadpoolSize }, + cacheRuntime: getCacheRuntimeConfig(repoRoot, userConfig), + model: getModelConfig(repoRoot, userConfig), + tooling: getToolingConfig(repoRoot, userConfig), + lmdb: resolveLmdbPaths(repoRoot, userConfig), + sqlite: resolveSqlitePaths(repoRoot, userConfig) + } +}; + +if (argv.json) { + console.log(JSON.stringify(payload, null, 2)); + process.exit(0); +} + +console.log('Config dump'); +console.log(`- repo: ${repoRoot}`); +console.log(`- profile: ${payload.profile || 'none'}`); +console.log(`- cache root: ${payload.derived.cacheRoot}`); +console.log(`- repo cache: ${payload.derived.repoCacheRoot}`); +console.log(`- runtime UV_THREADPOOL_SIZE: ${payload.derived.runtime.effectiveUvThreadpoolSize ?? 'default'}`); +console.log(`- model: ${payload.derived.model.id}`); +console.log(`- lmdb code: ${payload.derived.lmdb.codePath}`); +console.log(`- lmdb prose: ${payload.derived.lmdb.prosePath}`); +console.log(`- sqlite code: ${payload.derived.sqlite.codePath}`); +console.log(`- sqlite prose: ${payload.derived.sqlite.prosePath}`); +console.log(`- env overrides: ${Object.entries(envConfig).filter(([, value]) => value !== '' && value != null).map(([key]) => key).join(', ') || 'none'}`); diff --git a/tools/config-inventory.js b/tools/config-inventory.js new file mode 100644 index 000000000..9e132d23b --- /dev/null +++ b/tools/config-inventory.js @@ -0,0 +1,523 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { fdir } from 'fdir'; +import { resolveToolRoot } from './dict-utils.js'; + +const root = resolveToolRoot(); +const schemaPath = path.join(root, 'docs', 'config-schema.json'); +const outputJsonPath = path.join(root, 'docs', 'config-inventory.json'); +const outputMdPath = path.join(root, 'docs', 'config-inventory.md'); + +const normalizeType = (schema) => { + if (!schema || typeof schema !== 'object') return null; + if (Array.isArray(schema.type)) return schema.type.join('|'); + if (typeof schema.type === 'string') return schema.type; + if (Array.isArray(schema.enum)) return 'enum'; + return null; +}; + +const normalizeEnum = (schema) => { + if (!schema || typeof schema !== 'object') return null; + if (!Array.isArray(schema.enum)) return null; + return schema.enum.map((value) => String(value)); +}; + +const mergeEntry = (target, incoming) => { + if (!target.type && incoming.type) target.type = incoming.type; + if (!target.enum && incoming.enum) target.enum = incoming.enum; + if (target.type && incoming.type && target.type !== incoming.type) { + const parts = new Set(String(target.type).split('|')); + String(incoming.type).split('|').forEach((part) => parts.add(part)); + target.type = Array.from(parts).join('|'); + } + if (target.enum && incoming.enum) { + const merged = new Set(target.enum); + incoming.enum.forEach((value) => merged.add(value)); + target.enum = Array.from(merged); + } +}; + +const collectSchemaEntries = (schema, prefix = '', entries = []) => { + if (!schema || typeof schema !== 'object') return entries; + const properties = schema.properties && typeof schema.properties === 'object' + ? 
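+// Walks properties, additionalProperties ('*' paths), and items ('[]' paths) so nested schema keys land in the inventory.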
schema.properties + : null; + if (properties) { + for (const [key, child] of Object.entries(properties)) { + const pathKey = prefix ? `${prefix}.${key}` : key; + entries.push({ + path: pathKey, + type: normalizeType(child), + enum: normalizeEnum(child) + }); + collectSchemaEntries(child, pathKey, entries); + } + } + const additional = schema.additionalProperties && typeof schema.additionalProperties === 'object' + ? schema.additionalProperties + : null; + if (additional && additional.properties) { + const pathKey = prefix ? `${prefix}.*` : '*'; + entries.push({ + path: pathKey, + type: normalizeType(additional), + enum: normalizeEnum(additional) + }); + collectSchemaEntries(additional, pathKey, entries); + } + const items = schema.items && typeof schema.items === 'object' ? schema.items : null; + if (items && items.properties) { + const pathKey = prefix ? `${prefix}[]` : '[]'; + entries.push({ + path: pathKey, + type: normalizeType(items), + enum: normalizeEnum(items) + }); + collectSchemaEntries(items, pathKey, entries); + } + return entries; +}; + +const listSourceFiles = async () => { + const files = await new fdir().withFullPaths().crawl(root).withPromise(); + return files.filter((filePath) => { + if (!filePath.endsWith('.js')) return false; + const normalized = filePath.replace(/\\/g, '/'); + if (normalized.includes('/node_modules/')) return false; + if (normalized.includes('/.git/')) return false; + if (normalized.includes('/benchmarks/repos/')) return false; + if (normalized.includes('/benchmarks/cache/')) return false; + return true; + }); +}; + +const findMatchingBrace = (source, startIndex) => { + let depth = 0; + let inString = null; + let escaped = false; + let inLineComment = false; + let inBlockComment = false; + for (let i = startIndex; i < source.length; i += 1) { + const ch = source[i]; + const next = source[i + 1]; + if (inLineComment) { + if (ch === '\n') inLineComment = false; + continue; + } + if (inBlockComment) { + if (ch === '*' && next === '/') { + inBlockComment = false; + i += 1; + } + continue; + } + if (inString) { + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (ch === inString) { + inString = null; + } + continue; + } + if (ch === '/' && next === '/') { + inLineComment = true; + i += 1; + continue; + } + if (ch === '/' && next === '*') { + inBlockComment = true; + i += 1; + continue; + } + if (ch === '"' || ch === '\'' || ch === '`') { + inString = ch; + continue; + } + if (ch === '{') { + depth += 1; + continue; + } + if (ch === '}') { + depth -= 1; + if (depth === 0) return i; + } + } + return -1; +}; + +const extractOptionObjects = (source) => { + const ranges = []; + const patterns = [ + /\boptions\s*:\s*\{/g, + /\.options\s*\(\s*\{/g + ]; + for (const pattern of patterns) { + let match; + while ((match = pattern.exec(source)) !== null) { + const braceIndex = source.indexOf('{', match.index); + if (braceIndex < 0) continue; + const endIndex = findMatchingBrace(source, braceIndex); + if (endIndex < 0) continue; + ranges.push(source.slice(braceIndex, endIndex + 1)); + pattern.lastIndex = endIndex + 1; + } + } + return ranges; +}; + +const extractStringArray = (source, name) => { + const regex = new RegExp(`\\b${name}\\s*=\\s*\\[([\\s\\S]*?)\\]`, 'm'); + const match = regex.exec(source); + if (!match) return []; + const body = match[1] || ''; + const values = new Set(); + const stringRegex = /['"]([^'"\\]+)['"]/g; + let stringMatch; + while ((stringMatch = stringRegex.exec(body)) !== null) { + 
if (stringMatch[1]) values.add(stringMatch[1]); + } + return Array.from(values); +}; + +const extractTopLevelKeys = (objectText) => { + const keys = new Set(); + let i = 1; + const len = objectText.length; + const skipWhitespace = () => { + while (i < len && /\s/.test(objectText[i])) i += 1; + }; + const skipComments = () => { + while (i < len) { + if (objectText[i] === '/' && objectText[i + 1] === '/') { + i += 2; + while (i < len && objectText[i] !== '\n') i += 1; + continue; + } + if (objectText[i] === '/' && objectText[i + 1] === '*') { + i += 2; + while (i < len && !(objectText[i] === '*' && objectText[i + 1] === '/')) i += 1; + i += 2; + continue; + } + break; + } + }; + const parseString = (quote) => { + let value = ''; + i += 1; + while (i < len) { + const ch = objectText[i]; + if (ch === '\\') { + value += ch; + i += 2; + continue; + } + if (ch === quote) { + i += 1; + break; + } + value += ch; + i += 1; + } + return value; + }; + const parseIdentifier = () => { + const start = i; + if (!/[A-Za-z_$]/.test(objectText[i])) return null; + i += 1; + while (i < len && /[A-Za-z0-9_$]/.test(objectText[i])) i += 1; + return objectText.slice(start, i); + }; + const skipValue = () => { + let depthBrace = 0; + let depthBracket = 0; + let depthParen = 0; + let inString = null; + let escaped = false; + let inLineComment = false; + let inBlockComment = false; + for (; i < len; i += 1) { + const ch = objectText[i]; + const next = objectText[i + 1]; + if (inLineComment) { + if (ch === '\n') inLineComment = false; + continue; + } + if (inBlockComment) { + if (ch === '*' && next === '/') { + inBlockComment = false; + i += 1; + } + continue; + } + if (inString) { + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (ch === inString) { + inString = null; + } + continue; + } + if (ch === '/' && next === '/') { + inLineComment = true; + i += 1; + continue; + } + if (ch === '/' && next === '*') { + inBlockComment = true; + i += 1; + continue; + } + if (ch === '"' || ch === '\'' || ch === '`') { + inString = ch; + continue; + } + if (ch === '{') { + depthBrace += 1; + continue; + } + if (ch === '}') { + if (depthBrace > 0) { + depthBrace -= 1; + continue; + } + return; + } + if (ch === '[') { + depthBracket += 1; + continue; + } + if (ch === ']') { + if (depthBracket > 0) depthBracket -= 1; + continue; + } + if (ch === '(') { + depthParen += 1; + continue; + } + if (ch === ')') { + if (depthParen > 0) depthParen -= 1; + continue; + } + if (depthBrace === 0 && depthBracket === 0 && depthParen === 0 && ch === ',') { + i += 1; + return; + } + } + }; + + while (i < len - 1) { + skipWhitespace(); + skipComments(); + skipWhitespace(); + if (objectText[i] === '}') break; + let key = null; + if (objectText[i] === '"' || objectText[i] === '\'') { + key = parseString(objectText[i]); + } else { + key = parseIdentifier(); + } + skipWhitespace(); + skipComments(); + skipWhitespace(); + if (!key || objectText[i] !== ':') { + i += 1; + continue; + } + keys.add(key); + i += 1; + skipValue(); + } + return Array.from(keys); +}; + +const buildInventory = async () => { + const schemaRaw = await fs.readFile(schemaPath, 'utf8'); + const schema = JSON.parse(schemaRaw); + const entries = collectSchemaEntries(schema); + const entryMap = new Map(); + for (const entry of entries) { + if (!entry.path) continue; + const existing = entryMap.get(entry.path); + if (!existing) { + entryMap.set(entry.path, { ...entry }); + } else { + mergeEntry(existing, entry); + } + } + const 
configEntries = Array.from(entryMap.values()) + .sort((a, b) => a.path.localeCompare(b.path)); + const topLevel = new Map(); + for (const entry of configEntries) { + const rootKey = entry.path.split(/[.[\]]/)[0] || entry.path; + topLevel.set(rootKey, (topLevel.get(rootKey) || 0) + 1); + } + + const sourceFiles = await listSourceFiles(); + const envVarMap = new Map(); + const cliFlagMap = new Map(); + const cliFlagsByFile = new Map(); + const dynamicOptionFiles = new Set(); + + for (const filePath of sourceFiles) { + const relPath = path.relative(root, filePath).replace(/\\/g, '/'); + const source = await fs.readFile(filePath, 'utf8'); + + const envMatches = source.match(/PAIROFCLEATS_[A-Z0-9_]+/g) || []; + for (const match of envMatches) { + if (!envVarMap.has(match)) envVarMap.set(match, new Set()); + envVarMap.get(match).add(relPath); + } + + const optionObjects = extractOptionObjects(source); + const fileFlags = new Set(); + for (const obj of optionObjects) { + extractTopLevelKeys(obj).forEach((key) => fileFlags.add(key)); + } + const boolFlags = extractStringArray(source, 'BOOLEAN_FLAGS'); + const stringFlags = extractStringArray(source, 'STRING_FLAGS'); + boolFlags.forEach((flag) => fileFlags.add(flag)); + stringFlags.forEach((flag) => fileFlags.add(flag)); + if (source.includes('mergedOptions.profile')) fileFlags.add('profile'); + + if ((source.includes('.options(') || source.includes('options:')) && fileFlags.size === 0) { + dynamicOptionFiles.add(relPath); + } + + if (fileFlags.size) { + const sorted = Array.from(fileFlags).sort((a, b) => a.localeCompare(b)); + cliFlagsByFile.set(relPath, sorted); + for (const flag of sorted) { + if (!cliFlagMap.has(flag)) cliFlagMap.set(flag, new Set()); + cliFlagMap.get(flag).add(relPath); + } + } + } + + const envVars = Array.from(envVarMap.entries()) + .map(([name, files]) => ({ name, files: Array.from(files).sort() })) + .sort((a, b) => a.name.localeCompare(b.name)); + + const cliFlags = Array.from(cliFlagMap.entries()) + .map(([flag, files]) => ({ flag, files: Array.from(files).sort() })) + .sort((a, b) => a.flag.localeCompare(b.flag)); + + const cliFlagsByFileOutput = Array.from(cliFlagsByFile.entries()) + .map(([file, flags]) => ({ file, flags })) + .sort((a, b) => a.file.localeCompare(b.file)); + + const duplicatedFlags = cliFlags + .filter((entry) => entry.files.length > 1) + .map((entry) => ({ + flag: entry.flag, + count: entry.files.length, + files: entry.files + })) + .sort((a, b) => b.count - a.count || a.flag.localeCompare(b.flag)); + + const inventory = { + generatedAt: new Date().toISOString(), + configSchema: { + path: path.relative(root, schemaPath).replace(/\\/g, '/'), + totalKeys: configEntries.length, + topLevel: Array.from(topLevel.entries()) + .map(([key, count]) => ({ key, count })) + .sort((a, b) => a.key.localeCompare(b.key)) + }, + configKeys: configEntries, + envVars, + cliFlags: { + totalFlags: cliFlags.length, + byFile: cliFlagsByFileOutput, + duplicated: duplicatedFlags, + dynamicOptionFiles: Array.from(dynamicOptionFiles).sort() + } + }; + + await fs.writeFile(outputJsonPath, JSON.stringify(inventory, null, 2)); + + const mdLines = []; + mdLines.push('# Config Inventory'); + mdLines.push(''); + mdLines.push(`Generated: ${inventory.generatedAt}`); + mdLines.push(''); + mdLines.push('This file is generated by `node tools/config-inventory.js`.'); + mdLines.push('See `docs/config-inventory-notes.md` for ownership and overlap analysis.'); + mdLines.push(''); + mdLines.push('## Summary'); + mdLines.push(`- Config keys: 
${inventory.configSchema.totalKeys}`); + mdLines.push(`- Env vars: ${inventory.envVars.length}`); + mdLines.push(`- CLI flags: ${inventory.cliFlags.totalFlags}`); + mdLines.push(''); + mdLines.push('## Config keys by top-level namespace'); + mdLines.push(''); + for (const entry of inventory.configSchema.topLevel) { + mdLines.push(`- ${entry.key}: ${entry.count}`); + } + mdLines.push(''); + mdLines.push('## Env vars'); + mdLines.push(''); + if (inventory.envVars.length === 0) { + mdLines.push('- (none)'); + } else { + for (const entry of inventory.envVars) { + mdLines.push(`- ${entry.name} (${entry.files.length} files)`); + } + } + mdLines.push(''); + mdLines.push('## CLI flags (duplicated across files)'); + mdLines.push(''); + if (inventory.cliFlags.duplicated.length === 0) { + mdLines.push('- (none)'); + } else { + for (const entry of inventory.cliFlags.duplicated) { + mdLines.push(`- ${entry.flag} (${entry.count} files)`); + } + } + mdLines.push(''); + mdLines.push('## CLI flags by file'); + mdLines.push(''); + for (const entry of inventory.cliFlags.byFile) { + mdLines.push(`### ${entry.file}`); + mdLines.push(''); + mdLines.push(entry.flags.length ? entry.flags.join(', ') : '(none)'); + mdLines.push(''); + } + mdLines.push('## Config keys (full list)'); + mdLines.push(''); + mdLines.push('```'); + for (const entry of inventory.configKeys) { + const type = entry.type ? ` (${entry.type})` : ''; + const enumValues = entry.enum && entry.enum.length ? ` enum=${entry.enum.join('|')}` : ''; + mdLines.push(`${entry.path}${type}${enumValues}`.trim()); + } + mdLines.push('```'); + mdLines.push(''); + if (inventory.cliFlags.dynamicOptionFiles.length) { + mdLines.push('## Notes'); + mdLines.push(''); + mdLines.push('Dynamic CLI options detected in these files; verify flags manually:'); + mdLines.push(''); + for (const file of inventory.cliFlags.dynamicOptionFiles) { + mdLines.push(`- ${file}`); + } + mdLines.push(''); + } + + await fs.writeFile(outputMdPath, mdLines.join('\n')); +}; + +await buildInventory(); diff --git a/tools/ctags-ingest.js b/tools/ctags-ingest.js new file mode 100644 index 000000000..a4cc02d67 --- /dev/null +++ b/tools/ctags-ingest.js @@ -0,0 +1,176 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import readline from 'node:readline'; +import { spawn } from 'node:child_process'; +import { createCli } from '../src/shared/cli.js'; +import { getRepoCacheRoot, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'ctags-ingest', + options: { + repo: { type: 'string' }, + input: { type: 'string' }, + out: { type: 'string' }, + json: { type: 'boolean', default: false }, + run: { type: 'boolean', default: false }, + interactive: { type: 'boolean', default: false }, + ctags: { type: 'string', default: 'ctags' }, + fields: { type: 'string' }, + args: { type: 'string' } + } +}).parse(); + +const repoRoot = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(repoRoot); +const cacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const outputPath = argv.out + ? path.resolve(argv.out) + : path.join(cacheRoot, 'ctags', 'ctags.jsonl'); +const metaPath = `${outputPath}.meta.json`; +const inputPath = argv.input ? 
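+// Input precedence mirrors the dispatch below: --interactive reads stdin, --input - also means stdin, otherwise --run (or no input) shells out to ctags.
+// Example pipeline (illustrative): ctags --output-format=json --recurse=yes . | node tools/ctags-ingest.js --interactive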
String(argv.input) : null; +const runCtags = argv.run === true; +const interactive = argv.interactive === true; +const ctagsCmd = argv.ctags || 'ctags'; + +const toPosix = (value) => value.replace(/\\/g, '/'); +const normalizePath = (value) => { + if (!value) return null; + const raw = String(value); + const resolved = path.isAbsolute(raw) ? raw : path.resolve(repoRoot, raw); + const rel = path.relative(repoRoot, resolved); + return toPosix(rel || raw); +}; + +const mapEntry = (entry) => { + if (!entry || typeof entry !== 'object') return null; + if (entry._type && entry._type !== 'tag') return null; + const name = entry.name || null; + const file = normalizePath(entry.path || entry.file || entry.input || ''); + if (!name || !file) return null; + const ext = path.extname(file).toLowerCase(); + const kind = entry.kind || null; + const kindName = entry.kindName || null; + const signature = entry.signature || entry.pattern || null; + const line = Number.isFinite(Number(entry.line)) ? Number(entry.line) : null; + const startLine = line; + const endLine = line; + return { + file, + ext, + name, + kind, + kindName, + signature, + startLine, + endLine, + scope: entry.scope || null, + scopeKind: entry.scopeKind || null, + access: entry.access || null, + implementation: entry.implementation || null, + language: entry.language || null, + typeref: entry.typeref || null + }; +}; + +const stats = { + entries: 0, + ignored: 0, + errors: 0, + kinds: {}, + languages: {} +}; + +const bump = (bucket, key) => { + if (!key) return; + const k = String(key); + bucket[k] = (bucket[k] || 0) + 1; +}; + +const ensureOutputDir = async () => { + await fsPromises.mkdir(path.dirname(outputPath), { recursive: true }); +}; + +// The write stream opens its file eagerly, before the async ensureOutputDir() call below runs, +// so create the directory synchronously here to avoid an ENOENT on a fresh cache. +fs.mkdirSync(path.dirname(outputPath), { recursive: true }); +const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' }); + +const ingestStream = async (stream) => { + const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); + for await (const line of rl) { + const trimmed = line.trim(); + if (!trimmed) continue; + let parsed = null; + try { + parsed = JSON.parse(trimmed); + } catch { + stats.errors += 1; + continue; + } + const mapped = mapEntry(parsed); + if (!mapped) { + stats.ignored += 1; + continue; + } + stats.entries += 1; + bump(stats.kinds, mapped.kind || mapped.kindName || 'unknown'); + bump(stats.languages, mapped.language || 'unknown'); + writeStream.write(`${JSON.stringify(mapped)}\n`); + } +}; + +const runCtagsCommand = async () => { + const args = ['--output-format=json', '--tag-relative=yes', '--recurse=yes']; + if (argv.fields) args.push(`--fields=${argv.fields}`); + if (argv.args) { + const extra = String(argv.args) + .split(/\s+/) + .map((entry) => entry.trim()) + .filter(Boolean); + args.push(...extra); + } + args.push(repoRoot); + const child = spawn(ctagsCmd, args, { stdio: ['ignore', 'pipe', 'pipe'] }); + child.stderr.on('data', (chunk) => process.stderr.write(chunk)); + await ingestStream(child.stdout); + const exitCode = await new Promise((resolve) => { + child.on('close', (code) => resolve(code ?? 
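+// A close event without a code counts as success; a non-zero code aborts the ingest below.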
0)); + }); + if (exitCode !== 0) { + throw new Error(`ctags exited with code ${exitCode}`); + } +}; + +await ensureOutputDir(); +if (interactive) { + await ingestStream(process.stdin); +} else if (inputPath && inputPath !== '-') { + const inputStream = fs.createReadStream(inputPath, { encoding: 'utf8' }); + await ingestStream(inputStream); +} else if (inputPath === '-' || runCtags) { + if (runCtags) { + await runCtagsCommand(); + } else { + await ingestStream(process.stdin); + } +} else { + await runCtagsCommand(); +} + +writeStream.end(); + +const summary = { + generatedAt: new Date().toISOString(), + repoRoot: path.resolve(repoRoot), + input: inputPath || (runCtags ? 'ctags' : 'stdin'), + output: path.resolve(outputPath), + stats +}; +await fsPromises.writeFile(metaPath, JSON.stringify(summary, null, 2)); + +if (argv.json) { + console.log(JSON.stringify(summary, null, 2)); +} else { + console.log(`Ctags ingest: ${stats.entries} entries (${stats.errors} parse errors)`); + console.log(`- output: ${outputPath}`); + console.log(`- meta: ${metaPath}`); +} diff --git a/tools/default-config-template.js b/tools/default-config-template.js new file mode 100644 index 000000000..77c32c329 --- /dev/null +++ b/tools/default-config-template.js @@ -0,0 +1,143 @@ +export const DEFAULT_USER_CONFIG_TEMPLATE = `{ + // Enable sqlite index artifacts for search backends. + // Speed impact: adds sqlite build time when stage4 runs. + "sqlite": { + // Toggle sqlite index usage/artifact generation. + // Speed impact: enabling adds some indexing time and disk usage. + "use": true + }, + // Enable LMDB artifacts for embeddings/cache backends. + // Speed impact: adds LMDB build time and disk usage during indexing. + "lmdb": { + // Toggle LMDB index usage/artifact generation. + // Speed impact: enabling adds some indexing time and disk usage. + "use": true + }, + // Search defaults for query-time behavior. + // Speed impact: no direct impact on indexing speed. + "search": { + // Prefer ANN search by default when multiple backends exist. + // Speed impact: no impact on indexing; affects query latency/recall. + "annDefault": true, + // Dense vector combination strategy for search. + // Speed impact: minor impact on embedding/storage cost during indexing. + "denseVectorMode": "merged", + // Regex search guardrails. + // Speed impact: no impact on indexing; affects regex query cost. + "regex": { + // Max regex pattern length accepted. + // Speed impact: no impact on indexing; caps regex compile cost. + "maxPatternLength": 512, + // Max regex input length scanned. + // Speed impact: no impact on indexing; caps regex runtime cost. + "maxInputLength": 10000, + // Max regex program size after compilation. + // Speed impact: no impact on indexing; caps regex execution cost. + "maxProgramSize": 2000, + // Regex timeout in milliseconds. + // Speed impact: no impact on indexing; limits regex runtime. + "timeoutMs": 25, + // Regex flags to apply by default. + // Speed impact: no impact on indexing; affects regex behavior. + "flags": "" + } + }, + // Index build pipeline options. + // Speed impact: many flags here change CPU/IO per file. + "indexing": { + // Sparse postings generation settings. + // Speed impact: heavier postings settings increase indexing time/size. + "postings": { + // Build phrase n-gram postings. + // Speed impact: increases indexing time and index size. + "enablePhraseNgrams": true, + // Smallest phrase n-gram length. + // Speed impact: lower values add more n-grams and cost. 
+ "phraseMinN": 2, + // Largest phrase n-gram length. + // Speed impact: higher values increase indexing time and size. + "phraseMaxN": 4, + // Build chargram postings for fuzzy matching. + // Speed impact: noticeable extra CPU and disk usage. + "enableChargrams": true, + // Smallest chargram length. + // Speed impact: lower values increase chargram volume and cost. + "chargramMinN": 3, + // Largest chargram length. + // Speed impact: higher values increase chargram volume and cost. + "chargramMaxN": 5, + // Choose which fields contribute chargrams. + // Speed impact: more fields increase indexing work. + "chargramSource": "fields", + // Cap token length eligible for chargrams. + // Speed impact: higher caps increase CPU on long identifiers. + "chargramMaxTokenLength": 48, + // Track postings per field (name, path, body, etc). + // Speed impact: slight overhead for richer scoring. + "fielded": true + }, + // When to scan imports ("pre" or "post" indexing). + // Speed impact: small; "post" avoids extra upfront work. + "importScan": "post", + // Enable AST dataflow analysis. + // Speed impact: moderate CPU cost on large codebases. + "astDataflow": true, + // Enable control-flow analysis. + // Speed impact: moderate CPU cost on large codebases. + "controlFlow": true, + // Enable risk analysis rules. + // Speed impact: moderate CPU cost; can be heavy on huge repos. + "riskAnalysis": true, + // Enable cross-file risk correlation. + // Speed impact: heavy extra work on large repos. + "riskAnalysisCrossFile": true, + // Risk regex guardrails for analysis. + // Speed impact: tighter caps can reduce analysis time. + "riskRegex": { + // Max regex pattern length accepted. + // Speed impact: lower caps reduce risk regex compile time. + "maxPatternLength": 512, + // Max regex input length scanned. + // Speed impact: lower caps reduce risk regex runtime cost. + "maxInputLength": 10000, + // Max regex program size after compilation. + // Speed impact: lower caps reduce risk regex execution cost. + "maxProgramSize": 2000, + // Regex timeout in milliseconds. + // Speed impact: lower timeouts reduce risk regex runtime cost. + "timeoutMs": 25, + // Regex flags to apply by default. + // Speed impact: minimal; affects risk regex behavior. + "flags": "i" + }, + // Enable type inference. + // Speed impact: moderate to heavy CPU cost. + "typeInference": false, + // Enable cross-file type inference. + // Speed impact: heavy extra work on large repos. + "typeInferenceCrossFile": false, + // Collect git blame/churn metadata per file. + // Speed impact: heavy IO/CPU; can dominate indexing time. + "gitBlame": true, + // Run linting pass for diagnostics. + // Speed impact: extra CPU per file. + "lint": true, + // Compute complexity metrics. + // Speed impact: extra CPU per file. + "complexity": true, + // Python AST parsing options. + // Speed impact: small to moderate CPU on Python files. + "pythonAst": { + // Enable Python AST parsing. + // Speed impact: small to moderate on Python-heavy repos. + "enabled": true + }, + // Tree-sitter parsing options. + // Speed impact: moderate CPU, improved chunking accuracy. + "treeSitter": { + // Enable tree-sitter parsing. + // Speed impact: moderate CPU on supported languages. 
+ "enabled": true + } + } +}`; diff --git a/tools/default-config.js b/tools/default-config.js new file mode 100644 index 000000000..ffc1c5434 --- /dev/null +++ b/tools/default-config.js @@ -0,0 +1,51 @@ +export const DEFAULT_USER_CONFIG = { + sqlite: { + use: true + }, + lmdb: { + use: true + }, + search: { + annDefault: true, + denseVectorMode: 'merged', + regex: { + maxPatternLength: 512, + maxInputLength: 10000, + maxProgramSize: 2000, + timeoutMs: 25, + flags: '' + } + }, + indexing: { + postings: { + enablePhraseNgrams: true, + phraseMinN: 2, + phraseMaxN: 4, + enableChargrams: true, + chargramMinN: 3, + chargramMaxN: 5, + chargramSource: 'fields', + chargramMaxTokenLength: 48, + fielded: true + }, + importScan: 'post', + astDataflow: true, + controlFlow: true, + riskAnalysis: true, + riskAnalysisCrossFile: true, + riskRegex: { + maxPatternLength: 512, + maxInputLength: 10000, + maxProgramSize: 2000, + timeoutMs: 25, + flags: 'i' + }, + typeInference: false, + typeInferenceCrossFile: false, + gitBlame: true, + lint: true, + complexity: true, + pythonAst: { enabled: true }, + treeSitter: { enabled: true } + } +}; diff --git a/tools/dict-utils.js b/tools/dict-utils.js index 65420710e..4f72e0b1f 100644 --- a/tools/dict-utils.js +++ b/tools/dict-utils.js @@ -4,6 +4,23 @@ import path from 'node:path'; import os from 'node:os'; import crypto from 'node:crypto'; import { spawnSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; +import { DEFAULT_CACHE_MB, DEFAULT_CACHE_TTL_MS } from '../src/shared/cache.js'; +import { readJsoncFile } from '../src/shared/jsonc.js'; +import { isPlainObject, mergeConfig } from '../src/shared/config.js'; +import { getEnvConfig } from '../src/shared/env.js'; +import { stableStringify } from '../src/shared/stable-json.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const TOOL_ROOT = path.resolve(__dirname, '..'); +const PROFILES_DIR = path.resolve(TOOL_ROOT, 'profiles'); +const profileWarnings = new Set(); +let toolVersionCache = null; +const DEFAULT_DP_MAX_BY_FILE_COUNT = [ + { maxFiles: 5000, dpMaxTokenLength: 32 }, + { maxFiles: 20000, dpMaxTokenLength: 24 }, + { maxFiles: Number.POSITIVE_INFINITY, dpMaxTokenLength: 16 } +]; export const DEFAULT_MODEL_ID = 'Xenova/all-MiniLM-L12-v2'; export const DEFAULT_TRIAGE_PROMOTE_FIELDS = [ @@ -24,26 +41,131 @@ export const DEFAULT_TRIAGE_PROMOTE_FIELDS = [ ]; /** - * Load repo-local configuration from .pairofcleats.json. + * Load repo-local configuration from .pairofcleats.json and apply profiles. * @param {string} repoRoot + * @param {{profile?:string,fallbackRoot?:string,fallbackConfigPath?:string}} [options] * @returns {object} */ -export function loadUserConfig(repoRoot) { +export function loadUserConfig(repoRoot, options = {}) { try { const configPath = path.join(repoRoot, '.pairofcleats.json'); - if (!fs.existsSync(configPath)) return {}; - return JSON.parse(fs.readFileSync(configPath, 'utf8')) || {}; + if (fs.existsSync(configPath)) { + const base = readJsoncFile(configPath) || {}; + return normalizeUserConfig(applyProfileConfig(base, options.profile)); + } + const fallbackPath = options.fallbackConfigPath + || (options.fallbackRoot ? 
path.join(options.fallbackRoot, '.pairofcleats.json') : null); + if (fallbackPath && fs.existsSync(fallbackPath)) { + const base = readJsoncFile(fallbackPath) || {}; + return normalizeUserConfig(applyProfileConfig(base, options.profile)); + } + const defaultPath = path.join(TOOL_ROOT, '.pairofcleats.json'); + if (defaultPath !== configPath && fs.existsSync(defaultPath)) { + const base = readJsoncFile(defaultPath) || {}; + return normalizeUserConfig(applyProfileConfig(base, options.profile)); + } + return normalizeUserConfig(applyProfileConfig({}, options.profile)); } catch { return {}; } } +/** + * Resolve the installation root for PairOfCleats tooling. + * @returns {string} + */ +export function resolveToolRoot() { + return TOOL_ROOT; +} + +/** + * Resolve the current tool version from package.json. + * @returns {string|null} + */ +export function getToolVersion() { + if (toolVersionCache !== null) return toolVersionCache; + try { + const pkgPath = path.join(TOOL_ROOT, 'package.json'); + const parsed = JSON.parse(fs.readFileSync(pkgPath, 'utf8')); + toolVersionCache = typeof parsed?.version === 'string' ? parsed.version : null; + } catch { + toolVersionCache = null; + } + return toolVersionCache; +} + +/** + * Compute a stable hash of the effective config inputs for a repo. + * @param {string} repoRoot + * @param {object|null} userConfig + * @returns {string} + */ +export function getEffectiveConfigHash(repoRoot, userConfig = null) { + const cfg = userConfig || loadUserConfig(repoRoot); + const env = getEnvConfig(); + const payload = { config: cfg, env }; + const json = stableStringify(payload); + return crypto.createHash('sha1').update(json).digest('hex'); +} + + +function normalizeUserConfig(baseConfig) { + if (!isPlainObject(baseConfig)) return baseConfig || {}; + + return baseConfig; +} + + +function loadProfileConfig(profileName) { + if (!profileName) return { config: {}, path: null, error: null }; + const profileFile = `${profileName}.json`; + const profilePath = path.join(PROFILES_DIR, profileFile); + if (!fs.existsSync(profilePath)) { + return { + config: {}, + path: profilePath, + error: `Profile not found: ${profilePath}` + }; + } + try { + const config = JSON.parse(fs.readFileSync(profilePath, 'utf8')) || {}; + if (isPlainObject(config)) delete config.profile; + return { config, path: profilePath, error: null }; + } catch (error) { + return { + config: {}, + path: profilePath, + error: `Failed to parse profile ${profilePath}: ${error?.message || error}` + }; + } +} + +function applyProfileConfig(baseConfig, profileOverride) { + const overrideName = typeof profileOverride === 'string' ? profileOverride.trim() : ''; + const envProfile = getEnvConfig().profile || ''; + const configProfile = typeof baseConfig?.profile === 'string' ? baseConfig.profile.trim() : ''; + const profileName = overrideName || envProfile || configProfile; + if (!profileName) return baseConfig || {}; + const { config: profileConfig, path: profilePath, error } = loadProfileConfig(profileName); + if (error) { + const key = `${profileName}:${profilePath}`; + if (!profileWarnings.has(key)) { + profileWarnings.add(key); + console.error(`[config] ${error}`); + } + } + const merged = mergeConfig(profileConfig, baseConfig || {}); + merged.profile = profileName; + return merged; +} + /** * Resolve the cache root directory. 
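 * Resolution order (mirrors the body): envConfig.home, then
 * %LOCALAPPDATA%\PairOfCleats, then $XDG_CACHE_HOME/pairofcleats,
 * then ~/.cache/pairofcleats.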
* @returns {string} */ export function getCacheRoot() { - if (process.env.PAIROFCLEATS_HOME) return process.env.PAIROFCLEATS_HOME; + const envConfig = getEnvConfig(); + if (envConfig.home) return envConfig.home; if (process.env.LOCALAPPDATA) return path.join(process.env.LOCALAPPDATA, 'PairOfCleats'); if (process.env.XDG_CACHE_HOME) return path.join(process.env.XDG_CACHE_HOME, 'pairofcleats'); return path.join(os.homedir(), '.cache', 'pairofcleats'); @@ -58,14 +180,65 @@ export function getCacheRoot() { export function getDictConfig(repoRoot, userConfig = null) { const cfg = userConfig || loadUserConfig(repoRoot); const dict = cfg.dictionary || {}; + const envConfig = getEnvConfig(); + const dpMaxTokenLengthByFileCount = normalizeDpMaxTokenLengthByFileCount( + dict.dpMaxTokenLengthByFileCount + ); return { - dir: dict.dir || process.env.PAIROFCLEATS_DICT_DIR || path.join(getCacheRoot(), 'dictionaries'), + dir: dict.dir || envConfig.dictDir || path.join(getCacheRoot(), 'dictionaries'), languages: Array.isArray(dict.languages) ? dict.languages : ['en'], files: Array.isArray(dict.files) ? dict.files : [], includeSlang: dict.includeSlang !== false, slangDirs: Array.isArray(dict.slangDirs) ? dict.slangDirs : [], slangFiles: Array.isArray(dict.slangFiles) ? dict.slangFiles : [], - enableRepoDictionary: dict.enableRepoDictionary === true + enableRepoDictionary: dict.enableRepoDictionary === true, + segmentation: typeof dict.segmentation === 'string' ? dict.segmentation : 'auto', + dpMaxTokenLength: Number.isFinite(Number(dict.dpMaxTokenLength)) + ? Number(dict.dpMaxTokenLength) + : 32, + dpMaxTokenLengthByFileCount + }; +} + +function normalizeDpMaxTokenLengthByFileCount(raw) { + if (!Array.isArray(raw) || !raw.length) { + return DEFAULT_DP_MAX_BY_FILE_COUNT.map((entry) => ({ ...entry })); + } + const normalized = raw + .map((entry) => { + if (!entry || typeof entry !== 'object') return null; + const maxFiles = Number(entry.maxFiles); + const dpMaxTokenLength = Number(entry.dpMaxTokenLength); + if (!Number.isFinite(maxFiles) || maxFiles <= 0) return null; + if (!Number.isFinite(dpMaxTokenLength) || dpMaxTokenLength <= 0) return null; + return { + maxFiles, + dpMaxTokenLength: Math.max(4, Math.floor(dpMaxTokenLength)) + }; + }) + .filter(Boolean) + .sort((a, b) => a.maxFiles - b.maxFiles); + return normalized.length ? normalized : DEFAULT_DP_MAX_BY_FILE_COUNT.map((entry) => ({ ...entry })); +} + +export function applyAdaptiveDictConfig(dictConfig, fileCount) { + if (!dictConfig || typeof dictConfig !== 'object') return dictConfig || {}; + const count = Number(fileCount); + if (!Number.isFinite(count) || count <= 0) return dictConfig; + const mode = typeof dictConfig.segmentation === 'string' + ? dictConfig.segmentation.trim().toLowerCase() + : 'auto'; + if (mode !== 'auto' && mode !== 'dp') return dictConfig; + const thresholds = Array.isArray(dictConfig.dpMaxTokenLengthByFileCount) + && dictConfig.dpMaxTokenLengthByFileCount.length + ? 
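+  // Worked example with the default thresholds (illustrative):
+  //   fileCount  3,000 -> dpMaxTokenLength 32 (maxFiles 5000 bucket)
+  //   fileCount 12,000 -> dpMaxTokenLength 24 (maxFiles 20000 bucket)
+  //   fileCount 80,000 -> dpMaxTokenLength 16 (unbounded bucket)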
dictConfig.dpMaxTokenLengthByFileCount + : DEFAULT_DP_MAX_BY_FILE_COUNT; + const match = thresholds.find((entry) => count <= entry.maxFiles) || thresholds[thresholds.length - 1]; + if (!match || !Number.isFinite(match.dpMaxTokenLength)) return dictConfig; + if (dictConfig.dpMaxTokenLength === match.dpMaxTokenLength) return dictConfig; + return { + ...dictConfig, + dpMaxTokenLength: match.dpMaxTokenLength }; } @@ -76,9 +249,21 @@ export function getDictConfig(repoRoot, userConfig = null) { */ export function getRepoId(repoRoot) { const resolved = path.resolve(repoRoot); - return crypto.createHash('sha1').update(resolved).digest('hex'); + const base = path.basename(resolved); + const normalized = String(base || 'repo') + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, ''); + const prefix = (normalized || 'repo').slice(0, 24); + const hash = crypto.createHash('sha1').update(resolved).digest('hex').slice(0, 12); + return `${prefix}-${hash}`; } +const getLegacyRepoId = (repoRoot) => { + const resolved = path.resolve(repoRoot); + return crypto.createHash('sha1').update(resolved).digest('hex'); +}; + /** * Resolve the repo root from a starting directory. * @param {string} startPath @@ -126,9 +311,120 @@ function findConfigRoot(startPath) { */ export function getRepoCacheRoot(repoRoot, userConfig = null) { const cfg = userConfig || loadUserConfig(repoRoot); - const cacheRoot = (cfg.cache && cfg.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); + const envConfig = getEnvConfig(); + const cacheRoot = (cfg.cache && cfg.cache.root) || envConfig.cacheRoot || getCacheRoot(); const repoId = getRepoId(repoRoot); - return path.join(cacheRoot, 'repos', repoId); + const repoCacheRoot = path.join(cacheRoot, 'repos', repoId); + const legacyRoot = path.join(cacheRoot, 'repos', getLegacyRepoId(repoRoot)); + if (fs.existsSync(legacyRoot) && !fs.existsSync(repoCacheRoot)) return legacyRoot; + return repoCacheRoot; +} + +/** + * Resolve the builds root directory for a repo. + * @param {string} repoRoot + * @param {object|null} userConfig + * @returns {string} + */ +export function getBuildsRoot(repoRoot, userConfig = null) { + return path.join(getRepoCacheRoot(repoRoot, userConfig), 'builds'); +} + +/** + * Resolve current build metadata for a repo, if present. + * @param {string} repoRoot + * @param {object|null} userConfig + * @returns {{buildId:string,buildRoot:string,path:string,data:object}|null} + */ +export function getCurrentBuildInfo(repoRoot, userConfig = null, options = {}) { + const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); + const buildsRoot = path.join(repoCacheRoot, 'builds'); + const currentPath = path.join(buildsRoot, 'current.json'); + if (!fs.existsSync(currentPath)) return null; + try { + const data = JSON.parse(fs.readFileSync(currentPath, 'utf8')) || {}; + const buildId = typeof data.buildId === 'string' ? data.buildId : null; + const buildRootRaw = typeof data.buildRoot === 'string' ? data.buildRoot : null; + const resolveRoot = (value) => { + if (!value) return null; + return path.isAbsolute(value) ? value : path.join(repoCacheRoot, value); + }; + const buildRoot = buildRootRaw + ? resolveRoot(buildRootRaw) + : (buildId ? 
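+    // Note: getRepoId above now yields a readable id such as "my-repo-3f2a1b9c0d4e"
+    // (sanitized basename capped at 24 chars plus the first 12 hex chars of the sha1),
+    // while getLegacyRepoId keeps the full 40-char sha1 so pre-existing cache
+    // directories under repos/<sha1> remain reachable.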
path.join(buildsRoot, buildId) : null); + const buildRoots = {}; + if (data.buildRoots && typeof data.buildRoots === 'object' && !Array.isArray(data.buildRoots)) { + for (const [mode, value] of Object.entries(data.buildRoots)) { + if (typeof value !== 'string') continue; + const resolved = resolveRoot(value); + if (resolved) buildRoots[mode] = resolved; + } + } else if (buildRoot && Array.isArray(data.modes)) { + for (const mode of data.modes) { + if (typeof mode !== 'string') continue; + buildRoots[mode] = buildRoot; + } + } + const preferredMode = typeof options.mode === 'string' ? options.mode : null; + const preferredRoot = preferredMode ? buildRoots[preferredMode] : null; + const activeRoot = preferredRoot || buildRoot || Object.values(buildRoots)[0] || null; + if (!buildId || !activeRoot || !fs.existsSync(activeRoot)) return null; + return { buildId, buildRoot: buildRoot || activeRoot, activeRoot, path: currentPath, data, buildRoots }; + } catch { + return null; + } +} + +/** + * Resolve the active index root for a repo (current build or legacy path). + * @param {string} repoRoot + * @param {object|null} userConfig + * @param {{indexRoot?:string|null}} [options] + * @returns {string} + */ +export function resolveIndexRoot(repoRoot, userConfig = null, options = {}) { + if (options?.indexRoot) return path.resolve(options.indexRoot); + const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); + const buildsRoot = path.join(repoCacheRoot, 'builds'); + const currentPath = path.join(buildsRoot, 'current.json'); + if (fs.existsSync(currentPath)) { + try { + const data = JSON.parse(fs.readFileSync(currentPath, 'utf8')) || {}; + const resolveRoot = (value) => { + if (!value) return null; + return path.isAbsolute(value) ? value : path.join(repoCacheRoot, value); + }; + const buildRootRaw = typeof data.buildRoot === 'string' ? data.buildRoot : null; + const buildId = typeof data.buildId === 'string' ? data.buildId : null; + const buildRoot = buildRootRaw + ? resolveRoot(buildRootRaw) + : (buildId ? path.join(buildsRoot, buildId) : null); + const buildRoots = {}; + if (data.buildRoots && typeof data.buildRoots === 'object' && !Array.isArray(data.buildRoots)) { + for (const [mode, value] of Object.entries(data.buildRoots)) { + if (typeof value !== 'string') continue; + buildRoots[mode] = resolveRoot(value); + } + } else if (buildRoot && Array.isArray(data.modes)) { + for (const mode of data.modes) { + if (typeof mode !== 'string') continue; + buildRoots[mode] = buildRoot; + } + } + const preferredMode = typeof options.mode === 'string' ? options.mode : null; + const ensureExists = (value) => (value && fs.existsSync(value) ? value : null); + let resolved = preferredMode ? 
ensureExists(buildRoots[preferredMode]) : null; + if (!resolved && !preferredMode) { + for (const mode of ['code', 'prose', 'records']) { + resolved = ensureExists(buildRoots[mode]); + if (resolved) break; + } + } + if (!resolved) resolved = ensureExists(buildRoot); + if (resolved) return resolved; + } catch {} + } + return getRepoCacheRoot(repoRoot, userConfig); } /** @@ -140,13 +436,116 @@ export function getRepoCacheRoot(repoRoot, userConfig = null) { export function getModelConfig(repoRoot, userConfig = null) { const cfg = userConfig || loadUserConfig(repoRoot); const models = cfg.models || {}; - const id = process.env.PAIROFCLEATS_MODEL || models.id || DEFAULT_MODEL_ID; + const envConfig = getEnvConfig(); + const id = envConfig.model || models.id || DEFAULT_MODEL_ID; return { id, dir: getModelsDir(repoRoot, cfg) }; } +/** + * Resolve runtime configuration for a repo. + * @param {string} repoRoot + * @param {object|null} userConfig + * @returns {{maxOldSpaceMb:number|null,nodeOptions:string,uvThreadpoolSize:number|null}} + */ +export function getRuntimeConfig(repoRoot, userConfig = null) { + const cfg = userConfig || loadUserConfig(repoRoot); + const runtime = cfg.runtime || {}; + const envConfig = getEnvConfig(); + const rawMaxOldSpace = runtime.maxOldSpaceMb ?? envConfig.maxOldSpaceMb; + const parsedMaxOldSpace = Number(rawMaxOldSpace); + const maxOldSpaceMb = Number.isFinite(parsedMaxOldSpace) && parsedMaxOldSpace > 0 + ? parsedMaxOldSpace + : null; + const nodeOptionsRaw = runtime.nodeOptions ?? envConfig.nodeOptions; + const nodeOptions = typeof nodeOptionsRaw === 'string' ? nodeOptionsRaw.trim() : ''; + const rawUvThreadpoolSize = runtime.uvThreadpoolSize ?? envConfig.uvThreadpoolSize; + const parsedUvThreadpoolSize = Number(rawUvThreadpoolSize); + const uvThreadpoolSize = Number.isFinite(parsedUvThreadpoolSize) && parsedUvThreadpoolSize > 0 + ? Math.floor(parsedUvThreadpoolSize) + : null; + return { maxOldSpaceMb, nodeOptions, uvThreadpoolSize }; +} + +/** + * Resolve runtime cache limits and TTLs for a repo. + * @param {string} repoRoot + * @param {object|null} userConfig + * @returns {{fileText:{maxMb:number,ttlMs:number},summary:{maxMb:number,ttlMs:number},lint:{maxMb:number,ttlMs:number},complexity:{maxMb:number,ttlMs:number},gitMeta:{maxMb:number,ttlMs:number}}} + */ +export function getCacheRuntimeConfig(repoRoot, userConfig = null) { + const cfg = userConfig || loadUserConfig(repoRoot); + const runtimeCache = cfg.cache?.runtime || {}; + const resolveEntry = (key) => { + const entry = runtimeCache[key] || {}; + const maxMbRaw = entry.maxMb; + const ttlMsRaw = entry.ttlMs; + const maxMb = Number.isFinite(Number(maxMbRaw)) + ? Math.max(0, Number(maxMbRaw)) + : (DEFAULT_CACHE_MB[key] || 0); + const ttlMs = Number.isFinite(Number(ttlMsRaw)) + ? Math.max(0, Number(ttlMsRaw)) + : (DEFAULT_CACHE_TTL_MS[key] || 0); + return { maxMb, ttlMs }; + }; + return { + fileText: resolveEntry('fileText'), + summary: resolveEntry('summary'), + lint: resolveEntry('lint'), + complexity: resolveEntry('complexity'), + gitMeta: resolveEntry('gitMeta') + }; +} + +/** + * Merge runtime Node options with existing NODE_OPTIONS. + * @param {{maxOldSpaceMb:number|null,nodeOptions:string,uvThreadpoolSize:number|null}} runtimeConfig + * @param {string} [baseOptions] + * @returns {string} + */ +export function resolveNodeOptions(runtimeConfig, baseOptions = process.env.NODE_OPTIONS || '') { + const base = typeof baseOptions === 'string' ? 
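+  // Illustrative example: runtimeConfig = { maxOldSpaceMb: 4096, nodeOptions: '--enable-source-maps' }
+  // with NODE_OPTIONS='--trace-warnings' resolves to
+  // '--trace-warnings --enable-source-maps --max-old-space-size=4096';
+  // the heap flag is only appended when no --max-old-space-size is already present.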
baseOptions.trim() : ''; + const extras = []; + if (runtimeConfig?.nodeOptions) extras.push(runtimeConfig.nodeOptions.trim()); + if (Number.isFinite(runtimeConfig?.maxOldSpaceMb) && runtimeConfig.maxOldSpaceMb > 0) { + const combined = [base, ...extras].join(' '); + if (!combined.includes('--max-old-space-size')) { + extras.push(`--max-old-space-size=${Math.floor(runtimeConfig.maxOldSpaceMb)}`); + } + } + return [base, ...extras].filter(Boolean).join(' ').trim(); +} + + +/** + * Resolve the child-process runtime environment for PairOfCleats tool launches. + * Applies runtime Node options and (optionally) propagates UV_THREADPOOL_SIZE when configured. + * Note: UV_THREADPOOL_SIZE must be set before the Node process starts to affect libuv. + * @param {{maxOldSpaceMb:number|null,nodeOptions:string,uvThreadpoolSize:number|null}} runtimeConfig + * @param {NodeJS.ProcessEnv} [baseEnv] + * @returns {NodeJS.ProcessEnv} + */ +export function resolveRuntimeEnv(runtimeConfig, baseEnv = process.env) { + const env = { ...baseEnv }; + const resolvedNodeOptions = resolveNodeOptions(runtimeConfig, env.NODE_OPTIONS || ''); + if (resolvedNodeOptions) { + env.NODE_OPTIONS = resolvedNodeOptions; + } + + const uvThreadpoolSize = runtimeConfig?.uvThreadpoolSize; + if ( + Number.isFinite(Number(uvThreadpoolSize)) + && Number(uvThreadpoolSize) > 0 + && !env.UV_THREADPOOL_SIZE + ) { + env.UV_THREADPOOL_SIZE = String(Math.floor(Number(uvThreadpoolSize))); + } + + return env; +} + /** * Resolve the index directory for a repo/mode. * @param {string} repoRoot @@ -154,8 +553,9 @@ export function getModelConfig(repoRoot, userConfig = null) { * @param {object|null} userConfig * @returns {string} */ -export function getIndexDir(repoRoot, mode, userConfig = null) { - return path.join(getRepoCacheRoot(repoRoot, userConfig), `index-${mode}`); +export function getIndexDir(repoRoot, mode, userConfig = null, options = {}) { + const base = resolveIndexRoot(repoRoot, userConfig, { ...options, mode }); + return path.join(base, `index-${mode}`); } /** @@ -223,18 +623,40 @@ export function getRepoDictPath(repoRoot, dictConfig = null) { return path.join(config.dir, 'repos', `${repoId}.txt`); } +/** + * Resolve LMDB database paths for the repo. + * @param {string} repoRoot + * @param {object|null} userConfig + * @returns {{codePath:string,prosePath:string,dbDir:string}} + */ +export function resolveLmdbPaths(repoRoot, userConfig = null, options = {}) { + const cfg = userConfig || loadUserConfig(repoRoot); + const lmdb = cfg.lmdb || {}; + const indexRoot = resolveIndexRoot(repoRoot, cfg, options); + const defaultDir = path.join(indexRoot, 'index-lmdb'); + const dbDir = lmdb.dbDir ? resolvePath(repoRoot, lmdb.dbDir) : defaultDir; + const codePath = lmdb.codeDbPath + ? resolvePath(repoRoot, lmdb.codeDbPath) + : path.join(dbDir, 'index-code'); + const prosePath = lmdb.proseDbPath + ? resolvePath(repoRoot, lmdb.proseDbPath) + : path.join(dbDir, 'index-prose'); + return { codePath, prosePath, dbDir }; +} + /** * Resolve SQLite database paths for the repo. 
* @param {string} repoRoot * @param {object|null} userConfig * @returns {{codePath:string,prosePath:string,dbDir:string,legacyPath:string,legacyExists:boolean}} */ -export function resolveSqlitePaths(repoRoot, userConfig = null) { +export function resolveSqlitePaths(repoRoot, userConfig = null, options = {}) { const cfg = userConfig || loadUserConfig(repoRoot); const sqlite = cfg.sqlite || {}; const repoCacheRoot = getRepoCacheRoot(repoRoot, cfg); - const defaultDir = path.join(repoCacheRoot, 'index-sqlite'); - const legacyPath = sqlite.dbPath ? resolvePath(repoRoot, sqlite.dbPath) : path.join(defaultDir, 'index.db'); + const indexRoot = resolveIndexRoot(repoRoot, cfg, options); + const defaultDir = path.join(indexRoot, 'index-sqlite'); + const legacyPath = path.join(repoCacheRoot, 'index-sqlite', 'index.db'); const dbDir = sqlite.dbDir ? resolvePath(repoRoot, sqlite.dbDir) : defaultDir; const codePath = sqlite.codeDbPath ? resolvePath(repoRoot, sqlite.codeDbPath) @@ -259,9 +681,10 @@ export function resolveSqlitePaths(repoRoot, userConfig = null) { */ export function getModelsDir(repoRoot, userConfig = null) { const cfg = userConfig || loadUserConfig(repoRoot); - const cacheRoot = (cfg.cache && cfg.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); + const envConfig = getEnvConfig(); + const cacheRoot = (cfg.cache && cfg.cache.root) || envConfig.cacheRoot || getCacheRoot(); const models = cfg.models || {}; - return models.dir || process.env.PAIROFCLEATS_MODELS_DIR || path.join(cacheRoot, 'models'); + return models.dir || envConfig.modelsDir || path.join(cacheRoot, 'models'); } /** @@ -272,26 +695,70 @@ export function getModelsDir(repoRoot, userConfig = null) { */ export function getToolingDir(repoRoot, userConfig = null) { const cfg = userConfig || loadUserConfig(repoRoot); - const cacheRoot = (cfg.cache && cfg.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); + const envConfig = getEnvConfig(); + const cacheRoot = (cfg.cache && cfg.cache.root) || envConfig.cacheRoot || getCacheRoot(); const tooling = cfg.tooling || {}; - return tooling.dir || process.env.PAIROFCLEATS_TOOLING_DIR || path.join(cacheRoot, 'tooling'); + return tooling.dir || envConfig.toolingDir || path.join(cacheRoot, 'tooling'); } /** * Resolve tooling configuration for a repo. * @param {string} repoRoot * @param {object|null} userConfig - * @returns {{autoInstallOnDetect:boolean,installScope:string,allowGlobalFallback:boolean,dir:string}} + * @returns {{autoInstallOnDetect:boolean,autoEnableOnDetect:boolean,installScope:string,allowGlobalFallback:boolean,dir:string,enabledTools:string[],disabledTools:string[],typescript:{enabled:boolean,resolveOrder:string[],useTsconfig:boolean,tsconfigPath:string},clangd:{requireCompilationDatabase:boolean,compileCommandsDir:string}}} */ export function getToolingConfig(repoRoot, userConfig = null) { const cfg = userConfig || loadUserConfig(repoRoot); const tooling = cfg.tooling || {}; - const installScope = (tooling.installScope || process.env.PAIROFCLEATS_TOOLING_INSTALL_SCOPE || 'cache').toLowerCase(); + const typescript = tooling.typescript || {}; + const clangd = tooling.clangd || {}; + const envConfig = getEnvConfig(); + const timeoutMs = Number(tooling.timeoutMs ?? envConfig.toolingTimeoutMs); + const maxRetries = Number(tooling.maxRetries ?? envConfig.toolingMaxRetries); + const breakerThreshold = Number(tooling.circuitBreakerThreshold ?? envConfig.toolingCircuitBreaker); + const logDir = typeof tooling.logDir === 'string' ? 
tooling.logDir : ''; + const installScope = (tooling.installScope || envConfig.toolingInstallScope || 'cache').toLowerCase(); + const normalizeOrder = (value) => { + if (Array.isArray(value)) return value.map((entry) => String(entry).trim()).filter(Boolean); + if (typeof value === 'string') { + return value.split(',').map((entry) => entry.trim()).filter(Boolean); + } + return null; + }; + const normalizeToolList = (value) => { + if (Array.isArray(value)) { + return value.map((entry) => String(entry).trim().toLowerCase()).filter(Boolean); + } + if (typeof value === 'string') { + return value.split(',').map((entry) => entry.trim().toLowerCase()).filter(Boolean); + } + return []; + }; + const enabledTools = normalizeToolList(tooling.enabledTools); + const disabledTools = normalizeToolList(tooling.disabledTools); + const resolveOrder = normalizeOrder(typescript.resolveOrder) || ['repo', 'cache', 'global']; return { autoInstallOnDetect: tooling.autoInstallOnDetect === true, + autoEnableOnDetect: tooling.autoEnableOnDetect !== false, + timeoutMs: Number.isFinite(timeoutMs) ? Math.max(1000, Math.floor(timeoutMs)) : null, + maxRetries: Number.isFinite(maxRetries) ? Math.max(0, Math.floor(maxRetries)) : null, + circuitBreakerThreshold: Number.isFinite(breakerThreshold) ? Math.max(1, Math.floor(breakerThreshold)) : null, + logDir: logDir.trim(), installScope, allowGlobalFallback: tooling.allowGlobalFallback !== false, - dir: getToolingDir(repoRoot, cfg) + dir: getToolingDir(repoRoot, cfg), + enabledTools, + disabledTools, + typescript: { + enabled: typescript.enabled !== false, + resolveOrder, + useTsconfig: typescript.useTsconfig !== false, + tsconfigPath: typeof typescript.tsconfigPath === 'string' ? typescript.tsconfigPath : '' + }, + clangd: { + requireCompilationDatabase: clangd.requireCompilationDatabase === true, + compileCommandsDir: typeof clangd.compileCommandsDir === 'string' ? 
clangd.compileCommandsDir : '' + } }; } @@ -303,12 +770,13 @@ export function getToolingConfig(repoRoot, userConfig = null) { */ export function getExtensionsDir(repoRoot, userConfig = null) { const cfg = userConfig || loadUserConfig(repoRoot); - const cacheRoot = (cfg.cache && cfg.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); + const envConfig = getEnvConfig(); + const cacheRoot = (cfg.cache && cfg.cache.root) || envConfig.cacheRoot || getCacheRoot(); const extensions = cfg.extensions || {}; const sqliteVector = cfg.sqlite?.vectorExtension || {}; return extensions.dir || sqliteVector.dir - || process.env.PAIROFCLEATS_EXTENSIONS_DIR + || envConfig.extensionsDir || path.join(cacheRoot, 'extensions'); } @@ -389,6 +857,8 @@ export async function getDictionaryPaths(repoRoot, dictConfig = null) { if (config.enableRepoDictionary) { const repoDict = getRepoDictPath(repoRoot, config); if (fs.existsSync(repoDict)) paths.push(repoDict); + const legacyRepoDict = path.join(config.dir, 'repos', `${getLegacyRepoId(repoRoot)}.txt`); + if (fs.existsSync(legacyRepoDict)) paths.push(legacyRepoDict); } if (!paths.length) { diff --git a/tools/download-dicts.js b/tools/download-dicts.js index d607e310e..173660c8a 100644 --- a/tools/download-dicts.js +++ b/tools/download-dicts.js @@ -1,18 +1,27 @@ #!/usr/bin/env node import fs from 'node:fs/promises'; import fsSync from 'node:fs'; +import crypto from 'node:crypto'; import path from 'node:path'; import http from 'node:http'; import https from 'node:https'; import { URL } from 'node:url'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; +import { createError, ERROR_CODES } from '../src/shared/error-codes.js'; import { getDictConfig, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['update', 'force'], - string: ['lang', 'dir', 'url', 'repo'], - default: { update: false, force: false } -}); +const argv = createCli({ + scriptName: 'download-dicts', + options: { + update: { type: 'boolean', default: false }, + force: { type: 'boolean', default: false }, + lang: { type: 'string' }, + dir: { type: 'string' }, + url: { type: 'string', array: true }, + sha256: { type: 'string', array: true }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const repoRoot = rootArg || resolveRepoRoot(process.cwd()); @@ -32,6 +41,80 @@ try { manifest = {}; } +const normalizeHash = (value) => { + if (!value) return null; + const trimmed = String(value).trim().toLowerCase(); + if (!trimmed) return null; + const normalized = trimmed.startsWith('sha256:') ? trimmed.slice(7) : trimmed; + if (!/^[a-f0-9]{64}$/.test(normalized)) return null; + return normalized; +}; + +const parseHashes = (input) => { + if (!input) return {}; + const items = Array.isArray(input) ? input : [input]; + const out = {}; + for (const item of items) { + const eq = String(item || '').indexOf('='); + if (eq <= 0 || eq >= item.length - 1) continue; + const name = item.slice(0, eq); + const hash = normalizeHash(item.slice(eq + 1)); + if (name && hash) out[name] = hash; + } + return out; +}; + +const resolveDownloadPolicy = (cfg) => { + const policy = cfg?.security?.downloads || {}; + const allowlist = policy.allowlist && typeof policy.allowlist === 'object' + ? 
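+  // Expected shapes (illustrative): hashes arrive either via the CLI, e.g.
+  //   node tools/download-dicts.js --lang en --sha256 en=<64-hex-digest>
+  // or via config security.downloads.allowlist keyed by source name/url/file, e.g.
+  //   { "security": { "downloads": { "requireHash": true, "allowlist": { "en": "sha256:<digest>" } } } }
+  // normalizeHash accepts bare or "sha256:"-prefixed 64-hex digests and rejects anything else.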
policy.allowlist + : {}; + return { + requireHash: policy.requireHash === true, + warnUnsigned: policy.warnUnsigned !== false, + allowlist + }; +}; + +const resolveExpectedHash = (source, policy, overrides) => { + const explicit = normalizeHash(source?.sha256 || source?.hash); + if (explicit) return explicit; + const allowlist = policy?.allowlist || {}; + const fallback = overrides?.[source?.name] + || overrides?.[source?.url] + || overrides?.[source?.file] + || allowlist[source?.name] + || allowlist[source?.url] + || allowlist[source?.file]; + return normalizeHash(fallback); +}; + +const verifyDownloadHash = (source, buffer, expectedHash, policy) => { + if (!expectedHash) { + if (policy?.requireHash) { + throw createError( + ERROR_CODES.DOWNLOAD_VERIFY_FAILED, + `Download verification requires a sha256 hash (${source?.name || source?.url || 'unknown source'}).` + ); + } + if (policy?.warnUnsigned) { + console.warn(`[download] Skipping hash verification for ${source?.name || source?.url || 'unknown source'}.`); + } + return null; + } + const actual = crypto.createHash('sha256').update(buffer).digest('hex'); + if (actual !== expectedHash) { + throw createError( + ERROR_CODES.DOWNLOAD_VERIFY_FAILED, + `Download verification failed for ${source?.name || source?.url || 'unknown source'}.` + ); + } + return actual; +}; + +const hashOverrides = parseHashes(argv.sha256); +const downloadPolicy = resolveDownloadPolicy(userConfig); + const SOURCES = { en: { name: 'en', @@ -45,14 +128,17 @@ const SOURCES = { * @param {string|string[]|null} input * @returns {Array<{name:string,url:string,file:string}>} */ -function parseUrls(input) { +function parseUrls(input, hashes = null) { if (!input) return []; const items = Array.isArray(input) ? input : [input]; const sources = []; for (const item of items) { - const [name, url] = item.split('='); - if (!name || !url) continue; - sources.push({ name, url, file: `${name}.txt` }); + const eq = item.indexOf('='); + if (eq <= 0 || eq >= item.length - 1) continue; + const name = item.slice(0, eq); + const url = item.slice(eq + 1); + const sha256 = hashes && hashes[name] ? hashes[name] : null; + sources.push({ name, url, file: `${name}.txt`, sha256 }); } return sources; } @@ -122,12 +208,17 @@ async function downloadSource(source) { throw new Error(`Failed to download ${source.url}: ${response.statusCode}`); } + const expectedHash = resolveExpectedHash(source, downloadPolicy, hashOverrides); + const actualHash = verifyDownloadHash(source, response.body, expectedHash, downloadPolicy); + const text = response.body.toString('utf8'); await fs.writeFile(outputPath, text.endsWith('\n') ? 
text : `${text}\n`); manifest[source.name] = { url: source.url, file: source.file, + sha256: actualHash || expectedHash || null, + verified: Boolean(expectedHash), etag: response.headers.etag || null, lastModified: response.headers['last-modified'] || null, downloadedAt: new Date().toISOString() @@ -146,7 +237,7 @@ for (const lang of langs) { if (src) sources.push(src); } -const urlSources = parseUrls(argv.url); +const urlSources = parseUrls(argv.url, hashOverrides); sources.push(...urlSources); if (!sources.length) { diff --git a/tools/download-extensions.js b/tools/download-extensions.js index 7868141c9..a58950144 100644 --- a/tools/download-extensions.js +++ b/tools/download-extensions.js @@ -1,22 +1,33 @@ #!/usr/bin/env node import fs from 'node:fs/promises'; import fsSync from 'node:fs'; +import crypto from 'node:crypto'; import path from 'node:path'; import http from 'node:http'; import https from 'node:https'; import { pipeline } from 'node:stream/promises'; import { URL } from 'node:url'; import { createGunzip } from 'node:zlib'; -import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; +import { createError, ERROR_CODES } from '../src/shared/error-codes.js'; import { loadUserConfig, resolveRepoRoot } from './dict-utils.js'; import { getBinarySuffix, getPlatformKey, getVectorExtensionConfig, resolveVectorExtensionPath } from './vector-extension.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['update', 'force'], - string: ['provider', 'dir', 'url', 'out', 'platform', 'arch', 'repo'], - default: { update: false, force: false } -}); +const argv = createCli({ + scriptName: 'download-extensions', + options: { + update: { type: 'boolean', default: false }, + force: { type: 'boolean', default: false }, + provider: { type: 'string' }, + dir: { type: 'string' }, + url: { type: 'string' }, + sha256: { type: 'string', array: true }, + out: { type: 'string' }, + platform: { type: 'string' }, + arch: { type: 'string' }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const repoRoot = rootArg || resolveRepoRoot(process.cwd()); @@ -41,6 +52,105 @@ try { manifest = {}; } +const FILE_MODE = 0o644; +const DIR_MODE = 0o755; +const DEFAULT_ARCHIVE_LIMITS = { + maxBytes: 200 * 1024 * 1024, + maxEntryBytes: 50 * 1024 * 1024, + maxEntries: 2048 +}; + +const normalizeLimit = (value, fallback) => { + if (value === 0 || value === false) return null; + const parsed = Number(value); + if (Number.isFinite(parsed) && parsed > 0) return Math.floor(parsed); + return fallback; +}; + +const normalizeHash = (value) => { + if (!value) return null; + const trimmed = String(value).trim().toLowerCase(); + if (!trimmed) return null; + const normalized = trimmed.startsWith('sha256:') ? trimmed.slice(7) : trimmed; + if (!/^[a-f0-9]{64}$/.test(normalized)) return null; + return normalized; +}; + +const parseHashes = (input) => { + if (!input) return {}; + const items = Array.isArray(input) ? input : [input]; + const out = {}; + for (const item of items) { + const eq = String(item || '').indexOf('='); + if (eq <= 0 || eq >= item.length - 1) continue; + const name = item.slice(0, eq); + const hash = normalizeHash(item.slice(eq + 1)); + if (name && hash) out[name] = hash; + } + return out; +}; + +const resolveDownloadPolicy = (cfg) => { + const policy = cfg?.security?.downloads || {}; + const allowlist = policy.allowlist && typeof policy.allowlist === 'object' + ? 
policy.allowlist + : {}; + return { + requireHash: policy.requireHash === true, + warnUnsigned: policy.warnUnsigned !== false, + allowlist + }; +}; + +const resolveArchiveLimits = (cfg) => { + const archives = cfg?.security?.archives || {}; + return { + maxBytes: normalizeLimit(archives.maxBytes, DEFAULT_ARCHIVE_LIMITS.maxBytes), + maxEntryBytes: normalizeLimit(archives.maxEntryBytes, DEFAULT_ARCHIVE_LIMITS.maxEntryBytes), + maxEntries: normalizeLimit(archives.maxEntries, DEFAULT_ARCHIVE_LIMITS.maxEntries) + }; +}; + +const resolveExpectedHash = (source, policy, overrides) => { + const explicit = normalizeHash(source?.sha256 || source?.hash); + if (explicit) return explicit; + const allowlist = policy?.allowlist || {}; + const fallback = overrides?.[source?.name] + || overrides?.[source?.url] + || overrides?.[source?.file] + || allowlist[source?.name] + || allowlist[source?.url] + || allowlist[source?.file]; + return normalizeHash(fallback); +}; + +const verifyDownloadHash = (source, buffer, expectedHash, policy) => { + if (!expectedHash) { + if (policy?.requireHash) { + throw createError( + ERROR_CODES.DOWNLOAD_VERIFY_FAILED, + `Download verification requires a sha256 hash (${source?.name || source?.url || 'unknown source'}).` + ); + } + if (policy?.warnUnsigned) { + console.warn(`[download] Skipping hash verification for ${source?.name || source?.url || 'unknown source'}.`); + } + return null; + } + const actual = crypto.createHash('sha256').update(buffer).digest('hex'); + if (actual !== expectedHash) { + throw createError( + ERROR_CODES.DOWNLOAD_VERIFY_FAILED, + `Download verification failed for ${source?.name || source?.url || 'unknown source'}.` + ); + } + return actual; +}; + +const hashOverrides = parseHashes(argv.sha256); +const downloadPolicy = resolveDownloadPolicy(userConfig); +const archiveLimits = resolveArchiveLimits(userConfig); + /** * Identify the archive type from a filename or URL. * @param {string|undefined|null} value @@ -64,51 +174,203 @@ function getArchiveTypeForSource(source) { return getArchiveType(source.file) || getArchiveType(source.url); } -/** - * Run a command and return true if it succeeded. - * @param {string} cmd - * @param {string[]} args - * @returns {boolean} - */ -function runCommand(cmd, args) { - const result = spawnSync(cmd, args, { stdio: 'inherit' }); - return result.status === 0; +function normalizeArchiveEntry(entryName) { + const name = String(entryName || '').replace(/\\/g, '/').trim(); + let cleaned = name.replace(/^(\.\/)+/, ''); + cleaned = cleaned.replace(/^\/+/, ''); + // Handle Windows extended-length paths that can appear as //?/C:/... + cleaned = cleaned.replace(/^\?\//, ''); + // Strip Windows drive-letter prefixes (e.g., C:, C:/, C:\) + cleaned = cleaned.replace(/^[A-Za-z]:/, ''); + cleaned = cleaned.replace(/^\/+/, ''); + return path.posix.normalize(cleaned); } -async function extractZipNode(archivePath, destDir) { - try { - const mod = await import('adm-zip'); - const AdmZip = mod.default || mod; - const zip = new AdmZip(archivePath); - zip.extractAllTo(destDir, true); - return true; - } catch { - return false; +function isArchivePathSafe(rootDir, entryName) { + const normalized = normalizeArchiveEntry(entryName); + if (!normalized) return false; + if (normalized === '.' 
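+  // Illustrative behavior: "../../etc/passwd" and "foo/../../bar" are rejected here,
+  // while "/etc/passwd" or "C:\\evil.dll" have already been reduced to relative names
+  // by normalizeArchiveEntry and end up confined under the destination directory.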
|| normalized === '..') return false; + if (normalized.startsWith('../') || normalized.includes('/../')) return false; + if (/^[A-Za-z]:/.test(normalized)) return false; + if (path.posix.isAbsolute(normalized) || path.win32.isAbsolute(normalized)) return false; + const root = path.resolve(rootDir); + const resolved = path.resolve(root, normalized); + const rootPrefix = root.endsWith(path.sep) ? root : `${root}${path.sep}`; + if (process.platform === 'win32') { + return resolved.toLowerCase().startsWith(rootPrefix.toLowerCase()); } + return resolved.startsWith(rootPrefix); } -async function extractTarNode(archivePath, destDir, gzip) { - try { - const mod = await import('tar-fs'); - const tarFs = mod.default || mod; - await fs.mkdir(destDir, { recursive: true }); - const extract = tarFs.extract(destDir); - const source = fsSync.createReadStream(archivePath); - if (gzip) { - await pipeline(source, createGunzip(), extract); - } else { - await pipeline(source, extract); +function resolveArchivePath(rootDir, entryName) { + if (!isArchivePathSafe(rootDir, entryName)) return null; + const normalized = normalizeArchiveEntry(entryName); + return path.resolve(rootDir, normalized); +} + +function isZipSymlink(entry) { + const attr = entry?.header?.attr; + if (typeof attr !== 'number') return false; + const mode = attr >>> 16; + return (mode & 0o170000) === 0o120000; +} + +function createArchiveLimiter(limits) { + const maxEntries = Number.isFinite(limits?.maxEntries) ? limits.maxEntries : null; + const maxEntryBytes = Number.isFinite(limits?.maxEntryBytes) ? limits.maxEntryBytes : null; + const maxBytes = Number.isFinite(limits?.maxBytes) ? limits.maxBytes : null; + let entries = 0; + let totalBytes = 0; + const checkTotals = () => { + if (maxBytes && totalBytes > maxBytes) { + throw createError(ERROR_CODES.ARCHIVE_TOO_LARGE, `Archive exceeds max size (${totalBytes} > ${maxBytes}).`); + } + }; + const checkEntry = (name, size) => { + entries += 1; + if (maxEntries && entries > maxEntries) { + throw createError(ERROR_CODES.ARCHIVE_TOO_LARGE, `Archive exceeds entry limit (${entries} > ${maxEntries}).`); + } + const entryBytes = Number.isFinite(size) && size > 0 ? size : 0; + if (maxEntryBytes && entryBytes > maxEntryBytes) { + throw createError(ERROR_CODES.ARCHIVE_TOO_LARGE, `Archive entry too large (${name}).`); + } + totalBytes += entryBytes; + checkTotals(); + return entryBytes; + }; + const addBytes = (delta) => { + if (!Number.isFinite(delta) || delta <= 0) return; + totalBytes += delta; + checkTotals(); + }; + return { checkEntry, addBytes }; +} + + +async function extractZipNode(archivePath, destDir, limits) { + const mod = await import('adm-zip'); + const AdmZip = mod.default || mod; + const zip = new AdmZip(archivePath); + const entries = zip.getEntries(); + const limiter = createArchiveLimiter(limits); + await fs.mkdir(destDir, { recursive: true }); + for (const entry of entries) { + if (isZipSymlink(entry)) { + throw createError(ERROR_CODES.ARCHIVE_UNSAFE, `unsafe zip entry (symlink): ${entry.entryName}`); + } + const targetPath = resolveArchivePath(destDir, entry.entryName); + if (!targetPath) { + throw createError(ERROR_CODES.ARCHIVE_UNSAFE, `unsafe zip entry: ${entry.entryName}`); } - return true; - } catch { - return false; + const declaredSize = Number(entry?.header?.size); + const counted = limiter.checkEntry(entry.entryName, Number.isFinite(declaredSize) ? 
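+    // Default guardrails (from DEFAULT_ARCHIVE_LIMITS, illustrative): extraction aborts
+    // beyond 200 MB total, 50 MB per entry, or 2048 entries; cfg.security.archives can
+    // override each limit, and a value of 0/false disables that particular cap via normalizeLimit.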
declaredSize : 0); + if (entry.isDirectory) { + await fs.mkdir(targetPath, { recursive: true }); + try { await fs.chmod(targetPath, DIR_MODE); } catch {} + continue; + } + const data = entry.getData(); + if (limits?.maxEntryBytes && data.length > limits.maxEntryBytes) { + throw createError(ERROR_CODES.ARCHIVE_TOO_LARGE, `archive entry too large (${entry.entryName}).`); + } + if (data.length > counted) { + limiter.addBytes(data.length - counted); + } + await fs.mkdir(path.dirname(targetPath), { recursive: true }); + await fs.writeFile(targetPath, data, { mode: FILE_MODE }); + try { await fs.chmod(targetPath, FILE_MODE); } catch {} + } + return true; +} + +async function extractTarNode(archivePath, destDir, gzip, limits) { + const mod = await import('tar-stream'); + const tarStream = mod.default || mod; + const extract = tarStream.extract(); + const limiter = createArchiveLimiter(limits); + await fs.mkdir(destDir, { recursive: true }); + extract.on('entry', (header, stream, next) => { + const rawName = header?.name || ''; + const normalized = normalizeArchiveEntry(rawName); + const type = header?.type || 'file'; + + (async () => { + // Reject symlinks/hardlinks to avoid writing outside the destination or + // creating unexpected filesystem references. + if (type === 'symlink' || type === 'link') { + throw createError(ERROR_CODES.ARCHIVE_UNSAFE, `unsafe tar entry (symlink): ${rawName}`); + } + + // Skip empty / root-ish entries. + if (!normalized || normalized === '.' || normalized === '..') { + stream.resume(); + return; + } + + const targetPath = resolveArchivePath(destDir, normalized); + if (!targetPath) { + throw createError(ERROR_CODES.ARCHIVE_UNSAFE, `unsafe tar entry: ${rawName}`); + } + + if (type === 'directory') { + await fs.mkdir(targetPath, { recursive: true }); + try { await fs.chmod(targetPath, DIR_MODE); } catch {} + stream.resume(); + return; + } + + // Ignore special entries (devices, FIFOs, pax headers, etc.). + if (type !== 'file' && type !== 'contiguous-file') { + stream.resume(); + return; + } + + const declaredSize = Number(header?.size); + const counted = limiter.checkEntry( + normalized, + Number.isFinite(declaredSize) ? 
declaredSize : 0 + ); + + await fs.mkdir(path.dirname(targetPath), { recursive: true }); + + const writer = fsSync.createWriteStream(targetPath, { mode: FILE_MODE }); + let written = 0; + stream.on('data', (chunk) => { + written += chunk.length; + if (limits?.maxEntryBytes && written > limits.maxEntryBytes) { + stream.destroy( + createError(ERROR_CODES.ARCHIVE_TOO_LARGE, `archive entry too large (${normalized}).`) + ); + } + }); + + await pipeline(stream, writer); + + if (written > counted) { + limiter.addBytes(written - counted); + } + try { await fs.chmod(targetPath, FILE_MODE); } catch {} + })() + .then(() => next()) + .catch((err) => { + try { stream.resume(); } catch {} + extract.destroy(err); + }); + }); + const source = fsSync.createReadStream(archivePath); + if (gzip) { + await pipeline(source, createGunzip(), extract); + } else { + await pipeline(source, extract); } + return true; } -async function extractArchiveNode(archivePath, destDir, type) { - if (type === 'zip') return extractZipNode(archivePath, destDir); +async function extractArchiveNode(archivePath, destDir, type, limits) { + if (type === 'zip') return extractZipNode(archivePath, destDir, limits); const gzip = type === 'tar.gz'; - return extractTarNode(archivePath, destDir, gzip); + return extractTarNode(archivePath, destDir, gzip, limits); } /** @@ -118,22 +380,8 @@ async function extractArchiveNode(archivePath, destDir, type) { * @param {string} type * @returns {boolean} */ -async function extractArchive(archivePath, destDir, type) { - if (type === 'zip') { - if (runCommand('unzip', ['-o', archivePath, '-d', destDir])) return true; - if (runCommand('tar', ['-xf', archivePath, '-C', destDir])) return true; - if (process.platform === 'win32') { - const script = `Expand-Archive -LiteralPath "${archivePath}" -DestinationPath "${destDir}" -Force`; - if (runCommand('powershell', ['-NoProfile', '-Command', script])) return true; - if (runCommand('pwsh', ['-NoProfile', '-Command', script])) return true; - } - return extractArchiveNode(archivePath, destDir, type); - } - const tarArgs = type === 'tar.gz' - ? ['-xzf', archivePath, '-C', destDir] - : ['-xf', archivePath, '-C', destDir]; - if (runCommand('tar', tarArgs)) return true; - return extractArchiveNode(archivePath, destDir, type); +async function extractArchive(archivePath, destDir, type, limits) { + return extractArchiveNode(archivePath, destDir, type, limits); } /** @@ -173,15 +421,18 @@ async function findFile(rootDir, targetName, suffix) { * @param {string} suffix * @returns {Array<{name:string,url:string,file:string}>} */ -function parseUrls(input, suffix) { +function parseUrls(input, suffix, hashes = null) { if (!input) return []; const items = Array.isArray(input) ? input : [input]; const sources = []; for (const item of items) { - const [name, url] = item.split('='); - if (!name || !url) continue; + const eq = item.indexOf('='); + if (eq <= 0 || eq >= item.length - 1) continue; + const name = item.slice(0, eq); + const url = item.slice(eq + 1); const fileName = name.includes('.') ? name : `${name}${suffix}`; - sources.push({ name, url, file: fileName }); + const sha256 = hashes && hashes[name] ? 
hashes[name] : null; + sources.push({ name, url, file: fileName, sha256 }); } return sources; } @@ -200,7 +451,8 @@ function resolveSourceFromConfig(cfg) { return { name: cfg.provider, url: byPlatform.url, - file: byPlatform.file || cfg.filename + file: byPlatform.file || cfg.filename, + sha256: byPlatform.sha256 || byPlatform.hash || null }; } if (typeof byPlatform === 'string') { @@ -248,7 +500,7 @@ function requestUrl(url, headers = {}, redirects = 0) { } const suffix = getBinarySuffix(config.platform); -const sources = parseUrls(argv.url, suffix); +const sources = parseUrls(argv.url, suffix, hashOverrides); if (!sources.length) { const fallback = resolveSourceFromConfig(config); if (fallback?.url) sources.push(fallback); @@ -319,17 +571,19 @@ async function downloadSource(source, index) { if (response.statusCode !== 200) { throw new Error(`Failed to download ${source.url}: ${response.statusCode}`); } + const expectedHash = resolveExpectedHash(source, downloadPolicy, hashOverrides); + const actualHash = verifyDownloadHash(source, response.body, expectedHash, downloadPolicy); if (archiveType) { await fs.mkdir(tempRoot, { recursive: true }); } - await fs.writeFile(downloadPath, response.body); + await fs.writeFile(downloadPath, response.body, { mode: FILE_MODE }); let extractedFrom = null; if (archiveType) { const extractDir = path.join(tempRoot, `extract-${Date.now()}`); await fs.mkdir(extractDir, { recursive: true }); - const ok = await extractArchive(downloadPath, extractDir, archiveType); + const ok = await extractArchive(downloadPath, extractDir, archiveType, archiveLimits); if (!ok) { throw new Error(`Failed to extract ${downloadPath} (${archiveType})`); } @@ -338,6 +592,7 @@ async function downloadSource(source, index) { throw new Error(`No extension binary found in ${downloadPath}`); } await fs.copyFile(extractedPath, outputPath); + try { await fs.chmod(outputPath, FILE_MODE); } catch {} extractedFrom = path.relative(extensionDir, extractedPath); await fs.rm(extractDir, { recursive: true, force: true }); await fs.rm(downloadPath, { force: true }); @@ -353,6 +608,8 @@ async function downloadSource(source, index) { provider: config.provider, platform: config.platform, arch: config.arch, + sha256: actualHash || expectedHash || null, + verified: Boolean(expectedHash), etag: response.headers.etag || null, lastModified: response.headers['last-modified'] || null, downloadedAt: new Date().toISOString() diff --git a/tools/download-models.js b/tools/download-models.js index bf4d85f99..af0686197 100644 --- a/tools/download-models.js +++ b/tools/download-models.js @@ -1,16 +1,18 @@ #!/usr/bin/env node import fs from 'node:fs/promises'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { pipeline, env } from '@xenova/transformers'; import { DEFAULT_MODEL_ID, getModelConfig, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - string: ['model', 'cache-dir', 'repo'], - default: { - model: DEFAULT_MODEL_ID +const argv = createCli({ + scriptName: 'download-models', + options: { + model: { type: 'string', default: DEFAULT_MODEL_ID }, + 'cache-dir': { type: 'string' }, + repo: { type: 'string' } } -}); +}).parse(); const rootArg = argv.repo ? 
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); diff --git a/tools/eval/run.js b/tools/eval/run.js new file mode 100644 index 000000000..6678fda63 --- /dev/null +++ b/tools/eval/run.js @@ -0,0 +1,194 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; +import { createCli } from '../../src/shared/cli.js'; +import { search as coreSearch } from '../../src/integrations/core/index.js'; +import { createSqliteDbCache } from '../../src/retrieval/sqlite-cache.js'; + +const argv = createCli({ + scriptName: 'eval-run', + options: { + repo: { type: 'string' }, + dataset: { type: 'string' }, + backend: { type: 'string', default: 'auto' }, + top: { type: 'number', default: 10 }, + ann: { type: 'boolean' }, + out: { type: 'string' }, + pretty: { type: 'boolean', default: false } + }, + aliases: { n: 'top' } +}).parse(); + +const root = process.cwd(); +const repoRoot = argv.repo ? path.resolve(argv.repo) : root; +const datasetPath = argv.dataset + ? path.resolve(argv.dataset) + : path.join(root, 'tests', 'fixtures', 'sample', 'eval.json'); +const backend = argv.backend ? String(argv.backend) : 'auto'; +const topN = Math.max(1, parseInt(argv.top, 10) || 10); +const annFlag = typeof argv.ann === 'boolean' ? argv.ann : null; +const ks = [1, 3, 5, 10].filter((k) => k <= Math.max(10, topN)); + +const loadDataset = () => { + const raw = fs.readFileSync(datasetPath, 'utf8'); + const data = JSON.parse(raw); + if (!Array.isArray(data)) return []; + return data; +}; + +const matchExpected = (hit, expected) => { + if (!hit) return false; + if (expected.file && hit.file !== expected.file) return false; + if (expected.name) { + const hitName = hit.name ? String(hit.name).toLowerCase() : ''; + if (!hitName.includes(String(expected.name).toLowerCase())) return false; + } + if (expected.kind) { + if (!hit.kind || String(hit.kind).toLowerCase() !== String(expected.kind).toLowerCase()) { + return false; + } + } + return true; +}; + +const computeRecallAtK = (ranks, totalRelevant, k) => { + if (!totalRelevant) return 0; + const found = ranks.filter((rank) => rank <= k).length; + return found / totalRelevant; +}; + +const computeMRR = (ranks) => { + if (!ranks.length) return 0; + return 1 / Math.min(...ranks); +}; + +const computeNDCG = (ranks, totalRelevant, k) => { + if (!totalRelevant) return 0; + const hits = ranks.filter((rank) => rank <= k).sort((a, b) => a - b); + if (!hits.length) return 0; + const dcg = hits.reduce((sum, rank) => sum + 1 / Math.log2(rank + 1), 0); + const idealCount = Math.min(totalRelevant, k); + let idcg = 0; + for (let i = 1; i <= idealCount; i += 1) { + idcg += 1 / Math.log2(i + 1); + } + return idcg ? 
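+  // Worked example (illustrative): ranks [2, 5], totalRelevant 3, k 5 =>
+  //   recall@5 = 2/3, MRR = 1/2,
+  //   DCG = 1/log2(3) + 1/log2(6) ≈ 1.018, IDCG = 1/log2(2) + 1/log2(3) + 1/log2(4) ≈ 2.131,
+  //   nDCG@5 ≈ 0.478.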
dcg / idcg : 0; +}; + +const runSearch = async (query, mode) => { + const args = ['--json-compact', '--repo', repoRoot, '-n', String(topN)]; + if (mode && mode !== 'both') args.push('--mode', mode); + if (backend && backend !== 'auto') args.push('--backend', backend); + if (annFlag === true) args.push('--ann'); + if (annFlag === false) args.push('--no-ann'); + + const payload = await coreSearch(repoRoot, { + args, + query, + emitOutput: false, + exitOnError: false, + indexCache: evalCaches.indexCache, + sqliteCache: evalCaches.sqliteCache + }); + if (mode === 'code') return payload.code || []; + if (mode === 'prose') return payload.prose || []; + return [...(payload.code || []), ...(payload.prose || [])]; +}; + +const evalCaches = { + indexCache: new Map(), + sqliteCache: createSqliteDbCache() +}; + +const cases = loadDataset(); +if (!cases.length) { + console.error(`No eval cases found at ${datasetPath}`); + process.exit(1); +} + +const results = []; +for (const entry of cases) { + const query = String(entry?.query || '').trim(); + if (!query) continue; + const mode = entry.mode || 'both'; + const silver = Array.isArray(entry.relevant) + ? entry.relevant + : (Array.isArray(entry.expect) ? entry.expect : []); + const gold = Array.isArray(entry.gold) ? entry.gold : []; + + const hits = await runSearch(query, mode); + const ranks = []; + const goldRanks = []; + hits.forEach((hit, index) => { + const rank = index + 1; + if (silver.some((exp) => matchExpected(hit, exp))) ranks.push(rank); + if (gold.some((exp) => matchExpected(hit, exp))) goldRanks.push(rank); + }); + + const metrics = { + recallAtK: Object.fromEntries(ks.map((k) => [k, computeRecallAtK(ranks, silver.length, k)])), + mrr: computeMRR(ranks), + ndcgAtK: Object.fromEntries(ks.map((k) => [k, computeNDCG(ranks, silver.length, k)])) + }; + const goldMetrics = gold.length + ? { + recallAtK: Object.fromEntries(ks.map((k) => [k, computeRecallAtK(goldRanks, gold.length, k)])), + mrr: computeMRR(goldRanks), + ndcgAtK: Object.fromEntries(ks.map((k) => [k, computeNDCG(goldRanks, gold.length, k)])) + } + : null; + + results.push({ + query, + mode, + totals: { + relevant: silver.length, + gold: gold.length, + hits: hits.length + }, + metrics, + goldMetrics + }); +} + +const aggregate = (field) => { + if (!results.length) return 0; + const sum = results.reduce((acc, entry) => acc + (entry.metrics?.[field] || 0), 0); + return sum / results.length; +}; + +const aggregateMap = (key) => { + const totals = {}; + if (!results.length) return totals; + for (const k of ks) { + const sum = results.reduce((acc, entry) => acc + (entry.metrics?.[key]?.[k] || 0), 0); + totals[k] = sum / results.length; + } + return totals; +}; + +const summary = { + cases: results.length, + recallAtK: aggregateMap('recallAtK'), + ndcgAtK: aggregateMap('ndcgAtK'), + mrr: aggregate('mrr') +}; + +const output = { + generatedAt: new Date().toISOString(), + repo: repoRoot, + dataset: datasetPath, + backend, + topN, + ann: annFlag, + ks, + summary, + results +}; + +if (argv.out) { + fs.writeFileSync(path.resolve(argv.out), JSON.stringify(output, null, 2)); +} + +const payload = argv.pretty ? 
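// Worked example for the metric helpers defined earlier in this file, with
// illustrative numbers: two of three relevant chunks are returned, at ranks 2 and 5.
//   recall@5 = 2 / 3 ≈ 0.667
//   MRR      = 1 / min(2, 5) = 0.5
//   NDCG@5   = (1/log2(3) + 1/log2(6)) / (1/log2(2) + 1/log2(3) + 1/log2(4)) ≈ 0.478
const exampleRanks = [2, 5];
const exampleRelevant = 3;
console.log(computeRecallAtK(exampleRanks, exampleRelevant, 5)); // ≈ 0.667
console.log(computeMRR(exampleRanks));                           // 0.5
console.log(computeNDCG(exampleRanks, exampleRelevant, 5));      // ≈ 0.478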
JSON.stringify(output, null, 2) : JSON.stringify(output); +console.log(payload); diff --git a/tools/eval/sample.json b/tools/eval/sample.json new file mode 100644 index 000000000..f083378d1 --- /dev/null +++ b/tools/eval/sample.json @@ -0,0 +1,19 @@ +[ + { + "query": "greet", + "mode": "code", + "relevant": [{ "file": "src/index.js", "name": "greet" }], + "gold": [{ "file": "src/index.js", "name": "greet" }] + }, + { + "query": "clamp", + "mode": "code", + "relevant": [{ "file": "src/util.js", "name": "clamp" }] + }, + { + "query": "guide", + "mode": "prose", + "relevant": [{ "file": "docs/guide.md", "name": "Guide" }], + "gold": [{ "file": "docs/guide.md", "name": "Guide" }] + } +] diff --git a/tools/generate-demo-config.js b/tools/generate-demo-config.js new file mode 100644 index 000000000..46049cf2d --- /dev/null +++ b/tools/generate-demo-config.js @@ -0,0 +1,134 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { parseJsoncText } from '../src/shared/jsonc.js'; +import { DEFAULT_USER_CONFIG_TEMPLATE } from './default-config-template.js'; + +const argv = createCli({ + scriptName: 'generate-demo-config', + options: { + schema: { type: 'string', default: 'docs/config-schema.json' }, + out: { type: 'string', default: 'demo.pairofcleats.json' } + } +}).parse(); + +const schemaPath = path.resolve(argv.schema); +const outPath = path.resolve(argv.out); +const schemaRaw = await fs.readFile(schemaPath, 'utf8'); +const schema = JSON.parse(schemaRaw); +const templateDefaults = parseJsoncText(DEFAULT_USER_CONFIG_TEMPLATE, 'default-config-template'); + +const collectTypes = (node) => { + if (!node || typeof node !== 'object') return []; + if (Array.isArray(node.type)) return node.type; + if (typeof node.type === 'string') return [node.type]; + const options = node.oneOf || node.anyOf || []; + const nested = []; + for (const option of options) { + nested.push(...collectTypes(option)); + } + return Array.from(new Set(nested)); +}; + +const collectEnum = (node) => { + if (!node || typeof node !== 'object') return []; + if (Array.isArray(node.enum)) return node.enum.slice(); + if (node.const !== undefined) return [node.const]; + const options = node.oneOf || node.anyOf || []; + const values = []; + for (const option of options) { + values.push(...collectEnum(option)); + } + return Array.from(new Set(values)); +}; + +const resolveDefault = (node) => { + if (!node || typeof node !== 'object') return { value: null, hasDefault: false }; + if (node.default !== undefined) return { value: node.default, hasDefault: true }; + if (node.const !== undefined) return { value: node.const, hasDefault: true }; + const types = collectTypes(node); + if (types.includes('array')) return { value: [], hasDefault: false }; + return { value: null, hasDefault: false }; +}; + +const formatValue = (value) => { + return JSON.stringify(value); +}; + +const describeAcceptedValues = (node) => { + const enumValues = collectEnum(node); + if (enumValues.length) { + return `Accepted values: ${enumValues.map(formatValue).join(', ')}`; + } + const types = collectTypes(node); + if (types.includes('boolean')) { + return 'Accepted values: true, false'; + } + const itemEnums = collectEnum(node?.items); + if (itemEnums.length) { + return `Accepted values (items): ${itemEnums.map(formatValue).join(', ')}`; + } + return ''; +}; + +const describeDefault = (node, hasDefault, value, templateValue) => { + if (templateValue !== undefined) return 
`Default: ${formatValue(templateValue)}`; + if (hasDefault) return `Default: ${formatValue(value)}`; + if (node && node.default !== undefined) return `Default: ${formatValue(node.default)}`; + return ''; +}; + +const describeMax = (node) => { + if (!node || typeof node !== 'object') return ''; + if (Number.isFinite(node.maximum)) return `Max: ${node.maximum}`; + if (Number.isFinite(node.maxItems)) return `Max items: ${node.maxItems}`; + if (Number.isFinite(node.maxLength)) return `Max length: ${node.maxLength}`; + if (Number.isFinite(node.maxProperties)) return `Max properties: ${node.maxProperties}`; + return ''; +}; + +const renderProperties = (node, lines, indent, pathPrefix, templateNode) => { + const properties = node?.properties && typeof node.properties === 'object' + ? node.properties + : {}; + const keys = Object.keys(properties); + keys.forEach((key, index) => { + const prop = properties[key]; + const propPath = pathPrefix ? `${pathPrefix}.${key}` : key; + const { value, hasDefault } = resolveDefault(prop); + const templateValue = templateNode && typeof templateNode === 'object' + ? templateNode[key] + : undefined; + const types = collectTypes(prop); + const accepted = describeAcceptedValues(prop); + if (accepted) lines.push(`${indent}// ${accepted}`); + const defaultLine = describeDefault(prop, hasDefault, value, templateValue); + if (defaultLine) lines.push(`${indent}// ${defaultLine}`); + const maxLine = describeMax(prop); + if (maxLine) lines.push(`${indent}// ${maxLine}`); + + const isObject = types.includes('object') && prop?.properties && typeof prop.properties === 'object'; + const isLeafObject = types.includes('object') && !prop?.properties; + const comma = index < keys.length - 1 ? ',' : ''; + if (isObject) { + lines.push(`${indent}"${key}": {`); + renderProperties(prop, lines, `${indent} `, propPath, templateValue); + lines.push(`${indent}}${comma}`); + } else if (isLeafObject && hasDefault && typeof value === 'object') { + lines.push(`${indent}"${key}": ${JSON.stringify(value, null, 2)}${comma}`); + } else { + const outputValue = templateValue !== undefined ? templateValue : value; + lines.push(`${indent}"${key}": ${formatValue(outputValue)}${comma}`); + } + }); +}; + +const lines = []; +lines.push('{'); +renderProperties(schema, lines, ' ', '', templateDefaults); +lines.push('}'); +lines.push(''); + +await fs.writeFile(outPath, `${lines.join('\n')}\n`, 'utf8'); +console.log(`Wrote ${outPath}`); diff --git a/tools/generate-repo-dict.js b/tools/generate-repo-dict.js index faaf61b74..9732aed7e 100644 --- a/tools/generate-repo-dict.js +++ b/tools/generate-repo-dict.js @@ -3,16 +3,21 @@ import fs from 'node:fs/promises'; import fsSync from 'node:fs'; import path from 'node:path'; import { spawnSync } from 'node:child_process'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import ignore from 'ignore'; import { getDictConfig, getRepoDictPath, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; import { splitId } from '../src/shared/tokenize.js'; -const argv = minimist(process.argv.slice(2), { - string: ['out', 'extensions', 'repo'], - boolean: ['include-prose'], - default: { 'min-count': 3, 'include-prose': false } -}); +const argv = createCli({ + scriptName: 'generate-repo-dict', + options: { + out: { type: 'string' }, + extensions: { type: 'string' }, + repo: { type: 'string' }, + 'include-prose': { type: 'boolean', default: false }, + 'min-count': { type: 'number', default: 3 } + } +}).parse(); const rootArg = argv.repo ? 
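// Quick check of the renderProperties walker above with a one-property schema.
// The property name "use" and its template default are assumptions for the sketch;
// the real inputs come from docs/config-schema.json and the default-config template.
// For a boolean with a template default of true the emitted JSONC lines are:
//   // Accepted values: true, false
//   // Default: true
//   "use": true
const demoLines = [];
renderProperties(
  { properties: { use: { type: 'boolean', default: true } } },
  demoLines, '  ', '', { use: true }
);
console.log(demoLines.join('\n'));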
path.resolve(argv.repo) : null; const repoRoot = rootArg || resolveRepoRoot(process.cwd()); diff --git a/tools/get-last-failure.js b/tools/get-last-failure.js new file mode 100644 index 000000000..85df1df07 --- /dev/null +++ b/tools/get-last-failure.js @@ -0,0 +1,109 @@ +#!/usr/bin/env node +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; + +const root = process.cwd(); +const candidates = []; + +const readPrefix = async (filePath, maxBytes) => { + try { + const handle = await fs.open(filePath, 'r'); + try { + const { size } = await handle.stat(); + const readBytes = Math.min(size, maxBytes); + const buffer = Buffer.alloc(readBytes); + await handle.read(buffer, 0, readBytes, 0); + return buffer.toString('utf8'); + } finally { + await handle.close(); + } + } catch { + return ''; + } +}; + +const readSuffix = async (filePath, maxBytes) => { + try { + const handle = await fs.open(filePath, 'r'); + try { + const { size } = await handle.stat(); + const readBytes = Math.min(size, maxBytes); + const buffer = Buffer.alloc(readBytes); + const start = Math.max(0, size - readBytes); + await handle.read(buffer, 0, readBytes, start); + return buffer.toString('utf8'); + } finally { + await handle.close(); + } + } catch { + return ''; + } +}; + +const isFailureLog = async (filePath) => { + const prefix = await readPrefix(filePath, 4096); + if (/\bexit:\s*[1-9]\d*/i.test(prefix)) return true; + if (/\bFailed:/i.test(prefix) || /\buncaughtException\b/i.test(prefix)) return true; + const suffix = await readSuffix(filePath, 8192); + if (/\bFailed:/i.test(suffix) || /\buncaughtException\b/i.test(suffix)) return true; + return false; +}; + +const addCandidate = async (filePath) => { + try { + const stat = await fs.stat(filePath); + if (!stat.isFile()) return; + candidates.push({ path: filePath, mtimeMs: stat.mtimeMs }); + } catch { + // ignore missing or unreadable paths + } +}; + +const collectLogs = async (dirPath) => { + let entries; + try { + entries = await fs.readdir(dirPath, { withFileTypes: true }); + } catch { + return; + } + for (const entry of entries) { + const nextPath = path.join(dirPath, entry.name); + if (entry.isDirectory()) { + await collectLogs(nextPath); + continue; + } + if (!entry.isFile()) continue; + if (!entry.name.toLowerCase().endsWith('.log')) continue; + await addCandidate(nextPath); + } +}; + +const searchRoots = [ + path.join(root, 'tests', '.logs'), + path.join(root, 'benchmarks', 'results') +]; + +for (const dirPath of searchRoots) { + await collectLogs(dirPath); +} + +if (!candidates.length) { + console.error('No log files found.'); + process.exit(1); +} + +const failures = []; +for (const entry of candidates) { + if (await isFailureLog(entry.path)) { + failures.push(entry); + } +} + +const pick = (list) => list.sort((a, b) => b.mtimeMs - a.mtimeMs)[0]; +const selected = failures.length ? 
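// Sketch of how a harness might consume this tool: run it and read the newest
// failing (or, failing that, newest) log path from stdout. The script exits
// non-zero when no logs exist, so the call is wrapped in a try/catch.
import { execFileSync } from 'node:child_process';

let lastLogPath = null;
try {
  lastLogPath = execFileSync(process.execPath, ['tools/get-last-failure.js'], {
    encoding: 'utf8'
  }).trim();
} catch {
  // no .log files found under tests/.logs or benchmarks/results
}
console.log(lastLogPath ?? 'no log found');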
pick(failures) : pick(candidates); +if (!selected || !fsSync.existsSync(selected.path)) { + console.error('No log files found.'); + process.exit(1); +} +console.log(selected.path); diff --git a/tools/git-hooks.js b/tools/git-hooks.js index 23b24eb5d..9c983f485 100644 --- a/tools/git-hooks.js +++ b/tools/git-hooks.js @@ -2,14 +2,19 @@ import fs from 'node:fs/promises'; import fsSync from 'node:fs'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { resolveRepoRoot } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['install', 'uninstall', 'status'], - string: ['hooks', 'repo'], - default: { install: false, uninstall: false, status: false } -}); +const argv = createCli({ + scriptName: 'git-hooks', + options: { + install: { type: 'boolean', default: false }, + uninstall: { type: 'boolean', default: false }, + status: { type: 'boolean', default: false }, + hooks: { type: 'string' }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); @@ -31,7 +36,7 @@ ROOT="$(git rev-parse --show-toplevel 2>/dev/null)" if [ -z "$ROOT" ]; then exit 0 fi -node "$ROOT/build_index.js" --incremental +node "$ROOT/bin/pairofcleats.js" build-index --incremental --repo "$ROOT" `; const ensureHooksDir = async () => { diff --git a/tools/gtags-ingest.js b/tools/gtags-ingest.js new file mode 100644 index 000000000..ffadb7a19 --- /dev/null +++ b/tools/gtags-ingest.js @@ -0,0 +1,135 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import readline from 'node:readline'; +import { spawn } from 'node:child_process'; +import { createCli } from '../src/shared/cli.js'; +import { getRepoCacheRoot, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'gtags-ingest', + options: { + repo: { type: 'string' }, + input: { type: 'string' }, + out: { type: 'string' }, + json: { type: 'boolean', default: false }, + run: { type: 'boolean', default: false }, + global: { type: 'string', default: 'global' }, + args: { type: 'string' } + } +}).parse(); + +const repoRoot = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(repoRoot); +const cacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const outputPath = argv.out + ? path.resolve(argv.out) + : path.join(cacheRoot, 'gtags', 'gtags.jsonl'); +const metaPath = `${outputPath}.meta.json`; +const inputPath = argv.input ? String(argv.input) : null; +const runGlobal = argv.run === true; +const globalCmd = argv.global || 'global'; + +const toPosix = (value) => value.replace(/\\/g, '/'); +const normalizePath = (value) => { + if (!value) return null; + const raw = String(value); + const resolved = path.isAbsolute(raw) ? 
raw : path.resolve(repoRoot, raw); + const rel = path.relative(repoRoot, resolved); + return toPosix(rel || raw); +}; + +const stats = { + entries: 0, + errors: 0 +}; + +const ensureOutputDir = async () => { + await fsPromises.mkdir(path.dirname(outputPath), { recursive: true }); +}; + +const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' }); + +const parseGlobalLine = (line) => { + const trimmed = line.trim(); + if (!trimmed) return null; + const parts = trimmed.split(/\s+/); + if (parts.length < 3) return null; + const name = parts[0]; + const lineNo = Number.parseInt(parts[1], 10); + const file = normalizePath(parts.slice(2).join(' ')); + if (!name || !file || !Number.isFinite(lineNo)) return null; + return { file, name, line: lineNo }; +}; + +const ingestTextLines = async (stream) => { + const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); + for await (const line of rl) { + const parsed = parseGlobalLine(line); + if (!parsed) { + if (line.trim()) stats.errors += 1; + continue; + } + stats.entries += 1; + const payload = { + file: parsed.file, + ext: path.extname(parsed.file).toLowerCase(), + name: parsed.name, + startLine: parsed.line, + endLine: parsed.line, + role: 'definition', + source: 'gtags' + }; + writeStream.write(`${JSON.stringify(payload)}\n`); + } +}; + +const runGlobalCommand = async () => { + const args = ['-x']; + if (argv.args) { + const extra = String(argv.args) + .split(/\s+/) + .map((entry) => entry.trim()) + .filter(Boolean); + args.push(...extra); + } + const child = spawn(globalCmd, args, { cwd: repoRoot, stdio: ['ignore', 'pipe', 'pipe'] }); + child.stderr.on('data', (chunk) => process.stderr.write(chunk)); + await ingestTextLines(child.stdout); + const exitCode = await new Promise((resolve) => { + child.on('close', (code) => resolve(code ?? 0)); + }); + if (exitCode !== 0) { + throw new Error(`global exited with code ${exitCode}`); + } +}; + +await ensureOutputDir(); +if (runGlobal) { + await runGlobalCommand(); +} else if (inputPath && inputPath !== '-') { + const inputStream = fs.createReadStream(inputPath, { encoding: 'utf8' }); + await ingestTextLines(inputStream); +} else { + await ingestTextLines(process.stdin); +} + +writeStream.end(); + +const summary = { + generatedAt: new Date().toISOString(), + repoRoot: path.resolve(repoRoot), + input: inputPath || (runGlobal ? 'global' : 'stdin'), + output: path.resolve(outputPath), + stats +}; +await fsPromises.writeFile(metaPath, JSON.stringify(summary, null, 2)); + +if (argv.json) { + console.log(JSON.stringify(summary, null, 2)); +} else { + console.log(`GTAGS ingest: ${stats.entries} entries (${stats.errors} parse errors)`); + console.log(`- output: ${outputPath}`); + console.log(`- meta: ${metaPath}`); +} diff --git a/tools/index-validate.js b/tools/index-validate.js new file mode 100644 index 000000000..3e486a467 --- /dev/null +++ b/tools/index-validate.js @@ -0,0 +1,81 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { createCli } from '../src/shared/cli.js'; +import { resolveRepoRoot } from './dict-utils.js'; +import { validateIndexArtifacts } from '../src/index/validate.js'; + +const parseModes = (raw) => { + const tokens = String(raw || '') + .split(/[,\s]+/) + .map((token) => token.trim()) + .filter(Boolean); + const modeSet = new Set(tokens.length ? 
tokens : ['code', 'prose']); + if (modeSet.has('all')) return ['code', 'prose', 'records']; + return Array.from(modeSet); +}; + +async function runCli() { + const argv = createCli({ + scriptName: 'index-validate', + options: { + json: { type: 'boolean', default: false }, + repo: { type: 'string' }, + mode: { type: 'string' }, + 'index-root': { type: 'string' } + } + }).parse(); + + const rootArg = argv.repo ? path.resolve(argv.repo) : null; + const root = rootArg || resolveRepoRoot(process.cwd()); + const indexRoot = argv['index-root'] ? path.resolve(argv['index-root']) : null; + const modes = parseModes(argv.mode); + const report = await validateIndexArtifacts({ root, indexRoot, modes }); + + if (argv.json) { + console.log(JSON.stringify(report, null, 2)); + process.exit(report.ok ? 0 : 1); + } + + console.log('Index validation'); + console.log(`- repo: ${report.root}`); + for (const mode of modes) { + const entry = report.modes[mode]; + const status = entry.ok ? 'ok' : 'missing'; + console.log(`- ${mode}: ${status} (${entry.path})`); + if (entry.missing.length) { + console.log(` - missing: ${entry.missing.join(', ')}`); + } + if (entry.warnings.length) { + console.log(` - optional: ${entry.warnings.join(', ')}`); + } + } + if (report.sqlite.enabled) { + const status = report.sqlite.ok ? 'ok' : 'issues'; + console.log(`- sqlite: ${status} (mode=${report.sqlite.mode})`); + if (report.sqlite.issues.length) { + report.sqlite.issues.forEach((issue) => console.log(` - ${issue}`)); + } + } + + if (report.warnings.length && report.ok) { + console.log('Warnings:'); + report.warnings.forEach((warning) => console.log(`- ${warning}`)); + } + if (!report.ok) { + console.log('Issues:'); + report.issues.forEach((issue) => console.log(`- ${issue}`)); + } + if (report.hints?.length) { + console.log('Hints:'); + report.hints.forEach((hint) => console.log(`- ${hint}`)); + } + process.exit(report.ok ? 0 : 1); +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + runCli().catch((err) => { + console.error(err?.message || err); + process.exit(1); + }); +} diff --git a/tools/indexer-service.js b/tools/indexer-service.js new file mode 100644 index 000000000..eb7ab1c90 --- /dev/null +++ b/tools/indexer-service.js @@ -0,0 +1,396 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { spawn } from 'node:child_process'; +import { createCli } from '../src/shared/cli.js'; +import { resolveRepoRoot, getCacheRoot, getRepoCacheRoot, resolveToolRoot } from './dict-utils.js'; +import { getServiceConfigPath, loadServiceConfig, resolveRepoRegistry } from './service/config.js'; +import { ensureQueueDir, enqueueJob, claimNextJob, completeJob, queueSummary, resolveQueueName, requeueStaleJobs, touchJobHeartbeat } from './service/queue.js'; +import { ensureRepo, resolveRepoPath } from './service/repos.js'; + +const argv = createCli({ + scriptName: 'indexer-service', + options: { + config: { type: 'string' }, + repo: { type: 'string' }, + mode: { type: 'string', default: 'both' }, + reason: { type: 'string' }, + stage: { type: 'string' }, + command: { type: 'string' }, + watch: { type: 'boolean', default: false }, + interval: { type: 'number' }, + concurrency: { type: 'number' }, + queue: { type: 'string', default: 'index' } + } +}).parse(); + +const command = argv.command || String(argv._[0] || ''); +const parsedUv = Number(process.env.UV_THREADPOOL_SIZE); +const effectiveUvThreadpoolSize = Number.isFinite(parsedUv) && parsedUv > 0 ? 
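// Sketch of driving the index validator from another script: request JSON output
// and treat a non-zero exit as "index incomplete". With --json the report is
// printed to stdout even when validation fails, so it can still be parsed from the
// thrown error's captured stdout.
import { execFileSync } from 'node:child_process';

const readValidationReport = (repoPath) => {
  const args = ['tools/index-validate.js', '--json', '--repo', repoPath];
  try {
    return JSON.parse(execFileSync(process.execPath, args, { encoding: 'utf8' }));
  } catch (err) {
    const raw = err && err.stdout ? String(err.stdout) : '';
    return raw ? JSON.parse(raw) : { ok: false, issues: ['no report produced'] };
  }
};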
Math.floor(parsedUv) : null; +if (command === 'serve' || argv.watch) { + console.error(`[indexer-service] UV_THREADPOOL_SIZE: ${effectiveUvThreadpoolSize ?? 'default'}`); +} + +const configPath = getServiceConfigPath(argv.config || null); +const config = loadServiceConfig(configPath); +const repoEntries = resolveRepoRegistry(config, configPath); +const baseDir = config.baseDir + ? path.resolve(config.baseDir) + : path.join(getCacheRoot(), 'service', 'repos'); +const queueDir = config.queueDir + ? path.resolve(config.queueDir) + : path.join(getCacheRoot(), 'service', 'queue'); +const queueName = argv.queue || 'index'; +const resolvedQueueName = resolveQueueName(queueName, { + reason: queueName === 'embeddings' ? 'embeddings' : null, + stage: argv.stage || null, + mode: argv.mode || null +}); + +const resolveRepoEntry = (repoArg) => { + if (!repoArg) return null; + const resolved = path.resolve(repoArg); + return repoEntries.find((entry) => resolveRepoPath(entry, baseDir) === resolved) + || repoEntries.find((entry) => entry.id === repoArg) + || { id: repoArg, path: resolved, syncPolicy: 'none' }; +}; + +const formatJobId = () => `${Date.now()}-${Math.random().toString(16).slice(2, 10)}`; + +const toolRoot = resolveToolRoot(); + +const BUILD_STATE_FILE = 'build_state.json'; +const BUILD_STATE_POLL_MS = 5000; +const BUILD_STATE_LOOKBACK_MS = 5 * 60 * 1000; + +const resolveBuildsRoot = (repoCacheRoot) => path.join(repoCacheRoot, 'builds'); + +const readBuildState = async (buildRoot) => { + if (!buildRoot) return null; + const statePath = path.join(buildRoot, BUILD_STATE_FILE); + try { + const raw = await fsPromises.readFile(statePath, 'utf8'); + const parsed = JSON.parse(raw); + return parsed && typeof parsed === 'object' ? { state: parsed, path: statePath } : null; + } catch { + return null; + } +}; + +const listBuildStateCandidates = async (repoCacheRoot) => { + const buildsRoot = resolveBuildsRoot(repoCacheRoot); + let entries; + try { + entries = await fsPromises.readdir(buildsRoot, { withFileTypes: true }); + } catch { + return []; + } + const candidates = []; + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const buildRoot = path.join(buildsRoot, entry.name); + const statePath = path.join(buildRoot, BUILD_STATE_FILE); + try { + const stat = await fsPromises.stat(statePath); + candidates.push({ buildRoot, statePath, mtimeMs: stat.mtimeMs }); + } catch {} + } + return candidates.sort((a, b) => b.mtimeMs - a.mtimeMs); +}; + +const pickBuildState = async (repoCacheRoot, stage, sinceMs) => { + const candidates = await listBuildStateCandidates(repoCacheRoot); + for (const candidate of candidates) { + if (Number.isFinite(sinceMs) && candidate.mtimeMs < sinceMs) continue; + const loaded = await readBuildState(candidate.buildRoot); + if (!loaded) continue; + const state = loaded.state; + if (stage && state?.stage && state.stage !== stage) continue; + if (stage && state?.phases?.[stage]?.status === 'failed') continue; + return { buildRoot: candidate.buildRoot, state: loaded.state, path: loaded.path }; + } + return null; +}; + +const formatDuration = (ms) => { + const total = Math.max(0, Math.floor(ms / 1000)); + const hours = Math.floor(total / 3600); + const minutes = Math.floor((total % 3600) / 60); + const seconds = total % 60; + if (hours > 0) return `${hours}h ${minutes}m ${seconds}s`; + if (minutes > 0) return `${minutes}m ${seconds}s`; + return `${seconds}s`; +}; + +const formatProgressLine = ({ jobId, stage, state }) => { + if (!state) return null; + const phases = 
state?.phases || {}; + const phase = stage ? phases?.[stage] : null; + const phaseOrder = ['discovery', 'preprocessing', stage, 'validation', 'promote'].filter(Boolean); + const activePhase = phaseOrder.find((name) => phases?.[name]?.status === 'running'); + const startedAtRaw = phase?.startedAt || state?.createdAt || null; + const startedAt = startedAtRaw ? Date.parse(startedAtRaw) : null; + const now = Date.now(); + const elapsedMs = Number.isFinite(startedAt) ? Math.max(0, now - startedAt) : null; + const progress = state?.progress || {}; + let processedTotal = 0; + let totalFiles = 0; + const modeParts = []; + for (const [mode, data] of Object.entries(progress)) { + const processed = Number(data?.processedFiles); + const total = Number(data?.totalFiles); + if (!Number.isFinite(processed) || !Number.isFinite(total) || total <= 0) continue; + processedTotal += processed; + totalFiles += total; + modeParts.push(`${mode} ${processed}/${total}`); + } + const etaMs = (elapsedMs && processedTotal > 0 && totalFiles > processedTotal) + ? ((totalFiles - processedTotal) / (processedTotal / (elapsedMs / 1000))) * 1000 + : null; + const elapsedText = elapsedMs !== null ? formatDuration(elapsedMs) : 'n/a'; + const etaText = Number.isFinite(etaMs) ? formatDuration(etaMs) : 'n/a'; + const status = phase?.status || state?.stage || 'running'; + const progressText = modeParts.length + ? modeParts.join(' | ') + : 'progress pending'; + const phaseNote = activePhase && activePhase !== stage ? ` | phase ${activePhase} running` : ''; + return `[indexer] job ${jobId} ${stage || state?.stage || 'stage'} ${status} | ${progressText}${phaseNote} | elapsed ${elapsedText} | eta ${etaText}`; +}; + +const startBuildProgressMonitor = ({ job, repoPath, stage }) => { + if (!job || !repoPath) return () => {}; + const repoCacheRoot = getRepoCacheRoot(repoPath); + const startedAt = Date.now(); + let active = null; + let waitingLogged = false; + let lastLine = ''; + const poll = async () => { + if (!active) { + active = await pickBuildState(repoCacheRoot, stage, startedAt - BUILD_STATE_LOOKBACK_MS); + } + if (!active) { + if (!waitingLogged) { + console.log(`[indexer] job ${job.id} ${stage || 'stage'} running; waiting for build state...`); + waitingLogged = true; + } + return; + } + const loaded = await readBuildState(active.buildRoot); + if (loaded?.state) active.state = loaded.state; + const line = formatProgressLine({ jobId: job.id, stage, state: active.state }); + if (line && line !== lastLine) { + console.log(line); + lastLine = line; + } + }; + const timer = setInterval(() => { + void poll(); + }, BUILD_STATE_POLL_MS); + void poll(); + return () => clearInterval(timer); +}; + +const spawnWithLog = (args, extraEnv = {}, logPath = null) => new Promise((resolve) => { + const useLog = typeof logPath === 'string' && logPath.trim(); + const stdio = useLog ? ['ignore', 'pipe', 'pipe'] : 'inherit'; + const child = spawn(process.execPath, args, { stdio, env: { ...process.env, ...extraEnv } }); + let stream = null; + if (useLog) { + fs.mkdirSync(path.dirname(logPath), { recursive: true }); + stream = fs.createWriteStream(logPath, { flags: 'a' }); + stream.write(`[${new Date().toISOString()}] job start\n`); + child.stdout.pipe(stream); + child.stderr.pipe(stream); + } + child.on('close', (code) => { + if (stream) { + stream.write(`[${new Date().toISOString()}] job exit ${code ?? 1}\n`); + stream.end(); + } + resolve(code ?? 
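// Worked example of the ETA estimate in formatProgressLine above, with illustrative
// numbers: 120 of 480 files processed after 60s is 2 files/s, so the remaining 360
// files are estimated at 180s.
const exampleProcessed = 120;
const exampleTotal = 480;
const exampleElapsedMs = 60_000;
const exampleEtaMs = ((exampleTotal - exampleProcessed)
  / (exampleProcessed / (exampleElapsedMs / 1000))) * 1000;
console.log(formatDuration(exampleElapsedMs)); // "1m 0s"
console.log(formatDuration(exampleEtaMs));     // "3m 0s"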
1); + }); +}); + +const runBuildIndex = (repoPath, mode, stage, extraArgs = null, logPath = null) => { + const buildPath = path.join(toolRoot, 'build_index.js'); + const args = [buildPath]; + if (Array.isArray(extraArgs) && extraArgs.length) { + args.push(...extraArgs); + } else { + args.push('--repo', repoPath); + if (mode && mode !== 'both') args.push('--mode', mode); + if (stage) args.push('--stage', stage); + } + return spawnWithLog(args, {}, logPath); +}; + +const runBuildEmbeddings = (repoPath, mode, extraEnv = {}, logPath = null) => { + const buildPath = path.join(toolRoot, 'tools', 'build-embeddings.js'); + const args = [buildPath, '--repo', repoPath]; + if (mode && mode !== 'both') args.push('--mode', mode); + return spawnWithLog(args, extraEnv, logPath); +}; + +const handleSync = async () => { + const targets = argv.repo ? [resolveRepoEntry(argv.repo)].filter(Boolean) : repoEntries; + if (!targets.length) { + console.error('No repos configured for sync.'); + process.exit(1); + } + const policy = config.sync?.policy || 'pull'; + const results = []; + for (const entry of targets) { + const result = await ensureRepo(entry, baseDir, policy); + results.push({ id: entry.id || entry.path, ...result }); + } + console.log(JSON.stringify({ ok: true, results }, null, 2)); +}; + +const handleEnqueue = async () => { + const target = resolveRepoEntry(argv.repo || resolveRepoRoot(process.cwd())); + if (!target) { + console.error('Repo not found for enqueue.'); + process.exit(1); + } + await ensureQueueDir(queueDir); + const queueConfig = queueName === 'embeddings' + ? (config.embeddings?.queue || {}) + : (config.queue || {}); + const id = formatJobId(); + const mode = argv.mode || 'both'; + const result = await enqueueJob(queueDir, { + id, + createdAt: new Date().toISOString(), + repo: resolveRepoPath(target, baseDir) || target.path, + mode, + reason: argv.reason || null, + stage: argv.stage || null, + maxRetries: queueConfig.maxRetries ?? null + }, queueConfig.maxQueued ?? null, queueName); + if (!result.ok) { + console.error(result.message || 'Failed to enqueue job.'); + process.exit(1); + } + console.log(JSON.stringify({ ok: true, job: result.job }, null, 2)); +}; + +const handleStatus = async () => { + const summary = await queueSummary(queueDir, resolvedQueueName); + console.log(JSON.stringify({ ok: true, queue: summary, name: resolvedQueueName }, null, 2)); +}; + +const processQueueOnce = async (metrics) => { + const queueConfig = queueName === 'embeddings' + ? (config.embeddings?.queue || {}) + : (config.queue || {}); + await requeueStaleJobs(queueDir, resolvedQueueName, { + maxRetries: Number.isFinite(queueConfig.maxRetries) ? queueConfig.maxRetries : 2 + }); + const job = await claimNextJob(queueDir, resolvedQueueName); + if (!job) return false; + metrics.processed += 1; + const embedWorkerConfig = config.embeddings?.worker || {}; + const memoryMb = Number.isFinite(Number(embedWorkerConfig.maxMemoryMb)) + ? Math.max(128, Math.floor(Number(embedWorkerConfig.maxMemoryMb))) + : null; + const extraEnv = memoryMb + ? { NODE_OPTIONS: `${process.env.NODE_OPTIONS || ''} --max-old-space-size=${memoryMb}`.trim() } + : {}; + const heartbeat = setInterval(() => { + void touchJobHeartbeat(queueDir, job.id, resolvedQueueName); + }, 30000); + const logPath = job.logPath || path.join(queueDir, 'logs', `${job.id}.log`); + const stopProgress = queueName === 'index' + ? startBuildProgressMonitor({ job, repoPath: job.repo, stage: job.stage }) + : () => {}; + const exitCode = queueName === 'embeddings' + ? 
await runBuildEmbeddings(job.repo, job.mode, extraEnv, logPath) + : await runBuildIndex(job.repo, job.mode, job.stage, job.args, logPath); + stopProgress(); + clearInterval(heartbeat); + const status = exitCode === 0 ? 'done' : 'failed'; + const attempts = Number.isFinite(job.attempts) ? job.attempts : 0; + const maxRetries = Number.isFinite(job.maxRetries) + ? job.maxRetries + : (Number.isFinite(queueConfig.maxRetries) ? queueConfig.maxRetries : 0); + if (status === 'failed' && maxRetries > attempts) { + const nextAttempts = attempts + 1; + metrics.retried += 1; + await completeJob( + queueDir, + job.id, + 'queued', + { exitCode, retry: true, attempts: nextAttempts, error: `exit ${exitCode}` }, + resolvedQueueName + ); + return true; + } + if (status === 'done') { + metrics.succeeded += 1; + } else { + metrics.failed += 1; + } + await completeJob(queueDir, job.id, status, { exitCode, error: `exit ${exitCode}` }, resolvedQueueName); + return true; +}; + +const handleWork = async () => { + await ensureQueueDir(queueDir); + const workerConfig = queueName === 'embeddings' + ? (config.embeddings?.worker || {}) + : (config.worker || {}); + const concurrency = Number.isFinite(Number(argv.concurrency)) + ? Math.max(1, Number(argv.concurrency)) + : (workerConfig.concurrency || 1); + const intervalMs = Number.isFinite(Number(argv.interval)) + ? Math.max(100, Number(argv.interval)) + : (config.sync?.intervalMs || 5000); + const runBatch = async () => { + const metrics = { processed: 0, succeeded: 0, failed: 0, retried: 0 }; + const workers = Array.from({ length: concurrency }, async () => { + let worked = true; + while (worked) { + worked = await processQueueOnce(metrics); + } + }); + await Promise.all(workers); + if (metrics.processed) { + console.log(JSON.stringify({ + ok: true, + queue: resolvedQueueName, + metrics, + at: new Date().toISOString() + }, null, 2)); + } + }; + await runBatch(); + if (argv.watch) { + while (true) { + await new Promise((resolve) => setTimeout(resolve, intervalMs)); + await runBatch(); + } + } +}; + +const handleServe = async () => { + const apiPath = path.join(toolRoot, 'tools', 'api-server.js'); + const repoArg = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); + const child = spawn(process.execPath, [apiPath, '--repo', repoArg], { stdio: 'inherit' }); + child.on('exit', (code) => process.exit(code ?? 0)); +}; + +if (command === 'sync') { + await handleSync(); +} else if (command === 'enqueue') { + await handleEnqueue(); +} else if (command === 'work') { + await handleWork(); +} else if (command === 'status') { + await handleStatus(); +} else if (command === 'serve') { + await handleServe(); +} else { + console.error('Usage: indexer-service [--queue index|embeddings] [--stage stage1|stage2|stage3|stage4]'); + process.exit(1); +} diff --git a/tools/lsif-ingest.js b/tools/lsif-ingest.js new file mode 100644 index 000000000..eed864c39 --- /dev/null +++ b/tools/lsif-ingest.js @@ -0,0 +1,188 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import readline from 'node:readline'; +import { createCli } from '../src/shared/cli.js'; +import { getRepoCacheRoot, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'lsif-ingest', + options: { + repo: { type: 'string' }, + input: { type: 'string' }, + out: { type: 'string' }, + json: { type: 'boolean', default: false } + } +}).parse(); + +const repoRoot = argv.repo ? 
path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(repoRoot); +const cacheRoot = getRepoCacheRoot(repoRoot, userConfig); +const inputPath = argv.input ? String(argv.input) : null; +const outputPath = argv.out + ? path.resolve(argv.out) + : path.join(cacheRoot, 'lsif', 'lsif.jsonl'); +const metaPath = `${outputPath}.meta.json`; + +const toPosix = (value) => value.replace(/\\/g, '/'); +const normalizePath = (value) => { + if (!value) return null; + let raw = String(value); + const posixRaw = raw.replace(/\\/g, '/'); + if (posixRaw === '/repo') return ''; + if (posixRaw.startsWith('/repo/')) { + return posixRaw.slice('/repo/'.length); + } + if (posixRaw.startsWith('/') && /^[A-Za-z]:\//.test(posixRaw.slice(1))) { + raw = posixRaw.slice(1); + } + const resolved = path.isAbsolute(raw) ? raw : path.resolve(repoRoot, raw); + const rel = path.relative(repoRoot, resolved); + return toPosix(rel || raw); +}; + +const stats = { + vertices: 0, + edges: 0, + definitions: 0, + references: 0, + errors: 0, + kinds: {}, + languages: {} +}; + +const bump = (bucket, key) => { + if (!key) return; + const k = String(key); + bucket[k] = (bucket[k] || 0) + 1; +}; + +const ensureOutputDir = async () => { + await fsPromises.mkdir(path.dirname(outputPath), { recursive: true }); +}; + +const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' }); + +const vertexById = new Map(); +const docById = new Map(); +const rangeById = new Map(); +const rangeToDoc = new Map(); + +const normalizeRange = (range) => { + if (!range || typeof range !== 'object') return null; + const start = range.start || {}; + const end = range.end || {}; + const startLine = Number.isFinite(Number(start.line)) ? Number(start.line) + 1 : null; + const endLine = Number.isFinite(Number(end.line)) ? Number(end.line) + 1 : startLine; + return { + startLine, + endLine, + startChar: Number.isFinite(Number(start.character)) ? Number(start.character) : null, + endChar: Number.isFinite(Number(end.character)) ? Number(end.character) : null + }; +}; + +const recordEntry = (payload) => { + writeStream.write(`${JSON.stringify(payload)}\n`); +}; + +const handleVertex = (vertex) => { + vertexById.set(vertex.id, vertex); + const label = vertex.label || vertex.type || null; + bump(stats.kinds, label || 'unknown'); + if (label === 'document' && vertex.uri) { + docById.set(vertex.id, vertex); + } + if (label === 'range') { + rangeById.set(vertex.id, vertex); + } + stats.vertices += 1; +}; + +const handleEdge = (edge) => { + stats.edges += 1; + const label = edge.label || edge.type || null; + if (label === 'contains' && edge.outV != null && Array.isArray(edge.inVs)) { + const outVertex = vertexById.get(edge.outV); + if (outVertex && (outVertex.label === 'document' || outVertex.type === 'document')) { + for (const id of edge.inVs) { + rangeToDoc.set(id, outVertex); + } + } + } + if (label === 'item' && edge.outV != null && Array.isArray(edge.inVs)) { + const doc = rangeToDoc.get(edge.outV) || null; + const docUri = doc?.uri || null; + const file = docUri ? normalizePath(new URL(docUri).pathname) : null; + if (!file) return; + const range = rangeById.get(edge.outV); + const normalized = normalizeRange(range); + for (const inV of edge.inVs) { + const inVertex = vertexById.get(inV); + const inLabel = inVertex?.label || inVertex?.type || null; + const role = inLabel === 'definitionResult' ? 'definition' + : inLabel === 'referenceResult' ? 
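// Illustrative inputs for normalizePath above. LSIF document URIs arrive as URL
// pathnames; container-style "/repo/..." prefixes are stripped directly, while
// Windows-style "/C:/..." pathnames drop the leading slash and are resolved
// against repoRoot. The repoRoot value below is an assumption for the sketch.
//   normalizePath('/repo/src/index.js')          // => 'src/index.js'
//   normalizePath('/repo')                       // => ''
//   // with repoRoot === 'C:\\work\\proj' on Windows:
//   normalizePath('/C:/work/proj/src/index.js')  // => 'src/index.js'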
'reference' + : 'other'; + if (role === 'definition') stats.definitions += 1; + if (role === 'reference') stats.references += 1; + bump(stats.languages, doc?.languageId || 'unknown'); + recordEntry({ + file, + ext: path.extname(file).toLowerCase(), + name: range?.tag || range?.text || null, + kind: range?.kind || null, + startLine: normalized?.startLine ?? null, + endLine: normalized?.endLine ?? null, + startChar: normalized?.startChar ?? null, + endChar: normalized?.endChar ?? null, + role, + language: doc?.languageId || null + }); + } + } +}; + +const ingestJsonLines = async (stream) => { + const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); + for await (const line of rl) { + const trimmed = line.trim(); + if (!trimmed) continue; + let parsed = null; + try { + parsed = JSON.parse(trimmed); + } catch { + stats.errors += 1; + continue; + } + if (parsed && parsed.type === 'vertex') handleVertex(parsed); + else if (parsed && parsed.type === 'edge') handleEdge(parsed); + } +}; + +await ensureOutputDir(); +if (inputPath && inputPath !== '-') { + const inputStream = fs.createReadStream(inputPath, { encoding: 'utf8' }); + await ingestJsonLines(inputStream); +} else { + await ingestJsonLines(process.stdin); +} + +writeStream.end(); + +const summary = { + generatedAt: new Date().toISOString(), + repoRoot: path.resolve(repoRoot), + input: inputPath || 'stdin', + output: path.resolve(outputPath), + stats +}; +await fsPromises.writeFile(metaPath, JSON.stringify(summary, null, 2)); + +if (argv.json) { + console.log(JSON.stringify(summary, null, 2)); +} else { + console.log(`LSIF ingest: ${stats.vertices} vertices, ${stats.edges} edges`); + console.log(`- output: ${outputPath}`); + console.log(`- meta: ${metaPath}`); +} diff --git a/tools/map-iso-serve.js b/tools/map-iso-serve.js new file mode 100644 index 000000000..5e5950e36 --- /dev/null +++ b/tools/map-iso-serve.js @@ -0,0 +1,178 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; +import https from 'node:https'; +import { spawnSync, spawn } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; +import { createCli } from '../src/shared/cli.js'; +import selfsigned from 'selfsigned'; + +const argv = createCli({ + scriptName: 'map-iso', + options: { + repo: { type: 'string', describe: 'Repo root.' }, + dir: { type: 'string', describe: 'Alias for --repo.' }, + out: { type: 'string', describe: 'Output HTML path.' }, + port: { type: 'number', default: 0, describe: 'HTTPS port (0 for random).' }, + 'open-uri-template': { type: 'string', describe: 'URI template for double-click.' }, + 'three-url': { type: 'string', describe: 'Override three.js module URL.' }, + 'cert-dir': { type: 'string', describe: 'Directory for TLS key/cert.' }, + open: { type: 'boolean', default: true, describe: 'Open browser.' } + } +}).parse(); + +const toolRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); +const repoRoot = argv.repo ? path.resolve(argv.repo) + : (argv.dir ? path.resolve(argv.dir) : process.cwd()); +const mapsDir = path.join(repoRoot, '.pairofcleats', 'maps'); +const outPath = argv.out ? path.resolve(argv.out) : path.join(mapsDir, 'map.iso.html'); +const threeUrl = argv['three-url'] || '/three/three.module.js'; +const certDir = argv['cert-dir'] ? path.resolve(argv['cert-dir']) : path.join(mapsDir, '.certs'); +const port = Number.isFinite(argv.port) ? 
argv.port : 0; + +const ensureDir = (targetPath) => { + fs.mkdirSync(targetPath, { recursive: true }); +}; + +const ensureCert = (targetDir) => { + ensureDir(targetDir); + const keyPath = path.join(targetDir, 'localhost.key'); + const certPath = path.join(targetDir, 'localhost.crt'); + if (fs.existsSync(keyPath) && fs.existsSync(certPath)) { + return { key: fs.readFileSync(keyPath), cert: fs.readFileSync(certPath) }; + } + const attrs = [{ name: 'commonName', value: 'localhost' }]; + const pems = selfsigned.generate(attrs, { days: 30, keySize: 2048 }); + fs.writeFileSync(keyPath, pems.private); + fs.writeFileSync(certPath, pems.cert); + return { key: pems.private, cert: pems.cert }; +}; + +const runReport = () => { + ensureDir(path.dirname(outPath)); + const args = [ + path.join(toolRoot, 'tools', 'report-code-map.js'), + '--repo', repoRoot, + '--format', 'html-iso', + '--out', outPath, + '--three-url', threeUrl + ]; + if (argv['open-uri-template']) { + args.push('--open-uri-template', argv['open-uri-template']); + } + const result = spawnSync(process.execPath, args, { cwd: toolRoot, stdio: 'inherit' }); + if (result.status !== 0) { + process.exit(result.status ?? 1); + } +}; + +const contentTypeFor = (filePath) => { + const ext = path.extname(filePath).toLowerCase(); + if (ext === '.html') return 'text/html; charset=utf-8'; + if (ext === '.js') return 'application/javascript; charset=utf-8'; + if (ext === '.json') return 'application/json; charset=utf-8'; + if (ext === '.map') return 'application/json; charset=utf-8'; + if (ext === '.jpg' || ext === '.jpeg') return 'image/jpeg'; + if (ext === '.png') return 'image/png'; + if (ext === '.hdr') return 'application/octet-stream'; + return 'application/octet-stream'; +}; + +const safeJoin = (baseDir, requestPath) => { + const safePath = path.normalize(path.join(baseDir, requestPath)); + if (!safePath.startsWith(baseDir)) return null; + return safePath; +}; + +const openBrowser = (url) => { + if (argv.open === false) return; + if (process.platform === 'win32') { + spawn('cmd', ['/c', 'start', '', url], { detached: true, stdio: 'ignore' }); + return; + } + const opener = process.platform === 'darwin' ? 
'open' : 'xdg-open'; + spawn(opener, [url], { detached: true, stdio: 'ignore' }); +}; + +runReport(); + +const { key, cert } = ensureCert(certDir); +const threeRoot = path.join(toolRoot, 'node_modules', 'three'); +const threeBuildRoot = path.join(threeRoot, 'build'); +const threeExamplesRoot = path.join(threeRoot, 'examples'); +const isomapAssetsRoot = path.join(toolRoot, 'assets', 'isomap'); +const isomapClientRoot = path.join(toolRoot, 'src', 'map', 'isometric', 'client'); + +const server = https.createServer({ key, cert }, (req, res) => { + const url = new URL(req.url || '/', 'https://localhost'); + const pathname = decodeURIComponent(url.pathname || '/'); + if (pathname === '/' || pathname === '/map.iso.html') { + const htmlPath = outPath; + if (!fs.existsSync(htmlPath)) { + res.writeHead(404); + res.end('map.iso.html not found.'); + return; + } + res.writeHead(200, { 'Content-Type': contentTypeFor(htmlPath) }); + fs.createReadStream(htmlPath).pipe(res); + return; + } + if (pathname.startsWith('/three/examples/')) { + const relativePath = pathname.replace('/three/examples/', ''); + const targetPath = safeJoin(threeExamplesRoot, relativePath); + if (!targetPath || !fs.existsSync(targetPath)) { + res.writeHead(404); + res.end('three.js example asset not found.'); + return; + } + res.writeHead(200, { 'Content-Type': contentTypeFor(targetPath) }); + fs.createReadStream(targetPath).pipe(res); + return; + } + if (pathname.startsWith('/three/')) { + const relativePath = pathname.replace('/three/', ''); + const targetPath = safeJoin(threeBuildRoot, relativePath); + if (!targetPath || !fs.existsSync(targetPath)) { + res.writeHead(404); + res.end('three.js asset not found.'); + return; + } + res.writeHead(200, { 'Content-Type': contentTypeFor(targetPath) }); + fs.createReadStream(targetPath).pipe(res); + return; + } + if (pathname.startsWith('/assets/isomap/')) { + const relativePath = pathname.replace('/assets/isomap/', ''); + const targetPath = safeJoin(isomapAssetsRoot, relativePath); + if (!targetPath || !fs.existsSync(targetPath)) { + res.writeHead(404); + res.end('isomap asset not found.'); + return; + } + res.writeHead(200, { 'Content-Type': contentTypeFor(targetPath) }); + fs.createReadStream(targetPath).pipe(res); + return; + } + if (pathname.startsWith('/isomap/')) { + const relativePath = pathname.replace('/isomap/', ''); + const targetPath = safeJoin(isomapClientRoot, relativePath); + if (!targetPath || !fs.existsSync(targetPath)) { + res.writeHead(404); + res.end('isomap client asset not found.'); + return; + } + res.writeHead(200, { 'Content-Type': contentTypeFor(targetPath) }); + fs.createReadStream(targetPath).pipe(res); + return; + } + res.writeHead(404); + res.end('Not found.'); +}); + +server.listen(port, '127.0.0.1', () => { + const address = server.address(); + const actualPort = typeof address === 'object' && address ? 
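// Sketch of fetching the served page programmatically. ensureCert above issues a
// self-signed certificate, so a client has to opt out of CA verification (or trust
// the generated .crt explicitly). The port below is an assumption; the actual port
// is printed by the listen callback.
import https from 'node:https';

https.get('https://localhost:8443/map.iso.html', { rejectUnauthorized: false }, (res) => {
  console.log('map page status:', res.statusCode);
  res.resume();
});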
address.port : port; + const url = `https://localhost:${actualPort}/map.iso.html`; + console.log(`Serving map: ${url}`); + openBrowser(url); +}); diff --git a/tools/mcp-server.js b/tools/mcp-server.js index bfcacb7a5..88f52c364 100644 --- a/tools/mcp-server.js +++ b/tools/mcp-server.js @@ -1,1284 +1,58 @@ #!/usr/bin/env node import fs from 'node:fs'; import path from 'node:path'; -import { fileURLToPath } from 'node:url'; -import { spawn, spawnSync } from 'node:child_process'; -import simpleGit from 'simple-git'; -import { getToolDefs } from '../src/mcp/defs.js'; -import { sendError, sendNotification, sendResult } from '../src/mcp/protocol.js'; -import { - DEFAULT_MODEL_ID, - getCacheRoot, - getDictConfig, - getDictionaryPaths, - getIndexDir, - getMetricsDir, - getModelConfig, - getRepoCacheRoot, - getRepoId, - loadUserConfig, - resolveRepoRoot, - resolveSqlitePaths -} from './dict-utils.js'; -import { getVectorExtensionConfig, resolveVectorExtensionPath } from './vector-extension.js'; +import { getToolDefs } from '../src/integrations/mcp/defs.js'; +import { DEFAULT_MODEL_ID, getRuntimeConfig, loadUserConfig, resolveRepoRoot, resolveToolRoot } from './dict-utils.js'; +import { parseTimeoutMs, resolveToolTimeoutMs } from './mcp/repo.js'; +import { handleToolCall } from './mcp/tools.js'; +import { createMcpTransport } from './mcp/transport.js'; +import { configureServiceLogger } from './service/logger.js'; -const __dirname = path.dirname(fileURLToPath(import.meta.url)); -const ROOT = path.resolve(__dirname, '..'); -const PKG = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf8')); +const toolRoot = resolveToolRoot(); +const PKG = JSON.parse(fs.readFileSync(path.join(toolRoot, 'package.json'), 'utf8')); const TOOL_DEFS = getToolDefs(DEFAULT_MODEL_ID); - -/** - * Resolve and validate a repo path. - * @param {string} inputPath - * @returns {string} - */ -function resolveRepoPath(inputPath) { - const base = inputPath ? path.resolve(inputPath) : process.cwd(); - if (!fs.existsSync(base) || !fs.statSync(base).isDirectory()) { - throw new Error(`Repo path not found: ${base}`); - } - return inputPath ? base : resolveRepoRoot(base); -} - -/** - * Build the artifact path map for a repo. 
- * @param {string} repoPath - * @param {object} userConfig - * @returns {object} - */ -function listArtifacts(repoPath, userConfig) { - const indexCode = getIndexDir(repoPath, 'code', userConfig); - const indexProse = getIndexDir(repoPath, 'prose', userConfig); - const indexRecords = getIndexDir(repoPath, 'records', userConfig); - const metricsDir = getMetricsDir(repoPath, userConfig); - const sqlitePaths = resolveSqlitePaths(repoPath, userConfig); - return { - index: { - code: { - dir: indexCode, - chunkMeta: path.join(indexCode, 'chunk_meta.json'), - tokenPostings: path.join(indexCode, 'token_postings.json') - }, - prose: { - dir: indexProse, - chunkMeta: path.join(indexProse, 'chunk_meta.json'), - tokenPostings: path.join(indexProse, 'token_postings.json') - }, - records: { - dir: indexRecords, - chunkMeta: path.join(indexRecords, 'chunk_meta.json'), - tokenPostings: path.join(indexRecords, 'token_postings.json') - } - }, - metrics: { - dir: metricsDir, - indexCode: path.join(metricsDir, 'index-code.json'), - indexProse: path.join(metricsDir, 'index-prose.json'), - indexRecords: path.join(metricsDir, 'index-records.json'), - queryCache: path.join(metricsDir, 'queryCache.json') - }, - sqlite: { - code: sqlitePaths.codePath, - prose: sqlitePaths.prosePath, - legacy: sqlitePaths.legacyPath, - legacyExists: sqlitePaths.legacyExists - } - }; -} - -/** - * Stat a path if it exists. - * @param {string} target - * @returns {{exists:boolean,mtime:(string|null),bytes:number}} - */ -function statIfExists(target) { - try { - const stat = fs.statSync(target); - return { - exists: true, - mtime: stat.mtime ? stat.mtime.toISOString() : null, - bytes: stat.size - }; - } catch { - return { exists: false, mtime: null, bytes: 0 }; - } -} - -/** - * Fetch lightweight git status info for a repo. - * @param {string} repoPath - * @returns {Promise} - */ -async function getGitInfo(repoPath) { - const gitDir = path.join(repoPath, '.git'); - const hasGitDir = fs.existsSync(gitDir); - if (!hasGitDir) { - return { - isRepo: false, - warning: 'Git repository not detected; using path-based repo identity.' - }; - } - try { - const git = simpleGit(repoPath); - const status = await git.status(); - const head = await git.revparse(['HEAD']); - return { - isRepo: true, - head: head.trim(), - branch: status.current || null, - isDirty: status.files.length > 0 - }; - } catch (error) { - return { - isRepo: true, - warning: `Git detected but status unavailable: ${error.message}` - }; - } -} - -/** - * Build an index status report for the MCP tool. 
- * @param {object} [args] - * @returns {Promise} - */ -async function indexStatus(args = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const userConfig = loadUserConfig(repoPath); - const cacheRoot = (userConfig.cache && userConfig.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); - const repoId = getRepoId(repoPath); - const repoCacheRoot = getRepoCacheRoot(repoPath, userConfig); - const dictConfig = getDictConfig(repoPath, userConfig); - const dictPaths = await getDictionaryPaths(repoPath, dictConfig); - const modelConfig = getModelConfig(repoPath, userConfig); - const modelsDir = modelConfig.dir; - const modelDirName = `models--${modelConfig.id.replace('/', '--')}`; - const modelPath = path.join(modelsDir, modelDirName); - - const artifacts = listArtifacts(repoPath, userConfig); - const git = await getGitInfo(repoPath); - const incrementalRoot = path.join(repoCacheRoot, 'incremental'); - const report = { - repoPath, - repoId, - cacheRoot, - repoCacheRoot, - git, - dictionaries: { - dir: dictConfig.dir, - files: dictPaths, - enabled: dictPaths.length > 0, - includeSlang: dictConfig.includeSlang - }, - models: { - dir: modelsDir, - model: modelConfig.id, - available: fs.existsSync(modelPath), - hint: fs.existsSync(modelPath) - ? null - : 'Run the download_models tool or `npm run download-models` to prefetch embeddings.' - }, - incremental: { - dir: incrementalRoot, - exists: fs.existsSync(incrementalRoot) - }, - index: { - code: { - dir: artifacts.index.code.dir, - chunkMeta: statIfExists(artifacts.index.code.chunkMeta), - tokenPostings: statIfExists(artifacts.index.code.tokenPostings) - }, - prose: { - dir: artifacts.index.prose.dir, - chunkMeta: statIfExists(artifacts.index.prose.chunkMeta), - tokenPostings: statIfExists(artifacts.index.prose.tokenPostings) - }, - records: { - dir: artifacts.index.records.dir, - chunkMeta: statIfExists(artifacts.index.records.chunkMeta), - tokenPostings: statIfExists(artifacts.index.records.tokenPostings) - } - }, - sqlite: { - code: { path: artifacts.sqlite.code, ...statIfExists(artifacts.sqlite.code) }, - prose: { path: artifacts.sqlite.prose, ...statIfExists(artifacts.sqlite.prose) }, - legacy: artifacts.sqlite.legacyExists ? artifacts.sqlite.legacy : null - }, - metrics: { - dir: artifacts.metrics.dir, - indexCode: statIfExists(artifacts.metrics.indexCode), - indexProse: statIfExists(artifacts.metrics.indexProse), - indexRecords: statIfExists(artifacts.metrics.indexRecords), - queryCache: statIfExists(artifacts.metrics.queryCache) - } - }; - - return report; -} - -/** - * Inspect configuration + cache status with warnings. 
- * @param {object} [args] - * @returns {Promise} - */ -async function configStatus(args = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const userConfig = loadUserConfig(repoPath); - const cacheRoot = (userConfig.cache && userConfig.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); - const repoCacheRoot = getRepoCacheRoot(repoPath, userConfig); - const dictConfig = getDictConfig(repoPath, userConfig); - const dictionaryPaths = await getDictionaryPaths(repoPath, dictConfig); - const modelConfig = getModelConfig(repoPath, userConfig); - const modelsDir = modelConfig.dir; - const modelDirName = `models--${modelConfig.id.replace('/', '--')}`; - const modelPath = path.join(modelsDir, modelDirName); - const sqlitePaths = resolveSqlitePaths(repoPath, userConfig); - const sqliteConfigured = userConfig.sqlite?.use !== false; - const vectorConfig = getVectorExtensionConfig(repoPath, userConfig); - const vectorPath = resolveVectorExtensionPath(vectorConfig); - - const warnings = []; - if (!dictionaryPaths.length && (dictConfig.languages.length || dictConfig.files.length || dictConfig.includeSlang || dictConfig.enableRepoDictionary)) { - warnings.push({ - code: 'dictionary_missing', - message: 'No dictionary files found; identifier splitting will be limited.' - }); - } - if (!fs.existsSync(modelPath)) { - warnings.push({ - code: 'model_missing', - message: `Embedding model not found (${modelConfig.id}). Run npm run download-models.` - }); - } - if (sqliteConfigured) { - const missing = []; - if (!fs.existsSync(sqlitePaths.codePath)) missing.push(`code=${sqlitePaths.codePath}`); - if (!fs.existsSync(sqlitePaths.prosePath)) missing.push(`prose=${sqlitePaths.prosePath}`); - if (missing.length) { - warnings.push({ - code: 'sqlite_missing', - message: `SQLite indexes missing (${missing.join(', ')}). Run npm run build-sqlite-index.` - }); - } - } - if (vectorConfig.enabled) { - if (!vectorPath || !fs.existsSync(vectorPath)) { - warnings.push({ - code: 'extension_missing', - message: 'SQLite vector extension is enabled but not installed.' - }); - } - } - - return { - repoPath, - repoId: getRepoId(repoPath), - config: { - cacheRoot, - repoCacheRoot, - dictionary: dictConfig, - models: modelConfig, - sqlite: { - use: sqliteConfigured, - annMode: userConfig.sqlite?.annMode || null, - codeDbPath: sqlitePaths.codePath, - proseDbPath: sqlitePaths.prosePath - }, - search: userConfig.search || {}, - indexing: userConfig.indexing || {}, - tooling: userConfig.tooling || {} - }, - cache: { - cacheRootExists: fs.existsSync(cacheRoot), - repoCacheExists: fs.existsSync(repoCacheRoot), - dictionaries: dictionaryPaths, - modelAvailable: fs.existsSync(modelPath), - sqlite: { - codeExists: fs.existsSync(sqlitePaths.codePath), - proseExists: fs.existsSync(sqlitePaths.prosePath) - }, - vectorExtension: { - enabled: vectorConfig.enabled, - path: vectorPath, - available: !!(vectorPath && fs.existsSync(vectorPath)) - } - }, - warnings - }; -} - -/** - * Run a node command and return stdout. - * @param {string} cwd - * @param {string[]} args - * @returns {string} - */ -function runNodeSync(cwd, args) { - const result = spawnSync(process.execPath, args, { cwd, encoding: 'utf8' }); - if (result.status !== 0) { - const err = result.stderr || `Command failed: ${args.join(' ')}`; - throw new Error(err.trim()); - } - return result.stdout || ''; -} - -/** - * Normalize meta filters into CLI-friendly key/value strings. 
- * @param {any} meta - * @returns {string[]|null} - */ -function normalizeMetaFilters(meta) { - if (!meta) return null; - if (Array.isArray(meta)) { - const entries = meta.flatMap((entry) => { - if (entry == null) return []; - if (typeof entry === 'string') return [entry]; - if (typeof entry === 'object') { - return Object.entries(entry).map(([key, value]) => - value == null || value === '' ? String(key) : `${key}=${value}` - ); - } - return [String(entry)]; - }); - return entries.length ? entries : null; - } - if (typeof meta === 'object') { - const entries = Object.entries(meta).map(([key, value]) => - value == null || value === '' ? String(key) : `${key}=${value}` - ); - return entries.length ? entries : null; - } - return [String(meta)]; -} - -/** - * Build a line buffer for progress streaming. - * @param {(line:string)=>void} onLine - * @returns {{push:(text:string)=>void,flush:()=>void}} - */ -function createLineBuffer(onLine) { - let buffer = ''; - return { - push(text) { - buffer += text; - const lines = buffer.split(/\r?\n/); - buffer = lines.pop() || ''; - for (const line of lines) { - const trimmed = line.trim(); - if (trimmed) onLine(trimmed); - } - }, - flush() { - const trimmed = buffer.trim(); - if (trimmed) onLine(trimmed); - buffer = ''; - } - }; -} - -/** - * Run a node command asynchronously with optional stderr streaming. - * @param {string} cwd - * @param {string[]} args - * @param {{streamOutput?:boolean,onLine?:(payload:{stream:string,line:string})=>void}} [options] - * @returns {Promise<{stdout:string,stderr:string}>} - */ -function runNodeAsync(cwd, args, options = {}) { - return new Promise((resolve, reject) => { - const child = spawn(process.execPath, args, { cwd }); - let stdout = ''; - let stderr = ''; - const streamOutput = options.streamOutput === true; - const onLine = typeof options.onLine === 'function' ? options.onLine : null; - const stdoutBuffer = onLine - ? createLineBuffer((line) => onLine({ stream: 'stdout', line })) - : null; - const stderrBuffer = onLine - ? createLineBuffer((line) => onLine({ stream: 'stderr', line })) - : null; - child.stdout?.on('data', (chunk) => { - const text = chunk.toString(); - stdout += text; - if (streamOutput) process.stderr.write(text); - stdoutBuffer?.push(text); - }); - child.stderr?.on('data', (chunk) => { - const text = chunk.toString(); - stderr += text; - if (streamOutput) process.stderr.write(text); - stderrBuffer?.push(text); - }); - child.on('error', (err) => { - const error = new Error(err.message || 'Command failed'); - error.stdout = stdout; - error.stderr = stderr; - reject(error); - }); - child.on('close', (code) => { - stdoutBuffer?.flush(); - stderrBuffer?.flush(); - if (code === 0) { - resolve({ stdout, stderr }); - return; - } - const error = new Error(stderr.trim() || `Command failed: ${args.join(' ')}`); - error.code = code; - error.stdout = stdout; - error.stderr = stderr; - reject(error); - }); - }); -} - -/** - * Run a tool script with progress notifications. - * @param {{repoPath:string,scriptArgs:string[],context?:object,startMessage?:string,doneMessage?:string}} input - * @returns {Promise} - */ -async function runToolWithProgress({ repoPath, scriptArgs, context = {}, startMessage, doneMessage }) { - const progress = typeof context.progress === 'function' ? context.progress : null; - const progressLine = progress - ? 
({ stream, line }) => progress({ message: line, stream }) - : null; - if (progress && startMessage) { - progress({ message: startMessage, phase: 'start' }); - } - const { stdout } = await runNodeAsync(repoPath, scriptArgs, { - streamOutput: true, - onLine: progressLine - }); - if (progress && doneMessage) { - progress({ message: doneMessage, phase: 'done' }); - } - return stdout || ''; -} - -function parseCountSummary(stdout) { - const match = String(stdout || '').match(/downloaded=(\d+)\s+skipped=(\d+)/i); - if (!match) return null; - return { - downloaded: Number(match[1]), - skipped: Number(match[2]) - }; -} - -function parseExtensionPath(stdout) { - const match = String(stdout || '').match(/Extension present at (.+)$/im); - return match ? match[1].trim() : null; -} - -/** - * Format error payloads for tool responses. - * @param {any} error - * @returns {{message:string,code?:number,stderr?:string,stdout?:string}} - */ -function getRemediationHint(error) { - const parts = [error?.message, error?.stderr, error?.stdout] - .filter(Boolean) - .join('\n') - .toLowerCase(); - if (!parts) return null; - - if (parts.includes('sqlite backend requested but index not found') - || parts.includes('missing required tables')) { - return 'Run `npm run build-sqlite-index` or set sqlite.use=false / --backend memory.'; - } - if (parts.includes('better-sqlite3 is required')) { - return 'Run `npm install` and ensure better-sqlite3 can load on this platform.'; - } - if (parts.includes('chunk_meta.json') || parts.includes('minhash_signatures')) { - return 'Run `npm run build-index` (or `npm run setup`/`npm run bootstrap`) to generate indexes.'; - } - if ((parts.includes('model') || parts.includes('xenova') || parts.includes('transformers')) - && (parts.includes('not found') || parts.includes('failed') || parts.includes('fetch') || parts.includes('download') || parts.includes('enoent'))) { - return 'Run `npm run download-models` or use `--stub-embeddings` / `PAIROFCLEATS_EMBEDDINGS=stub`.'; - } - if (parts.includes('dictionary') - || parts.includes('wordlist') - || parts.includes('words_alpha') - || parts.includes('download-dicts')) { - return 'Run `npm run download-dicts -- --lang en` (or configure dictionary.files/languages).'; - } - return null; -} - -/** - * Format error payloads for tool responses. - * @param {any} error - * @returns {{message:string,code?:number,stderr?:string,stdout?:string,hint?:string}} - */ -function formatToolError(error) { - const payload = { - message: error?.message || String(error) - }; - if (error?.code !== undefined) payload.code = error.code; - if (error?.stderr) payload.stderr = String(error.stderr).trim(); - if (error?.stdout) payload.stdout = String(error.stdout).trim(); - const hint = getRemediationHint(error); - if (hint) payload.hint = hint; - return payload; -} - -/** - * Emit a progress notification for long-running tools. - * @param {string|number|null} id - * @param {string} tool - * @param {{message:string,stream?:string,phase?:string}} payload - */ -function sendProgress(id, tool, payload) { - if (id === null || id === undefined) return; - const message = payload?.message ? String(payload.message) : ''; - if (!message) return; - sendNotification('notifications/progress', { - id, - tool, - message, - stream: payload?.stream || 'info', - phase: payload?.phase || 'progress', - ts: new Date().toISOString() - }); -} - -/** - * Restore CI artifacts if present. 
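For illustration, formatToolError layers a remediation hint on top of the raw failure fields. A sketch using the functions above (error values hypothetical):

const err = new Error('better-sqlite3 is required');
err.code = 1;
err.stderr = 'Error: better-sqlite3 is required';
formatToolError(err);
// => {
//      message: 'better-sqlite3 is required',
//      code: 1,
//      stderr: 'Error: better-sqlite3 is required',
//      hint: 'Run `npm install` and ensure better-sqlite3 can load on this platform.'
//    }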
- * @param {string} repoPath - * @param {string} artifactsDir - * @returns {boolean} - */ -function maybeRestoreArtifacts(repoPath, artifactsDir, progress) { - const fromDir = artifactsDir ? path.resolve(artifactsDir) : path.join(repoPath, 'ci-artifacts'); - if (!fs.existsSync(path.join(fromDir, 'manifest.json'))) return false; - if (progress) { - progress({ - message: `Restoring CI artifacts from ${fromDir}`, - phase: 'start' - }); - } - runNodeSync(repoPath, [path.join(ROOT, 'tools', 'ci-restore-artifacts.js'), '--from', fromDir]); - if (progress) { - progress({ - message: 'CI artifacts restored.', - phase: 'done' - }); - } - return true; -} - -/** - * Handle the MCP build_index tool call. - * @param {object} [args] - * @returns {object} - */ -async function buildIndex(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const userConfig = loadUserConfig(repoPath); - const sqliteConfigured = userConfig.sqlite?.use !== false; - const shouldUseSqlite = typeof args.sqlite === 'boolean' ? args.sqlite : sqliteConfigured; - const mode = args.mode || 'all'; - const incremental = args.incremental === true; - const stubEmbeddings = args.stubEmbeddings === true; - const buildSqlite = shouldUseSqlite && mode !== 'records'; - const useArtifacts = args.useArtifacts === true; - const progress = typeof context.progress === 'function' ? context.progress : null; - const progressLine = progress - ? ({ stream, line }) => progress({ message: line, stream }) - : null; - - let restoredArtifacts = false; - if (useArtifacts) { - restoredArtifacts = maybeRestoreArtifacts(repoPath, args.artifactsDir, progress); - } - - if (!restoredArtifacts) { - if (progress) { - progress({ - message: `Building ${mode} index${incremental ? ' (incremental)' : ''}.`, - phase: 'start' - }); - } - const indexArgs = [path.join(ROOT, 'build_index.js')]; - if (mode && mode !== 'all') indexArgs.push('--mode', mode); - if (incremental) indexArgs.push('--incremental'); - if (stubEmbeddings) indexArgs.push('--stub-embeddings'); - await runNodeAsync(repoPath, indexArgs, { streamOutput: true, onLine: progressLine }); - } - - if (buildSqlite) { - if (progress) { - progress({ - message: `Building SQLite index${incremental ? ' (incremental)' : ''}.`, - phase: 'start' - }); - } - const sqliteArgs = [path.join(ROOT, 'tools', 'build-sqlite-index.js')]; - if (incremental) sqliteArgs.push('--incremental'); - await runNodeAsync(repoPath, sqliteArgs, { streamOutput: true, onLine: progressLine }); - } - if (progress) { - progress({ - message: 'Index build complete.', - phase: 'done' - }); - } - - return { - repoPath, - mode, - sqlite: buildSqlite, - incremental, - restoredArtifacts - }; -} - -/** - * Handle the MCP search tool call. - * @param {object} [args] - * @returns {object} - */ -function runSearch(args = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const query = String(args.query || '').trim(); - if (!query) throw new Error('Query is required.'); - const mode = args.mode || 'both'; - const backend = args.backend || null; - const output = typeof args.output === 'string' ? args.output.toLowerCase() : ''; - const ann = typeof args.ann === 'boolean' ? args.ann : null; - const top = Number.isFinite(Number(args.top)) ? Math.max(1, Number(args.top)) : null; - const context = Number.isFinite(Number(args.context)) ? Math.max(0, Number(args.context)) : null; - const typeFilter = args.type ? String(args.type) : null; - const authorFilter = args.author ? 
String(args.author) : null; - const importFilter = args.import ? String(args.import) : null; - const callsFilter = args.calls ? String(args.calls) : null; - const usesFilter = args.uses ? String(args.uses) : null; - const signatureFilter = args.signature ? String(args.signature) : null; - const paramFilter = args.param ? String(args.param) : null; - const decoratorFilter = args.decorator ? String(args.decorator) : null; - const inferredTypeFilter = args.inferredType ? String(args.inferredType) : null; - const returnTypeFilter = args.returnType ? String(args.returnType) : null; - const throwsFilter = args.throws ? String(args.throws) : null; - const readsFilter = args.reads ? String(args.reads) : null; - const writesFilter = args.writes ? String(args.writes) : null; - const mutatesFilter = args.mutates ? String(args.mutates) : null; - const aliasFilter = args.alias ? String(args.alias) : null; - const awaitsFilter = args.awaits ? String(args.awaits) : null; - const riskFilter = args.risk ? String(args.risk) : null; - const riskTagFilter = args.riskTag ? String(args.riskTag) : null; - const riskSourceFilter = args.riskSource ? String(args.riskSource) : null; - const riskSinkFilter = args.riskSink ? String(args.riskSink) : null; - const riskCategoryFilter = args.riskCategory ? String(args.riskCategory) : null; - const riskFlowFilter = args.riskFlow ? String(args.riskFlow) : null; - const branchesMin = Number.isFinite(Number(args.branchesMin)) ? Number(args.branchesMin) : null; - const loopsMin = Number.isFinite(Number(args.loopsMin)) ? Number(args.loopsMin) : null; - const breaksMin = Number.isFinite(Number(args.breaksMin)) ? Number(args.breaksMin) : null; - const continuesMin = Number.isFinite(Number(args.continuesMin)) ? Number(args.continuesMin) : null; - const churnMin = Number.isFinite(Number(args.churnMin)) ? Number(args.churnMin) : null; - const chunkAuthorFilter = args.chunkAuthor ? String(args.chunkAuthor) : null; - const modifiedAfter = args.modifiedAfter ? String(args.modifiedAfter) : null; - const modifiedSince = Number.isFinite(Number(args.modifiedSince)) ? Number(args.modifiedSince) : null; - const visibilityFilter = args.visibility ? String(args.visibility) : null; - const extendsFilter = args.extends ? String(args.extends) : null; - const lintFilter = args.lint === true; - const asyncFilter = args.async === true; - const generatorFilter = args.generator === true; - const returnsFilter = args.returns === true; - const fileFilters = []; - const toList = (value) => (Array.isArray(value) ? value : (value == null ? [] : [value])); - fileFilters.push(...toList(args.path)); - fileFilters.push(...toList(args.file)); - const extFilters = toList(args.ext); - const metaFilters = normalizeMetaFilters(args.meta); - const metaJson = args.metaJson || null; - - const useCompact = output !== 'full' && output !== 'json'; - const searchArgs = [path.join(ROOT, 'search.js'), query, useCompact ? 
'--json-compact' : '--json']; - if (mode && mode !== 'both') searchArgs.push('--mode', mode); - if (backend) searchArgs.push('--backend', backend); - if (ann === true) searchArgs.push('--ann'); - if (ann === false) searchArgs.push('--no-ann'); - if (top) searchArgs.push('-n', String(top)); - if (context !== null) searchArgs.push('--context', String(context)); - if (typeFilter) searchArgs.push('--type', typeFilter); - if (authorFilter) searchArgs.push('--author', authorFilter); - if (importFilter) searchArgs.push('--import', importFilter); - if (callsFilter) searchArgs.push('--calls', callsFilter); - if (usesFilter) searchArgs.push('--uses', usesFilter); - if (signatureFilter) searchArgs.push('--signature', signatureFilter); - if (paramFilter) searchArgs.push('--param', paramFilter); - if (decoratorFilter) searchArgs.push('--decorator', decoratorFilter); - if (inferredTypeFilter) searchArgs.push('--inferred-type', inferredTypeFilter); - if (returnTypeFilter) searchArgs.push('--return-type', returnTypeFilter); - if (throwsFilter) searchArgs.push('--throws', throwsFilter); - if (readsFilter) searchArgs.push('--reads', readsFilter); - if (writesFilter) searchArgs.push('--writes', writesFilter); - if (mutatesFilter) searchArgs.push('--mutates', mutatesFilter); - if (aliasFilter) searchArgs.push('--alias', aliasFilter); - if (awaitsFilter) searchArgs.push('--awaits', awaitsFilter); - if (riskFilter) searchArgs.push('--risk', riskFilter); - if (riskTagFilter) searchArgs.push('--risk-tag', riskTagFilter); - if (riskSourceFilter) searchArgs.push('--risk-source', riskSourceFilter); - if (riskSinkFilter) searchArgs.push('--risk-sink', riskSinkFilter); - if (riskCategoryFilter) searchArgs.push('--risk-category', riskCategoryFilter); - if (riskFlowFilter) searchArgs.push('--risk-flow', riskFlowFilter); - if (branchesMin !== null) searchArgs.push('--branches', String(branchesMin)); - if (loopsMin !== null) searchArgs.push('--loops', String(loopsMin)); - if (breaksMin !== null) searchArgs.push('--breaks', String(breaksMin)); - if (continuesMin !== null) searchArgs.push('--continues', String(continuesMin)); - if (churnMin !== null) searchArgs.push('--churn', String(churnMin)); - if (chunkAuthorFilter) searchArgs.push('--chunk-author', chunkAuthorFilter); - if (modifiedAfter) searchArgs.push('--modified-after', modifiedAfter); - if (modifiedSince !== null) searchArgs.push('--modified-since', String(modifiedSince)); - if (visibilityFilter) searchArgs.push('--visibility', visibilityFilter); - if (extendsFilter) searchArgs.push('--extends', extendsFilter); - if (lintFilter) searchArgs.push('--lint'); - if (asyncFilter) searchArgs.push('--async'); - if (generatorFilter) searchArgs.push('--generator'); - if (returnsFilter) searchArgs.push('--returns'); - for (const entry of fileFilters) { - if (entry == null || entry === '') continue; - searchArgs.push('--path', String(entry)); - } - for (const entry of extFilters) { - if (entry == null || entry === '') continue; - searchArgs.push('--ext', String(entry)); - } - if (Array.isArray(metaFilters)) { - metaFilters.forEach((entry) => searchArgs.push('--meta', entry)); - } - if (metaJson) { - const jsonValue = typeof metaJson === 'string' ? metaJson : JSON.stringify(metaJson); - searchArgs.push('--meta-json', jsonValue); - } - - const stdout = runNodeSync(repoPath, searchArgs); - return JSON.parse(stdout || '{}'); -} - -/** - * Handle the MCP download_models tool call. 
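As a concrete example of the flag mapping above, a search tool call expands into a one-shot CLI invocation (argument values hypothetical):

runSearch({ query: 'parse token', top: 5, path: 'src/', ext: '.js' });
// spawns: node search.js "parse token" --json-compact -n 5 --path src/ --ext .js
// and returns the parsed JSON results from stdout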
- * @param {object} [args] - * @returns {{model:string,output:string}} - */ -async function downloadModels(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const userConfig = loadUserConfig(repoPath); - const modelConfig = getModelConfig(repoPath, userConfig); - const model = args.model || modelConfig.id || DEFAULT_MODEL_ID; - const scriptArgs = [path.join(ROOT, 'tools', 'download-models.js'), '--model', model]; - if (args.cacheDir) scriptArgs.push('--cache-dir', args.cacheDir); - const progress = typeof context.progress === 'function' ? context.progress : null; - const progressLine = progress - ? ({ stream, line }) => progress({ message: line, stream }) - : null; - if (progress) { - progress({ message: `Downloading model ${model}.`, phase: 'start' }); - } - const { stdout } = await runNodeAsync(repoPath, scriptArgs, { - streamOutput: true, - onLine: progressLine - }); - if (progress) { - progress({ message: `Model download complete (${model}).`, phase: 'done' }); - } - return { model, output: stdout.trim() }; -} - -/** - * Handle the MCP download_dictionaries tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function downloadDictionaries(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'download-dicts.js')]; - if (args.lang) scriptArgs.push('--lang', String(args.lang)); - const urls = Array.isArray(args.url) ? args.url : (args.url ? [args.url] : []); - urls.forEach((value) => scriptArgs.push('--url', String(value))); - if (args.dir) scriptArgs.push('--dir', String(args.dir)); - if (args.update === true) scriptArgs.push('--update'); - if (args.force === true) scriptArgs.push('--force'); - const stdout = await runToolWithProgress({ - repoPath, - scriptArgs, - context, - startMessage: 'Downloading dictionaries.', - doneMessage: 'Dictionary download complete.' - }); - const summary = parseCountSummary(stdout); - return { - repoPath, - output: stdout.trim(), - ...(summary || {}) - }; -} - -/** - * Handle the MCP download_extensions tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function downloadExtensions(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'download-extensions.js')]; - if (args.provider) scriptArgs.push('--provider', String(args.provider)); - if (args.dir) scriptArgs.push('--dir', String(args.dir)); - if (args.out) scriptArgs.push('--out', String(args.out)); - if (args.platform) scriptArgs.push('--platform', String(args.platform)); - if (args.arch) scriptArgs.push('--arch', String(args.arch)); - const urls = Array.isArray(args.url) ? args.url : (args.url ? [args.url] : []); - urls.forEach((value) => scriptArgs.push('--url', String(value))); - if (args.update === true) scriptArgs.push('--update'); - if (args.force === true) scriptArgs.push('--force'); - const stdout = await runToolWithProgress({ - repoPath, - scriptArgs, - context, - startMessage: 'Downloading extensions.', - doneMessage: 'Extension download complete.' - }); - const summary = parseCountSummary(stdout); - const resolvedPath = parseExtensionPath(stdout); - return { - repoPath, - output: stdout.trim(), - extensionPath: resolvedPath, - ...(summary || {}) - }; -} - -/** - * Handle the MCP verify_extensions tool call. 
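Both download handlers recover a structured summary from the script output via the parsers above; for example:

parseCountSummary('dictionaries: downloaded=2 skipped=5');
// => { downloaded: 2, skipped: 5 }
parseExtensionPath('Extension present at /tmp/extensions/vec0.so');
// => '/tmp/extensions/vec0.so' (path shown is hypothetical)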
- * @param {object} [args] - * @returns {object} - */ -function verifyExtensions(args = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'verify-extensions.js'), '--json']; - if (args.provider) scriptArgs.push('--provider', String(args.provider)); - if (args.dir) scriptArgs.push('--dir', String(args.dir)); - if (args.path) scriptArgs.push('--path', String(args.path)); - if (args.platform) scriptArgs.push('--platform', String(args.platform)); - if (args.arch) scriptArgs.push('--arch', String(args.arch)); - if (args.module) scriptArgs.push('--module', String(args.module)); - if (args.table) scriptArgs.push('--table', String(args.table)); - if (args.column) scriptArgs.push('--column', String(args.column)); - if (args.encoding) scriptArgs.push('--encoding', String(args.encoding)); - if (args.options) scriptArgs.push('--options', String(args.options)); - if (args.annMode) scriptArgs.push('--ann-mode', String(args.annMode)); - if (args.load === false) scriptArgs.push('--no-load'); - const stdout = runNodeSync(repoPath, scriptArgs); - try { - return JSON.parse(stdout || '{}'); - } catch { - return { repoPath, output: stdout.trim() }; - } -} - -/** - * Handle the MCP build_sqlite_index tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function buildSqliteIndex(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'build-sqlite-index.js')]; - if (args.mode) scriptArgs.push('--mode', String(args.mode)); - if (args.incremental === true) scriptArgs.push('--incremental'); - if (args.compact === true) scriptArgs.push('--compact'); - if (args.codeDir) scriptArgs.push('--code-dir', String(args.codeDir)); - if (args.proseDir) scriptArgs.push('--prose-dir', String(args.proseDir)); - if (args.out) scriptArgs.push('--out', String(args.out)); - const stdout = await runToolWithProgress({ - repoPath, - scriptArgs, - context, - startMessage: 'Building SQLite index.', - doneMessage: 'SQLite index build complete.' - }); - return { repoPath, output: stdout.trim() }; -} - -/** - * Handle the MCP compact_sqlite_index tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function compactSqliteIndex(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'compact-sqlite-index.js')]; - if (args.mode) scriptArgs.push('--mode', String(args.mode)); - if (args.dryRun === true) scriptArgs.push('--dry-run'); - if (args.keepBackup === true) scriptArgs.push('--keep-backup'); - const stdout = await runToolWithProgress({ - repoPath, - scriptArgs, - context, - startMessage: 'Compacting SQLite index.', - doneMessage: 'SQLite compaction complete.' - }); - return { repoPath, output: stdout.trim() }; -} - -/** - * Handle the MCP cache_gc tool call. 
- * @param {object} [args] - * @returns {object} - */ -function cacheGc(args = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'cache-gc.js'), '--json']; - if (args.dryRun === true) scriptArgs.push('--dry-run'); - if (Number.isFinite(Number(args.maxBytes))) scriptArgs.push('--max-bytes', String(args.maxBytes)); - if (Number.isFinite(Number(args.maxGb))) scriptArgs.push('--max-gb', String(args.maxGb)); - if (Number.isFinite(Number(args.maxAgeDays))) scriptArgs.push('--max-age-days', String(args.maxAgeDays)); - const stdout = runNodeSync(repoPath, scriptArgs); - try { - return JSON.parse(stdout || '{}'); - } catch { - return { repoPath, output: stdout.trim() }; - } -} - -/** - * Handle the MCP clean_artifacts tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function cleanArtifacts(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'clean-artifacts.js')]; - if (args.all === true) scriptArgs.push('--all'); - if (args.dryRun === true) scriptArgs.push('--dry-run'); - const stdout = await runToolWithProgress({ - repoPath, - scriptArgs, - context, - startMessage: 'Cleaning artifacts.', - doneMessage: 'Artifact cleanup complete.' - }); - return { repoPath, output: stdout.trim() }; -} - -/** - * Handle the MCP bootstrap tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function runBootstrap(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const scriptArgs = [path.join(ROOT, 'tools', 'bootstrap.js')]; - if (args.skipInstall === true) scriptArgs.push('--skip-install'); - if (args.skipDicts === true) scriptArgs.push('--skip-dicts'); - if (args.skipIndex === true) scriptArgs.push('--skip-index'); - if (args.skipArtifacts === true) scriptArgs.push('--skip-artifacts'); - if (args.skipTooling === true) scriptArgs.push('--skip-tooling'); - if (args.withSqlite === true) scriptArgs.push('--with-sqlite'); - if (args.incremental === true) scriptArgs.push('--incremental'); - const stdout = await runToolWithProgress({ - repoPath, - scriptArgs, - context, - startMessage: 'Bootstrapping repo.', - doneMessage: 'Bootstrap complete.' - }); - return { repoPath, output: stdout.trim() }; -} - -/** - * Handle the MCP report_artifacts tool call. - * @param {object} [args] - * @returns {object} - */ -function reportArtifacts(args = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const stdout = runNodeSync(repoPath, [path.join(ROOT, 'tools', 'report-artifacts.js'), '--json']); - return JSON.parse(stdout || '{}'); -} - -/** - * Handle the MCP triage_ingest tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function triageIngest(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const source = String(args.source || '').trim(); - const inputPath = String(args.inputPath || '').trim(); - if (!source || !inputPath) { - throw new Error('source and inputPath are required.'); - } - const resolvedInput = path.isAbsolute(inputPath) ? inputPath : path.join(repoPath, inputPath); - const metaFilters = normalizeMetaFilters(args.meta); - const ingestArgs = [path.join(ROOT, 'tools', 'triage', 'ingest.js'), '--source', source, '--in', resolvedInput]; - ingestArgs.push('--repo', repoPath); - if (Array.isArray(metaFilters)) { - metaFilters.forEach((entry) => ingestArgs.push('--meta', entry)); - } - const progress = typeof context.progress === 'function' ? 
context.progress : null; - const progressLine = progress - ? ({ stream, line }) => progress({ message: line, stream }) - : null; - if (progress) { - progress({ message: `Ingesting ${source} findings.`, phase: 'start' }); - } - const { stdout } = await runNodeAsync(repoPath, ingestArgs, { streamOutput: true, onLine: progressLine }); - let payload = {}; - try { - payload = JSON.parse(stdout || '{}'); - } catch (error) { - throw new Error(`Failed to parse ingest output: ${error?.message || error}`); - } - if (args.buildIndex) { - await buildIndex({ - repoPath, - mode: 'records', - incremental: args.incremental === true, - stubEmbeddings: args.stubEmbeddings === true, - sqlite: false - }, context); - } - if (progress) { - progress({ message: 'Triage ingest complete.', phase: 'done' }); - } - return payload; -} - -/** - * Handle the MCP triage_decision tool call. - * @param {object} [args] - * @returns {object} - */ -function triageDecision(args = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const finding = String(args.finding || '').trim(); - const status = String(args.status || '').trim(); - if (!finding || !status) { - throw new Error('finding and status are required.'); - } - const metaFilters = normalizeMetaFilters(args.meta); - const decisionArgs = [path.join(ROOT, 'tools', 'triage', 'decision.js'), '--finding', finding, '--status', status]; - decisionArgs.push('--repo', repoPath); - if (args.justification) decisionArgs.push('--justification', String(args.justification)); - if (args.reviewer) decisionArgs.push('--reviewer', String(args.reviewer)); - if (args.expires) decisionArgs.push('--expires', String(args.expires)); - if (Array.isArray(metaFilters)) { - metaFilters.forEach((entry) => decisionArgs.push('--meta', entry)); - } - const codes = Array.isArray(args.codes) ? args.codes : (args.codes ? [args.codes] : []); - const evidence = Array.isArray(args.evidence) ? args.evidence : (args.evidence ? [args.evidence] : []); - codes.filter(Boolean).forEach((code) => decisionArgs.push('--code', String(code))); - evidence.filter(Boolean).forEach((item) => decisionArgs.push('--evidence', String(item))); - const stdout = runNodeSync(repoPath, decisionArgs); - return JSON.parse(stdout || '{}'); -} - -/** - * Handle the MCP triage_context_pack tool call. - * @param {object} [args] - * @returns {Promise} - */ -async function triageContextPack(args = {}, context = {}) { - const repoPath = resolveRepoPath(args.repoPath); - const recordId = String(args.recordId || '').trim(); - if (!recordId) throw new Error('recordId is required.'); - const contextArgs = [path.join(ROOT, 'tools', 'triage', 'context-pack.js'), '--record', recordId]; - contextArgs.push('--repo', repoPath); - if (args.outPath) contextArgs.push('--out', String(args.outPath)); - if (args.ann === true) contextArgs.push('--ann'); - if (args.ann === false) contextArgs.push('--no-ann'); - if (args.stubEmbeddings === true) contextArgs.push('--stub-embeddings'); - const progress = typeof context.progress === 'function' ? context.progress : null; - const progressLine = progress - ? 
({ stream, line }) => progress({ message: line, stream }) - : null; - if (progress) { - progress({ message: 'Building triage context pack.', phase: 'start' }); - } - const { stdout } = await runNodeAsync(repoPath, contextArgs, { streamOutput: true, onLine: progressLine }); - if (progress) { - progress({ message: 'Context pack ready.', phase: 'done' }); - } - try { - return JSON.parse(stdout || '{}'); - } catch (error) { - throw new Error(`Failed to parse context pack output: ${error?.message || error}`); - } -} - -/** - * Dispatch an MCP tool call by name. - * @param {string} name - * @param {object} args - * @returns {Promise} - */ -async function handleToolCall(name, args, context = {}) { - switch (name) { - case 'index_status': - return await indexStatus(args); - case 'config_status': - return await configStatus(args); - case 'build_index': - return await buildIndex(args, context); - case 'search': - return runSearch(args); - case 'download_models': - return await downloadModels(args, context); - case 'download_dictionaries': - return await downloadDictionaries(args, context); - case 'download_extensions': - return await downloadExtensions(args, context); - case 'verify_extensions': - return verifyExtensions(args); - case 'build_sqlite_index': - return await buildSqliteIndex(args, context); - case 'compact_sqlite_index': - return await compactSqliteIndex(args, context); - case 'cache_gc': - return cacheGc(args); - case 'clean_artifacts': - return await cleanArtifacts(args, context); - case 'bootstrap': - return await runBootstrap(args, context); - case 'report_artifacts': - return reportArtifacts(args); - case 'triage_ingest': - return await triageIngest(args, context); - case 'triage_decision': - return triageDecision(args); - case 'triage_context_pack': - return await triageContextPack(args, context); - default: - throw new Error(`Unknown tool: ${name}`); - } -} - -/** - * Handle a JSON-RPC message from stdin. - * @param {object} message - * @returns {Promise} - */ -async function handleMessage(message) { - if (!message || message.jsonrpc !== '2.0') return; - const { id, method, params } = message; - - if (method === 'initialize') { - sendResult(id, { - protocolVersion: '2024-11-05', - serverInfo: { name: 'PairOfCleats', version: PKG.version }, - capabilities: { - tools: { listChanged: false }, - resources: { listChanged: false } - } - }); - return; - } - - if (method === 'shutdown') { - sendResult(id, {}); - return; - } - - if (method === 'exit') { - process.exit(0); - } - - if (method === 'tools/list') { - sendResult(id, { tools: TOOL_DEFS }); - return; - } - - if (method === 'resources/list') { - sendResult(id, { resources: [] }); - return; - } - - if (method === 'tools/call') { - if (!id) return; - const name = params?.name; - const args = params?.arguments || {}; - try { - const progress = (payload) => sendProgress(id, name, payload); - const result = await handleToolCall(name, args, { progress, toolCallId: id }); - sendResult(id, { - content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] - }); - } catch (error) { - const payload = formatToolError(error); - sendResult(id, { - content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }], - isError: true - }); - } - return; - } - - if (id) { - sendError(id, -32601, `Method not found: ${method}`); - } -} - -let buffer = Buffer.alloc(0); -let processing = false; -const queue = []; - -/** - * Process queued messages serially. 
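The stdin parser below expects LSP-style framing: a Content-Length header, a blank line, then the JSON-RPC body. A minimal client-side writer, sketched here for reference rather than taken from this patch, mirrors that contract:

function writeMessage(stream, message) {
  const body = JSON.stringify(message);
  // Content-Length counts bytes, not characters, so measure the UTF-8 body.
  stream.write(`Content-Length: ${Buffer.byteLength(body, 'utf8')}\r\n\r\n${body}`);
}
writeMessage(process.stdout, { jsonrpc: '2.0', id: 1, method: 'tools/list' });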
- */ -function processQueue() { - if (processing) return; - processing = true; - const run = async () => { - while (queue.length) { - const msg = queue.shift(); - await handleMessage(msg); - } - processing = false; - }; - run().catch((error) => { - processing = false; - console.error(error); - }); -} - -/** - * Enqueue a message for processing. - * @param {object} message - */ -function enqueueMessage(message) { - queue.push(message); - processQueue(); -} - -/** - * Parse framed JSON-RPC messages from the input buffer. - */ -function parseBuffer() { - while (true) { - const headerEnd = buffer.indexOf('\r\n\r\n'); - if (headerEnd === -1) return; - const header = buffer.slice(0, headerEnd).toString('utf8'); - const lengthMatch = header.match(/Content-Length:\s*(\d+)/i); - if (!lengthMatch) { - buffer = buffer.slice(headerEnd + 4); - continue; - } - const length = parseInt(lengthMatch[1], 10); - const total = headerEnd + 4 + length; - if (buffer.length < total) return; - const body = buffer.slice(headerEnd + 4, total).toString('utf8'); - buffer = buffer.slice(total); - try { - const msg = JSON.parse(body); - enqueueMessage(msg); - } catch {} - } -} - -process.stdin.on('data', (chunk) => { - buffer = Buffer.concat([buffer, chunk]); - parseBuffer(); +const DEFAULT_MCP_QUEUE_MAX = 64; +const DEFAULT_TOOL_TIMEOUT_MS = 120000; +const DEFAULT_TOOL_TIMEOUTS = { + build_index: 10 * 60 * 1000, + build_sqlite_index: 10 * 60 * 1000, + download_models: 10 * 60 * 1000, + download_dictionaries: 10 * 60 * 1000, + download_extensions: 10 * 60 * 1000, + bootstrap: 10 * 60 * 1000, + triage_ingest: 5 * 60 * 1000 +}; + +const envQueueMax = parseTimeoutMs(process.env.PAIROFCLEATS_MCP_QUEUE_MAX); +const envToolTimeoutMs = parseTimeoutMs(process.env.PAIROFCLEATS_MCP_TOOL_TIMEOUT_MS); +const baseConfigRoot = resolveRepoRoot(process.cwd()); +const baseConfig = loadUserConfig(baseConfigRoot); +const { logLine } = configureServiceLogger({ repoRoot: baseConfigRoot, service: 'mcp' }); +const runtimeConfig = getRuntimeConfig(baseConfigRoot, baseConfig); +const parsedUv = Number(process.env.UV_THREADPOOL_SIZE); +const effectiveUvThreadpoolSize = Number.isFinite(parsedUv) && parsedUv > 0 ? Math.floor(parsedUv) : null; +if (effectiveUvThreadpoolSize || runtimeConfig.uvThreadpoolSize) { + logLine(`[mcp] UV_THREADPOOL_SIZE: ${effectiveUvThreadpoolSize ?? 'default'} (config=${runtimeConfig.uvThreadpoolSize ?? 'none'})`); +} + +const baseMcpConfig = baseConfig?.mcp && typeof baseConfig.mcp === 'object' ? baseConfig.mcp : {}; +const configuredQueueMax = parseTimeoutMs(baseMcpConfig.queueMax); +const queueMax = Math.max(1, configuredQueueMax ?? envQueueMax ?? 
DEFAULT_MCP_QUEUE_MAX); + +const resolveTimeout = (name, args) => resolveToolTimeoutMs(name, args, { + envToolTimeoutMs, + defaultToolTimeoutMs: DEFAULT_TOOL_TIMEOUT_MS, + defaultToolTimeouts: DEFAULT_TOOL_TIMEOUTS }); -process.stdin.on('end', () => { - process.exit(0); +const transport = createMcpTransport({ + toolDefs: TOOL_DEFS, + serverInfo: { name: 'PairOfCleats', version: PKG.version }, + handleToolCall, + resolveToolTimeoutMs: resolveTimeout, + queueMax }); + +transport.start(); diff --git a/tools/mcp/repo.js b/tools/mcp/repo.js new file mode 100644 index 000000000..b9a1c42f4 --- /dev/null +++ b/tools/mcp/repo.js @@ -0,0 +1,360 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import simpleGit from 'simple-git'; +import { getEnvConfig } from '../../src/shared/env.js'; +import { createSqliteDbCache } from '../../src/retrieval/sqlite-cache.js'; +import { + getCacheRoot, + getDictConfig, + getDictionaryPaths, + getIndexDir, + getMetricsDir, + getModelConfig, + getRepoCacheRoot, + getRepoId, + loadUserConfig, + resolveRepoRoot, + resolveSqlitePaths +} from '../dict-utils.js'; +import { getVectorExtensionConfig, resolveVectorExtensionPath } from '../vector-extension.js'; + +const repoCaches = new Map(); + +export const getRepoCaches = (repoPath) => { + const key = repoPath || process.cwd(); + const existing = repoCaches.get(key); + if (existing) { + existing.lastUsed = Date.now(); + return existing; + } + const entry = { + indexCache: new Map(), + sqliteCache: createSqliteDbCache(), + lastUsed: Date.now() + }; + repoCaches.set(key, entry); + return entry; +}; + +export const clearRepoCaches = (repoPath) => { + if (!repoPath) return; + const entry = repoCaches.get(repoPath); + if (!entry) return; + entry.sqliteCache?.closeAll?.(); + entry.indexCache?.clear?.(); + repoCaches.delete(repoPath); +}; + +/** + * Resolve and validate a repo path. + * @param {string} inputPath + * @returns {string} + */ +export function resolveRepoPath(inputPath) { + const base = inputPath ? path.resolve(inputPath) : process.cwd(); + if (!fs.existsSync(base) || !fs.statSync(base).isDirectory()) { + throw new Error(`Repo path not found: ${base}`); + } + return inputPath ? base : resolveRepoRoot(base); +} + +const resolveConfigRoot = (args) => { + const candidate = args?.repoPath ? path.resolve(String(args.repoPath)) : null; + if (candidate && fs.existsSync(candidate) && fs.statSync(candidate).isDirectory()) { + return resolveRepoRoot(candidate); + } + return resolveRepoRoot(process.cwd()); +}; + +const resolveMcpConfig = (args) => { + const repoRoot = resolveConfigRoot(args); + const cfg = loadUserConfig(repoRoot); + return cfg?.mcp && typeof cfg.mcp === 'object' ? cfg.mcp : {}; +}; + +export const parseTimeoutMs = (value) => { + if (value == null || value === '') return null; + const parsed = Number(value); + return Number.isFinite(parsed) ? Math.max(0, Math.floor(parsed)) : null; +}; + +export const resolveToolTimeoutMs = (name, args, { envToolTimeoutMs, defaultToolTimeoutMs, defaultToolTimeouts }) => { + const mcpConfig = resolveMcpConfig(args); + const toolTimeouts = mcpConfig.toolTimeouts && typeof mcpConfig.toolTimeouts === 'object' + ? mcpConfig.toolTimeouts + : {}; + const override = parseTimeoutMs(toolTimeouts[name]); + const baseTimeout = parseTimeoutMs(mcpConfig.toolTimeoutMs ?? envToolTimeoutMs) + ?? defaultToolTimeouts[name] + ?? defaultToolTimeoutMs; + const resolved = override ?? baseTimeout; + return resolved && resolved > 0 ? 
resolved : null; +}; + +/** + * Build the artifact path map for a repo. + * @param {string} repoPath + * @param {object} userConfig + * @returns {object} + */ +function listArtifacts(repoPath, userConfig) { + const indexCode = getIndexDir(repoPath, 'code', userConfig); + const indexProse = getIndexDir(repoPath, 'prose', userConfig); + const indexRecords = getIndexDir(repoPath, 'records', userConfig); + const metricsDir = getMetricsDir(repoPath, userConfig); + const sqlitePaths = resolveSqlitePaths(repoPath, userConfig); + return { + index: { + code: { + dir: indexCode, + chunkMeta: path.join(indexCode, 'chunk_meta.json'), + tokenPostings: path.join(indexCode, 'token_postings.json') + }, + prose: { + dir: indexProse, + chunkMeta: path.join(indexProse, 'chunk_meta.json'), + tokenPostings: path.join(indexProse, 'token_postings.json') + }, + records: { + dir: indexRecords, + chunkMeta: path.join(indexRecords, 'chunk_meta.json'), + tokenPostings: path.join(indexRecords, 'token_postings.json') + } + }, + metrics: { + dir: metricsDir, + indexCode: path.join(metricsDir, 'index-code.json'), + indexProse: path.join(metricsDir, 'index-prose.json'), + indexRecords: path.join(metricsDir, 'index-records.json'), + queryCache: path.join(metricsDir, 'queryCache.json') + }, + sqlite: { + code: sqlitePaths.codePath, + prose: sqlitePaths.prosePath, + legacy: sqlitePaths.legacyPath, + legacyExists: sqlitePaths.legacyExists + } + }; +} + +/** + * Stat a path if it exists. + * @param {string} target + * @returns {{exists:boolean,mtime:(string|null),bytes:number}} + */ +function statIfExists(target) { + try { + const stat = fs.statSync(target); + return { + exists: true, + mtime: stat.mtime ? stat.mtime.toISOString() : null, + bytes: stat.size + }; + } catch { + return { exists: false, mtime: null, bytes: 0 }; + } +} + +/** + * Fetch lightweight git status info for a repo. + * @param {string} repoPath + * @returns {Promise} + */ +async function getGitInfo(repoPath) { + const gitDir = path.join(repoPath, '.git'); + const hasGitDir = fs.existsSync(gitDir); + if (!hasGitDir) { + return { + isRepo: false, + warning: 'Git repository not detected; using path-based repo identity.' + }; + } + try { + const git = simpleGit(repoPath); + const status = await git.status(); + const head = await git.revparse(['HEAD']); + return { + isRepo: true, + head: head.trim(), + branch: status.current || null, + isDirty: status.files.length > 0 + }; + } catch (error) { + return { + isRepo: true, + warning: `Git detected but status unavailable: ${error.message}` + }; + } +} + +/** + * Build an index status report for the MCP tool. 
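resolveToolTimeoutMs above resolves per-tool budgets in order: mcp.toolTimeouts[name], then mcp.toolTimeoutMs (or the PAIROFCLEATS_MCP_TOOL_TIMEOUT_MS env var), then the built-in per-tool and global defaults. A hypothetical .pairofcleats.json override (values illustrative):

{
  "mcp": {
    "queueMax": 32,
    "toolTimeoutMs": 60000,
    "toolTimeouts": { "build_index": 1800000 }
  }
}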
+ * @param {object} [args] + * @returns {Promise} + */ +export async function indexStatus(args = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const userConfig = loadUserConfig(repoPath); + const envConfig = getEnvConfig(); + const cacheRoot = (userConfig.cache && userConfig.cache.root) || envConfig.cacheRoot || getCacheRoot(); + const repoId = getRepoId(repoPath); + const repoCacheRoot = getRepoCacheRoot(repoPath, userConfig); + const dictConfig = getDictConfig(repoPath, userConfig); + const dictPaths = await getDictionaryPaths(repoPath, dictConfig); + const modelConfig = getModelConfig(repoPath, userConfig); + const modelsDir = modelConfig.dir; + const modelDirName = `models--${modelConfig.id.replace('/', '--')}`; + const modelPath = path.join(modelsDir, modelDirName); + + const artifacts = listArtifacts(repoPath, userConfig); + const git = await getGitInfo(repoPath); + const incrementalRoot = path.join(repoCacheRoot, 'incremental'); + const report = { + repoPath, + repoId, + cacheRoot, + repoCacheRoot, + git, + dictionaries: { + dir: dictConfig.dir, + files: dictPaths, + enabled: dictPaths.length > 0, + includeSlang: dictConfig.includeSlang + }, + models: { + dir: modelsDir, + model: modelConfig.id, + available: fs.existsSync(modelPath), + hint: fs.existsSync(modelPath) + ? null + : 'Run the download_models tool or `npm run download-models` to prefetch embeddings.' + }, + incremental: { + dir: incrementalRoot, + exists: fs.existsSync(incrementalRoot) + }, + index: { + code: { + dir: artifacts.index.code.dir, + chunkMeta: statIfExists(artifacts.index.code.chunkMeta), + tokenPostings: statIfExists(artifacts.index.code.tokenPostings) + }, + prose: { + dir: artifacts.index.prose.dir, + chunkMeta: statIfExists(artifacts.index.prose.chunkMeta), + tokenPostings: statIfExists(artifacts.index.prose.tokenPostings) + }, + records: { + dir: artifacts.index.records.dir, + chunkMeta: statIfExists(artifacts.index.records.chunkMeta), + tokenPostings: statIfExists(artifacts.index.records.tokenPostings) + } + }, + sqlite: { + code: { path: artifacts.sqlite.code, ...statIfExists(artifacts.sqlite.code) }, + prose: { path: artifacts.sqlite.prose, ...statIfExists(artifacts.sqlite.prose) }, + legacy: artifacts.sqlite.legacyExists ? artifacts.sqlite.legacy : null + }, + metrics: { + dir: artifacts.metrics.dir, + indexCode: statIfExists(artifacts.metrics.indexCode), + indexProse: statIfExists(artifacts.metrics.indexProse), + indexRecords: statIfExists(artifacts.metrics.indexRecords), + queryCache: statIfExists(artifacts.metrics.queryCache) + } + }; + + return report; +} + +/** + * Inspect configuration + cache status with warnings. 
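Clients reach this report through a normal MCP tools/call request; assuming the new transport keeps the same method names as the inline server it replaces, the framed body looks like:

{
  "jsonrpc": "2.0",
  "id": 7,
  "method": "tools/call",
  "params": { "name": "index_status", "arguments": { "repoPath": "." } }
}

The report comes back serialized as JSON text in result.content[0].text.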
+ * @param {object} [args] + * @returns {Promise} + */ +export async function configStatus(args = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const userConfig = loadUserConfig(repoPath); + const envConfig = getEnvConfig(); + const cacheRoot = (userConfig.cache && userConfig.cache.root) || envConfig.cacheRoot || getCacheRoot(); + const repoCacheRoot = getRepoCacheRoot(repoPath, userConfig); + const dictConfig = getDictConfig(repoPath, userConfig); + const dictionaryPaths = await getDictionaryPaths(repoPath, dictConfig); + const modelConfig = getModelConfig(repoPath, userConfig); + const modelsDir = modelConfig.dir; + const modelDirName = `models--${modelConfig.id.replace('/', '--')}`; + const modelPath = path.join(modelsDir, modelDirName); + const sqlitePaths = resolveSqlitePaths(repoPath, userConfig); + const sqliteConfigured = userConfig.sqlite?.use !== false; + const vectorConfig = getVectorExtensionConfig(repoPath, userConfig); + const vectorPath = resolveVectorExtensionPath(vectorConfig); + + const warnings = []; + if (!dictionaryPaths.length && (dictConfig.languages.length || dictConfig.files.length || dictConfig.includeSlang || dictConfig.enableRepoDictionary)) { + warnings.push({ + code: 'dictionary_missing', + message: 'No dictionary files found; identifier splitting will be limited.' + }); + } + if (!fs.existsSync(modelPath)) { + warnings.push({ + code: 'model_missing', + message: `Embedding model not found (${modelConfig.id}). Run npm run download-models.` + }); + } + if (sqliteConfigured) { + const missing = []; + if (!fs.existsSync(sqlitePaths.codePath)) missing.push(`code=${sqlitePaths.codePath}`); + if (!fs.existsSync(sqlitePaths.prosePath)) missing.push(`prose=${sqlitePaths.prosePath}`); + if (missing.length) { + warnings.push({ + code: 'sqlite_missing', + message: `SQLite indexes missing (${missing.join(', ')}). Run npm run build-sqlite-index.` + }); + } + } + if (vectorConfig.enabled) { + if (!vectorPath || !fs.existsSync(vectorPath)) { + warnings.push({ + code: 'extension_missing', + message: 'SQLite vector extension is enabled but not installed.' + }); + } + } + + return { + repoPath, + repoId: getRepoId(repoPath), + config: { + cacheRoot, + repoCacheRoot, + dictionary: dictConfig, + models: modelConfig, + sqlite: { + use: sqliteConfigured, + annMode: vectorConfig.annMode || null, + codeDbPath: sqlitePaths.codePath, + proseDbPath: sqlitePaths.prosePath + }, + search: userConfig.search || {}, + indexing: userConfig.indexing || {}, + tooling: userConfig.tooling || {} + }, + cache: { + cacheRootExists: fs.existsSync(cacheRoot), + repoCacheExists: fs.existsSync(repoCacheRoot), + dictionaries: dictionaryPaths, + modelAvailable: fs.existsSync(modelPath), + sqlite: { + codeExists: fs.existsSync(sqlitePaths.codePath), + proseExists: fs.existsSync(sqlitePaths.prosePath) + }, + vectorExtension: { + enabled: vectorConfig.enabled, + path: vectorPath, + available: !!(vectorPath && fs.existsSync(vectorPath)) + } + }, + warnings + }; +} diff --git a/tools/mcp/runner.js b/tools/mcp/runner.js new file mode 100644 index 000000000..1d6730cae --- /dev/null +++ b/tools/mcp/runner.js @@ -0,0 +1,181 @@ +import { execa, execaSync } from 'execa'; +import { ERROR_CODES } from '../../src/shared/error-codes.js'; +import { incTimeout } from '../../src/shared/metrics.js'; + +/** + * Run a node command and return stdout. 
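Because the warning codes above are stable strings, a caller can script remediation off configStatus directly; a sketch, assuming the download/build handlers exported from tools.js:

const { warnings } = await configStatus({ repoPath: '.' });
for (const warning of warnings) {
  if (warning.code === 'model_missing') await downloadModels({ repoPath: '.' });
  if (warning.code === 'sqlite_missing') await buildSqliteIndex({ repoPath: '.' });
}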
+ * @param {string} cwd + * @param {string[]} args + * @returns {string} + */ +export function runNodeSync(cwd, args) { + const result = execaSync(process.execPath, args, { + cwd, + encoding: 'utf8', + reject: false + }); + if (result.exitCode !== 0) { + const stderr = (result.stderr || '').trim(); + const stdout = (result.stdout || '').trim(); + const message = stderr || stdout || `Command failed: ${args.join(' ')}`; + const error = new Error(message.trim()); + error.code = result.exitCode; + error.stderr = stderr; + error.stdout = stdout; + throw error; + } + return result.stdout || ''; +} + +/** + * Build a line buffer for progress streaming. + * @param {(line:string)=>void} onLine + * @returns {{push:(text:string)=>void,flush:()=>void}} + */ +function createLineBuffer(onLine) { + let buffer = ''; + return { + push(text) { + buffer += text; + const lines = buffer.split(/\r?\n/); + buffer = lines.pop() || ''; + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed) onLine(trimmed); + } + }, + flush() { + const trimmed = buffer.trim(); + if (trimmed) onLine(trimmed); + buffer = ''; + } + }; +} + +/** + * Run a node command asynchronously with optional stderr streaming. + * @param {string} cwd + * @param {string[]} args + * @param {{streamOutput?:boolean,onLine?:(payload:{stream:string,line:string})=>void,maxBufferBytes?:number}} [options] + * @returns {Promise<{stdout:string,stderr:string}>} + */ +export function runNodeAsync(cwd, args, options = {}) { + return new Promise((resolve, reject) => { + const child = execa(process.execPath, args, { + cwd, + reject: false, + stdio: ['ignore', 'pipe', 'pipe'] + }); + let stdout = ''; + let stderr = ''; + const streamOutput = options.streamOutput === true; + const onLine = typeof options.onLine === 'function' ? options.onLine : null; + const maxBufferBytes = Number.isFinite(Number(options.maxBufferBytes)) + ? Math.max(0, Number(options.maxBufferBytes)) + : 1024 * 1024; + const appendLimited = (current, text) => { + if (!maxBufferBytes) return current + text; + const combined = current + text; + if (combined.length <= maxBufferBytes) return combined; + return combined.slice(combined.length - maxBufferBytes); + }; + const stdoutBuffer = onLine + ? createLineBuffer((line) => onLine({ stream: 'stdout', line })) + : null; + const stderrBuffer = onLine + ? createLineBuffer((line) => onLine({ stream: 'stderr', line })) + : null; + child.stdout?.on('data', (chunk) => { + const text = chunk.toString(); + stdout = appendLimited(stdout, text); + if (streamOutput) process.stderr.write(text); + stdoutBuffer?.push(text); + }); + child.stderr?.on('data', (chunk) => { + const text = chunk.toString(); + stderr = appendLimited(stderr, text); + if (streamOutput) process.stderr.write(text); + stderrBuffer?.push(text); + }); + child + .then((result) => { + stdoutBuffer?.flush(); + stderrBuffer?.flush(); + if (result.exitCode === 0) { + resolve({ stdout, stderr }); + return; + } + const error = new Error(stderr.trim() || `Command failed: ${args.join(' ')}`); + error.code = result.exitCode; + error.stdout = stdout; + error.stderr = stderr; + reject(error); + }) + .catch((err) => { + const error = new Error(err?.shortMessage || err?.message || 'Command failed'); + error.code = err?.exitCode; + error.stdout = err?.stdout || stdout; + error.stderr = err?.stderr || stderr; + reject(error); + }); + }); +} + +/** + * Run a tool script with progress notifications. 
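createLineBuffer above re-chunks arbitrary stdio writes into whole lines, which is what keeps per-line progress events intact when a line arrives split across chunks:

const lines = [];
const buf = createLineBuffer((line) => lines.push(line));
buf.push('down');            // no newline yet: buffered
buf.push('loaded=2\nskip');  // completes one line, buffers the rest
buf.flush();                 // emits whatever remains
// lines => ['downloaded=2', 'skip']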
+ * @param {{repoPath:string,scriptArgs:string[],context?:object,startMessage?:string,doneMessage?:string}} input + * @returns {Promise} + */ +export async function runToolWithProgress({ repoPath, scriptArgs, context = {}, startMessage, doneMessage }) { + const progress = typeof context.progress === 'function' ? context.progress : null; + const progressLine = progress + ? ({ stream, line }) => progress({ message: line, stream }) + : null; + if (progress && startMessage) { + progress({ message: startMessage, phase: 'start' }); + } + const { stdout } = await runNodeAsync(repoPath, scriptArgs, { + streamOutput: true, + onLine: progressLine + }); + if (progress && doneMessage) { + progress({ message: doneMessage, phase: 'done' }); + } + return stdout || ''; +} + +export function parseCountSummary(stdout) { + const match = String(stdout || '').match(/downloaded=(\d+)\s+skipped=(\d+)/i); + if (!match) return null; + return { + downloaded: Number(match[1]), + skipped: Number(match[2]) + }; +} + +export function parseExtensionPath(stdout) { + const match = String(stdout || '').match(/Extension present at (.+)$/im); + return match ? match[1].trim() : null; +} + +export const withTimeout = async (promise, timeoutMs, { label, onTimeout } = {}) => { + if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) { + return await promise; + } + let timer = null; + const timeoutPromise = new Promise((_, reject) => { + timer = setTimeout(() => { + onTimeout?.(); + incTimeout({ surface: 'mcp', operation: 'tool' }); + const error = new Error(`Tool timeout after ${timeoutMs}ms (${label || 'tool'}).`); + error.code = ERROR_CODES.TOOL_TIMEOUT; + error.timeoutMs = timeoutMs; + reject(error); + }, timeoutMs); + }); + try { + return await Promise.race([promise, timeoutPromise]); + } finally { + if (timer) clearTimeout(timer); + } +}; diff --git a/tools/mcp/tools.js b/tools/mcp/tools.js new file mode 100644 index 000000000..c107e36d2 --- /dev/null +++ b/tools/mcp/tools.js @@ -0,0 +1,676 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { + DEFAULT_MODEL_ID, + getModelConfig, + loadUserConfig, + resolveToolRoot +} from '../dict-utils.js'; +import { buildIndex as coreBuildIndex, buildSqliteIndex as coreBuildSqliteIndex, search as coreSearch, status as coreStatus } from '../../src/integrations/core/index.js'; +import { clearRepoCaches, configStatus, getRepoCaches, indexStatus, resolveRepoPath } from './repo.js'; +import { parseCountSummary, parseExtensionPath, runNodeAsync, runNodeSync, runToolWithProgress } from './runner.js'; + +const toolRoot = resolveToolRoot(); + +/** + * Normalize meta filters into CLI-friendly key/value strings. + * @param {any} meta + * @returns {string[]|null} + */ +function normalizeMetaFilters(meta) { + if (!meta) return null; + if (Array.isArray(meta)) { + const entries = meta.flatMap((entry) => { + if (entry == null) return []; + if (typeof entry === 'string') return [entry]; + if (typeof entry === 'object') { + return Object.entries(entry).map(([key, value]) => + value == null || value === '' ? String(key) : `${key}=${value}` + ); + } + return [String(entry)]; + }); + return entries.length ? entries : null; + } + if (typeof meta === 'object') { + const entries = Object.entries(meta).map(([key, value]) => + value == null || value === '' ? String(key) : `${key}=${value}` + ); + return entries.length ? entries : null; + } + return [String(meta)]; +} + +/** + * Restore CI artifacts if present. 
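normalizeMetaFilters accepts strings, arrays, or plain objects and flattens them into the key=value form the downstream CLI flags expect:

normalizeMetaFilters({ severity: 'high', cve: null });
// => ['severity=high', 'cve']
normalizeMetaFilters(['env=prod', { team: 'infra' }]);
// => ['env=prod', 'team=infra']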
+ * @param {string} repoPath + * @param {string} artifactsDir + * @returns {boolean} + */ +function maybeRestoreArtifacts(repoPath, artifactsDir, progress) { + const fromDir = artifactsDir ? path.resolve(artifactsDir) : path.join(repoPath, 'ci-artifacts'); + if (!fs.existsSync(path.join(fromDir, 'manifest.json'))) return false; + if (progress) { + progress({ + message: `Restoring CI artifacts from ${fromDir}`, + phase: 'start' + }); + } + runNodeSync(repoPath, [path.join(toolRoot, 'tools', 'ci-restore-artifacts.js'), '--repo', repoPath, '--from', fromDir]); + if (progress) { + progress({ + message: 'CI artifacts restored.', + phase: 'done' + }); + } + return true; +} + +/** + * Handle the MCP build_index tool call. + * @param {object} [args] + * @returns {object} + */ +export async function buildIndex(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const userConfig = loadUserConfig(repoPath); + const sqliteConfigured = userConfig.sqlite?.use !== false; + const shouldUseSqlite = typeof args.sqlite === 'boolean' ? args.sqlite : sqliteConfigured; + const mode = args.mode || 'all'; + const incremental = args.incremental === true; + const stubEmbeddings = args.stubEmbeddings === true; + const buildSqlite = shouldUseSqlite && mode !== 'records'; + const useArtifacts = args.useArtifacts === true; + const progress = typeof context.progress === 'function' ? context.progress : null; + + let restoredArtifacts = false; + if (useArtifacts) { + restoredArtifacts = maybeRestoreArtifacts(repoPath, args.artifactsDir, progress); + } + + if (!restoredArtifacts) { + if (progress) { + progress({ + message: `Building ${mode} index${incremental ? ' (incremental)' : ''}.`, + phase: 'start' + }); + } + await coreBuildIndex(repoPath, { + mode, + incremental, + stubEmbeddings, + sqlite: buildSqlite, + emitOutput: true + }); + } + + if (buildSqlite) { + if (progress) { + progress({ + message: `Building SQLite index${incremental ? ' (incremental)' : ''}.`, + phase: 'start' + }); + } + await coreBuildSqliteIndex(repoPath, { + incremental, + emitOutput: true + }); + } + if (progress) { + progress({ + message: 'Index build complete.', + phase: 'done' + }); + } + clearRepoCaches(repoPath); + + return { + repoPath, + mode, + sqlite: buildSqlite, + incremental, + restoredArtifacts + }; +} + +/** + * Handle the MCP search tool call. + * @param {object} [args] + * @returns {object} + */ +export async function runSearch(args = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const query = String(args.query || '').trim(); + if (!query) throw new Error('Query is required.'); + + const mode = args.mode || 'both'; + const backend = args.backend || null; + const output = typeof args.output === 'string' ? args.output.toLowerCase() : ''; + const ann = typeof args.ann === 'boolean' ? args.ann : null; + const top = Number.isFinite(Number(args.top)) ? Math.max(1, Number(args.top)) : null; + const context = Number.isFinite(Number(args.context)) ? Math.max(0, Number(args.context)) : null; + const typeFilter = args.type ? String(args.type) : null; + const authorFilter = args.author ? String(args.author) : null; + const importFilter = args.import ? String(args.import) : null; + const callsFilter = args.calls ? String(args.calls) : null; + const usesFilter = args.uses ? String(args.uses) : null; + const signatureFilter = args.signature ? String(args.signature) : null; + const paramFilter = args.param ? String(args.param) : null; + const decoratorFilter = args.decorator ? 
String(args.decorator) : null; + const inferredTypeFilter = args.inferredType ? String(args.inferredType) : null; + const returnTypeFilter = args.returnType ? String(args.returnType) : null; + const throwsFilter = args.throws ? String(args.throws) : null; + const readsFilter = args.reads ? String(args.reads) : null; + const writesFilter = args.writes ? String(args.writes) : null; + const mutatesFilter = args.mutates ? String(args.mutates) : null; + const aliasFilter = args.alias ? String(args.alias) : null; + const awaitsFilter = args.awaits ? String(args.awaits) : null; + const riskFilter = args.risk ? String(args.risk) : null; + const riskTagFilter = args.riskTag ? String(args.riskTag) : null; + const riskSourceFilter = args.riskSource ? String(args.riskSource) : null; + const riskSinkFilter = args.riskSink ? String(args.riskSink) : null; + const riskCategoryFilter = args.riskCategory ? String(args.riskCategory) : null; + const riskFlowFilter = args.riskFlow ? String(args.riskFlow) : null; + const branchesMin = Number.isFinite(Number(args.branchesMin)) ? Number(args.branchesMin) : null; + const loopsMin = Number.isFinite(Number(args.loopsMin)) ? Number(args.loopsMin) : null; + const breaksMin = Number.isFinite(Number(args.breaksMin)) ? Number(args.breaksMin) : null; + const continuesMin = Number.isFinite(Number(args.continuesMin)) ? Number(args.continuesMin) : null; + const churnMin = Number.isFinite(Number(args.churnMin)) ? Number(args.churnMin) : null; + const chunkAuthorFilter = args.chunkAuthor ? String(args.chunkAuthor) : null; + const modifiedAfter = args.modifiedAfter ? String(args.modifiedAfter) : null; + const modifiedSince = Number.isFinite(Number(args.modifiedSince)) ? Number(args.modifiedSince) : null; + const visibilityFilter = args.visibility ? String(args.visibility) : null; + const extendsFilter = args.extends ? String(args.extends) : null; + const lintFilter = args.lint === true; + const asyncFilter = args.async === true; + const generatorFilter = args.generator === true; + const returnsFilter = args.returns === true; + const branchFilter = args.branch ? String(args.branch) : null; + const langFilter = args.lang ? String(args.lang) : null; + const caseAll = args.case === true; + const caseFile = args.caseFile === true || caseAll; + const caseTokens = args.caseTokens === true || caseAll; + const fileFilters = []; + const toList = (value) => (Array.isArray(value) ? value : (value == null ? [] : [value])); + fileFilters.push(...toList(args.path)); + fileFilters.push(...toList(args.file)); + const extFilters = toList(args.ext); + const metaFilters = normalizeMetaFilters(args.meta); + const metaJson = args.metaJson || null; + + const useCompact = output !== 'full' && output !== 'json'; + const searchArgs = [useCompact ? 
'--json-compact' : '--json', '--repo', repoPath]; + if (mode && mode !== 'both') searchArgs.push('--mode', mode); + if (backend) searchArgs.push('--backend', backend); + if (ann === true) searchArgs.push('--ann'); + if (ann === false) searchArgs.push('--no-ann'); + if (top) searchArgs.push('-n', String(top)); + if (context !== null) searchArgs.push('--context', String(context)); + if (typeFilter) searchArgs.push('--type', typeFilter); + if (authorFilter) searchArgs.push('--author', authorFilter); + if (importFilter) searchArgs.push('--import', importFilter); + if (callsFilter) searchArgs.push('--calls', callsFilter); + if (usesFilter) searchArgs.push('--uses', usesFilter); + if (signatureFilter) searchArgs.push('--signature', signatureFilter); + if (paramFilter) searchArgs.push('--param', paramFilter); + if (decoratorFilter) searchArgs.push('--decorator', decoratorFilter); + if (inferredTypeFilter) searchArgs.push('--inferred-type', inferredTypeFilter); + if (returnTypeFilter) searchArgs.push('--return-type', returnTypeFilter); + if (throwsFilter) searchArgs.push('--throws', throwsFilter); + if (readsFilter) searchArgs.push('--reads', readsFilter); + if (writesFilter) searchArgs.push('--writes', writesFilter); + if (mutatesFilter) searchArgs.push('--mutates', mutatesFilter); + if (aliasFilter) searchArgs.push('--alias', aliasFilter); + if (awaitsFilter) searchArgs.push('--awaits', awaitsFilter); + if (riskFilter) searchArgs.push('--risk', riskFilter); + if (riskTagFilter) searchArgs.push('--risk-tag', riskTagFilter); + if (riskSourceFilter) searchArgs.push('--risk-source', riskSourceFilter); + if (riskSinkFilter) searchArgs.push('--risk-sink', riskSinkFilter); + if (riskCategoryFilter) searchArgs.push('--risk-category', riskCategoryFilter); + if (riskFlowFilter) searchArgs.push('--risk-flow', riskFlowFilter); + if (branchesMin !== null) searchArgs.push('--branches', String(branchesMin)); + if (loopsMin !== null) searchArgs.push('--loops', String(loopsMin)); + if (breaksMin !== null) searchArgs.push('--breaks', String(breaksMin)); + if (continuesMin !== null) searchArgs.push('--continues', String(continuesMin)); + if (churnMin !== null) searchArgs.push('--churn', String(churnMin)); + if (chunkAuthorFilter) searchArgs.push('--chunk-author', chunkAuthorFilter); + if (modifiedAfter) searchArgs.push('--modified-after', modifiedAfter); + if (modifiedSince !== null) searchArgs.push('--modified-since', String(modifiedSince)); + if (visibilityFilter) searchArgs.push('--visibility', visibilityFilter); + if (extendsFilter) searchArgs.push('--extends', extendsFilter); + if (lintFilter) searchArgs.push('--lint'); + if (asyncFilter) searchArgs.push('--async'); + if (generatorFilter) searchArgs.push('--generator'); + if (returnsFilter) searchArgs.push('--returns'); + if (branchFilter) searchArgs.push('--branch', branchFilter); + if (langFilter) searchArgs.push('--lang', langFilter); + if (caseAll) searchArgs.push('--case'); + if (!caseAll && caseFile) searchArgs.push('--case-file'); + if (!caseAll && caseTokens) searchArgs.push('--case-tokens'); + for (const entry of fileFilters) { + if (entry == null || entry === '') continue; + searchArgs.push('--path', String(entry)); + } + for (const entry of extFilters) { + if (entry == null || entry === '') continue; + searchArgs.push('--ext', String(entry)); + } + if (Array.isArray(metaFilters)) { + metaFilters.forEach((entry) => searchArgs.push('--meta', entry)); + } + if (metaJson) { + const jsonValue = typeof metaJson === 'string' ? 
metaJson : JSON.stringify(metaJson); + searchArgs.push('--meta-json', jsonValue); + } + + const caches = getRepoCaches(repoPath); + return await coreSearch(repoPath, { + args: searchArgs, + query, + emitOutput: false, + exitOnError: false, + indexCache: caches.indexCache, + sqliteCache: caches.sqliteCache + }); +} + +/** + * Handle the MCP download_models tool call. + * @param {object} [args] + * @returns {{model:string,output:string}} + */ +export async function downloadModels(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const userConfig = loadUserConfig(repoPath); + const modelConfig = getModelConfig(repoPath, userConfig); + const model = args.model || modelConfig.id || DEFAULT_MODEL_ID; + const scriptArgs = [path.join(toolRoot, 'tools', 'download-models.js'), '--model', model, '--repo', repoPath]; + if (args.cacheDir) scriptArgs.push('--cache-dir', args.cacheDir); + const progress = typeof context.progress === 'function' ? context.progress : null; + const progressLine = progress + ? ({ stream, line }) => progress({ message: line, stream }) + : null; + if (progress) { + progress({ message: `Downloading model ${model}.`, phase: 'start' }); + } + const { stdout } = await runNodeAsync(repoPath, scriptArgs, { + streamOutput: true, + onLine: progressLine + }); + if (progress) { + progress({ message: `Model download complete (${model}).`, phase: 'done' }); + } + return { model, output: stdout.trim() }; +} + +/** + * Handle the MCP download_dictionaries tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function downloadDictionaries(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const scriptArgs = [path.join(toolRoot, 'tools', 'download-dicts.js'), '--repo', repoPath]; + if (args.lang) scriptArgs.push('--lang', String(args.lang)); + const urls = Array.isArray(args.url) ? args.url : (args.url ? [args.url] : []); + urls.forEach((value) => scriptArgs.push('--url', String(value))); + if (args.dir) scriptArgs.push('--dir', String(args.dir)); + if (args.update === true) scriptArgs.push('--update'); + if (args.force === true) scriptArgs.push('--force'); + const stdout = await runToolWithProgress({ + repoPath, + scriptArgs, + context, + startMessage: 'Downloading dictionaries.', + doneMessage: 'Dictionary download complete.' + }); + const summary = parseCountSummary(stdout); + return { + repoPath, + output: stdout.trim(), + ...(summary || {}) + }; +} + +/** + * Handle the MCP download_extensions tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function downloadExtensions(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const scriptArgs = [path.join(toolRoot, 'tools', 'download-extensions.js'), '--repo', repoPath]; + if (args.provider) scriptArgs.push('--provider', String(args.provider)); + if (args.dir) scriptArgs.push('--dir', String(args.dir)); + if (args.out) scriptArgs.push('--out', String(args.out)); + if (args.platform) scriptArgs.push('--platform', String(args.platform)); + if (args.arch) scriptArgs.push('--arch', String(args.arch)); + const urls = Array.isArray(args.url) ? args.url : (args.url ? 
[args.url] : []); + urls.forEach((value) => scriptArgs.push('--url', String(value))); + if (args.update === true) scriptArgs.push('--update'); + if (args.force === true) scriptArgs.push('--force'); + const stdout = await runToolWithProgress({ + repoPath, + scriptArgs, + context, + startMessage: 'Downloading extensions.', + doneMessage: 'Extension download complete.' + }); + const summary = parseCountSummary(stdout); + const resolvedPath = parseExtensionPath(stdout); + return { + repoPath, + output: stdout.trim(), + extensionPath: resolvedPath, + ...(summary || {}) + }; +} + +/** + * Handle the MCP verify_extensions tool call. + * @param {object} [args] + * @returns {object} + */ +export function verifyExtensions(args = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const scriptArgs = [path.join(toolRoot, 'tools', 'verify-extensions.js'), '--json', '--repo', repoPath]; + if (args.provider) scriptArgs.push('--provider', String(args.provider)); + if (args.dir) scriptArgs.push('--dir', String(args.dir)); + if (args.path) scriptArgs.push('--path', String(args.path)); + if (args.platform) scriptArgs.push('--platform', String(args.platform)); + if (args.arch) scriptArgs.push('--arch', String(args.arch)); + if (args.module) scriptArgs.push('--module', String(args.module)); + if (args.table) scriptArgs.push('--table', String(args.table)); + if (args.column) scriptArgs.push('--column', String(args.column)); + if (args.encoding) scriptArgs.push('--encoding', String(args.encoding)); + if (args.options) scriptArgs.push('--options', String(args.options)); + if (args.annMode) scriptArgs.push('--ann-mode', String(args.annMode)); + if (args.load === false) scriptArgs.push('--no-load'); + const stdout = runNodeSync(repoPath, scriptArgs); + try { + return JSON.parse(stdout || '{}'); + } catch { + return { repoPath, output: stdout.trim() }; + } +} + +/** + * Handle the MCP build_sqlite_index tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function buildSqliteIndex(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const progress = typeof context.progress === 'function' ? context.progress : null; + if (progress) { + progress({ message: 'Building SQLite index.', phase: 'start' }); + } + const payload = await coreBuildSqliteIndex(repoPath, { + mode: args.mode, + incremental: args.incremental === true, + compact: args.compact === true, + codeDir: args.codeDir, + proseDir: args.proseDir, + out: args.out, + emitOutput: true, + exitOnError: false + }); + clearRepoCaches(repoPath); + if (progress) { + progress({ message: 'SQLite index build complete.', phase: 'done' }); + } + return payload; +} + +/** + * Handle the MCP compact_sqlite_index tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function compactSqliteIndex(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const scriptArgs = [path.join(toolRoot, 'tools', 'compact-sqlite-index.js'), '--repo', repoPath]; + if (args.mode) scriptArgs.push('--mode', String(args.mode)); + if (args.dryRun === true) scriptArgs.push('--dry-run'); + if (args.keepBackup === true) scriptArgs.push('--keep-backup'); + const stdout = await runToolWithProgress({ + repoPath, + scriptArgs, + context, + startMessage: 'Compacting SQLite index.', + doneMessage: 'SQLite compaction complete.' + }); + return { repoPath, output: stdout.trim() }; +} + +/** + * Handle the MCP cache_gc tool call. 
+ * @param {object} [args] + * @returns {object} + */ +export function cacheGc(args = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const scriptArgs = [path.join(toolRoot, 'tools', 'cache-gc.js'), '--json', '--repo', repoPath]; + if (args.dryRun === true) scriptArgs.push('--dry-run'); + if (Number.isFinite(Number(args.maxBytes))) scriptArgs.push('--max-bytes', String(args.maxBytes)); + if (Number.isFinite(Number(args.maxGb))) scriptArgs.push('--max-gb', String(args.maxGb)); + if (Number.isFinite(Number(args.maxAgeDays))) scriptArgs.push('--max-age-days', String(args.maxAgeDays)); + const stdout = runNodeSync(repoPath, scriptArgs); + try { + return JSON.parse(stdout || '{}'); + } catch { + return { repoPath, output: stdout.trim() }; + } +} + +/** + * Handle the MCP clean_artifacts tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function cleanArtifacts(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const scriptArgs = [path.join(toolRoot, 'tools', 'clean-artifacts.js'), '--repo', repoPath]; + if (args.all === true) scriptArgs.push('--all'); + if (args.dryRun === true) scriptArgs.push('--dry-run'); + const stdout = await runToolWithProgress({ + repoPath, + scriptArgs, + context, + startMessage: 'Cleaning artifacts.', + doneMessage: 'Artifact cleanup complete.' + }); + return { repoPath, output: stdout.trim() }; +} + +/** + * Handle the MCP bootstrap tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function runBootstrap(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const scriptArgs = [path.join(toolRoot, 'tools', 'bootstrap.js'), '--repo', repoPath]; + if (args.skipInstall === true) scriptArgs.push('--skip-install'); + if (args.skipDicts === true) scriptArgs.push('--skip-dicts'); + if (args.skipIndex === true) scriptArgs.push('--skip-index'); + if (args.skipArtifacts === true) scriptArgs.push('--skip-artifacts'); + if (args.skipTooling === true) scriptArgs.push('--skip-tooling'); + if (args.withSqlite === true) scriptArgs.push('--with-sqlite'); + if (args.incremental === true) scriptArgs.push('--incremental'); + const stdout = await runToolWithProgress({ + repoPath, + scriptArgs, + context, + startMessage: 'Bootstrapping repo.', + doneMessage: 'Bootstrap complete.' + }); + return { repoPath, output: stdout.trim() }; +} + +/** + * Handle the MCP report_artifacts tool call. + * @param {object} [args] + * @returns {object} + */ +export async function reportArtifacts(args = {}) { + const repoPath = resolveRepoPath(args.repoPath); + return coreStatus(repoPath); +} + +/** + * Handle the MCP triage_ingest tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function triageIngest(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const source = String(args.source || '').trim(); + const inputPath = String(args.inputPath || '').trim(); + if (!source || !inputPath) { + throw new Error('source and inputPath are required.'); + } + const resolvedInput = path.isAbsolute(inputPath) ? inputPath : path.join(repoPath, inputPath); + const metaFilters = normalizeMetaFilters(args.meta); + const ingestArgs = [path.join(toolRoot, 'tools', 'triage', 'ingest.js'), '--source', source, '--in', resolvedInput]; + ingestArgs.push('--repo', repoPath); + if (Array.isArray(metaFilters)) { + metaFilters.forEach((entry) => ingestArgs.push('--meta', entry)); + } + const progress = typeof context.progress === 'function' ? 
context.progress : null; + const progressLine = progress + ? ({ stream, line }) => progress({ message: line, stream }) + : null; + if (progress) { + progress({ message: `Ingesting ${source} findings.`, phase: 'start' }); + } + const { stdout } = await runNodeAsync(repoPath, ingestArgs, { streamOutput: true, onLine: progressLine }); + let payload = {}; + try { + payload = JSON.parse(stdout || '{}'); + } catch (error) { + throw new Error(`Failed to parse ingest output: ${error?.message || error}`); + } + if (args.buildIndex) { + await buildIndex({ + repoPath, + mode: 'records', + incremental: args.incremental === true, + stubEmbeddings: args.stubEmbeddings === true, + sqlite: false + }, context); + } + if (progress) { + progress({ message: 'Triage ingest complete.', phase: 'done' }); + } + return payload; +} + +/** + * Handle the MCP triage_decision tool call. + * @param {object} [args] + * @returns {object} + */ +export function triageDecision(args = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const finding = String(args.finding || '').trim(); + const status = String(args.status || '').trim(); + if (!finding || !status) { + throw new Error('finding and status are required.'); + } + const metaFilters = normalizeMetaFilters(args.meta); + const decisionArgs = [path.join(toolRoot, 'tools', 'triage', 'decision.js'), '--finding', finding, '--status', status]; + decisionArgs.push('--repo', repoPath); + if (args.justification) decisionArgs.push('--justification', String(args.justification)); + if (args.reviewer) decisionArgs.push('--reviewer', String(args.reviewer)); + if (args.expires) decisionArgs.push('--expires', String(args.expires)); + if (Array.isArray(metaFilters)) { + metaFilters.forEach((entry) => decisionArgs.push('--meta', entry)); + } + const codes = Array.isArray(args.codes) ? args.codes : (args.codes ? [args.codes] : []); + const evidence = Array.isArray(args.evidence) ? args.evidence : (args.evidence ? [args.evidence] : []); + codes.filter(Boolean).forEach((code) => decisionArgs.push('--code', String(code))); + evidence.filter(Boolean).forEach((item) => decisionArgs.push('--evidence', String(item))); + const stdout = runNodeSync(repoPath, decisionArgs); + return JSON.parse(stdout || '{}'); +} + +/** + * Handle the MCP triage_context_pack tool call. + * @param {object} [args] + * @returns {Promise} + */ +export async function triageContextPack(args = {}, context = {}) { + const repoPath = resolveRepoPath(args.repoPath); + const recordId = String(args.recordId || '').trim(); + if (!recordId) throw new Error('recordId is required.'); + const contextArgs = [path.join(toolRoot, 'tools', 'triage', 'context-pack.js'), '--record', recordId]; + contextArgs.push('--repo', repoPath); + if (args.outPath) contextArgs.push('--out', String(args.outPath)); + if (args.ann === true) contextArgs.push('--ann'); + if (args.ann === false) contextArgs.push('--no-ann'); + if (args.stubEmbeddings === true) contextArgs.push('--stub-embeddings'); + const progress = typeof context.progress === 'function' ? context.progress : null; + const progressLine = progress + ? 
({ stream, line }) => progress({ message: line, stream }) + : null; + if (progress) { + progress({ message: 'Building triage context pack.', phase: 'start' }); + } + const { stdout } = await runNodeAsync(repoPath, contextArgs, { streamOutput: true, onLine: progressLine }); + if (progress) { + progress({ message: 'Context pack ready.', phase: 'done' }); + } + try { + return JSON.parse(stdout || '{}'); + } catch (error) { + throw new Error(`Failed to parse context pack output: ${error?.message || error}`); + } +} + +/** + * Dispatch an MCP tool call by name. + * @param {string} name + * @param {object} args + * @returns {Promise} + */ +export async function handleToolCall(name, args, context = {}) { + switch (name) { + case 'index_status': + return await indexStatus(args); + case 'config_status': + return await configStatus(args); + case 'build_index': + return await buildIndex(args, context); + case 'search': + return await runSearch(args); + case 'download_models': + return await downloadModels(args, context); + case 'download_dictionaries': + return await downloadDictionaries(args, context); + case 'download_extensions': + return await downloadExtensions(args, context); + case 'verify_extensions': + return verifyExtensions(args); + case 'build_sqlite_index': + return await buildSqliteIndex(args, context); + case 'compact_sqlite_index': + return await compactSqliteIndex(args, context); + case 'cache_gc': + return cacheGc(args); + case 'clean_artifacts': + return await cleanArtifacts(args, context); + case 'bootstrap': + return await runBootstrap(args, context); + case 'report_artifacts': + return await reportArtifacts(args); + case 'triage_ingest': + return await triageIngest(args, context); + case 'triage_decision': + return triageDecision(args); + case 'triage_context_pack': + return await triageContextPack(args, context); + default: + throw new Error(`Unknown tool: ${name}`); + } +} diff --git a/tools/mcp/transport.js b/tools/mcp/transport.js new file mode 100644 index 000000000..f9ce844aa --- /dev/null +++ b/tools/mcp/transport.js @@ -0,0 +1,216 @@ +import { StreamMessageReader } from 'vscode-jsonrpc'; +import { closeOutput, sendError, sendNotification, sendResult } from '../../src/integrations/mcp/protocol.js'; +import { ERROR_CODES } from '../../src/shared/error-codes.js'; +import { logError } from '../../src/shared/progress.js'; +import { withTimeout } from './runner.js'; + +/** + * Map common tool failure output to a remediation hint. 
+ * @param {any} error + * @returns {{message:string,code?:number,stderr?:string,stdout?:string}} + */ +function getRemediationHint(error) { + const parts = [error?.message, error?.stderr, error?.stdout] + .filter(Boolean) + .join('\n') + .toLowerCase(); + if (!parts) return null; + + if (parts.includes('sqlite backend requested but index not found') + || parts.includes('missing required tables')) { + return 'Run `npm run build-sqlite-index` or set sqlite.use=false / --backend memory.'; + } + if (parts.includes('better-sqlite3 is required')) { + return 'Run `npm install` and ensure better-sqlite3 can load on this platform.'; + } + if (parts.includes('chunk_meta.json') + || parts.includes('minhash_signatures') + || parts.includes('index not found') + || parts.includes('build-index') + || parts.includes('build index')) { + return 'Run `npm run build-index` (or `npm run setup`/`npm run bootstrap`) to generate indexes.'; + } + if ((parts.includes('model') || parts.includes('xenova') || parts.includes('transformers')) + && (parts.includes('not found') || parts.includes('failed') || parts.includes('fetch') || parts.includes('download') || parts.includes('enoent'))) { + return 'Run `npm run download-models` or use `--stub-embeddings` / `PAIROFCLEATS_EMBEDDINGS=stub`.'; + } + if (parts.includes('dictionary') + || parts.includes('wordlist') + || parts.includes('words_alpha') + || parts.includes('download-dicts')) { + return 'Run `npm run download-dicts -- --lang en` (or configure dictionary.files/languages).'; + } + return null; +} + +/** + * Format error payloads for tool responses. + * @param {any} error + * @returns {{message:string,code?:number,stderr?:string,stdout?:string,hint?:string}} + */ +function formatToolError(error) { + const payload = { + message: error?.message || String(error) + }; + if (error?.code !== undefined) payload.code = error.code; + if (error?.stderr) payload.stderr = String(error.stderr).trim(); + if (error?.stdout) payload.stdout = String(error.stdout).trim(); + if (error?.timeoutMs) payload.timeoutMs = error.timeoutMs; + const hint = getRemediationHint(error); + if (hint) payload.hint = hint; + return payload; +} + +/** + * Emit a progress notification for long-running tools. + * @param {string|number|null} id + * @param {string} tool + * @param {{message:string,stream?:string,phase?:string}} payload + */ +function sendProgress(id, tool, payload) { + if (id === null || id === undefined) return; + const message = payload?.message ? String(payload.message) : ''; + if (!message) return; + sendNotification('notifications/progress', { + id, + tool, + message, + stream: payload?.stream || 'info', + phase: payload?.phase || 'progress', + ts: new Date().toISOString() + }); +} + +/** + * Start the MCP stdio transport. + * @param {{toolDefs:any,serverInfo:{name:string,version:string},handleToolCall:Function,resolveToolTimeoutMs:Function,queueMax:number}} config + */ +export const createMcpTransport = ({ toolDefs, serverInfo, handleToolCall, resolveToolTimeoutMs, queueMax }) => { + let processing = false; + const queue = []; + + /** + * Handle a JSON-RPC message from stdin. 
+ * @param {object} message + * @returns {Promise} + */ + async function handleMessage(message) { + if (!message || message.jsonrpc !== '2.0') return; + const { id, method, params } = message; + + if (method === 'initialize') { + sendResult(id, { + protocolVersion: '2024-11-05', + serverInfo, + capabilities: { + tools: { listChanged: false }, + resources: { listChanged: false } + } + }); + return; + } + + if (method === 'shutdown') { + sendResult(id, {}); + return; + } + + if (method === 'exit') { + process.exit(0); + } + + if (method === 'tools/list') { + sendResult(id, { tools: toolDefs }); + return; + } + + if (method === 'resources/list') { + sendResult(id, { resources: [] }); + return; + } + + if (method === 'tools/call') { + if (!id) return; + const name = params?.name; + const args = params?.arguments || {}; + const timeoutMs = resolveToolTimeoutMs(name, args); + try { + let timedOut = false; + const progress = (payload) => { + if (timedOut) return; + sendProgress(id, name, payload); + }; + const result = await withTimeout( + handleToolCall(name, args, { progress, toolCallId: id }), + timeoutMs, + { label: name, onTimeout: () => { timedOut = true; } } + ); + sendResult(id, { + content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] + }); + } catch (error) { + const payload = formatToolError(error); + if (error?.code === 'TOOL_TIMEOUT' && timeoutMs) { + payload.timeoutMs = timeoutMs; + } + sendResult(id, { + content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }], + isError: true + }); + } + return; + } + + if (id) { + sendError(id, -32601, `Method not found: ${method}`); + } + } + + /** + * Process queued messages serially. + */ + function processQueue() { + if (processing) return; + processing = true; + const run = async () => { + while (queue.length) { + const msg = queue.shift(); + await handleMessage(msg); + } + processing = false; + }; + run().catch((error) => { + processing = false; + logError('[mcp] queue error', { error: error?.message || String(error) }); + }); + } + + /** + * Enqueue a message for processing. + * @param {object} message + */ + function enqueueMessage(message) { + const inFlight = processing ? 1 : 0; + if (queue.length + inFlight >= queueMax) { + if (message?.id !== undefined && message?.id !== null) { + sendError(message.id, -32001, 'Server overloaded.', undefined, { code: ERROR_CODES.QUEUE_OVERLOADED }); + } + return; + } + queue.push(message); + processQueue(); + } + + const start = () => { + const reader = new StreamMessageReader(process.stdin); + reader.onError((err) => logError('[mcp] stream error', { error: err?.message || String(err) })); + reader.onClose(() => { + closeOutput(); + process.exit(0); + }); + reader.listen(enqueueMessage); + return reader; + }; + + return { start }; +}; diff --git a/tools/merge-history.sh b/tools/merge-history.sh index 1fe08279a..4ee250c1c 100644 --- a/tools/merge-history.sh +++ b/tools/merge-history.sh @@ -15,6 +15,6 @@ if [ ! -f "$THEIRS_FILE" ]; then exit 0 fi -node "$(dirname "$0")/mergeSearchHistory.js" "$THEIRS_FILE" "$OURS_FILE" +node "$(dirname "$0")/mergeAppendOnly.js" "$THEIRS_FILE" "$OURS_FILE" exit 0 diff --git a/tools/merge-no-results.sh b/tools/merge-no-results.sh index fb2f84165..4ee250c1c 100644 --- a/tools/merge-no-results.sh +++ b/tools/merge-no-results.sh @@ -15,6 +15,6 @@ if [ ! 
-f "$THEIRS_FILE" ]; then exit 0 fi -node "$(dirname "$0")/mergeNoResultQueries.js" "$THEIRS_FILE" "$OURS_FILE" +node "$(dirname "$0")/mergeAppendOnly.js" "$THEIRS_FILE" "$OURS_FILE" exit 0 diff --git a/tools/mergeNoResultQueries.js b/tools/mergeNoResultQueries.js deleted file mode 100644 index 8849a86bd..000000000 --- a/tools/mergeNoResultQueries.js +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env node -import { mergeAppendOnly } from './mergeAppendOnly.js'; - -const [baseFile, targetFile] = process.argv.slice(2); -if (!baseFile || !targetFile) { - console.error('usage: mergeNoResultQueries.js '); - process.exit(1); -} - -await mergeAppendOnly(baseFile, targetFile); diff --git a/tools/mergeSearchHistory.js b/tools/mergeSearchHistory.js deleted file mode 100644 index 2ca1eccd9..000000000 --- a/tools/mergeSearchHistory.js +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env node -import { mergeAppendOnly } from './mergeAppendOnly.js'; - -const [baseFile, targetFile] = process.argv.slice(2); -if (!baseFile || !targetFile) { - console.error('usage: mergeSearchHistory.js '); - process.exit(1); -} - -await mergeAppendOnly(baseFile, targetFile); diff --git a/tools/parity-matrix.js b/tools/parity-matrix.js new file mode 100644 index 000000000..33bacb168 --- /dev/null +++ b/tools/parity-matrix.js @@ -0,0 +1,277 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import { execa } from 'execa'; +import { createCli } from '../src/shared/cli.js'; +import { resolveToolRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'parity-matrix', + options: { + backend: { type: 'string' }, + backends: { type: 'string' }, + 'ann-modes': { type: 'string' }, + queries: { type: 'string' }, + 'queries-dir': { type: 'string' }, + top: { type: 'number' }, + limit: { type: 'number' }, + results: { type: 'string' }, + 'out-dir': { type: 'string' }, + search: { type: 'string' }, + 'dry-run': { type: 'boolean', default: false }, + 'fail-fast': { type: 'boolean', default: false } + } +}).parse(); + +const scriptRoot = resolveToolRoot(); +const parityScript = path.join(scriptRoot, 'tests', 'parity.js'); +const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); +const resultsRoot = path.resolve( + argv.results || path.join(scriptRoot, 'benchmarks', 'results') +); +const runRoot = path.resolve( + argv['out-dir'] || path.join(resultsRoot, 'parity', timestamp) +); +const logRoot = path.join(runRoot, 'logs'); +const outRoot = path.join(runRoot, 'runs'); + +const DEFAULT_BACKENDS = ['sqlite', 'sqlite-fts']; +const DEFAULT_ANN_MODES = ['on', 'off']; +const DEFAULT_TOP = 10; + +const parseList = (value) => { + if (!value) return []; + return String(value) + .split(',') + .map((entry) => entry.trim()) + .filter(Boolean); +}; + +const normalizeBackend = (raw) => { + const value = String(raw || '').toLowerCase(); + if (value === 'fts') return 'sqlite-fts'; + return value; +}; + +const resolveBackends = () => { + const raw = argv.backends || argv.backend || ''; + const list = parseList(raw).map(normalizeBackend).filter(Boolean); + if (!list.length || list.includes('all')) return DEFAULT_BACKENDS.slice(); + return Array.from(new Set(list)); +}; + +const normalizeAnnMode = (raw) => { + const value = String(raw || '').toLowerCase(); + if (value === 'true' || value === '1' || value === 'on' || value === 'yes') { + return 'on'; + } + if (value === 'false' || value === '0' || value === 'off' || value === 'no') { + return 'off'; + } + return null; 
+}; + +const resolveAnnModes = () => { + const raw = parseList(argv['ann-modes']); + const modes = raw.map(normalizeAnnMode).filter(Boolean); + return modes.length ? Array.from(new Set(modes)) : DEFAULT_ANN_MODES.slice(); +}; + +const toSafeName = (value) => + String(value || '') + .replace(/[^a-z0-9-_]+/gi, '_') + .replace(/^_+|_+$/g, '') + .toLowerCase(); + +const appendArgs = (args, flag, value) => { + if (value === undefined || value === null || value === '') return; + args.push(flag, String(value)); +}; + +async function loadQueriesFromFile(filePath) { + const raw = await fsPromises.readFile(filePath, 'utf8'); + if (filePath.endsWith('.json')) { + const parsed = JSON.parse(raw); + if (Array.isArray(parsed)) { + return parsed + .map((entry) => { + if (typeof entry === 'string') return entry; + if (entry && typeof entry.query === 'string') return entry.query; + return null; + }) + .filter(Boolean); + } + if (Array.isArray(parsed.queries)) { + return parsed.queries + .map((entry) => { + if (typeof entry === 'string') return entry; + if (entry && typeof entry.query === 'string') return entry.query; + return null; + }) + .filter(Boolean); + } + return []; + } + return raw + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line && !line.startsWith('#')); +} + +async function resolveQueryFile() { + if (argv.queries) { + const resolved = path.resolve(argv.queries); + if (!fs.existsSync(resolved)) { + throw new Error(`Query file not found: ${resolved}`); + } + const queries = await loadQueriesFromFile(resolved); + return { path: resolved, source: 'file', count: queries.length }; + } + + const queriesDir = path.resolve( + argv['queries-dir'] || path.join(scriptRoot, 'benchmarks', 'queries') + ); + const entries = await fsPromises.readdir(queriesDir, { withFileTypes: true }); + const files = entries + .filter((entry) => entry.isFile() && entry.name.endsWith('.txt')) + .map((entry) => entry.name) + .sort(); + if (!files.length) { + throw new Error(`No query files found in ${queriesDir}`); + } + + const seen = new Set(); + const combined = []; + for (const file of files) { + const filePath = path.join(queriesDir, file); + const lines = await loadQueriesFromFile(filePath); + for (const line of lines) { + if (seen.has(line)) continue; + seen.add(line); + combined.push(line); + } + } + + if (!combined.length) { + throw new Error(`No queries resolved from ${queriesDir}`); + } + + const outPath = path.join(runRoot, 'parity-queries.txt'); + const header = [ + '# Generated from benchmarks/queries/*.txt', + `# Source: ${queriesDir}`, + '' + ]; + await fsPromises.writeFile(outPath, `${header.join('\n')}${combined.join('\n')}\n`); + return { path: outPath, source: queriesDir, count: combined.length }; +} + +const configToArgs = (config, queryFile, outFile, top, limit) => { + const args = [parityScript]; + appendArgs(args, '--sqlite-backend', config.backend); + appendArgs(args, '--queries', queryFile); + appendArgs(args, '--top', top); + appendArgs(args, '--limit', limit); + appendArgs(args, '--search', argv.search); + args.push('--write-report'); + appendArgs(args, '--out', outFile); + if (config.annMode === 'on') args.push('--ann'); + if (config.annMode === 'off') args.push('--no-ann'); + return args; +}; + +async function main() { + await fsPromises.mkdir(logRoot, { recursive: true }); + await fsPromises.mkdir(outRoot, { recursive: true }); + + const queryInfo = await resolveQueryFile(); + const top = Number.isFinite(Number(argv.top)) + ? 
Math.max(1, Number(argv.top)) + : DEFAULT_TOP; + const limit = Number.isFinite(Number(argv.limit)) + ? Math.max(0, Number(argv.limit)) + : 0; + + const backends = resolveBackends(); + const annModes = resolveAnnModes(); + const configs = []; + for (const backend of backends) { + for (const annMode of annModes) { + const id = toSafeName([backend, annMode].join('-')); + configs.push({ id, backend, annMode }); + } + } + + if (!configs.length) { + throw new Error('No parity configurations resolved.'); + } + + const results = []; + for (const config of configs) { + const label = `${config.backend}/${config.annMode}`; + const outFile = path.join(outRoot, `${config.id}.json`); + const logFile = path.join(logRoot, `${config.id}.log`); + const args = configToArgs(config, queryInfo.path, outFile, top, limit); + + console.log(`\n[parity-matrix] ${label}`); + console.log( + `node ${args.map((arg) => (arg.includes(' ') ? `"${arg}"` : arg)).join(' ')}` + ); + + if (argv['dry-run']) { + results.push({ ...config, outFile, logFile, status: 'dry-run' }); + continue; + } + + try { + const child = await execa(process.execPath, args, { all: true }); + if (child.all) process.stdout.write(child.all); + await fsPromises.writeFile(logFile, child.all || ''); + + let summary = null; + try { + const report = JSON.parse(await fsPromises.readFile(outFile, 'utf8')); + summary = report.summary || null; + } catch { + summary = null; + } + + results.push({ ...config, outFile, logFile, status: 'ok', summary }); + } catch (err) { + const output = err?.all || err?.stdout || err?.stderr || String(err); + if (output) process.stdout.write(output); + await fsPromises.writeFile(logFile, output || ''); + results.push({ + ...config, + outFile, + logFile, + status: 'failed', + exitCode: err?.exitCode ?? null, + error: err?.message || String(err) + }); + if (argv['fail-fast']) break; + } + } + + const matrix = { + generatedAt: new Date().toISOString(), + runRoot, + outRoot, + logRoot, + queryFile: queryInfo.path, + querySource: queryInfo.source, + queryCount: queryInfo.count, + top, + limit, + results + }; + const matrixPath = path.join(runRoot, 'matrix.json'); + await fsPromises.writeFile(matrixPath, JSON.stringify(matrix, null, 2)); + console.log(`\n[parity-matrix] summary written to ${matrixPath}`); +} + +main().catch((err) => { + console.error(err?.message || String(err)); + process.exit(1); +}); diff --git a/tools/release-check.js b/tools/release-check.js new file mode 100644 index 000000000..d928d0baa --- /dev/null +++ b/tools/release-check.js @@ -0,0 +1,63 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; + +const args = process.argv.slice(2); +const requireBreaking = args.includes('--breaking') + || process.env.PAIROFCLEATS_BREAKING === '1'; + +const root = process.cwd(); +const packagePath = path.join(root, 'package.json'); +const changelogPath = path.join(root, 'CHANGELOG.md'); + +if (!fs.existsSync(packagePath)) { + console.error('release-check: package.json not found.'); + process.exit(1); +} +if (!fs.existsSync(changelogPath)) { + console.error('release-check: CHANGELOG.md not found.'); + process.exit(1); +} + +const pkg = JSON.parse(fs.readFileSync(packagePath, 'utf8')); +const version = pkg?.version ? 
String(pkg.version).trim() : ''; +if (!version) { + console.error('release-check: package.json version missing.'); + process.exit(1); +} + +const changelog = fs.readFileSync(changelogPath, 'utf8'); +const headerRe = new RegExp(`^##\\s+v?${version.replace(/\./g, '\\.')}(\\b|\\s)`, 'm'); +const match = headerRe.exec(changelog); +if (!match) { + console.error(`release-check: CHANGELOG.md missing section for v${version}.`); + process.exit(1); +} + +const sectionStart = match.index; +const nextHeaderMatch = changelog.slice(sectionStart + match[0].length).match(/^##\s+/m); +const sectionEnd = nextHeaderMatch + ? sectionStart + match[0].length + nextHeaderMatch.index + : changelog.length; +const section = changelog.slice(sectionStart, sectionEnd); + +if (requireBreaking) { + const breakingHeader = section.match(/^###\s+Breaking\s*$/m); + if (!breakingHeader) { + console.error(`release-check: missing "### Breaking" section for v${version}.`); + process.exit(1); + } + const afterBreaking = section.slice(breakingHeader.index + breakingHeader[0].length); + const nextSubsection = afterBreaking.match(/^###\s+/m); + const breakingBlock = nextSubsection + ? afterBreaking.slice(0, nextSubsection.index) + : afterBreaking; + const bullets = breakingBlock.split('\n').map((line) => line.trim()).filter((line) => line.startsWith('-')); + const hasRealEntry = bullets.some((line) => !line.toLowerCase().includes('none')); + if (!bullets.length || !hasRealEntry) { + console.error(`release-check: add breaking change notes under v${version}.`); + process.exit(1); + } +} + +console.log(`release-check: changelog entry ok for v${version}.`); diff --git a/tools/repometrics-dashboard.js b/tools/repometrics-dashboard.js index 25fc6310a..5253b4fa6 100644 --- a/tools/repometrics-dashboard.js +++ b/tools/repometrics-dashboard.js @@ -2,14 +2,18 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; import { getMetricsDir, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['json'], - string: ['out', 'repo'], - default: { top: 5 } -}); +const argv = createCli({ + scriptName: 'repometrics-dashboard', + options: { + json: { type: 'boolean', default: false }, + out: { type: 'string' }, + repo: { type: 'string' }, + top: { type: 'number', default: 5 } + } +}).parse(); const rootArg = argv.repo ? 
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); diff --git a/tools/report-artifacts.js b/tools/report-artifacts.js index 061aa31c4..b7c570d76 100644 --- a/tools/report-artifacts.js +++ b/tools/report-artifacts.js @@ -1,50 +1,91 @@ #!/usr/bin/env node import fs from 'node:fs'; -import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import minimist from 'minimist'; -import { getCacheRoot, getDictConfig, getRepoCacheRoot, loadUserConfig, resolveRepoRoot, resolveSqlitePaths } from './dict-utils.js'; - -const argv = minimist(process.argv.slice(2), { - boolean: ['json', 'all'], - string: ['repo'], - default: { json: false, all: false } -}); +import { createCli } from '../src/shared/cli.js'; +import { getStatus } from '../src/integrations/core/status.js'; +import { validateIndexArtifacts } from '../src/index/validate.js'; +import { getMetricsDir, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'report-artifacts', + options: { + json: { type: 'boolean', default: false }, + all: { type: 'boolean', default: false }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); const userConfig = loadUserConfig(root); -const cacheRoot = (userConfig.cache && userConfig.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || getCacheRoot(); -const repoCacheRoot = getRepoCacheRoot(root, userConfig); -const dictConfig = getDictConfig(root, userConfig); -const dictDir = dictConfig.dir; -const sqlitePaths = resolveSqlitePaths(root, userConfig); -const sqliteTargets = [ - { label: 'code', path: sqlitePaths.codePath }, - { label: 'prose', path: sqlitePaths.prosePath } -]; +const metricsDir = getMetricsDir(root, userConfig); +const status = await getStatus({ repoRoot: root, includeAll: argv.all }); -/** - * Recursively compute the size of a file or directory. - * @param {string} targetPath - * @returns {Promise} - */ -async function sizeOfPath(targetPath) { +const readJson = (targetPath) => { + if (!fs.existsSync(targetPath)) return null; try { - const stat = await fsPromises.lstat(targetPath); - if (stat.isSymbolicLink()) return 0; - if (stat.isFile()) return stat.size; - if (!stat.isDirectory()) return 0; - - const entries = await fsPromises.readdir(targetPath); - let total = 0; - for (const entry of entries) { - total += await sizeOfPath(path.join(targetPath, entry)); - } - return total; + return JSON.parse(fs.readFileSync(targetPath, 'utf8')); } catch { - return 0; + return null; } +}; + +const indexMetrics = { + code: readJson(path.join(metricsDir, 'index-code.json')), + prose: readJson(path.join(metricsDir, 'index-prose.json')) +}; +const lmdbMetrics = { + code: readJson(path.join(metricsDir, 'lmdb-code.json')), + prose: readJson(path.join(metricsDir, 'lmdb-prose.json')) +}; + +const computeRate = (count, ms) => { + const total = Number(count); + const elapsed = Number(ms); + if (!Number.isFinite(total) || !Number.isFinite(elapsed) || elapsed <= 0) return null; + return total / (elapsed / 1000); +}; + +const buildThroughput = (mode, metrics, bytes) => { + if (!metrics) return null; + const totalMs = Number(metrics?.timings?.totalMs); + const writeMs = Number(metrics?.timings?.writeMs); + const files = Number(metrics?.files?.candidates); + const chunks = Number(metrics?.chunks?.total); + const tokens = Number(metrics?.tokens?.total); + const payload = { + mode, + totalMs: Number.isFinite(totalMs) ? 
totalMs : null, + writeMs: Number.isFinite(writeMs) ? writeMs : null, + files: Number.isFinite(files) ? files : null, + chunks: Number.isFinite(chunks) ? chunks : null, + tokens: Number.isFinite(tokens) ? tokens : null, + bytes: Number.isFinite(Number(bytes)) ? Number(bytes) : null + }; + payload.filesPerSec = computeRate(payload.files, payload.totalMs); + payload.chunksPerSec = computeRate(payload.chunks, payload.totalMs); + payload.tokensPerSec = computeRate(payload.tokens, payload.totalMs); + payload.bytesPerSec = computeRate(payload.bytes, payload.totalMs); + payload.writeBytesPerSec = computeRate(payload.bytes, payload.writeMs); + return payload; +}; + +const throughput = { + code: buildThroughput('code', indexMetrics.code, status.repo?.artifacts?.indexCode), + prose: buildThroughput('prose', indexMetrics.prose, status.repo?.artifacts?.indexProse), + lmdb: { + code: buildThroughput('lmdb code', lmdbMetrics.code, status.repo?.lmdb?.code?.bytes), + prose: buildThroughput('lmdb prose', lmdbMetrics.prose, status.repo?.lmdb?.prose?.bytes) + } +}; + +const corruption = await validateIndexArtifacts({ root, userConfig, modes: ['code', 'prose'] }); +status.throughput = throughput; +status.corruption = corruption; + +if (argv.json) { + console.log(JSON.stringify(status, null, 2)); + process.exit(0); } /** @@ -65,169 +106,84 @@ function formatBytes(bytes) { return `${rounded} ${units[unit]}`; } -/** - * Check if a path is contained within another path. - * @param {string} parent - * @param {string} child - * @returns {boolean} - */ -function isInside(parent, child) { - const rel = path.relative(parent, child); - return rel === '' || (!rel.startsWith('..') && !path.isAbsolute(rel)); -} - -const repoArtifacts = { - indexCode: path.join(repoCacheRoot, 'index-code'), - indexProse: path.join(repoCacheRoot, 'index-prose'), - repometrics: path.join(repoCacheRoot, 'repometrics'), - incremental: path.join(repoCacheRoot, 'incremental') -}; - -const repoCacheSize = await sizeOfPath(repoCacheRoot); -const repoArtifactSizes = {}; -for (const [name, artifactPath] of Object.entries(repoArtifacts)) { - repoArtifactSizes[name] = await sizeOfPath(artifactPath); -} +const repo = status.repo; +const overall = status.overall; +const code = repo.sqlite?.code; +const prose = repo.sqlite?.prose; +const lmdbCode = repo.lmdb?.code; +const lmdbProse = repo.lmdb?.prose; -const sqliteStats = {}; -let sqliteOutsideCacheSize = 0; -for (const target of sqliteTargets) { - const exists = fs.existsSync(target.path); - const size = exists ? await sizeOfPath(target.path) : 0; - sqliteStats[target.label] = exists ? 
{ path: target.path, bytes: size } : null; - if (exists && !isInside(path.resolve(cacheRoot), target.path)) { - sqliteOutsideCacheSize += size; - } -} -const cacheRootSize = await sizeOfPath(cacheRoot); -const dictSize = await sizeOfPath(dictDir); -const overallSize = cacheRootSize + sqliteOutsideCacheSize; - -const health = { issues: [], hints: [] }; -const indexIssues = []; -if (!fs.existsSync(repoArtifacts.indexCode)) { - indexIssues.push('index-code directory missing'); -} else { - if (!fs.existsSync(path.join(repoArtifacts.indexCode, 'chunk_meta.json'))) { - indexIssues.push('index-code chunk_meta.json missing'); - } - if (!fs.existsSync(path.join(repoArtifacts.indexCode, 'token_postings.json'))) { - indexIssues.push('index-code token_postings.json missing'); - } -} -if (!fs.existsSync(repoArtifacts.indexProse)) { - indexIssues.push('index-prose directory missing'); -} else { - if (!fs.existsSync(path.join(repoArtifacts.indexProse, 'chunk_meta.json'))) { - indexIssues.push('index-prose chunk_meta.json missing'); - } - if (!fs.existsSync(path.join(repoArtifacts.indexProse, 'token_postings.json'))) { - indexIssues.push('index-prose token_postings.json missing'); - } -} -if (indexIssues.length) { - health.issues.push(...indexIssues); - health.hints.push('Run `npm run build-index` to rebuild file-backed indexes.'); +console.log('Repo artifacts'); +console.log(`- cache root: ${formatBytes(repo.totalBytes)} (${repo.root})`); +console.log(`- index-code: ${formatBytes(repo.artifacts.indexCode)} (${path.join(repo.root, 'index-code')})`); +console.log(`- index-prose: ${formatBytes(repo.artifacts.indexProse)} (${path.join(repo.root, 'index-prose')})`); +console.log(`- repometrics: ${formatBytes(repo.artifacts.repometrics)} (${path.join(repo.root, 'repometrics')})`); +console.log(`- incremental: ${formatBytes(repo.artifacts.incremental)} (${path.join(repo.root, 'incremental')})`); +console.log(`- sqlite code db: ${code ? formatBytes(code.bytes) : 'missing'} (${code?.path || status.repo.sqlite?.code?.path || 'missing'})`); +console.log(`- sqlite prose db: ${prose ? formatBytes(prose.bytes) : 'missing'} (${prose?.path || status.repo.sqlite?.prose?.path || 'missing'})`); +console.log(`- lmdb code db: ${lmdbCode ? formatBytes(lmdbCode.bytes) : 'missing'} (${lmdbCode?.path || status.repo.lmdb?.code?.path || 'missing'})`); +console.log(`- lmdb prose db: ${lmdbProse ? 
formatBytes(lmdbProse.bytes) : 'missing'} (${lmdbProse?.path || status.repo.lmdb?.prose?.path || 'missing'})`); +if (repo.sqlite?.legacy) { + console.log(`- legacy sqlite db: ${repo.sqlite.legacy.path}`); } -const sqliteIssues = []; -if (userConfig.sqlite?.use === true) { - if (!fs.existsSync(sqlitePaths.codePath)) sqliteIssues.push('sqlite code db missing'); - if (!fs.existsSync(sqlitePaths.prosePath)) sqliteIssues.push('sqlite prose db missing'); +console.log('\nOverall'); +console.log(`- cache root: ${formatBytes(overall.cacheBytes)} (${overall.cacheRoot})`); +console.log(`- dictionaries: ${formatBytes(overall.dictionaryBytes)}`); +if (overall.sqliteOutsideCacheBytes) { + console.log(`- sqlite outside cache: ${formatBytes(overall.sqliteOutsideCacheBytes)}`); } -if (sqliteIssues.length) { - health.issues.push(...sqliteIssues); - health.hints.push('Run `npm run build-sqlite-index` to rebuild SQLite indexes.'); +if (overall.lmdbOutsideCacheBytes) { + console.log(`- lmdb outside cache: ${formatBytes(overall.lmdbOutsideCacheBytes)}`); } +console.log(`- total: ${formatBytes(overall.totalBytes)}`); -const repoRollups = []; -if (argv.all) { - const reposRoot = path.join(cacheRoot, 'repos'); - if (fs.existsSync(reposRoot)) { - const entries = await fsPromises.readdir(reposRoot, { withFileTypes: true }); - for (const entry of entries) { - if (!entry.isDirectory()) continue; - const repoPath = path.join(reposRoot, entry.name); - const bytes = await sizeOfPath(repoPath); - const stat = await fsPromises.stat(repoPath); - repoRollups.push({ - id: entry.name, - path: path.resolve(repoPath), - bytes, - mtime: stat.mtime ? stat.mtime.toISOString() : null - }); - } +if (status.health?.issues?.length) { + console.log('\nHealth'); + status.health.issues.forEach((issue) => console.log(`- issue: ${issue}`)); + status.health.hints.forEach((hint) => console.log(`- hint: ${hint}`)); +} + +if (status.throughput) { + const formatRate = (value, unit) => (Number.isFinite(value) ? `${value.toFixed(1)} ${unit}/s` : 'n/a'); + const formatMs = (value) => (Number.isFinite(value) ? `${value.toFixed(0)} ms` : 'n/a'); + console.log('\nThroughput'); + const entries = [ + ['code', status.throughput.code], + ['prose', status.throughput.prose], + ['lmdb code', status.throughput.lmdb?.code], + ['lmdb prose', status.throughput.lmdb?.prose] + ]; + for (const [mode, entry] of entries) { + if (!entry) continue; + console.log( + `- ${mode}: files ${formatRate(entry.filesPerSec, 'files')}, ` + + `chunks ${formatRate(entry.chunksPerSec, 'chunks')}, ` + + `tokens ${formatRate(entry.tokensPerSec, 'tokens')}, ` + + `bytes ${formatRate(entry.bytesPerSec, 'bytes')} (total ${formatMs(entry.totalMs)})` + ); } } -if (argv.json) { - const sqlitePayload = { - code: sqliteStats.code, - prose: sqliteStats.prose, - legacy: sqlitePaths.legacyExists ? 
{ path: sqlitePaths.legacyPath } : null - }; - const payload = { - repo: { - root: path.resolve(repoCacheRoot), - totalBytes: repoCacheSize, - artifacts: repoArtifactSizes, - sqlite: sqlitePayload - }, - health, - overall: { - cacheRoot: path.resolve(cacheRoot), - cacheBytes: cacheRootSize, - dictionaryBytes: dictSize, - sqliteOutsideCacheBytes: sqliteOutsideCacheSize, - totalBytes: overallSize - } - }; - if (argv.all) { - const totalRepoBytes = repoRollups.reduce((sum, repo) => sum + repo.bytes, 0); - payload.allRepos = { - root: path.resolve(path.join(cacheRoot, 'repos')), - repos: repoRollups, - totalBytes: totalRepoBytes - }; +if (status.corruption) { + const validation = status.corruption; + const statusLabel = validation.ok ? 'ok' : 'issues'; + console.log('\nIntegrity'); + console.log(`- index-validate: ${statusLabel}`); + if (!validation.ok && validation.issues?.length) { + validation.issues.forEach((issue) => console.log(`- issue: ${issue}`)); + } + if (validation.warnings?.length) { + validation.warnings.forEach((warning) => console.log(`- warning: ${warning}`)); } - console.log(JSON.stringify(payload, null, 2)); - process.exit(0); -} - -console.log('Repo artifacts'); -console.log(`- cache root: ${formatBytes(repoCacheSize)} (${path.resolve(repoCacheRoot)})`); -console.log(`- index-code: ${formatBytes(repoArtifactSizes.indexCode)} (${path.resolve(repoArtifacts.indexCode)})`); -console.log(`- index-prose: ${formatBytes(repoArtifactSizes.indexProse)} (${path.resolve(repoArtifacts.indexProse)})`); -console.log(`- repometrics: ${formatBytes(repoArtifactSizes.repometrics)} (${path.resolve(repoArtifacts.repometrics)})`); -console.log(`- incremental: ${formatBytes(repoArtifactSizes.incremental)} (${path.resolve(repoArtifacts.incremental)})`); -const code = sqliteStats.code; -const prose = sqliteStats.prose; -console.log(`- sqlite code db: ${code ? formatBytes(code.bytes) : 'missing'} (${code?.path || sqlitePaths.codePath})`); -console.log(`- sqlite prose db: ${prose ? 
formatBytes(prose.bytes) : 'missing'} (${prose?.path || sqlitePaths.prosePath})`); -if (sqlitePaths.legacyExists) { - console.log(`- legacy sqlite db: ${sqlitePaths.legacyPath}`); -} - -console.log('\nOverall'); -console.log(`- cache root: ${formatBytes(cacheRootSize)} (${path.resolve(cacheRoot)})`); -console.log(`- dictionaries: ${formatBytes(dictSize)} (${path.resolve(dictDir)})`); -if (sqliteOutsideCacheSize) { - console.log(`- sqlite outside cache: ${formatBytes(sqliteOutsideCacheSize)}`); -} -console.log(`- total: ${formatBytes(overallSize)}`); - -if (health.issues.length) { - console.log('\nHealth'); - health.issues.forEach((issue) => console.log(`- issue: ${issue}`)); - health.hints.forEach((hint) => console.log(`- hint: ${hint}`)); } -if (argv.all) { - const totalRepoBytes = repoRollups.reduce((sum, repo) => sum + repo.bytes, 0); +if (status.allRepos) { + const repos = status.allRepos.repos.slice().sort((a, b) => b.bytes - a.bytes); console.log('\nAll repos'); - console.log(`- root: ${path.resolve(path.join(cacheRoot, 'repos'))}`); - console.log(`- total: ${formatBytes(totalRepoBytes)}`); - for (const repo of repoRollups.sort((a, b) => b.bytes - a.bytes)) { - console.log(`- ${repo.id}: ${formatBytes(repo.bytes)} (${repo.path})`); + console.log(`- root: ${status.allRepos.root}`); + console.log(`- total: ${formatBytes(status.allRepos.totalBytes)}`); + for (const repoEntry of repos) { + console.log(`- ${repoEntry.id}: ${formatBytes(repoEntry.bytes)} (${repoEntry.path})`); } } diff --git a/tools/report-code-map.js b/tools/report-code-map.js new file mode 100644 index 000000000..1b03235e3 --- /dev/null +++ b/tools/report-code-map.js @@ -0,0 +1,244 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { pathToFileURL } from 'node:url'; +import { createCli } from '../src/shared/cli.js'; +import { buildCodeMap, buildNodeList, buildMapCacheKey } from '../src/map/build-map.js'; +import { renderDot } from '../src/map/dot-writer.js'; +import { renderSvgHtml } from '../src/map/html-writer.js'; +import { renderIsometricHtml } from '../src/map/isometric-viewer.js'; +import { loadUserConfig, resolveRepoRoot, getIndexDir, getCurrentBuildInfo, getRepoId } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'report map', + options: { + repo: { type: 'string', describe: 'Repo root.' }, + mode: { type: 'string', default: 'code' }, + 'index-root': { type: 'string' }, + scope: { type: 'string', default: 'repo' }, + focus: { type: 'string' }, + include: { type: 'string' }, + 'only-exported': { type: 'boolean', default: false }, + collapse: { type: 'string', default: 'none' }, + 'max-files': { type: 'number' }, + 'max-members-per-file': { type: 'number' }, + 'max-edges': { type: 'number' }, + 'top-k-by-degree': { type: 'boolean', default: false }, + format: { type: 'string', default: 'json' }, + out: { type: 'string' }, + 'model-out': { type: 'string' }, + 'node-list-out': { type: 'string' }, + json: { type: 'boolean', default: false }, + pretty: { type: 'boolean', default: false }, + 'open-uri-template': { type: 'string' }, + 'three-url': { type: 'string' }, + 'wasd-sensitivity': { type: 'number' }, + 'wasd-acceleration': { type: 'number' }, + 'wasd-max-speed': { type: 'number' }, + 'wasd-drag': { type: 'number' }, + 'zoom-sensitivity': { type: 'number' }, + 'cache-dir': { type: 'string' }, + refresh: { type: 'boolean', default: false } + } +}).parse(); + +const rootArg = argv.repo ? 
path.resolve(argv.repo) : null;
+const repoRoot = rootArg || resolveRepoRoot(process.cwd());
+const userConfig = loadUserConfig(repoRoot);
+const mode = String(argv.mode || 'code').toLowerCase();
+const indexDir = getIndexDir(repoRoot, mode, userConfig, {
+  indexRoot: argv['index-root'] ? path.resolve(argv['index-root']) : null
+});
+
+const scope = String(argv.scope || 'repo').toLowerCase();
+const focus = argv.focus ? String(argv.focus) : '';
+const formatRaw = String(argv.format || 'json').toLowerCase();
+const format = formatRaw === 'iso' ? 'html-iso' : formatRaw;
+
+const viewerControls = {
+  wasd: {
+    ...(Number.isFinite(argv['wasd-sensitivity']) ? { sensitivity: Number(argv['wasd-sensitivity']) } : {}),
+    ...(Number.isFinite(argv['wasd-acceleration']) ? { acceleration: Number(argv['wasd-acceleration']) } : {}),
+    ...(Number.isFinite(argv['wasd-max-speed']) ? { maxSpeed: Number(argv['wasd-max-speed']) } : {}),
+    ...(Number.isFinite(argv['wasd-drag']) ? { drag: Number(argv['wasd-drag']) } : {})
+  },
+  ...(Number.isFinite(argv['zoom-sensitivity']) ? { zoomSensitivity: Number(argv['zoom-sensitivity']) } : {})
+};
+
+const buildOptions = {
+  mode,
+  scope,
+  focus,
+  include: argv.include,
+  onlyExported: argv['only-exported'] === true,
+  collapse: argv.collapse,
+  maxFiles: argv['max-files'],
+  maxMembersPerFile: argv['max-members-per-file'],
+  maxEdges: argv['max-edges'],
+  topKByDegree: argv['top-k-by-degree'] === true,
+  viewer: {
+    controls: viewerControls,
+    openUriTemplate: argv['open-uri-template'] || null
+  }
+};
+
+const buildInfo = getCurrentBuildInfo(repoRoot, userConfig, { mode });
+const cacheKey = buildMapCacheKey({ buildId: buildInfo?.buildId || null, options: buildOptions });
+const cacheDir = argv['cache-dir']
+  ? path.resolve(argv['cache-dir'])
+  : path.join(repoRoot, '.pairofcleats', 'maps', 'cache');
+const cachePath = path.join(cacheDir, `${cacheKey}.json`);
+
+const ensureDir = (targetPath) => {
+  if (!targetPath) return;
+  const dir = path.dirname(targetPath);
+  fs.mkdirSync(dir, { recursive: true });
+};
+
+let mapModel = null;
+const warnings = [];
+
+if (!argv.refresh && fs.existsSync(cachePath)) {
+  try {
+    mapModel = JSON.parse(fs.readFileSync(cachePath, 'utf8'));
+  } catch (err) {
+    warnings.push(`cache read failed: ${err?.message || err}`);
+  }
+}
+
+if (!mapModel) {
+  mapModel = buildCodeMap({ repoRoot, indexDir, options: buildOptions });
+  if (mapModel.root) mapModel.root.id = getRepoId(repoRoot); // root is re-normalized below
+  try {
+    ensureDir(cachePath);
+    fs.writeFileSync(cachePath, JSON.stringify(mapModel, null, 2));
+  } catch (err) {
+    warnings.push(`cache write failed: ${err?.message || err}`);
+  }
+}
+
+if (mapModel) {
+  mapModel.root = mapModel.root || { path: repoRoot, id: null };
+  mapModel.root.path = repoRoot;
+  mapModel.root.id = mapModel.root.id || getRepoId(repoRoot);
+  warnings.push(...(mapModel.warnings || []));
+}
+
+const modelOut = argv['model-out'] ? path.resolve(argv['model-out']) : null;
+if (modelOut) {
+  try {
+    ensureDir(modelOut);
+    fs.writeFileSync(modelOut, JSON.stringify(mapModel, null, 2));
+  } catch (err) {
+    warnings.push(`model output failed: ${err?.message || err}`);
+  }
+}
+
+const nodeListOut = argv['node-list-out'] ?
path.resolve(argv['node-list-out']) : null; +if (nodeListOut) { + try { + ensureDir(nodeListOut); + const list = buildNodeList(mapModel); + fs.writeFileSync(nodeListOut, JSON.stringify(list, null, 2)); + } catch (err) { + warnings.push(`node list output failed: ${err?.message || err}`); + } +} + +const resolveThreeUrl = (targetPath) => { + if (argv['three-url']) return argv['three-url']; + const modulePath = path.join(repoRoot, 'node_modules', 'three', 'build', 'three.module.js'); + if (!fs.existsSync(modulePath)) return ''; + if (targetPath) { + const rel = path.relative(path.dirname(targetPath), modulePath).replace(/\\/g, '/'); + return rel.startsWith('.') ? rel : `./${rel}`; + } + return pathToFileURL(modulePath).href; +}; + +const formatOutputPath = (targetPath, fallbackExt) => { + if (!targetPath) return null; + if (!fallbackExt) return targetPath; + const currentExt = path.extname(targetPath); + if (currentExt.toLowerCase() === fallbackExt) return targetPath; + return `${targetPath.slice(0, targetPath.length - currentExt.length)}${fallbackExt}`; +}; + +const renderSvg = (dot) => { + const result = spawnSync('dot', ['-Tsvg'], { + input: dot, + encoding: 'utf8' + }); + if (result.status !== 0) { + const message = result.stderr || result.stdout || 'Graphviz dot failed.'; + warnings.push(message.trim()); + return null; + } + return result.stdout; +}; + +let output = null; +let outputPath = argv.out ? path.resolve(argv.out) : null; +let resolvedFormat = format; + +if (format === 'json') { + output = JSON.stringify(mapModel, null, argv.pretty ? 2 : 0); +} else if (format === 'dot') { + output = renderDot(mapModel); +} else if (format === 'svg' || format === 'html') { + const dot = renderDot(mapModel); + const svg = renderSvg(dot); + if (!svg) { + resolvedFormat = 'dot'; + output = dot; + outputPath = formatOutputPath(outputPath, '.dot'); + } else if (format === 'svg') { + output = svg; + } else { + output = renderSvgHtml({ svg, mapModel, title: 'Code Map' }); + } +} else if (format === 'html-iso') { + const threeUrl = resolveThreeUrl(outputPath); + if (!threeUrl) warnings.push('three.js module missing; install three or set --three-url'); + output = renderIsometricHtml({ + mapModel, + threeUrl, + openUriTemplate: argv['open-uri-template'] || mapModel.viewer?.openUriTemplate, + viewerConfig: mapModel.viewer || {} + }); +} else { + output = JSON.stringify(mapModel, null, argv.pretty ? 2 : 0); + resolvedFormat = 'json'; +} + +if (outputPath) { + try { + ensureDir(outputPath); + fs.writeFileSync(outputPath, output); + } catch (err) { + warnings.push(`output write failed: ${err?.message || err}`); + } +} + +const report = { + ok: true, + format: resolvedFormat, + outPath: outputPath, + modelPath: modelOut || null, + nodeListPath: nodeListOut || null, + cacheKey, + summary: mapModel.summary || null, + warnings: Array.from(new Set(warnings.filter(Boolean))) +}; + +if (argv.json) { + console.log(JSON.stringify(report, null, argv.pretty ? 
2 : 0)); + process.exit(0); +} + +if (!outputPath) { + process.stdout.write(output); +} else if (!argv.json) { + console.log(`Wrote ${resolvedFormat} map to ${outputPath}`); +} diff --git a/tools/reset-config.js b/tools/reset-config.js new file mode 100644 index 000000000..a08a0c1f5 --- /dev/null +++ b/tools/reset-config.js @@ -0,0 +1,68 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; +import { resolveRepoRoot } from './dict-utils.js'; +import { DEFAULT_USER_CONFIG_TEMPLATE } from './default-config-template.js'; + +const argv = createCli({ + scriptName: 'reset-config', + options: { + repo: { type: 'string' }, + config: { type: 'string' }, + force: { type: 'boolean', default: false }, + backup: { type: 'boolean', default: true }, + json: { type: 'boolean', default: false } + } +}).parse(); + +const isTruthy = (value) => { + if (value == null) return false; + const normalized = String(value).trim().toLowerCase(); + return ['1', 'true', 'yes', 'on'].includes(normalized); +}; + +const forceRequested = argv.force + || isTruthy(process.env.PAIROFCLEATS_RESET_FORCE) + || isTruthy(process.env.npm_config_force); + +const repoRoot = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); +const configPath = argv.config + ? path.resolve(argv.config) + : path.join(repoRoot, '.pairofcleats.json'); +const existing = fs.existsSync(configPath); +const result = { + ok: true, + configPath, + backupPath: null, + reset: false +}; + +if (existing && !forceRequested) { + result.ok = false; + if (argv.json) { + console.log(JSON.stringify(result, null, 2)); + } else { + console.error(`[reset-config] Refusing to overwrite ${configPath} without --force.`); + } + process.exit(1); +} + +if (existing && argv.backup) { + const backupPath = `${configPath}.bak`; + fs.copyFileSync(configPath, backupPath); + result.backupPath = backupPath; +} + +const template = DEFAULT_USER_CONFIG_TEMPLATE.trimEnd(); +fs.writeFileSync(configPath, `${template}\n`, 'utf8'); +result.reset = true; + +if (argv.json) { + console.log(JSON.stringify(result, null, 2)); +} else { + console.log(`[reset-config] Wrote default config to ${configPath}`); + if (result.backupPath) { + console.log(`[reset-config] Backup saved to ${result.backupPath}`); + } +} diff --git a/tools/run-phase22-gates.js b/tools/run-phase22-gates.js new file mode 100644 index 000000000..848980df3 --- /dev/null +++ b/tools/run-phase22-gates.js @@ -0,0 +1,20 @@ +#!/usr/bin/env node +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const root = process.cwd(); +const tests = [ + { label: 'type-inference-lsp-enrichment', file: path.join(root, 'tests', 'type-inference-lsp-enrichment.js') }, + { label: 'embeddings-dims-mismatch', file: path.join(root, 'tests', 'embeddings-dims-mismatch.js') }, + { label: 'embeddings-cache-identity', file: path.join(root, 'tests', 'embeddings-cache-identity.js') } +]; + +for (const test of tests) { + const result = spawnSync(process.execPath, [test.file], { stdio: 'inherit' }); + if (result.status !== 0) { + console.error(`phase22 gate failed: ${test.label}`); + process.exit(result.status ?? 
1);
+  }
+}
+
+console.log('phase22 gate tests passed');
diff --git a/tools/scip-ingest.js b/tools/scip-ingest.js
new file mode 100644
index 000000000..8e246e552
--- /dev/null
+++ b/tools/scip-ingest.js
@@ -0,0 +1,242 @@
+#!/usr/bin/env node
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import readline from 'node:readline';
+import { spawn } from 'node:child_process';
+import { createCli } from '../src/shared/cli.js';
+import { getRepoCacheRoot, loadUserConfig, resolveRepoRoot } from './dict-utils.js';
+
+const argv = createCli({
+  scriptName: 'scip-ingest',
+  options: {
+    repo: { type: 'string' },
+    input: { type: 'string' },
+    out: { type: 'string' },
+    json: { type: 'boolean', default: false },
+    run: { type: 'boolean', default: false },
+    scip: { type: 'string', default: 'scip' },
+    args: { type: 'string' }
+  }
+}).parse();
+
+const repoRoot = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd());
+const userConfig = loadUserConfig(repoRoot);
+const cacheRoot = getRepoCacheRoot(repoRoot, userConfig);
+const outputPath = argv.out
+  ? path.resolve(argv.out)
+  : path.join(cacheRoot, 'scip', 'scip.jsonl');
+const metaPath = `${outputPath}.meta.json`;
+const inputPath = argv.input ? String(argv.input) : null;
+const runScip = argv.run === true;
+const scipCmd = argv.scip || 'scip';
+
+const toPosix = (value) => value.replace(/\\/g, '/');
+const normalizePath = (value) => {
+  if (!value) return null;
+  const raw = String(value);
+  const resolved = path.isAbsolute(raw) ? raw : path.resolve(repoRoot, raw);
+  const rel = path.relative(repoRoot, resolved);
+  return toPosix(rel || raw);
+};
+
+const stats = {
+  documents: 0,
+  occurrences: 0,
+  definitions: 0,
+  references: 0,
+  errors: 0,
+  kinds: {},
+  languages: {}
+};
+
+const bump = (bucket, key) => {
+  if (!key) return;
+  const k = String(key);
+  bucket[k] = (bucket[k] || 0) + 1;
+};
+
+const ensureOutputDir = async () => {
+  await fsPromises.mkdir(path.dirname(outputPath), { recursive: true });
+};
+
+// Ensure the destination directory exists before the stream is created:
+// createWriteStream opens its fd asynchronously and would race ensureOutputDir().
+fs.mkdirSync(path.dirname(outputPath), { recursive: true });
+const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
+
+const roleInfo = (roles) => {
+  const value = Number(roles) || 0;
+  const isDefinition = (value & 1) === 1;
+  const isReference = (value & 2) === 2;
+  return { isDefinition, isReference };
+};
+
+const normalizeRange = (range) => {
+  if (!Array.isArray(range) || !range.length) return null;
+  const startLine = Number.isFinite(Number(range[0])) ? Number(range[0]) : 0;
+  const startChar = Number.isFinite(Number(range[1])) ? Number(range[1]) : 0;
+  let endLine = startLine;
+  let endChar = startChar;
+  if (range.length === 3) {
+    endChar = Number.isFinite(Number(range[2])) ? Number(range[2]) : endChar;
+  } else if (range.length >= 4) {
+    endLine = Number.isFinite(Number(range[2])) ? Number(range[2]) : endLine;
+    endChar = Number.isFinite(Number(range[3])) ?
Number(range[3]) : endChar; + } + return { + startLine: startLine + 1, + startChar, + endLine: endLine + 1, + endChar + }; +}; + +const extractSymbolInfo = (doc) => { + const entries = doc?.symbols || doc?.symbolInformation || doc?.symbolInformations || []; + if (!Array.isArray(entries) || !entries.length) return new Map(); + const map = new Map(); + for (const entry of entries) { + if (!entry || !entry.symbol) continue; + map.set(entry.symbol, entry); + } + return map; +}; + +const writeOccurrence = (doc, occurrence, symbolInfo) => { + if (!occurrence || !occurrence.symbol) return; + const file = normalizePath(doc.relativePath || doc.path || doc.file || ''); + if (!file) return; + const range = normalizeRange(occurrence.range || occurrence.enclosingRange); + const info = symbolInfo.get(occurrence.symbol) || {}; + const role = roleInfo(occurrence.symbolRoles); + const entry = { + file, + ext: path.extname(file).toLowerCase(), + name: info.displayName || info.symbol || occurrence.symbol, + symbol: occurrence.symbol, + kind: info.kind || info.symbolKind || null, + signature: info.signature || info.signatureDocumentation || null, + startLine: range ? range.startLine : null, + endLine: range ? range.endLine : null, + startChar: range ? range.startChar : null, + endChar: range ? range.endChar : null, + role: role.isDefinition ? 'definition' : (role.isReference ? 'reference' : 'other'), + language: info.language || doc.language || null, + scope: info.scope || null, + scopeKind: info.scopeKind || null + }; + stats.occurrences += 1; + if (role.isDefinition) stats.definitions += 1; + if (role.isReference) stats.references += 1; + bump(stats.kinds, entry.kind || 'unknown'); + bump(stats.languages, entry.language || 'unknown'); + writeStream.write(`${JSON.stringify(entry)}\n`); +}; + +const handleDocument = (doc) => { + if (!doc || typeof doc !== 'object') return; + const file = doc.relativePath || doc.path || doc.file || null; + if (!file) return; + stats.documents += 1; + const symbolInfo = extractSymbolInfo(doc); + const occurrences = Array.isArray(doc.occurrences) ? 
doc.occurrences : []; + for (const occ of occurrences) { + writeOccurrence(doc, occ, symbolInfo); + } +}; + +const handlePayload = (payload) => { + if (!payload) return; + if (Array.isArray(payload)) { + payload.forEach(handlePayload); + return; + } + if (Array.isArray(payload.documents)) { + payload.documents.forEach(handleDocument); + return; + } + if (payload.relativePath || payload.path || payload.file) { + handleDocument(payload); + } +}; + +const ingestJsonLines = async (stream) => { + const rl = readline.createInterface({ input: stream, crlfDelay: Infinity }); + for await (const line of rl) { + const trimmed = line.trim(); + if (!trimmed) continue; + let parsed = null; + try { + parsed = JSON.parse(trimmed); + } catch { + stats.errors += 1; + continue; + } + handlePayload(parsed); + } +}; + +const ingestJsonFile = async (filePath) => { + try { + const raw = await fsPromises.readFile(filePath, 'utf8'); + const parsed = JSON.parse(raw); + handlePayload(parsed); + return true; + } catch { + return false; + } +}; + +const runScipCommand = async () => { + const args = ['print', '--format=json']; + if (inputPath) args.push('--input', inputPath); + if (argv.args) { + const extra = String(argv.args) + .split(/\s+/) + .map((entry) => entry.trim()) + .filter(Boolean); + args.push(...extra); + } + const child = spawn(scipCmd, args, { stdio: ['ignore', 'pipe', 'pipe'] }); + child.stderr.on('data', (chunk) => process.stderr.write(chunk)); + await ingestJsonLines(child.stdout); + const exitCode = await new Promise((resolve) => { + child.on('close', (code) => resolve(code ?? 0)); + }); + if (exitCode !== 0) { + throw new Error(`scip exited with code ${exitCode}`); + } +}; + +await ensureOutputDir(); +if (runScip) { + await runScipCommand(); +} else if (inputPath && inputPath !== '-') { + const parsed = await ingestJsonFile(inputPath); + if (!parsed) { + const inputStream = fs.createReadStream(inputPath, { encoding: 'utf8' }); + await ingestJsonLines(inputStream); + } +} else { + await ingestJsonLines(process.stdin); +} + +writeStream.end(); + +const summary = { + generatedAt: new Date().toISOString(), + repoRoot: path.resolve(repoRoot), + input: inputPath || (runScip ? 'scip' : 'stdin'), + output: path.resolve(outputPath), + stats +}; +await fsPromises.writeFile(metaPath, JSON.stringify(summary, null, 2)); + +if (argv.json) { + console.log(JSON.stringify(summary, null, 2)); +} else { + console.log(`SCIP ingest: ${stats.occurrences} occurrences (${stats.errors} parse errors)`); + console.log(`- output: ${outputPath}`); + console.log(`- meta: ${metaPath}`); +} diff --git a/tools/search-sqlite.js b/tools/search-sqlite.js deleted file mode 100644 index 645b7fa34..000000000 --- a/tools/search-sqlite.js +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env node -import { spawnSync } from 'node:child_process'; -import path from 'node:path'; -import { fileURLToPath } from 'node:url'; - -const args = process.argv.slice(2); -const hasBackend = args.includes('--backend'); -const scriptRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); -const searchPath = path.join(scriptRoot, 'search.js'); -const forwarded = hasBackend ? args : ['--backend', 'sqlite-fts', ...args]; - -const result = spawnSync(process.execPath, [searchPath, ...forwarded], { - stdio: 'inherit', - env: process.env -}); - -process.exit(result.status ?? 
1); diff --git a/tools/service/config.js b/tools/service/config.js new file mode 100644 index 000000000..8bb698583 --- /dev/null +++ b/tools/service/config.js @@ -0,0 +1,48 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { getCacheRoot } from '../dict-utils.js'; + +export function getServiceConfigPath(inputPath = null) { + if (inputPath) return path.resolve(inputPath); + return path.join(getCacheRoot(), 'service', 'config.json'); +} + +export function loadServiceConfig(configPath) { + if (!configPath || !fs.existsSync(configPath)) { + return { + repos: [], + queue: { + maxQueued: 20 + }, + worker: { + concurrency: 1 + }, + embeddings: { + queue: { + maxQueued: 10 + }, + worker: { + concurrency: 1, + maxMemoryMb: 4096 + } + }, + sync: { + policy: 'pull', + intervalMs: 5 * 60 * 1000 + } + }; + } + const raw = JSON.parse(fs.readFileSync(configPath, 'utf8')); + return raw && typeof raw === 'object' ? raw : {}; +} + +export function resolveRepoRegistry(config, configPath) { + if (Array.isArray(config?.repos)) return config.repos; + const repoFile = config?.reposFile; + if (!repoFile) return []; + const baseDir = configPath ? path.dirname(configPath) : process.cwd(); + const resolved = path.isAbsolute(repoFile) ? repoFile : path.join(baseDir, repoFile); + if (!fs.existsSync(resolved)) return []; + const payload = JSON.parse(fs.readFileSync(resolved, 'utf8')); + return Array.isArray(payload?.repos) ? payload.repos : []; +} diff --git a/tools/service/logger.js b/tools/service/logger.js new file mode 100644 index 000000000..29700b6df --- /dev/null +++ b/tools/service/logger.js @@ -0,0 +1,42 @@ +import { getEnvConfig } from '../../src/shared/env.js'; +import { configureLogger, log, logError, logLine, updateLogContext } from '../../src/shared/progress.js'; +import { loadUserConfig } from '../dict-utils.js'; + +const normalizeLevel = (value) => { + if (typeof value === 'string' && value.trim()) return value.trim().toLowerCase(); + return 'info'; +}; + +const normalizeFormat = (value) => { + if (value === 'json' || value === 'pretty') return value; + return 'text'; +}; + +export function configureServiceLogger({ repoRoot, service, context = {} }) { + const envConfig = getEnvConfig(); + const userConfig = repoRoot ? loadUserConfig(repoRoot) : {}; + const loggingConfig = userConfig?.logging || {}; + const logFormat = normalizeFormat(envConfig.logFormat || loggingConfig.format); + const logLevel = normalizeLevel(envConfig.logLevel || loggingConfig.level); + const ringMax = Number.isFinite(Number(loggingConfig.ringMax)) + ? Math.max(1, Math.floor(Number(loggingConfig.ringMax))) + : 200; + const ringMaxBytes = Number.isFinite(Number(loggingConfig.ringMaxBytes)) + ? 
Math.max(1024, Math.floor(Number(loggingConfig.ringMaxBytes))) + : 2 * 1024 * 1024; + configureLogger({ + enabled: logFormat !== 'text', + pretty: logFormat === 'pretty', + level: logLevel, + ringMax, + ringMaxBytes, + redact: loggingConfig.redact, + context: { + service: service || 'service', + repoRoot: repoRoot || null, + ...context + } + }); + updateLogContext({ service: service || 'service' }); + return { log, logLine, logError }; +} diff --git a/tools/service/queue.js b/tools/service/queue.js new file mode 100644 index 000000000..36f7d4ad2 --- /dev/null +++ b/tools/service/queue.js @@ -0,0 +1,262 @@ +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; + +const readJson = async (filePath, fallback) => { + try { + const raw = await fs.readFile(filePath, 'utf8'); + return JSON.parse(raw); + } catch { + return fallback; + } +}; + +const withLock = async (lockPath, worker) => { + const start = Date.now(); + while (true) { + try { + const handle = await fs.open(lockPath, 'wx'); + try { + return await worker(); + } finally { + await handle.close(); + await fs.rm(lockPath, { force: true }); + } + } catch (err) { + if (err?.code !== 'EEXIST') throw err; + if (Date.now() - start > 5000) throw new Error('Queue lock timeout.'); + await new Promise((resolve) => setTimeout(resolve, 100)); + } + } +}; + +export async function ensureQueueDir(dirPath) { + await fs.mkdir(dirPath, { recursive: true }); +} + +const ensureJobDirs = async (dirPath) => { + const logsDir = path.join(dirPath, 'logs'); + const reportsDir = path.join(dirPath, 'reports'); + await fs.mkdir(logsDir, { recursive: true }); + await fs.mkdir(reportsDir, { recursive: true }); + return { logsDir, reportsDir }; +}; + +const normalizeQueueName = (value) => { + const raw = typeof value === 'string' ? value.trim().toLowerCase() : ''; + if (!raw || raw === 'index') return null; + return raw.replace(/[^a-z0-9_-]+/g, '-'); +}; + +export function resolveQueueName(queueName, job = null) { + const normalized = normalizeQueueName(queueName); + if (normalized && normalized !== 'auto') return normalized; + if (normalized === 'auto') { + const base = job?.reason === 'embeddings' ? 'embeddings' : 'index'; + const parts = []; + if (job?.stage) parts.push(String(job.stage).toLowerCase()); + if (job?.mode && job.mode !== 'both') parts.push(String(job.mode).toLowerCase()); + return parts.length ? `${base}-${parts.join('-')}` : base; + } + return normalized; +} + +export function getQueuePaths(dirPath, queueName = null) { + const normalized = normalizeQueueName(queueName); + const suffix = normalized ? `-${normalized}` : ''; + return { + queuePath: path.join(dirPath, `queue${suffix}.json`), + lockPath: path.join(dirPath, `queue${suffix}.lock`) + }; +} + +export async function loadQueue(dirPath, queueName = null) { + const { queuePath } = getQueuePaths(dirPath, queueName); + const payload = await readJson(queuePath, { jobs: [] }); + return { + jobs: Array.isArray(payload.jobs) ? 
payload.jobs : [] + }; +} + +export async function saveQueue(dirPath, queue, queueName = null) { + const { queuePath } = getQueuePaths(dirPath, queueName); + await fs.writeFile(queuePath, JSON.stringify(queue, null, 2)); +} + +export async function enqueueJob(dirPath, job, maxQueued = null, queueName = null) { + await ensureQueueDir(dirPath); + const { logsDir, reportsDir } = await ensureJobDirs(dirPath); + const resolvedQueueName = resolveQueueName(queueName, job); + const { lockPath } = getQueuePaths(dirPath, resolvedQueueName); + return withLock(lockPath, async () => { + const queue = await loadQueue(dirPath, resolvedQueueName); + const queued = queue.jobs.filter((entry) => entry.status === 'queued'); + if (Number.isFinite(maxQueued) && queued.length >= maxQueued) { + return { ok: false, message: 'Queue is full.' }; + } + const maxRetries = Number.isFinite(Number(job.maxRetries)) && Number(job.maxRetries) >= 0 + ? Math.floor(Number(job.maxRetries)) + : null; + const next = { + id: job.id, + createdAt: job.createdAt, + status: 'queued', + repo: job.repo, + mode: job.mode, + reason: job.reason || null, + stage: job.stage || null, + args: Array.isArray(job.args) && job.args.length ? job.args : null, + attempts: 0, + maxRetries, + nextEligibleAt: null, + lastHeartbeatAt: null, + logPath: path.join(logsDir, `${job.id}.log`), + reportPath: path.join(reportsDir, `${job.id}.json`) + }; + queue.jobs.push(next); + await saveQueue(dirPath, queue, resolvedQueueName); + return { ok: true, job: next }; + }); +} + +export async function claimNextJob(dirPath, queueName = null) { + const { lockPath } = getQueuePaths(dirPath, queueName); + return withLock(lockPath, async () => { + const { logsDir, reportsDir } = await ensureJobDirs(dirPath); + const queue = await loadQueue(dirPath, queueName); + const now = Date.now(); + const job = queue.jobs.find((entry) => { + if (entry.status !== 'queued') return false; + if (!entry.nextEligibleAt) return true; + const eligibleAt = Date.parse(entry.nextEligibleAt); + return Number.isNaN(eligibleAt) || eligibleAt <= now; + }); + if (!job) return null; + if (!job.logPath) job.logPath = path.join(logsDir, `${job.id}.log`); + if (!job.reportPath) job.reportPath = path.join(reportsDir, `${job.id}.json`); + job.status = 'running'; + job.startedAt = new Date().toISOString(); + job.lastHeartbeatAt = job.startedAt; + await saveQueue(dirPath, queue, queueName); + return job; + }); +} + +export async function completeJob(dirPath, jobId, status, result, queueName = null) { + const { lockPath } = getQueuePaths(dirPath, queueName); + return withLock(lockPath, async () => { + const { reportsDir } = await ensureJobDirs(dirPath); + const queue = await loadQueue(dirPath, queueName); + const job = queue.jobs.find((entry) => entry.id === jobId); + if (!job) return null; + job.status = status; + job.finishedAt = new Date().toISOString(); + job.result = result || null; + if (Number.isFinite(result?.attempts)) { + job.attempts = Math.max(0, Math.floor(result.attempts)); + } + if (result?.error) { + job.lastError = result.error; + } + job.lastHeartbeatAt = null; + await saveQueue(dirPath, queue, queueName); + const reportPath = job.reportPath || path.join(reportsDir, `${job.id}.json`); + try { + await fs.writeFile(reportPath, JSON.stringify({ + updatedAt: new Date().toISOString(), + status: job.status, + job + }, null, 2)); + } catch {} + return job; + }); +} + +export async function touchJobHeartbeat(dirPath, jobId, queueName = null) { + const { lockPath } = getQueuePaths(dirPath, 
queueName); + return withLock(lockPath, async () => { + const queue = await loadQueue(dirPath, queueName); + const job = queue.jobs.find((entry) => entry.id === jobId); + if (!job) return null; + if (job.status !== 'running') return job; + job.lastHeartbeatAt = new Date().toISOString(); + await saveQueue(dirPath, queue, queueName); + return job; + }); +} + +const resolveStaleThresholdMs = (job, queueName) => { + const stage = typeof job?.stage === 'string' ? job.stage.toLowerCase() : ''; + if (queueName === 'embeddings' || job?.reason === 'embeddings' || stage === 'stage3') { + return 15 * 60 * 1000; + } + if (stage === 'stage2') return 10 * 60 * 1000; + return null; +}; + +const resolveRetryDelayMs = (attempts) => { + if (attempts <= 0) return 0; + if (attempts === 1) return 2 * 60 * 1000; + return 10 * 60 * 1000; +}; + +export async function requeueStaleJobs(dirPath, queueName = null, options = {}) { + const { lockPath } = getQueuePaths(dirPath, queueName); + return withLock(lockPath, async () => { + const queue = await loadQueue(dirPath, queueName); + const now = Date.now(); + const stale = []; + for (const job of queue.jobs) { + if (job.status !== 'running') continue; + const threshold = resolveStaleThresholdMs(job, queueName); + if (!threshold) continue; + const heartbeatAt = Date.parse(job.lastHeartbeatAt || job.startedAt || ''); + if (Number.isNaN(heartbeatAt)) continue; + if (now - heartbeatAt <= threshold) continue; + stale.push(job); + } + if (!stale.length) return { stale: 0, retried: 0, failed: 0 }; + let retried = 0; + let failed = 0; + for (const job of stale) { + const attempts = Number.isFinite(job.attempts) ? job.attempts : 0; + const maxRetries = Number.isFinite(job.maxRetries) + ? job.maxRetries + : (Number.isFinite(options.maxRetries) ? 
options.maxRetries : 2); + const nextAttempts = attempts + 1; + if (nextAttempts <= maxRetries) { + retried += 1; + job.status = 'queued'; + job.attempts = nextAttempts; + job.lastError = 'stale job heartbeat'; + const delayMs = resolveRetryDelayMs(nextAttempts); + job.nextEligibleAt = new Date(now + delayMs).toISOString(); + } else { + failed += 1; + job.status = 'failed'; + job.finishedAt = new Date().toISOString(); + job.result = { error: 'stale job heartbeat', attempts: nextAttempts }; + } + job.lastHeartbeatAt = null; + } + await saveQueue(dirPath, queue, queueName); + return { stale: stale.length, retried, failed }; + }); +} + +export async function queueSummary(dirPath, queueName = null) { + const { queuePath } = getQueuePaths(dirPath, queueName); + if (!fsSync.existsSync(queuePath)) { + return { total: 0, queued: 0, running: 0, done: 0, failed: 0, retries: 0 }; + } + const queue = await loadQueue(dirPath, queueName); + const summary = { total: queue.jobs.length, queued: 0, running: 0, done: 0, failed: 0, retries: 0 }; + for (const job of queue.jobs) { + if (job.status === 'queued') summary.queued += 1; + else if (job.status === 'running') summary.running += 1; + else if (job.status === 'done') summary.done += 1; + else if (job.status === 'failed') summary.failed += 1; + if (Number.isFinite(job.attempts) && job.attempts > 0) summary.retries += 1; + } + return summary; +} diff --git a/tools/service/repos.js b/tools/service/repos.js new file mode 100644 index 000000000..229e3544d --- /dev/null +++ b/tools/service/repos.js @@ -0,0 +1,41 @@ +import fs from 'node:fs/promises'; +import fsSync from 'node:fs'; +import path from 'node:path'; +import { spawnSync } from 'node:child_process'; + +const runGit = (args, cwd) => spawnSync('git', args, { cwd, encoding: 'utf8' }); + +export function resolveRepoPath(entry, baseDir) { + if (!entry?.path) return null; + return path.isAbsolute(entry.path) ? entry.path : path.join(baseDir, entry.path); +} + +export async function ensureRepo(entry, baseDir, defaultPolicy = 'pull') { + const repoPath = resolveRepoPath(entry, baseDir); + if (!repoPath) return { ok: false, message: 'Missing repo path.' }; + const branch = entry.branch || 'main'; + const policy = entry.syncPolicy || defaultPolicy; + const depth = Number.isFinite(Number(entry.cloneDepth)) ? Math.max(0, Number(entry.cloneDepth)) : 0; + + if (!fsSync.existsSync(repoPath)) { + if (!entry.url) return { ok: false, message: `Missing repo url for ${repoPath}` }; + await fs.mkdir(path.dirname(repoPath), { recursive: true }); + const cloneArgs = ['clone']; + if (depth > 0) cloneArgs.push('--depth', String(depth)); + if (branch) cloneArgs.push('--branch', branch); + cloneArgs.push(entry.url, repoPath); + const clone = runGit(cloneArgs, process.cwd()); + if (clone.status !== 0) { + return { ok: false, message: clone.stderr || clone.stdout || 'git clone failed' }; + } + return { ok: true, repoPath, action: 'clone' }; + } + + if (policy === 'none') return { ok: true, repoPath, action: 'skip' }; + const args = policy === 'fetch' ? 
['fetch', '--all', '--prune'] : ['pull', '--ff-only']; + const sync = runGit(args, repoPath); + if (sync.status !== 0) { + return { ok: false, repoPath, message: sync.stderr || sync.stdout || 'git sync failed' }; + } + return { ok: true, repoPath, action: policy }; +} diff --git a/tools/setup.js b/tools/setup.js index 581ae31a9..902cefb3f 100644 --- a/tools/setup.js +++ b/tools/setup.js @@ -2,60 +2,54 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import minimist from 'minimist'; +import os from 'node:os'; +import { createCli } from '../src/shared/cli.js'; import readline from 'node:readline/promises'; +import { readJsoncFile } from '../src/shared/jsonc.js'; import { getDictionaryPaths, getDictConfig, getIndexDir, getModelConfig, getRepoCacheRoot, + getRuntimeConfig, getToolingConfig, loadUserConfig, - resolveRepoRoot + resolveRuntimeEnv, + resolveRepoRoot, + resolveToolRoot } from './dict-utils.js'; import { runCommand as runCommandBase } from './cli-utils.js'; import { getVectorExtensionConfig, resolveVectorExtensionPath } from './vector-extension.js'; -const argv = minimist(process.argv.slice(2), { - boolean: [ - 'json', - 'non-interactive', - 'validate-config', - 'skip-validate', - 'skip-install', - 'skip-dicts', - 'skip-models', - 'skip-extensions', - 'skip-tooling', - 'skip-index', - 'skip-sqlite', - 'skip-artifacts', - 'with-sqlite', - 'incremental' - ], - string: ['root', 'repo', 'tooling-scope'], - alias: { ci: 'non-interactive', s: 'with-sqlite', i: 'incremental' }, - default: { - 'non-interactive': false, - 'validate-config': false, - 'skip-validate': false, - 'skip-install': false, - 'skip-dicts': false, - 'skip-models': false, - 'skip-extensions': false, - 'skip-tooling': false, - 'skip-index': false, - 'skip-sqlite': false, - 'skip-artifacts': false, - 'with-sqlite': false, - incremental: false, - json: false - } -}); +const argv = createCli({ + scriptName: 'setup', + options: { + json: { type: 'boolean', default: false }, + 'non-interactive': { type: 'boolean', default: false }, + 'validate-config': { type: 'boolean', default: false }, + 'skip-validate': { type: 'boolean', default: false }, + 'skip-install': { type: 'boolean', default: false }, + 'skip-dicts': { type: 'boolean', default: false }, + 'skip-models': { type: 'boolean', default: false }, + 'skip-extensions': { type: 'boolean', default: false }, + 'skip-tooling': { type: 'boolean', default: false }, + 'skip-index': { type: 'boolean', default: false }, + 'skip-sqlite': { type: 'boolean', default: false }, + 'skip-artifacts': { type: 'boolean', default: false }, + 'with-sqlite': { type: 'boolean', default: false }, + incremental: { type: 'boolean', default: false }, + root: { type: 'string' }, + repo: { type: 'string' }, + 'tooling-scope': { type: 'string' }, + 'heap-mb': { type: 'string' } + }, + aliases: { ci: 'non-interactive', s: 'with-sqlite', i: 'incremental' } +}).parse(); const explicitRoot = argv.root || argv.repo; const root = explicitRoot ? path.resolve(explicitRoot) : resolveRepoRoot(process.cwd()); +const toolRoot = resolveToolRoot(); const jsonOutput = argv.json === true; const nonInteractive = argv['non-interactive'] === true; const rl = nonInteractive ? 
null : readline.createInterface({ input: process.stdin, output: process.stdout }); @@ -110,8 +104,62 @@ async function promptChoice(question, choices, defaultChoice) { return match || defaultChoice; } +function formatGb(mb) { + return `${(mb / 1024).toFixed(1)} GB`; +} + +function getRecommendedHeapMb() { + const totalMb = Math.floor(os.totalmem() / (1024 * 1024)); + const recommended = Math.max(4096, Math.floor(totalMb * 0.75)); + const rounded = Math.floor(recommended / 256) * 256; + return { + totalMb, + recommendedMb: Math.max(4096, rounded) + }; +} + +async function updateRuntimeConfig(maxOldSpaceMb) { + const existing = configExists + ? readJsoncFile(configPath) + : {}; + const next = { + ...existing, + runtime: { + ...(existing.runtime || {}), + maxOldSpaceMb + } + }; + await fsPromises.writeFile(configPath, JSON.stringify(next, null, 2)); + configExists = true; + return next; +} + +async function updateProfileConfig(profileName) { + const existing = configExists + ? readJsoncFile(configPath) + : {}; + const next = { + ...existing, + profile: profileName + }; + await fsPromises.writeFile(configPath, JSON.stringify(next, null, 2)); + configExists = true; + return next; +} + +function buildRuntimeEnv(config) { + const runtimeConfig = getRuntimeConfig(root, config); + return resolveRuntimeEnv(runtimeConfig, process.env); +} + +let runtimeEnv = { ...process.env }; + function runCommand(cmd, args, options = {}) { - const spawnOptions = { cwd: root, ...options }; + const spawnOptions = { + cwd: root, + ...options, + env: { ...runtimeEnv, ...(options.env || {}) } + }; if (!('stdio' in spawnOptions)) { spawnOptions.stdio = jsonOutput ? 'pipe' : 'inherit'; } @@ -143,7 +191,7 @@ async function hasEntries(dirPath) { log(`Starting setup in ${root}`); const configPath = path.join(root, '.pairofcleats.json'); -const configExists = fs.existsSync(configPath); +let configExists = fs.existsSync(configPath); let shouldValidateConfig = argv['validate-config'] === true; if (!argv['skip-validate'] && configExists && !shouldValidateConfig && !nonInteractive) { shouldValidateConfig = await promptYesNo('Validate .pairofcleats.json now?', true); @@ -151,7 +199,7 @@ if (!argv['skip-validate'] && configExists && !shouldValidateConfig && !nonInter if (argv['skip-validate']) shouldValidateConfig = false; if (shouldValidateConfig && configExists) { - const args = [path.join(root, 'tools', 'validate-config.js'), '--config', configPath]; + const args = [path.join(toolRoot, 'tools', 'validate-config.js'), '--config', configPath]; if (jsonOutput) args.push('--json'); const result = runCommand(process.execPath, args); recordStep('config', { skipped: false, ok: result.ok, configPath }); @@ -169,13 +217,57 @@ if (shouldValidateConfig && configExists) { recordStep('config', { skipped: true, present: configExists, configPath }); } -const userConfig = loadUserConfig(root); +const profileName = typeof argv.profile === 'string' ? 
argv.profile.trim() : ''; +let userConfig = loadUserConfig(root); +if (profileName) { + await updateProfileConfig(profileName); + userConfig = loadUserConfig(root); + recordStep('profile', { configured: true, profile: profileName }); +} else { + recordStep('profile', { configured: false }); +} +runtimeEnv = buildRuntimeEnv(userConfig); const repoCacheRoot = getRepoCacheRoot(root, userConfig); const incrementalCacheRoot = path.join(repoCacheRoot, 'incremental'); const useIncremental = argv.incremental || fs.existsSync(incrementalCacheRoot); summary.incremental = useIncremental; if (useIncremental) log('Incremental indexing enabled.'); +const heapArgRaw = argv['heap-mb']; +const heapArg = Number.isFinite(Number(heapArgRaw)) ? Number(heapArgRaw) : null; +const currentHeap = Number(userConfig.runtime?.maxOldSpaceMb); +const heapConfigured = Number.isFinite(currentHeap) && currentHeap > 0; +const heapRecommendation = getRecommendedHeapMb(); +let runtimeUpdated = false; +let heapValue = heapConfigured ? currentHeap : null; + +if (Number.isFinite(heapArg) && heapArg > 0) { + userConfig = await updateRuntimeConfig(Math.floor(heapArg)); + runtimeEnv = buildRuntimeEnv(userConfig); + runtimeUpdated = true; + heapValue = Math.floor(heapArg); + log(`Configured Node heap limit at ${formatGb(heapValue)}.`); +} else if (!heapConfigured) { + const defaultYes = heapRecommendation.totalMb >= 16384; + const shouldSet = await promptYesNo( + `Set Node heap limit to ${formatGb(heapRecommendation.recommendedMb)}?`, + defaultYes + ); + if (shouldSet) { + userConfig = await updateRuntimeConfig(heapRecommendation.recommendedMb); + runtimeEnv = buildRuntimeEnv(userConfig); + runtimeUpdated = true; + heapValue = heapRecommendation.recommendedMb; + log(`Configured Node heap limit at ${formatGb(heapValue)}.`); + } +} +recordStep('runtime', { + configured: runtimeUpdated || heapConfigured, + maxOldSpaceMb: heapValue, + recommendedMb: heapRecommendation.recommendedMb, + skipped: !(runtimeUpdated || heapConfigured) +}); + const nodeModules = path.join(root, 'node_modules'); if (argv['skip-install']) { recordStep('install', { skipped: true, present: fs.existsSync(nodeModules) }); @@ -205,7 +297,7 @@ if (argv['skip-dicts']) { if (!hasDicts || needsEnglish) { const shouldDownload = await promptYesNo('Download English dictionary wordlist?', true); if (shouldDownload) { - const result = runCommand(process.execPath, [path.join(root, 'tools', 'download-dicts.js'), '--lang', 'en']); + const result = runCommand(process.execPath, [path.join(toolRoot, 'tools', 'download-dicts.js'), '--lang', 'en']); if (!result.ok) { warn('Dictionary download failed.'); recordError('dictionaries', result, 'download failed'); @@ -236,7 +328,7 @@ if (argv['skip-models']) { const shouldDownload = await promptYesNo(`Download embedding model ${modelConfig.id}?`, true); if (shouldDownload) { const result = runCommand(process.execPath, [ - path.join(root, 'tools', 'download-models.js'), + path.join(toolRoot, 'tools', 'download-models.js'), '--model', modelConfig.id, '--cache-dir', @@ -268,7 +360,7 @@ if (argv['skip-extensions']) { if (!hasExtension) { const shouldDownload = await promptYesNo('Download SQLite ANN extension?', true); if (shouldDownload) { - const result = runCommand(process.execPath, [path.join(root, 'tools', 'download-extensions.js')]); + const result = runCommand(process.execPath, [path.join(toolRoot, 'tools', 'download-extensions.js')]); if (!result.ok) { warn('Extension download failed.'); recordError('extensions', result, 'download 
failed'); @@ -301,7 +393,7 @@ if (argv['skip-tooling']) { let toolingInstalled = false; const detectResult = runCommand( process.execPath, - [path.join(root, 'tools', 'tooling-detect.js'), '--root', root, '--json'], + [path.join(toolRoot, 'tools', 'tooling-detect.js'), '--root', root, '--json'], { encoding: 'utf8', stdio: 'pipe' } ); if (detectResult.status === 0 && detectResult.stdout) { @@ -318,7 +410,7 @@ if (argv['skip-tooling']) { if (shouldInstall) { const scopeDefault = argv['tooling-scope'] || toolingConfig.installScope || 'cache'; const scope = await promptChoice('Install tooling scope', ['cache', 'global'], scopeDefault); - const installArgs = [path.join(root, 'tools', 'tooling-install.js'), '--root', root, '--scope', scope]; + const installArgs = [path.join(toolRoot, 'tools', 'tooling-install.js'), '--root', root, '--scope', scope]; if (!toolingConfig.allowGlobalFallback) installArgs.push('--no-fallback'); const result = runCommand(process.execPath, installArgs); if (!result.ok) { @@ -353,7 +445,7 @@ if (!argv['skip-artifacts']) { if (fs.existsSync(manifestPath)) { const shouldRestore = await promptYesNo('Restore CI artifacts from ci-artifacts?', true); if (shouldRestore) { - const result = runCommand(process.execPath, [path.join(root, 'tools', 'ci-restore-artifacts.js'), '--from', artifactsDir]); + const result = runCommand(process.execPath, [path.join(toolRoot, 'tools', 'ci-restore-artifacts.js'), '--from', artifactsDir]); restoredArtifacts = result.ok; if (!result.ok) { warn('CI artifact restore failed.'); @@ -369,8 +461,17 @@ recordStep('artifacts', { const codeIndexDir = getIndexDir(root, 'code', userConfig); const proseIndexDir = getIndexDir(root, 'prose', userConfig); -const codeIndexPresent = fs.existsSync(path.join(codeIndexDir, 'chunk_meta.json')); -const proseIndexPresent = fs.existsSync(path.join(proseIndexDir, 'chunk_meta.json')); +const hasChunkMeta = (indexDir) => { + const jsonPath = path.join(indexDir, 'chunk_meta.json'); + const jsonlPath = path.join(indexDir, 'chunk_meta.jsonl'); + const metaPath = path.join(indexDir, 'chunk_meta.meta.json'); + const partsDir = path.join(indexDir, 'chunk_meta.parts'); + return fs.existsSync(jsonPath) + || fs.existsSync(jsonlPath) + || (fs.existsSync(metaPath) && fs.existsSync(partsDir)); +}; +const codeIndexPresent = hasChunkMeta(codeIndexDir); +const proseIndexPresent = hasChunkMeta(proseIndexDir); let indexReady = restoredArtifacts || codeIndexPresent || proseIndexPresent; let indexBuilt = false; let indexBuildOk = true; @@ -381,7 +482,7 @@ if (!argv['skip-index'] && !restoredArtifacts) { !indexReady ); if (shouldBuild) { - const args = [path.join(root, 'build_index.js')]; + const args = [path.join(toolRoot, 'build_index.js')]; if (useIncremental) args.push('--incremental'); const result = runCommand(process.execPath, args); if (!result.ok) { @@ -397,7 +498,7 @@ if (!argv['skip-index'] && !restoredArtifacts) { let sqliteBuilt = false; let sqliteOk = true; if (!argv['skip-sqlite']) { - const sqliteConfigured = userConfig.sqlite?.use === true; + const sqliteConfigured = userConfig.sqlite?.use !== false; const sqliteDefault = argv['with-sqlite'] ? true : sqliteConfigured; const shouldBuildSqlite = argv['with-sqlite'] ? true @@ -406,7 +507,7 @@ if (!argv['skip-sqlite']) { if (!indexReady) { const shouldBuildIndex = await promptYesNo('SQLite build requires file-backed indexes. 
Build index now?', true); if (shouldBuildIndex && !argv['skip-index']) { - const args = [path.join(root, 'build_index.js')]; + const args = [path.join(toolRoot, 'build_index.js')]; if (useIncremental) args.push('--incremental'); const result = runCommand(process.execPath, args); if (!result.ok) { @@ -419,7 +520,7 @@ if (!argv['skip-sqlite']) { } } if (indexReady) { - const sqliteArgs = [path.join(root, 'tools', 'build-sqlite-index.js')]; + const sqliteArgs = [path.join(toolRoot, 'tools', 'build-sqlite-index.js')]; if (useIncremental) sqliteArgs.push('--incremental'); const result = runCommand(process.execPath, sqliteArgs); sqliteBuilt = true; @@ -451,6 +552,7 @@ recordStep('index', { if (rl) rl.close(); log('Setup complete.'); +log('Tip: run npm run index-validate to verify index artifacts.'); if (jsonOutput) { console.log(JSON.stringify(summary, null, 2)); } diff --git a/tools/shard-census.js b/tools/shard-census.js new file mode 100644 index 000000000..e1cef7664 --- /dev/null +++ b/tools/shard-census.js @@ -0,0 +1,269 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import { createCli } from '../src/shared/cli.js'; +import { loadUserConfig, resolveToolRoot } from './dict-utils.js'; +import { buildIgnoreMatcher } from '../src/index/build/ignore.js'; +import { discoverFilesForModes } from '../src/index/build/discover.js'; +import { planShardBatches, planShards } from '../src/index/build/shards.js'; +import { countLinesForEntries } from '../src/shared/file-stats.js'; + +const argv = createCli({ + scriptName: 'shard-census', + usage: 'Usage: shard-census --repo | --bench', + options: { + bench: { type: 'boolean', default: false }, + repo: { type: 'string' } + } +}).parse(); + +const scriptRoot = resolveToolRoot(); +const benchConfigPath = path.join(scriptRoot, 'benchmarks', 'repos.json'); +const benchReposRoot = path.join(scriptRoot, 'benchmarks', 'repos'); + +const normalizeLimit = (value, fallback) => { + if (value === 0 || value === false) return null; + const parsed = Number(value); + if (Number.isFinite(parsed) && parsed > 0) return Math.floor(parsed); + return fallback; +}; + +const normalizeDepth = (value, fallback) => { + if (value === 0) return 0; + if (value === false) return null; + const parsed = Number(value); + if (Number.isFinite(parsed) && parsed > 0) return Math.floor(parsed); + return fallback; +}; + +const normalizeCapValue = (value) => { + if (value === 0 || value === false) return null; + const parsed = Number(value); + if (Number.isFinite(parsed) && parsed > 0) return Math.floor(parsed); + return null; +}; + +const normalizeCapEntry = (raw) => { + const input = raw && typeof raw === 'object' ? raw : {}; + const maxBytes = normalizeCapValue(input.maxBytes); + const maxLines = normalizeCapValue(input.maxLines); + return { maxBytes, maxLines }; +}; + +const normalizeCapsByExt = (raw) => { + const input = raw && typeof raw === 'object' ? raw : {}; + const output = {}; + for (const [key, value] of Object.entries(input)) { + const entry = normalizeCapEntry(value); + if (entry.maxBytes == null && entry.maxLines == null) continue; + const normalizedKey = key.startsWith('.') ? key.toLowerCase() : `.${key.toLowerCase()}`; + output[normalizedKey] = entry; + } + return output; +}; + +const normalizeCapsByLanguage = (raw) => { + const input = raw && typeof raw === 'object' ? 
raw : {}; + const output = {}; + for (const [key, value] of Object.entries(input)) { + const entry = normalizeCapEntry(value); + if (entry.maxBytes == null && entry.maxLines == null) continue; + output[key.toLowerCase()] = entry; + } + return output; +}; + +const resolveMaxFileBytes = (indexingConfig) => { + const maxFileBytesRaw = indexingConfig?.maxFileBytes; + const maxFileBytesParsed = Number(maxFileBytesRaw); + if (maxFileBytesRaw === false || maxFileBytesRaw === 0) { + return null; + } + if (Number.isFinite(maxFileBytesParsed) && maxFileBytesParsed > 0) { + return maxFileBytesParsed; + } + return 5 * 1024 * 1024; +}; + +const resolveFileCaps = (indexingConfig) => { + const fileCapsConfig = indexingConfig?.fileCaps || {}; + return { + default: normalizeCapEntry(fileCapsConfig.default || {}), + byExt: normalizeCapsByExt(fileCapsConfig.byExt || {}), + byLanguage: normalizeCapsByLanguage(fileCapsConfig.byLanguage || {}) + }; +}; + +const resolveShardConfig = (indexingConfig) => { + const shardsConfig = indexingConfig?.shards || {}; + return { + enabled: shardsConfig.enabled === true, + maxShards: normalizeLimit(shardsConfig.maxShards, null), + minFiles: normalizeLimit(shardsConfig.minFiles, null), + dirDepth: normalizeDepth(shardsConfig.dirDepth, 3), + maxWorkers: normalizeLimit(shardsConfig.maxWorkers, null) + }; +}; + +const loadBenchConfig = async () => { + const raw = await fsPromises.readFile(benchConfigPath, 'utf8'); + return JSON.parse(raw); +}; + +const buildBenchTasks = (config) => { + const tasks = []; + for (const [language, entry] of Object.entries(config || {})) { + const repos = entry?.repos || {}; + for (const tier of Object.keys(repos)) { + const list = Array.isArray(repos[tier]) ? repos[tier] : []; + for (const repo of list) { + tasks.push({ language, repo, tier }); + } + } + } + return tasks; +}; + +const resolveRepoPath = async (repoArg) => { + if (!repoArg) return null; + const direct = path.resolve(repoArg); + if (fs.existsSync(direct)) return direct; + if (!repoArg.includes('/')) return null; + const config = await loadBenchConfig(); + const matches = []; + for (const [language, entry] of Object.entries(config || {})) { + const repos = entry?.repos || {}; + for (const list of Object.values(repos)) { + if (!Array.isArray(list)) continue; + if (list.includes(repoArg)) { + matches.push(path.join(benchReposRoot, language, repoArg)); + } + } + } + if (matches.length === 1) return matches[0]; + return null; +}; + +const formatNumber = (value) => value.toLocaleString('en-US'); + +const censusRepo = async (repoPath, label) => { + const userConfig = loadUserConfig(repoPath); + const indexingConfig = userConfig.indexing || {}; + const maxFileBytes = resolveMaxFileBytes(indexingConfig); + const fileCaps = resolveFileCaps(indexingConfig); + const shardConfig = resolveShardConfig(indexingConfig); + const { ignoreMatcher } = await buildIgnoreMatcher({ root: repoPath, userConfig }); + + const modes = ['code', 'prose']; + const skippedByMode = { code: [], prose: [] }; + const entriesByMode = await discoverFilesForModes({ + root: repoPath, + modes, + ignoreMatcher, + skippedByMode, + maxFileBytes, + fileCaps + }); + + const concurrency = Math.max(1, Math.min(32, os.cpus().length * 2)); + console.log(`\n${label}`); + console.log(`Repo: ${repoPath}`); + for (const mode of modes) { + const entries = entriesByMode[mode] || []; + if (!entries.length) { + console.log(`Mode ${mode}: no files`); + continue; + } + const lineCounts = await countLinesForEntries(entries, { concurrency }); + 
const shards = planShards(entries, { + mode, + maxShards: shardConfig.maxShards, + minFiles: shardConfig.minFiles, + dirDepth: shardConfig.dirDepth, + lineCounts + }); + const shardStats = shards.map((shard) => { + const lines = Number.isFinite(shard.lineCount) ? shard.lineCount : 0; + return { + id: shard.id, + label: shard.label || shard.id, + files: shard.entries.length, + lines + }; + }); + shardStats.sort((a, b) => { + if (b.lines !== a.lines) return b.lines - a.lines; + if (b.files !== a.files) return b.files - a.files; + return a.label < b.label ? -1 : a.label > b.label ? 1 : 0; + }); + const totalFiles = entries.length; + const totalLines = shardStats.reduce((sum, shard) => sum + shard.lines, 0); + console.log( + `Mode ${mode}: ${shardStats.length} shards, ${formatNumber(totalFiles)} files, ${formatNumber(totalLines)} lines` + ); + for (const shard of shardStats) { + console.log( + `- ${shard.label} | files ${formatNumber(shard.files)} | lines ${formatNumber(shard.lines)}` + ); + } + if (shardConfig.maxWorkers) { + const shardBatches = planShardBatches(shards, shardConfig.maxWorkers, { + resolveWeight: (shard) => shard.costMs || shard.lineCount || shard.entries.length || 0 + }); + if (shardBatches.length) { + console.log(`Batch plan (${shardBatches.length} workers):`); + shardBatches.forEach((batch, index) => { + const batchFiles = batch.reduce((sum, shard) => sum + shard.entries.length, 0); + const batchLines = batch.reduce((sum, shard) => sum + (shard.lineCount || 0), 0); + console.log( + `- batch ${index + 1} | shards ${batch.length} | files ${formatNumber(batchFiles)} | lines ${formatNumber(batchLines)}` + ); + }); + } + } + } +}; + +const main = async () => { + if (argv.bench && argv.repo) { + console.error('Use either --bench or --repo, not both.'); + process.exit(1); + } + if (!argv.bench && !argv.repo) { + console.error('Missing --bench or --repo.'); + process.exit(1); + } + if (argv.bench) { + const config = await loadBenchConfig(); + const tasks = buildBenchTasks(config); + let missing = 0; + for (const task of tasks) { + const repoPath = path.join(benchReposRoot, task.language, task.repo); + const label = `${task.language}/${task.repo}`; + if (!fs.existsSync(repoPath)) { + console.error(`Missing ${label} at ${repoPath}`); + missing += 1; + continue; + } + await censusRepo(repoPath, label); + } + if (missing) { + console.error(`Skipped ${missing} repos (missing on disk).`); + } + return; + } + + const repoPath = await resolveRepoPath(argv.repo); + if (!repoPath || !fs.existsSync(repoPath)) { + console.error(`Repo not found: ${argv.repo}`); + process.exit(1); + } + await censusRepo(repoPath, `repo ${argv.repo}`); +}; + +main().catch((err) => { + console.error(err?.stack || err?.message || err); + process.exit(1); +}); diff --git a/tools/show-throughput.js b/tools/show-throughput.js new file mode 100644 index 000000000..d98869ce1 --- /dev/null +++ b/tools/show-throughput.js @@ -0,0 +1,268 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; +import { color } from '../src/retrieval/cli/ansi.js'; +import { getMetricsDir, loadUserConfig } from './dict-utils.js'; + +const resultsRoot = path.join(process.cwd(), 'benchmarks', 'results'); + +const listDirs = (root) => fs.existsSync(root) + ? fs.readdirSync(root, { withFileTypes: true }).filter((entry) => entry.isDirectory()) + : []; + +const formatNumber = (value, digits = 1) => ( + Number.isFinite(value) ? value.toFixed(digits) : 'n/a' +); + +const formatCount = (value) => ( + Number.isFinite(value) ? 
value.toLocaleString() : 'n/a' +); + +const formatMs = (value) => { + if (!Number.isFinite(value)) return 'n/a'; + if (value < 1000) return `${Math.round(value)}ms`; + const seconds = value / 1000; + if (seconds < 60) return `${seconds.toFixed(1)}s`; + const minutes = Math.floor(seconds / 60); + const rem = (seconds % 60).toFixed(0); + return `${minutes}m ${rem}s`; +}; + +const formatBytesPerSec = (value) => { + if (!Number.isFinite(value)) return 'n/a'; + const mb = value / (1024 * 1024); + if (mb < 1024) return `${mb.toFixed(1)} MB/s`; + return `${(mb / 1024).toFixed(2)} GB/s`; +}; + +const mean = (values) => { + if (!values.length) return null; + return values.reduce((sum, val) => sum + val, 0) / values.length; +}; + +const collect = (items, selector) => items + .map((item) => selector(item)) + .filter((value) => Number.isFinite(value)); + +const mergeTotals = (target, entry) => { + if (!entry) return; + if (Number.isFinite(entry.files)) target.files += entry.files; + if (Number.isFinite(entry.chunks)) target.chunks += entry.chunks; + if (Number.isFinite(entry.tokens)) target.tokens += entry.tokens; + if (Number.isFinite(entry.bytes)) target.bytes += entry.bytes; + if (Number.isFinite(entry.totalMs)) target.totalMs += entry.totalMs; +}; + +const rateFromTotals = (totals, key) => { + if (!Number.isFinite(totals.totalMs) || totals.totalMs <= 0) return null; + const value = totals[key]; + if (!Number.isFinite(value)) return null; + return value / (totals.totalMs / 1000); +}; + +const sumRates = (...values) => { + let sum = 0; + let found = false; + for (const value of values) { + if (!Number.isFinite(value)) continue; + sum += value; + found = true; + } + return found ? sum : null; +}; + +const loadJson = (filePath) => { + try { + return JSON.parse(fs.readFileSync(filePath, 'utf8')); + } catch { + return null; + } +}; + +const loadFeatureMetrics = (repoRoot) => { + if (!repoRoot) return null; + const userConfig = loadUserConfig(repoRoot); + const metricsDir = getMetricsDir(repoRoot, userConfig); + const runPath = path.join(metricsDir, 'feature-metrics-run.json'); + const mergedPath = path.join(metricsDir, 'feature-metrics.json'); + return loadJson(runPath) || loadJson(mergedPath); +}; + +const collectLanguageLines = (metrics, totals) => { + if (!metrics || !metrics.modes) return; + for (const modeEntry of Object.values(metrics.modes)) { + const languages = modeEntry?.languages || {}; + for (const [language, bucket] of Object.entries(languages)) { + const lines = Number(bucket?.lines) || 0; + if (!lines) continue; + totals.set(language, (totals.get(language) || 0) + lines); + } + } +}; + +if (!fs.existsSync(resultsRoot)) { + console.error(`No benchmark results found at ${resultsRoot}`); + process.exit(1); +} + +const folders = listDirs(resultsRoot).filter((dir) => dir.name !== 'logs'); +if (!folders.length) { + console.log('No benchmark results folders found.'); + process.exit(0); +} + +const totalThroughput = { + code: { files: 0, chunks: 0, tokens: 0, bytes: 0, totalMs: 0 }, + prose: { files: 0, chunks: 0, tokens: 0, bytes: 0, totalMs: 0 } +}; +const languageTotals = new Map(); +const reposWithMetrics = new Set(); + +console.log(color.bold(color.cyan('Benchmark Performance Overview'))); +console.log(color.gray(`Root: ${resultsRoot}`)); + +for (const dir of folders) { + const folderPath = path.join(resultsRoot, dir.name); + const files = fs.readdirSync(folderPath).filter((name) => name.endsWith('.json')); + const runs = []; + const throughputs = []; + + for (const file of files) { + const 
payload = loadJson(path.join(folderPath, file)); + if (!payload) continue; + const summary = payload.summary || payload.runs?.[0] || null; + const throughput = payload.artifacts?.throughput || {}; + runs.push({ file, summary, throughput }); + throughputs.push(throughput); + mergeTotals(totalThroughput.code, throughput.code); + mergeTotals(totalThroughput.prose, throughput.prose); + const repoRoot = payload.repo?.root; + if (repoRoot && !reposWithMetrics.has(repoRoot)) { + const metrics = loadFeatureMetrics(repoRoot); + if (metrics) { + collectLanguageLines(metrics, languageTotals); + reposWithMetrics.add(repoRoot); + } + } + } + + const header = `${dir.name} (${runs.length} run${runs.length === 1 ? '' : 's'})`; + console.log(''); + console.log(color.bold(color.blue(header))); + + if (!runs.length) { + console.log(color.gray(' No benchmark JSON files found.')); + continue; + } + + const code = throughputs.map((t) => t.code).filter(Boolean); + const prose = throughputs.map((t) => t.prose).filter(Boolean); + + if (code.length) { + console.log( + ` ${color.bold('Code throughput')}: ` + + `${formatNumber(mean(collect(code, (c) => c.chunksPerSec)))} chunks/s | ` + + `${formatNumber(mean(collect(code, (c) => c.tokensPerSec)))} tokens/s | ` + + `${formatBytesPerSec(mean(collect(code, (c) => c.bytesPerSec)))} | ` + + `${formatNumber(mean(collect(code, (c) => c.filesPerSec)))} files/s` + ); + } + + if (prose.length) { + console.log( + ` ${color.bold('Prose throughput')}: ` + + `${formatNumber(mean(collect(prose, (c) => c.chunksPerSec)))} chunks/s | ` + + `${formatNumber(mean(collect(prose, (c) => c.tokensPerSec)))} tokens/s | ` + + `${formatBytesPerSec(mean(collect(prose, (c) => c.bytesPerSec)))} | ` + + `${formatNumber(mean(collect(prose, (c) => c.filesPerSec)))} files/s` + ); + } + + const summaries = runs.map((r) => r.summary).filter(Boolean); + if (summaries.length) { + const wallPerQuery = mean(collect(summaries, (s) => s.queryWallMsPerQuery)); + const wallPerSearch = mean(collect(summaries, (s) => s.queryWallMsPerSearch)); + if (wallPerQuery || wallPerSearch) { + console.log( + ` ${color.bold('Query wall time')}: ` + + `avg/query ${formatMs(wallPerQuery)} | avg/search ${formatMs(wallPerSearch)}` + ); + } + + const backendLatency = {}; + for (const summary of summaries) { + const latency = summary.latencyMs || {}; + for (const [backend, stats] of Object.entries(latency)) { + if (!backendLatency[backend]) backendLatency[backend] = { mean: [], p95: [] }; + if (Number.isFinite(stats?.mean)) backendLatency[backend].mean.push(stats.mean); + if (Number.isFinite(stats?.p95)) backendLatency[backend].p95.push(stats.p95); + } + } + const latencyLine = Object.entries(backendLatency) + .map(([backend, stats]) => ( + `${backend} ${formatNumber(mean(stats.mean))}ms (p95 ${formatNumber(mean(stats.p95))}ms)` + )) + .join(' | '); + if (latencyLine) { + console.log(` ${color.bold('Latency')}: ${latencyLine}`); + } + + const buildIndexMs = mean(collect(summaries, (s) => s.buildMs?.index)); + const buildSqliteMs = mean(collect(summaries, (s) => s.buildMs?.sqlite)); + if (buildIndexMs || buildSqliteMs) { + console.log( + ` ${color.bold('Build time')}: ` + + `index ${formatMs(buildIndexMs)} | sqlite ${formatMs(buildSqliteMs)}` + ); + } + } + + console.log(color.gray(' Runs:')); + for (const run of runs) { + const repoLabel = run.file.replace(/\.json$/, ''); + const codeStats = run.throughput?.code || {}; + const proseStats = run.throughput?.prose || {}; + const summary = run.summary || {}; + const line = [ + 
color.bold(repoLabel), + `code ${formatNumber(codeStats.chunksPerSec)} ch/s`, + `prose ${formatNumber(proseStats.chunksPerSec)} ch/s`, + `query ${formatMs(summary.queryWallMsPerQuery)}` + ].join(' | '); + console.log(` ${line}`); + } +} + +const totalFilesPerSec = sumRates( + rateFromTotals(totalThroughput.code, 'files'), + rateFromTotals(totalThroughput.prose, 'files') +); +const totalChunksPerSec = sumRates( + rateFromTotals(totalThroughput.code, 'chunks'), + rateFromTotals(totalThroughput.prose, 'chunks') +); +const totalTokensPerSec = sumRates( + rateFromTotals(totalThroughput.code, 'tokens'), + rateFromTotals(totalThroughput.prose, 'tokens') +); +const totalBytesPerSec = sumRates( + rateFromTotals(totalThroughput.code, 'bytes'), + rateFromTotals(totalThroughput.prose, 'bytes') +); + +console.log(''); +console.log(color.bold(color.green('Totals'))); +console.log( + ` ${color.bold('Files')}: ${formatNumber(totalFilesPerSec)} files/s | ` + + `${color.bold('Chunks')}: ${formatNumber(totalChunksPerSec)} chunks/s | ` + + `${color.bold('Tokens')}: ${formatNumber(totalTokensPerSec)} tokens/s | ` + + `${color.bold('Bytes')}: ${formatBytesPerSec(totalBytesPerSec)}` +); +if (languageTotals.size) { + const sortedLanguages = Array.from(languageTotals.entries()) + .sort((a, b) => b[1] - a[1]); + console.log(` ${color.bold('Lines by language')}:`); + for (const [language, lines] of sortedLanguages) { + console.log(` ${language}: ${formatCount(lines)} lines`); + } +} diff --git a/tools/structural-search.js b/tools/structural-search.js new file mode 100644 index 000000000..19d7c4320 --- /dev/null +++ b/tools/structural-search.js @@ -0,0 +1,93 @@ +#!/usr/bin/env node +import fs from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { createCli } from '../src/shared/cli.js'; +import { loadRegistry, resolvePacks } from '../src/experimental/structural/registry.js'; +import { runStructuralSearch } from '../src/experimental/structural/runner.js'; +import { writeJson, writeJsonl } from '../src/experimental/structural/io.js'; +import { loadUserConfig, resolveRepoRoot } from './dict-utils.js'; + +const argv = createCli({ + scriptName: 'structural-search', + options: { + repo: { type: 'string' }, + engine: { type: 'string' }, + pack: { type: 'array' }, + registry: { type: 'string' }, + rule: { type: 'array' }, + format: { type: 'string', default: 'jsonl' }, + out: { type: 'string' }, + json: { type: 'boolean', default: false }, + profile: { type: 'string' }, + 'list-packs': { type: 'boolean', default: false } + } +}).parse(); + +const scriptRoot = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); +const userConfig = loadUserConfig(repoRoot, { profile: argv.profile }); +if (userConfig.profile !== 'full') { + console.error('structural-search is experimental. Run with profile=full or set PAIROFCLEATS_PROFILE=full.'); + process.exit(1); +} +const registryPath = argv.registry + ? path.resolve(argv.registry) + : path.resolve(scriptRoot, '..', 'rules', 'registry.json'); +const outputPath = argv.out ? path.resolve(argv.out) : null; +const format = argv.json ? 
'json' : (argv.format || 'jsonl'); + +const registry = loadRegistry(registryPath); +if (argv['list-packs']) { + const output = registry.packs.map((pack) => ({ + id: pack.id, + label: pack.label, + engine: pack.engine, + rules: pack.rules + })); + console.log(JSON.stringify(output, null, 2)); + process.exit(0); +} + +const packIds = (argv.pack || []).map((entry) => String(entry).trim()).filter(Boolean); +const rulePaths = (argv.rule || []).map((entry) => String(entry)).filter(Boolean); +const engineOverride = argv.engine ? String(argv.engine).trim() : ''; + +const { selectedPacks, missingPacks } = resolvePacks(registry, packIds); +if (missingPacks.length) { + console.error(`Unknown packs: ${missingPacks.join(', ')}`); +} + +if (!selectedPacks.length && !engineOverride) { + console.error('No packs selected and no engine specified.'); + process.exit(1); +} + +const resolveRulePath = (rulePath) => { + if (!rulePath) return null; + const resolved = path.isAbsolute(rulePath) + ? rulePath + : path.resolve(scriptRoot, '..', rulePath); + return fs.existsSync(resolved) ? resolved : null; +}; + +const packsToRun = selectedPacks.map((pack) => ({ + pack, + engine: pack.engine, + rules: pack.rules.map(resolveRulePath).filter(Boolean) +})); +if (engineOverride || rulePaths.length) { + packsToRun.push({ + pack: null, + engine: engineOverride, + rules: rulePaths.map(resolveRulePath).filter(Boolean) + }); +} + +const results = runStructuralSearch({ repoRoot, packsToRun }); + +if (format === 'json') { + await writeJson(results, outputPath); +} else { + writeJsonl(results, outputPath); +} diff --git a/tools/tooling-detect.js b/tools/tooling-detect.js index ea57ffe5d..23933cfcb 100644 --- a/tools/tooling-detect.js +++ b/tools/tooling-detect.js @@ -1,19 +1,26 @@ #!/usr/bin/env node -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; +import path from 'node:path'; import { buildToolingReport, normalizeLanguageList } from './tooling-utils.js'; import { resolveRepoRoot } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['json'], - string: ['root', 'repo', 'languages'], - default: { json: false } -}); +const argv = createCli({ + scriptName: 'tooling-detect', + options: { + json: { type: 'boolean', default: false }, + root: { type: 'string' }, + repo: { type: 'string' }, + languages: { type: 'string' } + } +}).parse(); const explicitRoot = argv.root || argv.repo; const root = explicitRoot ? 
path.resolve(explicitRoot) : resolveRepoRoot(process.cwd()); const languageOverride = normalizeLanguageList(argv.languages); -const report = await buildToolingReport(root, languageOverride); +const report = await buildToolingReport(root, languageOverride, { + skipScan: languageOverride.length > 0 +}); if (argv.json) { console.log(JSON.stringify(report, null, 2)); diff --git a/tools/tooling-install.js b/tools/tooling-install.js index 5749b87e8..ade0315f0 100644 --- a/tools/tooling-install.js +++ b/tools/tooling-install.js @@ -1,14 +1,23 @@ #!/usr/bin/env node -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; +import path from 'node:path'; import { spawnSync } from 'node:child_process'; import { buildToolingReport, detectTool, normalizeLanguageList, resolveToolsById, resolveToolsForLanguages, selectInstallPlan } from './tooling-utils.js'; import { getToolingConfig, resolveRepoRoot } from './dict-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['json', 'dry-run', 'no-fallback'], - string: ['root', 'repo', 'scope', 'languages', 'tools'], - default: { 'dry-run': false, json: false, 'no-fallback': false } -}); +const argv = createCli({ + scriptName: 'tooling-install', + options: { + json: { type: 'boolean', default: false }, + 'dry-run': { type: 'boolean', default: false }, + 'no-fallback': { type: 'boolean', default: false }, + root: { type: 'string' }, + repo: { type: 'string' }, + scope: { type: 'string' }, + languages: { type: 'string' }, + tools: { type: 'string' } + } +}).parse(); const explicitRoot = argv.root || argv.repo; const root = explicitRoot ? path.resolve(explicitRoot) : resolveRepoRoot(process.cwd()); @@ -18,11 +27,13 @@ const allowFallback = argv['no-fallback'] ? false : toolingConfig.allowGlobalFal const languageOverride = normalizeLanguageList(argv.languages); const toolOverride = normalizeLanguageList(argv.tools); -const report = await buildToolingReport(root, languageOverride); +const report = toolOverride.length + ? { languages: {}, formats: {} } + : await buildToolingReport(root, languageOverride, { skipScan: languageOverride.length > 0 }); const languageList = languageOverride.length ? languageOverride : Object.keys(report.languages || {}); const tools = toolOverride.length - ? resolveToolsById(toolOverride, toolingConfig.dir, root) - : resolveToolsForLanguages(languageList, toolingConfig.dir, root); + ? 
resolveToolsById(toolOverride, toolingConfig.dir, root, toolingConfig) + : resolveToolsForLanguages(languageList, toolingConfig.dir, root, toolingConfig); const actions = []; const results = []; diff --git a/tools/tooling-utils.js b/tools/tooling-utils.js index f5ee5b450..766670b53 100644 --- a/tools/tooling-utils.js +++ b/tools/tooling-utils.js @@ -1,20 +1,21 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import { spawnSync } from 'node:child_process'; -import { SKIP_DIRS, SKIP_FILES } from '../src/indexer/constants.js'; +import { execaSync } from 'execa'; +import { LOCK_FILES, MANIFEST_FILES, SKIP_DIRS, SKIP_FILES } from '../src/index/constants.js'; import { getToolingConfig } from './dict-utils.js'; const LANGUAGE_EXTENSIONS = { javascript: ['.js', '.mjs', '.cjs'], typescript: ['.ts', '.tsx', '.mts', '.cts'], - python: ['.py'], + python: ['.py', '.pyi'], c: ['.c', '.h'], cpp: ['.cc', '.cpp', '.hpp', '.hh'], objc: ['.m', '.mm'], rust: ['.rs'], go: ['.go'], java: ['.java'], + swift: ['.swift'], shell: ['.sh', '.bash', '.zsh', '.ksh'], csharp: ['.cs'], kotlin: ['.kt', '.kts'], @@ -36,19 +37,34 @@ const FORMAT_EXTENSIONS = { const FORMAT_FILENAMES = { dockerfile: ['dockerfile'], - makefile: ['makefile'] + makefile: ['makefile', 'gnumakefile'], + manifest: Array.from(MANIFEST_FILES), + lockfile: Array.from(LOCK_FILES) +}; + +const FORMAT_FILENAME_PREFIXES = { + dockerfile: ['dockerfile.'], + makefile: ['makefile.'] }; const TOOL_DOCS = { tsserver: 'https://www.typescriptlang.org/', + 'typescript-language-server': 'https://github.com/typescript-language-server/typescript-language-server', clangd: 'https://clangd.llvm.org/installation', 'rust-analyzer': 'https://rust-analyzer.github.io/', gopls: 'https://pkg.go.dev/golang.org/x/tools/gopls', jdtls: 'https://github.com/eclipse-jdtls/eclipse.jdt.ls', + 'sourcekit-lsp': 'https://www.swift.org/download/', 'kotlin-language-server': 'https://github.com/fwcd/kotlin-language-server', + 'kotlin-lsp': 'https://kotlinlang.org/docs/', + pyright: 'https://github.com/microsoft/pyright', omnisharp: 'https://github.com/OmniSharp/omnisharp-roslyn', + 'csharp-ls': 'https://github.com/razzmatazz/csharp-language-server', + 'ruby-lsp': 'https://shopify.github.io/ruby-lsp/', solargraph: 'https://solargraph.org/', phpactor: 'https://phpactor.readthedocs.io/', + intelephense: 'https://github.com/bmewburn/intelephense-docs', + 'bash-language-server': 'https://github.com/bash-lsp/bash-language-server', 'lua-language-server': 'https://github.com/LuaLS/lua-language-server', sqls: 'https://github.com/lighttiger2505/sqls' }; @@ -72,14 +88,18 @@ function findBinaryInDirs(name, dirs) { } function canRun(cmd, args = ['--version']) { - const result = spawnSync(cmd, args, { encoding: 'utf8' }); - return result.status === 0; + try { + const result = execaSync(cmd, args, { encoding: 'utf8', stdio: 'ignore', reject: false }); + return result.exitCode === 0; + } catch { + return false; + } } async function scanRepo(root) { const extCounts = new Map(); - const filePaths = []; const lowerNames = new Set(); + let workflowCount = 0; const visit = async (dir) => { let entries; try { @@ -98,12 +118,16 @@ async function scanRepo(root) { if (SKIP_FILES.has(entry.name)) continue; const ext = path.extname(entry.name).toLowerCase(); if (ext) extCounts.set(ext, (extCounts.get(ext) || 0) + 1); - filePaths.push(abs); lowerNames.add(entry.name.toLowerCase()); + const normalized = abs.replace(/\\/g, '/').toLowerCase(); + if 
(normalized.includes('/.github/workflows/') + && (normalized.endsWith('.yml') || normalized.endsWith('.yaml'))) { + workflowCount += 1; + } } }; await visit(root); - return { extCounts, filePaths, lowerNames }; + return { extCounts, lowerNames, workflowCount }; } function buildLangHits(extCounts) { @@ -117,8 +141,16 @@ function buildLangHits(extCounts) { return hits; } -function buildFormatHits(extCounts, lowerNames, filePaths) { +function buildFormatHits(extCounts, lowerNames, workflowCount) { const hits = {}; + const hasPrefixName = (prefix) => { + const key = prefix.toLowerCase(); + if (lowerNames.has(key)) return true; + for (const name of lowerNames) { + if (name.startsWith(key)) return true; + } + return false; + }; for (const [format, exts] of Object.entries(FORMAT_EXTENSIONS)) { const matched = exts.filter((ext) => extCounts.has(ext)); if (!matched.length) continue; @@ -126,25 +158,23 @@ function buildFormatHits(extCounts, lowerNames, filePaths) { hits[format] = { extensions: matched, files: count }; } for (const [format, names] of Object.entries(FORMAT_FILENAMES)) { - if (names.some((name) => lowerNames.has(name))) { + const prefixes = FORMAT_FILENAME_PREFIXES[format] || []; + const hasExact = names.some((name) => lowerNames.has(name)); + const hasPrefix = prefixes.some((prefix) => hasPrefixName(prefix)); + if (hasExact || hasPrefix) { hits[format] = { filenames: names, files: names.length }; } } - const ghWorkflows = filePaths.filter((filePath) => { - const normalized = filePath.replace(/\\/g, '/').toLowerCase(); - if (!normalized.includes('/.github/workflows/')) return false; - return normalized.endsWith('.yml') || normalized.endsWith('.yaml'); - }); - if (ghWorkflows.length) { - hits['github-actions'] = { extensions: ['.yml', '.yaml'], files: ghWorkflows.length }; + if (workflowCount) { + hits['github-actions'] = { extensions: ['.yml', '.yaml'], files: workflowCount }; } return hits; } export async function detectRepoLanguages(root) { - const { extCounts, filePaths, lowerNames } = await scanRepo(root); + const { extCounts, lowerNames, workflowCount } = await scanRepo(root); const languages = buildLangHits(extCounts); - const formats = buildFormatHits(extCounts, lowerNames, filePaths); + const formats = buildFormatHits(extCounts, lowerNames, workflowCount); return { languages, formats, extCounts }; } @@ -170,6 +200,17 @@ export function getToolingRegistry(toolingRoot, repoRoot) { }, docs: TOOL_DOCS.tsserver }, + { + id: 'typescript-language-server', + label: 'TypeScript language server', + languages: ['typescript'], + detect: { cmd: 'typescript-language-server', args: ['--version'], binDirs: [repoNodeBin, nodeBin] }, + install: { + cache: { cmd: 'npm', args: ['install', '--prefix', nodeDir, 'typescript-language-server'] }, + user: { cmd: 'npm', args: ['install', '-g', 'typescript-language-server'] } + }, + docs: TOOL_DOCS['typescript-language-server'] + }, { id: 'clangd', label: 'clangd', @@ -180,6 +221,27 @@ export function getToolingRegistry(toolingRoot, repoRoot) { }, docs: TOOL_DOCS.clangd }, + { + id: 'sourcekit-lsp', + label: 'SourceKit-LSP', + languages: ['swift'], + detect: { cmd: 'sourcekit-lsp', args: ['--help'], binDirs: [] }, + install: { + manual: true + }, + docs: TOOL_DOCS['sourcekit-lsp'] + }, + { + id: 'pyright', + label: 'Pyright', + languages: ['python'], + detect: { cmd: 'pyright', args: ['--version'], binDirs: [repoNodeBin, nodeBin] }, + install: { + cache: { cmd: 'npm', args: ['install', '--prefix', nodeDir, 'pyright'] }, + user: { cmd: 'npm', args: 
['install', '-g', 'pyright'] } + }, + docs: TOOL_DOCS.pyright + }, { id: 'rust-analyzer', label: 'rust-analyzer', @@ -221,6 +283,16 @@ export function getToolingRegistry(toolingRoot, repoRoot) { }, docs: TOOL_DOCS['kotlin-language-server'] }, + { + id: 'kotlin-lsp', + label: 'Kotlin LSP', + languages: ['kotlin'], + detect: { cmd: 'kotlin-lsp', args: ['--version'], binDirs: [] }, + install: { + manual: true + }, + docs: TOOL_DOCS['kotlin-lsp'] + }, { id: 'omnisharp', label: 'OmniSharp', @@ -232,6 +304,28 @@ export function getToolingRegistry(toolingRoot, repoRoot) { }, docs: TOOL_DOCS.omnisharp }, + { + id: 'csharp-ls', + label: 'C# LSP (Roslyn)', + languages: ['csharp'], + detect: { cmd: 'csharp-ls', args: ['--version'], binDirs: [dotnetDir] }, + install: { + cache: { cmd: 'dotnet', args: ['tool', 'install', '--tool-path', dotnetDir, 'csharp-ls'], requires: 'dotnet' }, + user: { cmd: 'dotnet', args: ['tool', 'install', '-g', 'csharp-ls'], requires: 'dotnet' } + }, + docs: TOOL_DOCS['csharp-ls'] + }, + { + id: 'ruby-lsp', + label: 'Ruby LSP', + languages: ['ruby'], + detect: { cmd: 'ruby-lsp', args: ['--version'], binDirs: [binDir] }, + install: { + cache: { cmd: 'gem', args: ['install', '-i', gemsDir, '-n', binDir, 'ruby-lsp'], requires: 'gem' }, + user: { cmd: 'gem', args: ['install', 'ruby-lsp'], requires: 'gem' } + }, + docs: TOOL_DOCS['ruby-lsp'] + }, { id: 'solargraph', label: 'Solargraph', @@ -254,6 +348,17 @@ export function getToolingRegistry(toolingRoot, repoRoot) { }, docs: TOOL_DOCS.phpactor }, + { + id: 'intelephense', + label: 'Intelephense', + languages: ['php'], + detect: { cmd: 'intelephense', args: ['--version'], binDirs: [repoNodeBin, nodeBin] }, + install: { + cache: { cmd: 'npm', args: ['install', '--prefix', nodeDir, 'intelephense'] }, + user: { cmd: 'npm', args: ['install', '-g', 'intelephense'] } + }, + docs: TOOL_DOCS.intelephense + }, { id: 'lua-language-server', label: 'lua-language-server', @@ -264,6 +369,17 @@ export function getToolingRegistry(toolingRoot, repoRoot) { }, docs: TOOL_DOCS['lua-language-server'] }, + { + id: 'bash-language-server', + label: 'bash-language-server', + languages: ['shell'], + detect: { cmd: 'bash-language-server', args: ['--version'], binDirs: [repoNodeBin, nodeBin] }, + install: { + cache: { cmd: 'npm', args: ['install', '--prefix', nodeDir, 'bash-language-server'] }, + user: { cmd: 'npm', args: ['install', '-g', 'bash-language-server'] } + }, + docs: TOOL_DOCS['bash-language-server'] + }, { id: 'sqls', label: 'sqls', @@ -278,16 +394,33 @@ export function getToolingRegistry(toolingRoot, repoRoot) { ]; } -export function resolveToolsForLanguages(languages, toolingRoot, repoRoot) { +function filterToolsByConfig(tools, toolingConfig) { + const enabled = Array.isArray(toolingConfig?.enabledTools) ? toolingConfig.enabledTools : []; + const disabled = Array.isArray(toolingConfig?.disabledTools) ? 
toolingConfig.disabledTools : []; + let filtered = tools; + if (enabled.length) { + const enabledSet = new Set(enabled); + filtered = filtered.filter((tool) => enabledSet.has(tool.id)); + } + if (disabled.length) { + const disabledSet = new Set(disabled); + filtered = filtered.filter((tool) => !disabledSet.has(tool.id)); + } + return filtered; +} + +export function resolveToolsForLanguages(languages, toolingRoot, repoRoot, toolingConfig = null) { const languageSet = new Set(languages); const registry = getToolingRegistry(toolingRoot, repoRoot); - return registry.filter((tool) => tool.languages.some((lang) => languageSet.has(lang))); + const matched = registry.filter((tool) => tool.languages.some((lang) => languageSet.has(lang))); + return filterToolsByConfig(matched, toolingConfig); } -export function resolveToolsById(ids, toolingRoot, repoRoot) { +export function resolveToolsById(ids, toolingRoot, repoRoot, toolingConfig = null) { const idSet = new Set(ids); const registry = getToolingRegistry(toolingRoot, repoRoot); - return registry.filter((tool) => idSet.has(tool.id)); + const matched = registry.filter((tool) => idSet.has(tool.id)); + return filterToolsByConfig(matched, toolingConfig); } export function detectTool(tool) { @@ -317,13 +450,22 @@ export function hasCommand(cmd) { return canRun(cmd, ['--version']); } -export async function buildToolingReport(root, languageOverride = null) { +export async function buildToolingReport(root, languageOverride = null, options = {}) { const toolingConfig = getToolingConfig(root); - const { languages, formats } = await detectRepoLanguages(root); + const skipScan = options.skipScan === true; + const detected = skipScan ? { languages: {}, formats: {} } : await detectRepoLanguages(root); + const languages = detected.languages || {}; + const formats = detected.formats || {}; const languageList = languageOverride && languageOverride.length ? languageOverride : Object.keys(languages); - const tools = resolveToolsForLanguages(languageList, toolingConfig.dir, root).map((tool) => { + const languageMap = (languageOverride && languageOverride.length && skipScan) + ? 
languageOverride.reduce((acc, lang) => { + acc[lang] = { extensions: [], files: 0, override: true }; + return acc; + }, {}) + : languages; + const tools = resolveToolsForLanguages(languageList, toolingConfig.dir, root, toolingConfig).map((tool) => { const status = detectTool(tool); return { id: tool.id, @@ -339,7 +481,7 @@ export async function buildToolingReport(root, languageOverride = null) { return { root, toolingRoot: toolingConfig.dir, - languages, + languages: languageMap, formats, tools }; diff --git a/tools/triage/context-pack.js b/tools/triage/context-pack.js index 2db6bf598..3eb89efab 100644 --- a/tools/triage/context-pack.js +++ b/tools/triage/context-pack.js @@ -1,15 +1,20 @@ #!/usr/bin/env node import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import { spawnSync } from 'node:child_process'; -import { fileURLToPath } from 'node:url'; -import minimist from 'minimist'; -import { getRepoCacheRoot, getTriageConfig, loadUserConfig, resolveRepoRoot } from '../dict-utils.js'; - -const argv = minimist(process.argv.slice(2), { - boolean: ['stub-embeddings', 'ann'], - string: ['repo', 'record', 'out'] -}); +import { execaSync } from 'execa'; +import { createCli } from '../../src/shared/cli.js'; +import { getRepoCacheRoot, getRuntimeConfig, getTriageConfig, loadUserConfig, resolveRepoRoot, resolveRuntimeEnv, resolveToolRoot } from '../dict-utils.js'; + +const argv = createCli({ + scriptName: 'triage-context-pack', + options: { + 'stub-embeddings': { type: 'boolean', default: false }, + ann: { type: 'boolean' }, + repo: { type: 'string' }, + record: { type: 'string' }, + out: { type: 'string' } + } +}).parse(); const rawArgs = process.argv.slice(2); const annFlagPresent = rawArgs.includes('--ann') || rawArgs.includes('--no-ann'); @@ -21,6 +26,8 @@ if (!recordId) { } const userConfig = loadUserConfig(repoRoot); +const runtimeConfig = getRuntimeConfig(repoRoot, userConfig); +const baseEnv = resolveRuntimeEnv(runtimeConfig, process.env); const triageConfig = getTriageConfig(repoRoot, userConfig); const repoCacheRoot = getRepoCacheRoot(repoRoot, userConfig); const recordsDir = triageConfig.recordsDir; @@ -225,7 +232,7 @@ async function loadRecord(recordsDir, recordId) { } function runSearchJson({ repoRoot, query, mode, metaFilters, top }) { - const scriptRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..', '..'); + const scriptRoot = resolveToolRoot(); const searchPath = path.join(scriptRoot, 'search.js'); const args = [searchPath, query, '--mode', mode, '--json', '--top', String(top), '--repo', repoRoot]; if (Array.isArray(metaFilters)) { @@ -235,10 +242,10 @@ function runSearchJson({ repoRoot, query, mode, metaFilters, top }) { } if (annFlagPresent && argv.ann === true) args.push('--ann'); if (annFlagPresent && argv.ann === false) args.push('--no-ann'); - const env = { ...process.env }; + const env = { ...baseEnv }; if (argv['stub-embeddings']) env.PAIROFCLEATS_EMBEDDINGS = 'stub'; - const result = spawnSync(process.execPath, args, { cwd: repoRoot, env, encoding: 'utf8' }); - if (result.status !== 0) { + const result = execaSync(process.execPath, args, { cwd: repoRoot, env, encoding: 'utf8', reject: false }); + if (result.exitCode !== 0) { return { ok: false, error: result.stderr || result.stdout || 'search failed', payload: null }; } try { diff --git a/tools/triage/decision.js b/tools/triage/decision.js index f9caaaa92..9ac1c4768 100644 --- a/tools/triage/decision.js +++ b/tools/triage/decision.js @@ -1,16 +1,28 @@ #!/usr/bin/env node import 
fsPromises from 'node:fs/promises'; import path from 'node:path'; -import minimist from 'minimist'; +import { createCli } from '../../src/shared/cli.js'; import { getTriageConfig, loadUserConfig, resolveRepoRoot } from '../dict-utils.js'; -import { buildRecordId } from '../../src/triage/record-utils.js'; -import { applyRoutingMeta } from '../../src/triage/normalize/helpers.js'; -import { renderRecordMarkdown } from '../../src/triage/render.js'; +import { buildRecordId } from '../../src/integrations/triage/record-utils.js'; +import { applyRoutingMeta } from '../../src/integrations/triage/normalize/helpers.js'; +import { renderRecordMarkdown } from '../../src/integrations/triage/render.js'; -const argv = minimist(process.argv.slice(2), { - string: ['repo', 'finding', 'status', 'justification', 'reviewer', 'expires', 'meta', 'code', 'evidence'], - alias: { r: 'repo' } -}); +const argv = createCli({ + scriptName: 'triage-decision', + options: { + repo: { type: 'string' }, + finding: { type: 'string' }, + record: { type: 'string' }, + status: { type: 'string' }, + justification: { type: 'string' }, + reviewer: { type: 'string' }, + expires: { type: 'string' }, + meta: { type: 'string', array: true }, + code: { type: 'string', array: true }, + evidence: { type: 'string', array: true } + }, + aliases: { r: 'repo' } +}).parse(); const repoRoot = argv.repo ? path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); const findingId = argv.finding || argv.record; diff --git a/tools/triage/ingest.js b/tools/triage/ingest.js index f10a37ac7..bdc98f4ed 100644 --- a/tools/triage/ingest.js +++ b/tools/triage/ingest.js @@ -1,24 +1,31 @@ #!/usr/bin/env node import fsPromises from 'node:fs/promises'; import path from 'node:path'; -import { spawnSync } from 'node:child_process'; -import { fileURLToPath } from 'node:url'; -import minimist from 'minimist'; -import { getTriageConfig, loadUserConfig, resolveRepoRoot } from '../dict-utils.js'; -import { normalizeDependabot } from '../../src/triage/normalize/dependabot.js'; -import { normalizeAwsInspector } from '../../src/triage/normalize/aws-inspector.js'; -import { normalizeGeneric } from '../../src/triage/normalize/generic.js'; -import { renderRecordMarkdown } from '../../src/triage/render.js'; +import { execaSync } from 'execa'; +import { createCli } from '../../src/shared/cli.js'; +import { getRuntimeConfig, getTriageConfig, loadUserConfig, resolveRepoRoot, resolveRuntimeEnv, resolveToolRoot } from '../dict-utils.js'; +import { normalizeDependabot } from '../../src/integrations/triage/normalize/dependabot.js'; +import { normalizeAwsInspector } from '../../src/integrations/triage/normalize/aws-inspector.js'; +import { normalizeGeneric } from '../../src/integrations/triage/normalize/generic.js'; +import { renderRecordMarkdown } from '../../src/integrations/triage/render.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['build-index', 'incremental', 'stub-embeddings'], - string: ['repo', 'source', 'in', 'meta'], - alias: { i: 'in' } -}); +const argv = createCli({ + scriptName: 'triage-ingest', + options: { + 'build-index': { type: 'boolean', default: false }, + incremental: { type: 'boolean', default: false }, + 'stub-embeddings': { type: 'boolean', default: false }, + repo: { type: 'string' }, + source: { type: 'string' }, + in: { type: 'string' }, + meta: { type: 'string', array: true } + }, + aliases: { i: 'in' } +}).parse(); const repoRoot = argv.repo ? 
path.resolve(argv.repo) : resolveRepoRoot(process.cwd()); const source = normalizeSource(argv.source); -const inputPath = argv.in ? path.resolve(argv.in) : null; +const inputPath = argv.in ? path.resolve(repoRoot, argv.in) : null; if (!source || !inputPath) { console.error('usage: node tools/triage/ingest.js --source dependabot|aws_inspector|generic --in <file> [--repo <dir>] [--meta key=value] [--build-index]'); @@ -26,6 +33,8 @@ if (!source || !inputPath) { } const userConfig = loadUserConfig(repoRoot); +const runtimeConfig = getRuntimeConfig(repoRoot, userConfig); +const baseEnv = resolveRuntimeEnv(runtimeConfig, process.env); const triageConfig = getTriageConfig(repoRoot, userConfig); const meta = parseMeta(argv.meta); @@ -77,12 +86,14 @@ for (let index = 0; index < rawEntries.length; index += 1) { } if (argv['build-index']) { - const scriptRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..', '..'); + const scriptRoot = resolveToolRoot(); const args = [path.join(scriptRoot, 'build_index.js'), '--mode', 'records', '--repo', repoRoot]; if (argv.incremental) args.push('--incremental'); if (argv['stub-embeddings']) args.push('--stub-embeddings'); - const result = spawnSync(process.execPath, args, { cwd: repoRoot, stdio: 'inherit' }); - if (result.status !== 0) process.exit(result.status ?? 1); + const env = { ...baseEnv }; + if (argv['stub-embeddings']) env.PAIROFCLEATS_EMBEDDINGS = 'stub'; + const result = execaSync(process.execPath, args, { cwd: repoRoot, stdio: 'inherit', env, reject: false }); + if (result.exitCode !== 0) process.exit(result.exitCode ?? 1); } console.log(JSON.stringify(results, null, 2)); diff --git a/tools/uninstall.js b/tools/uninstall.js index 7db450c85..c01416882 100644 --- a/tools/uninstall.js +++ b/tools/uninstall.js @@ -3,23 +3,28 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import readline from 'node:readline/promises'; -import minimist from 'minimist'; +import { createCli } from '../src/shared/cli.js'; +import { getEnvConfig } from '../src/shared/env.js'; import { getCacheRoot, getDictConfig, getExtensionsDir, getModelsDir, loadUserConfig, resolveRepoRoot } from './dict-utils.js'; import { isInside, isRootPath } from './path-utils.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['yes', 'dry-run'], - string: ['repo'], - default: { yes: false, 'dry-run': false } -}); +const argv = createCli({ + scriptName: 'uninstall', + options: { + yes: { type: 'boolean', default: false }, + 'dry-run': { type: 'boolean', default: false }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ?
path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); const userConfig = loadUserConfig(root); const dictConfig = getDictConfig(root, userConfig); +const envConfig = getEnvConfig(); const defaultCacheRoot = getCacheRoot(); -const configuredCacheRoot = (userConfig.cache && userConfig.cache.root) || process.env.PAIROFCLEATS_CACHE_ROOT || defaultCacheRoot; -const envCacheRoot = process.env.PAIROFCLEATS_CACHE_ROOT || null; +const configuredCacheRoot = (userConfig.cache && userConfig.cache.root) || envConfig.cacheRoot || defaultCacheRoot; +const envCacheRoot = envConfig.cacheRoot || null; const modelsDir = getModelsDir(root, userConfig); const extensionsDir = getExtensionsDir(root, userConfig); diff --git a/tools/validate-config.js b/tools/validate-config.js index aaa877600..d85294e34 100644 --- a/tools/validate-config.js +++ b/tools/validate-config.js @@ -1,21 +1,24 @@ #!/usr/bin/env node import fs from 'node:fs'; import path from 'node:path'; -import minimist from 'minimist'; -import { fileURLToPath } from 'node:url'; -import { resolveRepoRoot } from './dict-utils.js'; +import { createCli } from '../src/shared/cli.js'; +import { readJsoncFile } from '../src/shared/jsonc.js'; +import { resolveRepoRoot, resolveToolRoot } from './dict-utils.js'; import { validateConfig } from '../src/config/validate.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['json'], - string: ['repo', 'config'], - default: { json: false } -}); +const argv = createCli({ + scriptName: 'config-validate', + options: { + json: { type: 'boolean', default: false }, + repo: { type: 'string' }, + config: { type: 'string' } + } +}).parse(); const repoArg = argv.repo ? path.resolve(argv.repo) : null; const repoRoot = repoArg || resolveRepoRoot(process.cwd()); const configPath = argv.config ? path.resolve(argv.config) : path.join(repoRoot, '.pairofcleats.json'); -const toolRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); +const toolRoot = resolveToolRoot(); const schemaPath = path.join(toolRoot, 'docs', 'config-schema.json'); if (!fs.existsSync(schemaPath)) { @@ -35,7 +38,7 @@ if (!fs.existsSync(configPath)) { let config; try { - config = JSON.parse(fs.readFileSync(configPath, 'utf8')); + config = readJsoncFile(configPath); } catch (err) { const message = `Failed to parse config: ${err?.message || err}`; if (argv.json) { @@ -58,6 +61,27 @@ if (!config || typeof config !== 'object' || Array.isArray(config)) { const schema = JSON.parse(fs.readFileSync(schemaPath, 'utf8')); const result = validateConfig(schema, config); +const profileErrors = []; +const profileName = typeof config.profile === 'string' ? 
config.profile.trim() : ''; +if (profileName) { + const profilePath = path.join(toolRoot, 'profiles', `${profileName}.json`); + if (!fs.existsSync(profilePath)) { + profileErrors.push(`Profile not found: ${profilePath}`); + } else { + try { + const profileRaw = JSON.parse(fs.readFileSync(profilePath, 'utf8')); + if (!profileRaw || typeof profileRaw !== 'object' || Array.isArray(profileRaw)) { + profileErrors.push(`Profile must be a JSON object: ${profilePath}`); + } + } catch (err) { + profileErrors.push(`Failed to parse profile ${profilePath}: ${err?.message || err}`); + } + } +} +if (profileErrors.length) { + result.ok = false; + result.errors = result.errors.concat(profileErrors); +} if (argv.json) { console.log(JSON.stringify({ ok: result.ok, found: true, configPath, errors: result.errors }, null, 2)); } else if (result.ok) { diff --git a/tools/vector-extension.js b/tools/vector-extension.js index 100732f72..48865f707 100644 --- a/tools/vector-extension.js +++ b/tools/vector-extension.js @@ -1,12 +1,17 @@ import fs from 'node:fs'; import path from 'node:path'; import { getExtensionsDir, loadUserConfig } from './dict-utils.js'; +import { getEnvConfig } from '../src/shared/env.js'; +import { incFallback } from '../src/shared/metrics.js'; const DEFAULT_PROVIDER = 'sqlite-vec'; const DEFAULT_MODULE = 'vec0'; const DEFAULT_TABLE = 'dense_vectors_ann'; const DEFAULT_COLUMN = 'embedding'; const DEFAULT_ENCODING = 'float32'; +const SQLITE_IN_LIMIT = 900; +const IDENTIFIER_RE = /^[A-Za-z_][A-Za-z0-9_]*$/; +const OPTION_RE = /^([A-Za-z_][A-Za-z0-9_]*)(?:\s*=\s*([A-Za-z0-9_.-]+))?$/; const PROVIDERS = { 'sqlite-vec': { @@ -17,6 +22,61 @@ const PROVIDERS = { } }; +const warningCache = new Set(); + +function warnOnce(key, message) { + if (warningCache.has(key)) return; + warningCache.add(key); + console.warn(message); +} + +function isSafeIdentifier(value) { + return IDENTIFIER_RE.test(String(value || '')); +} + +function normalizeOptionValue(value) { + return String(value || '').replace(/\\/g, '/').trim(); +} + +function parseVectorOptions(raw) { + if (!raw) return { ok: true, options: '' }; + const trimmed = normalizeOptionValue(raw); + if (!trimmed) return { ok: true, options: '' }; + const parts = trimmed.split(',').map((part) => part.trim()).filter(Boolean); + const normalized = []; + for (const part of parts) { + const match = OPTION_RE.exec(part); + if (!match) { + return { ok: false, reason: 'invalid vector extension options' }; + } + const key = match[1]; + const value = match[2]; + normalized.push(value ? `${key}=${value}` : key); + } + return { ok: true, options: normalized.join(', ') }; +} + +function sanitizeVectorExtensionConfig(config) { + const issues = []; + if (!isSafeIdentifier(config.module)) issues.push('module'); + if (!isSafeIdentifier(config.table)) issues.push('table'); + if (!isSafeIdentifier(config.column)) issues.push('column'); + const parsedOptions = parseVectorOptions(config.options); + if (!parsedOptions.ok) issues.push('options'); + + const sanitized = { + ...config, + options: parsedOptions.ok ? parsedOptions.options : '', + disabledReason: null + }; + if (sanitized.enabled && issues.length) { + sanitized.enabled = false; + sanitized.disabledReason = `invalid vector extension config (${issues.join(', ')})`; + warnOnce('vector-extension-invalid', `[sqlite] Vector extension disabled: ${sanitized.disabledReason}`); + } + return sanitized; +} + /** * Resolve a path relative to the repo root. 
* @param {string} repoRoot @@ -59,12 +119,13 @@ export function getPlatformKey(platform = process.platform, arch = process.arch) */ export function getVectorExtensionConfig(repoRoot, userConfig = null, overrides = {}) { const cfg = userConfig || loadUserConfig(repoRoot); + const envConfig = getEnvConfig(); const sqlite = cfg.sqlite || {}; const vectorCfg = sqlite.vectorExtension || {}; const provider = overrides.provider || vectorCfg.provider || DEFAULT_PROVIDER; const providerDefaults = PROVIDERS[provider] || {}; - const annModeRaw = overrides.annMode || vectorCfg.annMode || sqlite.annMode || 'js'; + const annModeRaw = overrides.annMode || vectorCfg.annMode || 'js'; const annMode = String(annModeRaw).toLowerCase(); const enabled = overrides.enabled === true || vectorCfg.enabled === true @@ -82,7 +143,7 @@ export function getVectorExtensionConfig(repoRoot, userConfig = null, overrides const dir = overrides.dir ? resolvePath(repoRoot, overrides.dir) : resolvePath(repoRoot, vectorCfg.dir) - || process.env.PAIROFCLEATS_EXTENSIONS_DIR + || envConfig.extensionsDir || getExtensionsDir(repoRoot, cfg); const filename = overrides.filename || vectorCfg.filename @@ -91,14 +152,14 @@ export function getVectorExtensionConfig(repoRoot, userConfig = null, overrides const pathOverride = overrides.path ? resolvePath(repoRoot, overrides.path) : resolvePath(repoRoot, vectorCfg.path) - || (process.env.PAIROFCLEATS_VECTOR_EXTENSION - ? resolvePath(repoRoot, process.env.PAIROFCLEATS_VECTOR_EXTENSION) + || (envConfig.vectorExtension + ? resolvePath(repoRoot, envConfig.vectorExtension) : null); const url = overrides.url || vectorCfg.url || providerDefaults.url || null; const downloads = overrides.downloads || vectorCfg.downloads || providerDefaults.downloads || null; - return { + return sanitizeVectorExtensionConfig({ annMode, enabled, provider, @@ -115,7 +176,7 @@ export function getVectorExtensionConfig(repoRoot, userConfig = null, overrides platform, arch, platformKey - }; + }); } /** @@ -141,7 +202,7 @@ const loadCache = new WeakMap(); */ export function loadVectorExtension(db, config, label = 'sqlite') { if (!db || !config?.enabled) { - return { ok: false, reason: 'disabled' }; + return { ok: false, reason: config?.disabledReason || 'disabled' }; } if (loadCache.has(db)) return loadCache.get(db); const extPath = resolveVectorExtensionPath(config); @@ -191,10 +252,19 @@ export function ensureVectorTable(db, config, dims) { if (!db || !config?.module || !config?.table) { return { ok: false, reason: 'missing config' }; } + if (!config.enabled) { + return { ok: false, reason: config.disabledReason || 'disabled' }; + } + if (!isSafeIdentifier(config.module) || !isSafeIdentifier(config.table)) { + return { ok: false, reason: 'invalid vector extension config' }; + } if (!Number.isFinite(dims) || dims <= 0) { return { ok: false, reason: 'invalid dims' }; } const column = config.column || DEFAULT_COLUMN; + if (!isSafeIdentifier(column)) { + return { ok: false, reason: 'invalid vector extension config' }; + } const options = config.options ? 
`, ${config.options}` : ''; try { try { @@ -238,28 +308,47 @@ export function encodeVector(vector, config) { * @returns {Array<{idx:number,sim:number}>} */ export function queryVectorAnn(db, config, embedding, topN, candidateSet) { - if (!db || !embedding) return []; + if (!db || !embedding || !config?.enabled) return []; const table = config?.table || DEFAULT_TABLE; const column = config?.column || DEFAULT_COLUMN; + if (!isSafeIdentifier(table) || !isSafeIdentifier(column)) { + warnOnce('vector-extension-unsafe', '[sqlite] Vector extension disabled: invalid identifiers'); + return []; + } const limit = Math.max(1, Number(topN) || 1); - const queryLimit = candidateSet && candidateSet.size ? limit * 5 : limit; + // Push the candidate filter into SQL only while the IN (...) list stays under SQLITE_IN_LIMIT, + // keeping the statement below SQLite's bound-parameter cap; larger sets oversample (limit * 5) + // and post-filter in JS below. + const candidateSize = candidateSet?.size || 0; + const canPushdown = candidateSize > 0 && candidateSize <= SQLITE_IN_LIMIT; + const candidates = canPushdown ? Array.from(candidateSet) : null; + const queryLimit = canPushdown ? limit : (candidateSize ? limit * 5 : limit); const encoded = encodeVector(embedding, config); if (!encoded) return []; try { + const candidateClause = canPushdown + ? ` AND rowid IN (${candidates.map(() => '?').join(',')})` + : ''; + const params = canPushdown + ? [encoded, ...candidates, queryLimit] + : [encoded, queryLimit]; + if (candidateSize && !canPushdown) { + warnOnce('vector-extension-candidates', '[sqlite] Vector extension candidate set too large; using best-effort fallback.'); + incFallback({ surface: 'search', reason: 'vector-candidates' }); + } const stmt = db.prepare( - `SELECT rowid, distance FROM ${table} WHERE ${column} MATCH ? ORDER BY distance LIMIT ?` + `SELECT rowid, distance FROM ${table} WHERE ${column} MATCH ?${candidateClause} ORDER BY distance LIMIT ?` ); - const rows = stmt.all(encoded, queryLimit); + const rows = stmt.all(...params); let hits = rows.map((row) => { const rowId = Number(row.rowid ?? row.id); const raw = row.distance ?? row.score ?? row.similarity ?? row.sim ?? 0; const sim = row.distance !== undefined ?
-raw : raw; return { idx: rowId, sim }; }); - if (candidateSet && candidateSet.size) { + if (candidateSet && candidateSet.size && !canPushdown) { hits = hits.filter((hit) => candidateSet.has(hit.idx)); } - return hits.slice(0, limit); + return hits + .sort((a, b) => (b.sim - a.sim) || (a.idx - b.idx)) + .slice(0, limit); } catch { return []; } diff --git a/tools/verify-extensions.js b/tools/verify-extensions.js index af45544a1..c29e6c710 100644 --- a/tools/verify-extensions.js +++ b/tools/verify-extensions.js @@ -1,14 +1,29 @@ #!/usr/bin/env node import fs from 'node:fs'; -import minimist from 'minimist'; +import path from 'node:path'; +import { createCli } from '../src/shared/cli.js'; import { loadUserConfig, resolveRepoRoot } from './dict-utils.js'; import { getVectorExtensionConfig, resolveVectorExtensionPath } from './vector-extension.js'; -const argv = minimist(process.argv.slice(2), { - boolean: ['json', 'load'], - string: ['provider', 'dir', 'path', 'platform', 'arch', 'module', 'table', 'column', 'encoding', 'options', 'ann-mode', 'repo'], - default: { json: false, load: true } -}); +const argv = createCli({ + scriptName: 'verify-extensions', + options: { + json: { type: 'boolean', default: false }, + load: { type: 'boolean', default: true }, + provider: { type: 'string' }, + dir: { type: 'string' }, + path: { type: 'string' }, + platform: { type: 'string' }, + arch: { type: 'string' }, + module: { type: 'string' }, + table: { type: 'string' }, + column: { type: 'string' }, + encoding: { type: 'string' }, + options: { type: 'string' }, + 'ann-mode': { type: 'string' }, + repo: { type: 'string' } + } +}).parse(); const rootArg = argv.repo ? path.resolve(argv.repo) : null; const root = rootArg || resolveRepoRoot(process.cwd()); diff --git a/tools/workers/bundle-reader.js b/tools/workers/bundle-reader.js new file mode 100644 index 000000000..f529b13f5 --- /dev/null +++ b/tools/workers/bundle-reader.js @@ -0,0 +1,12 @@ +import { readBundleFile } from '../../src/shared/bundle-io.js'; + +export default async function readBundle({ bundlePath }) { + if (!bundlePath) return { ok: false, reason: 'missing bundle path' }; + try { + const result = await readBundleFile(bundlePath); + if (!result.ok) return { ok: false, reason: result.reason || 'invalid bundle' }; + return { ok: true, bundle: { chunks: result.bundle.chunks } }; + } catch (err) { + return { ok: false, reason: err?.message || String(err) }; + } +}
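
For reviewers: a minimal sketch of driving the new tools/workers/bundle-reader.js handler directly, outside the worker pool. The fixture path and the standalone-script framing are assumptions for illustration; the `{ bundlePath }` payload and the `{ ok, bundle }` / `{ ok: false, reason }` result envelope come from the module above.

// sketch.mjs — run with `node sketch.mjs` from the repo root (ES module, top-level await).
import readBundle from './tools/workers/bundle-reader.js';

// Hypothetical fixture path; any bundle written by the indexer should work here.
const result = await readBundle({ bundlePath: './fixtures/sample-bundle.json' });
if (result.ok) {
  console.log(`bundle contains ${result.bundle.chunks.length} chunks`);
} else {
  console.error(`bundle read failed: ${result.reason}`);
}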