diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index d51d20bdf..c48a56b11 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -16,8 +16,8 @@ From the repo root: ```bash cd /path/to/nv-ingest -uv venv .retriever -source .retriever/bin/activate +uv venv .nr +source .nr/bin/activate uv pip install -e ./nemo_retriever ``` @@ -52,7 +52,7 @@ uv run python nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py /path Pass the directory that contains your PDFs as the first argument (`input-dir`). For recall evaluation, the pipeline uses `bo767_query_gt.csv` in the current directory by default; override with `--query-csv `. For document-level recall, use `--recall-match-mode pdf_only` with `query,expected_pdf` data. Recall is skipped if the query file does not exist. By default, per-query details (query, gold, hits) are printed; use `--no-recall-details` to print only the missed-gold summary and recall metrics. To use an existing Ray cluster, pass `--ray-address auto`. If OCR fails with a missing `libcudart.so.13`, install the CUDA 13 runtime and set `LD_LIBRARY_PATH` as shown above. -For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt` with the same examples (e.g. `batch_pipeline.py --input-type html`). HTML files are converted to markdown via markitdown, then chunked with the same tokenizer as .txt. Staged CLI: `retriever html run --input-dir ` writes `*.html_extraction.json`; then `retriever local stage5 run --input-dir --pattern "*.html_extraction.json"` and `retriever local stage6 run --input-dir `. +For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt` with the same examples (e.g. `batch_pipeline.py --input-type html`). HTML files are converted to markdown via markitdown, then chunked with the same tokenizer as .txt. Staged CLI: `nr html run --input-dir ` writes `*.html_extraction.json`; then `nr local stage5 run --input-dir --pattern "*.html_extraction.json"` and `nr local stage6 run --input-dir `. ## Harness (run, sweep, nightly) @@ -61,7 +61,7 @@ For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt - Config files: - `nemo_retriever/harness/test_configs.yaml` - `nemo_retriever/harness/nightly_config.yaml` -- CLI entrypoint is nested under `retriever harness`. +- CLI entrypoint is nested under `nr harness`. - First pass is LanceDB-only and enforces recall-required pass/fail by default. - Single-run artifact directories default to `_`. - Dataset-specific recall adapters are supported via config: @@ -77,37 +77,37 @@ For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt ```bash # Dataset preset from test_configs.yaml (recall-required example) -retriever harness run --dataset jp20 --preset single_gpu +nr harness run --dataset jp20 --preset single_gpu # Direct dataset path -retriever harness run --dataset /datasets/nv-ingest/bo767 --preset single_gpu +nr harness run --dataset /datasets/nv-ingest/bo767 --preset single_gpu # Add repeatable run or session tags for later review -retriever harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate +nr harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate ``` ### Sweep runs (explicit runs list) ```bash -retriever harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml ``` ### Nightly session ```bash -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml -retriever harness nightly --dry-run -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness nightly --dry-run +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly ``` ### Session inspection ```bash # Print a compact table from a completed sweep/nightly session -retriever harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC +nr harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC # Compare two session summaries by run name -retriever harness compare \ +nr harness compare \ nemo_retriever/artifacts/nightly_20260305_010203_UTC \ nemo_retriever/artifacts/nightly_20260306_010204_UTC ``` diff --git a/nemo_retriever/chart_stage_config.yaml b/nemo_retriever/chart_stage_config.yaml index fada15b68..00abf3c51 100644 --- a/nemo_retriever/chart_stage_config.yaml +++ b/nemo_retriever/chart_stage_config.yaml @@ -1,8 +1,8 @@ # Example config for chart extraction. # # Intended usage (once the chart stage CLI is wired up similarly to table stage): -# - `retriever chart stage run --config --input ` -# - `retriever local stage4 run --config --input ` +# - `nr chart stage run --config --input ` +# - `nr local stage4 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_chart_schema.ChartExtractorSchema` # via `nemo_retriever.chart.config.load_chart_extractor_schema_from_dict`. diff --git a/nemo_retriever/embedding_stage_config.yaml b/nemo_retriever/embedding_stage_config.yaml index 46c45d170..df9de3567 100644 --- a/nemo_retriever/embedding_stage_config.yaml +++ b/nemo_retriever/embedding_stage_config.yaml @@ -11,7 +11,7 @@ api_key: "" # e.g. $NGC_API_KEY or $NVIDIA_API_KEY # Embedding service settings -# If set to null/empty, `retriever local stage5` will fall back to local HF embeddings +# If set to null/empty, `nr local stage5` will fall back to local HF embeddings # via `nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder`. embedding_nim_endpoint: null # embedding_nim_endpoint: "http://localhost:8012/v1" diff --git a/nemo_retriever/harness/HANDOFF.md b/nemo_retriever/harness/HANDOFF.md index e724307b1..ac992ad79 100644 --- a/nemo_retriever/harness/HANDOFF.md +++ b/nemo_retriever/harness/HANDOFF.md @@ -49,36 +49,36 @@ From repo root: ```bash source ~/setup_env.sh -source .retriever/bin/activate +source .nr/bin/activate uv pip install -e ./nemo_retriever ``` Single run: ```bash -retriever harness run --dataset jp20 --preset single_gpu -retriever harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate +nr harness run --dataset jp20 --preset single_gpu +nr harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate ``` Sweep: ```bash -retriever harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml ``` Nightly: ```bash -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml -retriever harness nightly --dry-run -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness nightly --dry-run +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly ``` Session inspection: ```bash -retriever harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC -retriever harness compare \ +nr harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC +nr harness compare \ nemo_retriever/artifacts/nightly_20260305_010203_UTC \ nemo_retriever/artifacts/nightly_20260306_010204_UTC ``` @@ -148,9 +148,9 @@ Notes: - `financebench` now defaults to `data/financebench_train.json` with recall enabled. - Session UX improvements: - Runs, sweeps, and nightly sessions accept repeatable `--tag` values persisted into artifacts. - - `retriever harness summary` prints a compact table from `session_summary.json`. + - `nr harness summary` prints a compact table from `session_summary.json`. - Comparison utility: - - `retriever harness compare` prints pages/sec and recall deltas by run name for two sessions. + - `nr harness compare` prints pages/sec and recall deltas by run name for two sessions. ## Current Validation Status diff --git a/nemo_retriever/infographic_stage_config.yaml b/nemo_retriever/infographic_stage_config.yaml index 67ac68546..d945aa586 100644 --- a/nemo_retriever/infographic_stage_config.yaml +++ b/nemo_retriever/infographic_stage_config.yaml @@ -1,6 +1,6 @@ # Example config for: -# - `retriever infographic stage run --config --input ` -# - `retriever local stage2 run --config --input ` +# - `nr infographic stage run --config --input ` +# - `nr local stage2 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_infographic_schema.InfographicExtractorSchema` # via `nemo_retriever.infographic.config.load_infographic_extractor_schema_from_dict`. diff --git a/nemo_retriever/pdf_stage_config.yaml b/nemo_retriever/pdf_stage_config.yaml index 33126ba61..7e9a7352e 100644 --- a/nemo_retriever/pdf_stage_config.yaml +++ b/nemo_retriever/pdf_stage_config.yaml @@ -1,11 +1,11 @@ -# Example config for: `retriever pdf stage page-elements --config ` +# Example config for: `nr pdf stage page-elements --config ` # # CLI override rule: # - If you pass an option explicitly on the CLI, it wins. # - Otherwise the value from this YAML file is used. # # You can run repeatedly: -# retriever pdf stage page-elements --config nemo_retriever/pdf_stage_config.yaml +# nr pdf stage page-elements --config nemo_retriever/pdf_stage_config.yaml # # Directory containing PDFs (scanned recursively for *.pdf) diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index b9d1ac476..24c400f43 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -69,6 +69,7 @@ dependencies = [ "soundfile>=0.12.0", "scipy>=1.11.0", "nvidia-ml-py", + "pytest" ] [project.optional-dependencies] @@ -78,7 +79,7 @@ dev = [ ] [project.scripts] -retriever = "nemo_retriever.__main__:main" +nr = "nemo_retriever.__main__:main" [tool.setuptools.dynamic] version = {attr = "nemo_retriever.version.get_build_version"} diff --git a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py index 32d8012de..e533892e8 100644 --- a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py +++ b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py @@ -7,12 +7,14 @@ import typer from nemo_retriever.audio import app as audio_app +from nemo_retriever.examples.batch_pipeline import app as batch_app from nemo_retriever.utils.benchmark import app as benchmark_app from nemo_retriever.chart import app as chart_app from nemo_retriever.utils.compare import app as compare_app from nemo_retriever.harness import app as harness_app from nemo_retriever.html import __main__ as html_main from nemo_retriever.utils.image import app as image_app +from nemo_retriever.examples.inprocess_pipeline import app as inprocess_app from nemo_retriever.local import app as local_app from nemo_retriever.online import __main__ as online_main from nemo_retriever.pdf import app as pdf_app @@ -21,7 +23,13 @@ from nemo_retriever.vector_store import app as vector_store_app from nemo_retriever.version import get_version_info -app = typer.Typer(help="Retriever") +app = typer.Typer(help="NeMo Retriever – RAG ingestion pipeline CLI.") + +ingest_app = typer.Typer(help="Run ingestion pipelines (batch or in-process).") +ingest_app.add_typer(batch_app, name="batch") +ingest_app.add_typer(inprocess_app, name="inprocess") +app.add_typer(ingest_app, name="ingest") + app.add_typer(audio_app, name="audio") app.add_typer(image_app, name="image") app.add_typer(pdf_app, name="pdf") @@ -54,7 +62,7 @@ def _callback( version: bool = typer.Option( False, "--version", - help="Show retriever version metadata and exit.", + help="Show nr version metadata and exit.", callback=_version_callback, is_eager=True, ) diff --git a/nemo_retriever/src/nemo_retriever/audio/cli.py b/nemo_retriever/src/nemo_retriever/audio/cli.py index 1dbb6ee6d..af91777e8 100644 --- a/nemo_retriever/src/nemo_retriever/audio/cli.py +++ b/nemo_retriever/src/nemo_retriever/audio/cli.py @@ -8,7 +8,7 @@ This module intentionally contains **no configuration logic**. It simply re-exports the `nemo_retriever.audio.stage` Typer application so any arguments provided to: - `retriever audio ...` + `nr audio ...` are handled exactly the same as the stage commands (e.g. `extract`, `discover`). """ diff --git a/nemo_retriever/src/nemo_retriever/audio/stage.py b/nemo_retriever/src/nemo_retriever/audio/stage.py index 000ae99c9..7284ff1cb 100644 --- a/nemo_retriever/src/nemo_retriever/audio/stage.py +++ b/nemo_retriever/src/nemo_retriever/audio/stage.py @@ -5,8 +5,8 @@ """ Audio extraction stage: chunk + ASR only, write *.audio_extraction.json sidecars. -Invoked as `retriever audio extract` / `retriever audio discover` (or -`python -m nemo_retriever.audio extract` / `discover`). Analogous to `retriever pdf stage page-elements`. +Invoked as `nr audio extract` / `nr audio discover` (or +`python -m nemo_retriever.audio extract` / `discover`). Analogous to `nr pdf stage page-elements`. """ from __future__ import annotations diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index d9f6915ff..2392f964a 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -4,7 +4,7 @@ """ Batch ingestion pipeline with optional recall evaluation. -Run with: uv run python -m nemo_retriever.examples.batch_pipeline +Run with: nr batch """ import json @@ -345,7 +345,7 @@ def _hit_key_and_distance(hit: dict) -> tuple[str | None, float | None]: return key, dist -@app.command() +@app.command(name="pipeline") def main( ctx: typer.Context, debug: bool = typer.Option( diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index ea6295c2b..930505da8 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -4,7 +4,7 @@ """ In-process ingestion pipeline (no Ray) with optional recall evaluation. -Run with: uv run python -m nemo_retriever.examples.inprocess_pipeline +Run with: nr ingest inprocess """ import json @@ -96,7 +96,7 @@ def _hit_key_and_distance(hit: dict) -> tuple[str | None, float | None]: return key, dist -@app.command() +@app.command(name="pipeline") def main( input_path: Path = typer.Argument( ..., diff --git a/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py index 9808e29d6..d23a82225 100644 --- a/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py @@ -7,7 +7,7 @@ - Inprocess: runs the full pipeline locally (no server). - Online: submits each document to the online ingest REST service (start with - `retriever online serve`). Uses the same LanceDB for recall evaluation. + `nr online serve`). Uses the same LanceDB for recall evaluation. Run with: uv run python -m nemo_retriever.examples.online_pipeline diff --git a/nemo_retriever/src/nemo_retriever/html/__main__.py b/nemo_retriever/src/nemo_retriever/html/__main__.py index 28d4e9a37..748f5f361 100644 --- a/nemo_retriever/src/nemo_retriever/html/__main__.py +++ b/nemo_retriever/src/nemo_retriever/html/__main__.py @@ -5,7 +5,7 @@ """ CLI for .html extraction: markitdown -> markdown -> tokenizer split, write *.html_extraction.json. -Use with: retriever local stage5 run --pattern "*.html_extraction.json" then stage6. +Use with: nr local stage5 run --pattern "*.html_extraction.json" then stage6. """ from __future__ import annotations @@ -63,8 +63,8 @@ def run( Scan input_dir for *.html, convert to markdown and chunk each, write .html_extraction.json. Output JSON has the same primitives-like shape as stage5 input (text, path, page_number, metadata). - Then run: retriever local stage5 run --input-dir --pattern "*.html_extraction.json" - and retriever local stage6 run --input-dir . + Then run: nr local stage5 run --input-dir --pattern "*.html_extraction.json" + and nr local stage6 run --input-dir . """ input_dir = Path(input_dir) html_files = sorted(input_dir.glob("*.html")) diff --git a/nemo_retriever/src/nemo_retriever/ingest-config.yaml b/nemo_retriever/src/nemo_retriever/ingest-config.yaml index 59fb95303..b35c1b573 100644 --- a/nemo_retriever/src/nemo_retriever/ingest-config.yaml +++ b/nemo_retriever/src/nemo_retriever/ingest-config.yaml @@ -1,4 +1,4 @@ -# nv-ingest retriever consolidated configuration +# nv-ingest nr consolidated configuration # # This single file replaces the older per-stage YAML configs: # - pdf_stage_config.yaml @@ -17,7 +17,7 @@ # - Sections below are consumed by their respective stages. pdf: - # Example config for: `retriever pdf stage page-elements --config ` + # Example config for: `nr pdf stage page-elements --config ` # # Directory containing PDFs (scanned recursively for *.pdf) input_dir: /home/local/jdyer/datasets/jp20 @@ -59,14 +59,14 @@ pdf: # Optionally limit number of PDFs processed limit: null -# Optional config for `retriever txt run` and .extract_txt() API +# Optional config for `nr txt run` and .extract_txt() API txt: max_tokens: 512 overlap_tokens: 0 tokenizer_model_id: nvidia/llama-3.2-nv-embedqa-1b-v2 encoding: utf-8 -# Optional config for `retriever html run` and .extract_html() API +# Optional config for `nr html run` and .extract_html() API html: max_tokens: 512 overlap_tokens: 0 @@ -94,8 +94,8 @@ audio_asr: table: # Example config for: - # - `retriever table stage run --config --input ` - # - `retriever local stage3 run --config --input ` + # - `nr table stage run --config --input ` + # - `nr local stage3 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_table_schema.TableExtractorSchema` # via `nemo_retriever.table.config.load_table_extractor_schema_from_dict`. @@ -133,8 +133,8 @@ table: chart: # Example config for: - # - `retriever chart stage run --config --input ` - # - `retriever local stage4 run --config --input ` + # - `nr chart stage run --config --input ` + # - `nr local stage4 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_chart_schema.ChartExtractorSchema` # via `nemo_retriever.chart.config.load_chart_extractor_schema_from_dict`. @@ -185,8 +185,8 @@ chart: infographic: # Example config for: - # - `retriever infographic stage run --config --input ` - # - `retriever local stage2 run --config --input ` + # - `nr infographic stage run --config --input ` + # - `nr local stage2 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_infographic_schema.InfographicExtractorSchema` # via `nemo_retriever.infographic.config.load_infographic_extractor_schema_from_dict`. @@ -231,7 +231,7 @@ embedding: api_key: "" # e.g. $NGC_API_KEY or $NVIDIA_API_KEY # Embedding service settings - # If set to null/empty, `retriever local stage5` will fall back to local HF embeddings + # If set to null/empty, `nr local stage5` will fall back to local HF embeddings # via `nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder`. embedding_nim_endpoint: null # embedding_nim_endpoint: "http://localhost:8012/v1" diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py b/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py index 06bc08a4a..863bea08c 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py @@ -10,11 +10,11 @@ This module intentionally contains **no configuration logic**. It simply re-exports the `nemo_retriever.pdf.stage` Typer application so any arguments provided to: - `retriever local stage1 ...` + `nr local stage1 ...` are handled exactly the same as: - `retriever pdf ...` + `nr pdf ...` """ from nemo_retriever.pdf.stage import app as app diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py b/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py index 3ea038f3d..5cdae2f2a 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py @@ -10,7 +10,7 @@ This module intentionally contains no configuration logic. It re-exports the `nemo_retriever.text_embed.stage` Typer application so arguments provided to: - `retriever local stage5 ...` + `nr local stage5 ...` are handled by `nemo_retriever.text_embed.stage`, including local-HF fallback options when an embedding endpoint is not configured. diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py b/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py index 17c18b45f..a61808957 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py @@ -10,7 +10,7 @@ This module intentionally contains no configuration logic. It re-exports the `nemo_retriever.vector_store.stage` Typer application so arguments provided to: - `retriever local stage6 ...` + `nr local stage6 ...` are handled by `nemo_retriever.vector_store.stage`. """ diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py b/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py index 451befd0d..e11be2c5d 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py @@ -10,7 +10,7 @@ This module intentionally contains no configuration logic. It re-exports the `nemo_retriever.recall.vdb_recall` Typer application so arguments provided to: - `retriever local stage7 ...` + `nr local stage7 ...` are handled by `nemo_retriever.recall.vdb_recall`, including a local-HF fallback path when embedding HTTP/gRPC endpoints are not configured. diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py b/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py index e4f6ba1ae..951d9dfe5 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py @@ -659,7 +659,7 @@ def _arc_for(p: Path) -> str: # Include a small readme to guide recipients. readme = ( - "retriever stage999 gathered results\n" + "nr stage999 gathered results\n" "\n" "Contents:\n" "- bo767_query_gt.csv: original query->pdf_page mapping\n" @@ -1697,7 +1697,7 @@ def _short(s: str, n: int = 90) -> str: summary_cache: Dict[int, Dict[str, Any]] = {} root = tk.Tk() - root.title("retriever stage999 post-mortem analysis") + root.title("nr stage999 post-mortem analysis") # Layout: left search+list, right detail panel. root.columnconfigure(0, weight=0) diff --git a/nemo_retriever/src/nemo_retriever/txt/__main__.py b/nemo_retriever/src/nemo_retriever/txt/__main__.py index b769c774e..fb60cb962 100644 --- a/nemo_retriever/src/nemo_retriever/txt/__main__.py +++ b/nemo_retriever/src/nemo_retriever/txt/__main__.py @@ -5,7 +5,7 @@ """ CLI for .txt extraction: tokenizer-based split, write *.txt_extraction.json. -Use with: retriever local stage5 run --pattern "*.txt_extraction.json" then stage6. +Use with: nr local stage5 run --pattern "*.txt_extraction.json" then stage6. """ from __future__ import annotations @@ -63,8 +63,8 @@ def run( Scan input_dir for *.txt, tokenizer-split each into chunks, write .txt_extraction.json. Output JSON has the same primitives-like shape as stage5 input (text, path, page_number, metadata). - Then run: retriever local stage5 run --input-dir --pattern "*.txt_extraction.json" - and retriever local stage6 run --input-dir . + Then run: nr local stage5 run --input-dir --pattern "*.txt_extraction.json" + and nr local stage6 run --input-dir . """ input_dir = Path(input_dir) txt_files = sorted(input_dir.glob("*.txt")) diff --git a/nemo_retriever/src/nemo_retriever/vector_store/stage.py b/nemo_retriever/src/nemo_retriever/vector_store/stage.py index ba3f994a1..94c535272 100644 --- a/nemo_retriever/src/nemo_retriever/vector_store/stage.py +++ b/nemo_retriever/src/nemo_retriever/vector_store/stage.py @@ -24,7 +24,7 @@ def run( exists=True, file_okay=False, dir_okay=True, - help="Directory containing `*.text_embeddings.json` files (from `retriever local stage5`).", + help="Directory containing `*.text_embeddings.json` files (from `nr local stage5`).", ), recursive: bool = typer.Option(False, "--recursive/--no-recursive", help="Scan subdirectories too."), limit: Optional[int] = typer.Option(None, "--limit", min=1, help="Optionally limit number of input files."), diff --git a/nemo_retriever/table_stage_config.yaml b/nemo_retriever/table_stage_config.yaml index e268c8ba7..8a7b80152 100644 --- a/nemo_retriever/table_stage_config.yaml +++ b/nemo_retriever/table_stage_config.yaml @@ -1,6 +1,6 @@ # Example config for: -# - `retriever table stage run --config --input ` -# - `retriever local stage3 run --config --input ` +# - `nr table stage run --config --input ` +# - `nr local stage3 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_table_schema.TableExtractorSchema` # via `nemo_retriever.table.config.load_table_extractor_schema_from_dict`. diff --git a/nemo_retriever/tests/test_audio_stage.py b/nemo_retriever/tests/test_audio_stage.py index 33124adbf..b23209513 100644 --- a/nemo_retriever/tests/test_audio_stage.py +++ b/nemo_retriever/tests/test_audio_stage.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 """ -Tests for the audio extraction-only stage (retriever audio extract). +Tests for the audio extraction-only stage (nr audio extract). """ from pathlib import Path diff --git a/nemo_retriever/tests/test_nr_cli.py b/nemo_retriever/tests/test_nr_cli.py new file mode 100644 index 000000000..24d37ec3c --- /dev/null +++ b/nemo_retriever/tests/test_nr_cli.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for the ``nr`` CLI entrypoint, the ``nr ingest`` command group, +and the ``nr ingest inprocess`` / ``nr ingest batch`` subcommands. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +try: + from nemo_retriever.adapters.cli.main import app +except ImportError as _exc: + pytest.skip(f"CLI dependencies not available: {_exc}", allow_module_level=True) + +from typer.testing import CliRunner + +RUNNER = CliRunner() + + +# --------------------------------------------------------------------------- +# pyproject.toml entrypoint +# --------------------------------------------------------------------------- + + +def test_pyproject_entrypoint_is_nr() -> None: + """The installed script name must be ``nr``, not ``retriever``.""" + pyproject = Path(__file__).resolve().parents[1] / "pyproject.toml" + text = pyproject.read_text() + assert "\nnr = " in text + assert "\nretriever = " not in text + + +# --------------------------------------------------------------------------- +# Top-level app +# --------------------------------------------------------------------------- + + +def test_top_level_help_succeeds() -> None: + result = RUNNER.invoke(app, ["--help"]) + assert result.exit_code == 0 + + +def test_top_level_help_lists_ingest() -> None: + result = RUNNER.invoke(app, ["--help"]) + assert "ingest" in result.output + + +def test_version_flag() -> None: + result = RUNNER.invoke(app, ["--version"]) + assert result.exit_code == 0 + assert result.output.strip() + + +def test_existing_subcommands_still_registered() -> None: + """All pre-existing subcommands must still appear in top-level help.""" + result = RUNNER.invoke(app, ["--help"]) + for name in ("audio", "pdf", "local", "harness", "recall", "benchmark"): + assert name in result.output, f"Expected subcommand {name!r} in top-level help" + + +# --------------------------------------------------------------------------- +# ``nr ingest`` command group +# --------------------------------------------------------------------------- + + +def test_ingest_help_succeeds() -> None: + result = RUNNER.invoke(app, ["ingest", "--help"]) + assert result.exit_code == 0 + + +def test_ingest_help_lists_batch_and_inprocess() -> None: + result = RUNNER.invoke(app, ["ingest", "--help"]) + assert "batch" in result.output + assert "inprocess" in result.output + + +# --------------------------------------------------------------------------- +# ``nr ingest batch`` subcommand +# --------------------------------------------------------------------------- + + +def test_ingest_batch_help_succeeds() -> None: + result = RUNNER.invoke(app, ["ingest", "batch", "pipeline", "--help"]) + assert result.exit_code == 0 + + +def test_ingest_batch_help_shows_key_options() -> None: + result = RUNNER.invoke(app, ["ingest", "batch", "pipeline", "--help"]) + for flag in ("--embed-model-name", "--lancedb-uri", "--input-type", "--debug"): + assert flag in result.output, f"Expected {flag!r} in batch pipeline --help output" + + +# --------------------------------------------------------------------------- +# ``nr ingest inprocess`` subcommand – help / option presence +# --------------------------------------------------------------------------- + + +def test_ingest_inprocess_help_succeeds() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) + assert result.exit_code == 0 + + +def test_ingest_inprocess_help_shows_extract_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) + for flag in ( + "--use-table-structure", + "--table-output-format", + "--table-structure-invoke-url", + "--page-elements-invoke-url", + "--ocr-invoke-url", + ): + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" + + +def test_ingest_inprocess_help_shows_embed_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) + for flag in ( + "--embed-model-name", + "--embed-invoke-url", + "--embed-modality", + "--embed-granularity", + "--text-elements-modality", + "--structured-elements-modality", + ): + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" + + +def test_ingest_inprocess_help_shows_execution_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) + for flag in ("--max-workers", "--gpu-devices", "--num-gpus"): + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" + + +def test_ingest_inprocess_help_shows_recall_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) + for flag in ("--query-csv", "--no-recall-details"): + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" + + +# --------------------------------------------------------------------------- +# ``nr ingest inprocess pipeline`` – argument validation +# --------------------------------------------------------------------------- + + +def test_ingest_inprocess_rejects_missing_path() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline"]) + assert result.exit_code != 0