diff --git a/.gitignore b/.gitignore
index c02a673aa93..ca2e6009681 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,7 @@ cpp/include/tensorrt_llm/executor/version.h
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
 .devcontainer/.env
+cpp/dependency_scan/scan_output/
 # User config files
 CMakeUserPresets.json
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e013dbc17e2..86cfa00dba8 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -328,6 +328,13 @@ endif()

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss ")

+# Generate dependency files (.d) to track all header dependencies. This creates
+# .d files alongside .o files listing all headers used.
+if(NOT WIN32)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -MD -MP")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -MD -MP")
+endif()
+
 # note: cmake expr generation $ is a build time
 # evaluation so hard to debug at cmake time
 if(ENABLE_MULTI_DEVICE)
diff --git a/cpp/dependency_scan/README.md b/cpp/dependency_scan/README.md
new file mode 100644
index 00000000000..095236579d3
--- /dev/null
+++ b/cpp/dependency_scan/README.md
@@ -0,0 +1,345 @@
+# CPP Dependency Scanner
+
+Scans TensorRT-LLM build artifacts (headers, libraries, binaries) and maps them to source dependencies.
+A build artifact is any header file used in the build, and any linked static/dynamic library.
+
+## Quick Start
+
+```bash
+# Run scanner (scans ../build by default)
+python scan_build_artifacts.py
+
+# Output:
+# scan_output/known.yml - Mapped artifacts
+# scan_output/unknown.yml - Unmapped artifacts
+# scan_output/path_issues.yml - Non-existent paths
+```
+
+## Goals and Non-Goals
+
+### Goals
+
+This scanner is designed to:
+
+1. **Map Build Artifacts to Dependencies**
+   - Identify which source dependencies (container-origin, fetched, third-party) are used in the TensorRT-LLM C++ build
+   - Use tooling plus developer-provided pattern data to map build artifacts to canonical packages
+
+2. **Achieve Complete Coverage**
+   - Goal: 100% of build artifacts mapped to known dependencies
+   - Track unmapped artifacts in `unknown.yml` for iterative pattern refinement
+
+3. **Enable Iterative Development**
+   - Provide actionable output (`unknown.yml`) to guide pattern additions
+   - Support YAML-based pattern definitions for easy maintenance
+   - Validate patterns with schema checking
+
+### Non-Goals
+
+This scanner is **NOT** designed to:
+
+1. Identify source-integrated dependencies, i.e. third-party code copied directly into the TensorRT-LLM codebase.
+2. Identify pip-installed Python runtime dependencies.
+3. Be a one-size-fits-all solution that catches all dependencies.
+4. Enrich each dependency with license information or generate attributions.
+5. Track transitive dependencies that are invisible to CMake.
+6. Provide Windows support.
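+
+For context on where the scanned artifacts come from: the `-MD -MP` flags added to `cpp/CMakeLists.txt` in this change make the compiler emit a Make-format `.d` file next to each `.o`, listing every header that translation unit included (`-MP` additionally emits an empty phony rule per header so deleted headers do not break incremental builds). A purely illustrative depfile of the kind the scanner parses (hypothetical paths):
+
+```
+foo.cpp.o: ../../../tensorrt_llm/foo.cpp \
+  /usr/include/stdio.h \
+  /usr/local/cuda-12.9/include/cuda_runtime.h
+
+/usr/include/stdio.h:
+```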
+ +## Usage + +### Basic Usage + +```bash +# Scan with default settings +python scan_build_artifacts.py + +# Scan custom build directory +python scan_build_artifacts.py --build-dir /path/to/build + +# Scan with custom output directory +python scan_build_artifacts.py --output-dir /path/to/output + +# Validate YAML files +python scan_build_artifacts.py --validate +``` + +### Command-Line Arguments + +- `--build-dir`: Build directory to scan (default: `../build/`) +- `--output-dir`: Output directory for reports (default: `scan_output/`) +- `--metadata-dir`: Metadata directory containing YAML files (default: `./metadata/`) +- `--validate`: Validate YAML files without running scanner + +## Resolution Strategy + +1. **dpkg-query**: System packages via Debian package manager +2. **YAML patterns**: Non-dpkg packages (TensorRT, PyTorch, 3rdparty/ submodules, etc.) + +## Output Format + +### known.yml + +```yaml +summary: + total_artifacts: 6200 + mapped: 6200 + unmapped: 0 + coverage: 100.0% + unique_dependencies: 48 + +dependencies: + cuda-cudart: + - /usr/local/cuda-12.9/include/cuda_runtime.h + - /usr/local/cuda-12.9/include/cuda.h + + libc6: + - /usr/include/stdio.h + - -lpthread + - -ldl + + pytorch: + - /usr/local/lib/python3.12/dist-packages/torch/include/torch/torch.h + - -ltorch +``` + +### unknown.yml + +```yaml +summary: + count: 0 + action_required: Add patterns to YAML files in metadata/ for these artifacts +artifacts: [] +``` + +### path_issues.yml + +Reports artifacts whose resolved paths don't exist in the filesystem. This helps identify: +- Stale build artifacts from old builds +- Incorrectly resolved paths +- Optional headers that may not be present +- Temporary build files that were deleted + +**Note:** Library artifacts are excluded from this report since they don't have meaningful path resolution metadata. + +```yaml +summary: + count: 1042 + total_artifacts: 12238 + percentage: 8.5% + note: These header paths were resolved from .d files but do not exist in the filesystem (libraries excluded) + +non_existent_paths: +- resolved_path: /usr/local/lib/python3.12/dist-packages/torch/include/ATen/ops/_cudnn_attention_backward.h + type: header + source: /home/.../trtGptModelInflightBatching.cpp.o.d + d_file_path: /usr/local/lib/python3.12/dist-packages/torch/include/ATen/ops/_cudnn_attention_backward.h +``` + +**Field Descriptions:** +- `resolved_path`: The final canonicalized absolute path after resolution +- `type`: Artifact type (typically "header") +- `source`: The .d file where this path was found +- `d_file_path`: The original path as it appeared in the .d file (may be relative or absolute) + +**Common Causes:** +- **Optional headers**: PyTorch/CUDA headers that don't exist in all installations (e.g., `_cudnn_attention_*`) +- **Old CUDA paths**: References to previous CUDA versions no longer installed (e.g., `cuda-13.0` when only `cuda-12.9` exists) +- **Build artifacts**: Temporary generated files deleted after build completion +- **Stale .d files**: Dependency files from previous builds with different directory structures + +**Action:** Review the list and determine if these are expected (optional/temporary) or indicate path resolution issues. + +## Iterative Workflow + +1. **Run scanner** on build directory +2. **Review outputs**: + - `scan_output/unknown.yml` - unmapped artifacts requiring pattern additions + - `scan_output/path_issues.yml` - non-existent paths (may indicate stale builds or optional dependencies) +3. 
**Add patterns** to `metadata/*.yml` files for unknown artifacts +4. **Re-run** to verify improved coverage +5. **Repeat** until all artifacts mapped + +## Pattern Matching + +### Strategy Priority (High → Low) + +1. **Exact match**: `libcudart.so.12` → `cuda-cudart` +2. **Path alias**: `/build/pytorch/include/` → `pytorch` +3. **Generic inference**: `libfoobar.so` → `foobar` + +### Adding Patterns + +Edit existing or create new YAML file in `metadata/`: + +```yaml +name: cutlass +description: CUDA Templates for Linear Algebra Subroutines + +basename_matches: + - libcutlass.a + +linker_flags_matches: + - -lcutlass + +directory_matches: + - cutlass # Single: matches any /cutlass/ in path + - 3rdparty/cutlass # Multi: matches /3rdparty/cutlass/ sequence +``` + +#### Multi-Directory Patterns + +Directory patterns support both single and multi-directory matching: + +**Single Component:** +- `"pytorch"` matches any path containing `/pytorch/` +- Example: `/home/build/pytorch/include/torch.h` ✓ + +**Multi-Directory:** +- `"3rdparty/cutlass"` matches consecutive `/3rdparty/cutlass/` sequence +- `"foo/bar"` matches `/home/foo/bar/file.h` ✓ +- `"foo/bar"` does NOT match `/home/foobar/file.h` ✗ (no substring matching) + +**Matching Rules:** +- Exact component matching only (no substrings) +- `"oo/ba"` will NOT match `/foo/bar/` +- Rightmost match wins if pattern appears multiple times +- Leading/trailing slashes are ignored (`"/foo/bar/"` = `"foo/bar"`) + +See `metadata/_template.yml` and `metadata/README.md` for details. + +## YAML Dependencies + +Each dependency file contains: + +```yaml +name: pytorch +description: PyTorch machine learning framework +license: BSD-3-Clause +copyright: Copyright (c) PyTorch Contributors +homepage: https://pytorch.org/ +source: container + +basename_matches: + - libtorch.so + - libc10.so + +linker_flags_matches: + - -ltorch_python + +directory_matches: + - ATen + - c10 + - caffe2 + - torch + +aliases: + - torch +``` + +Multiple dependencies can be grouped in list format (see `metadata/base.yml`, `metadata/cuda.yml`). + +## Testing + +```bash +cd tests +python -m pytest test_scan_build_artifacts.py -v +``` + +## Troubleshooting + +**Low dpkg coverage** +- Running on non-Debian system +- YAML dependencies will handle more as fallback, with concrete patterns. + +**Many unknown artifacts** +1. Review `scan_output/unknown.yml` +2. Add patterns to `metadata/*.yml` +3. Run `--validate` to check syntax +4. Re-scan to verify + +**Wrong mappings** +- Check pattern priorities in YAML files +- More specific patterns should be listed first +- Make sure the patterns are very specific, to avoid false positives, or interfering with other patterns. + +**High percentage in path_issues.yml** +- If >20%, likely indicates stale build artifacts - run a clean rebuild +- If <10%, likely optional/temporary headers - expected behavior +- Check for references to uninstalled CUDA versions + +**Slow performance** +- Use `--build-dir` to target specific subdirectories +- Reduce build artifacts scope + +## Architecture + +``` +scan_build_artifacts.py (1,300 lines) +├── DpkgResolver - dpkg-query for system packages +├── ArtifactCollector - Parse D files, link files, wheels +├── PatternMatcher - 3-tier YAML pattern matching +└── OutputGenerator - Generate YAML reports +``` + +**Artifact Sources:** +- D files: CMake dependency files with headers. Dependency source header files. +- link.txt: Linker commands with libraries. Precompiled dependency artifacts. +- Wheels: Python binaries via readelf. 
Runtime dependency artifacts. + +**Special Parsing Behaviors:** + +1. **Malformed .d File Handling** (_parse_d_file method) + - Some CMake-generated .d files contain paths with trailing colons + - Example: `/usr/include/stdc-predef.h:` (should be `/usr/include/stdc-predef.h`) + - Parser strips trailing colons to handle these malformed entries + - Prevents duplicate artifacts and improves accuracy + +2. **CMakeFiles Linker Artifact Extraction** (_parse_link_file method) + - CMake generates special linker artifacts in CMakeFiles directories + - Pattern: `/path/CMakeFiles/foo.dir/-Wl,-soname,libtest.so.1` + - Parser extracts library name and converts to linker flag: `-ltest` + - Enables proper dependency mapping for internal build artifacts + +3. **CMake .d File Path Resolution** (_parse_d_file method, lines 356-364) + - **Critical Fix (October 2025)**: Changed context directory for path resolution + - CMake generates .d files with paths relative to the **target's build directory** (where the Makefile for that target is located), **NOT** the top-level build directory + - **Context Directory**: Parent directory of `CMakeFiles/` (e.g., `/build/tensorrt_llm/batch_manager/`) + - **Example**: For .d file at `/build/tensorrt_llm/batch_manager/CMakeFiles/target.dir/file.cpp.o.d`: + - **Context is**: `/build/tensorrt_llm/batch_manager/` (parent of CMakeFiles) + - **NOT**: `/build/` (top-level build directory) + - Relative path `../../../tensorrt_llm/...` resolves correctly from this context + - **Before Fix**: Used `d_file.parent` (adjacent to CMakeFiles directory) - caused 49.9% path resolution errors + - **After Fix**: Uses parent of CMakeFiles directory - reduced errors to 7.2% + - **Path Existence Tracking**: Scanner marks each artifact with `path_exists` metadata and reports non-existent paths in `path_issues.yml` + + **Algorithm:** + ```python + d_file_parts = d_file.parts + if 'CMakeFiles' in d_file_parts: + cmake_idx = d_file_parts.index('CMakeFiles') + context_dir = Path(*d_file_parts[:cmake_idx]) # Parent of CMakeFiles + else: + context_dir = self.build_dir # Fallback + ``` + +4. **3rdparty Submodule Resolution** (_parse_d_file method) + - When D files contain relative paths with submodule directories that don't exist relative to the build directory, the scanner attempts to resolve them from the configured submodules directory + - **Configuration**: Set via `THIRDPARTY_ROOT` constant in scan_build_artifacts.py + - **Default**: `TRTLLM_ROOT/3rdparty` (3 levels up from scanner location) + - **Customization**: Edit the `THIRDPARTY_ROOT` constant if dependencies move (e.g., to `${CMAKE_BINARY_DIR}/_deps/`) + - **Example**: `../../../../3rdparty/xgrammar/include/file.h` resolves to `{THIRDPARTY_ROOT}/xgrammar/include/file.h` + +**Resolution Flow:** +1. Collect artifacts from build directory +2. Try dpkg-query resolution (PRIMARY) +3. Fall back to YAML patterns (FALLBACK) +4. 
Generate known.yml, unknown.yml, and path_issues.yml reports + +## Files + +- `scan_build_artifacts.py` - Main scanner script +- `metadata/*.yml` - Dependency patterns +- `metadata/_template.yml` - Template for new dependencies +- `metadata/_schema.yml` - YAML validation schema +- `metadata/README.md` - Pattern documentation +- `tests/test_scan_build_artifacts.py` - Unit tests diff --git a/cpp/dependency_scan/metadata/README.md b/cpp/dependency_scan/metadata/README.md new file mode 100644 index 00000000000..f55990958e3 --- /dev/null +++ b/cpp/dependency_scan/metadata/README.md @@ -0,0 +1,761 @@ +# Dependency Patterns + +This directory contains pattern definitions for dependency detection in the TensorRT-LLM C++ dependency scanner. + +## Quick Start + +After running the scanner, check `scan_output/unknown.yml` for unmapped artifacts, then add patterns here. + +## Structure + +Each `.yml` file represents one or more dependencies: + +``` +metadata/ +├── _template.yml # Template for new dependencies +├── _schema.yml # JSON schema for validation +├── base.yml # Base system packages (list format) +├── cuda.yml # CUDA packages (list format) +├── tensorrt-llm.yml # Individual dependency +├── pytorch.yml +└── ... (23 total files) +``` + +## File Formats + +### Individual Dependency File + +Most dependencies use this format: + +```yaml +# metadata/pytorch.yml + +name: pytorch # Required: canonical name +description: PyTorch machine learning framework # Required: min 10 chars + +license: BSD-3-Clause # Optional: SPDX identifier +copyright: Copyright (c) PyTorch Contributors # Optional +homepage: https://pytorch.org/ # Optional: URL +source: container # Optional: how obtained (container, submodule, fetched) + +basename_matches: # Exact basename matches + - libtorch.so + - libc10.so + +linker_flags_matches: # Linker flags (-l flags) + - -ltorch_python + +directory_matches: # Directory path patterns + - ATen + - c10 + - torch +``` + +### List Format (base.yml, cuda.yml) + +System packages use a list format for compactness: + +```yaml +# metadata/base.yml or cuda.yml + +dependencies: + - name: libc6 + description: GNU C Library + source: container + basename_matches: + - libc.so.6 + linker_flags_matches: + - -lc + - -lpthread + - -ldl + directory_matches: [] + + - name: libstdc++6 + description: GNU C++ Library + source: container + basename_matches: + - libstdc++.so.6 + linker_flags_matches: + - -lstdc++ + directory_matches: [] + # ... more system libraries +``` + +## Field Names Reference + +**Current field names** (as of latest schema): +- `basename_matches` - Exact filename matches (not "patterns") +- `linker_flags_matches` - Linker flags (not "linker_flags") +- `directory_matches` - Path component patterns (not "path_components") + +## Iterative Pattern Development + +This section describes the recommended workflow for achieving high coverage through iterative pattern refinement. + +### Workflow Steps + +1. **Run the scanner** on your build directory: + ```bash + python scan_build_artifacts.py --build-dir /path/to/build + ``` + +2. **Examine scan_output/unknown.yml** to identify unmapped artifacts: + ```bash + cat scan_output/unknown.yml + ``` + + Example output: + ```yaml + summary: + count: 42 + action_required: Add patterns to YAML files in metadata/ for these artifacts + + artifacts: + - /build/3rdparty/newlib/include/foo.h + - /usr/local/cuda-13.0/include/cuda.h + - libfoo.so + - -lbar + ``` + +3. 
**Analyze patterns** in unknown artifacts: + - Group artifacts by logical dependency + - Identify common directory paths + - Note exact library names and linker flags + - Look for version patterns (e.g., cuda-12.9, cuda-13.0) + +4. **Add or update patterns** in metadata YAML files: + - For new dependencies: Copy `_template.yml` and create new file + - For existing dependencies: Update relevant YAML file + - Use the most powerful matching strategy (see below) + +5. **Validate your changes**: + ```bash + python scan_build_artifacts.py --validate + ``` + +6. **Re-run scanner** to verify improvements: + ```bash + python scan_build_artifacts.py + ``` + +7. **Check results**: + ```bash + # Check summary in scan_output/known.yml + grep "coverage:" scan_output/known.yml + + # Check remaining unknowns + grep "count:" scan_output/unknown.yml + ``` + +8. **Repeat** steps 2-7 until `scan_output/unknown.yml` shows `count: 0` + +### Achieving 100% Coverage + +The goal is to reduce unknown artifacts to zero. Key strategies: + +- **Start with directory_matches**: Most powerful pattern type (see below) +- **Use version-agnostic patterns**: Match across multiple versions (see next section) +- **Group related artifacts**: Single dependency file can match headers, libs, and linker flags +- **Test incrementally**: Add patterns for one dependency at a time +- **Validate frequently**: Catch syntax errors early with `--validate` + +## Version-Agnostic Pattern Matching + +For dependencies with multiple versions (e.g., CUDA 12.9, 13.0), use patterns that match all versions. + +### Problem + +Artifacts from different CUDA versions: +``` +/usr/local/cuda-12.9/include/cuda.h +/usr/local/cuda-13.0/include/cuda.h +/usr/local/cuda/include/cuda.h +``` + +### Solution: Version-Agnostic Patterns + +Use `directory_matches` with version-agnostic patterns: + +```yaml +# metadata/cuda.yml +name: cuda-cudart +description: CUDA Runtime Library + +directory_matches: + - cuda-12.9 # Matches /cuda-12.9/ paths + - cuda-13.0 # Matches /cuda-13.0/ paths + - cuda # Matches /cuda/ paths (generic fallback) +``` + +### When to Use This Approach + +- **Multiple versions installed**: Different CUDA/TensorRT versions in same environment +- **Version symlinks**: Generic paths like `/usr/local/cuda/` alongside versioned ones +- **Forward compatibility**: Pattern works for future versions without updates +- **Container evolution**: Handles version changes between container builds + +### Best Practices + +1. **List specific versions first**: More specific patterns take priority + ```yaml + directory_matches: + - cuda-12.9 # Specific version + - cuda-13.0 # Specific version + - cuda # Generic fallback + ``` + +2. **Use with basename_matches**: Combine with exact filename matching + ```yaml + basename_matches: + - libcudart.so.12 + - libcudart.so.13 + + directory_matches: + - cuda-12.9 + - cuda-13.0 + - cuda + ``` + +3. **Test across versions**: Verify patterns work with different installations + +4. 
**Document version ranges**: Add comments for clarity + ```yaml + directory_matches: + - cuda-12.9 # CUDA 12.9.x + - cuda-13.0 # CUDA 13.0.x + - cuda # Generic (all versions) + ``` + +## Adding Patterns + +### When You See Unknown Artifacts + +After running the scanner, check `scan_output/unknown.yml`: + +```yaml +summary: + count: 2 + action_required: Add patterns to YAML files in metadata/ for these artifacts + +artifacts: + - /build/3rdparty/newlib/include/foo.h + - libfoo.so +``` + +### Option A: Add to Existing Dependency + +If `libfoo.so` belongs to an existing dependency (e.g., `pytorch`): + +1. Open `metadata/pytorch.yml` +2. Add to the `basename_matches` list: + ```yaml + basename_matches: + - libtorch.so + - libfoo.so # ← Add here + ``` +3. Re-run scanner: + ```bash + python ../scan_build_artifacts.py + ``` + +### Option B: Create New Dependency + +If this is a new dependency: + +1. Copy the template: + ```bash + cd metadata/ + cp _template.yml foo-library.yml + ``` + +2. Edit the file: + ```yaml + name: foo-library + description: Foo library for data processing + source: submodule + + basename_matches: + - libfoo.so + - libfoo.a + + linker_flags_matches: + - -lfoo + + directory_matches: + - foo-library + ``` + +3. Validate and re-run: + ```bash + python ../scan_build_artifacts.py --validate + python ../scan_build_artifacts.py + ``` + +## Pattern Matching Behavior + +The scanner uses a **3-tier matching strategy**: + +### 1. Exact Pattern Matching (HIGH confidence) +Matches exact filenames or linker flags: + +**Basename matches:** +```yaml +basename_matches: + - libcudart.so.12 # Matches only "libcudart.so.12" exactly + - libcudart.so.12.0 # Matches only "libcudart.so.12.0" exactly +``` + +**Linker flags:** +```yaml +linker_flags_matches: + - -lpthread # Matches "-lpthread" in link.txt + - -lcudart # Matches "-lcudart" +``` + +### 2. Path Alias Matching (MEDIUM confidence) +Matches directory components in paths. **Now supports multi-directory patterns!** + +**Single component:** +```yaml +directory_matches: + - pytorch # Matches any path containing /pytorch/ + # Example: /build/pytorch/include/torch.h ✓ +``` + +**Multi-directory (NEW):** +```yaml +directory_matches: + - 3rdparty/cutlass # Matches /3rdparty/cutlass/ sequence + - external/NVIDIA/cutlass # Matches full /external/NVIDIA/cutlass/ sequence +``` + +**Matching rules:** +- Exact component match only (no substring matching) +- `"foo/bar"` matches `/home/foo/bar/file.h` ✓ +- `"foo/bar"` does NOT match `/home/foobar/file.h` ✗ +- `"oo/ba"` does NOT match `/foo/bar/file.h` ✗ +- Rightmost match wins if pattern appears multiple times in path + +### 3. Generic Inference (LOW confidence) +Fallback: extracts library name from `-lfoo` → `foo` + +### Pattern Matching Power Ranking + +**Most Powerful → Least Powerful:** + +1. **directory_matches** - Matches entire directories of headers/files + - Example: `directory_matches: [pytorch]` matches 4,822+ PyTorch headers + - Single pattern can cover hundreds or thousands of artifacts + +2. **basename_matches** - Matches specific library files + - Example: `basename_matches: [libtorch.so]` matches one library + - Good for targeting specific libraries + +3. **linker_flags_matches** - Matches linker flags in link.txt files + - Example: `linker_flags_matches: [-ltorch]` matches one linker flag + - Useful for libraries without headers in build + +**Recommendation:** Start with `directory_matches` for maximum coverage with minimal patterns. 
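+
+The matching rules above are easiest to see in code. The following is a minimal sketch of the whole-component matching idea only; the function name is illustrative, it is not the scanner's actual `PatternMatcher`, and pattern priority (e.g. "rightmost match wins") is omitted:
+
+```python
+from pathlib import PurePosixPath
+
+def directory_pattern_matches(pattern: str, path: str) -> bool:
+    """True if the pattern's components appear as a consecutive run of
+    whole path components (no substring matching)."""
+    wanted = [c for c in pattern.strip("/").split("/") if c]
+    parts = list(PurePosixPath(path).parts)
+    return any(parts[i:i + len(wanted)] == wanted
+               for i in range(len(parts) - len(wanted) + 1))
+
+assert directory_pattern_matches("foo/bar", "/home/foo/bar/file.h")
+assert directory_pattern_matches("3rdparty/cutlass", "/src/3rdparty/cutlass/include/gemm.h")
+assert not directory_pattern_matches("foo/bar", "/home/foobar/file.h")
+assert not directory_pattern_matches("oo/ba", "/foo/bar/file.h")
+```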
+ +## Required Fields + +Every dependency MUST have: + +```yaml +name: my-dep # Required: lowercase, hyphenated, + allowed +description: "..." # Required: minimum 10 characters +``` + +At least one pattern section is required: +- `basename_matches` (exact filenames) +- `linker_flags_matches` (-l flags) +- `directory_matches` (path components) + +## Optional Fields + +Recommended for attribution/licensing: + +```yaml +version: "1.0" # Optional: version string +license: "Apache-2.0" # Optional: SPDX identifier +copyright: "Copyright 2024 NVIDIA" # Optional: copyright notice +homepage: "https://example.com" # Optional: project URL +source: "submodule" # Optional: how obtained +``` + +Valid `source` values: +- `submodule` - Git submodules in 3rdparty/ directory +- `container` - Pre-installed in container image (e.g., PyTorch, CUDA) +- `fetched` - Downloaded from URL and built from source + +## Multi-Directory Pattern Examples + +### Example 1: Vendor Directory Boundaries + +```yaml +# metadata/cutlass.yml +name: cutlass +description: CUDA Templates for Linear Algebra Subroutines +source: submodule + +directory_matches: + - cutlass # Single: matches any /cutlass/ in path + - 3rdparty/cutlass # Multi: matches /3rdparty/cutlass/ sequence + - external/NVIDIA/cutlass # Multi: matches full sequence +``` + +**Why multi-directory?** Prevents false positives: +- `"cutlass"` alone might match `/other-project/cutlass/` (unwanted) +- `"3rdparty/cutlass"` is more specific and safer + +### Example 2: Nested Dependencies + +```yaml +# metadata/dlpack.yml +name: dlpack +description: Deep Learning Pack +source: submodule + +directory_matches: + - 3rdparty/xgrammar/3rdparty/dlpack # Nested submodule path +``` + +Matches `/build/3rdparty/xgrammar/3rdparty/dlpack/include/dlpack.h` + +## Finding Which File to Edit + +Search by library name: + +```bash +cd metadata/ +grep -r "libtorch.so" . +# Output: ./pytorch.yml: - libtorch.so +``` + +Search by dependency name: + +```bash +grep "^name: pytorch" *.yml +# Output: pytorch.yml:name: pytorch +``` + +List all dependencies: + +```bash +grep "^name:" *.yml | sort +``` + +Search in list format files (base.yml, cuda.yml): + +```bash +grep -A 5 "name: libc6" base.yml +``` + +## Validation + +### Manual Validation + +After adding patterns, validate the YAML structure: + +```bash +python ../scan_build_artifacts.py --validate +``` + +Expected output: + +``` +================================================================================ +YAML Validation +================================================================================ + +✓ base.yml:libc6 +✓ base.yml:libstdc++6 +✓ cuda.yml:cuda-cudart-dev +✓ pytorch.yml +✓ tensorrt-llm.yml +... + +================================================================================ +Results: 25/25 valid, 0/25 invalid +================================================================================ +``` + +### Re-run Scanner + +After adding patterns, re-run the scanner: + +```bash +python ../scan_build_artifacts.py +``` + +Check `scan_output/unknown.yml` - should have fewer (or zero) artifacts: + +```yaml +summary: + count: 0 # Improved from previous run! + coverage: 100.0% + +artifacts: [] +``` + +### Schema Validation + +The `_schema.yml` file defines validation rules: +- Required fields: `name`, `description` +- Field types (string, array, etc.) +- Field patterns (e.g., linker flags must start with `-l`) +- Minimum lengths +- Unique items in arrays + +## Common Mistakes + +### 1. Using Old Field Names + +```yaml +patterns: [...] 
# ❌ Wrong (old name) +basename_matches: [...] # ✓ Correct + +linker_flags: [...] # ❌ Wrong (old name) +linker_flags_matches: [...] # ✓ Correct + +path_components: [...] # ❌ Wrong (old name) +directory_matches: [...] # ✓ Correct +``` + +### 2. Missing Required Fields + +```yaml +name: my-dep # ✓ Required +description: "..." # ✓ Required (min 10 chars) +``` + +### 3. Empty Pattern Sections + +```yaml +basename_matches: [] # ❌ Need at least one pattern section +linker_flags_matches: [] +directory_matches: [] +``` + +Must have at least one of: `basename_matches`, `linker_flags_matches`, or `directory_matches` + +### 4. Wrong Linker Flag Format + +```yaml +linker_flags_matches: + - pthread # ❌ Wrong + - -lpthread # ✓ Correct (must start with -l) +``` + +### 5. Substring Matching in directory_matches + +```yaml +directory_matches: + - oo/ba # ❌ Won't match /foo/bar/ (no substring matching) + - foo/bar # ✓ Correct (exact component match) +``` + +### 6. Invalid source Field + +```yaml +source: apt # ❌ Wrong (old enum value) +source: container # ✓ Correct (new enum) +source: pip # ❌ Wrong (old enum value) +source: submodule # ✓ Correct (new enum) +``` + +### 7. Duplicate Patterns Across Files + +Scanner will warn if same pattern appears in multiple files: + +``` +Warning: Duplicate pattern 'libfoo.so' found in bar.yml +(previously mapped to 'foo', now 'bar') +``` + +Last loaded file wins (alphabetical order). Remove duplicates. + +### 8. Invalid Name Format + +```yaml +name: MyDep # ❌ Wrong (uppercase) +name: my_dep # ❌ Wrong (underscore) +name: my-dep # ✓ Correct (lowercase, hyphenated) +name: cuda-12 # ✓ Correct (numbers ok) +name: libstdc++6 # ✓ Correct (+ allowed) +``` + +## Troubleshooting + +### Issue: Unknown artifacts not resolving after adding pattern + +**Cause**: Pattern doesn't match artifact path. + +**Solution**: +1. Check exact artifact path in `scan_output/unknown.yml` +2. Use correct field names: `basename_matches`, not `patterns` +3. For directories, use `directory_matches` +4. Check for typos in pattern + +Example: +```yaml +# If unknown.yml shows: +artifacts: + - /build/pytorch/lib/libtorch.so.2.0 + +# Add exact match: +basename_matches: + - libtorch.so.2.0 + +# OR use directory matching: +directory_matches: + - pytorch +``` + +### Issue: Multi-directory pattern not working + +**Cause**: Substring matching expectations. + +**Solution**: +- Multi-directory patterns require **exact component matches** +- `"oo/ba"` will NOT match `/foo/bar/` +- Use full component names: `"foo/bar"` + +Example: +```yaml +directory_matches: + - vendor/cutlass # ✓ Matches /vendor/cutlass/ + - cutlass # ✓ Also works (single component) + - end/cutlass # ❌ Won't match /vendor/cutlass/ (no substring matching) +``` + +### Issue: dpkg.yml not found + +**Cause**: File was renamed to `base.yml`. + +**Solution**: +```bash +# Old (incorrect) +grep "pattern" metadata/dpkg.yml + +# New (correct) +grep "pattern" metadata/base.yml +``` + +### Issue: Validation fails with schema error + +**Cause**: YAML structure doesn't match schema. + +**Solution**: +1. Compare with `_template.yml` +2. Ensure required fields present (`name`, `description`) +3. Check linker flags start with `-l` +4. 
Use correct field names: `basename_matches`, `linker_flags_matches`, `directory_matches` + +Example error: +``` +❌ foo.yml: 'description' is too short (minimum 10 characters) +``` + +Fix: +```yaml +description: "Foo library for data processing" # At least 10 chars +``` + +### Issue: Coverage decreased after changes + +**Cause**: Removed or moved patterns incorrectly. + +**Solution**: +1. Check git diff to see what changed +2. Re-add removed patterns +3. Run validation to ensure no syntax errors + +```bash +git diff metadata/ +python ../scan_build_artifacts.py --validate +``` + +## Best Practices + +1. **One dependency per file** (except base.yml/cuda.yml for system libs) +2. **Use descriptive names**: `cuda-cudart-12` not `cudart12` +3. **Use multi-directory patterns** for vendored dependencies to avoid false positives +4. **Add metadata** (license, copyright, homepage) for attribution +5. **Validate after changes**: `python ../scan_build_artifacts.py --validate` +6. **Test coverage**: Re-run scanner after adding patterns +7. **Use correct field names**: `basename_matches`, not `patterns` +8. **Keep base.yml for system libraries only** (resolved via dpkg-query) +9. **Use `source: container`** for pre-installed packages (PyTorch, CUDA) +10. **Use `source: submodule`** for 3rdparty/ git submodules +11. **Start with directory_matches**: Most powerful pattern type for coverage +12. **Use version-agnostic patterns**: Match multiple versions with single pattern + +## Resolution Strategy + +The scanner uses a **two-tier resolution strategy**: + +### PRIMARY: dpkg-query +- System-installed packages +- High confidence +- Handles all CUDA, system libraries automatically + +### FALLBACK: YAML Patterns +Only used when dpkg-query doesn't know about the artifact: +1. Exact basename match → High confidence +2. Exact linker flag match → High confidence +3. Directory alias match → Medium confidence +4. Generic library inference → Low confidence + +**Key insight**: Most CUDA and system packages are resolved via dpkg-query (PRIMARY), not YAML patterns. This is why `cuda.yml` and `base.yml` are sparse - they only contain fallback patterns for artifacts dpkg doesn't know about. + +## Example: Complete Dependency File + +```yaml +# metadata/cutlass.yml + +name: cutlass +description: CUDA Templates for Linear Algebra Subroutines + +version: "3.5.0" +license: BSD-3-Clause +copyright: Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES +homepage: https://github.com/NVIDIA/cutlass +source: submodule + +basename_matches: + - libcutlass.a + +linker_flags_matches: + - -lcutlass + +directory_matches: + - cutlass + - 3rdparty/cutlass # Multi-directory: prevents false positives + - external/NVIDIA/cutlass # Multi-directory: vendor-specific path +``` + +## Schema Reference + +See `_schema.yml` for full JSON schema definition. + +Key constraints: +- `name`: Required, string, pattern `^[a-z0-9-+]+$`, min length 1 +- `description`: Required, string, min length 10 +- `version`: Optional, string, min length 1 +- `basename_matches`: Optional, array of strings, unique items +- `linker_flags_matches`: Optional, array of strings matching `^-l`, unique items +- `directory_matches`: Optional, array of strings, unique items (supports multi-directory) +- `source`: Optional, enum (submodule/container/fetched) + +At least one of `basename_matches`, `linker_flags_matches`, or `directory_matches` required. 
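+
+To spot-check a single metadata file outside the `--validate` flow, the same schema check can be reproduced with PyYAML and `jsonschema` (the scanner itself imports both). A minimal sketch, assuming it is run from `metadata/`; the dependency file name is just an example:
+
+```python
+import yaml
+from jsonschema import ValidationError, validate
+
+with open("_schema.yml") as f:
+    schema = yaml.safe_load(f)
+
+with open("cutlass.yml") as f:
+    dependency = yaml.safe_load(f)
+
+try:
+    validate(instance=dependency, schema=schema)
+    print("cutlass.yml: OK")
+except ValidationError as err:
+    print(f"cutlass.yml: {err.message}")
+```
+
+List-format files such as `base.yml` and `cuda.yml` keep multiple entries under a top-level `dependencies:` key, so each entry would be validated individually rather than the file as a whole.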
+ +## Support + +For issues or questions: +- Review `_schema.yml` for validation rules +- See `_template.yml` for new dependency template +- Run `python ../scan_build_artifacts.py --help` for CLI options +- Check scanner source code: `scan_build_artifacts.py` (PatternMatcher class, lines 620-926) +- Review output files: `scan_output/known.yml` and `scan_output/unknown.yml` +- See main README: `../README.md` for architecture and workflow details diff --git a/cpp/dependency_scan/metadata/_schema.yml b/cpp/dependency_scan/metadata/_schema.yml new file mode 100644 index 00000000000..dc0e82d8002 --- /dev/null +++ b/cpp/dependency_scan/metadata/_schema.yml @@ -0,0 +1,92 @@ +# JSON Schema for validating dependency YAML files +# Used by scanner to validate structure and required fields + +type: object + +required: + - name + - description + +properties: + name: + type: string + pattern: "^[a-z0-9-+]+$" + description: "Canonical dependency name (lowercase, hyphenated, + allowed)" + minLength: 1 + + version: + type: string + description: "Version string (semantic versioning recommended)" + minLength: 1 + + description: + type: string + description: "Brief description of the dependency" + minLength: 10 + + license: + type: string + description: "SPDX license identifier or license name" + + copyright: + type: string + description: "Copyright notice" + + homepage: + type: string + format: uri + description: "URL to project homepage or repository" + + source: + type: string + enum: ["submodule", "container", "fetched"] + description: | + How this dependency is obtained: + - submodule: Git submodules in 3rdparty/ directory + - container: Pre-installed in the container image (e.g., PyTorch) + - fetched: Downloaded from URL and built from source + + basename_matches: + type: array + items: + type: string + minLength: 1 + uniqueItems: true + description: "Exact basename matches (filename or library name)" + + linker_flags_matches: + type: array + items: + type: string + pattern: "^-l" + minLength: 3 + uniqueItems: true + description: "Linker flags (-l flags)" + + directory_matches: + type: array + items: + type: string + minLength: 1 + uniqueItems: true + description: | + Directory path patterns (single or multi-directory). + Supports exact consecutive component matching (no substring matching). 
+ + Examples: + - "pytorch" (single component: matches any path containing /pytorch/) + - "3rdparty/cutlass" (multi-directory: matches /3rdparty/cutlass/ sequence) + - "foo/bar/baz" (multi-directory: matches /foo/bar/baz/ sequence) + + Matching behavior: + - Exact component match only (no substrings) + - "foo/bar" matches "/home/foo/bar/file.h" ✓ + - "foo/bar" does NOT match "/home/foobar/file.h" ✗ + - "oo/ba" does NOT match "/foo/bar/file.h" ✗ + - Rightmost match wins if pattern appears multiple times in path + +# At least one pattern matching section required +anyOf: + - required: ["basename_matches"] + - required: ["linker_flags_matches"] + - required: ["directory_matches"] diff --git a/cpp/dependency_scan/metadata/_template.yml b/cpp/dependency_scan/metadata/_template.yml new file mode 100644 index 00000000000..17c1980d190 --- /dev/null +++ b/cpp/dependency_scan/metadata/_template.yml @@ -0,0 +1,14 @@ +name: my-dependency +description: Brief description of the dependency (minimum 10 characters) +license: '' +copyright: '' +homepage: '' +source: '' # submodule, container, or fetched +basename_matches: +- libexample.so +- libexample.so.1 +linker_flags_matches: +- -lexample +directory_matches: +- example_dir # Single component +- 3rdparty/example # Multi-directory (if bundled in vendor dir) diff --git a/cpp/dependency_scan/metadata/base.yml b/cpp/dependency_scan/metadata/base.yml new file mode 100644 index 00000000000..79581bca4f9 --- /dev/null +++ b/cpp/dependency_scan/metadata/base.yml @@ -0,0 +1,13 @@ +dependencies: +- name: libc6 + description: GNU C Library - shared libraries and system calls + source: container + path_components: [] + aliases: [] + basename_matches: [] + linker_flags_matches: + - -lc + - -ldl + - -lm + - -lpthread + - -lrt diff --git a/cpp/dependency_scan/metadata/cppzmq.yml b/cpp/dependency_scan/metadata/cppzmq.yml new file mode 100644 index 00000000000..7e119a567fe --- /dev/null +++ b/cpp/dependency_scan/metadata/cppzmq.yml @@ -0,0 +1,10 @@ +name: cppzmq +description: Header-only C++ binding for libzmq +license: MIT +copyright: Copyright (c) 2016-2024 zeromq.org +homepage: https://github.com/zeromq/cppzmq +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- cppzmq diff --git a/cpp/dependency_scan/metadata/cuda.yml b/cpp/dependency_scan/metadata/cuda.yml new file mode 100644 index 00000000000..5fae45fa764 --- /dev/null +++ b/cpp/dependency_scan/metadata/cuda.yml @@ -0,0 +1,22 @@ +dependencies: +- name: cuda-cudart-dev + description: CUDA Runtime API development libraries and headers + source: container + basename_matches: [] + linker_flags_matches: + - -lcudadevrt + directory_matches: + - cuda-12.9 + - cuda-13.0 + - cuda + - cooperative_groups + - cub + - thrust +- name: cuda-cudart-static + description: CUDA Runtime static library for device code linking + source: container + basename_matches: + - libcudart_static.a + linker_flags_matches: + - -lcudart_static + directory_matches: [] diff --git a/cpp/dependency_scan/metadata/cutlass.yml b/cpp/dependency_scan/metadata/cutlass.yml new file mode 100644 index 00000000000..a8b5e767064 --- /dev/null +++ b/cpp/dependency_scan/metadata/cutlass.yml @@ -0,0 +1,10 @@ +name: cutlass +description: CUDA Templates for Linear Algebra Subroutines +license: BSD-3-Clause +copyright: Copyright (c) 2017-2024 NVIDIA Corporation +homepage: https://github.com/NVIDIA/cutlass +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- cutlass diff --git 
a/cpp/dependency_scan/metadata/deep-ep.yml b/cpp/dependency_scan/metadata/deep-ep.yml new file mode 100644 index 00000000000..31bc48f03c5 --- /dev/null +++ b/cpp/dependency_scan/metadata/deep-ep.yml @@ -0,0 +1,7 @@ +name: deep-ep +description: DeepEP distributed execution platform for distributed training +source: fetched +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- deep_ep_download-src diff --git a/cpp/dependency_scan/metadata/deepgemm.yml b/cpp/dependency_scan/metadata/deepgemm.yml new file mode 100644 index 00000000000..af4f2350d20 --- /dev/null +++ b/cpp/dependency_scan/metadata/deepgemm.yml @@ -0,0 +1,7 @@ +name: deepgemm +description: DeepGEMM optimized matrix multiplication library +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- DeepGEMM diff --git a/cpp/dependency_scan/metadata/dlpack.yml b/cpp/dependency_scan/metadata/dlpack.yml new file mode 100644 index 00000000000..e2aad21669c --- /dev/null +++ b/cpp/dependency_scan/metadata/dlpack.yml @@ -0,0 +1,11 @@ +name: dlpack +description: Common in-memory tensor structure for deep learning frameworks (vendored + in xgrammar) +license: Apache-2.0 +copyright: Copyright 2017 by Contributors +homepage: https://github.com/dmlc/dlpack +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- dlpack diff --git a/cpp/dependency_scan/metadata/fmt.yml b/cpp/dependency_scan/metadata/fmt.yml new file mode 100644 index 00000000000..6d2f6ad28ed --- /dev/null +++ b/cpp/dependency_scan/metadata/fmt.yml @@ -0,0 +1,11 @@ +name: fmt +description: A modern formatting library for C++ +license: MIT (with exception) +copyright: Copyright (c) 2012-2024 Victor Zverovich +homepage: https://github.com/fmtlib/fmt +source: submodule +basename_matches: [] +linker_flags_matches: +- -lfmt +directory_matches: +- fmt diff --git a/cpp/dependency_scan/metadata/hedley.yml b/cpp/dependency_scan/metadata/hedley.yml new file mode 100644 index 00000000000..195080947a4 --- /dev/null +++ b/cpp/dependency_scan/metadata/hedley.yml @@ -0,0 +1,10 @@ +name: hedley +description: Header-only C/C++ compatibility library (vendored in nlohmann-json) +license: CC0-1.0 +copyright: Copyright 2016-2021 Evan Nemerson +homepage: https://nemequ.github.io/hedley/ +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- hedley diff --git a/cpp/dependency_scan/metadata/nanobind.yml b/cpp/dependency_scan/metadata/nanobind.yml new file mode 100644 index 00000000000..559aa4d534d --- /dev/null +++ b/cpp/dependency_scan/metadata/nanobind.yml @@ -0,0 +1,10 @@ +name: nanobind +description: Tiny and efficient C++/Python bindings +license: BSD-3-Clause +copyright: Copyright (c) 2022-2024 Wenzel Jakob +homepage: https://github.com/wjakob/nanobind +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- nanobind diff --git a/cpp/dependency_scan/metadata/nlohmann-json.yml b/cpp/dependency_scan/metadata/nlohmann-json.yml new file mode 100644 index 00000000000..267945c4b29 --- /dev/null +++ b/cpp/dependency_scan/metadata/nlohmann-json.yml @@ -0,0 +1,10 @@ +name: nlohmann-json +description: JSON for Modern C++ header-only library +license: MIT +copyright: Copyright (c) 2013-2024 Niels Lohmann +homepage: https://github.com/nlohmann/json +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- json diff --git a/cpp/dependency_scan/metadata/numa.yml b/cpp/dependency_scan/metadata/numa.yml new file mode 100644 index 
00000000000..75c2b9bc2a3 --- /dev/null +++ b/cpp/dependency_scan/metadata/numa.yml @@ -0,0 +1,8 @@ +name: libnuma1 +description: NUMA memory allocation library +source: container +path_components: [] +aliases: [] +basename_matches: [] +linker_flags_matches: +- -lnuma diff --git a/cpp/dependency_scan/metadata/nvshmem.yml b/cpp/dependency_scan/metadata/nvshmem.yml new file mode 100644 index 00000000000..dcb68972a3a --- /dev/null +++ b/cpp/dependency_scan/metadata/nvshmem.yml @@ -0,0 +1,8 @@ +name: nvshmem +description: NVIDIA Shared Memory (NVSHMEM) library for inter-GPU communication +license: NVIDIA +homepage: https://developer.nvidia.com/nvshmem +source: fetched +basename_matches: [] +linker_flags_matches: [] +directory_matches: [] diff --git a/cpp/dependency_scan/metadata/openmpi.yml b/cpp/dependency_scan/metadata/openmpi.yml new file mode 100644 index 00000000000..72cb1900193 --- /dev/null +++ b/cpp/dependency_scan/metadata/openmpi.yml @@ -0,0 +1,6 @@ +name: openmpi +description: openmpi libraries and components +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- ompi diff --git a/cpp/dependency_scan/metadata/picojson.yml b/cpp/dependency_scan/metadata/picojson.yml new file mode 100644 index 00000000000..adc23723630 --- /dev/null +++ b/cpp/dependency_scan/metadata/picojson.yml @@ -0,0 +1,10 @@ +name: picojson +description: Header-only JSON parser/serializer in C++ (vendored in xgrammar) +license: BSD-2-Clause +copyright: Copyright 2009-2010 Cybozu Labs, Inc., Copyright 2011-2014 Kazuho Oku +homepage: https://github.com/kazuho/picojson +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- picojson diff --git a/cpp/dependency_scan/metadata/pybind11.yml b/cpp/dependency_scan/metadata/pybind11.yml new file mode 100644 index 00000000000..b493bc6be3c --- /dev/null +++ b/cpp/dependency_scan/metadata/pybind11.yml @@ -0,0 +1,10 @@ +name: pybind11 +description: Seamless operability between C++11 and Python +license: BSD-3-Clause +copyright: Copyright (c) 2016-2024 Wenzel Jakob +homepage: https://github.com/pybind/pybind11 +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- pybind11 diff --git a/cpp/dependency_scan/metadata/pytorch.yml b/cpp/dependency_scan/metadata/pytorch.yml new file mode 100644 index 00000000000..e89b056e60a --- /dev/null +++ b/cpp/dependency_scan/metadata/pytorch.yml @@ -0,0 +1,15 @@ +name: pytorch +description: pytorch libraries and components +source: container +basename_matches: +- libc10.so +- libc10_cuda.so +- libtorch.so +- libtorch_python.so +linker_flags_matches: +- -ltorch_python +directory_matches: +- ATen +- c10 +- caffe2 +- torch diff --git a/cpp/dependency_scan/metadata/robin-map.yml b/cpp/dependency_scan/metadata/robin-map.yml new file mode 100644 index 00000000000..601cf08ef8b --- /dev/null +++ b/cpp/dependency_scan/metadata/robin-map.yml @@ -0,0 +1,14 @@ +name: robin-map +description: Fast hash map and hash set using robin hood hashing +license: MIT +copyright: Copyright (c) 2017 Thibaut Goetghebuer-Planchon +homepage: https://github.com/Tessil/robin-map +source: submodule +basename_matches: +- robin_map.h +- robin_hash.h +- robin_growth_policy.h +linker_flags_matches: [] +directory_matches: +- robin_map +- tsl diff --git a/cpp/dependency_scan/metadata/tensorrt-llm.yml b/cpp/dependency_scan/metadata/tensorrt-llm.yml new file mode 100644 index 00000000000..0766739b3b8 --- /dev/null +++ b/cpp/dependency_scan/metadata/tensorrt-llm.yml @@ -0,0 +1,12 @@ +name: tensorrt-llm 
+description: TensorRT-LLM core libraries and Python bindings built by this project +license: Apache-2.0 +homepage: https://github.com/NVIDIA/TensorRT-LLM +source: submodule +basename_matches: +- deep_ep_cpp_tllm.cpython-312-x86_64-linux-gnu.so +- deep_gemm_cpp_tllm.cpython-312-x86_64-linux-gnu.so +linker_flags_matches: +- -ltensorrt_llm +directory_matches: +- tensorrt_llm/cpp diff --git a/cpp/dependency_scan/metadata/tensorrt.yml b/cpp/dependency_scan/metadata/tensorrt.yml new file mode 100644 index 00000000000..b3c325c4c38 --- /dev/null +++ b/cpp/dependency_scan/metadata/tensorrt.yml @@ -0,0 +1,6 @@ +name: tensorrt +description: tensorrt libraries and components +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- tensorrt diff --git a/cpp/dependency_scan/metadata/ucx.yml b/cpp/dependency_scan/metadata/ucx.yml new file mode 100644 index 00000000000..95086c40f21 --- /dev/null +++ b/cpp/dependency_scan/metadata/ucx.yml @@ -0,0 +1,6 @@ +name: ucx +description: ucx libraries and components +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- ucx diff --git a/cpp/dependency_scan/metadata/ucxx.yml b/cpp/dependency_scan/metadata/ucxx.yml new file mode 100644 index 00000000000..19745c4619f --- /dev/null +++ b/cpp/dependency_scan/metadata/ucxx.yml @@ -0,0 +1,8 @@ +name: ucxx +description: C++ bindings for UCX (Unified Communication X) +homepage: https://github.com/rapidsai/ucxx +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- ucxx diff --git a/cpp/dependency_scan/metadata/xgrammar.yml b/cpp/dependency_scan/metadata/xgrammar.yml new file mode 100644 index 00000000000..e5ca1f6d805 --- /dev/null +++ b/cpp/dependency_scan/metadata/xgrammar.yml @@ -0,0 +1,8 @@ +name: xgrammar +description: XGrammar library for structured text generation +homepage: https://github.com/mlc-ai/xgrammar +source: submodule +basename_matches: [] +linker_flags_matches: [] +directory_matches: +- xgrammar diff --git a/cpp/dependency_scan/metadata/zeromq.yml b/cpp/dependency_scan/metadata/zeromq.yml new file mode 100644 index 00000000000..43a89b59bf4 --- /dev/null +++ b/cpp/dependency_scan/metadata/zeromq.yml @@ -0,0 +1,8 @@ +name: libzmq5 +description: ZeroMQ lightweight messaging kernel library +source: container +path_components: [] +aliases: [] +basename_matches: [] +linker_flags_matches: +- -lzmq diff --git a/cpp/dependency_scan/scan_build_artifacts.py b/cpp/dependency_scan/scan_build_artifacts.py new file mode 100755 index 00000000000..e9a608e52ee --- /dev/null +++ b/cpp/dependency_scan/scan_build_artifacts.py @@ -0,0 +1,1532 @@ +#!/usr/bin/env python3 +""" +Minimal Build Artifact Scanner for TensorRT-LLM + +Scans D files (headers), link.txt files (libraries), and wheels (binaries) +to generate a comprehensive dependency mapping report. 
+ +Resolution Strategy: + PRIMARY: dpkg-query for system packages + FALLBACK: YAML patterns from dependencies/ directory + +Output: + - known.yml: Successfully mapped artifacts grouped by dependency (paths only) + - unknown.yml: Unmapped artifacts needing pattern additions (paths only) + +Usage: + python scan_build_artifacts.py --build-dir build/ --output-dir scan_output/ + python scan_build_artifacts.py --validate # Validate YAML files +""" + +import argparse +import os +import re +import subprocess +import sys +import tempfile +import zipfile +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Set + +import yaml + +try: + from jsonschema import ValidationError, validate + JSONSCHEMA_AVAILABLE = True +except ImportError: + JSONSCHEMA_AVAILABLE = False + ValidationError = Exception # Fallback for type hints + +# Configuration: Submodules directory location +# This path points to the 3rdparty submodules directory. +# Change this constant if dependencies move to a different location. +# Current: TRTLLM_ROOT/3rdparty +# Future: May change to ${CMAKE_BINARY_DIR}/_deps/ +THIRDPARTY_ROOT = Path(__file__).parent.parent.parent / '3rdparty' + +# ============================================================================ +# MODULE 1: Data Models +# ============================================================================ + + +@dataclass +class Artifact: + """Represents a discovered build artifact (header, library, or binary)""" + path: str # Canonical resolved path + type: str # 'header', 'library', 'binary' + source: str # Which file discovered it (D file, link.txt, wheel) + context_dir: Optional[str] = None # For relative path resolution + metadata: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON serialization""" + return asdict(self) + + +@dataclass +class Mapping: + """Represents an artifact-to-dependency mapping""" + artifact: Artifact + dependency: str # Canonical dependency name + confidence: str # 'high', 'medium', 'low' + strategy: str # Which resolution strategy succeeded + metadata: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON serialization""" + result = asdict(self) + result['artifact'] = self.artifact.to_dict() + return result + + +# ============================================================================ +# MODULE 2: DpkgResolver (PRIMARY) +# ============================================================================ + + +class DpkgResolver: + """ + Resolves artifacts to packages using dpkg-query (system package manager). + + This is the PRIMARY resolution strategy for system-installed packages + (glibc, libstdc++, gcc, cuda-dev, etc.). + + Algorithm: + 1. For absolute paths: dpkg-query -S + 2. For -l flags: find_library_path() → dpkg-query -S + 3. Parse output: "package:arch: /path/to/file" + 4. Cache results to avoid repeated queries + 5. Normalize package names (remove :arch suffix, handle cuda packages) + """ + + def __init__(self): + self._cache: Dict[str, Optional[str]] = {} + self._lib_search_paths = self._get_library_search_paths() + + def _get_library_search_paths(self) -> List[str]: + """ + Get standard library search paths for resolving -l flags. 
+ + Returns system library directories in priority order: + - /lib/x86_64-linux-gnu + - /usr/lib/x86_64-linux-gnu + - /lib + - /usr/lib + - /usr/local/lib + """ + paths = [ + "/lib/x86_64-linux-gnu", + "/usr/lib/x86_64-linux-gnu", + "/lib", + "/usr/lib", + "/usr/local/lib", + ] + + # Add LD_LIBRARY_PATH if set + ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") + if ld_library_path: + paths.extend(ld_library_path.split(":")) + + return [p for p in paths if os.path.isdir(p)] + + def find_library_path(self, lib_name: str) -> Optional[str]: + """ + Resolve linker flag (-lpthread) to actual library path. + + Algorithm: + 1. Strip -l prefix: "-lpthread" → "pthread" + 2. Try patterns: libpthread.so, libpthread.so.*, libpthread.a + 3. Search in standard library directories + 4. Return first match or None + + Examples: + -lpthread → /lib/x86_64-linux-gnu/libpthread.so.0 + -lm → /lib/x86_64-linux-gnu/libm.so.6 + -lstdc++ → /usr/lib/x86_64-linux-gnu/libstdc++.so.6 + """ + if lib_name.startswith("-l"): + lib_name = lib_name[2:] # Remove -l prefix + + # Try different library name patterns + patterns = [ + f"lib{lib_name}.so", + f"lib{lib_name}.so.*", + f"lib{lib_name}.a", + ] + + for search_path in self._lib_search_paths: + for pattern in patterns: + # Use glob to match version suffixes + import glob + matches = glob.glob(os.path.join(search_path, pattern)) + if matches: + # Return first match (highest priority) + return matches[0] + + return None + + def get_package(self, file_path: str) -> Optional[str]: + """ + Query dpkg for package owning the file. + + Algorithm: + 1. Check cache for previous result + 2. Handle -l flags: find_library_path() first + 3. Execute: dpkg-query -S + 4. Parse output: "package:arch: /path/to/file" + 5. Extract package name, remove architecture suffix + 6. Normalize CUDA packages: cuda-cccl-12-9 → cuda-cccl + 7. Cache result and return + + Examples: + /usr/include/c++/13/vector → libstdc++-13-dev + -lpthread → libc6 + /usr/local/cuda-12.9/include/cuda.h → cuda-cudart-dev-12-9 → cuda-cudart-dev + """ + # Check cache first + if file_path in self._cache: + return self._cache[file_path] + + # Handle linker flags + if file_path.startswith("-l"): + resolved_path = self.find_library_path(file_path) + if not resolved_path: + self._cache[file_path] = None + return None + file_path = resolved_path + + # Query dpkg + try: + result = subprocess.run(["dpkg-query", "-S", file_path], + capture_output=True, + text=True, + timeout=5) + + if result.returncode != 0: + self._cache[file_path] = None + return None + + # Parse output: "package:arch: /path/to/file" + output = result.stdout.strip() + if ":" in output: + package_part = output.split(":", 1)[0] + # Remove architecture suffix (package:amd64 → package) + package = package_part.split( + ":")[0] if ":" in package_part else package_part + + # Normalize CUDA packages + package = self._normalize_cuda_package(package) + + self._cache[file_path] = package + return package + + except (subprocess.TimeoutExpired, FileNotFoundError, Exception): + pass + + self._cache[file_path] = None + return None + + @staticmethod + def _normalize_cuda_package(package: str) -> str: + """ + Normalize CUDA package names by removing version suffixes. 
+ + Examples: + cuda-cccl-12-9 → cuda-cccl + cuda-cudart-dev-12-9 → cuda-cudart-dev + libcublas-dev-12-9 → libcublas-dev + libc6 → libc6 (no change) + """ + # Pattern: package-name-##-# → package-name + match = re.match(r"^(.+?)-(\d+)-(\d+)$", package) + if match: + base_name = match.group(1) + # Only normalize if it looks like a CUDA/NVIDIA package + if any(x in base_name for x in [ + "cuda", "cublas", "curand", "cusolver", "cusparse", + "nvjitlink", "nvinfer" + ]): + return base_name + + return package + + +# ============================================================================ +# MODULE 3: ArtifactCollector +# ============================================================================ + + +class ArtifactCollector: + """ + Collects artifacts from D files (headers), link.txt files (libraries), and wheels (binaries). + + Args: + build_dir: Path to build directory to scan + """ + + def __init__(self, build_dir: Path): + self.build_dir = build_dir + + def collect_all(self) -> List[Artifact]: + """ + Collect all artifacts from build directory. + + Algorithm: + 1. Find all *.d files → parse headers + 2. Find all link.txt files → parse libraries + 3. Find all *.whl files → extract and scan binaries + 4. Return combined deduplicated list + """ + artifacts = [] + + # Collect from D files + d_files = list(self.build_dir.rglob("*.d")) + for d_file in d_files: + artifacts.extend(self._parse_d_file(d_file)) + + # Collect from link files + link_files = list(self.build_dir.rglob("link.txt")) + for link_file in link_files: + artifacts.extend(self._parse_link_file(link_file)) + + # Collect from wheels + wheel_files = list(self.build_dir.rglob("*.whl")) + for wheel_file in wheel_files: + artifacts.extend(self._scan_wheel(wheel_file)) + + # Deduplicate by path + seen = set() + unique_artifacts = [] + for artifact in artifacts: + if artifact.path not in seen: + seen.add(artifact.path) + unique_artifacts.append(artifact) + + return unique_artifacts + + def _parse_d_file(self, d_file: Path) -> List[Artifact]: + """ + Parse CMake dependency file (.d) to extract header dependencies. + + Algorithm: + 1. Read file content + 2. Handle line continuations (backslash at end) + 3. Split by whitespace to get all paths + 4. Skip first token (target: header1 header2 ...) + 5. Strip trailing colons from paths (handles malformed .d files) + 6. Resolve relative paths from depfile's parent directory + 7. Filter out non-existent paths + 8. Canonicalize with os.path.realpath() + + Malformed .d File Handling: + Some CMake-generated .d files contain paths with trailing colons. + Example: '/usr/include/stdc-predef.h:' (should be '/usr/include/stdc-predef.h') + + This is caused by incorrect formatting in CMake's dependency tracking. + The parser strips trailing colons using rstrip(':') to handle these cases, + preventing duplicate artifacts and improving accuracy. 
+ + Example D file: + ``` + build/foo.o: /usr/include/stdio.h \ + ../include/myheader.h \ + /usr/local/cuda/include/cuda.h + ``` + """ + + artifacts = [] + + try: + content = d_file.read_text(encoding='utf-8', errors='ignore') + except Exception: + return artifacts + + # Handle line continuations + content = content.replace("\\\n", " ").replace("\\\r\n", " ") + + # Split by whitespace + tokens = content.split() + + # Skip first token (target:) + if not tokens or not tokens[0].endswith(":"): + return artifacts + + header_paths = tokens[1:] + # CMake .d files use paths relative to the target's build directory (where Makefile is) + # This is the parent of the CMakeFiles directory + d_file_parts = d_file.parts + if 'CMakeFiles' in d_file_parts: + cmake_idx = d_file_parts.index('CMakeFiles') + context_dir = Path(*d_file_parts[:cmake_idx]) + else: + # Fallback to build_dir if no CMakeFiles in path + context_dir = self.build_dir + + for header_path in header_paths: + # Strip trailing colons from paths (malformed .d files) + # Some .d files have malformed entries like '/usr/include/stdc-predef.h:' + header_path = header_path.rstrip(':') + if not header_path: + continue + + # Store original relative path before joining (for 3rdparty resolution) + original_header_path = header_path + + # Resolve relative paths + if not os.path.isabs(header_path): + header_path = os.path.join(context_dir, header_path) + + # Canonicalize path + try: + canonical_path = os.path.realpath(header_path) + + # If path doesn't exist and contains submodules dir pattern, try resolving from submodules directory + submodules_pattern = f'{THIRDPARTY_ROOT.name}/' + if not os.path.exists( + canonical_path + ) and submodules_pattern in original_header_path: + # Extract the part starting from FIRST submodules dir pattern in ORIGINAL path (handles nested dirs) + # e.g., ../../../../3rdparty/xgrammar/3rdparty/picojson/picojson.h + # should extract: xgrammar/3rdparty/picojson/picojson.h + idx = original_header_path.find(submodules_pattern) + if idx != -1: + relative_part = original_header_path[ + idx + len(submodules_pattern):] + # Resolve from THIRDPARTY_ROOT constant + alternative_path = THIRDPARTY_ROOT / relative_part + alternative_canonical = os.path.realpath( + str(alternative_path)) + + if os.path.exists(alternative_canonical): + canonical_path = alternative_canonical + + # If path doesn't exist and contains '_deps/', try resolving from build root + if not os.path.exists( + canonical_path) and '_deps/' in header_path: + # Extract the part starting from '_deps/' + parts = header_path.split('_deps/') + if len(parts) >= 2: + # Find build root (go up from cpp/dependency_scan to cpp/build) + build_root = self.build_dir + alternative_path = build_root / '_deps' / parts[-1] + alternative_canonical = os.path.realpath( + str(alternative_path)) + + if os.path.exists(alternative_canonical): + canonical_path = alternative_canonical + + # If path doesn't exist, try searching for it within build directory + # This handles cases like nvshmem-build/ or other CMake ExternalProject paths + if not os.path.exists(canonical_path) and not os.path.isabs( + header_path): + # Extract base filename to search for + basename = os.path.basename(header_path) + # Try to find the file within the build directory + import subprocess + try: + result = subprocess.run([ + 'find', + str(self.build_dir), '-name', basename, '-type', 'f' + ], + capture_output=True, + text=True, + timeout=5) + if result.returncode == 0 and result.stdout.strip(): + matches = 
result.stdout.strip().split('\n') + # Try to find match with similar relative path structure + for match in matches: + if header_path in match or match.endswith( + header_path): + canonical_path = os.path.realpath(match) + break + # If no exact match, use first match + if not os.path.exists(canonical_path) and matches: + canonical_path = os.path.realpath(matches[0]) + except Exception: + pass + + # Include all headers (even non-existent) for complete coverage + artifacts.append( + Artifact(path=canonical_path, + type='header', + source=str(d_file), + context_dir=str(context_dir), + metadata={ + 'original_path': original_header_path, + 'path_exists': os.path.exists(canonical_path) + })) + except Exception: + continue + + return artifacts + + def _parse_link_file(self, link_file: Path) -> List[Artifact]: + """ + Parse CMake link.txt file to extract library dependencies. + + Algorithm: + 1. Read file content (single line linker command) + 2. Split by whitespace + 3. Extract: + a) -l flags (e.g., -lpthread) + b) Absolute library paths (*.a, *.so) + c) @response.rsp files → recursively expand + d) CMakeFiles linker artifacts with embedded -Wl flags (special handling) + 4. Deduplicate and return + + CMakeFiles Linker Artifact Extraction: + CMake generates special linker artifacts in CMakeFiles directories that + encode library dependencies in the path itself. + + Pattern: /path/CMakeFiles/foo.dir/-Wl,-soname,libtest.so.1 + + These paths contain embedded linker flags (-Wl,-soname) that specify + the library's soname. The parser extracts the library name (libtest.so.1), + strips the 'lib' prefix, and converts it to a linker flag (-ltest). + + This enables proper dependency mapping for internal build artifacts that + would otherwise be unmapped. + + Example link.txt: + ``` + /usr/bin/c++ ... -lpthread -ldl /path/to/libfoo.a @response.rsp + ``` + """ + + artifacts = [] + + try: + content = link_file.read_text(encoding='utf-8', errors='ignore') + except Exception: + return artifacts + + tokens = content.split() + context_dir = link_file.parent + + for token in tokens: + # Handle response files (@response.rsp) + if token.startswith("@"): + rsp_file = Path(context_dir) / token[1:] + if rsp_file.exists(): + artifacts.extend(self._parse_link_file(rsp_file)) + continue + + # Handle CMakeFiles linker artifacts with embedded -Wl flags + # Pattern: /path/CMakeFiles/foo.dir/-Wl,-soname,libbar.so + # These encode library dependencies in the path itself + if '/CMakeFiles/' in token and '/-Wl,' in token: + # Extract library name from -Wl,-soname,libfoo.so + match = re.search(r'-Wl,-soname,(.+)$', token) + if match: + lib_name = match.group(1) + # Add as linker flag artifact for pattern matching + artifacts.append( + Artifact( + path= + f"-l{lib_name.replace('lib', '').split('.')[0]}", + type='library', + source=str(link_file), + context_dir=str(context_dir), + metadata={ + 'linker_flag': True, + 'cmake_linker_artifact': True, + 'original_token': token, + 'library_name': lib_name + })) + continue + + # Handle -l flags + if token.startswith("-l"): + artifacts.append( + Artifact(path=token, + type='library', + source=str(link_file), + context_dir=str(context_dir), + metadata={'linker_flag': True})) + continue + + # Handle absolute library paths + if token.endswith((".a", ".so")) or ".so." 
in token: + # Resolve relative paths + if not os.path.isabs(token): + token = os.path.join(context_dir, token) + + try: + canonical_path = os.path.realpath(token) + # Include all library paths (even non-existent) for complete coverage + artifacts.append( + Artifact(path=canonical_path, + type='library', + source=str(link_file), + context_dir=str(context_dir), + metadata={ + 'static': token.endswith('.a'), + 'path_exists': + os.path.exists(canonical_path) + })) + except Exception: + continue + + return artifacts + + def _scan_wheel(self, wheel_file: Path) -> List[Artifact]: + """ + Extract wheel and scan for binary dependencies (.so files). + + Algorithm: + 1. Create temp directory + 2. Extract wheel (ZIP format) + 3. Find all *.so files + 4. For each .so: + a) Run readelf -d to get NEEDED entries + b) Extract required library names + 5. Cleanup temp directory + 6. Return binary artifacts with NEEDED metadata + + Example: + tensorrt_llm-0.1.0-py3-none-any.whl contains: + - tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so + - Uses: libcudart.so.12, libnvinfer.so.10, libstdc++.so.6 + """ + artifacts = [] + + # Create temp directory for extraction + with tempfile.TemporaryDirectory() as temp_dir: + try: + # Extract wheel + with zipfile.ZipFile(wheel_file, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + # Find all .so files + temp_path = Path(temp_dir) + so_files = list(temp_path.rglob("*.so")) + list( + temp_path.rglob("*.so.*")) + + for so_file in so_files: + # Get NEEDED entries with readelf + needed_libs = self._get_needed_libraries(so_file) + + # Create artifact for the .so file itself + artifacts.append( + Artifact(path=str(so_file.relative_to(temp_path)), + type='binary', + source=str(wheel_file), + metadata={ + 'wheel': wheel_file.name, + 'needed': needed_libs + })) + + # Create artifacts for NEEDED libraries + for needed_lib in needed_libs: + artifacts.append( + Artifact(path=needed_lib, + type='library', + source=str(wheel_file), + metadata={ + 'from_binary': + str(so_file.relative_to(temp_path)), + 'dynamic_dependency': + True + })) + + except Exception: + pass + + return artifacts + + @staticmethod + def _get_needed_libraries(binary_path: Path) -> List[str]: + """ + Extract NEEDED entries from ELF binary using readelf. + + Algorithm: + 1. Execute: readelf -d + 2. Parse output for lines containing "(NEEDED)" + 3. Extract library names from "Shared library: [libfoo.so]" + 4. Return list of library names + + Example output: + ``` + 0x0000000000000001 (NEEDED) Shared library: [libcudart.so.12] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + ``` + """ + needed = [] + + try: + result = subprocess.run( + ["readelf", "-d", str(binary_path)], + capture_output=True, + text=True, + timeout=10) + + if result.returncode == 0: + for line in result.stdout.split("\n"): + if "(NEEDED)" in line and "Shared library:" in line: + # Extract library name between brackets + match = re.search(r'\[([^\]]+)\]', line) + if match: + needed.append(match.group(1)) + + except Exception: + pass + + return needed + + +# ============================================================================ +# MODULE 4: PatternMatcher (FALLBACK) +# ============================================================================ + + +class PatternMatcher: + """ + Resolves artifacts using YAML files from dependencies/ directory (FALLBACK strategy for non-dpkg packages). + + Provides 3-tier resolution strategy: + 1. Exact pattern matching (basename_matches and linker_flags_matches) + 2. 
Path matching (directory_matches - rightmost match wins) + 3. Generic library name inference (fallback) + + YAML files are loaded from dependencies/ directory: + - Individual dependency files (e.g., tensorrt-llm.yml) + - Files with dependencies: list format (e.g., base.yml, cuda.yml) + - All *.yml files except those starting with '_' + """ + + def __init__(self, metadata_dir: Path): + """ + Initialize PatternMatcher by loading YAML files from dependencies/ directory. + + Args: + metadata_dir: Path to directory containing YAML dependency files + """ + self.pattern_mappings: Dict[str, str] = {} + self.path_aliases: Dict[str, str] = {} + self.known_names: Set[str] = set() # Track all known dependency names + self._schema = None + self._duplicate_warnings: Set[str] = set() + + # Vendor directory magic strings (industry-standard patterns) + self.vendor_patterns = [ + '3rdparty/', 'third-party/', 'thirdparty/', 'third_party/', + 'external/', 'externals/', 'vendor/', 'vendored/', 'deps/' + ] + + # Load schema if available + schema_file = metadata_dir / "_schema.yml" + if schema_file.exists() and JSONSCHEMA_AVAILABLE: + with open(schema_file, 'r') as f: + self._schema = yaml.safe_load(f) + + # Load all YAML files + self._load_yaml_files(metadata_dir) + + def _load_yaml_files(self, metadata_dir: Path): + """ + Load all YAML files from dependencies/ directory. + + Algorithm: + 1. Find all *.yml files (except those starting with '_') + 2. Load each file and validate against schema + 3. Handle two formats: + - Individual dependency files (with name, basename_matches, etc.) + - Files with dependencies: list format (e.g., base.yml, cuda.yml) + 4. Merge all basename_matches/linker_flags_matches into pattern_mappings + 5. Merge all directory_matches into path_aliases + 6. Warn about validation errors and duplicates + """ + yaml_files = sorted([ + f for f in metadata_dir.glob("*.yml") if not f.name.startswith("_") + ]) + + for yaml_file in yaml_files: + try: + with open(yaml_file, 'r') as f: + data = yaml.safe_load(f) + + # Handle files with dependencies list (e.g., dpkg.yml, cuda.yml) + if "dependencies" in data and isinstance( + data["dependencies"], list): + for dep_data in data["dependencies"]: + self._process_dependency(dep_data, yaml_file) + # Handle individual dependency files + elif "name" in data: + self._process_dependency(data, yaml_file) + else: + print( + f"Warning: Skipping {yaml_file.name} - unrecognized format", + file=sys.stderr) + + except yaml.YAMLError as e: + print(f"Warning: Failed to parse {yaml_file.name}: {e}", + file=sys.stderr) + except Exception as e: + print(f"Warning: Error loading {yaml_file.name}: {e}", + file=sys.stderr) + + def _process_dependency(self, dep_data: Dict[str, Any], source_file: Path): + """ + Process a single dependency definition and merge into internal structures. 
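+
+        basename_matches and linker_flags_matches are merged into pattern_mappings,
+        directory_matches into path_aliases, and every dependency name is recorded in
+        known_names for vendor-boundary checks. Duplicate patterns emit a warning and
+        the most recently loaded definition wins.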
+ + Args: + dep_data: Dictionary containing dependency definition + source_file: Path to YAML file being processed (for error messages) + """ + # Validate against schema if available + if self._schema and JSONSCHEMA_AVAILABLE: + try: + validate(instance=dep_data, schema=self._schema) + except ValidationError as e: + print( + f"Warning: Validation error in {source_file.name}: {e.message}", + file=sys.stderr) + # Continue processing despite validation errors + + dependency_name = dep_data.get("name") + if not dependency_name: + print(f"Warning: Missing 'name' field in {source_file.name}", + file=sys.stderr) + return + + # Merge basename_matches into pattern_mappings + basename_matches = dep_data.get("basename_matches", []) + for pattern in basename_matches: + if pattern in self.pattern_mappings and pattern not in self._duplicate_warnings: + print( + f"Warning: Duplicate basename match '{pattern}' found in {source_file.name} " + f"(previously mapped to '{self.pattern_mappings[pattern]}', now '{dependency_name}')", + file=sys.stderr) + self._duplicate_warnings.add(pattern) + self.pattern_mappings[pattern] = dependency_name + + # Merge linker_flags_matches into pattern_mappings + linker_flags_matches = dep_data.get("linker_flags_matches", []) + for flag in linker_flags_matches: + if flag in self.pattern_mappings and flag not in self._duplicate_warnings: + print( + f"Warning: Duplicate linker flag '{flag}' found in {source_file.name} " + f"(previously mapped to '{self.pattern_mappings[flag]}', now '{dependency_name}')", + file=sys.stderr) + self._duplicate_warnings.add(flag) + self.pattern_mappings[flag] = dependency_name + + # Merge directory_matches into path_aliases + directory_matches = dep_data.get("directory_matches", []) + for component in directory_matches: + if component in self.path_aliases and component not in self._duplicate_warnings: + print( + f"Warning: Duplicate path component '{component}' found in {source_file.name} " + f"(previously mapped to '{self.path_aliases[component]}', now '{dependency_name}')", + file=sys.stderr) + self._duplicate_warnings.add(component) + self.path_aliases[component] = dependency_name + + # Track known dependency names (for nested vendor detection) + self.known_names.add(dependency_name.lower()) + for component in directory_matches: + self.known_names.add(component.lower()) + + def match(self, artifact: Artifact) -> Optional[Mapping]: + """ + Match artifact using 3-tier strategy. + + Algorithm: + 1. Try pattern matching (basename_matches and linker_flags_matches - exact match only - highest confidence) + 2. Try path matching (directory_matches - rightmost directory wins) + 3. Try generic library name inference (lowest confidence) + 4. Return first match or None + """ + # Strategy 1: Pattern matching (exact match only) + result = self._match_patterns(artifact) + if result: + return result + + # Strategy 2: Path matching (directory_matches) + result = self._match_path_alias(artifact) + if result: + return result + + # Strategy 3: Generic library name inference (fallback) + result = self._match_generic_library(artifact) + if result: + return result + + return None + + def _match_patterns(self, artifact: Artifact) -> Optional[Mapping]: + """ + Match using pattern_mappings dictionary (exact match only). + + Only performs exact matching against basename_matches from YAML files. + Substring matching has been removed to prevent false positives. + + Algorithm: + 1. Try exact match on basename (e.g., "libcudart.so.12") + 2. 
Try exact match on full path (e.g., "-lpthread") + 3. Return mapped dependency with HIGH confidence + + Examples: + -lpthread → libc6 (exact basename match) + libcudart.so.12 → cuda-cudart-12 (exact basename match) + + Note: For partial path matching, use directory_matches in YAML files. + Directory matches work on whole directory names (e.g., "fmt/" in path). + """ + basename = os.path.basename(artifact.path) + + # Try exact match on basename + if basename in self.pattern_mappings: + return Mapping(artifact=artifact, + dependency=self.pattern_mappings[basename], + confidence='high', + strategy='exact_pattern_match', + metadata={'matched_key': basename}) + + # Try exact match on full path (for -l flags) + if artifact.path in self.pattern_mappings: + return Mapping(artifact=artifact, + dependency=self.pattern_mappings[artifact.path], + confidence='high', + strategy='exact_pattern_match', + metadata={'matched_key': artifact.path}) + + # Substring matching removed - too high risk for false positives + # Use directory_matches instead for safe partial path matching + + return None + + def _match_path_alias(self, artifact: Artifact) -> Optional[Mapping]: + """ + Match using path_aliases (supports single and multi-directory patterns). + + Algorithm: + 1. Split path by '/' to get directory components (filter empty strings) + 2. For each pattern in path_aliases: + a. Normalize pattern (strip leading/trailing slashes) + b. Split pattern into components + c. Search for exact consecutive component sequence in path + d. Rightmost match wins + 3. Return first match with MEDIUM confidence + + Examples: + Single component: + Pattern: "pytorch" + /foo/bar/pytorch/include/torch/torch.h → pytorch (matches "pytorch") + + Multi-directory: + Pattern: "foo/bar" + /home/foo/bar/file.h → foo/bar (matches consecutive "foo", "bar") + /home/foobar/file.h → NO MATCH ("foobar" != ["foo", "bar"]) + + Rightmost wins: + Pattern: "foo/bar" + /foo/bar/baz/foo/bar/qux.h → matches at rightmost position + """ + # Split path into components (filter out empty strings from leading slash) + path_components = [c for c in artifact.path.split('/') if c] + + best_match = None + best_position = -1 # Rightmost wins (highest position) + + for pattern, dependency in self.path_aliases.items(): + # Normalize pattern: strip slashes, split into components + pattern_normalized = pattern.strip('/') + + # Handle empty pattern after stripping + if not pattern_normalized: + continue + + pattern_components = pattern_normalized.split('/') + pattern_len = len(pattern_components) + + # Search for exact consecutive sequence match using sliding window + # Iterate through all possible positions in the path + for i in range(len(path_components) - pattern_len + 1): + # Check if pattern_components match exactly at position i + if path_components[i:i + pattern_len] == pattern_components: + # Found exact match at position i + # Keep rightmost match (highest i value wins) + if i > best_position: + best_position = i + best_match = Mapping(artifact=artifact, + dependency=dependency, + confidence='medium', + strategy='path_alias', + metadata={ + 'matched_pattern': + pattern, + 'matched_sequence': + '/'.join(pattern_components), + 'position': + i + }) + + return best_match + + def _match_generic_library(self, artifact: Artifact) -> Optional[Mapping]: + """ + Generic library name inference (FALLBACK with LOW confidence). + + Algorithm: + 1. Check if library type + 2. Extract basename + 3. Strip lib prefix and .so/.a suffix + 4. 
Return as dependency with LOW confidence + + Examples: + libfoobar.so → foobar (low confidence) + libtest.so.1 → test (low confidence) + """ + if artifact.type != 'library': + return None + + basename = os.path.basename(artifact.path) + + # Try to extract library name + match = re.match(r'^lib([a-zA-Z0-9_-]+)\.(?:so|a)(?:\.\d+)*$', basename) + if match: + return Mapping(artifact=artifact, + dependency=match.group(1), + confidence='low', + strategy='generic_library_inference', + metadata={'inferred_from': basename}) + + return None + + def extract_vendor_components(self, path: str) -> List[tuple]: + """ + Extract all vendor components from a path using magic strings. + + Args: + path: Artifact path to scan + + Returns: + List of (pattern, component) tuples for each vendor boundary found + + Example: + "/3rdparty/xgrammar/3rdparty/picojson/file.h" → + [("3rdparty/", "xgrammar"), ("3rdparty/", "picojson")] + """ + components = [] + path_lower = path.lower() + + for pattern in self.vendor_patterns: + idx = 0 + while True: + idx = path_lower.find(pattern, idx) + if idx == -1: + break + + # Extract component name after the pattern + start = idx + len(pattern) + end = path_lower.find('/', start) + if end == -1: + end = len(path_lower) + + component = path[start:end] # Use original case + if component: # Skip empty components + components.append((pattern, component)) + + idx = end + + return components + + def find_unknown_vendor_boundaries(self, + artifact: Artifact) -> Optional[str]: + """ + Check if artifact contains any unknown vendor boundaries. + + Unified vendor boundary policy: ANY component following a vendor pattern + (3rdparty/, vendor/, external/, etc.) MUST be in the known allowlist. + + Returns: + Name of unknown vendor boundary component, or None if all are known + + Examples: + Path: "/3rdparty/xgrammar/src/file.h" + Components: ["xgrammar"] + If "xgrammar" is known → returns None (OK) + + Path: "/3rdparty/unknown-lib/file.h" + Components: ["unknown-lib"] + If "unknown-lib" is NOT known → returns "unknown-lib" (REJECT) + + Path: "/3rdparty/xgrammar/3rdparty/picojson/file.h" + Components: ["xgrammar", "picojson"] + If "picojson" is NOT known → returns "picojson" (REJECT) + """ + components = self.extract_vendor_components(artifact.path) + + # Check ALL vendor boundaries (rightmost has priority for detection) + for pattern, component in reversed(components): + component_lower = component.lower() + if component_lower not in self.known_names: + return component + + return None + + +# ============================================================================ +# MODULE 5: OutputGenerator +# ============================================================================ + + +class OutputGenerator: + """ + Generates YAML reports for known and unknown artifacts. + + Output files: + - known.yml: Successfully mapped artifacts grouped by dependency (paths only) + - unknown.yml: Unmapped artifacts requiring pattern additions (paths only) + """ + + @staticmethod + def generate(mappings: List[Mapping], artifacts: List[Artifact], + output_dir: Path): + """ + Generate known.yml and unknown.yml with simplified structure (paths only). + + Algorithm: + 1. Create output directory if needed + 2. Separate mapped vs unmapped artifacts + 3. Group known artifacts by dependency (dict of lists) + 4. Sort dependencies by count (most artifacts first) + 5. 
Write YAML files with simplified structure + """ + output_dir.mkdir(parents=True, exist_ok=True) + + # Separate known vs unknown mappings + # Artifacts mapped to "unknown" should be treated as truly unknown + known_mappings = [m for m in mappings if m.dependency != 'unknown'] + unknown_mappings = [m for m in mappings if m.dependency == 'unknown'] + + # Build mapping lookup (only for truly known) + mapped_paths = {m.artifact.path for m in known_mappings} + + # Known artifacts - simplified structure (dependency -> list of paths) + known = {} + for mapping in known_mappings: + dep = mapping.dependency + if dep not in known: + known[dep] = [] + known[dep].append(mapping.artifact.path) + + # Sort dependencies by count (most artifacts first) + known_sorted = dict( + sorted(known.items(), key=lambda x: len(x[1]), reverse=True)) + + # Unknown artifacts - simplified structure (flat list of paths) + unknown_paths = [] + + # Add artifacts that weren't mapped at all + for artifact in artifacts: + if artifact.path not in mapped_paths and not any( + m.artifact.path == artifact.path for m in unknown_mappings): + unknown_paths.append(artifact.path) + + # Add artifacts mapped to "unknown" + for mapping in unknown_mappings: + unknown_paths.append(mapping.artifact.path) + + # Write outputs + known_file = output_dir / 'known.yml' + unknown_file = output_dir / 'unknown.yml' + + with open(known_file, 'w') as f: + yaml.dump( + { + 'summary': { + 'total_artifacts': + len(artifacts), + 'mapped': + len(known_mappings), + 'unmapped': + len(unknown_paths), + 'coverage': + f"{len(known_mappings) / len(artifacts) * 100:.1f}%" + if artifacts else "0%", + 'unique_dependencies': + len(known) + }, + 'dependencies': known_sorted + }, + f, + default_flow_style=False, + sort_keys=False) + + with open(unknown_file, 'w') as f: + yaml.dump( + { + 'summary': { + 'count': + len(unknown_paths), + 'action_required': + 'Add patterns to YAML files in dependencies/ for these artifacts' + }, + 'artifacts': unknown_paths + }, + f, + default_flow_style=False, + sort_keys=False) + + # Generate path_issues.yml for non-existent paths + path_issues_file = output_dir / 'path_issues.yml' + non_existent_paths = [] + + for artifact in artifacts: + # Check if path_exists metadata is False + # Exclude libraries since they don't have meaningful original_path metadata + if (artifact.metadata + and not artifact.metadata.get('path_exists', True) + and artifact.type != 'library'): + non_existent_paths.append({ + 'resolved_path': + artifact.path, + 'type': + artifact.type, + 'source': + artifact.source, + 'd_file_path': + artifact.metadata.get('original_path', 'N/A') + }) + + with open(path_issues_file, 'w') as f: + yaml.dump( + { + 'summary': { + 'count': + len(non_existent_paths), + 'total_artifacts': + len(artifacts), + 'percentage': + f"{len(non_existent_paths) / len(artifacts) * 100:.1f}%" + if artifacts else "0%", + 'note': + 'These header paths were resolved from .d files but do not exist in the filesystem (libraries excluded)' + }, + 'non_existent_paths': non_existent_paths + }, + f, + default_flow_style=False, + sort_keys=False) + + return known_file, unknown_file + + +# ============================================================================ +# MODULE 6: Main Orchestration +# ============================================================================ + + +def validate_yaml_files(metadata_dir: Path) -> bool: + """ + Validate YAML files without running the scanner. 
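+
+    Each entry is validated against _schema.yml using jsonschema. Both individual
+    dependency files and files using the "dependencies:" list format are supported;
+    files whose names start with '_' are skipped.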
+ + Args: + metadata_dir: Path to dependencies directory + + Returns: + True if all files are valid, False otherwise + """ + print("=" * 80) + print("YAML Validation") + print("=" * 80) + print(f"Metadata directory: {metadata_dir}") + print() + + # Check if jsonschema is available + if not JSONSCHEMA_AVAILABLE: + print("Warning: jsonschema not installed, skipping validation", + file=sys.stderr) + print("Install with: pip install jsonschema") + return False + + # Load schema + schema_file = metadata_dir / "_schema.yml" + if not schema_file.exists(): + print(f"Error: Schema file not found: {schema_file}", file=sys.stderr) + return False + + with open(schema_file, 'r') as f: + schema = yaml.safe_load(f) + + # Validate all YAML files + yaml_files = sorted( + [f for f in metadata_dir.glob("*.yml") if not f.name.startswith("_")]) + total = 0 + valid = 0 + invalid = 0 + + for yaml_file in yaml_files: + try: + with open(yaml_file, 'r') as f: + data = yaml.safe_load(f) + + # Handle files with dependencies list format (e.g., base.yml, cuda.yml) + if "dependencies" in data and isinstance(data["dependencies"], + list): + for dep_data in data["dependencies"]: + total += 1 + try: + validate(instance=dep_data, schema=schema) + print( + f"✓ {yaml_file.name}:{dep_data.get('name', 'unknown')}" + ) + valid += 1 + except ValidationError as e: + print( + f"✗ {yaml_file.name}:{dep_data.get('name', 'unknown')}: {e.message}", + file=sys.stderr) + invalid += 1 + # Handle individual dependency files + elif "name" in data: + total += 1 + try: + validate(instance=data, schema=schema) + print(f"✓ {yaml_file.name}") + valid += 1 + except ValidationError as e: + print(f"✗ {yaml_file.name}: {e.message}", file=sys.stderr) + invalid += 1 + + except yaml.YAMLError as e: + print(f"✗ {yaml_file.name}: YAML parse error: {e}", file=sys.stderr) + invalid += 1 + except Exception as e: + print(f"✗ {yaml_file.name}: {e}", file=sys.stderr) + invalid += 1 + + print() + print("=" * 80) + print(f"Results: {valid}/{total} valid, {invalid}/{total} invalid") + print("=" * 80) + + return invalid == 0 + + +def main(): + """ + Main entry point for build artifact scanner. + + Algorithm: + 1. Parse command-line arguments + 2. Validate inputs (build-dir exists, dependencies/ exists) + 3. Collect artifacts using ArtifactCollector + 4. Resolve using DpkgResolver (PRIMARY) + 5. Resolve remaining using PatternMatcher (FALLBACK) + 6. Generate reports using OutputGenerator + 7. Print summary statistics + """ + parser = argparse.ArgumentParser( + description='Minimal Build Artifact Scanner for TensorRT-LLM', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Scan default build directory + python scan_build_artifacts.py + + # Scan custom build directory with custom output + python scan_build_artifacts.py --build-dir build/Release --output-dir scan_output/ + + # Validate YAML files without scanning + python scan_build_artifacts.py --validate + + # Use custom dependencies directory + python scan_build_artifacts.py --dependencies-dir custom_dependencies/ + """) + + parser.add_argument( + '--build-dir', + type=Path, + default=Path(__file__).parent.parent / 'build', + help= + 'Build directory to scan for C++ artifacts (default: ../build/). 
Note: wheels are in ../../build/' + ) + + parser.add_argument( + '--output-dir', + type=Path, + default=Path('scan_output'), + help='Output directory for reports (default: scan_output/)') + + parser.add_argument( + '--metadata-dir', + type=Path, + default=Path(__file__).parent / 'metadata', + help= + 'Path to metadata directory containing YAML files (default: ./metadata/)' + ) + + parser.add_argument('--validate', + action='store_true', + help='Validate YAML files without running scanner') + + args = parser.parse_args() + + # Handle --validate flag + if args.validate: + success = validate_yaml_files(args.metadata_dir) + sys.exit(0 if success else 1) + + # Validate inputs + if not args.build_dir.exists(): + print(f"Error: Build directory not found: {args.build_dir}", + file=sys.stderr) + sys.exit(1) + + if not args.metadata_dir.exists(): + print(f"Error: Metadata directory not found: {args.metadata_dir}", + file=sys.stderr) + sys.exit(1) + + print("=" * 80) + print("TensorRT-LLM Build Artifact Scanner") + print("=" * 80) + print(f"Build directory: {args.build_dir}") + print(f"Metadata directory: {args.metadata_dir}") + print(f"Output directory: {args.output_dir}") + print() + + # Step 1: Collect artifacts + print("[1/4] Collecting artifacts...") + collector = ArtifactCollector(args.build_dir) + artifacts = collector.collect_all() + print(f" Found {len(artifacts)} unique artifacts") + print(f" - Headers: {sum(1 for a in artifacts if a.type == 'header')}") + print( + f" - Libraries: {sum(1 for a in artifacts if a.type == 'library')}") + print(f" - Binaries: {sum(1 for a in artifacts if a.type == 'binary')}") + print() + + # Step 2: Resolve with dpkg (PRIMARY) + print("[2/4] Resolving with dpkg-query (PRIMARY strategy)...") + dpkg_resolver = DpkgResolver() + dpkg_mappings = [] + + for artifact in artifacts: + package = dpkg_resolver.get_package(artifact.path) + if package: + dpkg_mappings.append( + Mapping(artifact=artifact, + dependency=package, + confidence='high', + strategy='dpkg-query', + metadata={'dpkg_package': package})) + + print( + f" Resolved {len(dpkg_mappings)} artifacts via dpkg ({len(dpkg_mappings) / len(artifacts) * 100:.1f}%)" + ) + print() + + # Step 3: Resolve remaining with YAML patterns (FALLBACK) + print("[3/4] Resolving remaining with YAML patterns (FALLBACK strategy)...") + pattern_matcher = PatternMatcher(args.metadata_dir) + pattern_mappings = [] + + dpkg_resolved_paths = {m.artifact.path for m in dpkg_mappings} + remaining_artifacts = [ + a for a in artifacts if a.path not in dpkg_resolved_paths + ] + + for artifact in remaining_artifacts: + mapping = pattern_matcher.match(artifact) + if mapping: + # Check for unknown vendor boundaries BEFORE accepting the mapping + unknown_vendor = pattern_matcher.find_unknown_vendor_boundaries( + artifact) + if unknown_vendor: + # Artifact has unknown vendor boundary - treat as unknown + # Don't add to pattern_mappings (will be in unknown.yml) + print( + f" WARNING: Unknown vendor boundary '{unknown_vendor}' found in: {artifact.path}", + file=sys.stderr) + else: + pattern_mappings.append(mapping) + + print( + f" Resolved {len(pattern_mappings)} additional artifacts via patterns ({len(pattern_mappings) / len(artifacts) * 100:.1f}%)" + ) + print() + + # Step 4: Generate reports + print("[4/4] Generating reports...") + all_mappings = dpkg_mappings + pattern_mappings + known_file, unknown_file = OutputGenerator.generate(all_mappings, artifacts, + args.output_dir) + + print(f" Reports written to:") + print(f" - {known_file}") + 
print(f" - {unknown_file}") + print() + + # Summary + # Separate known vs unknown (artifacts mapped to "unknown" are treated as unknown) + known_mappings = [m for m in all_mappings if m.dependency != 'unknown'] + unknown_mappings = [m for m in all_mappings if m.dependency == 'unknown'] + + total_known = len(known_mappings) + total_unknown = len(artifacts) - total_known + coverage = (total_known / len(artifacts) * 100) if artifacts else 0 + + # Count dpkg/pattern strategies among known mappings only + dpkg_known = sum(1 for m in dpkg_mappings if m.dependency != 'unknown') + pattern_known = sum(1 for m in pattern_mappings + if m.dependency != 'unknown') + + print("=" * 80) + print("SUMMARY") + print("=" * 80) + print(f"Total artifacts: {len(artifacts)}") + print( + f" Mapped (dpkg): {dpkg_known} ({dpkg_known / len(artifacts) * 100:.1f}%)" + ) + print( + f" Mapped (patterns): {pattern_known} ({pattern_known / len(artifacts) * 100:.1f}%)" + ) + print( + f" Unknown: {total_unknown} ({total_unknown / len(artifacts) * 100:.1f}%)" + ) + print(f"Coverage: {coverage:.1f}%") + print() + + # Confidence breakdown (only for known mappings) + high_conf = sum(1 for m in known_mappings if m.confidence == 'high') + med_conf = sum(1 for m in known_mappings if m.confidence == 'medium') + low_conf = sum(1 for m in known_mappings if m.confidence == 'low') + + if known_mappings: + print("Confidence Distribution:") + print( + f" High: {high_conf} ({high_conf / len(known_mappings) * 100:.1f}%)" + ) + print( + f" Medium: {med_conf} ({med_conf / len(known_mappings) * 100:.1f}%)" + ) + print( + f" Low: {low_conf} ({low_conf / len(known_mappings) * 100:.1f}%)") + else: + print("Confidence Distribution:") + print(" High: 0") + print(" Medium: 0") + print(" Low: 0") + print() + + if total_unknown > 0: + print(f"ACTION REQUIRED: {total_unknown} artifacts unknown") + if len(unknown_mappings) > 0: + print( + f" {len(unknown_mappings)} artifacts matched generic fallback (need specific patterns)" + ) + print(f" Review {unknown_file}") + print(f" Add missing patterns to YAML files in {args.metadata_dir}") + print(f" Re-run scanner to improve coverage") + else: + print("SUCCESS: All artifacts mapped!") + + print("=" * 80) + + +if __name__ == '__main__': + main() diff --git a/cpp/dependency_scan/tests/test_scan_build_artifacts.py b/cpp/dependency_scan/tests/test_scan_build_artifacts.py new file mode 100644 index 00000000000..f9d47fdf35b --- /dev/null +++ b/cpp/dependency_scan/tests/test_scan_build_artifacts.py @@ -0,0 +1,1711 @@ +#!/usr/bin/env python3 +""" +Unit tests for scan_build_artifacts.py + +Tests all 5 modules: + 1. DpkgResolver + 2. ArtifactCollector + 3. PatternMatcher (YAML-based) + 4. OutputGenerator + 5. 
Main CLI + +Run with: python -m pytest test_scan_build_artifacts.py -v +""" + +# Import modules under test +import os +import sys +import zipfile +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest +import yaml + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from scan_build_artifacts import (Artifact, ArtifactCollector, DpkgResolver, + Mapping, OutputGenerator, PatternMatcher) + +# ============================================================================ +# Test Data Models +# ============================================================================ + + +def test_artifact_creation(): + """Test Artifact dataclass creation and serialization""" + artifact = Artifact(path="/usr/include/stdio.h", + type="header", + source="test.d", + context_dir="/build", + metadata={"test": "value"}) + + assert artifact.path == "/usr/include/stdio.h" + assert artifact.type == "header" + assert artifact.source == "test.d" + + # Test serialization + data = artifact.to_dict() + assert data['path'] == "/usr/include/stdio.h" + assert data['metadata']['test'] == "value" + + +def test_mapping_creation(): + """Test Mapping dataclass creation and serialization""" + artifact = Artifact(path="/usr/lib/libfoo.so", + type="library", + source="link.txt") + + mapping = Mapping(artifact=artifact, + dependency="foo", + confidence="high", + strategy="dpkg-query", + metadata={"test": "meta"}) + + assert mapping.dependency == "foo" + assert mapping.confidence == "high" + + # Test serialization + data = mapping.to_dict() + assert data['dependency'] == "foo" + assert data['artifact']['path'] == "/usr/lib/libfoo.so" + + +# ============================================================================ +# Test DpkgResolver +# ============================================================================ + + +class TestDpkgResolver: + """Test cases for DpkgResolver class""" + + def test_get_library_search_paths(self): + """Test _get_library_search_paths returns expected directories""" + resolver = DpkgResolver() + paths = resolver._lib_search_paths + + # Should contain standard paths + assert any("/lib/x86_64-linux-gnu" in p for p in paths) + assert any("/usr/lib/x86_64-linux-gnu" in p for p in paths) + + def test_find_library_path_pthread(self): + """Test find_library_path resolves -lpthread""" + resolver = DpkgResolver() + result = resolver.find_library_path("-lpthread") + + # Should find libpthread.so* in system paths + if result: # May not exist on all systems + assert "pthread" in result + assert result.endswith((".so", ".a")) or ".so." 
in result + + @patch('subprocess.run') + def test_get_package_success(self, mock_run): + """Test get_package successfully parses dpkg-query output""" + # Mock dpkg-query output + mock_run.return_value = Mock( + returncode=0, + stdout="libc6:amd64: /lib/x86_64-linux-gnu/libc.so.6\n") + + resolver = DpkgResolver() + package = resolver.get_package("/lib/x86_64-linux-gnu/libc.so.6") + + assert package == "libc6" + mock_run.assert_called_once() + + @patch('subprocess.run') + def test_get_package_not_found(self, mock_run): + """Test get_package returns None for non-existent file""" + # Mock dpkg-query failure + mock_run.return_value = Mock(returncode=1, stdout="") + + resolver = DpkgResolver() + package = resolver.get_package("/nonexistent/file.so") + + assert package is None + + @patch('subprocess.run') + def test_get_package_caching(self, mock_run): + """Test get_package caches results""" + mock_run.return_value = Mock( + returncode=0, + stdout="libc6:amd64: /lib/x86_64-linux-gnu/libc.so.6\n") + + resolver = DpkgResolver() + + # First call + pkg1 = resolver.get_package("/lib/x86_64-linux-gnu/libc.so.6") + # Second call (should use cache) + pkg2 = resolver.get_package("/lib/x86_64-linux-gnu/libc.so.6") + + assert pkg1 == pkg2 + # Should only call dpkg-query once + assert mock_run.call_count == 1 + + def test_normalize_cuda_package(self): + """Test _normalize_cuda_package removes version suffixes""" + resolver = DpkgResolver() + + # Should normalize CUDA packages + assert resolver._normalize_cuda_package("cuda-cccl-12-9") == "cuda-cccl" + assert resolver._normalize_cuda_package( + "cuda-cudart-dev-12-9") == "cuda-cudart-dev" + assert resolver._normalize_cuda_package( + "libcublas-dev-12-9") == "libcublas-dev" + + # Should NOT normalize non-CUDA packages + assert resolver._normalize_cuda_package("libc6") == "libc6" + assert resolver._normalize_cuda_package( + "python3-12-1") == "python3-12-1" + + @patch('subprocess.run') + def test_get_package_linker_flag(self, mock_run): + """Test get_package handles -l flags by resolving first""" + # First call: find_library_path (no mock needed, uses real filesystem) + # Second call: dpkg-query + mock_run.return_value = Mock( + returncode=0, + stdout="libc6:amd64: /lib/x86_64-linux-gnu/libpthread.so.0\n") + + resolver = DpkgResolver() + + # Mock find_library_path to return a known path + with patch.object(resolver, + 'find_library_path', + return_value='/lib/x86_64-linux-gnu/libpthread.so.0'): + package = resolver.get_package("-lpthread") + + # Should resolve to libc6 + if package: # May fail if system doesn't have dpkg + assert package == "libc6" + + +# ============================================================================ +# Test ArtifactCollector +# ============================================================================ + + +class TestArtifactCollector: + """Test cases for ArtifactCollector class""" + + def test_parse_d_file_basic(self, tmp_path): + """Test _parse_d_file parses basic D file""" + # Create test D file + d_file = tmp_path / "test.d" + d_file.write_text( + "build/foo.o: /usr/include/stdio.h /usr/include/stdlib.h\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_d_file(d_file) + + # Should find 2 headers (if they exist on system) + assert len(artifacts) >= 0 # May be 0 if headers don't exist + for artifact in artifacts: + assert artifact.type == "header" + assert artifact.source == str(d_file) + + def test_parse_d_file_line_continuations(self, tmp_path): + """Test _parse_d_file handles line 
continuations""" + # Create test D file with line continuations + d_file = tmp_path / "test.d" + d_file.write_text( + "build/foo.o: /usr/include/stdio.h \\\n /usr/include/stdlib.h\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_d_file(d_file) + + # Should handle continuations correctly + assert isinstance(artifacts, list) + + def test_parse_d_file_relative_paths(self, tmp_path): + """Test _parse_d_file resolves relative paths""" + # Create test header + include_dir = tmp_path / "include" + include_dir.mkdir() + test_header = include_dir / "test.h" + test_header.write_text("// test header\n") + + # Create D file with relative path + d_file = tmp_path / "build" / "test.d" + d_file.parent.mkdir(parents=True, exist_ok=True) + d_file.write_text(f"build/foo.o: ../include/test.h\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_d_file(d_file) + + # Should resolve relative path + if artifacts: + assert any("test.h" in a.path for a in artifacts) + + def test_parse_d_file_trailing_colons(self, tmp_path): + """Test _parse_d_file strips trailing colons from malformed paths""" + # Create test D file with trailing colons (malformed .d file) + d_file = tmp_path / "test.d" + d_file.write_text( + "build/foo.o: /usr/include/stdio.h: /usr/include/stdlib.h:\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_d_file(d_file) + + # Should strip trailing colons from paths + for artifact in artifacts: + assert not artifact.path.endswith(':') + assert artifact.path # No empty strings + + def test_parse_link_file_basic(self, tmp_path): + """Test _parse_link_file parses link.txt""" + # Create test link file + link_file = tmp_path / "link.txt" + link_file.write_text( + "/usr/bin/c++ -o foo -lpthread -ldl /path/to/libfoo.a\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_link_file(link_file) + + # Should find -l flags + assert any(a.path == "-lpthread" for a in artifacts) + assert any(a.path == "-ldl" for a in artifacts) + + def test_parse_link_file_response_files(self, tmp_path): + """Test _parse_link_file handles @response.rsp recursively""" + # Create response file + rsp_file = tmp_path / "response.rsp" + rsp_file.write_text("-lpthread -ldl\n") + + # Create link file referencing response file + link_file = tmp_path / "link.txt" + link_file.write_text(f"/usr/bin/c++ -o foo @response.rsp\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_link_file(link_file) + + # Should recursively expand response file + assert any(a.path == "-lpthread" for a in artifacts) + + def test_parse_link_file_static_libraries(self, tmp_path): + """Test _parse_link_file handles .a files""" + # Create static library + static_lib = tmp_path / "libtest.a" + static_lib.write_text("fake static library\n") + + # Create link file + link_file = tmp_path / "link.txt" + link_file.write_text(f"/usr/bin/c++ -o foo {static_lib}\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_link_file(link_file) + + # Should find static library + assert any("libtest.a" in a.path for a in artifacts) + if artifacts: + assert artifacts[0].metadata.get('static') == True + + def test_parse_link_file_cmake_linker_artifacts(self, tmp_path): + """Test _parse_link_file handles CMakeFiles artifacts with -Wl,-soname""" + # Create link file with CMakeFiles linker artifact + link_file = tmp_path / "link.txt" + link_file.write_text( + "/usr/bin/c++ -o foo /build/CMakeFiles/foo.dir/-Wl,-soname,libtest.so.1\n" + ) 
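+
+        # The CMakeFiles token above embeds the soname in the path; the parser is
+        # expected to strip the "lib" prefix and version suffix from "libtest.so.1"
+        # and report it as the linker flag "-ltest".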
+ + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_link_file(link_file) + + # Should extract library name from CMakeFiles artifact + linker_artifacts = [ + a for a in artifacts if a.metadata.get('cmake_linker_artifact') + ] + assert len(linker_artifacts) == 1 + + artifact = linker_artifacts[0] + assert artifact.path == "-ltest" # Extracted from libtest.so.1 + assert artifact.type == "library" + assert artifact.metadata['linker_flag'] == True + assert artifact.metadata['cmake_linker_artifact'] == True + assert artifact.metadata['library_name'] == "libtest.so.1" + + def test_get_needed_libraries(self, tmp_path): + """Test _get_needed_libraries extracts NEEDED entries""" + # Mock readelf output + mock_output = """ +Dynamic section at offset 0x1000 contains 20 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libcudart.so.12] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x000000000000000e (SONAME) Library soname: [libtest.so] + """ + + with patch('subprocess.run') as mock_run: + mock_run.return_value = Mock(returncode=0, stdout=mock_output) + + needed = ArtifactCollector._get_needed_libraries(tmp_path / + "fake.so") + + assert "libcudart.so.12" in needed + assert "libstdc++.so.6" in needed + + def test_scan_wheel(self, tmp_path): + """Test _scan_wheel extracts and scans wheel""" + # Create fake wheel + wheel_file = tmp_path / "test-1.0-py3-none-any.whl" + + with zipfile.ZipFile(wheel_file, 'w') as zf: + # Add fake .so file + zf.writestr("tensorrt_llm/libs/libtest.so", b"fake binary") + + collector = ArtifactCollector(tmp_path) + + with patch.object(ArtifactCollector, + '_get_needed_libraries', + return_value=['libfoo.so']): + artifacts = collector._scan_wheel(wheel_file) + + # Should find binary artifact + assert any(a.type == "binary" for a in artifacts) + assert any(a.type == "library" and a.path == "libfoo.so" + for a in artifacts) + + def test_collect_all_deduplication(self, tmp_path): + """Test collect_all deduplicates artifacts""" + # Create two D files with overlapping headers + d1 = tmp_path / "test1.d" + d2 = tmp_path / "test2.d" + + d1.write_text("build/foo.o: /usr/include/stdio.h\n") + d2.write_text("build/bar.o: /usr/include/stdio.h\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector.collect_all() + + # Should deduplicate by path + paths = [a.path for a in artifacts] + assert len(paths) == len(set(paths)) # No duplicates + + def test_parse_d_file_relative_path_resolution(self, tmp_path): + """Test _parse_d_file resolves paths relative to build_dir, not .d file's parent + + This test verifies the fix for the critical bug where relative paths in CMake .d files + were being resolved from the wrong directory. CMake .d files use paths relative to the + build root, not the .d file's directory. 
+ """ + # Create a realistic directory structure: + # tmp_path (build_dir) + # ├── CMakeFiles/ + # │ └── deeply/ + # │ └── nested/ + # │ └── target.dir/ + # │ └── test.d (contains relative paths) + # └── tensorrt_llm/ + # └── runtime/ + # └── layerProfiler.h (target file) + + # Create the target header file + runtime_dir = tmp_path / "tensorrt_llm" / "runtime" + runtime_dir.mkdir(parents=True) + target_header = runtime_dir / "layerProfiler.h" + target_header.write_text("// TensorRT-LLM runtime header\n") + + # Create deeply nested .d file directory + d_file_dir = tmp_path / "CMakeFiles" / "deeply" / "nested" / "target.dir" + d_file_dir.mkdir(parents=True, exist_ok=True) + d_file = d_file_dir / "test.d" + + # Write .d file with relative path from BUILD ROOT, not from .d file location + # From build root: tensorrt_llm/runtime/layerProfiler.h + # From .d file location: ../../../../../tensorrt_llm/runtime/layerProfiler.h would be wrong + d_file.write_text("build/foo.o: tensorrt_llm/runtime/layerProfiler.h\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_d_file(d_file) + + # Should resolve relative path using build_dir as context, not d_file.parent + assert len(artifacts) == 1, f"Expected 1 artifact, got {len(artifacts)}" + artifact = artifacts[0] + + # Verify the path was resolved correctly + assert "layerProfiler.h" in artifact.path + assert artifact.type == "header" + assert artifact.source == str(d_file) + + # Verify context_dir is build_dir, not d_file.parent + assert artifact.context_dir == str(tmp_path) + + # Verify path exists (resolved correctly) + assert artifact.metadata.get('path_exists') is True + + # Verify the resolved path points to the actual file + canonical_target = os.path.realpath(str(target_header)) + assert artifact.path == canonical_target + + def test_parse_d_file_build_root_context(self, tmp_path): + """Test that context_dir used for path resolution is self.build_dir + + This test verifies that regardless of where the .d file is located in the build tree, + all relative paths are resolved from the same build root directory. 
+ """ + # Create headers at different locations + header1_dir = tmp_path / "include" + header1_dir.mkdir() + header1 = header1_dir / "test1.h" + header1.write_text("// test1 header\n") + + header2_dir = tmp_path / "src" / "common" + header2_dir.mkdir(parents=True) + header2 = header2_dir / "test2.h" + header2.write_text("// test2 header\n") + + # Create .d files at different depths + d_file1 = tmp_path / "shallow.d" + d_file1.write_text("build/obj1.o: include/test1.h\n") + + d_file2_dir = tmp_path / "CMakeFiles" / "deep" / "nested" + d_file2_dir.mkdir(parents=True) + d_file2 = d_file2_dir / "deep.d" + d_file2.write_text("build/obj2.o: src/common/test2.h\n") + + collector = ArtifactCollector(tmp_path) + + # Parse both .d files + artifacts1 = collector._parse_d_file(d_file1) + artifacts2 = collector._parse_d_file(d_file2) + + # Both should resolve successfully + assert len(artifacts1) == 1 + assert len(artifacts2) == 1 + + # Both should use build_dir as context_dir + assert artifacts1[0].context_dir == str(tmp_path) + assert artifacts2[0].context_dir == str(tmp_path) + + # Both should resolve to correct absolute paths + assert artifacts1[0].metadata.get('path_exists') is True + assert artifacts2[0].metadata.get('path_exists') is True + + # Verify correct files were found + assert "test1.h" in artifacts1[0].path + assert "test2.h" in artifacts2[0].path + + def test_parse_d_file_cross_project_paths(self, tmp_path): + """Test _parse_d_file handles paths that reference directories outside the build + + This test verifies that paths referencing parent directories are resolved correctly + relative to build root, and that non-existent paths are properly marked. + """ + # Create a project structure with 3rdparty dependencies: + # tmp_path (build_dir) + # ├── CMakeFiles/ + # │ └── target.dir/ + # │ └── test.d + # And simulate references to: + # ../../../../triton_backend/src/model.h (doesn't exist) + # ../../../tensorrt_llm/common/logger.h (exists) + + # Create an existing header in a sibling directory structure + parent_dir = tmp_path.parent + trtllm_dir = parent_dir / "tensorrt_llm" / "common" + trtllm_dir.mkdir(parents=True, exist_ok=True) + existing_header = trtllm_dir / "logger.h" + existing_header.write_text("// Logger header\n") + + # Create .d file with both existing and non-existing cross-project paths + d_file_dir = tmp_path / "CMakeFiles" / "target.dir" + d_file_dir.mkdir(parents=True) + d_file = d_file_dir / "test.d" + + # These paths are relative to BUILD ROOT + d_file.write_text( + "build/foo.o: ../../../../triton_backend/src/model.h ../tensorrt_llm/common/logger.h\n" + ) + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_d_file(d_file) + + # Should process both paths + assert len(artifacts) == 2 + + # Find the artifacts + triton_artifact = None + logger_artifact = None + + for artifact in artifacts: + if "triton_backend" in artifact.path: + triton_artifact = artifact + elif "logger.h" in artifact.path: + logger_artifact = artifact + + # Verify triton_backend artifact (non-existent) + assert triton_artifact is not None + assert triton_artifact.type == "header" + assert triton_artifact.context_dir == str(tmp_path) + assert triton_artifact.metadata.get('path_exists') is False + + # Verify logger artifact (exists) + assert logger_artifact is not None + assert logger_artifact.type == "header" + assert logger_artifact.context_dir == str(tmp_path) + assert logger_artifact.metadata.get('path_exists') is True + + # Verify logger resolved to correct absolute path + 
canonical_existing = os.path.realpath(str(existing_header)) + assert logger_artifact.path == canonical_existing + + def test_parse_d_file_prevents_false_positives(self, tmp_path): + """Test that correct path resolution prevents false positives in dependency classification + + This test demonstrates the practical impact of the bug fix: when paths are resolved + correctly from build_dir, dependencies are classified accurately. + """ + # Scenario: A .d file deep in the build tree references a 3rdparty dependency + # OLD BUG: Would resolve from .d file's parent, potentially missing the file + # NEW FIX: Resolves from build root, finds the file correctly + + # Create a 3rdparty dependency structure + third_party_dir = tmp_path / "3rdparty" / "cutlass" / "include" + third_party_dir.mkdir(parents=True) + cutlass_header = third_party_dir / "cutlass.h" + cutlass_header.write_text("// CUTLASS header\n") + + # Create deeply nested .d file (simulating CMake's structure) + d_file_dir = tmp_path / "CMakeFiles" / "tensorrt_llm.dir" / "batch_manager" / "llm_request.cpp.o.d" + d_file_dir.parent.mkdir(parents=True, exist_ok=True) + d_file = d_file_dir + + # .d file contains relative path from BUILD ROOT to cutlass + # OLD BUG: Would try to resolve from .d file's parent → incorrect path + # NEW FIX: Resolves from build root → correct path + d_file.write_text("build/obj.o: 3rdparty/cutlass/include/cutlass.h\n") + + collector = ArtifactCollector(tmp_path) + artifacts = collector._parse_d_file(d_file) + + # Should successfully resolve the path + assert len(artifacts) == 1 + artifact = artifacts[0] + + # Verify correct resolution + assert artifact.metadata.get('path_exists') is True + assert "cutlass.h" in artifact.path + + # Verify it resolved to the actual file + canonical_cutlass = os.path.realpath(str(cutlass_header)) + assert artifact.path == canonical_cutlass + + # This artifact can now be correctly matched to cutlass dependency + # (with correct path resolution, pattern matching will work) + + +# ============================================================================ +# Test PatternMatcher +# ============================================================================ + + +class TestPatternMatcher: + """Test cases for PatternMatcher class""" + + @pytest.fixture + def dependencies_dir(self, tmp_path): + """Create test dependencies directory with YAML files""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create dpkg.yml with system packages + dpkg_data = { + "dependencies": [{ + "name": "libc6", + "version": "2.35", + "description": "GNU C Library: Shared libraries", + "basename_matches": [], + "linker_flags_matches": ["-lpthread"], + "directory_matches": [] + }] + } + with open(deps_dir / "dpkg.yml", 'w') as f: + yaml.dump(dpkg_data, f) + + # Create cuda-cudart-12.yml + cuda_data = { + "name": "cuda-cudart-12", + "version": "12.0", + "description": "NVIDIA CUDA Runtime library version 12", + "basename_matches": ["libcudart.so.12"], + "linker_flags_matches": [], + "directory_matches": ["cuda-12"] + } + with open(deps_dir / "cuda-cudart-12.yml", 'w') as f: + yaml.dump(cuda_data, f) + + # Create pytorch.yml + pytorch_data = { + "name": "pytorch", + "version": "2.0", + "description": "PyTorch machine learning framework", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["pytorch"] + } + with open(deps_dir / "pytorch.yml", 'w') as f: + yaml.dump(pytorch_data, f) + + # Create deepep.yml with bundled binary pattern + deepep_data = { + "name": "deepep", + 
"version": "1.0", + "description": "DeepEP library", + "basename_matches": ["deep_ep_cpp"], + "linker_flags_matches": [], + "directory_matches": [] + } + with open(deps_dir / "deepep.yml", 'w') as f: + yaml.dump(deepep_data, f) + + # Create nlohmann-json.yml + json_data = { + "name": "nlohmann-json", + "version": "3.11", + "description": "JSON for Modern C++", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["json"] + } + with open(deps_dir / "nlohmann-json.yml", 'w') as f: + yaml.dump(json_data, f) + + return deps_dir + + def test_match_exact_library(self, dependencies_dir): + """Test _match_patterns finds exact matches""" + matcher = PatternMatcher(dependencies_dir) + + artifact = Artifact(path="-lpthread", type="library", source="link.txt") + + mapping = matcher._match_patterns(artifact) + + assert mapping is not None + assert mapping.dependency == "libc6" + assert mapping.confidence == "high" + assert mapping.strategy == "exact_pattern_match" + + def test_match_substring_pattern(self, dependencies_dir): + """Test that substring matching is no longer supported (removed for safety)""" + matcher = PatternMatcher(dependencies_dir) + + artifact = Artifact(path="tensorrt_llm/libs/deep_ep_cpp_tllm.so", + type="binary", + source="wheel") + + # Substring matching was removed from _match_patterns to prevent false positives + # This test verifies it returns None for non-exact matches + mapping = matcher._match_patterns(artifact) + + # Should return None since "deep_ep_cpp_tllm.so" doesn't exactly match "deep_ep_cpp" + assert mapping is None + + def test_match_path_alias_rightmost(self, dependencies_dir): + """Test _match_path_alias uses rightmost directory match""" + matcher = PatternMatcher(dependencies_dir) + + artifact = Artifact(path="/build/pytorch/include/torch/torch.h", + type="header", + source="test.d") + + mapping = matcher._match_path_alias(artifact) + + assert mapping is not None + assert mapping.dependency == "pytorch" + assert mapping.metadata['matched_pattern'] == "pytorch" + assert mapping.metadata['matched_sequence'] == "pytorch" + + def test_match_path_multi_directory(self, tmp_path): + """Test _match_path_alias matches multi-directory patterns""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create YAML with multi-directory pattern + dep_data = { + "name": "test-lib", + "description": "Test library with multi-directory pattern", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["foo/bar", "3rdparty/test"] + } + with open(deps_dir / "test-lib.yml", 'w') as f: + yaml.dump(dep_data, f) + + matcher = PatternMatcher(deps_dir) + + # Test: /home/foo/bar/file.h matches "foo/bar" + artifact1 = Artifact(path="/home/foo/bar/file.h", + type="header", + source="test.d") + mapping1 = matcher._match_path_alias(artifact1) + assert mapping1 is not None + assert mapping1.dependency == "test-lib" + assert mapping1.metadata['matched_pattern'] == "foo/bar" + assert mapping1.metadata['matched_sequence'] == "foo/bar" + + # Test: /home/foobar/file.h does NOT match "foo/bar" (no substring matching) + artifact2 = Artifact(path="/home/foobar/file.h", + type="header", + source="test.d") + mapping2 = matcher._match_path_alias(artifact2) + assert mapping2 is None + + # Test: /build/3rdparty/test/include/test.h matches "3rdparty/test" + artifact3 = Artifact(path="/build/3rdparty/test/include/test.h", + type="header", + source="test.d") + mapping3 = matcher._match_path_alias(artifact3) + assert mapping3 is not None + assert 
mapping3.dependency == "test-lib" + assert mapping3.metadata['matched_pattern'] == "3rdparty/test" + + def test_match_path_multi_directory_rightmost(self, tmp_path): + """Test rightmost wins for multi-directory patterns""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + dep_data = { + "name": "test-lib", + "description": "Test library for rightmost matching", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["foo/bar"] + } + with open(deps_dir / "test-lib.yml", 'w') as f: + yaml.dump(dep_data, f) + + matcher = PatternMatcher(deps_dir) + + # Pattern: "foo/bar" appears twice in path + # Path: /foo/bar/baz/foo/bar/qux.h + # Should match at rightmost position (position 3) + artifact = Artifact(path="/foo/bar/baz/foo/bar/qux.h", + type="header", + source="test.d") + mapping = matcher._match_path_alias(artifact) + + assert mapping is not None + assert mapping.dependency == "test-lib" + assert mapping.metadata['matched_pattern'] == "foo/bar" + # Position should be 3 (rightmost occurrence) + assert mapping.metadata['position'] == 3 + + def test_match_path_no_substring_matching(self, tmp_path): + """Test that substring matching is NOT supported""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + dep_data = { + "name": "test-lib", + "description": "Test library for substring verification", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["oo/ba", "o/b"] + } + with open(deps_dir / "test-lib.yml", 'w') as f: + yaml.dump(dep_data, f) + + matcher = PatternMatcher(deps_dir) + + # Pattern: "oo/ba" + # Path: /foo/bar/file.h + # Should NOT match ("oo" != "foo", "ba" != "bar") + artifact1 = Artifact(path="/foo/bar/file.h", + type="header", + source="test.d") + mapping1 = matcher._match_path_alias(artifact1) + assert mapping1 is None + + # Pattern: "o/b" + # Path: /foo/bar/file.h + # Should NOT match + artifact2 = Artifact(path="/foo/bar/file.h", + type="header", + source="test.d") + mapping2 = matcher._match_path_alias(artifact2) + assert mapping2 is None + + def test_match_path_mixed_single_and_multi(self, tmp_path): + """Test single and multi-directory patterns coexist""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create two dependencies: one with single, one with multi-dir patterns + dep1_data = { + "name": "pytorch", + "description": "PyTorch with single component", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["pytorch", "torch"] + } + with open(deps_dir / "pytorch.yml", 'w') as f: + yaml.dump(dep1_data, f) + + dep2_data = { + "name": "cutlass", + "description": "Cutlass with multi-directory", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["3rdparty/cutlass"] + } + with open(deps_dir / "cutlass.yml", 'w') as f: + yaml.dump(dep2_data, f) + + matcher = PatternMatcher(deps_dir) + + # Test single component pattern still works + artifact1 = Artifact(path="/home/pytorch/lib/test.so", + type="library", + source="test") + mapping1 = matcher._match_path_alias(artifact1) + assert mapping1 is not None + assert mapping1.dependency == "pytorch" + + # Test multi-directory pattern works + artifact2 = Artifact(path="/build/3rdparty/cutlass/include/cutlass.h", + type="header", + source="test.d") + mapping2 = matcher._match_path_alias(artifact2) + assert mapping2 is not None + assert mapping2.dependency == "cutlass" + assert mapping2.metadata['matched_sequence'] == "3rdparty/cutlass" + + def test_match_generic_library_fallback(self, 
dependencies_dir): + """Test _match_generic_library as fallback""" + matcher = PatternMatcher(dependencies_dir) + + artifact = Artifact(path="/usr/lib/libunknown.so.1", + type="library", + source="link.txt") + + mapping = matcher._match_generic_library(artifact) + + assert mapping is not None + assert mapping.dependency == "unknown" + assert mapping.confidence == "low" + + def test_match_full_cascade(self, dependencies_dir): + """Test match() tries all strategies in order""" + matcher = PatternMatcher(dependencies_dir) + + # Should match exact library (highest priority) + artifact1 = Artifact(path="-lpthread", type="library", source="test") + mapping1 = matcher.match(artifact1) + assert mapping1 is not None + assert mapping1.strategy == "exact_pattern_match" + + # Should match exact pattern + artifact2 = Artifact(path="/usr/lib/libcudart.so.12", + type="library", + source="test") + mapping2 = matcher.match(artifact2) + assert mapping2 is not None + assert mapping2.strategy == "exact_pattern_match" + + # Should fall back to generic + artifact3 = Artifact(path="/usr/lib/libfallback.so", + type="library", + source="test") + mapping3 = matcher.match(artifact3) + assert mapping3 is not None + assert mapping3.strategy == "generic_library_inference" + + def test_yaml_loading_individual_files(self, tmp_path): + """Test loading individual YAML dependency files""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create individual dependency file + dep_data = { + "name": "test-lib", + "version": "1.0", + "description": "Test library for unit tests", + "basename_matches": ["libtest.so"], + "linker_flags_matches": ["-ltest"], + "directory_matches": ["test-lib"] + } + with open(deps_dir / "test-lib.yml", 'w') as f: + yaml.dump(dep_data, f) + + matcher = PatternMatcher(deps_dir) + + # Check pattern_mappings + assert "libtest.so" in matcher.pattern_mappings + assert matcher.pattern_mappings["libtest.so"] == "test-lib" + assert "-ltest" in matcher.pattern_mappings + assert matcher.pattern_mappings["-ltest"] == "test-lib" + + # Check path_aliases + assert "test-lib" in matcher.path_aliases + assert matcher.path_aliases["test-lib"] == "test-lib" + + def test_yaml_loading_dpkg_format(self, tmp_path): + """Test loading dpkg.yml with dependencies list""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create dpkg.yml with multiple dependencies + dpkg_data = { + "dependencies": [{ + "name": "dep1", + "version": "1.0", + "description": "First dependency", + "basename_matches": ["libdep1.so"], + "linker_flags_matches": [], + "directory_matches": [] + }, { + "name": "dep2", + "version": "2.0", + "description": "Second dependency", + "basename_matches": ["libdep2.so"], + "linker_flags_matches": [], + "directory_matches": [] + }] + } + with open(deps_dir / "dpkg.yml", 'w') as f: + yaml.dump(dpkg_data, f) + + matcher = PatternMatcher(deps_dir) + + # Both dependencies should be loaded + assert "libdep1.so" in matcher.pattern_mappings + assert matcher.pattern_mappings["libdep1.so"] == "dep1" + assert "libdep2.so" in matcher.pattern_mappings + assert matcher.pattern_mappings["libdep2.so"] == "dep2" + + def test_yaml_duplicate_pattern_warning(self, tmp_path, capsys): + """Test that duplicate patterns generate warnings""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create first file with pattern + dep1_data = { + "name": "dep1", + "version": "1.0", + "description": "First dependency with duplicate pattern", + "basename_matches": ["duplicate.so"], + "linker_flags_matches": 
[], + "directory_matches": [] + } + with open(deps_dir / "dep1.yml", 'w') as f: + yaml.dump(dep1_data, f) + + # Create second file with same pattern + dep2_data = { + "name": "dep2", + "version": "2.0", + "description": "Second dependency with duplicate pattern", + "basename_matches": ["duplicate.so"], + "linker_flags_matches": [], + "directory_matches": [] + } + with open(deps_dir / "dep2.yml", 'w') as f: + yaml.dump(dep2_data, f) + + # Initialize matcher (should emit warning) + matcher = PatternMatcher(deps_dir) + + # Check warning was emitted + captured = capsys.readouterr() + assert "Warning: Duplicate basename match 'duplicate.so'" in captured.err + + # Last one wins + assert matcher.pattern_mappings["duplicate.so"] == "dep2" + + def test_yaml_invalid_format_warning(self, tmp_path, capsys): + """Test that invalid YAML format generates warnings""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create file with invalid format (missing name) + with open(deps_dir / "invalid.yml", 'w') as f: + yaml.dump({ + "version": "1.0", + "description": "Missing name field" + }, f) + + # Initialize matcher (should emit warning) + PatternMatcher(deps_dir) + + # Check warning was emitted (either "Missing 'name' field" or "unrecognized format") + captured = capsys.readouterr() + assert ("Warning: Missing 'name' field" in captured.err + or "Warning: Skipping invalid.yml - unrecognized format" + in captured.err) + + def test_yaml_skip_underscore_files(self, tmp_path): + """Test that files starting with underscore are skipped""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create _schema.yml (should be skipped) + schema_data = { + "name": "should-not-load", + "version": "1.0", + "description": "This file should be skipped", + "basename_matches": ["should-not-exist.so"], + "linker_flags_matches": [], + "directory_matches": [] + } + with open(deps_dir / "_schema.yml", 'w') as f: + yaml.dump(schema_data, f) + + # Create normal file (should be loaded) + normal_data = { + "name": "normal-dep", + "version": "1.0", + "description": "Normal dependency file", + "basename_matches": ["normal.so"], + "linker_flags_matches": [], + "directory_matches": [] + } + with open(deps_dir / "normal-dep.yml", 'w') as f: + yaml.dump(normal_data, f) + + matcher = PatternMatcher(deps_dir) + + # Schema file should not be loaded + assert "should-not-exist.so" not in matcher.pattern_mappings + + # Normal file should be loaded + assert "normal.so" in matcher.pattern_mappings + assert matcher.pattern_mappings["normal.so"] == "normal-dep" + + def test_yaml_mixed_loading(self, tmp_path): + """Test loading both dpkg.yml and individual files together""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create dpkg.yml + dpkg_data = { + "dependencies": [{ + "name": "system-dep", + "version": "1.0", + "description": "System dependency from dpkg", + "basename_matches": ["libsystem.so"], + "linker_flags_matches": [], + "directory_matches": [] + }] + } + with open(deps_dir / "dpkg.yml", 'w') as f: + yaml.dump(dpkg_data, f) + + # Create individual file + custom_data = { + "name": "custom-dep", + "version": "2.0", + "description": "Custom dependency from individual file", + "basename_matches": ["libcustom.so"], + "linker_flags_matches": [], + "directory_matches": [] + } + with open(deps_dir / "custom-dep.yml", 'w') as f: + yaml.dump(custom_data, f) + + matcher = PatternMatcher(deps_dir) + + # Both should be loaded + assert "libsystem.so" in matcher.pattern_mappings + assert 
matcher.pattern_mappings["libsystem.so"] == "system-dep" + assert "libcustom.so" in matcher.pattern_mappings + assert matcher.pattern_mappings["libcustom.so"] == "custom-dep" + + def test_yaml_empty_arrays(self, tmp_path): + """Test that empty arrays in YAML are handled correctly""" + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create dependency with empty arrays + dep_data = { + "name": "minimal-dep", + "version": "1.0", + "description": "Minimal dependency with empty arrays", + "basename_matches": [], + "linker_flags_matches": [], + "directory_matches": ["minimal"] + } + with open(deps_dir / "minimal-dep.yml", 'w') as f: + yaml.dump(dep_data, f) + + matcher = PatternMatcher(deps_dir) + + # Should load successfully + assert "minimal" in matcher.path_aliases + assert matcher.path_aliases["minimal"] == "minimal-dep" + + +# ============================================================================ +# Test OutputGenerator +# ============================================================================ + + +class TestOutputGenerator: + """Test cases for OutputGenerator class""" + + def test_generate_creates_files(self, tmp_path): + """Test generate() creates known and unknown YAML files""" + artifacts = [ + Artifact(path="/usr/include/stdio.h", + type="header", + source="test.d"), + Artifact(path="/usr/lib/libfoo.so", + type="library", + source="link.txt"), + Artifact(path="/unknown/header.h", type="header", source="test.d") + ] + + mappings = [ + Mapping(artifact=artifacts[0], + dependency="libc6", + confidence="high", + strategy="dpkg-query"), + Mapping(artifact=artifacts[1], + dependency="foo", + confidence="medium", + strategy="pattern") + ] + + output_dir = tmp_path / "reports" + known_file, unknown_file = OutputGenerator.generate( + mappings, artifacts, output_dir) + + # Check files exist + assert known_file.exists() + assert unknown_file.exists() + + # Check known.yml content (simplified structure: dependencies dict of lists) + with open(known_file) as f: + known_data = yaml.safe_load(f) + + assert known_data['summary']['total_artifacts'] == 3 + assert known_data['summary']['mapped'] == 2 + assert known_data['summary']['unmapped'] == 1 + assert len(known_data['dependencies']) == 2 + # Check dependencies is a dict with lists of paths + assert isinstance(known_data['dependencies'], dict) + assert 'libc6' in known_data['dependencies'] + assert 'foo' in known_data['dependencies'] + assert '/usr/include/stdio.h' in known_data['dependencies']['libc6'] + assert '/usr/lib/libfoo.so' in known_data['dependencies']['foo'] + + # Check unknown.yml content (simplified structure: flat list of paths) + with open(unknown_file) as f: + unknown_data = yaml.safe_load(f) + + assert unknown_data['summary']['count'] == 1 + assert len(unknown_data['artifacts']) == 1 + assert "/unknown/header.h" in unknown_data['artifacts'] + + def test_generate_groups_by_dependency(self, tmp_path): + """Test generate() groups artifacts by dependency""" + artifacts = [ + Artifact(path="/usr/include/stdio.h", + type="header", + source="test1.d"), + Artifact(path="/usr/include/stdlib.h", + type="header", + source="test2.d"), + ] + + mappings = [ + Mapping(artifact=artifacts[0], + dependency="libc6", + confidence="high", + strategy="dpkg"), + Mapping(artifact=artifacts[1], + dependency="libc6", + confidence="high", + strategy="dpkg"), + ] + + output_dir = tmp_path / "reports" + known_file, _ = OutputGenerator.generate(mappings, artifacts, + output_dir) + + with open(known_file) as f: + known_data = 
yaml.safe_load(f) + + # Should have 1 dependency with 2 artifacts (simplified: dict of lists) + assert len(known_data['dependencies']) == 1 + assert 'libc6' in known_data['dependencies'] + assert len(known_data['dependencies']['libc6']) == 2 + assert '/usr/include/stdio.h' in known_data['dependencies']['libc6'] + assert '/usr/include/stdlib.h' in known_data['dependencies']['libc6'] + + def test_generate_coverage_calculation(self, tmp_path): + """Test generate() calculates coverage correctly""" + artifacts = [ + Artifact(path=f"/test{i}.h", type="header", source="test.d") + for i in range(10) + ] + mappings = [ + Mapping(artifact=artifacts[i], + dependency=f"dep{i}", + confidence="high", + strategy="dpkg") for i in range(7) # 7 out of 10 mapped + ] + + output_dir = tmp_path / "reports" + known_file, _ = OutputGenerator.generate(mappings, artifacts, + output_dir) + + with open(known_file) as f: + known_data = yaml.safe_load(f) + + # Verify summary section is still included in YAML output + assert "70.0%" in known_data['summary']['coverage'] + + def test_generate_path_issues_yml_basic(self, tmp_path): + """Test generate() creates path_issues.yml with non-existent headers""" + artifacts = [ + Artifact(path="/usr/include/stdio.h", + type="header", + source="test.d", + metadata={'path_exists': True}), + Artifact(path="/nonexistent/header.h", + type="header", + source="test2.d", + metadata={ + 'path_exists': False, + 'original_path': 'nonexistent/header.h' + }), + Artifact(path="/missing/include.h", + type="header", + source="test3.d", + metadata={ + 'path_exists': False, + 'original_path': 'missing/include.h' + }), + ] + + mappings = [ + Mapping(artifact=artifacts[0], + dependency="libc6", + confidence="high", + strategy="dpkg-query") + ] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Check path_issues.yml exists + path_issues_file = output_dir / "path_issues.yml" + assert path_issues_file.exists() + + # Load and verify content + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + # Should have 2 non-existent paths (not the existing one) + assert path_issues_data['summary']['count'] == 2 + assert path_issues_data['summary']['total_artifacts'] == 3 + assert path_issues_data['summary']['percentage'] == "66.7%" + + # Verify non_existent_paths contains the right entries + non_existent_paths = path_issues_data['non_existent_paths'] + assert len(non_existent_paths) == 2 + + # Check field names + assert all('resolved_path' in entry for entry in non_existent_paths) + assert all('type' in entry for entry in non_existent_paths) + assert all('source' in entry for entry in non_existent_paths) + assert all('d_file_path' in entry for entry in non_existent_paths) + + # Check values + paths = [entry['resolved_path'] for entry in non_existent_paths] + assert "/nonexistent/header.h" in paths + assert "/missing/include.h" in paths + assert "/usr/include/stdio.h" not in paths + + def test_generate_path_issues_yml_excludes_libraries(self, tmp_path): + """Test path_issues.yml excludes library artifacts even if they don't exist""" + artifacts = [ + Artifact(path="/nonexistent/header.h", + type="header", + source="test.d", + metadata={ + 'path_exists': False, + 'original_path': 'nonexistent/header.h' + }), + Artifact(path="-lmissing", + type="library", + source="link.txt", + metadata={'path_exists': False}), + Artifact(path="/missing/libfoo.so", + type="library", + source="link.txt", + metadata={'path_exists': False}), + ] + + mappings = 
[] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + # Should only have 1 entry (the header, not the libraries) + assert path_issues_data['summary']['count'] == 1 + assert len(path_issues_data['non_existent_paths']) == 1 + + # Verify it's the header + entry = path_issues_data['non_existent_paths'][0] + assert entry['resolved_path'] == "/nonexistent/header.h" + assert entry['type'] == "header" + assert entry['d_file_path'] == "nonexistent/header.h" + + def test_generate_path_issues_yml_field_names(self, tmp_path): + """Test path_issues.yml has correct field names (not 'path', but 'resolved_path')""" + artifacts = [ + Artifact(path="/resolved/absolute/path.h", + type="header", + source="build/CMakeFiles/test.d", + metadata={ + 'path_exists': False, + 'original_path': 'relative/path.h' + }), + ] + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + entry = path_issues_data['non_existent_paths'][0] + + # Verify field names + assert 'resolved_path' in entry + assert 'type' in entry + assert 'source' in entry + assert 'd_file_path' in entry + + # Verify it does NOT use 'path' as field name + assert 'path' not in entry + + # Verify values + assert entry['resolved_path'] == "/resolved/absolute/path.h" + assert entry['type'] == "header" + assert entry['source'] == "build/CMakeFiles/test.d" + assert entry['d_file_path'] == "relative/path.h" + + def test_generate_path_issues_yml_percentage_calculation(self, tmp_path): + """Test path_issues.yml calculates percentage correctly""" + # Create 10 artifacts: 3 non-existent headers, 7 existing + artifacts = [] + for i in range(3): + artifacts.append( + Artifact(path=f"/missing{i}.h", + type="header", + source="test.d", + metadata={ + 'path_exists': False, + 'original_path': f'missing{i}.h' + })) + for i in range(7): + artifacts.append( + Artifact(path=f"/exists{i}.h", + type="header", + source="test.d", + metadata={'path_exists': True})) + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + # 3 out of 10 = 30% + assert path_issues_data['summary']['count'] == 3 + assert path_issues_data['summary']['total_artifacts'] == 10 + assert path_issues_data['summary']['percentage'] == "30.0%" + + def test_generate_path_issues_yml_only_includes_path_exists_false( + self, tmp_path): + """Test path_issues.yml only includes artifacts with path_exists=False""" + artifacts = [ + Artifact(path="/exists.h", + type="header", + source="test.d", + metadata={'path_exists': True}), + Artifact(path="/missing.h", + type="header", + source="test.d", + metadata={ + 'path_exists': False, + 'original_path': 'missing.h' + }), + Artifact(path="/no_metadata.h", type="header", + source="test.d"), # No metadata + ] + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + 
path_issues_data = yaml.safe_load(f) + + # Only the one with path_exists=False + assert path_issues_data['summary']['count'] == 1 + assert len(path_issues_data['non_existent_paths']) == 1 + assert path_issues_data['non_existent_paths'][0][ + 'resolved_path'] == "/missing.h" + + def test_generate_path_issues_yml_empty_when_all_exist(self, tmp_path): + """Test path_issues.yml has zero entries when all paths exist""" + artifacts = [ + Artifact(path="/exists1.h", + type="header", + source="test.d", + metadata={'path_exists': True}), + Artifact(path="/exists2.h", + type="header", + source="test.d", + metadata={'path_exists': True}), + ] + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + # Should have 0 entries + assert path_issues_data['summary']['count'] == 0 + assert path_issues_data['summary']['total_artifacts'] == 2 + assert path_issues_data['summary']['percentage'] == "0.0%" + assert len(path_issues_data['non_existent_paths']) == 0 + + def test_generate_path_issues_yml_mixed_artifact_types(self, tmp_path): + """Test path_issues.yml with headers, libraries, and binaries""" + artifacts = [ + Artifact(path="/missing_header.h", + type="header", + source="test.d", + metadata={ + 'path_exists': False, + 'original_path': 'missing_header.h' + }), + Artifact(path="-lmissing", + type="library", + source="link.txt", + metadata={'path_exists': False}), + Artifact(path="/missing_binary.so", + type="binary", + source="wheel", + metadata={ + 'path_exists': False, + 'original_path': 'missing_binary.so' + }), + ] + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + # Should include header and binary, but not library + assert path_issues_data['summary']['count'] == 2 + assert len(path_issues_data['non_existent_paths']) == 2 + + # Verify types + types = { + entry['type'] + for entry in path_issues_data['non_existent_paths'] + } + assert 'header' in types + assert 'binary' in types + assert 'library' not in types + + def test_generate_path_issues_yml_original_path_metadata(self, tmp_path): + """Test path_issues.yml uses original_path metadata for d_file_path field""" + artifacts = [ + Artifact(path="/resolved/absolute/path/include/header.h", + type="header", + source="build/CMakeFiles/target.dir/test.d", + metadata={ + 'path_exists': False, + 'original_path': 'relative/include/header.h' + }), + ] + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + entry = path_issues_data['non_existent_paths'][0] + + # d_file_path should be the original relative path from the .d file + assert entry['d_file_path'] == 'relative/include/header.h' + # resolved_path should be the absolute resolved path + assert entry[ + 'resolved_path'] == "/resolved/absolute/path/include/header.h" + + def test_generate_path_issues_yml_missing_original_path_metadata( + self, tmp_path): + """Test path_issues.yml handles missing original_path metadata gracefully""" + artifacts = [ + 
Artifact(path="/missing.h", + type="header", + source="test.d", + metadata={'path_exists': False}), # No original_path + ] + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + entry = path_issues_data['non_existent_paths'][0] + + # Should have 'N/A' when original_path is missing + assert entry['d_file_path'] == 'N/A' + assert entry['resolved_path'] == "/missing.h" + + def test_generate_path_issues_yml_note_field(self, tmp_path): + """Test path_issues.yml summary contains explanatory note""" + artifacts = [ + Artifact(path="/missing.h", + type="header", + source="test.d", + metadata={ + 'path_exists': False, + 'original_path': 'missing.h' + }), + ] + + mappings = [] + + output_dir = tmp_path / "reports" + OutputGenerator.generate(mappings, artifacts, output_dir) + + # Load path_issues.yml + path_issues_file = output_dir / "path_issues.yml" + with open(path_issues_file) as f: + path_issues_data = yaml.safe_load(f) + + # Verify note field exists and mentions libraries are excluded + assert 'note' in path_issues_data['summary'] + note = path_issues_data['summary']['note'] + assert 'libraries excluded' in note.lower() + assert 'do not exist' in note.lower() + + +# ============================================================================ +# Integration Tests +# ============================================================================ + + +class TestIntegration: + """Integration tests for full workflow""" + + def test_full_workflow(self, tmp_path): + """Test complete scan workflow end-to-end""" + # Setup: Create dependencies directory with YAML files + deps_dir = tmp_path / "dependencies" + deps_dir.mkdir() + + # Create dpkg.yml + dpkg_data = { + "dependencies": [{ + "name": "libc6", + "version": "2.35", + "description": "GNU C Library: Shared libraries", + "basename_matches": [], + "linker_flags_matches": ["-lpthread"], + "directory_matches": [] + }] + } + with open(deps_dir / "dpkg.yml", 'w') as f: + yaml.dump(dpkg_data, f) + + # Setup: Create build artifacts + build_dir = tmp_path / "build" + build_dir.mkdir() + + d_file = build_dir / "test.d" + d_file.write_text("build/foo.o: /usr/include/stdio.h\n") + + link_file = build_dir / "link.txt" + link_file.write_text("/usr/bin/c++ -o foo -lpthread\n") + + # Run workflow + collector = ArtifactCollector(build_dir) + artifacts = collector.collect_all() + + dpkg_resolver = DpkgResolver() + pattern_matcher = PatternMatcher(deps_dir) + + all_mappings = [] + + # Try dpkg first + for artifact in artifacts: + with patch.object(dpkg_resolver, + 'get_package', + return_value='libc6'): + package = dpkg_resolver.get_package(artifact.path) + if package: + all_mappings.append( + Mapping(artifact=artifact, + dependency=package, + confidence='high', + strategy='dpkg-query')) + + # Try patterns for remaining + dpkg_paths = {m.artifact.path for m in all_mappings} + for artifact in artifacts: + if artifact.path not in dpkg_paths: + mapping = pattern_matcher.match(artifact) + if mapping: + all_mappings.append(mapping) + + # Generate reports + output_dir = tmp_path / "reports" + known_file, unknown_file = OutputGenerator.generate( + all_mappings, artifacts, output_dir) + + # Verify outputs + assert known_file.exists() + assert unknown_file.exists() + + with open(known_file) as f: + data = yaml.safe_load(f) + assert 
data['summary']['total_artifacts'] > 0
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
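For reference, the classes exercised by these tests compose into a short end-to-end pipeline: collect artifacts from a build tree, map each artifact to a dependency, and write the YAML reports. The sketch below mirrors `test_full_workflow`; the module name `scan_build_artifacts` and the `metadata/dependencies/` layout are assumptions made for illustration (the import path is not visible in this diff), and dpkg-based resolution is omitted for brevity.

```python
# Illustrative sketch only: module name and directory layout are assumptions.
from pathlib import Path

from scan_build_artifacts import ArtifactCollector, OutputGenerator, PatternMatcher

build_dir = Path("../build")              # build tree containing .d files and link.txt
deps_dir = Path("metadata/dependencies")  # YAML pattern definitions (assumed layout)

# 1. Collect header/library artifacts from the build tree.
artifacts = ArtifactCollector(build_dir).collect_all()

# 2. Map artifacts to dependencies using the YAML patterns; unmatched
#    artifacts are reported in unknown.yml by the output generator.
matcher = PatternMatcher(deps_dir)
mappings = [m for m in (matcher.match(a) for a in artifacts) if m is not None]

# 3. Write known.yml / unknown.yml (and path_issues.yml) under scan_output/.
known_file, unknown_file = OutputGenerator.generate(mappings, artifacts,
                                                    Path("scan_output"))
print(f"known: {known_file}, unknown: {unknown_file}")
```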