From 3598d6ddfac236ae5b8520b4709a64c90682f9bb Mon Sep 17 00:00:00 2001 From: prosenjitdhole Date: Fri, 23 Jan 2026 04:58:09 -0600 Subject: [PATCH 1/2] AORTA-17 CLI command for report generation : Added tracelens run utility and analyze gemm reports --- src/aorta/report/ANALYZE_CMD_DEV_DOCS.md | 775 ++++++++++++++++++ src/aorta/report/analysis/__init__.py | 14 + src/aorta/report/analysis/analyze_gemm.py | 320 ++++++++ src/aorta/report/analysis/analyze_single.py | 339 ++++++++ src/aorta/report/analysis/analyze_sweep.py | 412 ++++++++++ .../report/analysis/tracelens_wrapper.py | 342 ++++++++ src/aorta/report/cli.py | 165 +++- 7 files changed, 2344 insertions(+), 23 deletions(-) create mode 100644 src/aorta/report/ANALYZE_CMD_DEV_DOCS.md create mode 100644 src/aorta/report/analysis/__init__.py create mode 100644 src/aorta/report/analysis/analyze_gemm.py create mode 100644 src/aorta/report/analysis/analyze_single.py create mode 100644 src/aorta/report/analysis/analyze_sweep.py create mode 100644 src/aorta/report/analysis/tracelens_wrapper.py diff --git a/src/aorta/report/ANALYZE_CMD_DEV_DOCS.md b/src/aorta/report/ANALYZE_CMD_DEV_DOCS.md new file mode 100644 index 0000000..90dc12a --- /dev/null +++ b/src/aorta/report/ANALYZE_CMD_DEV_DOCS.md @@ -0,0 +1,775 @@ +# `analyze` Command Group - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Command Summary](#2-command-summary) +3. [Source Script Analysis](#3-source-script-analysis) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Command Specifications](#5-command-specifications) +6. [TraceLens Integration](#6-tracelens-integration) +7. [Implementation Order](#7-implementation-order) +8. [Expected Output](#8-expected-output) + +--- + +## 1. Overview + +The `analyze` command group provides TraceLens analysis capabilities for PyTorch profiler traces. 
It consolidates three shell/Python scripts into a unified CLI interface. + +### Commands + +| Command | Purpose | Source Script | +|---------|---------|---------------| +| `analyze single` | Analyze single configuration traces | `run_tracelens_single_config.sh` | +| `analyze sweep` | Analyze sweep with multiple configs | `run_tracelens_analysis.sh` | +| `analyze gemm` | Extract GEMM kernel variance | `analyze_gemm_reports.py` | + +### Key Features + +- **Unified interface**: Consistent CLI for all analysis operations +- **GEMM recognition**: Patched TraceLens for ROCm Tensile kernel detection +- **Auto-discovery**: Automatic detection of ranks, threads, and channels +- **Flexible output**: Configurable output directories and formats + +--- + +## 2. Command Summary + +### 2.1 `analyze single` + +Analyze a single configuration trace directory containing rank subdirectories. + +```bash +aorta-report analyze single /path/to/traces [OPTIONS] + +Options: + --individual-only Generate only individual reports + --collective-only Generate only collective report + --geo-mean Use geometric mean for timeline aggregation + --short-kernel-threshold INT Threshold for short kernel study (µs) + --topk-ops INT Number of top operations to include + -o, --output PATH Output directory +``` + +**Usage Examples:** +```bash +# Basic analysis (generates individual + collective reports) +aorta-report analyze single /path/to/traces + +# Generate only individual reports with geometric mean aggregation +aorta-report analyze single /path/to/traces --individual-only --geo-mean + +# Custom output directory +aorta-report analyze single /path/to/traces -o ./results +``` + +### 2.2 `analyze sweep` + +Analyze a sweep directory containing multiple thread/channel configurations. 
+ +```bash +aorta-report analyze sweep /path/to/sweep [OPTIONS] + +Options: + --geo-mean Use geometric mean instead of arithmetic mean + -o, --output PATH Output directory +``` + +**Usage Examples:** +```bash +# Basic sweep analysis +aorta-report analyze sweep /path/to/sweep_20251124 + +# Use geometric mean for aggregation +aorta-report analyze sweep /path/to/sweep --geo-mean + +# Custom output directory +aorta-report analyze sweep /path/to/sweep -o ./analysis_results +``` + +### 2.3 `analyze gemm` + +Extract GEMM kernel variance data from existing TraceLens reports. + +```bash +aorta-report analyze gemm /path/to/reports [OPTIONS] + +Options: + -t, --threads INT Thread configurations to analyze (multiple allowed) + -c, --channels INT Channel configurations to analyze (multiple allowed) + -r, --ranks INT Ranks to analyze (default: 0-7) + --top-k INTEGER Number of top kernels to extract per file (default: 5) + -o, --output PATH Output CSV file +``` + +**Usage Examples:** +```bash +# Basic GEMM analysis with defaults (256/512 threads, 28/42/56/70 channels) +aorta-report analyze gemm /path/to/tracelens_analysis + +# Custom thread and channel configurations +aorta-report analyze gemm /path/to/reports -t 256 -t 512 -c 28 -c 42 + +# Extract top 10 kernels and save to custom file +aorta-report analyze gemm /path/to/reports --top-k 10 -o gemm_analysis.csv + +# Specify specific ranks +aorta-report analyze gemm /path/to/reports -r 0 -r 1 -r 2 -r 3 +``` + +--- + +## 3. Source Script Analysis + +### 3.1 `run_tracelens_single_config.sh` (267 lines) + +**Location:** `scripts/tracelens_single_config/run_tracelens_single_config.sh` + +**Functionality:** +1. Parse options (`--individual-only`, `--collective-only`) +2. Auto-detect trace directory structure: + - Check if input contains `rank*` directories (is torch_profiler/) + - Check if input contains `torch_profiler/` subdirectory +3. Create output directory structure +4. Detect number of ranks +5. 
Generate individual reports (per rank) +6. Generate collective multi-rank report + +**TraceLens Commands:** +```bash +# Individual report (per rank) +$TRACELENS_WRAPPER generate_perf_report \ + --profile_json_path "$TRACE" \ + --output_xlsx_path "$OUTPUT" \ + --include_unlinked_kernels \ + --short_kernel_study \ + --short_kernel_threshold_us 50 \ + --topk_ops 100 \ + --topk_roofline_ops 100 + +# Collective report (all ranks) +$TRACELENS_WRAPPER generate_multi_rank_collective \ + --trace_pattern "$TORCH_PROF_DIR/rank*/trace.json" \ + --world_size $NUM_RANKS \ + --output_xlsx_path "$OUTPUT" \ + --detailed_analysis \ + --use_multiprocessing +``` + +**Input Structure:** +``` +trace_dir/ +├── torch_profiler/ # or trace_dir IS torch_profiler/ +│ ├── rank0/ +│ │ └── *.json +│ ├── rank1/ +│ │ └── *.json +│ └── ... +``` + +**Output Structure:** +``` +trace_dir/ +└── tracelens_analysis/ + ├── individual_reports/ + │ ├── perf_rank0.xlsx + │ ├── perf_rank1.xlsx + │ └── ... + └── collective_reports/ + └── collective_all_ranks.xlsx +``` + +--- + +### 3.2 `run_tracelens_analysis.sh` (423 lines) + +**Location:** `scripts/gemm_analysis/run_tracelens_analysis.sh` + +**Functionality:** +1. Parse options (`--rocprof`) +2. Auto-discover thread configurations (e.g., `256thread`, `512thread`) +3. Auto-discover channel configurations per thread (e.g., `nccl_28channels`) +4. For each thread/channel/rank combination: + - Find trace files + - Generate individual reports +5. Generate collective reports (PyTorch mode only) +6. 
Generate cross-thread comparisons + +**TraceLens Commands:** +```bash +# PyTorch mode - Individual +TraceLens_generate_perf_report_pytorch \ + --profile_json_path "$TRACE" \ + --output_xlsx_path "$OUTPUT" \ + --include_unlinked_kernels \ + --short_kernel_study \ + --short_kernel_threshold_us 50 \ + --topk_ops 100 \ + --enable_kernel_summary \ + --topk_roofline_ops 100 + +# ROCprof mode - Individual +TraceLens_generate_perf_report_rocprof \ + --profile_json_path "$TRACE" \ + --output_xlsx_path "$OUTPUT" \ + --kernel_details \ + --short_kernel_study \ + --short_kernel_threshold_us 50 \ + --topk_kernels 100 + +# PyTorch mode - Collective +TraceLens_generate_multi_rank_collective_report_pytorch \ + --trace_pattern "$TRACE_DIR/rank*/trace/pt.trace.json" \ + --world_size 8 \ + --output_xlsx_path "$OUTPUT" \ + --detailed_analysis \ + --use_multiprocessing + +# Comparison across threads +TraceLens_compare_perf_reports_pytorch \ + "${reports[@]}" \ + --names "${names[@]}" \ + --sheets gpu_timeline ops_summary \ + -o "$OUTPUT" +``` + +**Input Structure:** +``` +sweep_dir/ +├── 256thread/ +│ ├── nccl_28channels/ +│ │ └── torch_profiler/ +│ │ ├── rank0/ +│ │ └── ... +│ ├── nccl_42channels/ +│ └── ... +└── 512thread/ + └── ... +``` + +**Output Structure:** +``` +sweep_dir/ +└── tracelens_analysis/ + ├── 256thread/ + │ ├── individual_reports/ + │ │ ├── perf_28ch_rank0.xlsx + │ │ ├── perf_28ch_rank1.xlsx + │ │ └── ... + │ └── collective_reports/ + │ └── collective_28ch.xlsx + ├── 512thread/ + │ └── ... + └── comparisons/ + ├── compare_28ch_rank0_across_threads.xlsx + └── ... +``` + +--- + +### 3.3 `analyze_gemm_reports.py` (344 lines) + +**Location:** `scripts/gemm_analysis/analyze_gemm_reports.py` + +**Functionality:** +1. Parse command-line arguments +2. Iterate through thread/channel/rank combinations +3. Open each Excel report +4. Read GEMM sheet +5. Extract kernel info and timing data +6. Calculate time variance (max - min) +7. Sort by variance and get top-K +8. 
Output combined CSV + +**Key Functions:** +```python +def process_excel_file(file_path, threads, channel, rank, top_k=5): + """Process a single Excel file and extract GEMM data.""" + # Opens workbook + # Reads GEMM sheet + # Validates column headers + # Extracts kernel_details, time_min, time_max + # Calculates time_diff + # Returns top_k results sorted by variance +``` + +**Input:** TraceLens Excel reports with GEMM sheet +**Output:** CSV with columns: +- `threads`, `channel`, `rank` +- `kernel_name` +- `kernel_time_min_us`, `kernel_time_max_us`, `time_diff_us` + +--- + +## 4. Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +├── cli.py # CLI definitions (update analyze commands) +├── analysis/ # NEW: Analysis logic +│ ├── __init__.py # Exports public functions +│ ├── tracelens_wrapper.py # GEMM-patched TraceLens wrapper +│ ├── analyze_single.py # Single config analysis +│ ├── analyze_sweep.py # Sweep analysis +│ └── analyze_gemm.py # GEMM variance analysis +├── generators/ # HTML generators (existing) +└── templates/ # HTML templates (existing) +``` + +### 4.2 Module Responsibilities + +#### `analysis/__init__.py` +```python +from .analyze_single import analyze_single_config +from .analyze_sweep import analyze_sweep_config +from .analyze_gemm import analyze_gemm_reports +from .tracelens_wrapper import TraceLensWrapper + +__all__ = [ + "analyze_single_config", + "analyze_sweep_config", + "analyze_gemm_reports", + "TraceLensWrapper", +] +``` + +#### `analysis/tracelens_wrapper.py` +```python +class TraceLensWrapper: + """GEMM-patched TraceLens wrapper.""" + + def __init__(self): + self._apply_gemm_patches() + + def _apply_gemm_patches(self): + """Apply GEMM recognition patches to TraceLens.""" + # Port from tracelens_with_gemm_patch.py + + def generate_perf_report(self, trace_path, output_path, **options): + """Generate individual performance report.""" + + def generate_collective_report(self, trace_pattern, world_size, output_path, 
**options): + """Generate multi-rank collective report.""" + + def compare_reports(self, report_paths, names, output_path, sheets=None): + """Compare multiple performance reports.""" +``` + +#### `analysis/analyze_single.py` +```python +def analyze_single_config( + trace_dir: Path, + output_dir: Optional[Path] = None, + individual_only: bool = False, + collective_only: bool = False, + verbose: bool = False, +) -> Path: + """Analyze a single configuration trace directory.""" + +def detect_trace_structure(input_dir: Path) -> Tuple[Path, Path]: + """Auto-detect torch_profiler directory and base directory.""" + +def discover_ranks(torch_prof_dir: Path) -> List[int]: + """Discover available ranks in the trace directory.""" + +def generate_individual_reports( + wrapper: TraceLensWrapper, + torch_prof_dir: Path, + output_dir: Path, + ranks: List[int], + verbose: bool, +) -> List[Path]: + """Generate individual performance reports for each rank.""" + +def generate_collective_report( + wrapper: TraceLensWrapper, + torch_prof_dir: Path, + output_dir: Path, + num_ranks: int, + verbose: bool, +) -> Optional[Path]: + """Generate multi-rank collective report.""" +``` + +#### `analysis/analyze_sweep.py` +```python +def analyze_sweep_config( + sweep_dir: Path, + output_dir: Optional[Path] = None, + use_geo_mean: bool = False, + verbose: bool = False, +) -> Optional[Path]: + """Process GPU timeline data from all individual reports in a sweep.""" + +def process_thread_config( + thread_config: str, + tracelens_dir: Path, + use_geo_mean: bool, + verbose: bool = False, +) -> List[pd.DataFrame]: + """Process a single thread configuration.""" + +def process_channel_config( + channel_config: str, + channel_groups: Dict[str, List[tuple]], + use_geo_mean: bool, + thread_config: str, + verbose: bool = False, +) -> Optional[pd.DataFrame]: + """Process a single channel configuration.""" + +def aggregate_rank_data( + rank_data: List[pd.DataFrame], + thread_config: str, + channel_config: str, + 
num_ranks: int, + use_geo_mean: bool, +) -> pd.DataFrame: + """Aggregate data across ranks and add metadata.""" +``` + +#### `analysis/analyze_gemm.py` +```python +def analyze_gemm_reports( + reports_dir: Path, + output_file: Optional[Path] = None, + top_k: int = 5, + threads: Optional[List[int]] = None, + channels: Optional[List[int]] = None, + ranks: Optional[List[int]] = None, + verbose: bool = False, +) -> Path: + """Analyze GEMM reports and extract top kernels by variance.""" + +def process_excel_file( + file_path: Path, + threads: int, + channel: int, + rank: int, + top_k: int, +) -> List[Dict]: + """Process a single Excel file and extract GEMM data.""" + +def extract_kernel_name(kernel_info_str: str) -> Optional[str]: + """Extract kernel name from kernel info string.""" +``` + +### 4.3 Data Flow + +``` +CLI (cli.py) + │ + ├── analyze single ───────────► analysis.analyze_single_config() + │ │ + │ ├── detect_trace_structure() + │ ├── discover_ranks() + │ ├── TraceLensWrapper.generate_perf_report() + │ └── TraceLensWrapper.generate_collective_report() + │ + ├── analyze sweep ────────────► analysis.analyze_sweep_config() + │ │ + │ ├── discover_configurations() + │ ├── process_configuration() + │ │ └── TraceLensWrapper.generate_perf_report() + │ ├── TraceLensWrapper.generate_collective_report() + │ └── TraceLensWrapper.compare_reports() + │ + └── analyze gemm ─────────────► analysis.analyze_gemm_reports() + │ + ├── process_excel_file() + └── write CSV output +``` + +--- + +## 5. 
Command Specifications
+
+### 5.1 `analyze single`
+
+| Aspect | Details |
+|--------|---------|
+| **Input** | Directory with torch_profiler/rank* structure |
+| **Output** | individual_reports/ and collective_reports/ |
+| **Options** | `--individual-only`, `--collective-only`, `-o` |
+| **TraceLens** | `generate_perf_report`, `generate_multi_rank_collective` |
+
+### 5.2 `analyze sweep`
+
+| Aspect | Details |
+|--------|---------|
+| **Input** | Sweep directory with thread/channel structure |
+| **Output** | Per-config reports + comparisons |
+| **Options** | `--geo-mean`, `-o` |
+| **TraceLens** | `generate_perf_report`, `generate_collective`, `compare_reports` |
+
+### 5.3 `analyze gemm`
+
+| Aspect | Details |
+|--------|---------|
+| **Input** | Directory with TraceLens Excel reports |
+| **Output** | CSV with GEMM kernel variance |
+| **Options** | `--top-k`, `-o` |
+| **Dependencies** | `openpyxl` for Excel reading |
+
+---
+
+## 6. TraceLens Integration
+
+### 6.1 GEMM Patch Requirements
+
+The TraceLens wrapper must apply these patches for ROCm GEMM recognition:
+
+1. **`kernel_name_parser`**: Recognize Tensile GEMM patterns (`Cijk_Alik_Bljk_...`)
+2. **`Trace2Tree.util`**: Enhanced `is_gemm_kernel()` function
+3. **`TraceEventUtils`**: Add GEMM keys for classification
+4.
**`torch_op_mapping`**: Better GEMM categorization + +### 6.2 TraceLens Functions Used + +| Function | PyTorch Mode | ROCprof Mode | +|----------|-------------|--------------| +| `generate_perf_report_pytorch` | ✓ | - | +| `generate_perf_report_rocprof` | - | ✓ | +| `generate_multi_rank_collective_report_pytorch` | ✓ | - | +| `compare_perf_reports_pytorch` | ✓ | ✓ | + +### 6.3 Common TraceLens Options + +```python +# Individual report options +INDIVIDUAL_REPORT_OPTIONS = { + "include_unlinked_kernels": True, + "short_kernel_study": True, + "short_kernel_threshold_us": 50, + "topk_ops": 100, + "topk_roofline_ops": 100, +} + +# ROCprof specific options +ROCPROF_OPTIONS = { + "kernel_details": True, + "topk_kernels": 100, +} + +# Collective report options +COLLECTIVE_REPORT_OPTIONS = { + "detailed_analysis": True, + "use_multiprocessing": True, +} +``` + +--- + +## 7. Implementation Status + +### Phase 1: Foundation ✅ + +1. **Created `analysis/` directory structure** ✅ +2. **Implemented `tracelens_wrapper.py`** ✅ + - GEMM patches for ROCm Tensile kernel recognition + - Wrapper class with methods for TraceLens commands + - Support for individual, collective, and rocprof reports + +### Phase 2: `analyze gemm` ✅ + +3. **Implemented `analyze_gemm.py`** ✅ + - Ported logic from `analyze_gemm_reports.py` + - Clean API with configurable threads/channels/ranks + - Progress reporting and summary statistics + +4. **Updated CLI for `analyze gemm`** ✅ + - Connected command to implementation + - Added multiple options for configuration + +### Phase 3: `analyze single` ✅ + +5. **Implemented `analyze_single.py`** ✅ + - Directory detection logic + - Report generation with TraceLens wrapper + - GPU timeline aggregation + - Status reporting + +6. **Updated CLI for `analyze single`** ✅ + - Added geo-mean and threshold options + +### Phase 4: `analyze sweep` ✅ + +7. 
**Implemented `analyze_sweep.py`** ✅ + - Thread/channel config discovery + - GPU timeline processing across all configs + - Excel output with pivot tables + +8. **Updated CLI for `analyze sweep`** ✅ + - Added geo-mean option + +### Phase 5: Documentation ✅ + +9. **Updated documentation** ✅ + - This dev docs file + - Implementation complete + +--- + +## 8. Expected Output + +### 8.1 `analyze single` Output + +``` +============================================================ +TraceLens Analysis - Single Configuration +============================================================ +Input directory: /path/to/traces +Torch profiler: /path/to/traces/torch_profiler +Detected 8 ranks + +Step 1: Generating Individual Reports + [1/8] Rank 0... ✓ perf_rank0.xlsx + [2/8] Rank 1... ✓ perf_rank1.xlsx + [3/8] Rank 2... ✓ perf_rank2.xlsx + [4/8] Rank 3... ✓ perf_rank3.xlsx + [5/8] Rank 4... ✓ perf_rank4.xlsx + [6/8] Rank 5... ✓ perf_rank5.xlsx + [7/8] Rank 6... ✓ perf_rank6.xlsx + [8/8] Rank 7... ✓ perf_rank7.xlsx + +Step 2: Generating Collective Report + Processing all 8 ranks... ✓ collective_all_ranks.xlsx + +============================================================ +Analysis Complete! +============================================================ +Output: /path/to/traces/tracelens_analysis/ + +Generated reports: + Individual: 8 + Collective: 1 +``` + +### 8.2 `analyze sweep` Output + +``` +============================================================ +TraceLens Analysis - Sweep +============================================================ +Sweep directory: /path/to/sweep +Mode: PyTorch profiler + +Discovered configurations: + 256thread: 28, 42, 56, 70 channels + 512thread: 28, 42, 56, 70 channels + Total: 8 configurations × 8 ranks = 64 reports + +Step 1: Generating Individual Reports + 256thread/28ch: + [1/8] Rank 0... ✓ + [2/8] Rank 1... ✓ + ... + 256thread/42ch: + ... + +Step 2: Generating Collective Reports + 256thread/28ch... ✓ + 256thread/42ch... ✓ + ... 
+ +Step 3: Generating Comparisons + 28ch across threads... ✓ + 42ch across threads... ✓ + ... + +============================================================ +Analysis Complete! +============================================================ +Output: /path/to/sweep/tracelens_analysis/ + +Summary: + Individual reports: 64 + Collective reports: 8 + Comparisons: 32 +``` + +### 8.3 `analyze gemm` Output + +``` +============================================================ +GEMM Kernel Variance Analysis +============================================================ +Base path: /path/to/tracelens_analysis +Configuration: + Threads: [256, 512] + Channels: [28, 42, 56, 70] + Ranks: [0, 1, 2, 3, 4, 5, 6, 7] + Top K: 5 + +Processing Excel files... + [1/64] perf_28ch_rank0.xlsx... 5 kernels found + [2/64] perf_28ch_rank1.xlsx... 5 kernels found + ... + +============================================================ +Analysis Complete! +============================================================ +Output: /path/to/tracelens_analysis/top5_gemm_kernels_time_variance.csv + +Summary: + Total kernels extracted: 320 + Unique kernel names: 45 + Max variance: 1234.56 µs + Avg variance: 89.12 µs + +Top 5 kernels by variance: + 1. Cijk_Alik_Bljk_... (256t/28ch/r0): 1234.56 µs + 2. Cijk_Alik_Bljk_... (512t/42ch/r3): 987.65 µs + ... 
+``` + +--- + +## Appendix A: Migration Checklist + +### From `run_tracelens_single_config.sh` +- [x] Directory structure detection +- [x] Rank discovery +- [x] Individual report generation loop +- [x] Symlink creation for collective report +- [x] Collective report generation +- [x] Summary output +- [x] GPU timeline aggregation + +### From `run_tracelens_analysis.sh` → `analyze sweep` +- [x] Thread config discovery +- [x] Channel config discovery +- [x] PyTorch trace file finding +- [x] GPU timeline processing per config +- [x] Summary Excel generation with pivot tables + +### From `analyze_gemm_reports.py` +- [x] Command-line argument handling +- [x] Excel file processing +- [x] GEMM sheet reading +- [x] Kernel name extraction +- [x] Variance calculation +- [x] CSV output + +--- + +## Appendix B: Error Handling + +| Scenario | Handling | +|----------|----------| +| Missing trace file | Log warning, continue with next | +| Missing rank directory | Log warning, continue with next | +| GEMM sheet not found | Log warning, skip file | +| TraceLens import error | Raise with helpful message | +| Permission error | Raise with fix instructions | +| No configurations found | Raise with expected structure | + diff --git a/src/aorta/report/analysis/__init__.py b/src/aorta/report/analysis/__init__.py new file mode 100644 index 0000000..e0bd324 --- /dev/null +++ b/src/aorta/report/analysis/__init__.py @@ -0,0 +1,14 @@ +"""Analysis modules for TraceLens trace processing.""" + +from .tracelens_wrapper import TraceLensWrapper +from .analyze_gemm import analyze_gemm_reports +from .analyze_single import analyze_single_config +from .analyze_sweep import analyze_sweep_config + +__all__ = [ + "TraceLensWrapper", + "analyze_gemm_reports", + "analyze_single_config", + "analyze_sweep_config", +] + diff --git a/src/aorta/report/analysis/analyze_gemm.py b/src/aorta/report/analysis/analyze_gemm.py new file mode 100644 index 0000000..a724019 --- /dev/null +++ 
"""
Analyze GEMM reports from TraceLens Excel files.

Extracts top N kernels with largest time variance (max - min) from
GEMM sheet data in individual performance reports.
"""

import csv
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import openpyxl


def extract_name_from_kernel_info(kernel_info_str: str) -> Optional[str]:
    """
    Extract the 'name' field from the kernel info string.

    Args:
        kernel_info_str: String containing kernel details, e.g.,
            "[{'name': '...', 'stream': ..., ...}]"

    Returns:
        Kernel name or None if extraction fails.
    """
    try:
        if kernel_info_str is None or kernel_info_str == "":
            return None
        # The cell holds a stringified list of dicts; regex out the first
        # 'name' value instead of eval()-ing spreadsheet content.
        match = re.search(r"'name':\s*'([^']+)'", str(kernel_info_str))
        return match.group(1) if match else None
    except Exception:
        # Malformed cell values must never abort the whole scan.
        return None


def column_letter_to_index(letter: str) -> int:
    """Convert an Excel column letter (e.g. 'AG') to a 0-based index."""
    index = 0
    for i, char in enumerate(reversed(letter.upper())):
        index += (ord(char) - ord("A") + 1) * (26**i)
    return index - 1


def _validate_gemm_headers(
    header_row: List[Any],
    file_path: Path,
    col_kernel_info: int,
    col_time_min: int,
    col_time_max: int,
) -> None:
    """Raise ValueError if the GEMM sheet headers are not where expected.

    Column names must match what TraceLens generates; a mismatch means the
    fixed X/AG/AH column positions would read the wrong data.
    """
    expected = [
        ("X", col_kernel_info, "kernel_details__summarize_kernel_stats"),
        ("AG", col_time_min, "Kernel Time (µs)_min"),
        ("AH", col_time_max, "Kernel Time (µs)_max"),
    ]
    errors = []
    for letter, col, expected_name in expected:
        if col < len(header_row):
            found = str(header_row[col]) if header_row[col] else ""
            if found != expected_name:
                errors.append(
                    f"Column {letter}: expected '{expected_name}', found '{found}'"
                )
        else:
            errors.append(f"Column {letter}: not found (only {len(header_row)} columns)")
    if errors:
        raise ValueError(
            f"Column validation failed in {file_path}:\n " + "\n ".join(errors)
        )


def process_excel_file(
    file_path: Path,
    threads: int,
    channel: int,
    rank: int,
    top_k: int = 5,
) -> List[Dict[str, Any]]:
    """
    Process a single Excel file and extract GEMM data.

    Args:
        file_path: Path to the Excel file
        threads: Thread configuration
        channel: Channel configuration
        rank: Rank number
        top_k: Number of top kernels to extract

    Returns:
        Up to ``top_k`` row dicts sorted by descending time variance
        (max - min); empty list if the GEMM sheet is missing or on error.
    """
    try:
        wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
        try:
            if "GEMM" not in wb.sheetnames:
                print(f"Warning: GEMM sheet not found in {file_path}")
                return []

            sheet = wb["GEMM"]

            # Expected column positions (0-based indices).
            col_kernel_info = column_letter_to_index("X")
            col_time_min = column_letter_to_index("AG")
            col_time_max = column_letter_to_index("AH")
            last_needed = max(col_kernel_info, col_time_min, col_time_max)

            rows_data: List[Dict[str, Any]] = []
            header_row: List[Any] = []

            for i, row in enumerate(sheet.iter_rows(values_only=True)):
                if i == 0:
                    # First row is the header; fail fast if the layout moved.
                    header_row = list(row)
                    _validate_gemm_headers(
                        header_row,
                        file_path,
                        col_kernel_info,
                        col_time_min,
                        col_time_max,
                    )
                    continue

                # Skip rows too short to contain all three columns of interest.
                if row is None or len(row) <= last_needed:
                    continue

                kernel_name = extract_name_from_kernel_info(row[col_kernel_info])
                kernel_time_min = row[col_time_min]
                kernel_time_max = row[col_time_max]

                # Variance requires both timing cells to be numeric.
                if kernel_time_min is None or kernel_time_max is None:
                    continue
                try:
                    time_diff = float(kernel_time_max) - float(kernel_time_min)
                except (ValueError, TypeError):
                    continue
                if not kernel_name:
                    continue

                row_dict: Dict[str, Any] = {
                    "threads": threads,
                    "channel": channel,
                    "rank": rank,
                    "kernel_name": kernel_name,
                    "kernel_time_min_us": kernel_time_min,
                    "kernel_time_max_us": kernel_time_max,
                    "time_diff_us": time_diff,
                }
                # Carry along every other sheet column, key-prefixed with
                # "col_" so they cannot clash with the metadata keys above.
                for j, val in enumerate(row):
                    if j < len(header_row) and header_row[j]:
                        row_dict[f"col_{header_row[j]}"] = val
                rows_data.append(row_dict)
        finally:
            # Fix: read-only workbooks keep the file handle open until
            # close(); the original leaked it when header validation raised.
            wb.close()

        # Top-k rows by variance, largest first.
        rows_data.sort(key=lambda x: x["time_diff_us"], reverse=True)
        return rows_data[:top_k]

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        import traceback
        traceback.print_exc()
        return []


def analyze_gemm_reports(
    base_path: Path,
    threads: List[int],
    channels: List[int],
    ranks: List[int],
    top_k: int = 5,
    output_file: str = "top5_gemm_kernels_time_variance.csv",
    verbose: bool = False,
) -> Optional[Path]:
    """
    Analyze GEMM reports from a sweep directory structure.

    Args:
        base_path: Path to tracelens_analysis directory
        threads: List of thread configurations to analyze (e.g., [256, 512])
        channels: List of channel configurations (e.g., [28, 42, 56, 70])
        ranks: List of ranks to analyze (e.g., [0, 1, 2, ..., 7])
        top_k: Number of top kernels to extract per file
        output_file: Output CSV filename (relative names resolve under
            ``base_path``)
        verbose: Whether to print verbose output

    Returns:
        Path to output file or None if no data processed.

    Raises:
        FileNotFoundError: If ``base_path`` does not exist.
    """
    if not base_path.exists():
        raise FileNotFoundError(f"Base path does not exist: {base_path}")

    if verbose:
        print("Configuration:")
        print(f" Base path: {base_path}")
        print(f" Threads: {threads}")
        print(f" Channels: {channels}")
        print(f" Ranks: {ranks}")
        print(f" Top K: {top_k}")
        print(f" Output file: {output_file}")
        print()

    all_results: List[Dict[str, Any]] = []

    print("Processing Excel files...")
    total_files = len(threads) * len(channels) * len(ranks)
    file_count = 0

    # Reports live at <base>/<N>thread/individual_reports/perf_<C>ch_rank<R>.xlsx
    for thread_count in threads:
        thread_dir = base_path / f"{thread_count}thread" / "individual_reports"

        for channel in channels:
            for rank in ranks:
                file_name = f"perf_{channel}ch_rank{rank}.xlsx"
                file_path = thread_dir / file_name

                file_count += 1
                if verbose:
                    print(f"Processing {file_count}/{total_files}: {file_name}")

                # Missing files are expected (partial sweeps); skip quietly.
                if not file_path.exists():
                    if verbose:
                        print(f" Warning: File not found: {file_path}")
                    continue

                results = process_excel_file(
                    file_path, thread_count, channel, rank, top_k
                )
                if results:
                    all_results.extend(results)
                    if verbose:
                        print(f" Found {len(results)} kernels")

    if not all_results:
        print("Error: No data extracted!")
        return None

    # Sort by time_diff_us descending across all files.
    print("\nCombining and sorting results...")
    all_results.sort(key=lambda x: x["time_diff_us"], reverse=True)

    # Union of keys across rows (the per-file "col_*" extras can differ).
    all_keys: set = set()
    for row in all_results:
        all_keys.update(row.keys())

    # Order columns: metadata first, then the rest alphabetically.
    metadata_cols = [
        "threads",
        "channel",
        "rank",
        "kernel_name",
        "kernel_time_min_us",
        "kernel_time_max_us",
        "time_diff_us",
    ]
    metadata_set = set(metadata_cols)
    other_cols = sorted(k for k in all_keys if k not in metadata_set)
    ordered_cols = metadata_cols + other_cols

    # Relative output names resolve under the analysis directory.
    output_path = Path(output_file)
    if not output_path.is_absolute():
        output_path = base_path / output_file

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=ordered_cols)
        writer.writeheader()
        for row in all_results:
            # Fill in missing keys with None so every row has all columns.
            writer.writerow({k: row.get(k, None) for k in ordered_cols})

    print(f"\nResults saved to: {output_path}")
    print(f"Total rows: {len(all_results)}")

    print(f"\nTop {min(10, len(all_results))} kernels by time difference:")
    for i, row in enumerate(all_results[:10]):
        print(
            f"{i+1}. threads={row['threads']}, ch={row['channel']}, rank={row['rank']}, "
            f"diff={row['time_diff_us']:.4f}us"
        )
        print(f" {row['kernel_name'][:100]}...")

    time_diffs = [r["time_diff_us"] for r in all_results]
    kernel_names = set(r["kernel_name"] for r in all_results)

    print("\nSummary Statistics:")
    print(f"Total unique kernels: {len(kernel_names)}")
    print(f"Average time difference: {sum(time_diffs)/len(time_diffs):.4f} us")
    print(f"Max time difference: {max(time_diffs):.4f} us")
    print(f"Min time difference: {min(time_diffs):.4f} us")

    return output_path
+ +Generates individual per-rank performance reports and multi-rank collective reports +using TraceLens with GEMM patches for ROCm Tensile kernel recognition. +""" + +from pathlib import Path +from typing import List, Optional, Tuple +import numpy as np +import pandas as pd + +from .tracelens_wrapper import TraceLensWrapper + + +def geometric_mean(values: np.ndarray) -> float: + """Calculate geometric mean, handling zeros.""" + values = np.array(values) + values = np.where(values == 0, 1e-10, values) + return float(np.exp(np.mean(np.log(values)))) + + +def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]: + """ + Auto-detect directory structure for traces. + + Args: + input_dir: Input directory path + + Returns: + Tuple of (torch_profiler_dir, base_dir) + + Raises: + ValueError: If directory structure cannot be determined + """ + # Check if input_dir contains rank directories (i.e., it IS torch_profiler/) + rank_dirs = list(input_dir.glob("rank*")) + if rank_dirs: + return input_dir, input_dir.parent + + # Check if input_dir contains torch_profiler/ subdirectory + torch_prof_dir = input_dir / "torch_profiler" + if torch_prof_dir.exists(): + rank_dirs = list(torch_prof_dir.glob("rank*")) + if rank_dirs: + return torch_prof_dir, input_dir + + raise ValueError( + f"Cannot find rank directories in expected structure.\n" + f"Expected one of:\n" + f" 1. Directory with rank0/, rank1/, ... subdirectories (torch_profiler/)\n" + f" 2. Parent directory containing torch_profiler/rank0/, rank1/, ...\n" + f"Provided: {input_dir}" + ) + + +def find_trace_file(rank_dir: Path) -> Optional[Path]: + """Find trace file in a rank directory.""" + json_files = list(rank_dir.glob("*.json")) + if json_files: + return json_files[0] + return None + + +def process_gpu_timeline( + reports_dir: Path, + use_geo_mean: bool = False, + verbose: bool = False, +) -> Optional[Path]: + """ + Create mean/geometric mean aggregated GPU timeline across all ranks. 
+ + Args: + reports_dir: Path to individual_reports directory + use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean + verbose: Whether to print verbose output + + Returns: + Path to output Excel file or None if no data processed + """ + if not reports_dir.exists(): + raise FileNotFoundError(f"Directory not found: {reports_dir}") + + print(f"Processing GPU timeline from: {reports_dir}") + print(f"Aggregation: {'Geometric Mean' if use_geo_mean else 'Arithmetic Mean'}") + + perf_files = sorted(reports_dir.glob("perf_rank*.xlsx")) + + if not perf_files: + print("Error: No perf_rank*.xlsx files found") + return None + + print(f"Found {len(perf_files)} rank files") + + rank_data = [] + for file_path in perf_files: + rank_num = int(file_path.stem.replace("perf_rank", "")) + try: + df = pd.read_excel(file_path, sheet_name="gpu_timeline") + df["rank"] = rank_num + rank_data.append(df) + if verbose: + print(f" Rank {rank_num}: OK") + except Exception as e: + print(f" Rank {rank_num}: Error - {e}") + + if not rank_data: + print("Error: No valid data loaded") + return None + + combined = pd.concat(rank_data, ignore_index=True) + + agg_func = geometric_mean if use_geo_mean else "mean" + aggregated = ( + combined.groupby("type") + .agg({"time ms": agg_func, "percent": agg_func}) + .reset_index() + ) + + aggregated["num_ranks"] = len(perf_files) + + method_suffix = "geomean" if use_geo_mean else "mean" + output_path = reports_dir.parent / f"gpu_timeline_summary_{method_suffix}.xlsx" + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + aggregated.to_excel(writer, sheet_name="Summary", index=False) + + combined_sorted = combined.sort_values(["rank", "type"]) + combined_sorted.to_excel(writer, sheet_name="All_Ranks_Combined", index=False) + + per_rank = combined.pivot_table( + values="time ms", index="type", columns="rank", aggfunc="first" + ) + per_rank.to_excel(writer, sheet_name="Per_Rank_Time_ms") + + per_rank_pct = combined.pivot_table( + 
values="percent", index="type", columns="rank", aggfunc="first" + ) + per_rank_pct.to_excel(writer, sheet_name="Per_Rank_Percent") + + print(f"\nSaved: {output_path}") + print("\nSummary:") + print(aggregated.to_string(index=False)) + + return output_path + + +def analyze_single_config( + input_dir: Path, + output_dir: Optional[Path] = None, + run_individual: bool = True, + run_collective: bool = True, + aggregate_timeline: bool = True, + use_geo_mean: bool = False, + short_kernel_threshold_us: int = 50, + topk_ops: int = 100, + verbose: bool = False, +) -> dict: + """ + Run TraceLens analysis on a single configuration trace directory. + + Args: + input_dir: Path to trace directory (torch_profiler/ or its parent) + output_dir: Output directory (default: input_dir/tracelens_analysis) + run_individual: Generate individual per-rank reports + run_collective: Generate multi-rank collective report + aggregate_timeline: Aggregate GPU timeline across ranks + use_geo_mean: Use geometric mean for aggregation + short_kernel_threshold_us: Threshold for short kernel study + topk_ops: Number of top operations to include + verbose: Whether to print verbose output + + Returns: + Dictionary with paths to generated reports + """ + input_path = Path(input_dir) + + # Detect directory structure + torch_prof_dir, base_dir = detect_trace_directory(input_path) + + # Set output directory + if output_dir is None: + output_path = base_dir / "tracelens_analysis" + else: + output_path = Path(output_dir) + + output_path.mkdir(parents=True, exist_ok=True) + individual_reports_dir = output_path / "individual_reports" + collective_reports_dir = output_path / "collective_reports" + + if run_individual: + individual_reports_dir.mkdir(parents=True, exist_ok=True) + if run_collective: + collective_reports_dir.mkdir(parents=True, exist_ok=True) + + # Detect ranks + rank_dirs = sorted(torch_prof_dir.glob("rank*")) + num_ranks = len(rank_dirs) + + if num_ranks == 0: + raise ValueError(f"No rank 
directories found in {torch_prof_dir}") + + print("=" * 80) + print("TraceLens Analysis - Single Configuration") + print("=" * 80) + print(f"\nInput directory: {input_path}") + print(f"Torch profiler traces: {torch_prof_dir}") + print(f"Detected {num_ranks} ranks") + print(f"Output directory: {output_path}") + + results = { + "output_dir": output_path, + "individual_reports": [], + "collective_report": None, + "gpu_timeline_summary": None, + } + + # Initialize TraceLens wrapper + wrapper = TraceLensWrapper(verbose=verbose) + + # Step 1: Generate individual reports + if run_individual: + print("\n" + "=" * 80) + print("Step 1: Generating Individual Performance Reports") + print("=" * 80) + + for rank_dir in rank_dirs: + rank_name = rank_dir.name + # Extract rank number + if rank_name.startswith("rank"): + rank_num = rank_name[4:] # Remove "rank" prefix + try: + rank_num = int(rank_num.lstrip("_").lstrip("0") or "0") + except ValueError: + rank_num = rank_name + + trace_file = find_trace_file(rank_dir) + if trace_file is None: + print(f" Skip {rank_name} - no trace file found") + continue + + output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx" + + print(f"\nProcessing {rank_name}...") + print(f" Trace: {trace_file.name}") + + try: + wrapper.generate_perf_report( + trace_path=trace_file, + output_path=output_file, + include_unlinked_kernels=True, + short_kernel_study=True, + short_kernel_threshold_us=short_kernel_threshold_us, + topk_ops=topk_ops, + topk_roofline_ops=topk_ops, + ) + print(f" Done: {output_file.name}") + results["individual_reports"].append(output_file) + except Exception as e: + print(f" Error processing {rank_name}: {e}") + + # Step 2: Generate collective report + if run_collective: + print("\n" + "=" * 80) + print("Step 2: Generating Multi-Rank Collective Report") + print("=" * 80) + + output_file = collective_reports_dir / "collective_all_ranks.xlsx" + + # Create trace.json symlinks for consistent pattern + for rank_dir in rank_dirs: 
+ trace_file = find_trace_file(rank_dir) + if trace_file: + symlink_path = rank_dir / "trace.json" + if not symlink_path.exists(): + try: + symlink_path.symlink_to(trace_file.name) + except (OSError, FileExistsError): + pass # Symlink already exists or cannot be created + + trace_pattern = str(torch_prof_dir / "rank*" / "trace.json") + + print(f"\nGenerating collective report for {num_ranks} ranks...") + print(f" Trace pattern: rank*/trace.json") + + try: + wrapper.generate_collective_report( + trace_pattern=trace_pattern, + world_size=num_ranks, + output_path=output_file, + detailed_analysis=True, + use_multiprocessing=True, + ) + print(f" Done: {output_file.name}") + results["collective_report"] = output_file + except Exception as e: + print(f" Error generating collective report: {e}") + + # Step 3: Aggregate GPU timeline + if aggregate_timeline and run_individual: + print("\n" + "=" * 80) + print("Step 3: Aggregating GPU Timeline") + print("=" * 80) + + try: + summary_path = process_gpu_timeline( + reports_dir=individual_reports_dir, + use_geo_mean=use_geo_mean, + verbose=verbose, + ) + results["gpu_timeline_summary"] = summary_path + except Exception as e: + print(f" Error aggregating GPU timeline: {e}") + + # Print summary + print("\n" + "=" * 80) + print("Analysis Complete!") + print("=" * 80) + print(f"\n📁 Results saved to: {output_path}") + print(f"\nGenerated reports:") + print(f" Individual reports: {len(results['individual_reports'])}") + print(f" Collective report: {'Yes' if results['collective_report'] else 'No'}") + print(f" GPU timeline summary: {'Yes' if results['gpu_timeline_summary'] else 'No'}") + + if results["individual_reports"]: + print("\n📊 Individual Performance Reports:") + for report in results["individual_reports"]: + print(f" {report.name}") + + if results["collective_report"]: + print(f"\n📊 Collective Report:") + print(f" {results['collective_report'].name}") + + if results["gpu_timeline_summary"]: + print(f"\n📊 GPU Timeline Summary:") 
+ print(f" {results['gpu_timeline_summary'].name}") + + return results + diff --git a/src/aorta/report/analysis/analyze_sweep.py b/src/aorta/report/analysis/analyze_sweep.py new file mode 100644 index 0000000..9626769 --- /dev/null +++ b/src/aorta/report/analysis/analyze_sweep.py @@ -0,0 +1,412 @@ +""" +Sweep configuration analysis - analyze traces from parameter sweep experiments. + +Processes GPU timeline data from TraceLens individual reports across multiple +thread and channel configurations, aggregating across ranks. +""" + +import glob +from pathlib import Path +from typing import Dict, List, Optional, Any + +import numpy as np +import pandas as pd + + +def geometric_mean(values: np.ndarray) -> float: + """Calculate geometric mean, handling zeros.""" + values = np.array(values) + # Replace zeros with small value to avoid log(0) + values = np.where(values == 0, 1e-10, values) + return float(np.exp(np.mean(np.log(values)))) + + +def parse_perf_filename(filename: str) -> tuple: + """ + Parse performance filename to extract channel config and rank. + + Args: + filename: e.g., 'perf_28ch_rank0.xlsx' + + Returns: + tuple: (channel_config, rank) e.g., ('28ch', 0) + """ + parts = filename.replace("perf_", "").replace(".xlsx", "").split("_") + channel_config = parts[0] # e.g., "28ch" + rank = int(parts[1].replace("rank", "")) + return channel_config, rank + + +def group_files_by_channel(perf_files: List[str]) -> Dict[str, List[tuple]]: + """ + Group performance files by channel configuration. 
+ + Args: + perf_files: List of file paths + + Returns: + dict: {channel_config: [(rank, file_path), ...]} + """ + channel_groups = {} + for file_path in perf_files: + filename = Path(file_path).name + channel_config, rank = parse_perf_filename(filename) + + if channel_config not in channel_groups: + channel_groups[channel_config] = [] + channel_groups[channel_config].append((rank, file_path)) + + return channel_groups + + +def read_rank_data(rank_files: List[tuple], verbose: bool = False) -> List[pd.DataFrame]: + """ + Read gpu_timeline data from all rank files. + + Args: + rank_files: List of (rank, file_path) tuples + verbose: Whether to print verbose output + + Returns: + list: List of DataFrames with rank column added + """ + rank_data = [] + for rank, file_path in rank_files: + try: + df = pd.read_excel(file_path, sheet_name="gpu_timeline") + df["rank"] = rank + rank_data.append(df) + except Exception as e: + if verbose: + print(f" Warning: Could not read {Path(file_path).name}: {e}") + return rank_data + + +def aggregate_rank_data( + rank_data: List[pd.DataFrame], + thread_config: str, + channel_config: str, + num_ranks: int, + use_geo_mean: bool, +) -> pd.DataFrame: + """ + Aggregate data across ranks and add metadata. 
+ + Args: + rank_data: List of DataFrames + thread_config: Thread configuration string (e.g., '256thread') + channel_config: Channel configuration string (e.g., '28ch') + num_ranks: Number of ranks + use_geo_mean: Whether to use geometric mean + + Returns: + DataFrame: Aggregated data with metadata + """ + combined = pd.concat(rank_data, ignore_index=True) + + agg_func = geometric_mean if use_geo_mean else "mean" + aggregated = ( + combined.groupby("type") + .agg({"time ms": agg_func, "percent": agg_func}) + .reset_index() + ) + + # Add metadata + aggregated["thread_config"] = thread_config + aggregated["threads_num"] = int(thread_config.replace("thread", "")) + aggregated["channel_config"] = channel_config + aggregated["channels_num"] = int(channel_config.replace("ch", "")) + aggregated["full_config"] = f"{thread_config}_{channel_config}" + aggregated["num_ranks"] = num_ranks + + return aggregated + + +def process_channel_config( + channel_config: str, + channel_groups: Dict[str, List[tuple]], + use_geo_mean: bool, + thread_config: str, + verbose: bool = False, +) -> Optional[pd.DataFrame]: + """ + Process a single channel configuration. 
+ + Args: + channel_config: Channel configuration string + channel_groups: Dict of channel groups + use_geo_mean: Whether to use geometric mean + thread_config: Thread configuration string + verbose: Whether to print verbose output + + Returns: + DataFrame or None: Aggregated data, or None if no valid data + """ + rank_files = sorted(channel_groups[channel_config], key=lambda x: x[0]) + num_ranks = len(rank_files) + + if verbose: + print(f" {channel_config}: Processing {num_ranks} ranks...") + + rank_data = read_rank_data(rank_files, verbose) + + if not rank_data: + if verbose: + print(f" No valid data for {channel_config}") + return None + + aggregated = aggregate_rank_data( + rank_data, thread_config, channel_config, num_ranks, use_geo_mean + ) + if verbose: + print(f" [OK] Aggregated across {num_ranks} ranks") + + return aggregated + + +def process_thread_config( + thread_config: str, + tracelens_dir: Path, + use_geo_mean: bool, + verbose: bool = False, +) -> List[pd.DataFrame]: + """ + Process a single thread configuration. 
+ + Args: + thread_config: Thread configuration string + tracelens_dir: Path to tracelens_analysis directory + use_geo_mean: Whether to use geometric mean + verbose: Whether to print verbose output + + Returns: + list: List of aggregated DataFrames + """ + individual_reports_dir = tracelens_dir / thread_config / "individual_reports" + + if not individual_reports_dir.exists(): + if verbose: + print(f" Warning: {individual_reports_dir} not found, skipping...") + return [] + + if verbose: + print(f"\nProcessing: {thread_config}") + print("-" * 60) + + perf_files = sorted(glob.glob(str(individual_reports_dir / "perf_*ch_rank*.xlsx"))) + + if not perf_files: + if verbose: + print(f" Warning: No performance files found in {individual_reports_dir}") + return [] + + channel_groups = group_files_by_channel(perf_files) + results = [] + + # Process each channel configuration (sorted by channel number) + sorted_channels = sorted( + channel_groups.keys(), key=lambda x: int(x.replace("ch", "")) + ) + for channel_config in sorted_channels: + aggregated = process_channel_config( + channel_config, channel_groups, use_geo_mean, thread_config, verbose + ) + if aggregated is not None: + results.append(aggregated) + + return results + + +def create_pivot_sheet(df: pd.DataFrame, value_col: str) -> pd.DataFrame: + """ + Create a pivot table from the dataframe. + + Args: + df: Source DataFrame + value_col: Column to use for values + + Returns: + DataFrame: Pivot table + """ + return df.pivot_table( + values=value_col, index="type", columns="full_config", aggfunc="first" + ) + + +def create_summary_sheet(df: pd.DataFrame) -> pd.DataFrame: + """ + Create a summary sheet with key metrics per configuration. 
+ + Args: + df: Source DataFrame + + Returns: + DataFrame: Summary table + """ + summary = ( + df.groupby("full_config") + .agg({"threads_num": "first", "channels_num": "first", "num_ranks": "first"}) + .reset_index() + ) + + # Add key metrics for each config + key_metrics = [ + "computation_time", + "exposed_comm_time", + "busy_time", + "idle_time", + "total_time", + ] + for metric_type in key_metrics: + metric_data = df[df["type"] == metric_type].set_index("full_config")["time ms"] + summary[f"{metric_type}_ms"] = summary["full_config"].map(metric_data) + + return summary + + +def print_summary_report(final_df: pd.DataFrame, verbose: bool = False) -> None: + """Print summary statistics and comparisons.""" + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + + print("\nMetric Types Found:") + for metric_type in sorted(final_df["type"].unique()): + count = len(final_df[final_df["type"] == metric_type]) + print(f" {metric_type:<25} ({count} configurations)") + + print("\nConfigurations Processed:") + configs = final_df.groupby("full_config")["num_ranks"].first().sort_index() + for config, num_ranks in configs.items(): + print(f" {config:<25} ({num_ranks} ranks)") + + if verbose: + print("\n" + "=" * 80) + print("KEY METRICS COMPARISON (Sorted by Busy Time)") + print("=" * 80) + + for metric, desc in [ + ("busy_time", "Busy Time (lower is better)"), + ("idle_time", "Idle Time (lower is better)"), + ]: + metric_data = final_df[final_df["type"] == metric][ + ["full_config", "time ms", "percent"] + ].sort_values("time ms") + print(f"\n{desc}:") + print(metric_data.to_string(index=False)) + + +def analyze_sweep_config( + sweep_dir: Path, + output_dir: Optional[Path] = None, + use_geo_mean: bool = False, + verbose: bool = False, +) -> Optional[Path]: + """ + Process GPU timeline data from all individual reports in a sweep. 
+ + Args: + sweep_dir: Path to sweep directory containing tracelens_analysis/ + output_dir: Output directory (default: sweep_dir/tracelens_analysis/) + use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean + verbose: Whether to print verbose output + + Returns: + Path to output Excel file or None if no data processed + """ + sweep_path = Path(sweep_dir) + tracelens_dir = sweep_path / "tracelens_analysis" + + if not tracelens_dir.exists(): + raise FileNotFoundError( + f"tracelens_analysis directory not found in {sweep_dir}" + ) + + agg_method = "Geometric Mean" if use_geo_mean else "Arithmetic Mean" + print("=" * 80) + print(f"Processing GPU Timeline data from: {sweep_dir}") + print(f"Aggregation method: {agg_method}") + print("=" * 80) + + # Find all thread configurations + thread_configs = [ + d.name for d in tracelens_dir.iterdir() if d.is_dir() and "thread" in d.name + ] + + if not thread_configs: + raise ValueError("No thread configuration directories found") + + print(f"\nFound thread configurations: {sorted(thread_configs)}") + + # Process all thread configurations + all_results = [] + for thread_config in sorted(thread_configs): + results = process_thread_config(thread_config, tracelens_dir, use_geo_mean, verbose) + all_results.extend(results) + + if not all_results: + print("\nError: No data was processed") + return None + + # Combine and format results + print("\n" + "=" * 80) + print("CREATING OUTPUT FILE") + print("=" * 80) + + final_df = pd.concat(all_results, ignore_index=True) + + # Reorder and sort + column_order = [ + "full_config", + "threads_num", + "thread_config", + "channels_num", + "channel_config", + "num_ranks", + "type", + "time ms", + "percent", + ] + final_df = final_df[column_order] + final_df = final_df.sort_values(["threads_num", "channels_num", "type"]) + + # Determine output path + if output_dir: + output_path = Path(output_dir) + else: + output_path = tracelens_dir + + method_suffix = "geomean" if use_geo_mean 
else "mean" + output_file = output_path / f"gpu_timeline_all_configs_{method_suffix}.xlsx" + + # Save to Excel with multiple sheets + with pd.ExcelWriter(output_file, engine="openpyxl") as writer: + final_df.to_excel(writer, sheet_name="All_Data", index=False) + create_pivot_sheet(final_df, "time ms").to_excel( + writer, sheet_name="Pivot_Time_ms" + ) + create_pivot_sheet(final_df, "percent").to_excel( + writer, sheet_name="Pivot_Percent" + ) + create_summary_sheet(final_df).to_excel( + writer, sheet_name="Summary_By_Config", index=False + ) + + print(f"[SAVED] {output_file}") + print(" Sheets created:") + print(" 1. All_Data - Complete dataset") + print(" 2. Pivot_Time_ms - Matrix view of time (ms)") + print(" 3. Pivot_Percent - Matrix view of percentages") + print(" 4. Summary_By_Config - Key metrics per configuration") + + # Print summary + print_summary_report(final_df, verbose) + + print("\n" + "=" * 80) + print("COMPLETE!") + print("=" * 80) + print(f"\nOutput file: {output_file}") + print("Open in Excel to create custom pivots and charts!") + + return output_file + diff --git a/src/aorta/report/analysis/tracelens_wrapper.py b/src/aorta/report/analysis/tracelens_wrapper.py new file mode 100644 index 0000000..3b857a6 --- /dev/null +++ b/src/aorta/report/analysis/tracelens_wrapper.py @@ -0,0 +1,342 @@ +""" +TraceLens wrapper with GEMM recognition patches. + +Applies patches to TraceLens for better ROCm Tensile kernel recognition +and provides a clean Python API for TraceLens commands. 
+""" + +import re +import sys +from pathlib import Path +from typing import List, Optional, Dict, Any + + +class TraceLensWrapper: + """GEMM-patched TraceLens wrapper.""" + + _patches_applied = False + + def __init__(self, verbose: bool = False): + """Initialize wrapper and apply GEMM patches.""" + self.verbose = verbose + if not TraceLensWrapper._patches_applied: + self._apply_gemm_patches() + TraceLensWrapper._patches_applied = True + + def _log(self, message: str) -> None: + """Log message if verbose mode is enabled.""" + if self.verbose: + print(message) + + def _apply_gemm_patches(self) -> None: + """Apply all GEMM recognition patches to TraceLens.""" + self._log("Applying TraceLens GEMM recognition patches...") + + # Patch kernel_name_parser for enhanced ROCm GEMM recognition + try: + from TraceLens.PerfModel import kernel_name_parser + + def patched_is_rocm_gemm(kernel_name): + """Enhanced ROCm GEMM pattern matching for Tensile kernels.""" + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + return bool(re.match(pattern, kernel_name)) + + def patched_parse_rocm_gemm(kernel_name): + """Parse ROCm GEMM kernel details.""" + trans_a, trans_b = None, None + if "_Ailk_" in kernel_name: + trans_a = False + elif "_Alik_" in kernel_name: + trans_a = True + if "_Bljk_" in kernel_name: + trans_b = False + elif "_Bjlk_" in kernel_name: + trans_b = True + + macro_tile_match = re.search(r"MT(\d+)x(\d+)x(\d+)", kernel_name) + if macro_tile_match: + mt_m = int(macro_tile_match.group(1)) + mt_n = int(macro_tile_match.group(2)) + depth_u = int(macro_tile_match.group(3)) + else: + mt_m, mt_n, depth_u = None, None, None + + return { + "transpose": (trans_a, trans_b), + "mt_m": mt_m, + "mt_n": mt_n, + "depth_u": depth_u, + } + + def patched_gemm_name_parser(kernel_name): + """Enhanced GEMM name parser with better ROCm support.""" + if patched_is_rocm_gemm(kernel_name): + return patched_parse_rocm_gemm(kernel_name) + elif kernel_name_parser.is_cuda_gemm(kernel_name): + return 
kernel_name_parser.parse_cuda_gemm(kernel_name) + return None + + kernel_name_parser.is_rocm_gemm = patched_is_rocm_gemm + kernel_name_parser.parse_rocm_gemm = patched_parse_rocm_gemm + kernel_name_parser.gemm_name_parser = patched_gemm_name_parser + + self._log(" [OK] Patched kernel_name_parser (ROCm GEMM recognition)") + except ImportError as e: + self._log(f" [WARN] Could not patch kernel_name_parser: {e}") + + # Patch Trace2Tree util for is_gemm_kernel function + try: + from TraceLens.Trace2Tree import util as trace_util + + def patched_is_gemm_kernel(kernel_event: dict) -> bool: + """Enhanced GEMM kernel detection.""" + assert kernel_event["cat"] == "kernel" + kernel_name = kernel_event["name"] + + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + is_rocm_gemm = bool(re.match(pattern, kernel_name)) + is_cuda_gemm = kernel_name.startswith("nvjet") or "cublasLt" in kernel_name + + return is_rocm_gemm or is_cuda_gemm + + trace_util.is_gemm_kernel = patched_is_gemm_kernel + self._log(" [OK] Patched Trace2Tree.util (is_gemm_kernel)") + except ImportError as e: + self._log(f" [WARN] Could not patch Trace2Tree.util: {e}") + + # Patch TraceEventUtils to enhance GEMM keys + try: + from TraceLens import util as tracelens_util + + if hasattr(tracelens_util, "TraceEventUtils"): + if hasattr(tracelens_util.TraceEventUtils, "JaxOpKeys"): + original_gemm_keys = tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys + enhanced_gemm_keys = [ + "Cijk", + "gemm", + "nvjet", + "cublasLt", + "C[a-z]{3}_A[a-z]{3}_B[a-z]{3}", + ] + all_keys = list(set(original_gemm_keys + enhanced_gemm_keys)) + tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys = all_keys + self._log(" [OK] Patched TraceEventUtils.JaxOpKeys (GEMM keys enhanced)") + except (ImportError, AttributeError) as e: + self._log(f" [WARN] Could not patch TraceEventUtils: {e}") + + # Patch torch_op_mapping for better categorization + try: + from TraceLens.PerfModel import torch_op_mapping + + original_categorize = 
torch_op_mapping.categorize_torch_op + + def patched_categorize_torch_op(row): + """Enhanced categorization with better GEMM detection.""" + result = original_categorize(row) + + if result == "other" and "kernel_details" in row and len(row["kernel_details"]) > 0: + kernel_name = row["kernel_details"][0]["name"] + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + if re.match(pattern, kernel_name): + return "GEMM" + + return result + + torch_op_mapping.categorize_torch_op = patched_categorize_torch_op + self._log(" [OK] Patched torch_op_mapping (categorize_torch_op)") + except ImportError as e: + self._log(f" [WARN] Could not patch torch_op_mapping: {e}") + + self._log("[OK] GEMM patches applied\n") + + def generate_perf_report( + self, + trace_path: Path, + output_path: Path, + include_unlinked_kernels: bool = True, + short_kernel_study: bool = True, + short_kernel_threshold_us: int = 50, + topk_ops: int = 100, + topk_roofline_ops: int = 100, + enable_kernel_summary: bool = False, + ) -> Path: + """ + Generate individual performance report from trace data. 
+ + Args: + trace_path: Path to the trace JSON file + output_path: Path for output Excel file + include_unlinked_kernels: Include unlinked kernels in report + short_kernel_study: Enable short kernel study + short_kernel_threshold_us: Threshold for short kernels (microseconds) + topk_ops: Number of top operations to include + topk_roofline_ops: Number of top roofline operations + enable_kernel_summary: Enable kernel summary sheet + + Returns: + Path to generated report + """ + from TraceLens.Reporting.generate_perf_report_pytorch import main as generate_main + + # Build argument list + args = [ + "--profile_json_path", str(trace_path), + "--output_xlsx_path", str(output_path), + ] + + if include_unlinked_kernels: + args.append("--include_unlinked_kernels") + if short_kernel_study: + args.append("--short_kernel_study") + args.extend(["--short_kernel_threshold_us", str(short_kernel_threshold_us)]) + if topk_ops: + args.extend(["--topk_ops", str(topk_ops)]) + if topk_roofline_ops: + args.extend(["--topk_roofline_ops", str(topk_roofline_ops)]) + if enable_kernel_summary: + args.append("--enable_kernel_summary") + + # Save original argv and replace + original_argv = sys.argv + sys.argv = ["generate_perf_report_pytorch"] + args + + try: + generate_main() + finally: + sys.argv = original_argv + + return output_path + + def generate_perf_report_rocprof( + self, + trace_path: Path, + output_path: Path, + kernel_details: bool = True, + short_kernel_study: bool = True, + short_kernel_threshold_us: int = 50, + topk_kernels: int = 100, + ) -> Path: + """ + Generate performance report from rocprof trace data. 
+ + Args: + trace_path: Path to the rocprof results JSON file + output_path: Path for output Excel file + kernel_details: Include kernel details + short_kernel_study: Enable short kernel study + short_kernel_threshold_us: Threshold for short kernels + topk_kernels: Number of top kernels to include + + Returns: + Path to generated report + """ + from TraceLens.Reporting.generate_perf_report_rocprof import main as generate_main + + args = [ + "--profile_json_path", str(trace_path), + "--output_xlsx_path", str(output_path), + ] + + if kernel_details: + args.append("--kernel_details") + if short_kernel_study: + args.append("--short_kernel_study") + args.extend(["--short_kernel_threshold_us", str(short_kernel_threshold_us)]) + if topk_kernels: + args.extend(["--topk_kernels", str(topk_kernels)]) + + original_argv = sys.argv + sys.argv = ["generate_perf_report_rocprof"] + args + + try: + generate_main() + finally: + sys.argv = original_argv + + return output_path + + def generate_collective_report( + self, + trace_pattern: str, + world_size: int, + output_path: Path, + detailed_analysis: bool = True, + use_multiprocessing: bool = True, + ) -> Path: + """ + Generate multi-rank collective report. 
+ + Args: + trace_pattern: Glob pattern for trace files (e.g., "rank*/trace.json") + world_size: Number of ranks + output_path: Path for output Excel file + detailed_analysis: Enable detailed analysis + use_multiprocessing: Use multiprocessing for parallel analysis + + Returns: + Path to generated report + """ + from TraceLens.Reporting.generate_multi_rank_collective_report_pytorch import ( + main as generate_main, + ) + + args = [ + "--trace_pattern", str(trace_pattern), + "--world_size", str(world_size), + "--output_xlsx_path", str(output_path), + ] + + if detailed_analysis: + args.append("--detailed_analysis") + if use_multiprocessing: + args.append("--use_multiprocessing") + + original_argv = sys.argv + sys.argv = ["generate_multi_rank_collective_report_pytorch"] + args + + try: + generate_main() + finally: + sys.argv = original_argv + + return output_path + + def compare_reports( + self, + report_paths: List[Path], + names: List[str], + output_path: Path, + sheets: Optional[List[str]] = None, + ) -> Path: + """ + Compare multiple performance reports. 
+ + Args: + report_paths: List of paths to Excel reports + names: Names for each report + output_path: Path for output comparison file + sheets: Sheets to compare (default: gpu_timeline, ops_summary) + + Returns: + Path to generated comparison report + """ + from TraceLens.Reporting.compare_perf_reports_pytorch import main as compare_main + + if sheets is None: + sheets = ["gpu_timeline", "ops_summary"] + + args = [str(p) for p in report_paths] + args.extend(["--names"] + names) + args.extend(["--sheets"] + sheets) + args.extend(["-o", str(output_path)]) + + original_argv = sys.argv + sys.argv = ["compare_perf_reports_pytorch"] + args + + try: + compare_main() + finally: + sys.argv = original_argv + + return output_path + diff --git a/src/aorta/report/cli.py b/src/aorta/report/cli.py index 33eadd9..e6e08b7 100644 --- a/src/aorta/report/cli.py +++ b/src/aorta/report/cli.py @@ -67,9 +67,15 @@ def analyze(ctx): @click.argument("trace_dir", type=click.Path(exists=True)) @click.option("--individual-only", is_flag=True, help="Generate only individual reports") @click.option("--collective-only", is_flag=True, help="Generate only collective report") +@click.option("--geo-mean", is_flag=True, help="Use geometric mean for timeline aggregation") +@click.option("--short-kernel-threshold", default=50, type=int, + help="Threshold for short kernel study (microseconds)") +@click.option("--topk-ops", default=100, type=int, + help="Number of top operations to include") @click.option("-o", "--output", type=click.Path(), help="Output directory") @click.pass_context -def analyze_single(ctx, trace_dir, individual_only, collective_only, output): +def analyze_single(ctx, trace_dir, individual_only, collective_only, geo_mean, + short_kernel_threshold, topk_ops, output): """Analyze a single configuration trace directory. TRACE_DIR: Path to the trace directory containing rank subdirectories. 
@@ -80,50 +86,117 @@ def analyze_single(ctx, trace_dir, individual_only, collective_only, output): aorta-report analyze single /path/to/traces --individual-only aorta-report analyze single /path/to/traces -o ./results """ - click.echo(f"[analyze single] trace_dir={trace_dir}") - click.echo(f" individual_only={individual_only}, collective_only={collective_only}") - click.echo(f" output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + from .analysis import analyze_single_config + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + + run_individual = not collective_only + run_collective = not individual_only + + try: + results = analyze_single_config( + input_dir=Path(trace_dir), + output_dir=Path(output) if output else None, + run_individual=run_individual, + run_collective=run_collective, + aggregate_timeline=run_individual, + use_geo_mean=geo_mean, + short_kernel_threshold_us=short_kernel_threshold, + topk_ops=topk_ops, + verbose=verbose, + ) + if not quiet: + click.echo(f"\nAnalysis complete: {results['output_dir']}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) @analyze.command("sweep") @click.argument("sweep_dir", type=click.Path(exists=True)) -@click.option("--rocprof", is_flag=True, help="Use rocprof traces instead of PyTorch profiler") +@click.option("--geo-mean", is_flag=True, help="Use geometric mean instead of arithmetic mean") @click.option("-o", "--output", type=click.Path(), help="Output directory") @click.pass_context -def analyze_sweep(ctx, sweep_dir, rocprof, output): +def analyze_sweep(ctx, sweep_dir, geo_mean, output): """Analyze a sweep directory with multiple configurations. - SWEEP_DIR: Path to the sweep directory containing multiple thread/channel configs. + SWEEP_DIR: Path to the sweep directory containing tracelens_analysis/ + with multiple thread/channel configs. 
\b Examples: aorta-report analyze sweep /path/to/sweep_20251124 - aorta-report analyze sweep /path/to/sweep --rocprof + aorta-report analyze sweep /path/to/sweep --geo-mean """ - click.echo(f"[analyze sweep] sweep_dir={sweep_dir}") - click.echo(f" rocprof={rocprof}, output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + from .analysis import analyze_sweep_config + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + + try: + output_path = analyze_sweep_config( + sweep_dir=Path(sweep_dir), + output_dir=Path(output) if output else None, + use_geo_mean=geo_mean, + verbose=verbose, + ) + if not quiet and output_path: + click.echo(f"\nAnalysis complete: {output_path}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) @analyze.command("gemm") @click.argument("reports_dir", type=click.Path(exists=True)) -@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract") -@click.option("-o", "--output", type=click.Path(), help="Output CSV file") +@click.option("--threads", "-t", multiple=True, type=int, default=(256, 512), + help="Thread configurations to analyze (can be specified multiple times)") +@click.option("--channels", "-c", multiple=True, type=int, default=(28, 42, 56, 70), + help="Channel configurations to analyze (can be specified multiple times)") +@click.option("--ranks", "-r", multiple=True, type=int, + help="Ranks to analyze (default: 0-7)") +@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract per file") +@click.option("-o", "--output", type=click.Path(), + default="top5_gemm_kernels_time_variance.csv", help="Output CSV file") @click.pass_context -def analyze_gemm(ctx, reports_dir, top_k, output): +def analyze_gemm(ctx, reports_dir, threads, channels, ranks, top_k, output): """Analyze GEMM kernels from TraceLens reports. - REPORTS_DIR: Path to directory containing TraceLens Excel reports. 
+ REPORTS_DIR: Path to tracelens_analysis directory containing + {threads}thread/individual_reports/ subdirectories. \b Examples: - aorta-report analyze gemm /path/to/reports + aorta-report analyze gemm /path/to/tracelens_analysis aorta-report analyze gemm /path/to/reports --top-k 10 -o gemm_analysis.csv + aorta-report analyze gemm /path/to/reports -t 256 -t 512 -c 28 -c 42 """ - click.echo(f"[analyze gemm] reports_dir={reports_dir}") - click.echo(f" top_k={top_k}, output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + from .analysis import analyze_gemm_reports + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + + # Convert tuples to lists, use defaults if not specified + threads_list = list(threads) if threads else [256, 512] + channels_list = list(channels) if channels else [28, 42, 56, 70] + ranks_list = list(ranks) if ranks else list(range(8)) + + try: + output_path = analyze_gemm_reports( + base_path=Path(reports_dir), + threads=threads_list, + channels=channels_list, + ranks=ranks_list, + top_k=top_k, + output_file=output, + verbose=verbose, + ) + if not quiet and output_path: + click.echo(f"\nAnalysis complete: {output_path}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) # ============================================================================= @@ -390,9 +463,55 @@ def process_gpu_timeline(ctx, input_dir, mode, geo_mean, output): aorta-report process gpu-timeline /path/to/individual_reports --mode single aorta-report process gpu-timeline /path/to/sweep --mode sweep --geo-mean """ - click.echo(f"[process gpu-timeline] input_dir={input_dir}") - click.echo(f" mode={mode}, geo_mean={geo_mean}, output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + input_path = Path(input_dir) + + # Auto-detect mode + if mode == "auto": + # Check for sweep structure 
(tracelens_analysis with thread directories) + tracelens_dir = input_path / "tracelens_analysis" + if tracelens_dir.exists(): + thread_dirs = [d for d in tracelens_dir.iterdir() if d.is_dir() and "thread" in d.name] + if thread_dirs: + mode = "sweep" + else: + mode = "single" + elif input_path.name == "individual_reports" or list(input_path.glob("perf_rank*.xlsx")): + mode = "single" + elif list(input_path.glob("perf_*ch_rank*.xlsx")): + mode = "sweep" + else: + raise click.ClickException( + f"Could not auto-detect mode. Please specify --mode single or --mode sweep" + ) + + if verbose: + click.echo(f"Auto-detected mode: {mode}") + + try: + if mode == "single": + from .analysis.analyze_single import process_gpu_timeline as process_single + output_path = process_single( + reports_dir=input_path, + use_geo_mean=geo_mean, + verbose=verbose, + ) + else: # sweep + from .analysis import analyze_sweep_config + output_path = analyze_sweep_config( + sweep_dir=input_path, + output_dir=Path(output) if output else None, + use_geo_mean=geo_mean, + verbose=verbose, + ) + + if not quiet and output_path: + click.echo(f"\nProcessing complete: {output_path}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) @process.command("comms") From c1b620c651496e3eb39528bc54729c8d4e25ce1c Mon Sep 17 00:00:00 2001 From: prosenjitdhole Date: Wed, 28 Jan 2026 13:22:20 +0530 Subject: [PATCH 2/2] [WIP] AORTA-17 CLI command for report generation : Added processing scripts (#66) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * AORTA-17 CLI command for report generation : Added processing scripts * AORTA-17 CLI command for report generation : Adding combining scripts… (#67) * AORTA-17 CLI command for report generation : Adding combining scripts under compare subcommand * AORTA-17 CLI command for report generation : Excel generation (#68) * AORTA-17 CLI command for report generation : Generate the excel for single config runs 
* AORTA-17 CLI command for report generation : Generate all the plots (#69) * AORTA-17 CLI command for report generation : Generate all the plots * AORTA-17 CLI command for report generation : Implementation of pipeline (#70) * AORTA-17 CLI command for report generation : Implementation of pipeline * AORTA-17 CLI command for report generation : refactored cli (#71) Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --- src/aorta/report/COMPARE_CMD_DEV_DOCS.md | 1058 +++++++++++++ src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md | 618 ++++++++ src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md | 1383 +++++++++++++++++ src/aorta/report/PIPELINE_DEV_DOCS.md | 1079 +++++++++++++ src/aorta/report/PROCESS_CMD_DEV_DOCS.md | 1009 ++++++++++++ src/aorta/report/USER_GUIDE.md | 826 ++++++++++ src/aorta/report/analysis/cli.py | 158 ++ src/aorta/report/aorta-report-detail-plan.md | 919 +++++------ .../report/aorta-report-functional-spec.md | 94 +- src/aorta/report/cli.py | 605 +------ src/aorta/report/comparison/__init__.py | 13 + src/aorta/report/comparison/cli.py | 254 +++ .../comparison/collective_comparison.py | 238 +++ src/aorta/report/comparison/combine.py | 135 ++ src/aorta/report/comparison/formatting.py | 144 ++ .../comparison/gpu_timeline_comparison.py | 222 +++ src/aorta/report/generators/__init__.py | 8 +- src/aorta/report/generators/cli.py | 291 ++++ src/aorta/report/generators/excel_report.py | 505 ++++++ src/aorta/report/generators/plot_generator.py | 185 +++ .../report/generators/plot_helper/__init__.py | 50 + .../report/generators/plot_helper/common.py | 69 + .../generators/plot_helper/gemm_boxplots.py | 108 ++ .../generators/plot_helper/gemm_data.py | 111 ++ .../plot_helper/gemm_interaction.py | 69 + .../generators/plot_helper/gemm_violin.py | 96 ++ .../generators/plot_helper/gpu_by_rank.py | 74 + .../generators/plot_helper/gpu_heatmap.py | 49 
+ .../plot_helper/gpu_percent_change.py | 65 + .../generators/plot_helper/nccl_charts.py | 140 ++ .../plot_helper/summary_dashboard.py | 98 ++ src/aorta/report/pipelines/__init__.py | 14 + src/aorta/report/pipelines/cli.py | 245 +++ src/aorta/report/pipelines/gemm_pipeline.py | 194 +++ .../report/pipelines/summary_pipeline.py | 412 +++++ src/aorta/report/processing/__init__.py | 13 + src/aorta/report/processing/cli.py | 188 +++ .../report/processing/gpu_timeline_single.py | 143 ++ .../report/processing/gpu_timeline_sweep.py | 435 ++++++ src/aorta/report/processing/process_comms.py | 291 ++++ .../processing/process_gemm_variance.py | 321 ++++ 41 files changed, 11815 insertions(+), 1114 deletions(-) create mode 100644 src/aorta/report/COMPARE_CMD_DEV_DOCS.md create mode 100644 src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md create mode 100644 src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md create mode 100644 src/aorta/report/PIPELINE_DEV_DOCS.md create mode 100644 src/aorta/report/PROCESS_CMD_DEV_DOCS.md create mode 100644 src/aorta/report/USER_GUIDE.md create mode 100644 src/aorta/report/analysis/cli.py create mode 100644 src/aorta/report/comparison/__init__.py create mode 100644 src/aorta/report/comparison/cli.py create mode 100644 src/aorta/report/comparison/collective_comparison.py create mode 100644 src/aorta/report/comparison/combine.py create mode 100644 src/aorta/report/comparison/formatting.py create mode 100644 src/aorta/report/comparison/gpu_timeline_comparison.py create mode 100644 src/aorta/report/generators/cli.py create mode 100644 src/aorta/report/generators/excel_report.py create mode 100644 src/aorta/report/generators/plot_generator.py create mode 100644 src/aorta/report/generators/plot_helper/__init__.py create mode 100644 src/aorta/report/generators/plot_helper/common.py create mode 100644 src/aorta/report/generators/plot_helper/gemm_boxplots.py create mode 100644 src/aorta/report/generators/plot_helper/gemm_data.py create mode 100644 
src/aorta/report/generators/plot_helper/gemm_interaction.py create mode 100644 src/aorta/report/generators/plot_helper/gemm_violin.py create mode 100644 src/aorta/report/generators/plot_helper/gpu_by_rank.py create mode 100644 src/aorta/report/generators/plot_helper/gpu_heatmap.py create mode 100644 src/aorta/report/generators/plot_helper/gpu_percent_change.py create mode 100644 src/aorta/report/generators/plot_helper/nccl_charts.py create mode 100644 src/aorta/report/generators/plot_helper/summary_dashboard.py create mode 100644 src/aorta/report/pipelines/__init__.py create mode 100644 src/aorta/report/pipelines/cli.py create mode 100644 src/aorta/report/pipelines/gemm_pipeline.py create mode 100644 src/aorta/report/pipelines/summary_pipeline.py create mode 100644 src/aorta/report/processing/__init__.py create mode 100644 src/aorta/report/processing/cli.py create mode 100644 src/aorta/report/processing/gpu_timeline_single.py create mode 100644 src/aorta/report/processing/gpu_timeline_sweep.py create mode 100644 src/aorta/report/processing/process_comms.py create mode 100644 src/aorta/report/processing/process_gemm_variance.py diff --git a/src/aorta/report/COMPARE_CMD_DEV_DOCS.md b/src/aorta/report/COMPARE_CMD_DEV_DOCS.md new file mode 100644 index 0000000..ecd0457 --- /dev/null +++ b/src/aorta/report/COMPARE_CMD_DEV_DOCS.md @@ -0,0 +1,1058 @@ +# `compare` Command Group - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Command Specification](#2-command-specification) +3. [Source Script Analysis](#3-source-script-analysis) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Data Flow](#6-data-flow) +7. [Implementation Order](#7-implementation-order) +8. [Expected Output](#8-expected-output) +9. [Testing Strategy](#9-testing-strategy) + +--- + +## 1. 
Overview + +The `compare` command provides functionality to compare baseline and test TraceLens reports. It supports two comparison types: + +| Type | Purpose | Source Scripts | +|------|---------|----------------| +| `gpu_timeline` | Compare GPU timeline reports | `combine_reports.py` + `add_comparison_sheets.py` | +| `collective` | Compare collective/NCCL reports | `combine_reports.py` + `add_collective_comparison.py` | + +### Key Design Decisions + +1. **Single command with positional type argument** - cleaner than separate commands +2. **Exact Excel file paths** - user specifies exact files, no auto-discovery +3. **2-way comparison only** - baseline vs test (N-way comparison deferred) +4. **Shared combine logic** - reuse same function for both types +5. **Match original behavior** - output same sheets and formatting as original scripts + +--- + +## 2. Command Specification + +### 2.1 `aorta-report compare gpu_timeline` + +Compare two GPU timeline reports. + +```bash +aorta-report compare gpu_timeline \ + --baseline /path/to/baseline/gpu_timeline_summary_mean.xlsx \ + --test /path/to/test/gpu_timeline_summary_mean.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + --output /path/to/gpu_comparison.xlsx +``` + +| Argument/Option | Required | Default | Description | +|-----------------|----------|---------|-------------| +| `--baseline`, `-b` | Yes | - | Path to baseline gpu_timeline_summary_mean.xlsx | +| `--test`, `-t` | Yes | - | Path to test gpu_timeline_summary_mean.xlsx | +| `--baseline-label` | No | grandparent dir name | Label for baseline in output | +| `--test-label` | No | grandparent dir name | Label for test in output | +| `--output`, `-o` | Yes | - | Output Excel file path | + +**Label Extraction Logic:** +- If `--baseline-label` not provided: extract grandparent directory name +- Example: `/path/to/56cu_256threads/tracelens_analysis/gpu_timeline.xlsx` → `56cu_256threads` +- Fallback: `"baseline"` if extraction fails + +**Output 
Sheets:** +| Sheet | Description | Source | +|-------|-------------|--------| +| Summary | Combined summaries with `source` column | Combined | +| All_Ranks_Combined | Combined raw data with `source` column | Combined | +| Per_Rank_Time_ms | Combined pivot (time) | Combined | +| Per_Rank_Percent | Combined pivot (percent) | Combined | +| **Comparison_By_Rank** | Per-rank comparison with metrics | NEW | +| **Summary_Comparison** | Overall comparison with metrics | NEW | + +--- + +### 2.2 `aorta-report compare collective` + +Compare two collective/NCCL reports. + +```bash +aorta-report compare collective \ + --baseline /path/to/baseline/collective_all_ranks.xlsx \ + --test /path/to/test/collective_all_ranks.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + --output /path/to/collective_comparison.xlsx +``` + +| Argument/Option | Required | Default | Description | +|-----------------|----------|---------|-------------| +| `--baseline`, `-b` | Yes | - | Path to baseline collective_all_ranks.xlsx | +| `--test`, `-t` | Yes | - | Path to test collective_all_ranks.xlsx | +| `--baseline-label` | No | grandparent dir name | Label for baseline in output | +| `--test-label` | No | grandparent dir name | Label for test in output | +| `--output`, `-o` | Yes | - | Output Excel file path | + +**Sheet Filtering (matches original):** +- Only sheets with `"summary"` in the name are kept +- Non-summary sheets are skipped + +**Output Sheets:** +| Sheet | Description | Source | +|-------|-------------|--------| +| nccl_summary_implicit_sync | Combined summary (implicit sync) | Combined | +| nccl_summary_long | Combined summary (long) | Combined | +| **nccl_implicit_sync_cmp** | Comparison for implicit sync | NEW | +| **nccl_long_cmp** | Comparison for long | NEW | + +--- + +## 3. 
Source Script Analysis + +### 3.1 `combine_reports.py` (72 lines) + +**Location:** `scripts/tracelens_single_config/combine_reports.py` + +**Purpose:** Combine two Excel files by adding a `source` column. + +**Key Logic:** +```python +def combine_collective_reports(baseline_path, test_path, output_path, baseline_label, test_label): + baseline_xl = pd.ExcelFile(baseline_path) + test_xl = pd.ExcelFile(test_path) + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + for sheet_name in baseline_xl.sheet_names: + if sheet_name not in test_xl.sheet_names: + continue # Skip sheets not in both files + + baseline_df = pd.read_excel(baseline_path, sheet_name=sheet_name) + test_df = pd.read_excel(test_path, sheet_name=sheet_name) + + baseline_df["source"] = baseline_label + test_df["source"] = test_label + + combined = pd.concat([baseline_df, test_df], ignore_index=True) + combined.to_excel(writer, sheet_name=sheet_name, index=False) +``` + +--- + +### 3.2 `add_comparison_sheets.py` (222 lines) + +**Location:** `scripts/tracelens_single_config/add_comparison_sheets.py` + +**Purpose:** Add GPU timeline comparison sheets to combined Excel file. + +**Key Logic:** + +```python +def add_comparison_sheets(input_path, output_path, baseline_label, test_label): + xl = pd.ExcelFile(input_path) + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + # 1. Copy all original sheets + for sheet_name in xl.sheet_names: + df = pd.read_excel(input_path, sheet_name=sheet_name) + df.to_excel(writer, sheet_name=sheet_name, index=False) + + # 2. Read combined data + all_combined = pd.read_excel(input_path, sheet_name="All_Ranks_Combined") + + # Get actual source values from dataframe + sources = all_combined['source'].unique() + actual_baseline = sources[0] + actual_test = sources[1] + + # 3. 
Create Comparison_By_Rank + baseline_data = all_combined[all_combined["source"] == actual_baseline] + test_data = all_combined[all_combined["source"] == actual_test] + + comparison_by_rank = pd.DataFrame() + for rank in sorted(baseline_data["rank"].unique()): + base_rank = baseline_data[baseline_data["rank"] == rank].set_index("type") + test_rank = test_data[test_data["rank"] == rank].set_index("type") + + for metric_type in base_rank.index: + if metric_type in test_rank.index: + base_time = base_rank.loc[metric_type, "time ms"] + test_time = test_rank.loc[metric_type, "time ms"] + + # percent_change: positive when test is faster (takes less time) + pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0 + + # Determine status + if pct_change > 1: + status = "Better" + elif pct_change < -1: + status = "Worse" + else: + status = "Similar" + + # Build row with all metrics + row = { + "rank": rank, + "type": metric_type, + f"{baseline_label}_time_ms": base_time, + f"{test_label}_time_ms": test_time, + "diff_time_ms": test_time - base_time, + "percent_change": pct_change, + "status": status, + "ratio": test_time / base_time if base_time != 0 else 0, + f"{baseline_label}_percent": base_rank.loc[metric_type, "percent"], + f"{test_label}_percent": test_rank.loc[metric_type, "percent"], + "diff_percent": test_rank.loc[metric_type, "percent"] - base_rank.loc[metric_type, "percent"], + } + comparison_by_rank = pd.concat([comparison_by_rank, pd.DataFrame([row])], ignore_index=True) + + comparison_by_rank.to_excel(writer, sheet_name="Comparison_By_Rank", index=False) + + # 4. Create Summary_Comparison (similar logic with Summary sheet) + # ... + + # 5. 
Apply conditional formatting + ws = writer.sheets["Comparison_By_Rank"] + # Find percent_change column and apply color scale + ws.conditional_formatting.add( + data_range, + ColorScaleRule( + start_type="min", start_color="F8696B", # Red + mid_type="num", mid_value=0, mid_color="FFFFFF", # White + end_type="max", end_color="63BE7B", # Green + ) + ) +``` + +**Comparison Columns Created:** +| Column | Formula | Description | +|--------|---------|-------------| +| `{baseline}_time_ms` | baseline value | Time from baseline | +| `{test}_time_ms` | test value | Time from test | +| `diff_time_ms` | test - baseline | Absolute difference | +| `percent_change` | (baseline - test) / baseline × 100 | Positive = faster | +| `status` | Based on percent_change | "Better", "Worse", or "Similar" | +| `ratio` | test / baseline | Ratio comparison | +| `{baseline}_percent` | baseline value | Percent from baseline | +| `{test}_percent` | test value | Percent from test | +| `diff_percent` | test - baseline | Difference in percent | + +--- + +### 3.3 `add_collective_comparison.py` (209 lines) + +**Location:** `scripts/tracelens_single_config/add_collective_comparison.py` + +**Purpose:** Add NCCL collective comparison sheets. + +**Key Differences from GPU Timeline:** + +1. **Sheet Filtering:** Only keeps sheets with "summary" in the name +2. **Grouping:** Groups by `['Collective name', 'dtype', 'In msg nelems']` +3. **Multiple Metrics:** Compares multiple NCCL-specific metrics +4. **Semantic Difference:** Latency vs Bandwidth have opposite "better" directions + +**Key Logic:** + +```python +def add_collective_comparison_sheets(input_path, output_path, baseline_label, test_label): + xl = pd.ExcelFile(input_path) + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + # 1. 
Copy only summary sheets + for sheet_name in xl.sheet_names: + if "summary" not in sheet_name.lower(): + continue # Skip non-summary sheets + df = pd.read_excel(input_path, sheet_name=sheet_name) + df.to_excel(writer, sheet_name=sheet_name, index=False) + + # 2. Process each summary sheet + for sheet_name in ["nccl_summary_implicit_sync", "nccl_summary_long"]: + if sheet_name not in xl.sheet_names: + continue + + df = pd.read_excel(input_path, sheet_name=sheet_name) + + # Get actual source values + sources = df['source'].unique() + actual_baseline = sources[0] + actual_test = sources[1] + + baseline_df = df[df["source"] == actual_baseline] + test_df = df[df["source"] == actual_test] + + # Group columns + group_cols = ["Collective name", "dtype", "In msg nelems"] + + # Metrics to compare + numeric_cols = [ + "comm_latency_mean", + "algo bw (GB/s)_mean", + "bus bw (GB/s)_mean", + "Total comm latency (ms)", + "count", + ] + + comparison = pd.DataFrame() + + for name, base_group in baseline_df.groupby(group_cols): + # Find matching test group + # ... matching logic ... 
+ + comp_row = {} + + # Copy grouping columns + for col, val in zip(group_cols, name): + comp_row[col] = val + + # Compare each metric + for col in numeric_cols: + base_val = base_group[col].values[0] + test_val = test_group[col].values[0] + + comp_row[f"{actual_baseline}_{col}"] = base_val + comp_row[f"{actual_test}_{col}"] = test_val + comp_row[f"diff_{col}"] = test_val - base_val + + # percent_change semantics differ by metric type + if "latency" in col.lower() or "time" in col.lower(): + # Lower is better - positive when test is faster + pct_change = (base_val - test_val) / base_val * 100 + elif "bw" in col.lower() or "bandwidth" in col.lower(): + # Higher is better - positive when test is better + pct_change = (test_val - base_val) / base_val * 100 + else: + pct_change = 0 + + comp_row[f"percent_change_{col}"] = pct_change + comp_row[f"ratio_{col}"] = test_val / base_val if base_val != 0 else 0 + + comparison = pd.concat([comparison, pd.DataFrame([comp_row])], ignore_index=True) + + # Sheet name: nccl_summary_implicit_sync → nccl_implicit_sync_cmp + comparison_sheet_name = sheet_name.replace("nccl_summary_", "nccl_") + "_cmp" + comparison.to_excel(writer, sheet_name=comparison_sheet_name, index=False) + + # Apply formatting to all percent_change columns + # ... +``` + +**Metrics Compared:** +| Metric | Better Direction | percent_change Formula | +|--------|------------------|------------------------| +| `comm_latency_mean` | Lower | (base - test) / base × 100 | +| `algo bw (GB/s)_mean` | Higher | (test - base) / base × 100 | +| `bus bw (GB/s)_mean` | Higher | (test - base) / base × 100 | +| `Total comm latency (ms)` | Lower | (base - test) / base × 100 | +| `count` | N/A | No percent_change | + +--- + +## 4. 
Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +├── comparison/ # NEW: comparison module +│ ├── __init__.py # Package exports +│ ├── combine.py # Shared: combine two Excel files +│ ├── gpu_timeline_comparison.py # GPU timeline comparison logic +│ ├── collective_comparison.py # Collective/NCCL comparison logic +│ └── formatting.py # Shared Excel formatting utilities +├── cli.py # Update compare commands +└── ... (existing modules) +``` + +### 4.2 Module Responsibilities + +| Module | Responsibility | +|--------|----------------| +| `combine.py` | Combine two Excel files with source column | +| `gpu_timeline_comparison.py` | Add Comparison_By_Rank and Summary_Comparison sheets | +| `collective_comparison.py` | Add nccl_*_cmp sheets for each summary sheet | +| `formatting.py` | Color scale formatting, column letter conversion | + +### 4.3 Dependency Graph + +``` +cli.py + │ + ├── compare gpu_timeline ──► combine.py + │ │ + │ └──► gpu_timeline_comparison.py + │ │ + │ └──► formatting.py + │ + └── compare collective ───► combine.py + │ + └──► collective_comparison.py + │ + └──► formatting.py +``` + +--- + +## 5. 
Module Details + +### 5.1 `comparison/__init__.py` + +```python +"""Comparison modules for baseline vs test TraceLens reports.""" + +from .combine import combine_excel_files +from .gpu_timeline_comparison import add_gpu_timeline_comparison +from .collective_comparison import add_collective_comparison +from .formatting import save_with_formatting + +__all__ = [ + "combine_excel_files", + "add_gpu_timeline_comparison", + "add_collective_comparison", + "save_with_formatting", +] +``` + +--- + +### 5.2 `comparison/combine.py` + +```python +"""Shared functionality to combine two Excel files.""" + +from pathlib import Path +from typing import Dict, List, Optional + +import pandas as pd + + +def combine_excel_files( + baseline_path: Path, + test_path: Path, + baseline_label: str, + test_label: str, + sheets_to_combine: Optional[List[str]] = None, + filter_summary_only: bool = False, + verbose: bool = False, +) -> Dict[str, pd.DataFrame]: + """ + Combine two Excel files by adding a 'source' column. 
+ + Args: + baseline_path: Path to baseline Excel file + test_path: Path to test Excel file + baseline_label: Label for baseline rows in 'source' column + test_label: Label for test rows in 'source' column + sheets_to_combine: Specific sheets to combine (None = all common sheets) + filter_summary_only: If True, only keep sheets with 'summary' in name + verbose: Print progress messages + + Returns: + Dict mapping sheet_name to combined DataFrame + + Raises: + FileNotFoundError: If input files don't exist + ValueError: If no common sheets found + """ +``` + +**Implementation Notes:** +- Read both Excel files using `pd.ExcelFile` +- Find common sheets (intersection of sheet names) +- If `filter_summary_only`, filter to sheets containing "summary" +- For each sheet: add `source` column, concat, store in dict +- Return dict (don't save yet - let caller handle saving) + +--- + +### 5.3 `comparison/gpu_timeline_comparison.py` + +```python +"""GPU timeline comparison logic.""" + +from typing import Dict + +import pandas as pd + + +def add_gpu_timeline_comparison( + combined_data: Dict[str, pd.DataFrame], + baseline_label: str, + test_label: str, + verbose: bool = False, +) -> Dict[str, pd.DataFrame]: + """ + Add comparison sheets for GPU timeline data. 
+ + Args: + combined_data: Dict from combine_excel_files() + baseline_label: Label for baseline (for column naming) + test_label: Label for test (for column naming) + verbose: Print progress messages + + Returns: + Dict with original sheets + new comparison sheets: + - 'Comparison_By_Rank': Per-rank comparison + - 'Summary_Comparison': Overall comparison + + Expects combined_data to have: + - 'All_Ranks_Combined' sheet with: source, rank, type, time ms, percent + - 'Summary' sheet with: source, type, time ms, percent + + Comparison columns created: + - {baseline_label}_time_ms, {test_label}_time_ms + - diff_time_ms, percent_change, status, ratio + - {baseline_label}_percent, {test_label}_percent, diff_percent + + percent_change formula: (baseline - test) / baseline × 100 + - Positive = test is faster (better) + - Negative = test is slower (worse) + + status thresholds: + - "Better" if percent_change > 1 + - "Worse" if percent_change < -1 + - "Similar" otherwise + """ +``` + +**Implementation Notes:** +- Get actual source values from DataFrame (first = baseline, second = test) +- Create Comparison_By_Rank by iterating over ranks and types +- Create Summary_Comparison from Summary sheet +- Add to result dict and return + +--- + +### 5.4 `comparison/collective_comparison.py` + +```python +"""Collective/NCCL comparison logic.""" + +from typing import Dict + +import pandas as pd + + +# Metrics to compare +NCCL_NUMERIC_COLS = [ + "comm_latency_mean", + "algo bw (GB/s)_mean", + "bus bw (GB/s)_mean", + "Total comm latency (ms)", + "count", +] + +# Grouping columns for NCCL data +NCCL_GROUP_COLS = ["Collective name", "dtype", "In msg nelems"] + + +def add_collective_comparison( + combined_data: Dict[str, pd.DataFrame], + baseline_label: str, + test_label: str, + verbose: bool = False, +) -> Dict[str, pd.DataFrame]: + """ + Add comparison sheets for collective/NCCL data. 
+ + Args: + combined_data: Dict from combine_excel_files() + baseline_label: Label for baseline + test_label: Label for test + verbose: Print progress messages + + Returns: + Dict with summary sheets + new comparison sheets: + - 'nccl_implicit_sync_cmp': Comparison for nccl_summary_implicit_sync + - 'nccl_long_cmp': Comparison for nccl_summary_long + + Processes sheets: + - 'nccl_summary_implicit_sync' → 'nccl_implicit_sync_cmp' + - 'nccl_summary_long' → 'nccl_long_cmp' + + Groups by: ['Collective name', 'dtype', 'In msg nelems'] + + For each metric, creates columns: + - {baseline}_{metric}, {test}_{metric} + - diff_{metric}, percent_change_{metric}, ratio_{metric} + + percent_change semantics (positive = better): + - Latency/time: (baseline - test) / baseline × 100 + - Bandwidth: (test - baseline) / baseline × 100 + """ +``` + +**Implementation Notes:** +- Only process sheets in `["nccl_summary_implicit_sync", "nccl_summary_long"]` +- Use flexible grouping (fall back to just "Collective name" if other cols missing) +- Apply correct percent_change formula based on metric type +- Sheet name transformation: `nccl_summary_X` → `nccl_X_cmp` + +--- + +### 5.5 `comparison/formatting.py` + +```python +"""Shared Excel formatting utilities.""" + +from pathlib import Path +from typing import Dict, List + +import pandas as pd +from openpyxl.formatting.rule import ColorScaleRule + + +# Color constants +RED = "F8696B" +WHITE = "FFFFFF" +GREEN = "63BE7B" + + +def get_column_letter(col_idx: int) -> str: + """ + Convert 1-based column index to Excel column letter. + + Examples: + 1 → 'A', 26 → 'Z', 27 → 'AA', 28 → 'AB' + """ + + +def create_color_scale_rule() -> ColorScaleRule: + """ + Create standard red-white-green color scale rule. 
+ + Red (min/negative) → White (0) → Green (max/positive) + """ + return ColorScaleRule( + start_type="min", + start_color=RED, + mid_type="num", + mid_value=0, + mid_color=WHITE, + end_type="max", + end_color=GREEN, + ) + + +def apply_color_scale_to_column( + worksheet, + col_idx: int, + num_rows: int, +) -> None: + """ + Apply color scale formatting to a specific column. + + Args: + worksheet: openpyxl worksheet + col_idx: 1-based column index + num_rows: Number of data rows (excluding header) + """ + + +def save_with_formatting( + data: Dict[str, pd.DataFrame], + output_path: Path, + format_columns: Dict[str, List[str]], + verbose: bool = False, +) -> Path: + """ + Save DataFrames to Excel with conditional formatting. + + Args: + data: Dict[sheet_name, DataFrame] + output_path: Output file path + format_columns: Dict[sheet_name, list of column names to format] + verbose: Print progress + + Returns: + Path to saved file + + Example: + format_columns = { + "Comparison_By_Rank": ["percent_change"], + "Summary_Comparison": ["percent_change"], + "nccl_implicit_sync_cmp": [ + "percent_change_comm_latency_mean", + "percent_change_algo bw (GB/s)_mean", + ], + } + """ +``` + +--- + +## 6. 
Data Flow + +### 6.1 `compare gpu_timeline` Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ compare gpu_timeline │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ INPUT: │ +│ ├── baseline.xlsx (gpu_timeline_summary_mean.xlsx) │ +│ │ ├── Summary │ +│ │ ├── All_Ranks_Combined │ +│ │ ├── Per_Rank_Time_ms │ +│ │ └── Per_Rank_Percent │ +│ │ │ +│ └── test.xlsx (gpu_timeline_summary_mean.xlsx) │ +│ └── (same sheets) │ +│ │ +│ STEP 1: combine_excel_files() │ +│ ──────────────────────────────── │ +│ For each sheet, add 'source' column and concat: │ +│ baseline rows → source = baseline_label │ +│ test rows → source = test_label │ +│ │ +│ STEP 2: add_gpu_timeline_comparison() │ +│ ──────────────────────────────────────── │ +│ Create new sheets: │ +│ Comparison_By_Rank: Per-rank comparison │ +│ Summary_Comparison: Overall comparison │ +│ │ +│ STEP 3: save_with_formatting() │ +│ ───────────────────────────────── │ +│ Save all sheets to Excel with color formatting on percent_change │ +│ │ +│ OUTPUT: │ +│ └── output.xlsx │ +│ ├── Summary (combined) │ +│ ├── All_Ranks_Combined (combined) │ +│ ├── Per_Rank_Time_ms (combined) │ +│ ├── Per_Rank_Percent (combined) │ +│ ├── Comparison_By_Rank (NEW - with formatting) │ +│ └── Summary_Comparison (NEW - with formatting) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 6.2 `compare collective` Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ compare collective │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ INPUT: │ +│ ├── baseline.xlsx (collective_all_ranks.xlsx) │ +│ │ ├── nccl_summary_implicit_sync │ +│ │ ├── nccl_summary_long │ +│ │ └── (other non-summary sheets - SKIPPED) │ +│ │ │ +│ └── test.xlsx (collective_all_ranks.xlsx) │ +│ └── (same sheets) │ +│ │ +│ STEP 1: 
combine_excel_files(filter_summary_only=True) │ +│ ──────────────────────────────────────────────────────── │ +│ Only combine sheets with "summary" in name │ +│ Add 'source' column and concat │ +│ │ +│ STEP 2: add_collective_comparison() │ +│ ────────────────────────────────────── │ +│ For each summary sheet, create comparison sheet: │ +│ nccl_summary_implicit_sync → nccl_implicit_sync_cmp │ +│ nccl_summary_long → nccl_long_cmp │ +│ │ +│ STEP 3: save_with_formatting() │ +│ ───────────────────────────────── │ +│ Save with color formatting on all percent_change_* columns │ +│ │ +│ OUTPUT: │ +│ └── output.xlsx │ +│ ├── nccl_summary_implicit_sync (combined) │ +│ ├── nccl_summary_long (combined) │ +│ ├── nccl_implicit_sync_cmp (NEW - with formatting) │ +│ └── nccl_long_cmp (NEW - with formatting) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Implementation Order + +| Phase | Task | Est. Time | Dependencies | +|-------|------|-----------|--------------| +| **1** | Create `comparison/` directory and `__init__.py` | 5 min | None | +| **2** | Implement `formatting.py` | 25 min | Phase 1 | +| **3** | Implement `combine.py` | 20 min | Phase 1 | +| **4** | Implement `gpu_timeline_comparison.py` | 40 min | Phase 2, 3 | +| **5** | Implement `collective_comparison.py` | 40 min | Phase 2, 3 | +| **6** | Update `cli.py` with compare commands | 25 min | Phase 4, 5 | +| **7** | Testing | 30 min | Phase 6 | + +**Total estimated time: ~3 hours** + +--- + +## 8. 
Expected Output + +### 8.1 `compare gpu_timeline` Console Output + +``` +============================================================ +GPU Timeline Comparison +============================================================ +Baseline: /path/to/56cu_256threads/tracelens_analysis/gpu_timeline_summary_mean.xlsx +Test: /path/to/37cu_384threads/tracelens_analysis/gpu_timeline_summary_mean.xlsx +Baseline label: 56cu_256threads +Test label: 37cu_384threads + +Step 1: Combining Excel files + Loading baseline (56cu_256threads)... + Loading test (37cu_384threads)... + Combining sheets: + Summary: 10 + 10 = 20 rows + All_Ranks_Combined: 80 + 80 = 160 rows + Per_Rank_Time_ms: 10 + 10 = 20 rows + Per_Rank_Percent: 10 + 10 = 20 rows + +Step 2: Adding comparison sheets + Creating Comparison_By_Rank... + Processing 8 ranks × 10 types = 80 comparisons + Creating Summary_Comparison... + Processing 10 types + +Step 3: Saving with formatting + Applying color scale to Comparison_By_Rank.percent_change + Applying color scale to Summary_Comparison.percent_change + +============================================================ +Comparison Complete! 
+============================================================ +Output: /path/to/gpu_comparison.xlsx + +Sheets: + - Summary (combined data) + - All_Ranks_Combined (combined data) + - Per_Rank_Time_ms (combined data) + - Per_Rank_Percent (combined data) + - Comparison_By_Rank (per-rank comparison) + - Summary_Comparison (overall comparison) + +percent_change interpretation: + Positive = test is faster/better + Negative = test is slower/worse +``` + +### 8.2 `compare collective` Console Output + +``` +============================================================ +Collective/NCCL Comparison +============================================================ +Baseline: /path/to/56cu_256threads/tracelens_analysis/collective_reports/collective_all_ranks.xlsx +Test: /path/to/37cu_384threads/tracelens_analysis/collective_reports/collective_all_ranks.xlsx +Baseline label: 56cu_256threads +Test label: 37cu_384threads + +Step 1: Combining Excel files + Loading baseline (56cu_256threads)... + Loading test (37cu_384threads)... + Filtering to summary sheets only... + Combining sheets: + nccl_summary_implicit_sync: 15 + 15 = 30 rows + nccl_summary_long: 15 + 15 = 30 rows + Skipped sheets (non-summary): + - per_rank_comm_details + - raw_data + +Step 2: Adding comparison sheets + Processing nccl_summary_implicit_sync... + Grouping by: ['Collective name', 'dtype', 'In msg nelems'] + Created nccl_implicit_sync_cmp (15 rows) + Processing nccl_summary_long... + Created nccl_long_cmp (15 rows) + +Step 3: Saving with formatting + Applying color scale to nccl_implicit_sync_cmp: + - percent_change_comm_latency_mean + - percent_change_algo bw (GB/s)_mean + - percent_change_bus bw (GB/s)_mean + - percent_change_Total comm latency (ms) + Applying color scale to nccl_long_cmp: + - (same columns) + +============================================================ +Comparison Complete! 
+============================================================ +Output: /path/to/collective_comparison.xlsx + +Sheets: + - nccl_summary_implicit_sync (combined data) + - nccl_summary_long (combined data) + - nccl_implicit_sync_cmp (comparison) + - nccl_long_cmp (comparison) + +percent_change interpretation: + For latency/time: Positive = faster (better) + For bandwidth: Positive = higher bandwidth (better) +``` + +--- + +## 9. Testing Strategy + +### 9.1 Unit Tests + +```python +# tests/test_comparison/test_combine.py + +def test_combine_excel_files_basic(): + """Test combining two Excel files adds source column.""" + +def test_combine_excel_files_filter_summary(): + """Test filter_summary_only option works.""" + +def test_combine_excel_files_missing_sheet(): + """Test handling when sheet only exists in one file.""" + + +# tests/test_comparison/test_gpu_timeline.py + +def test_add_gpu_timeline_comparison_creates_sheets(): + """Test that Comparison_By_Rank and Summary_Comparison are created.""" + +def test_percent_change_calculation(): + """Test percent_change formula is correct.""" + +def test_status_thresholds(): + """Test Better/Worse/Similar status logic.""" + + +# tests/test_comparison/test_collective.py + +def test_add_collective_comparison_creates_sheets(): + """Test comparison sheets are created for each summary sheet.""" + +def test_latency_percent_change(): + """Test latency metrics use (base-test)/base formula.""" + +def test_bandwidth_percent_change(): + """Test bandwidth metrics use (test-base)/base formula.""" + + +# tests/test_comparison/test_formatting.py + +def test_get_column_letter(): + """Test column index to letter conversion.""" + assert get_column_letter(1) == "A" + assert get_column_letter(26) == "Z" + assert get_column_letter(27) == "AA" + +def test_color_scale_applied(): + """Test that color scale formatting is applied to correct columns.""" +``` + +### 9.2 Integration Tests + +```python +# tests/test_comparison/test_cli_integration.py + 
+
+def test_compare_gpu_timeline_cli():
+    """Test full CLI flow for gpu_timeline comparison."""
+
+def test_compare_collective_cli():
+    """Test full CLI flow for collective comparison."""
+
+def test_label_extraction_from_path():
+    """Test grandparent directory name extraction."""
+```
+
+---
+
+## Appendix A: Label Extraction Logic
+
+```python
+def extract_label_from_path(file_path: Path) -> str:
+    """
+    Extract label from file path using grandparent directory name.
+
+    Examples:
+        /path/to/56cu_256threads/tracelens_analysis/gpu_timeline.xlsx
+        → "56cu_256threads"
+
+        /path/to/run1/tracelens_analysis/collective_reports/collective.xlsx
+        → "run1" (or "tracelens_analysis" depending on depth)
+
+    Returns None if extraction fails; the caller then falls back to a
+    default label ("baseline" or "test").
+    """
+    try:
+        # Go up to grandparent (skip filename and parent directory)
+        grandparent = file_path.parent.parent.name
+        if grandparent and grandparent not in [".", "..", ""]:
+            return grandparent
+    except Exception:
+        pass
+    return None  # Let caller provide default
+```
+
+---
+
+## Appendix B: CLI Help Text
+
+```
+$ aorta-report compare --help
+Usage: aorta-report compare [OPTIONS] COMMAND [ARGS]...
+
+  Compare baseline and test TraceLens reports.
+
+  Supported comparison types:
+    gpu_timeline - Compare GPU timeline reports
+    collective   - Compare collective/NCCL reports
+
+Commands:
+  collective    Compare two collective/NCCL reports.
+  gpu_timeline  Compare two GPU timeline reports.
+
+
+$ aorta-report compare gpu_timeline --help
+Usage: aorta-report compare gpu_timeline [OPTIONS]
+
+  Compare two GPU timeline reports.
+
+  Combines baseline and test files, then adds comparison sheets with diff,
+  percent_change, and status columns.
+ + Output sheets: + - Summary, All_Ranks_Combined, Per_Rank_* (combined data) + - Comparison_By_Rank (per-rank comparison) + - Summary_Comparison (overall comparison) + + Examples: + aorta-report compare gpu_timeline \ + -b baseline/gpu_timeline_summary_mean.xlsx \ + -t test/gpu_timeline_summary_mean.xlsx \ + -o comparison.xlsx + +Options: + -b, --baseline PATH Path to baseline gpu_timeline_summary_mean.xlsx + [required] + -t, --test PATH Path to test gpu_timeline_summary_mean.xlsx + [required] + --baseline-label TEXT Label for baseline (default: grandparent dir name) + --test-label TEXT Label for test (default: grandparent dir name) + -o, --output PATH Output Excel file path [required] + --help Show this message and exit. +``` + +--- + +## Appendix C: Migration from Original Scripts + +| Original | New CLI Equivalent | +|----------|-------------------| +| `python combine_reports.py --baseline b.xlsx --test t.xlsx --output combined.xlsx` | (intermediate step, now internal) | +| `python add_comparison_sheets.py --input combined.xlsx --output comparison.xlsx` | `aorta-report compare gpu_timeline -b b.xlsx -t t.xlsx -o comparison.xlsx` | +| `python add_collective_comparison.py --input combined.xlsx --output comparison.xlsx` | `aorta-report compare collective -b b.xlsx -t t.xlsx -o comparison.xlsx` | + +The new CLI combines both steps (combine + add comparison) into a single command. + diff --git a/src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md b/src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md new file mode 100644 index 0000000..91ca20b --- /dev/null +++ b/src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md @@ -0,0 +1,618 @@ +# `generate excel` Command - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Command Specification](#2-command-specification) +3. [Source Script Analysis](#3-source-script-analysis) +4. 
[Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Data Flow](#6-data-flow) +7. [Implementation Order](#7-implementation-order) +8. [Expected Output](#8-expected-output) + +--- + +## 1. Overview + +The `generate excel` command creates a comprehensive final report by combining GPU timeline and collective comparison data into a single, well-organized Excel file. + +### Key Features + +| Feature | Description | +|---------|-------------| +| **Summary Dashboard** | First visible sheet with key metrics and status | +| **Comparison Sheets** | Visible sheets with comparison data | +| **Hidden Raw Data** | Original data hidden but accessible | +| **Excel Tables** | All data formatted as tables with filters | +| **Color Coding** | Red-white-green scale on percent_change columns | + +### Source Script + +**Location:** `scripts/tracelens_single_config/create_final_report.py` (346 lines) + +--- + +## 2. Command Specification + +### Current CLI (Stub) + +```bash +aorta-report generate excel \ + --gpu-combined gpu_timeline_combined.xlsx \ + --gpu-comparison gpu_timeline_comparison.xlsx \ + --coll-combined collective_combined.xlsx \ + --coll-comparison collective_comparison.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + -o final_analysis_report.xlsx +``` + +### Arguments + +| Option | Required | Description | +|--------|----------|-------------| +| `--gpu-combined` | Yes | GPU timeline combined file (output of `compare gpu_timeline` without comparison sheets) | +| `--gpu-comparison` | Yes | GPU timeline comparison file (output of `compare gpu_timeline`) | +| `--coll-combined` | Yes | Collective combined file (intermediate) | +| `--coll-comparison` | Yes | Collective comparison file (output of `compare collective`) | +| `--baseline-label` | No | Label for baseline (default: "Baseline") | +| `--test-label` | No | Label for test (default: "Test") | +| `-o, --output` | Yes | Output Excel file path | + +### 
Alternative Simplified Interface + +Since `compare gpu_timeline` and `compare collective` now produce combined comparison files directly, we could simplify: + +```bash +aorta-report generate excel \ + --gpu-comparison gpu_comparison.xlsx \ + --coll-comparison collective_comparison.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + -o final_report.xlsx +``` + +**Decision:** Keep original 4-file interface for now. Can refactor later. + +--- + +## 3. Source Script Analysis + +### 3.1 Input Files + +The script requires 4 Excel files: + +| File | Contents | Source | +|------|----------|--------| +| `gpu_combined` | GPU summary + raw data with source column | `combine_reports.py` | +| `gpu_comparison` | GPU comparison sheets (Summary_Comparison, Comparison_By_Rank) | `add_comparison_sheets.py` | +| `coll_combined` | NCCL summary data with source column | `combine_reports.py` | +| `coll_comparison` | NCCL comparison sheets (nccl_*_cmp) | `add_collective_comparison.py` | + +### 3.2 Processing Steps + +```python +def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file): + # 1. Create workbook and add sheets + with pd.ExcelWriter(output_file, engine="openpyxl") as writer: + + # 2. Add GPU Timeline sheets (raw → hidden) + for sheet in gpu_combined: + rename_and_add(sheet, "GPU_*_Raw") + mark_as_hidden(sheet) + + # 3. Add GPU Comparison sheets (visible) + for sheet in gpu_comparison: + if "Comparison" in sheet: + rename_and_add(sheet, "GPU_*_Cmp") + + # 4. Add Collective sheets (raw → hidden) + for sheet in coll_combined: + if "summary" in sheet: + rename_and_add(sheet, "NCCL_*_Raw") + mark_as_hidden(sheet) + + # 5. Add Collective Comparison sheets (visible) + for sheet in coll_comparison: + rename_and_add(sheet, "NCCL_*") + + # 6. Create Summary Dashboard + create_dashboard_from_gpu_comparison() + + # 7. Post-processing with openpyxl + wb = load_workbook(output_file) + + # 8. 
Hide raw data sheets + for sheet in raw_sheets: + wb[sheet].sheet_state = "hidden" + + # 9. Convert all sheets to Excel tables + for sheet in wb.sheetnames: + add_excel_table(sheet) + + # 10. Add conditional formatting to comparison sheets + for sheet in comparison_sheets: + apply_color_scale_to_percent_change_columns(sheet) + + # 11. Move Summary_Dashboard to first position + wb.move_sheet("Summary_Dashboard", offset=-(len(wb.sheetnames)-1)) + + wb.save(output_file) +``` + +### 3.3 Sheet Naming Convention + +| Original Sheet | Final Name | Visibility | +|----------------|------------|------------| +| Summary | GPU_Summary_Raw | Hidden | +| All_Ranks_Combined | GPU_AllRanks_Raw | Hidden | +| Per_Rank_Time_ms | GPU_Time_Raw | Hidden | +| Per_Rank_Percent | GPU_Pct_Raw | Hidden | +| Summary_Comparison | GPU_Summary_Cmp | Visible | +| Comparison_By_Rank | GPU_ByRank_Cmp | Visible | +| nccl_summary_implicit_sync | NCCL_ImplSync_Raw | Hidden | +| nccl_summary_long | NCCL_Long_Raw | Hidden | +| nccl_implicit_sync_cmp | NCCL_Implicit_sync_cmp | Visible | +| nccl_long_cmp | NCCL_Long_cmp | Visible | +| (generated) | Summary_Dashboard | Visible (1st) | + +### 3.4 Summary Dashboard Creation + +**Decision:** Include BOTH GPU and NCCL metrics in the Summary Dashboard. 
+
+```python
+dashboard_data = {
+    'Metric': [],
+    baseline_label: [],
+    test_label: [],
+    'Improvement (%)': [],
+    'Status': []
+}
+
+# For each GPU metric type (busy_time, idle_time, etc.):
+for row in gpu_summary_comparison:
+    dashboard_data['Metric'].append(f"GPU_{row['type']}")
+    dashboard_data[baseline_label].append(row[baseline_time_col])
+    dashboard_data[test_label].append(row[test_time_col])
+    dashboard_data['Improvement (%)'].append(row['percent_change'])
+    dashboard_data['Status'].append('Better' if pct > 1 else 'Worse' if pct < -1 else 'Similar')
+
+# Add NCCL metrics from collective comparison
+for sheet in ['nccl_implicit_sync_cmp', 'nccl_long_cmp']:
+    # Add latency and bandwidth metrics
+    for row in coll_comparison[sheet]:
+        # Add total comm latency metric
+        dashboard_data['Metric'].append(f"NCCL_{collective_name}_latency")
+        # ... add values
+```
+
+### 3.5 Excel Table Formatting
+
+```python
+def add_excel_table(worksheet, table_name):
+    # Create table reference: A1:Z100
+    table_ref = f"A1:{get_column_letter(max_col)}{max_row}"
+
+    tab = Table(displayName=table_name, ref=table_ref)
+    style = TableStyleInfo(
+        name="TableStyleMedium2",
+        showFirstColumn=False,
+        showLastColumn=False,
+        showRowStripes=True,
+        showColumnStripes=False,
+    )
+    tab.tableStyleInfo = style
+    worksheet.add_table(tab)
+```
+
+### 3.6 Conditional Formatting
+
+Applied to columns with "percent_change" in the header:
+
+```python
+ColorScaleRule(
+    start_type="min", start_color="F8696B",  # Red
+    mid_type="num", mid_value=0, mid_color="FFFFFF",  # White
+    end_type="max", end_color="63BE7B",  # Green
+)
+```
+
+---
+
+## 4. 
Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +├── generators/ # Existing +│ ├── __init__.py # Add export +│ ├── html_generator.py # Existing +│ ├── sweep_comparison.py # Existing +│ ├── performance_report.py # Existing +│ └── excel_report.py # NEW: Final Excel report generator +└── cli.py # Update generate excel command +``` + +### 4.2 Relationship with Existing Modules + +The `generate excel` command will use: +- `comparison/formatting.py` - For color scale formatting (already implemented) +- Excel table creation - New utility functions + +### 4.3 Simplification Consideration + +Since `compare gpu_timeline` and `compare collective` now produce files with BOTH combined data AND comparison sheets, we could: + +**Option A: Keep current interface (4 files)** +- Matches original script exactly +- More flexible but verbose + +**Option B: Simplified interface (2 files)** +- Only needs comparison files (they contain combined data too) +- Cleaner CLI but may need to extract raw data from comparison files + +**Recommendation:** Option B with backward compatibility for Option A + +--- + +## 5. Module Details + +### 5.1 `generators/excel_report.py` + +```python +"""Final Excel report generator. 
+ +Creates comprehensive report with: +- Summary Dashboard (first, visible) +- Comparison sheets (visible) +- Raw data sheets (hidden) +- Excel table formatting +- Color-coded percent_change columns +""" + +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import pandas as pd +from openpyxl import load_workbook +from openpyxl.worksheet.table import Table, TableStyleInfo + +from ..comparison.formatting import get_column_letter, create_color_scale_rule + + +# Sheet naming mappings +GPU_SHEET_MAPPING = { + "Summary": "GPU_Summary_Raw", + "All_Ranks_Combined": "GPU_AllRanks_Raw", + "Per_Rank_Time_ms": "GPU_Time_Raw", + "Per_Rank_Percent": "GPU_Pct_Raw", +} + +GPU_COMPARISON_MAPPING = { + "Summary_Comparison": "GPU_Summary_Cmp", + "Comparison_By_Rank": "GPU_ByRank_Cmp", +} + +COLL_SHEET_MAPPING = { + "nccl_summary_implicit_sync": "NCCL_ImplSync_Raw", + "nccl_summary_long": "NCCL_Long_Raw", +} + + +def create_final_excel_report( + gpu_comparison_path: Path, + coll_comparison_path: Path, + output_path: Path, + baseline_label: str = "Baseline", + test_label: str = "Test", + gpu_combined_path: Optional[Path] = None, + coll_combined_path: Optional[Path] = None, + verbose: bool = False, +) -> Path: + """ + Create comprehensive final Excel report. 
+ + Args: + gpu_comparison_path: Path to GPU comparison file + coll_comparison_path: Path to collective comparison file + output_path: Output path for final report + baseline_label: Label for baseline column + test_label: Label for test column + gpu_combined_path: Optional separate GPU combined file + coll_combined_path: Optional separate collective combined file + verbose: Print progress + + Returns: + Path to created report + """ + + +def _add_gpu_sheets( + writer: pd.ExcelWriter, + gpu_comparison_path: Path, + gpu_combined_path: Optional[Path], + verbose: bool, +) -> Tuple[List[str], List[str]]: + """Add GPU timeline sheets, return (raw_sheets, comparison_sheets).""" + + +def _add_collective_sheets( + writer: pd.ExcelWriter, + coll_comparison_path: Path, + coll_combined_path: Optional[Path], + verbose: bool, +) -> Tuple[List[str], List[str]]: + """Add collective sheets, return (raw_sheets, comparison_sheets).""" + + +def _create_summary_dashboard( + writer: pd.ExcelWriter, + gpu_comparison_path: Path, + baseline_label: str, + test_label: str, + verbose: bool, +) -> str: + """Create Summary_Dashboard sheet, return sheet name.""" + + +def _apply_post_processing( + output_path: Path, + raw_sheets: List[str], + comparison_sheets: List[str], + verbose: bool, +) -> None: + """Apply Excel formatting: hide sheets, add tables, color formatting.""" + + +def add_excel_table(worksheet, table_name: str, start_row: int = 1) -> None: + """Convert worksheet data to Excel table format.""" + + +def _sanitize_table_name(sheet_name: str) -> str: + """Create valid Excel table name from sheet name.""" +``` + +### 5.2 Updated `generators/__init__.py` + +```python +"""Report generators for HTML and Excel.""" + +from .html_generator import generate_html, image_to_base64 +from .excel_report import create_final_excel_report + +__all__ = [ + "generate_html", + "image_to_base64", + "create_final_excel_report", +] +``` + +--- + +## 6. 
Data Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ generate excel │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ INPUTS: │ +│ ├── gpu_comparison.xlsx │ +│ │ ├── Summary (combined) │ +│ │ ├── All_Ranks_Combined (combined) │ +│ │ ├── Per_Rank_Time_ms (combined) │ +│ │ ├── Per_Rank_Percent (combined) │ +│ │ ├── Comparison_By_Rank │ +│ │ └── Summary_Comparison │ +│ │ │ +│ └── collective_comparison.xlsx │ +│ ├── nccl_summary_implicit_sync (combined) │ +│ ├── nccl_summary_long (combined) │ +│ ├── nccl_implicit_sync_cmp │ +│ └── nccl_long_cmp │ +│ │ +│ PROCESSING: │ +│ ──────────── │ +│ 1. Read all sheets from input files │ +│ 2. Rename sheets according to naming convention │ +│ 3. Create Summary_Dashboard from GPU comparison data │ +│ 4. Write all sheets to new workbook │ +│ 5. Post-process with openpyxl: │ +│ - Hide raw data sheets │ +│ - Convert to Excel tables │ +│ - Apply color formatting │ +│ - Move Summary_Dashboard to first position │ +│ │ +│ OUTPUT: │ +│ └── final_analysis_report.xlsx │ +│ ├── Summary_Dashboard (visible, FIRST) │ +│ ├── GPU_Summary_Cmp (visible) │ +│ ├── GPU_ByRank_Cmp (visible) │ +│ ├── NCCL_Implicit_sync_cmp (visible) │ +│ ├── NCCL_Long_cmp (visible) │ +│ ├── GPU_Summary_Raw (HIDDEN) │ +│ ├── GPU_AllRanks_Raw (HIDDEN) │ +│ ├── GPU_Time_Raw (HIDDEN) │ +│ ├── GPU_Pct_Raw (HIDDEN) │ +│ ├── NCCL_ImplSync_Raw (HIDDEN) │ +│ └── NCCL_Long_Raw (HIDDEN) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Implementation Order + +| Phase | Task | Est. 
Time | +|-------|------|-----------| +| **1** | Create `generators/excel_report.py` with core functions | 45 min | +| **2** | Implement `_add_gpu_sheets()` | 20 min | +| **3** | Implement `_add_collective_sheets()` | 20 min | +| **4** | Implement `_create_summary_dashboard()` | 25 min | +| **5** | Implement `_apply_post_processing()` | 30 min | +| **6** | Update `generators/__init__.py` | 5 min | +| **7** | Update CLI command in `cli.py` | 15 min | +| **8** | Testing | 20 min | + +**Total estimated time: ~3 hours** + +--- + +## 8. Expected Output + +### Console Output + +``` +============================================================ +Creating Final Excel Report +============================================================ +Output: final_analysis_report.xlsx +Baseline label: ROCm 6.0 +Test label: ROCm 7.0 + +Step 1: Adding GPU Timeline sheets + Added GPU_Summary_Raw (will be hidden) + Added GPU_AllRanks_Raw (will be hidden) + Added GPU_Time_Raw (will be hidden) + Added GPU_Pct_Raw (will be hidden) + Added GPU_Summary_Cmp + Added GPU_ByRank_Cmp + +Step 2: Adding Collective/NCCL sheets + Added NCCL_ImplSync_Raw (will be hidden) + Added NCCL_Long_Raw (will be hidden) + Added NCCL_Implicit_sync_cmp + Added NCCL_Long_cmp + +Step 3: Creating Summary Dashboard + Added Summary_Dashboard + +Step 4: Applying formatting + Hidden: GPU_Summary_Raw + Hidden: GPU_AllRanks_Raw + Hidden: GPU_Time_Raw + Hidden: GPU_Pct_Raw + Hidden: NCCL_ImplSync_Raw + Hidden: NCCL_Long_Raw + Converted to table: Summary_Dashboard + Converted to table: GPU_Summary_Cmp + Converted to table: GPU_ByRank_Cmp + ... + Applied color scale to GPU_Summary_Cmp column percent_change + Applied color scale to GPU_ByRank_Cmp column percent_change + ... + Moved Summary_Dashboard to first position + +============================================================ +Report Complete! 
+============================================================ +Output: final_analysis_report.xlsx + +Report Structure: + Visible Sheets (Analysis): + - Summary_Dashboard + - GPU_Summary_Cmp + - GPU_ByRank_Cmp + - NCCL_Implicit_sync_cmp + - NCCL_Long_cmp + + Hidden Sheets (Raw Data): + - GPU_Summary_Raw + - GPU_AllRanks_Raw + - GPU_Time_Raw + - GPU_Pct_Raw + - NCCL_ImplSync_Raw + - NCCL_Long_Raw + +Features: + - All data formatted as Excel tables with filters + - Percent change columns are color-coded (green=better, red=worse) + - Unhide raw data: Right-click sheet tab → Unhide +``` + +### Summary Dashboard Content + +| Metric | ROCm 6.0 | ROCm 7.0 | Improvement (%) | Status | +|--------|----------|----------|-----------------|--------| +| GPU_busy_time | 125.45 | 118.32 | 5.68 | Better | +| GPU_idle_time | 21.78 | 19.45 | 10.70 | Better | +| GPU_computation_time | 98.34 | 95.12 | 3.27 | Better | +| GPU_exposed_comm_time | 27.11 | 23.20 | 14.42 | Better | +| GPU_total_time | 147.23 | 137.77 | 6.43 | Better | + +--- + +## Appendix A: CLI Update + +### Simplified Interface (Recommended) + +```python +@generate.command("excel") +@click.option("--gpu-comparison", required=True, type=click.Path(exists=True), + help="GPU timeline comparison file (from 'compare gpu_timeline')") +@click.option("--coll-comparison", required=True, type=click.Path(exists=True), + help="Collective comparison file (from 'compare collective')") +@click.option("--baseline-label", default="Baseline", + help="Label for baseline configuration") +@click.option("--test-label", default="Test", + help="Label for test configuration") +@click.option("-o", "--output", required=True, type=click.Path(), + help="Output Excel file path") +@click.pass_context +def generate_excel(ctx, gpu_comparison, coll_comparison, baseline_label, test_label, output): + """Generate comprehensive final Excel report. 
+ + Combines GPU timeline and collective comparison data into a single + well-organized Excel report with: + + \b + - Summary Dashboard (first sheet, key metrics at a glance) + - Comparison sheets (visible, with color-coded changes) + - Raw data sheets (hidden, accessible via Unhide) + - Excel table formatting with filters + + \b + Examples: + aorta-report generate excel \\ + --gpu-comparison gpu_comparison.xlsx \\ + --coll-comparison collective_comparison.xlsx \\ + -o final_report.xlsx + """ +``` + +### Full Interface (Backward Compatible) + +```python +@generate.command("excel") +@click.option("--gpu-comparison", required=True, type=click.Path(exists=True)) +@click.option("--coll-comparison", required=True, type=click.Path(exists=True)) +@click.option("--gpu-combined", type=click.Path(exists=True), + help="Optional: Separate GPU combined file") +@click.option("--coll-combined", type=click.Path(exists=True), + help="Optional: Separate collective combined file") +@click.option("--baseline-label", default="Baseline") +@click.option("--test-label", default="Test") +@click.option("-o", "--output", required=True, type=click.Path()) +``` + +--- + +## Appendix B: Design Decisions + +1. **Interface:** Keep original 4-file interface (can refactor later) + +2. **Dashboard Metrics:** Include both GPU and NCCL metrics in Summary Dashboard + +3. **Table Style:** Use `TableStyleMedium2` (standard) + +4. **Sheet Order:** Dashboard → GPU Comparison → NCCL Comparison → (hidden) + diff --git a/src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md b/src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md new file mode 100644 index 0000000..8de1a34 --- /dev/null +++ b/src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md @@ -0,0 +1,1383 @@ +# `generate plots` Command - Developer Documentation + +**Version:** 1.1 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Source Scripts Analysis](#2-source-scripts-analysis) +3. 
[Command Specification](#3-command-specification) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Data Flow](#6-data-flow) +7. [Implementation Order](#7-implementation-order) +8. [Output Files](#8-output-files) + +--- + +## 1. Overview + +The `generate plots` command creates visualization plots from analysis reports. It merges functionality from two existing scripts into a unified interface with two plot types. + +### Plot Types + +| Type | Source Script | Input | Description | +|------|---------------|-------|-------------| +| `summary` | `create_final_plots.py` | Excel report | GPU timeline & NCCL comparison charts | +| `gemm` | `plot_gemm_variance.py` | CSV file | GEMM kernel variance distribution plots | + +### Scripts Being Merged + +| Script | Lines | Location | +|--------|-------|----------| +| `create_final_plots.py` | 333 | `scripts/tracelens_single_config/` | +| `plot_gemm_variance.py` | 423 | `scripts/gemm_analysis/` | +| **Total** | **756** | - | + +--- + +## 2. 
+Source Scripts Analysis
+
+### 2.1 `create_final_plots.py` → Plot Type: `summary`
+
+**Input:** Final Excel report (output of `generate excel`)
+**Required Sheets:** `Summary_Dashboard`, `GPU_ByRank_Cmp`, `NCCL_ImplicitSyncCmp`
+
+> **NOTE (review):** The Excel-report documentation elsewhere names the NCCL comparison sheet `NCCL_Implicit_sync_cmp`, while this document (and the plot code, which calls `pd.read_excel(..., sheet_name="NCCL_ImplicitSyncCmp")`) uses `NCCL_ImplicitSyncCmp`. Confirm the actual sheet name emitted by `generate excel` and align both documents, otherwise the summary plots will fail to load the sheet.
+
+#### Functions & Output Files
+
+| Function | Output File | Description |
+|----------|-------------|-------------|
+| `plot_improvement_chart()` | `improvement_chart.png` | Horizontal bar chart showing % improvement per metric |
+| `plot_abs_time_comparison()` | `abs_time_comparison.png` | Grouped bar chart: baseline vs test absolute times |
+| `create_gpu_time_accross_all_ranks()` | `{metric}_by_rank.png` | Line plots showing metric values across ranks (4 files) |
+| `create_gpu_time_change_percentage_summaryby_rank()` | `gpu_time_change_percentage_summary_by_rank.png` | 2×4 grid of bar charts per metric type |
+| `create_gpu_time_heatmap()` | `gpu_time_heatmap.png` | Seaborn heatmap: percent_change by (metric × rank) |
+| `create_nccl_charts()` | `NCCL_*.png` | 5 NCCL comparison charts |
+
+**Total Output Files:** ~13 PNG files
+
+---
+
+### 2.2 `plot_gemm_variance.py` → Plot Type: `gemm`
+
+**Input:** GEMM variance CSV (output of `analyze gemm` + optional `process gemm-variance`)
+**Required Columns:** `threads`, `channel`, `rank`, `time_diff_us`, `kernel_name`
+
+#### Functions & Output Files
+
+| Function | Output File | Description |
+|----------|-------------|-------------|
+| `create_boxplot_by_threads()` | `variance_by_threads_boxplot.png` | Box plot: variance distribution by thread count |
+| `create_boxplot_by_channels()` | `variance_by_channels_boxplot.png` | Box plot: variance distribution by channel count |
+| `create_boxplot_by_ranks()` | `variance_by_ranks_boxplot.png` | Box plot: variance distribution by rank |
+| `create_violin_plot_combined()` | `variance_violin_combined.png` | 1×3 grid: violin plots for all dimensions |
+| `create_interaction_plot()` | `variance_thread_channel_interaction.png` | Line plot: 
thread-channel interaction | + +**Total Output Files:** 5 PNG files + +--- + +## 3. Command Specification + +### CLI Interface + +```bash +# Summary plots (GPU timeline + NCCL from Excel) +aorta-report generate plots \ + -i final_report.xlsx \ + -o ./plots/ \ + --type summary + +# GEMM variance plots (from CSV) +aorta-report generate plots \ + -i gemm_variance.csv \ + -o ./plots/ \ + --type gemm + +# All plots (requires both inputs) +aorta-report generate plots \ + --excel-input final_report.xlsx \ + --gemm-csv gemm_variance.csv \ + -o ./plots/ \ + --type all +``` + +### Options + +| Option | Required | Default | Description | +|--------|----------|---------|-------------| +| `-i, --input` | Conditional | - | Input file (Excel for summary, CSV for gemm) | +| `--excel-input` | For `all` | - | Excel report file (for `--type all`) | +| `--gemm-csv` | For `all` | - | GEMM variance CSV (for `--type all`) | +| `-o, --output` | Yes | - | Output directory for PNG files | +| `--type` | No | `all` | Plot type: `summary`, `gemm`, or `all` | +| `--dpi` | No | `150` | DPI for output images | + +### Validation Rules + +1. If `--type summary`: `-i` must be an Excel file with required sheets +2. If `--type gemm`: `-i` must be a CSV file with required columns +3. If `--type all`: Both `--excel-input` and `--gemm-csv` must be provided + +--- + +## 4. 
Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +└── generators/ + ├── __init__.py # Update exports + ├── html_generator.py # Existing + ├── excel_report.py # Existing + ├── plot_generator.py # NEW: Thin orchestrator (~100 lines) + └── plot_helper/ # NEW: Internal package + ├── __init__.py # Exports all plot functions + ├── common.py # Shared utilities, colors, styles + │ + │ # Summary plots (from create_final_plots.py) + ├── summary_dashboard.py # improvement_chart, abs_time_comparison + ├── gpu_by_rank.py # GPU metrics by rank line plots + ├── gpu_percent_change.py # 2x4 grid of percent change bars + ├── gpu_heatmap.py # Seaborn heatmap + ├── nccl_charts.py # NCCL comparison charts + │ + │ # GEMM plots (from plot_gemm_variance.py) + ├── gemm_data.py # CSV reader, statistics + ├── gemm_boxplots.py # Boxplots by threads/channels/ranks + ├── gemm_violin.py # Combined violin plot + └── gemm_interaction.py # Thread-channel interaction plot +``` + +### 4.2 File Size Estimates + +| File | Functions | Lines (est.) 
| +|------|-----------|--------------| +| **Common** | | | +| `common.py` | `configure_style()`, `COLORS`, `save_figure()` | ~50 | +| **Summary Plots** | | | +| `summary_dashboard.py` | `plot_improvement_chart()`, `plot_abs_time_comparison()`, `get_labels()` | ~80 | +| `gpu_by_rank.py` | `plot_gpu_metrics_by_rank()` | ~70 | +| `gpu_percent_change.py` | `plot_gpu_percent_change_grid()` | ~60 | +| `gpu_heatmap.py` | `plot_gpu_heatmap()` | ~50 | +| `nccl_charts.py` | `plot_nccl_comparison()`, `plot_nccl_percent_change()` | ~120 | +| **GEMM Plots** | | | +| `gemm_data.py` | `read_gemm_csv_data()`, `print_statistics()` | ~60 | +| `gemm_boxplots.py` | `create_boxplot()`, `plot_by_threads()`, `plot_by_channels()`, `plot_by_ranks()` | ~100 | +| `gemm_violin.py` | `plot_variance_violin_combined()` | ~80 | +| `gemm_interaction.py` | `plot_thread_channel_interaction()` | ~60 | +| **Orchestrator** | | | +| `plot_generator.py` | `generate_summary_plots()`, `generate_gemm_plots()`, `generate_plots()` | ~100 | +| **Total** | | **~830** | + +--- + +## 5. Module Details + +### 5.1 `plot_helper/common.py` + +Shared utilities, colors, and styling for all plots. 
+ +```python +"""Common utilities for plot generation.""" + +from pathlib import Path +from typing import Tuple + +import matplotlib.pyplot as plt +import seaborn as sns + + +# ============================================================================= +# Color Palette +# ============================================================================= + +COLORS = { + "positive": "#2ecc71", # Green - improvements + "negative": "#e74c3c", # Red - regressions + "baseline": "#3498db", # Blue - baseline data + "test": "#e67e22", # Orange - test data + "neutral": "#95a5a6", # Gray - neutral +} + +# Extended palette for multi-series +PALETTE_MULTI = ["#3498db", "#e67e22", "#2ecc71", "#e74c3c", "#9b59b6", "#1abc9c"] + + +# ============================================================================= +# Plot Configuration +# ============================================================================= + +DEFAULT_DPI = 150 +DEFAULT_FIGSIZE = (10, 6) + + +def configure_style() -> None: + """Configure matplotlib/seaborn style for consistent plots.""" + sns.set_style("whitegrid") + plt.rcParams.update({ + "figure.dpi": DEFAULT_DPI, + "savefig.dpi": DEFAULT_DPI, + "font.size": 12, + "axes.titlesize": 14, + "axes.labelsize": 12, + }) + + +def remove_spines(ax) -> None: + """Remove all spines from an axis.""" + for spine in ["top", "right", "bottom", "left"]: + ax.spines[spine].set_visible(False) + + +def save_figure( + fig, + output_path: Path, + dpi: int = DEFAULT_DPI, + close: bool = True, +) -> Path: + """Save figure and optionally close it.""" + output_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(output_path, dpi=dpi, bbox_inches="tight") + if close: + plt.close(fig) + return output_path + + +def get_improvement_colors(values) -> list: + """Return green/red colors based on positive/negative values.""" + return [COLORS["positive"] if v > 0 else COLORS["negative"] for v in values] +``` + +--- + +### 5.2 `plot_helper/summary_dashboard.py` + +Dashboard-level plots 
from Summary_Dashboard sheet. + +```python +"""Summary dashboard plots: improvement chart and absolute time comparison.""" + +from pathlib import Path +from typing import List + +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +from .common import ( + COLORS, DEFAULT_DPI, DEFAULT_FIGSIZE, + remove_spines, save_figure, get_improvement_colors, +) + + +def get_labels_from_excel(excel_path: Path) -> List[str]: + """Extract baseline/test labels from Summary_Dashboard sheet.""" + df = pd.read_excel(excel_path, sheet_name="Summary_Dashboard") + cols = df.columns.tolist() + return [cols[1], cols[2]] # Baseline and Test column names + + +def plot_improvement_chart( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create horizontal bar chart of percent improvement. + + Reads Summary_Dashboard sheet, plots Metric vs Improvement (%). + Green bars for positive (better), red for negative (worse). + """ + df = pd.read_excel(excel_path, sheet_name="Summary_Dashboard") + + fig, ax = plt.subplots(figsize=DEFAULT_FIGSIZE) + + colors = get_improvement_colors(df["Improvement (%)"]) + ax.barh(df["Metric"], df["Improvement (%)"], color=colors) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + remove_spines(ax) + + ax.set_ylabel("Metric", fontsize=12) + ax.set_xlabel("Change (%)", fontsize=12) + ax.set_title( + "GPU Metrics Percentage Change (Test vs Baseline)\n(Positive = Test is better)", + fontsize=14, fontweight="bold", + ) + + plt.tight_layout() + return save_figure(fig, output_dir / "improvement_chart.png", dpi) + + +def plot_abs_time_comparison( + excel_path: Path, + output_dir: Path, + labels: List[str], + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create grouped bar chart of baseline vs test absolute times. + + Reads Summary_Dashboard sheet, plots side-by-side bars for each metric. 
+ """ + df = pd.read_excel(excel_path, sheet_name="Summary_Dashboard") + + fig, ax = plt.subplots(figsize=DEFAULT_FIGSIZE) + + x = np.arange(len(df)) + width = 0.35 + colors = [COLORS["baseline"], COLORS["test"]] + + for i, label in enumerate(labels): + offset = (i - len(labels) / 2 + 0.5) * width + ax.bar(x + offset, df[label], width, label=label, color=colors[i]) + + ax.xaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + remove_spines(ax) + + ax.set_xlabel("Metric Type", fontsize=12) + ax.set_ylabel("Time (ms)", fontsize=12) + ax.set_title("GPU Metrics Absolute Time Comparison", fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels(df["Metric"], rotation=45, ha="right") + ax.legend() + + plt.tight_layout() + return save_figure(fig, output_dir / "abs_time_comparison.png", dpi) +``` + +--- + +### 5.3 `plot_helper/gpu_by_rank.py` + +Line plots showing GPU metrics across ranks. + +```python +"""GPU metrics by rank line plots.""" + +from pathlib import Path +from typing import List + +import pandas as pd +import matplotlib.pyplot as plt + +from .common import COLORS, DEFAULT_DPI, save_figure + + +METRICS_TO_PLOT = ["total_time", "computation_time", "total_comm_time", "idle_time"] + + +def plot_gpu_metrics_by_rank( + excel_path: Path, + output_dir: Path, + labels: List[str], + metrics: List[str] = None, + dpi: int = DEFAULT_DPI, +) -> List[Path]: + """ + Create line plots for GPU metrics across ranks. + + Reads GPU_ByRank_Cmp sheet, creates one plot per metric type. + Each plot shows baseline vs test values across all ranks. + + Returns list of generated file paths. 
+ """ + df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp") + metrics = metrics or METRICS_TO_PLOT + + output_files = [] + colors = [COLORS["baseline"], COLORS["test"]] + markers = ["o", "s"] + + for metric in metrics: + metric_df = df[df["type"] == metric] + if metric_df.empty: + continue + + fig, ax = plt.subplots(figsize=(12, 6)) + + for i, label in enumerate(labels): + col_name = f"{label}_time_ms" + if col_name in metric_df.columns: + ax.plot( + metric_df["rank"], + metric_df[col_name], + marker=markers[i], + linewidth=2, + markersize=8, + color=colors[i], + label=label, + ) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + + ax.set_xlabel("Rank", fontsize=12) + ax.set_ylabel("Time (ms)", fontsize=12) + ax.set_title(f"{metric} Comparison across all ranks", fontsize=14, fontweight="bold") + ax.legend() + + plt.tight_layout() + output_path = save_figure(fig, output_dir / f"{metric}_by_rank.png", dpi) + output_files.append(output_path) + + return output_files +``` + +--- + +### 5.4 `plot_helper/gpu_percent_change.py` + +2×4 grid of percent change bar charts. + +```python +"""GPU percent change grid plot.""" + +from pathlib import Path + +import pandas as pd +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure, get_improvement_colors + + +METRIC_TYPES = [ + "busy_time", "computation_time", "exposed_comm_time", "exposed_memcpy_time", + "idle_time", "total_comm_time", "total_memcpy_time", "total_time", +] + + +def plot_gpu_percent_change_grid( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create 2x4 grid of percent change bar charts by rank. + + Reads GPU_ByRank_Cmp sheet, creates one subplot per metric type. + Each subplot shows percent_change for all ranks as bar chart. 
+ """ + df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp") + + fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(14, 8)) + + for i, metric_type in enumerate(METRIC_TYPES): + ax = axes[i // 4, i % 4] + type_df = df[df["type"] == metric_type] + + if type_df.empty: + ax.set_visible(False) + continue + + colors = get_improvement_colors(type_df["percent_change"]) + ax.bar(type_df["rank"].astype(str), type_df["percent_change"], color=colors) + + ax.axhline(y=0, color="black", linestyle="-", linewidth=0.5) + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + ax.set_xlabel("Rank") + ax.set_ylabel("Percent Change (%)") + ax.set_title(metric_type, fontsize=10) + + fig.suptitle( + "GPU Metrics Percent Change by Rank\n(Positive = Better)", + fontsize=14, fontweight="bold", + ) + plt.tight_layout() + return save_figure(fig, output_dir / "gpu_time_change_percentage_summary_by_rank.png", dpi) +``` + +--- + +### 5.5 `plot_helper/gpu_heatmap.py` + +Seaborn heatmap of percent change. + +```python +"""GPU percent change heatmap.""" + +from pathlib import Path + +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +from .common import DEFAULT_DPI, save_figure + + +def plot_gpu_heatmap( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create heatmap of percent_change by metric type and rank. + + Reads GPU_ByRank_Cmp sheet, pivots to (metric × rank) matrix, + and creates color-coded heatmap (green=better, red=worse). 
+ """ + df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp") + pivot_df = df.pivot(index="type", columns="rank", values="percent_change") + + fig, ax = plt.subplots(figsize=(12, 8)) + + sns.heatmap( + pivot_df, + annot=True, + fmt=".1f", + cmap="RdYlGn", + center=0, + linewidths=0.5, + cbar_kws={"label": "Percent Change (%)"}, + ax=ax, + ) + + ax.set_title( + "GPU Metric Percentage Change by Rank (HeatMap)\n(Positive = Better Test)", + fontsize=14, fontweight="bold", + ) + ax.set_xlabel("Rank", fontsize=12) + ax.set_ylabel("Metric Type", fontsize=12) + + plt.tight_layout() + return save_figure(fig, output_dir / "gpu_time_heatmap.png", dpi) +``` + +--- + +### 5.6 `plot_helper/nccl_charts.py` + +NCCL comparison charts. + +```python +"""NCCL comparison charts.""" + +from pathlib import Path +from typing import List + +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +from .common import COLORS, DEFAULT_DPI, save_figure, get_improvement_colors + + +NCCL_METRICS = { + "NCCL Communication Latency": { + "y_col": "comm_latency_mean", + "y_label": "Communication Latency (ms)", + }, + "NCCL Algorithm Bandwidth": { + "y_col": "algo bw (GB/s)_mean", + "y_label": "Algorithm Bandwidth (GB/s)", + }, + "NCCL Bus Bandwidth": { + "y_col": "bus bw (GB/s)_mean", + "y_label": "Bus Bandwidth (GB/s)", + }, + "NCCL Total Communication Latency": { + "y_col": "Total comm latency (ms)", + "y_label": "Total Communication Latency (ms)", + }, +} + +NCCL_PERCENT_METRICS = { + "Comm Latency": "percent_change_comm_latency_mean", + "Algo BW": "percent_change_algo bw (GB/s)_mean", + "Bus BW": "percent_change_bus bw (GB/s)_mean", +} + + +def plot_nccl_comparison( + excel_path: Path, + output_dir: Path, + labels: List[str], + dpi: int = DEFAULT_DPI, +) -> List[Path]: + """ + Create NCCL metric comparison bar charts. + + Reads NCCL_ImplicitSyncCmp sheet, creates grouped bar charts + for each metric (latency, bandwidth). 
+ """ + df = pd.read_excel(excel_path, sheet_name="NCCL_ImplicitSyncCmp") + df["label"] = df["Collective name"] + "\n" + df["In msg nelems"].astype(str) + + x = np.arange(len(df)) + width = 0.35 + colors = [COLORS["baseline"], COLORS["test"]] + output_files = [] + + for title, config in NCCL_METRICS.items(): + fig, ax = plt.subplots(figsize=(14, 6)) + + for i, label in enumerate(labels): + col_name = f"{label}_{config['y_col']}" + if col_name in df.columns: + offset = (i - len(labels) / 2 + 0.5) * width + ax.bar(x + offset, df[col_name], width, label=label, color=colors[i]) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + ax.set_xticks(x) + ax.set_xticklabels(df["label"], rotation=45, ha="right", fontsize=8) + ax.set_xlabel("Collective Operation (Message Size)", fontsize=12) + ax.set_ylabel(config["y_label"], fontsize=12) + ax.set_title(f"{title} Comparison", fontsize=14, fontweight="bold") + ax.legend() + + plt.tight_layout() + filename = f'{title.replace(" ", "_")}_comparison.png' + output_files.append(save_figure(fig, output_dir / filename, dpi)) + + return output_files + + +def plot_nccl_percent_change( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create 1x3 grid of NCCL percent change horizontal bar charts. 
+ """ + df = pd.read_excel(excel_path, sheet_name="NCCL_ImplicitSyncCmp") + + fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 6)) + + for i, (title, col_name) in enumerate(NCCL_PERCENT_METRICS.items()): + ax = axes[i] + if col_name not in df.columns: + ax.set_visible(False) + continue + + colors = get_improvement_colors(df[col_name]) + ax.barh(df["In msg nelems"].astype(str), df[col_name], color=colors) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + ax.set_xlabel("Percent Change (%)") + ax.set_title(f"{title}\nPercent Change (Positive = better)") + + fig.suptitle( + "NCCL Performance Percentage Change By Message Size", + fontsize=16, fontweight="bold", + ) + plt.tight_layout() + return save_figure(fig, output_dir / "NCCL_Performance_Percentage_Change_comparison.png", dpi) +``` + +--- + +### 5.7 `plot_helper/gemm_data.py` + +GEMM CSV reader and statistics. + +```python +"""GEMM variance data loading and statistics.""" + +import csv +from pathlib import Path +from typing import Dict, List, Any +from collections import defaultdict + + +def read_gemm_csv_data(csv_path: Path) -> Dict[str, Any]: + """ + Read GEMM variance CSV and organize by dimensions. 
+ + Returns: + { + "threads": {256: [values], 512: [values]}, + "channels": {28: [values], 42: [values], ...}, + "ranks": {0: [values], 1: [values], ...}, + "all": [list of row dicts], + } + """ + data = { + "threads": defaultdict(list), + "channels": defaultdict(list), + "ranks": defaultdict(list), + "all": [], + } + + with open(csv_path, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + try: + threads = int(row["threads"]) + channel = int(row["channel"]) + rank = int(row["rank"]) + time_diff = float(row["time_diff_us"]) + + data["threads"][threads].append(time_diff) + data["channels"][channel].append(time_diff) + data["ranks"][rank].append(time_diff) + data["all"].append({ + "threads": threads, + "channel": channel, + "rank": rank, + "time_diff": time_diff, + "kernel_name": row["kernel_name"], + }) + except (ValueError, KeyError) as e: + continue + + return data + + +def _calculate_median(values: List[float]) -> float: + """Calculate median of a list of values.""" + sorted_vals = sorted(values) + n = len(sorted_vals) + if n % 2 == 1: + return sorted_vals[n // 2] + return (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2 + + +def print_gemm_statistics(data: Dict[str, Any], verbose: bool = True) -> Dict[str, Any]: + """Print and return summary statistics.""" + stats = {} + + if verbose: + print("\n" + "=" * 70) + print("VARIANCE DISTRIBUTION STATISTICS") + print("=" * 70) + + for dimension, label_fmt in [ + ("threads", "{} threads"), + ("channels", "{}ch"), + ("ranks", "Rank {}"), + ]: + stats[dimension] = {} + if verbose: + print(f"\nBy {dimension.title()}:") + + for key in sorted(data[dimension].keys()): + values = data[dimension][key] + mean_val = sum(values) / len(values) + median_val = _calculate_median(values) + + stats[dimension][key] = { + "mean": mean_val, + "median": median_val, + "max": max(values), + "count": len(values), + } + + if verbose: + label = label_fmt.format(key) + print(f" {label}: mean={mean_val:.2f}us, 
median={median_val:.2f}us, " + f"max={max(values):.2f}us, n={len(values)}") + + if verbose: + print("=" * 70 + "\n") + + return stats +``` + +--- + +### 5.8 `plot_helper/gemm_boxplots.py` + +GEMM variance boxplots. + +```python +"""GEMM variance boxplot generators.""" + +from pathlib import Path +from typing import Dict, List, Any, Tuple + +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure + + +def _create_boxplot( + data_dict: Dict[int, List[float]], + output_path: Path, + label_fmt: str, + xlabel: str, + title: str, + colors: List[str], + figsize: Tuple[int, int] = (10, 6), + dpi: int = DEFAULT_DPI, +) -> Path: + """Generic boxplot creation helper.""" + fig, ax = plt.subplots(figsize=figsize) + + keys_list = sorted(data_dict.keys()) + plot_data = [data_dict[k] for k in keys_list] + labels = [label_fmt.format(k) for k in keys_list] + + bp = ax.boxplot( + plot_data, + tick_labels=labels, + patch_artist=True, + showmeans=True, + meanline=True, + ) + + # Handle color assignment + if colors == "viridis": + colors = plt.cm.viridis([i / len(keys_list) for i in range(len(keys_list))]) + + for patch, color in zip(bp["boxes"], colors): + patch.set_facecolor(color) + + ax.set_ylabel("Time Difference (us)", fontsize=14, fontweight="bold") + ax.set_xlabel(xlabel, fontsize=14, fontweight="bold") + ax.set_title(title, fontsize=16, fontweight="bold", pad=20) + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return save_figure(fig, output_path, dpi) + + +def plot_variance_by_threads( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create boxplot of variance by thread count.""" + return _create_boxplot( + data_dict=data["threads"], + output_path=output_dir / "variance_by_threads_boxplot.png", + label_fmt="{} threads", + xlabel="Thread Configuration", + title="GEMM Kernel Time Variance by Thread Count", + colors=["lightblue", "lightcoral"], + figsize=(10, 6), + dpi=dpi, + ) + + +def plot_variance_by_channels( + 
data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create boxplot of variance by channel count.""" + return _create_boxplot( + data_dict=data["channels"], + output_path=output_dir / "variance_by_channels_boxplot.png", + label_fmt="{}ch", + xlabel="Channel Configuration", + title="GEMM Kernel Time Variance by Channel Count", + colors=["#e6f2ff", "#99ccff", "#4da6ff", "#0073e6"], + figsize=(12, 6), + dpi=dpi, + ) + + +def plot_variance_by_ranks( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create boxplot of variance by rank.""" + return _create_boxplot( + data_dict=data["ranks"], + output_path=output_dir / "variance_by_ranks_boxplot.png", + label_fmt="Rank {}", + xlabel="Rank", + title="GEMM Kernel Time Variance by Rank", + colors="viridis", + figsize=(14, 6), + dpi=dpi, + ) +``` + +--- + +### 5.9 `plot_helper/gemm_violin.py` + +Combined violin plot. + +```python +"""GEMM variance violin plot.""" + +from pathlib import Path +from typing import Dict, List, Any + +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure + + +def _prepare_violin_data(data_dict: Dict[int, List[float]], label_fmt: str) -> List[Dict]: + """Prepare data for violin plot from a dictionary.""" + result = [] + for key, values in sorted(data_dict.items()): + for val in values: + result.append({"config": label_fmt.format(key), "time_diff": val}) + return result + + +def plot_variance_violin_combined( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create combined violin plot (1x3 grid) for all dimensions.""" + fig, axes = plt.subplots(1, 3, figsize=(20, 6)) + + configs = [ + { + "data": _prepare_violin_data(data["threads"], "{}t"), + "sort_key": lambda x: int(x[:-1]), + "color": "lightblue", + "xlabel": "Threads", + "title": "By Thread Count", + }, + { + "data": _prepare_violin_data(data["channels"], "{}ch"), + "sort_key": lambda x: int(x[:-2]), + "color": 
"lightcoral", + "xlabel": "Channels", + "title": "By Channel Count", + }, + { + "data": _prepare_violin_data(data["ranks"], "R{}"), + "sort_key": lambda x: int(x[1:]), + "color": "lightgreen", + "xlabel": "Ranks", + "title": "By Rank", + }, + ] + + for ax, cfg in zip(axes, configs): + violin_data = cfg["data"] + configs_list = sorted(set(d["config"] for d in violin_data), key=cfg["sort_key"]) + values = [[d["time_diff"] for d in violin_data if d["config"] == c] for c in configs_list] + + parts = ax.violinplot( + values, + positions=range(len(configs_list)), + showmeans=True, + showmedians=True, + ) + for pc in parts["bodies"]: + pc.set_facecolor(cfg["color"]) + pc.set_alpha(0.7) + + ax.set_xticks(range(len(configs_list))) + ax.set_xticklabels(configs_list) + ax.set_ylabel("Time Difference (us)", fontsize=12, fontweight="bold") + ax.set_xlabel(cfg["xlabel"], fontsize=12, fontweight="bold") + ax.set_title(cfg["title"], fontsize=14, fontweight="bold") + ax.grid(True, alpha=0.3, axis="y") + + fig.suptitle( + "GEMM Kernel Time Variance Distribution", + fontsize=18, fontweight="bold", y=1.02, + ) + + plt.tight_layout() + return save_figure(fig, output_dir / "variance_violin_combined.png", dpi) +``` + +--- + +### 5.10 `plot_helper/gemm_interaction.py` + +Thread-channel interaction plot. 
+ +```python +"""GEMM thread-channel interaction plot.""" + +from pathlib import Path +from typing import Dict, Any +from collections import defaultdict + +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure + + +def plot_thread_channel_interaction( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create thread-channel interaction line plot.""" + fig, ax = plt.subplots(figsize=(12, 7)) + + # Organize data by threads and channels + thread_channel_data = defaultdict(lambda: defaultdict(list)) + for row in data["all"]: + thread_channel_data[row["threads"]][row["channel"]].append(row["time_diff"]) + + threads = sorted(thread_channel_data.keys()) + channels = sorted(set( + ch for t_data in thread_channel_data.values() for ch in t_data.keys() + )) + + markers = ["o", "s", "^", "D"] + + for i, thread in enumerate(threads): + means = [] + for channel in channels: + if channel in thread_channel_data[thread]: + values = thread_channel_data[thread][channel] + means.append(sum(values) / len(values)) + else: + means.append(0) + + ax.plot( + channels, means, + marker=markers[i % len(markers)], + linewidth=2, + markersize=10, + label=f"{thread} threads", + ) + + ax.set_xlabel("Channel Count", fontsize=14, fontweight="bold") + ax.set_ylabel("Mean Time Difference (us)", fontsize=14, fontweight="bold") + ax.set_title( + "Thread-Channel Interaction: Mean Variance", + fontsize=16, fontweight="bold", pad=20, + ) + ax.set_xticks(channels) + ax.set_xticklabels([f"{c}ch" for c in channels]) + ax.legend(fontsize=12, loc="best") + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return save_figure(fig, output_dir / "variance_thread_channel_interaction.png", dpi) +``` + +--- + +### 5.11 `plot_helper/__init__.py` + +Package exports. 
+ +```python +"""Plot helper functions for summary and GEMM visualizations.""" + +from .common import configure_style, COLORS, save_figure, get_improvement_colors + +# Summary plots +from .summary_dashboard import ( + get_labels_from_excel, + plot_improvement_chart, + plot_abs_time_comparison, +) +from .gpu_by_rank import plot_gpu_metrics_by_rank +from .gpu_percent_change import plot_gpu_percent_change_grid +from .gpu_heatmap import plot_gpu_heatmap +from .nccl_charts import plot_nccl_comparison, plot_nccl_percent_change + +# GEMM plots +from .gemm_data import read_gemm_csv_data, print_gemm_statistics +from .gemm_boxplots import ( + plot_variance_by_threads, + plot_variance_by_channels, + plot_variance_by_ranks, +) +from .gemm_violin import plot_variance_violin_combined +from .gemm_interaction import plot_thread_channel_interaction + +__all__ = [ + # Common + "configure_style", + "COLORS", + "save_figure", + "get_improvement_colors", + # Summary + "get_labels_from_excel", + "plot_improvement_chart", + "plot_abs_time_comparison", + "plot_gpu_metrics_by_rank", + "plot_gpu_percent_change_grid", + "plot_gpu_heatmap", + "plot_nccl_comparison", + "plot_nccl_percent_change", + # GEMM + "read_gemm_csv_data", + "print_gemm_statistics", + "plot_variance_by_threads", + "plot_variance_by_channels", + "plot_variance_by_ranks", + "plot_variance_violin_combined", + "plot_thread_channel_interaction", +] +``` + +--- + +### 5.12 `generators/plot_generator.py` + +Main orchestrator (thin wrapper). + +```python +"""Plot generation orchestrator. + +Provides unified interface for generating summary and GEMM plots. 
+
"""

from pathlib import Path
from typing import Dict, List, Optional

from .plot_helper import (
    configure_style,
    # Summary
    get_labels_from_excel,
    plot_improvement_chart,
    plot_abs_time_comparison,
    plot_gpu_metrics_by_rank,
    plot_gpu_percent_change_grid,
    plot_gpu_heatmap,
    plot_nccl_comparison,
    plot_nccl_percent_change,
    # GEMM
    read_gemm_csv_data,
    print_gemm_statistics,
    plot_variance_by_threads,
    plot_variance_by_channels,
    plot_variance_by_ranks,
    plot_variance_violin_combined,
    plot_thread_channel_interaction,
)


def generate_summary_plots(
    excel_path: Path,
    output_dir: Path,
    dpi: int = 150,
    verbose: bool = False,
) -> List[Path]:
    """
    Generate all summary plots from Excel report.

    Args:
        excel_path: Excel report the labels and plot data are read from.
        output_dir: Destination directory for PNG files; created if missing.
        dpi: Resolution forwarded to every plot helper.
        verbose: Print per-step progress to stdout.

    Returns list of generated file paths.
    """
    # mkdir -p semantics: create the whole tree, tolerate an existing dir.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_files = []

    if verbose:
        print(f"\nGenerating summary plots from: {excel_path}")

    # Labels are read once from the workbook and shared by the comparison plots.
    labels = get_labels_from_excel(excel_path)
    if verbose:
        print(f"  Labels: {labels}")

    # Dashboard plots
    output_files.append(plot_improvement_chart(excel_path, output_dir, dpi))
    output_files.append(plot_abs_time_comparison(excel_path, output_dir, labels, dpi))

    # GPU plots (the by-rank helper returns several files, hence extend)
    output_files.extend(plot_gpu_metrics_by_rank(excel_path, output_dir, labels, dpi=dpi))
    output_files.append(plot_gpu_percent_change_grid(excel_path, output_dir, dpi))
    output_files.append(plot_gpu_heatmap(excel_path, output_dir, dpi))

    # NCCL plots (the comparison helper also returns several files)
    output_files.extend(plot_nccl_comparison(excel_path, output_dir, labels, dpi))
    output_files.append(plot_nccl_percent_change(excel_path, output_dir, dpi))

    if verbose:
        print(f"  Generated {len(output_files)} summary plots")

    return output_files


def generate_gemm_plots(
    csv_path: Path,
    output_dir: Path,
    dpi: int = 150,
    verbose: bool = False,
) -> List[Path]:
    """
    Generate all GEMM variance plots from CSV.

    Args:
        csv_path: GEMM variance CSV parsed by ``read_gemm_csv_data``.
        output_dir: Destination directory for PNG files; created if missing.
        dpi: Resolution forwarded to every plot helper.
        verbose: Print data statistics and progress to stdout.

    Returns list of generated file paths.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_files = []

    if verbose:
        print(f"\nGenerating GEMM plots from: {csv_path}")

    # data["all"] holds every data point; remaining keys feed the grouped
    # plots below — presumably threads/channels/ranks groupings, confirm
    # against gemm_data.py.
    data = read_gemm_csv_data(csv_path)

    if verbose:
        print(f"  Total data points: {len(data['all'])}")
        print_gemm_statistics(data)

    # Boxplots
    output_files.append(plot_variance_by_threads(data, output_dir, dpi))
    output_files.append(plot_variance_by_channels(data, output_dir, dpi))
    output_files.append(plot_variance_by_ranks(data, output_dir, dpi))

    # Violin and interaction
    output_files.append(plot_variance_violin_combined(data, output_dir, dpi))
    output_files.append(plot_thread_channel_interaction(data, output_dir, dpi))

    if verbose:
        print(f"  Generated {len(output_files)} GEMM plots")

    return output_files


def generate_plots(
    plot_type: str,
    output_dir: Path,
    excel_input: Optional[Path] = None,
    gemm_csv: Optional[Path] = None,
    dpi: int = 150,
    verbose: bool = False,
) -> Dict[str, List[Path]]:
    """
    Generate plots based on type.

    Args:
        plot_type: "summary", "gemm", or "all"
        output_dir: Output directory for PNG files
        excel_input: Path to Excel report (for summary/all)
        gemm_csv: Path to GEMM CSV (for gemm/all)
        dpi: DPI for output images
        verbose: Print progress

    Returns:
        Dict mapping category to list of generated file paths

    Raises:
        ValueError: If required inputs not provided for plot_type
        FileNotFoundError: If input files don't exist
    """
    # Apply the shared plot style once before any figures are created.
    configure_style()
    results = {}

    # "all" enters both branches, so it requires excel_input AND gemm_csv.
    if plot_type in ("summary", "all"):
        if excel_input is None:
            raise ValueError("Excel input required for summary plots")
        if not excel_input.exists():
            raise FileNotFoundError(f"Excel file not found: {excel_input}")
        results["summary"] = generate_summary_plots(excel_input, output_dir, dpi, verbose)

    # NOTE(review): with plot_type="all", summary plots are generated before
    # the GEMM CSV is validated, so a missing CSV leaves partial output behind.
    if plot_type in ("gemm", "all"):
        if gemm_csv is None:
            raise ValueError("GEMM CSV required for gemm plots")
        if not gemm_csv.exists():
            raise FileNotFoundError(f"CSV file not found: {gemm_csv}")
        results["gemm"] = generate_gemm_plots(gemm_csv, output_dir, dpi, verbose)

    return results
```

---

## 6.
Data Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ generate plots │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ --type summary │ +│ ──────────────── │ +│ INPUT: final_report.xlsx │ +│ ├── Summary_Dashboard → summary_dashboard.py │ +│ ├── GPU_ByRank_Cmp → gpu_by_rank.py, gpu_percent_change.py, gpu_heatmap.py│ +│ └── NCCL_ImplicitSyncCmp → nccl_charts.py │ +│ │ +│ OUTPUT: ./plots/ (13 files) │ +│ ├── improvement_chart.png │ +│ ├── abs_time_comparison.png │ +│ ├── {metric}_by_rank.png (4 files) │ +│ ├── gpu_time_change_percentage_summary_by_rank.png │ +│ ├── gpu_time_heatmap.png │ +│ └── NCCL_*.png (5 files) │ +│ │ +│ --type gemm │ +│ ────────────── │ +│ INPUT: gemm_variance.csv │ +│ └── gemm_data.py → gemm_boxplots.py, gemm_violin.py, gemm_interaction.py │ +│ │ +│ OUTPUT: ./plots/ (5 files) │ +│ ├── variance_by_threads_boxplot.png │ +│ ├── variance_by_channels_boxplot.png │ +│ ├── variance_by_ranks_boxplot.png │ +│ ├── variance_violin_combined.png │ +│ └── variance_thread_channel_interaction.png │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Implementation Order + +| Phase | Task | Est. 
Time | +|-------|------|-----------| +| **1** | Create `plot_helper/` package structure | 5 min | +| **2** | Implement `common.py` | 10 min | +| **3** | Implement `summary_dashboard.py` | 15 min | +| **4** | Implement `gpu_by_rank.py` | 10 min | +| **5** | Implement `gpu_percent_change.py` | 10 min | +| **6** | Implement `gpu_heatmap.py` | 10 min | +| **7** | Implement `nccl_charts.py` | 20 min | +| **8** | Implement `gemm_data.py` | 10 min | +| **9** | Implement `gemm_boxplots.py` | 15 min | +| **10** | Implement `gemm_violin.py` | 15 min | +| **11** | Implement `gemm_interaction.py` | 10 min | +| **12** | Implement `plot_helper/__init__.py` | 5 min | +| **13** | Implement `plot_generator.py` orchestrator | 15 min | +| **14** | Update `generators/__init__.py` | 5 min | +| **15** | Update CLI in `cli.py` | 15 min | +| **16** | Testing | 20 min | + +**Total estimated time: ~3 hours** + +--- + +## 8. Output Files + +### Summary Plots (13 files) + +| File | Source Module | Description | +|------|---------------|-------------| +| `improvement_chart.png` | `summary_dashboard.py` | Horizontal bar chart | +| `abs_time_comparison.png` | `summary_dashboard.py` | Grouped bar chart | +| `total_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `computation_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `total_comm_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `idle_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `gpu_time_change_percentage_summary_by_rank.png` | `gpu_percent_change.py` | 2×4 grid | +| `gpu_time_heatmap.png` | `gpu_heatmap.py` | Seaborn heatmap | +| `NCCL_Communication_Latency_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Algorithm_Bandwidth_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Bus_Bandwidth_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Total_Communication_Latency_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Performance_Percentage_Change_comparison.png` | 
`nccl_charts.py` | 1×3 grid | + +### GEMM Plots (5 files) + +| File | Source Module | Description | +|------|---------------|-------------| +| `variance_by_threads_boxplot.png` | `gemm_boxplots.py` | Boxplot | +| `variance_by_channels_boxplot.png` | `gemm_boxplots.py` | Boxplot | +| `variance_by_ranks_boxplot.png` | `gemm_boxplots.py` | Boxplot | +| `variance_violin_combined.png` | `gemm_violin.py` | 1×3 violin | +| `variance_thread_channel_interaction.png` | `gemm_interaction.py` | Line plot | + +--- + +## Appendix A: Design Decisions + +1. **Modular Structure:** One file per logical group of plots (~50-120 lines each) +2. **Plot Types:** `summary` and `gemm` as requested +3. **Internal Package:** `plot_helper/` keeps implementation details separate from public API +4. **Thin Orchestrator:** `plot_generator.py` imports from `plot_helper/` and provides CLI-facing API +5. **Consistent Style:** All plots use shared `common.py` utilities +6. **Easy Extension:** Adding new plot types = new file in `plot_helper/` diff --git a/src/aorta/report/PIPELINE_DEV_DOCS.md b/src/aorta/report/PIPELINE_DEV_DOCS.md new file mode 100644 index 0000000..821023f --- /dev/null +++ b/src/aorta/report/PIPELINE_DEV_DOCS.md @@ -0,0 +1,1079 @@ +# Pipeline Commands - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Pipeline Summary](#2-pipeline-summary) +3. [Pipeline GEMM](#3-pipeline-gemm) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Implementation Plan](#6-implementation-plan) + +--- + +## 1. Overview + +The pipeline commands orchestrate multi-step analysis workflows, combining existing commands into end-to-end automation. 
+ +### Pipeline Commands + +| Command | Description | Steps | +|---------|-------------|-------| +| `pipeline summary` | Complete TraceLens analysis (GPU + NCCL) | 7 steps | +| `pipeline gemm` | GEMM kernel variance analysis | 3 steps | + +### Design Principles + +1. **Reuse Existing Functions**: Call existing module functions directly (no subprocess) +2. **Configurable Steps**: Enable/disable individual steps via flags +3. **Progress Reporting**: Clear step-by-step progress output +4. **Error Handling**: Continue on non-critical errors, fail fast on critical ones +5. **Dataclass Config**: Clean configuration management + +--- + +## 2. Pipeline Summary + +### 2.1 Source Script + +**Location:** `scripts/tracelens_single_config/run_full_analysis.py` (529 lines) + +### 2.2 Pipeline Steps + +| Step | Description | Existing Function | Skippable | +|------|-------------|-------------------|-----------| +| 1 | TraceLens Analysis | `analyze_single_config()` | Yes (`--skip-tracelens`) | +| 2 | Process GPU Timelines | `process_single_config()` | No | +| 3 | Compare GPU Timelines | `compare gpu_timeline` logic | Yes (`--no-gpu-timeline`) | +| 4 | Compare Collective | `compare collective` logic | Yes (`--no-collective`) | +| 5 | Generate Final Excel | `create_final_excel_report()` | Yes (`--no-final-report`) | +| 6 | Generate Plots | `generate_summary_plots()` | Yes (`--no-plots`) | +| 7 | Generate HTML | `generate_html(mode="performance")` | Yes (`--no-html`) | + +### 2.3 CLI Specification + +```bash +aorta-report pipeline summary \ + -b/--baseline # Required: Baseline trace directory + -t/--test # Required: Test trace directory + -o/--output # Required: Output directory + [--baseline-label