From 3598d6ddfac236ae5b8520b4709a64c90682f9bb Mon Sep 17 00:00:00 2001 From: prosenjitdhole Date: Fri, 23 Jan 2026 04:58:09 -0600 Subject: [PATCH 1/2] AORTA-17 CLI command for report generation : Added tracelens run utility and analyze gemm reports --- src/aorta/report/ANALYZE_CMD_DEV_DOCS.md | 775 ++++++++++++++++++ src/aorta/report/analysis/__init__.py | 14 + src/aorta/report/analysis/analyze_gemm.py | 320 ++++++++ src/aorta/report/analysis/analyze_single.py | 339 ++++++++ src/aorta/report/analysis/analyze_sweep.py | 412 ++++++++++ .../report/analysis/tracelens_wrapper.py | 342 ++++++++ src/aorta/report/cli.py | 165 +++- 7 files changed, 2344 insertions(+), 23 deletions(-) create mode 100644 src/aorta/report/ANALYZE_CMD_DEV_DOCS.md create mode 100644 src/aorta/report/analysis/__init__.py create mode 100644 src/aorta/report/analysis/analyze_gemm.py create mode 100644 src/aorta/report/analysis/analyze_single.py create mode 100644 src/aorta/report/analysis/analyze_sweep.py create mode 100644 src/aorta/report/analysis/tracelens_wrapper.py diff --git a/src/aorta/report/ANALYZE_CMD_DEV_DOCS.md b/src/aorta/report/ANALYZE_CMD_DEV_DOCS.md new file mode 100644 index 0000000..90dc12a --- /dev/null +++ b/src/aorta/report/ANALYZE_CMD_DEV_DOCS.md @@ -0,0 +1,775 @@ +# `analyze` Command Group - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Command Summary](#2-command-summary) +3. [Source Script Analysis](#3-source-script-analysis) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Command Specifications](#5-command-specifications) +6. [TraceLens Integration](#6-tracelens-integration) +7. [Implementation Order](#7-implementation-order) +8. [Expected Output](#8-expected-output) + +--- + +## 1. Overview + +The `analyze` command group provides TraceLens analysis capabilities for PyTorch profiler traces. 
It consolidates three shell/Python scripts into a unified CLI interface. + +### Commands + +| Command | Purpose | Source Script | +|---------|---------|---------------| +| `analyze single` | Analyze single configuration traces | `run_tracelens_single_config.sh` | +| `analyze sweep` | Analyze sweep with multiple configs | `run_tracelens_analysis.sh` | +| `analyze gemm` | Extract GEMM kernel variance | `analyze_gemm_reports.py` | + +### Key Features + +- **Unified interface**: Consistent CLI for all analysis operations +- **GEMM recognition**: Patched TraceLens for ROCm Tensile kernel detection +- **Auto-discovery**: Automatic detection of ranks, threads, and channels +- **Flexible output**: Configurable output directories and formats + +--- + +## 2. Command Summary + +### 2.1 `analyze single` + +Analyze a single configuration trace directory containing rank subdirectories. + +```bash +aorta-report analyze single /path/to/traces [OPTIONS] + +Options: + --individual-only Generate only individual reports + --collective-only Generate only collective report + --geo-mean Use geometric mean for timeline aggregation + --short-kernel-threshold INT Threshold for short kernel study (µs) + --topk-ops INT Number of top operations to include + -o, --output PATH Output directory +``` + +**Usage Examples:** +```bash +# Basic analysis (generates individual + collective reports) +aorta-report analyze single /path/to/traces + +# Generate only individual reports with geometric mean aggregation +aorta-report analyze single /path/to/traces --individual-only --geo-mean + +# Custom output directory +aorta-report analyze single /path/to/traces -o ./results +``` + +### 2.2 `analyze sweep` + +Analyze a sweep directory containing multiple thread/channel configurations. 
+ +```bash +aorta-report analyze sweep /path/to/sweep [OPTIONS] + +Options: + --geo-mean Use geometric mean instead of arithmetic mean + -o, --output PATH Output directory +``` + +**Usage Examples:** +```bash +# Basic sweep analysis +aorta-report analyze sweep /path/to/sweep_20251124 + +# Use geometric mean for aggregation +aorta-report analyze sweep /path/to/sweep --geo-mean + +# Custom output directory +aorta-report analyze sweep /path/to/sweep -o ./analysis_results +``` + +### 2.3 `analyze gemm` + +Extract GEMM kernel variance data from existing TraceLens reports. + +```bash +aorta-report analyze gemm /path/to/reports [OPTIONS] + +Options: + -t, --threads INT Thread configurations to analyze (multiple allowed) + -c, --channels INT Channel configurations to analyze (multiple allowed) + -r, --ranks INT Ranks to analyze (default: 0-7) + --top-k INTEGER Number of top kernels to extract per file (default: 5) + -o, --output PATH Output CSV file +``` + +**Usage Examples:** +```bash +# Basic GEMM analysis with defaults (256/512 threads, 28/42/56/70 channels) +aorta-report analyze gemm /path/to/tracelens_analysis + +# Custom thread and channel configurations +aorta-report analyze gemm /path/to/reports -t 256 -t 512 -c 28 -c 42 + +# Extract top 10 kernels and save to custom file +aorta-report analyze gemm /path/to/reports --top-k 10 -o gemm_analysis.csv + +# Specify specific ranks +aorta-report analyze gemm /path/to/reports -r 0 -r 1 -r 2 -r 3 +``` + +--- + +## 3. Source Script Analysis + +### 3.1 `run_tracelens_single_config.sh` (267 lines) + +**Location:** `scripts/tracelens_single_config/run_tracelens_single_config.sh` + +**Functionality:** +1. Parse options (`--individual-only`, `--collective-only`) +2. Auto-detect trace directory structure: + - Check if input contains `rank*` directories (is torch_profiler/) + - Check if input contains `torch_profiler/` subdirectory +3. Create output directory structure +4. Detect number of ranks +5. 
Generate individual reports (per rank) +6. Generate collective multi-rank report + +**TraceLens Commands:** +```bash +# Individual report (per rank) +$TRACELENS_WRAPPER generate_perf_report \ + --profile_json_path "$TRACE" \ + --output_xlsx_path "$OUTPUT" \ + --include_unlinked_kernels \ + --short_kernel_study \ + --short_kernel_threshold_us 50 \ + --topk_ops 100 \ + --topk_roofline_ops 100 + +# Collective report (all ranks) +$TRACELENS_WRAPPER generate_multi_rank_collective \ + --trace_pattern "$TORCH_PROF_DIR/rank*/trace.json" \ + --world_size $NUM_RANKS \ + --output_xlsx_path "$OUTPUT" \ + --detailed_analysis \ + --use_multiprocessing +``` + +**Input Structure:** +``` +trace_dir/ +├── torch_profiler/ # or trace_dir IS torch_profiler/ +│ ├── rank0/ +│ │ └── *.json +│ ├── rank1/ +│ │ └── *.json +│ └── ... +``` + +**Output Structure:** +``` +trace_dir/ +└── tracelens_analysis/ + ├── individual_reports/ + │ ├── perf_rank0.xlsx + │ ├── perf_rank1.xlsx + │ └── ... + └── collective_reports/ + └── collective_all_ranks.xlsx +``` + +--- + +### 3.2 `run_tracelens_analysis.sh` (423 lines) + +**Location:** `scripts/gemm_analysis/run_tracelens_analysis.sh` + +**Functionality:** +1. Parse options (`--rocprof`) +2. Auto-discover thread configurations (e.g., `256thread`, `512thread`) +3. Auto-discover channel configurations per thread (e.g., `nccl_28channels`) +4. For each thread/channel/rank combination: + - Find trace files + - Generate individual reports +5. Generate collective reports (PyTorch mode only) +6. 
Generate cross-thread comparisons + +**TraceLens Commands:** +```bash +# PyTorch mode - Individual +TraceLens_generate_perf_report_pytorch \ + --profile_json_path "$TRACE" \ + --output_xlsx_path "$OUTPUT" \ + --include_unlinked_kernels \ + --short_kernel_study \ + --short_kernel_threshold_us 50 \ + --topk_ops 100 \ + --enable_kernel_summary \ + --topk_roofline_ops 100 + +# ROCprof mode - Individual +TraceLens_generate_perf_report_rocprof \ + --profile_json_path "$TRACE" \ + --output_xlsx_path "$OUTPUT" \ + --kernel_details \ + --short_kernel_study \ + --short_kernel_threshold_us 50 \ + --topk_kernels 100 + +# PyTorch mode - Collective +TraceLens_generate_multi_rank_collective_report_pytorch \ + --trace_pattern "$TRACE_DIR/rank*/trace/pt.trace.json" \ + --world_size 8 \ + --output_xlsx_path "$OUTPUT" \ + --detailed_analysis \ + --use_multiprocessing + +# Comparison across threads +TraceLens_compare_perf_reports_pytorch \ + "${reports[@]}" \ + --names "${names[@]}" \ + --sheets gpu_timeline ops_summary \ + -o "$OUTPUT" +``` + +**Input Structure:** +``` +sweep_dir/ +├── 256thread/ +│ ├── nccl_28channels/ +│ │ └── torch_profiler/ +│ │ ├── rank0/ +│ │ └── ... +│ ├── nccl_42channels/ +│ └── ... +└── 512thread/ + └── ... +``` + +**Output Structure:** +``` +sweep_dir/ +└── tracelens_analysis/ + ├── 256thread/ + │ ├── individual_reports/ + │ │ ├── perf_28ch_rank0.xlsx + │ │ ├── perf_28ch_rank1.xlsx + │ │ └── ... + │ └── collective_reports/ + │ └── collective_28ch.xlsx + ├── 512thread/ + │ └── ... + └── comparisons/ + ├── compare_28ch_rank0_across_threads.xlsx + └── ... +``` + +--- + +### 3.3 `analyze_gemm_reports.py` (344 lines) + +**Location:** `scripts/gemm_analysis/analyze_gemm_reports.py` + +**Functionality:** +1. Parse command-line arguments +2. Iterate through thread/channel/rank combinations +3. Open each Excel report +4. Read GEMM sheet +5. Extract kernel info and timing data +6. Calculate time variance (max - min) +7. Sort by variance and get top-K +8. 
Output combined CSV + +**Key Functions:** +```python +def process_excel_file(file_path, threads, channel, rank, top_k=5): + """Process a single Excel file and extract GEMM data.""" + # Opens workbook + # Reads GEMM sheet + # Validates column headers + # Extracts kernel_details, time_min, time_max + # Calculates time_diff + # Returns top_k results sorted by variance +``` + +**Input:** TraceLens Excel reports with GEMM sheet +**Output:** CSV with columns: +- `threads`, `channel`, `rank` +- `kernel_name` +- `kernel_time_min_us`, `kernel_time_max_us`, `time_diff_us` + +--- + +## 4. Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +├── cli.py # CLI definitions (update analyze commands) +├── analysis/ # NEW: Analysis logic +│ ├── __init__.py # Exports public functions +│ ├── tracelens_wrapper.py # GEMM-patched TraceLens wrapper +│ ├── analyze_single.py # Single config analysis +│ ├── analyze_sweep.py # Sweep analysis +│ └── analyze_gemm.py # GEMM variance analysis +├── generators/ # HTML generators (existing) +└── templates/ # HTML templates (existing) +``` + +### 4.2 Module Responsibilities + +#### `analysis/__init__.py` +```python +from .analyze_single import analyze_single_config +from .analyze_sweep import analyze_sweep_config +from .analyze_gemm import analyze_gemm_reports +from .tracelens_wrapper import TraceLensWrapper + +__all__ = [ + "analyze_single_config", + "analyze_sweep_config", + "analyze_gemm_reports", + "TraceLensWrapper", +] +``` + +#### `analysis/tracelens_wrapper.py` +```python +class TraceLensWrapper: + """GEMM-patched TraceLens wrapper.""" + + def __init__(self): + self._apply_gemm_patches() + + def _apply_gemm_patches(self): + """Apply GEMM recognition patches to TraceLens.""" + # Port from tracelens_with_gemm_patch.py + + def generate_perf_report(self, trace_path, output_path, **options): + """Generate individual performance report.""" + + def generate_collective_report(self, trace_pattern, world_size, output_path, 
**options): + """Generate multi-rank collective report.""" + + def compare_reports(self, report_paths, names, output_path, sheets=None): + """Compare multiple performance reports.""" +``` + +#### `analysis/analyze_single.py` +```python +def analyze_single_config( + trace_dir: Path, + output_dir: Optional[Path] = None, + individual_only: bool = False, + collective_only: bool = False, + verbose: bool = False, +) -> Path: + """Analyze a single configuration trace directory.""" + +def detect_trace_structure(input_dir: Path) -> Tuple[Path, Path]: + """Auto-detect torch_profiler directory and base directory.""" + +def discover_ranks(torch_prof_dir: Path) -> List[int]: + """Discover available ranks in the trace directory.""" + +def generate_individual_reports( + wrapper: TraceLensWrapper, + torch_prof_dir: Path, + output_dir: Path, + ranks: List[int], + verbose: bool, +) -> List[Path]: + """Generate individual performance reports for each rank.""" + +def generate_collective_report( + wrapper: TraceLensWrapper, + torch_prof_dir: Path, + output_dir: Path, + num_ranks: int, + verbose: bool, +) -> Optional[Path]: + """Generate multi-rank collective report.""" +``` + +#### `analysis/analyze_sweep.py` +```python +def analyze_sweep_config( + sweep_dir: Path, + output_dir: Optional[Path] = None, + use_geo_mean: bool = False, + verbose: bool = False, +) -> Optional[Path]: + """Process GPU timeline data from all individual reports in a sweep.""" + +def process_thread_config( + thread_config: str, + tracelens_dir: Path, + use_geo_mean: bool, + verbose: bool = False, +) -> List[pd.DataFrame]: + """Process a single thread configuration.""" + +def process_channel_config( + channel_config: str, + channel_groups: Dict[str, List[tuple]], + use_geo_mean: bool, + thread_config: str, + verbose: bool = False, +) -> Optional[pd.DataFrame]: + """Process a single channel configuration.""" + +def aggregate_rank_data( + rank_data: List[pd.DataFrame], + thread_config: str, + channel_config: str, + 
num_ranks: int, + use_geo_mean: bool, +) -> pd.DataFrame: + """Aggregate data across ranks and add metadata.""" +``` + +#### `analysis/analyze_gemm.py` +```python +def analyze_gemm_reports( + reports_dir: Path, + output_file: Optional[Path] = None, + top_k: int = 5, + threads: Optional[List[int]] = None, + channels: Optional[List[int]] = None, + ranks: Optional[List[int]] = None, + verbose: bool = False, +) -> Path: + """Analyze GEMM reports and extract top kernels by variance.""" + +def process_excel_file( + file_path: Path, + threads: int, + channel: int, + rank: int, + top_k: int, +) -> List[Dict]: + """Process a single Excel file and extract GEMM data.""" + +def extract_kernel_name(kernel_info_str: str) -> Optional[str]: + """Extract kernel name from kernel info string.""" +``` + +### 4.3 Data Flow + +``` +CLI (cli.py) + │ + ├── analyze single ───────────► analysis.analyze_single_config() + │ │ + │ ├── detect_trace_structure() + │ ├── discover_ranks() + │ ├── TraceLensWrapper.generate_perf_report() + │ └── TraceLensWrapper.generate_collective_report() + │ + ├── analyze sweep ────────────► analysis.analyze_sweep_config() + │ │ + │ ├── discover_configurations() + │ ├── process_configuration() + │ │ └── TraceLensWrapper.generate_perf_report() + │ ├── TraceLensWrapper.generate_collective_report() + │ └── TraceLensWrapper.compare_reports() + │ + └── analyze gemm ─────────────► analysis.analyze_gemm_reports() + │ + ├── process_excel_file() + └── write CSV output +``` + +--- + +## 5. 
Command Specifications
+
+### 5.1 `analyze single`
+
+| Aspect | Details |
+|--------|---------|
+| **Input** | Directory with torch_profiler/rank* structure |
+| **Output** | individual_reports/ and collective_reports/ |
+| **Options** | `--individual-only`, `--collective-only`, `-o` |
+| **TraceLens** | `generate_perf_report`, `generate_multi_rank_collective` |
+
+### 5.2 `analyze sweep`
+
+| Aspect | Details |
+|--------|---------|
+| **Input** | Sweep directory with thread/channel structure |
+| **Output** | Per-config reports + comparisons |
+| **Options** | `--geo-mean`, `-o` |
+| **TraceLens** | `generate_perf_report`, `generate_collective`, `compare_reports` |
+
+### 5.3 `analyze gemm`
+
+| Aspect | Details |
+|--------|---------|
+| **Input** | Directory with TraceLens Excel reports |
+| **Output** | CSV with GEMM kernel variance |
+| **Options** | `--top-k`, `-o` |
+| **Dependencies** | `openpyxl` for Excel reading |
+
+---
+
+## 6. TraceLens Integration
+
+### 6.1 GEMM Patch Requirements
+
+The TraceLens wrapper must apply these patches for ROCm GEMM recognition:
+
+1. **`kernel_name_parser`**: Recognize Tensile GEMM patterns (`Cijk_Alik_Bljk_...`)
+2. **`Trace2Tree.util`**: Enhanced `is_gemm_kernel()` function
+3. **`TraceEventUtils`**: Add GEMM keys for classification
+4.
**`torch_op_mapping`**: Better GEMM categorization + +### 6.2 TraceLens Functions Used + +| Function | PyTorch Mode | ROCprof Mode | +|----------|-------------|--------------| +| `generate_perf_report_pytorch` | ✓ | - | +| `generate_perf_report_rocprof` | - | ✓ | +| `generate_multi_rank_collective_report_pytorch` | ✓ | - | +| `compare_perf_reports_pytorch` | ✓ | ✓ | + +### 6.3 Common TraceLens Options + +```python +# Individual report options +INDIVIDUAL_REPORT_OPTIONS = { + "include_unlinked_kernels": True, + "short_kernel_study": True, + "short_kernel_threshold_us": 50, + "topk_ops": 100, + "topk_roofline_ops": 100, +} + +# ROCprof specific options +ROCPROF_OPTIONS = { + "kernel_details": True, + "topk_kernels": 100, +} + +# Collective report options +COLLECTIVE_REPORT_OPTIONS = { + "detailed_analysis": True, + "use_multiprocessing": True, +} +``` + +--- + +## 7. Implementation Status + +### Phase 1: Foundation ✅ + +1. **Created `analysis/` directory structure** ✅ +2. **Implemented `tracelens_wrapper.py`** ✅ + - GEMM patches for ROCm Tensile kernel recognition + - Wrapper class with methods for TraceLens commands + - Support for individual, collective, and rocprof reports + +### Phase 2: `analyze gemm` ✅ + +3. **Implemented `analyze_gemm.py`** ✅ + - Ported logic from `analyze_gemm_reports.py` + - Clean API with configurable threads/channels/ranks + - Progress reporting and summary statistics + +4. **Updated CLI for `analyze gemm`** ✅ + - Connected command to implementation + - Added multiple options for configuration + +### Phase 3: `analyze single` ✅ + +5. **Implemented `analyze_single.py`** ✅ + - Directory detection logic + - Report generation with TraceLens wrapper + - GPU timeline aggregation + - Status reporting + +6. **Updated CLI for `analyze single`** ✅ + - Added geo-mean and threshold options + +### Phase 4: `analyze sweep` ✅ + +7. 
**Implemented `analyze_sweep.py`** ✅ + - Thread/channel config discovery + - GPU timeline processing across all configs + - Excel output with pivot tables + +8. **Updated CLI for `analyze sweep`** ✅ + - Added geo-mean option + +### Phase 5: Documentation ✅ + +9. **Updated documentation** ✅ + - This dev docs file + - Implementation complete + +--- + +## 8. Expected Output + +### 8.1 `analyze single` Output + +``` +============================================================ +TraceLens Analysis - Single Configuration +============================================================ +Input directory: /path/to/traces +Torch profiler: /path/to/traces/torch_profiler +Detected 8 ranks + +Step 1: Generating Individual Reports + [1/8] Rank 0... ✓ perf_rank0.xlsx + [2/8] Rank 1... ✓ perf_rank1.xlsx + [3/8] Rank 2... ✓ perf_rank2.xlsx + [4/8] Rank 3... ✓ perf_rank3.xlsx + [5/8] Rank 4... ✓ perf_rank4.xlsx + [6/8] Rank 5... ✓ perf_rank5.xlsx + [7/8] Rank 6... ✓ perf_rank6.xlsx + [8/8] Rank 7... ✓ perf_rank7.xlsx + +Step 2: Generating Collective Report + Processing all 8 ranks... ✓ collective_all_ranks.xlsx + +============================================================ +Analysis Complete! +============================================================ +Output: /path/to/traces/tracelens_analysis/ + +Generated reports: + Individual: 8 + Collective: 1 +``` + +### 8.2 `analyze sweep` Output + +``` +============================================================ +TraceLens Analysis - Sweep +============================================================ +Sweep directory: /path/to/sweep +Mode: PyTorch profiler + +Discovered configurations: + 256thread: 28, 42, 56, 70 channels + 512thread: 28, 42, 56, 70 channels + Total: 8 configurations × 8 ranks = 64 reports + +Step 1: Generating Individual Reports + 256thread/28ch: + [1/8] Rank 0... ✓ + [2/8] Rank 1... ✓ + ... + 256thread/42ch: + ... + +Step 2: Generating Collective Reports + 256thread/28ch... ✓ + 256thread/42ch... ✓ + ... 
+ +Step 3: Generating Comparisons + 28ch across threads... ✓ + 42ch across threads... ✓ + ... + +============================================================ +Analysis Complete! +============================================================ +Output: /path/to/sweep/tracelens_analysis/ + +Summary: + Individual reports: 64 + Collective reports: 8 + Comparisons: 32 +``` + +### 8.3 `analyze gemm` Output + +``` +============================================================ +GEMM Kernel Variance Analysis +============================================================ +Base path: /path/to/tracelens_analysis +Configuration: + Threads: [256, 512] + Channels: [28, 42, 56, 70] + Ranks: [0, 1, 2, 3, 4, 5, 6, 7] + Top K: 5 + +Processing Excel files... + [1/64] perf_28ch_rank0.xlsx... 5 kernels found + [2/64] perf_28ch_rank1.xlsx... 5 kernels found + ... + +============================================================ +Analysis Complete! +============================================================ +Output: /path/to/tracelens_analysis/top5_gemm_kernels_time_variance.csv + +Summary: + Total kernels extracted: 320 + Unique kernel names: 45 + Max variance: 1234.56 µs + Avg variance: 89.12 µs + +Top 5 kernels by variance: + 1. Cijk_Alik_Bljk_... (256t/28ch/r0): 1234.56 µs + 2. Cijk_Alik_Bljk_... (512t/42ch/r3): 987.65 µs + ... 
+``` + +--- + +## Appendix A: Migration Checklist + +### From `run_tracelens_single_config.sh` +- [x] Directory structure detection +- [x] Rank discovery +- [x] Individual report generation loop +- [x] Symlink creation for collective report +- [x] Collective report generation +- [x] Summary output +- [x] GPU timeline aggregation + +### From `run_tracelens_analysis.sh` → `analyze sweep` +- [x] Thread config discovery +- [x] Channel config discovery +- [x] PyTorch trace file finding +- [x] GPU timeline processing per config +- [x] Summary Excel generation with pivot tables + +### From `analyze_gemm_reports.py` +- [x] Command-line argument handling +- [x] Excel file processing +- [x] GEMM sheet reading +- [x] Kernel name extraction +- [x] Variance calculation +- [x] CSV output + +--- + +## Appendix B: Error Handling + +| Scenario | Handling | +|----------|----------| +| Missing trace file | Log warning, continue with next | +| Missing rank directory | Log warning, continue with next | +| GEMM sheet not found | Log warning, skip file | +| TraceLens import error | Raise with helpful message | +| Permission error | Raise with fix instructions | +| No configurations found | Raise with expected structure | + diff --git a/src/aorta/report/analysis/__init__.py b/src/aorta/report/analysis/__init__.py new file mode 100644 index 0000000..e0bd324 --- /dev/null +++ b/src/aorta/report/analysis/__init__.py @@ -0,0 +1,14 @@ +"""Analysis modules for TraceLens trace processing.""" + +from .tracelens_wrapper import TraceLensWrapper +from .analyze_gemm import analyze_gemm_reports +from .analyze_single import analyze_single_config +from .analyze_sweep import analyze_sweep_config + +__all__ = [ + "TraceLensWrapper", + "analyze_gemm_reports", + "analyze_single_config", + "analyze_sweep_config", +] + diff --git a/src/aorta/report/analysis/analyze_gemm.py b/src/aorta/report/analysis/analyze_gemm.py new file mode 100644 index 0000000..a724019 --- /dev/null +++ 
"""
Analyze GEMM reports from TraceLens Excel files.

Extracts top N kernels with largest time variance (max - min) from
GEMM sheet data in individual performance reports.
"""

import csv
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import openpyxl


def extract_name_from_kernel_info(kernel_info_str: str) -> Optional[str]:
    """
    Extract the 'name' field from the kernel info string.

    Args:
        kernel_info_str: String containing kernel details, e.g.,
            "[{'name': '...', 'stream': ..., ...}]"

    Returns:
        Kernel name or None if extraction fails.
    """
    try:
        if kernel_info_str is None or kernel_info_str == "":
            return None
        # The cell holds a stringified list of dicts; regex out the first
        # 'name' value instead of eval()-ing spreadsheet content.
        match = re.search(r"'name':\s*'([^']+)'", str(kernel_info_str))
        return match.group(1) if match else None
    except Exception:
        # Malformed cell values must never abort the whole scan.
        return None


def column_letter_to_index(letter: str) -> int:
    """Convert an Excel column letter (e.g. 'AG') to a 0-based index."""
    index = 0
    for i, char in enumerate(reversed(letter.upper())):
        index += (ord(char) - ord("A") + 1) * (26**i)
    return index - 1


def _validate_gemm_headers(
    header_row: List[Any],
    file_path: Path,
    col_kernel_info: int,
    col_time_min: int,
    col_time_max: int,
) -> None:
    """Raise ValueError if the GEMM sheet headers are not where expected.

    Column names must match what TraceLens generates; a mismatch means the
    fixed X/AG/AH column positions would read the wrong data.
    """
    expected = [
        ("X", col_kernel_info, "kernel_details__summarize_kernel_stats"),
        ("AG", col_time_min, "Kernel Time (µs)_min"),
        ("AH", col_time_max, "Kernel Time (µs)_max"),
    ]
    errors = []
    for letter, col, expected_name in expected:
        if col < len(header_row):
            found = str(header_row[col]) if header_row[col] else ""
            if found != expected_name:
                errors.append(
                    f"Column {letter}: expected '{expected_name}', found '{found}'"
                )
        else:
            errors.append(f"Column {letter}: not found (only {len(header_row)} columns)")
    if errors:
        raise ValueError(
            f"Column validation failed in {file_path}:\n " + "\n ".join(errors)
        )


def process_excel_file(
    file_path: Path,
    threads: int,
    channel: int,
    rank: int,
    top_k: int = 5,
) -> List[Dict[str, Any]]:
    """
    Process a single Excel file and extract GEMM data.

    Args:
        file_path: Path to the Excel file
        threads: Thread configuration
        channel: Channel configuration
        rank: Rank number
        top_k: Number of top kernels to extract

    Returns:
        Up to ``top_k`` row dicts sorted by descending time variance
        (max - min); empty list if the GEMM sheet is missing or on error.
    """
    try:
        wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
        try:
            if "GEMM" not in wb.sheetnames:
                print(f"Warning: GEMM sheet not found in {file_path}")
                return []

            sheet = wb["GEMM"]

            # Expected column positions (0-based indices).
            col_kernel_info = column_letter_to_index("X")
            col_time_min = column_letter_to_index("AG")
            col_time_max = column_letter_to_index("AH")
            last_needed = max(col_kernel_info, col_time_min, col_time_max)

            rows_data: List[Dict[str, Any]] = []
            header_row: List[Any] = []

            for i, row in enumerate(sheet.iter_rows(values_only=True)):
                if i == 0:
                    # First row is the header; fail fast if the layout moved.
                    header_row = list(row)
                    _validate_gemm_headers(
                        header_row,
                        file_path,
                        col_kernel_info,
                        col_time_min,
                        col_time_max,
                    )
                    continue

                # Skip rows too short to contain all three columns of interest.
                if row is None or len(row) <= last_needed:
                    continue

                kernel_name = extract_name_from_kernel_info(row[col_kernel_info])
                kernel_time_min = row[col_time_min]
                kernel_time_max = row[col_time_max]

                # Variance requires both timing cells to be numeric.
                if kernel_time_min is None or kernel_time_max is None:
                    continue
                try:
                    time_diff = float(kernel_time_max) - float(kernel_time_min)
                except (ValueError, TypeError):
                    continue
                if not kernel_name:
                    continue

                row_dict: Dict[str, Any] = {
                    "threads": threads,
                    "channel": channel,
                    "rank": rank,
                    "kernel_name": kernel_name,
                    "kernel_time_min_us": kernel_time_min,
                    "kernel_time_max_us": kernel_time_max,
                    "time_diff_us": time_diff,
                }
                # Carry along every other sheet column, key-prefixed with
                # "col_" so they cannot clash with the metadata keys above.
                for j, val in enumerate(row):
                    if j < len(header_row) and header_row[j]:
                        row_dict[f"col_{header_row[j]}"] = val
                rows_data.append(row_dict)
        finally:
            # Fix: read-only workbooks keep the file handle open until
            # close(); the original leaked it when header validation raised.
            wb.close()

        # Top-k rows by variance, largest first.
        rows_data.sort(key=lambda x: x["time_diff_us"], reverse=True)
        return rows_data[:top_k]

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        import traceback
        traceback.print_exc()
        return []


def analyze_gemm_reports(
    base_path: Path,
    threads: List[int],
    channels: List[int],
    ranks: List[int],
    top_k: int = 5,
    output_file: str = "top5_gemm_kernels_time_variance.csv",
    verbose: bool = False,
) -> Optional[Path]:
    """
    Analyze GEMM reports from a sweep directory structure.

    Args:
        base_path: Path to tracelens_analysis directory
        threads: List of thread configurations to analyze (e.g., [256, 512])
        channels: List of channel configurations (e.g., [28, 42, 56, 70])
        ranks: List of ranks to analyze (e.g., [0, 1, 2, ..., 7])
        top_k: Number of top kernels to extract per file
        output_file: Output CSV filename (relative names resolve under
            ``base_path``)
        verbose: Whether to print verbose output

    Returns:
        Path to output file or None if no data processed.

    Raises:
        FileNotFoundError: If ``base_path`` does not exist.
    """
    if not base_path.exists():
        raise FileNotFoundError(f"Base path does not exist: {base_path}")

    if verbose:
        print("Configuration:")
        print(f" Base path: {base_path}")
        print(f" Threads: {threads}")
        print(f" Channels: {channels}")
        print(f" Ranks: {ranks}")
        print(f" Top K: {top_k}")
        print(f" Output file: {output_file}")
        print()

    all_results: List[Dict[str, Any]] = []

    print("Processing Excel files...")
    total_files = len(threads) * len(channels) * len(ranks)
    file_count = 0

    # Reports live at <base>/<N>thread/individual_reports/perf_<C>ch_rank<R>.xlsx
    for thread_count in threads:
        thread_dir = base_path / f"{thread_count}thread" / "individual_reports"

        for channel in channels:
            for rank in ranks:
                file_name = f"perf_{channel}ch_rank{rank}.xlsx"
                file_path = thread_dir / file_name

                file_count += 1
                if verbose:
                    print(f"Processing {file_count}/{total_files}: {file_name}")

                # Missing files are expected (partial sweeps); skip quietly.
                if not file_path.exists():
                    if verbose:
                        print(f" Warning: File not found: {file_path}")
                    continue

                results = process_excel_file(
                    file_path, thread_count, channel, rank, top_k
                )
                if results:
                    all_results.extend(results)
                    if verbose:
                        print(f" Found {len(results)} kernels")

    if not all_results:
        print("Error: No data extracted!")
        return None

    # Sort by time_diff_us descending across all files.
    print("\nCombining and sorting results...")
    all_results.sort(key=lambda x: x["time_diff_us"], reverse=True)

    # Union of keys across rows (the per-file "col_*" extras can differ).
    all_keys: set = set()
    for row in all_results:
        all_keys.update(row.keys())

    # Order columns: metadata first, then the rest alphabetically.
    metadata_cols = [
        "threads",
        "channel",
        "rank",
        "kernel_name",
        "kernel_time_min_us",
        "kernel_time_max_us",
        "time_diff_us",
    ]
    metadata_set = set(metadata_cols)
    other_cols = sorted(k for k in all_keys if k not in metadata_set)
    ordered_cols = metadata_cols + other_cols

    # Relative output names resolve under the analysis directory.
    output_path = Path(output_file)
    if not output_path.is_absolute():
        output_path = base_path / output_file

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=ordered_cols)
        writer.writeheader()
        for row in all_results:
            # Fill in missing keys with None so every row has all columns.
            writer.writerow({k: row.get(k, None) for k in ordered_cols})

    print(f"\nResults saved to: {output_path}")
    print(f"Total rows: {len(all_results)}")

    print(f"\nTop {min(10, len(all_results))} kernels by time difference:")
    for i, row in enumerate(all_results[:10]):
        print(
            f"{i+1}. threads={row['threads']}, ch={row['channel']}, rank={row['rank']}, "
            f"diff={row['time_diff_us']:.4f}us"
        )
        print(f" {row['kernel_name'][:100]}...")

    time_diffs = [r["time_diff_us"] for r in all_results]
    kernel_names = set(r["kernel_name"] for r in all_results)

    print("\nSummary Statistics:")
    print(f"Total unique kernels: {len(kernel_names)}")
    print(f"Average time difference: {sum(time_diffs)/len(time_diffs):.4f} us")
    print(f"Max time difference: {max(time_diffs):.4f} us")
    print(f"Min time difference: {min(time_diffs):.4f} us")

    return output_path
+ +Generates individual per-rank performance reports and multi-rank collective reports +using TraceLens with GEMM patches for ROCm Tensile kernel recognition. +""" + +from pathlib import Path +from typing import List, Optional, Tuple +import numpy as np +import pandas as pd + +from .tracelens_wrapper import TraceLensWrapper + + +def geometric_mean(values: np.ndarray) -> float: + """Calculate geometric mean, handling zeros.""" + values = np.array(values) + values = np.where(values == 0, 1e-10, values) + return float(np.exp(np.mean(np.log(values)))) + + +def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]: + """ + Auto-detect directory structure for traces. + + Args: + input_dir: Input directory path + + Returns: + Tuple of (torch_profiler_dir, base_dir) + + Raises: + ValueError: If directory structure cannot be determined + """ + # Check if input_dir contains rank directories (i.e., it IS torch_profiler/) + rank_dirs = list(input_dir.glob("rank*")) + if rank_dirs: + return input_dir, input_dir.parent + + # Check if input_dir contains torch_profiler/ subdirectory + torch_prof_dir = input_dir / "torch_profiler" + if torch_prof_dir.exists(): + rank_dirs = list(torch_prof_dir.glob("rank*")) + if rank_dirs: + return torch_prof_dir, input_dir + + raise ValueError( + f"Cannot find rank directories in expected structure.\n" + f"Expected one of:\n" + f" 1. Directory with rank0/, rank1/, ... subdirectories (torch_profiler/)\n" + f" 2. Parent directory containing torch_profiler/rank0/, rank1/, ...\n" + f"Provided: {input_dir}" + ) + + +def find_trace_file(rank_dir: Path) -> Optional[Path]: + """Find trace file in a rank directory.""" + json_files = list(rank_dir.glob("*.json")) + if json_files: + return json_files[0] + return None + + +def process_gpu_timeline( + reports_dir: Path, + use_geo_mean: bool = False, + verbose: bool = False, +) -> Optional[Path]: + """ + Create mean/geometric mean aggregated GPU timeline across all ranks. 
+ + Args: + reports_dir: Path to individual_reports directory + use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean + verbose: Whether to print verbose output + + Returns: + Path to output Excel file or None if no data processed + """ + if not reports_dir.exists(): + raise FileNotFoundError(f"Directory not found: {reports_dir}") + + print(f"Processing GPU timeline from: {reports_dir}") + print(f"Aggregation: {'Geometric Mean' if use_geo_mean else 'Arithmetic Mean'}") + + perf_files = sorted(reports_dir.glob("perf_rank*.xlsx")) + + if not perf_files: + print("Error: No perf_rank*.xlsx files found") + return None + + print(f"Found {len(perf_files)} rank files") + + rank_data = [] + for file_path in perf_files: + rank_num = int(file_path.stem.replace("perf_rank", "")) + try: + df = pd.read_excel(file_path, sheet_name="gpu_timeline") + df["rank"] = rank_num + rank_data.append(df) + if verbose: + print(f" Rank {rank_num}: OK") + except Exception as e: + print(f" Rank {rank_num}: Error - {e}") + + if not rank_data: + print("Error: No valid data loaded") + return None + + combined = pd.concat(rank_data, ignore_index=True) + + agg_func = geometric_mean if use_geo_mean else "mean" + aggregated = ( + combined.groupby("type") + .agg({"time ms": agg_func, "percent": agg_func}) + .reset_index() + ) + + aggregated["num_ranks"] = len(perf_files) + + method_suffix = "geomean" if use_geo_mean else "mean" + output_path = reports_dir.parent / f"gpu_timeline_summary_{method_suffix}.xlsx" + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + aggregated.to_excel(writer, sheet_name="Summary", index=False) + + combined_sorted = combined.sort_values(["rank", "type"]) + combined_sorted.to_excel(writer, sheet_name="All_Ranks_Combined", index=False) + + per_rank = combined.pivot_table( + values="time ms", index="type", columns="rank", aggfunc="first" + ) + per_rank.to_excel(writer, sheet_name="Per_Rank_Time_ms") + + per_rank_pct = combined.pivot_table( + 
values="percent", index="type", columns="rank", aggfunc="first" + ) + per_rank_pct.to_excel(writer, sheet_name="Per_Rank_Percent") + + print(f"\nSaved: {output_path}") + print("\nSummary:") + print(aggregated.to_string(index=False)) + + return output_path + + +def analyze_single_config( + input_dir: Path, + output_dir: Optional[Path] = None, + run_individual: bool = True, + run_collective: bool = True, + aggregate_timeline: bool = True, + use_geo_mean: bool = False, + short_kernel_threshold_us: int = 50, + topk_ops: int = 100, + verbose: bool = False, +) -> dict: + """ + Run TraceLens analysis on a single configuration trace directory. + + Args: + input_dir: Path to trace directory (torch_profiler/ or its parent) + output_dir: Output directory (default: input_dir/tracelens_analysis) + run_individual: Generate individual per-rank reports + run_collective: Generate multi-rank collective report + aggregate_timeline: Aggregate GPU timeline across ranks + use_geo_mean: Use geometric mean for aggregation + short_kernel_threshold_us: Threshold for short kernel study + topk_ops: Number of top operations to include + verbose: Whether to print verbose output + + Returns: + Dictionary with paths to generated reports + """ + input_path = Path(input_dir) + + # Detect directory structure + torch_prof_dir, base_dir = detect_trace_directory(input_path) + + # Set output directory + if output_dir is None: + output_path = base_dir / "tracelens_analysis" + else: + output_path = Path(output_dir) + + output_path.mkdir(parents=True, exist_ok=True) + individual_reports_dir = output_path / "individual_reports" + collective_reports_dir = output_path / "collective_reports" + + if run_individual: + individual_reports_dir.mkdir(parents=True, exist_ok=True) + if run_collective: + collective_reports_dir.mkdir(parents=True, exist_ok=True) + + # Detect ranks + rank_dirs = sorted(torch_prof_dir.glob("rank*")) + num_ranks = len(rank_dirs) + + if num_ranks == 0: + raise ValueError(f"No rank 
directories found in {torch_prof_dir}") + + print("=" * 80) + print("TraceLens Analysis - Single Configuration") + print("=" * 80) + print(f"\nInput directory: {input_path}") + print(f"Torch profiler traces: {torch_prof_dir}") + print(f"Detected {num_ranks} ranks") + print(f"Output directory: {output_path}") + + results = { + "output_dir": output_path, + "individual_reports": [], + "collective_report": None, + "gpu_timeline_summary": None, + } + + # Initialize TraceLens wrapper + wrapper = TraceLensWrapper(verbose=verbose) + + # Step 1: Generate individual reports + if run_individual: + print("\n" + "=" * 80) + print("Step 1: Generating Individual Performance Reports") + print("=" * 80) + + for rank_dir in rank_dirs: + rank_name = rank_dir.name + # Extract rank number + if rank_name.startswith("rank"): + rank_num = rank_name[4:] # Remove "rank" prefix + try: + rank_num = int(rank_num.lstrip("_").lstrip("0") or "0") + except ValueError: + rank_num = rank_name + + trace_file = find_trace_file(rank_dir) + if trace_file is None: + print(f" Skip {rank_name} - no trace file found") + continue + + output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx" + + print(f"\nProcessing {rank_name}...") + print(f" Trace: {trace_file.name}") + + try: + wrapper.generate_perf_report( + trace_path=trace_file, + output_path=output_file, + include_unlinked_kernels=True, + short_kernel_study=True, + short_kernel_threshold_us=short_kernel_threshold_us, + topk_ops=topk_ops, + topk_roofline_ops=topk_ops, + ) + print(f" Done: {output_file.name}") + results["individual_reports"].append(output_file) + except Exception as e: + print(f" Error processing {rank_name}: {e}") + + # Step 2: Generate collective report + if run_collective: + print("\n" + "=" * 80) + print("Step 2: Generating Multi-Rank Collective Report") + print("=" * 80) + + output_file = collective_reports_dir / "collective_all_ranks.xlsx" + + # Create trace.json symlinks for consistent pattern + for rank_dir in rank_dirs: 
+ trace_file = find_trace_file(rank_dir) + if trace_file: + symlink_path = rank_dir / "trace.json" + if not symlink_path.exists(): + try: + symlink_path.symlink_to(trace_file.name) + except (OSError, FileExistsError): + pass # Symlink already exists or cannot be created + + trace_pattern = str(torch_prof_dir / "rank*" / "trace.json") + + print(f"\nGenerating collective report for {num_ranks} ranks...") + print(f" Trace pattern: rank*/trace.json") + + try: + wrapper.generate_collective_report( + trace_pattern=trace_pattern, + world_size=num_ranks, + output_path=output_file, + detailed_analysis=True, + use_multiprocessing=True, + ) + print(f" Done: {output_file.name}") + results["collective_report"] = output_file + except Exception as e: + print(f" Error generating collective report: {e}") + + # Step 3: Aggregate GPU timeline + if aggregate_timeline and run_individual: + print("\n" + "=" * 80) + print("Step 3: Aggregating GPU Timeline") + print("=" * 80) + + try: + summary_path = process_gpu_timeline( + reports_dir=individual_reports_dir, + use_geo_mean=use_geo_mean, + verbose=verbose, + ) + results["gpu_timeline_summary"] = summary_path + except Exception as e: + print(f" Error aggregating GPU timeline: {e}") + + # Print summary + print("\n" + "=" * 80) + print("Analysis Complete!") + print("=" * 80) + print(f"\n📁 Results saved to: {output_path}") + print(f"\nGenerated reports:") + print(f" Individual reports: {len(results['individual_reports'])}") + print(f" Collective report: {'Yes' if results['collective_report'] else 'No'}") + print(f" GPU timeline summary: {'Yes' if results['gpu_timeline_summary'] else 'No'}") + + if results["individual_reports"]: + print("\n📊 Individual Performance Reports:") + for report in results["individual_reports"]: + print(f" {report.name}") + + if results["collective_report"]: + print(f"\n📊 Collective Report:") + print(f" {results['collective_report'].name}") + + if results["gpu_timeline_summary"]: + print(f"\n📊 GPU Timeline Summary:") 
+ print(f" {results['gpu_timeline_summary'].name}") + + return results + diff --git a/src/aorta/report/analysis/analyze_sweep.py b/src/aorta/report/analysis/analyze_sweep.py new file mode 100644 index 0000000..9626769 --- /dev/null +++ b/src/aorta/report/analysis/analyze_sweep.py @@ -0,0 +1,412 @@ +""" +Sweep configuration analysis - analyze traces from parameter sweep experiments. + +Processes GPU timeline data from TraceLens individual reports across multiple +thread and channel configurations, aggregating across ranks. +""" + +import glob +from pathlib import Path +from typing import Dict, List, Optional, Any + +import numpy as np +import pandas as pd + + +def geometric_mean(values: np.ndarray) -> float: + """Calculate geometric mean, handling zeros.""" + values = np.array(values) + # Replace zeros with small value to avoid log(0) + values = np.where(values == 0, 1e-10, values) + return float(np.exp(np.mean(np.log(values)))) + + +def parse_perf_filename(filename: str) -> tuple: + """ + Parse performance filename to extract channel config and rank. + + Args: + filename: e.g., 'perf_28ch_rank0.xlsx' + + Returns: + tuple: (channel_config, rank) e.g., ('28ch', 0) + """ + parts = filename.replace("perf_", "").replace(".xlsx", "").split("_") + channel_config = parts[0] # e.g., "28ch" + rank = int(parts[1].replace("rank", "")) + return channel_config, rank + + +def group_files_by_channel(perf_files: List[str]) -> Dict[str, List[tuple]]: + """ + Group performance files by channel configuration. 
+ + Args: + perf_files: List of file paths + + Returns: + dict: {channel_config: [(rank, file_path), ...]} + """ + channel_groups = {} + for file_path in perf_files: + filename = Path(file_path).name + channel_config, rank = parse_perf_filename(filename) + + if channel_config not in channel_groups: + channel_groups[channel_config] = [] + channel_groups[channel_config].append((rank, file_path)) + + return channel_groups + + +def read_rank_data(rank_files: List[tuple], verbose: bool = False) -> List[pd.DataFrame]: + """ + Read gpu_timeline data from all rank files. + + Args: + rank_files: List of (rank, file_path) tuples + verbose: Whether to print verbose output + + Returns: + list: List of DataFrames with rank column added + """ + rank_data = [] + for rank, file_path in rank_files: + try: + df = pd.read_excel(file_path, sheet_name="gpu_timeline") + df["rank"] = rank + rank_data.append(df) + except Exception as e: + if verbose: + print(f" Warning: Could not read {Path(file_path).name}: {e}") + return rank_data + + +def aggregate_rank_data( + rank_data: List[pd.DataFrame], + thread_config: str, + channel_config: str, + num_ranks: int, + use_geo_mean: bool, +) -> pd.DataFrame: + """ + Aggregate data across ranks and add metadata. 
+ + Args: + rank_data: List of DataFrames + thread_config: Thread configuration string (e.g., '256thread') + channel_config: Channel configuration string (e.g., '28ch') + num_ranks: Number of ranks + use_geo_mean: Whether to use geometric mean + + Returns: + DataFrame: Aggregated data with metadata + """ + combined = pd.concat(rank_data, ignore_index=True) + + agg_func = geometric_mean if use_geo_mean else "mean" + aggregated = ( + combined.groupby("type") + .agg({"time ms": agg_func, "percent": agg_func}) + .reset_index() + ) + + # Add metadata + aggregated["thread_config"] = thread_config + aggregated["threads_num"] = int(thread_config.replace("thread", "")) + aggregated["channel_config"] = channel_config + aggregated["channels_num"] = int(channel_config.replace("ch", "")) + aggregated["full_config"] = f"{thread_config}_{channel_config}" + aggregated["num_ranks"] = num_ranks + + return aggregated + + +def process_channel_config( + channel_config: str, + channel_groups: Dict[str, List[tuple]], + use_geo_mean: bool, + thread_config: str, + verbose: bool = False, +) -> Optional[pd.DataFrame]: + """ + Process a single channel configuration. 
+ + Args: + channel_config: Channel configuration string + channel_groups: Dict of channel groups + use_geo_mean: Whether to use geometric mean + thread_config: Thread configuration string + verbose: Whether to print verbose output + + Returns: + DataFrame or None: Aggregated data, or None if no valid data + """ + rank_files = sorted(channel_groups[channel_config], key=lambda x: x[0]) + num_ranks = len(rank_files) + + if verbose: + print(f" {channel_config}: Processing {num_ranks} ranks...") + + rank_data = read_rank_data(rank_files, verbose) + + if not rank_data: + if verbose: + print(f" No valid data for {channel_config}") + return None + + aggregated = aggregate_rank_data( + rank_data, thread_config, channel_config, num_ranks, use_geo_mean + ) + if verbose: + print(f" [OK] Aggregated across {num_ranks} ranks") + + return aggregated + + +def process_thread_config( + thread_config: str, + tracelens_dir: Path, + use_geo_mean: bool, + verbose: bool = False, +) -> List[pd.DataFrame]: + """ + Process a single thread configuration. 
+ + Args: + thread_config: Thread configuration string + tracelens_dir: Path to tracelens_analysis directory + use_geo_mean: Whether to use geometric mean + verbose: Whether to print verbose output + + Returns: + list: List of aggregated DataFrames + """ + individual_reports_dir = tracelens_dir / thread_config / "individual_reports" + + if not individual_reports_dir.exists(): + if verbose: + print(f" Warning: {individual_reports_dir} not found, skipping...") + return [] + + if verbose: + print(f"\nProcessing: {thread_config}") + print("-" * 60) + + perf_files = sorted(glob.glob(str(individual_reports_dir / "perf_*ch_rank*.xlsx"))) + + if not perf_files: + if verbose: + print(f" Warning: No performance files found in {individual_reports_dir}") + return [] + + channel_groups = group_files_by_channel(perf_files) + results = [] + + # Process each channel configuration (sorted by channel number) + sorted_channels = sorted( + channel_groups.keys(), key=lambda x: int(x.replace("ch", "")) + ) + for channel_config in sorted_channels: + aggregated = process_channel_config( + channel_config, channel_groups, use_geo_mean, thread_config, verbose + ) + if aggregated is not None: + results.append(aggregated) + + return results + + +def create_pivot_sheet(df: pd.DataFrame, value_col: str) -> pd.DataFrame: + """ + Create a pivot table from the dataframe. + + Args: + df: Source DataFrame + value_col: Column to use for values + + Returns: + DataFrame: Pivot table + """ + return df.pivot_table( + values=value_col, index="type", columns="full_config", aggfunc="first" + ) + + +def create_summary_sheet(df: pd.DataFrame) -> pd.DataFrame: + """ + Create a summary sheet with key metrics per configuration. 
+ + Args: + df: Source DataFrame + + Returns: + DataFrame: Summary table + """ + summary = ( + df.groupby("full_config") + .agg({"threads_num": "first", "channels_num": "first", "num_ranks": "first"}) + .reset_index() + ) + + # Add key metrics for each config + key_metrics = [ + "computation_time", + "exposed_comm_time", + "busy_time", + "idle_time", + "total_time", + ] + for metric_type in key_metrics: + metric_data = df[df["type"] == metric_type].set_index("full_config")["time ms"] + summary[f"{metric_type}_ms"] = summary["full_config"].map(metric_data) + + return summary + + +def print_summary_report(final_df: pd.DataFrame, verbose: bool = False) -> None: + """Print summary statistics and comparisons.""" + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + + print("\nMetric Types Found:") + for metric_type in sorted(final_df["type"].unique()): + count = len(final_df[final_df["type"] == metric_type]) + print(f" {metric_type:<25} ({count} configurations)") + + print("\nConfigurations Processed:") + configs = final_df.groupby("full_config")["num_ranks"].first().sort_index() + for config, num_ranks in configs.items(): + print(f" {config:<25} ({num_ranks} ranks)") + + if verbose: + print("\n" + "=" * 80) + print("KEY METRICS COMPARISON (Sorted by Busy Time)") + print("=" * 80) + + for metric, desc in [ + ("busy_time", "Busy Time (lower is better)"), + ("idle_time", "Idle Time (lower is better)"), + ]: + metric_data = final_df[final_df["type"] == metric][ + ["full_config", "time ms", "percent"] + ].sort_values("time ms") + print(f"\n{desc}:") + print(metric_data.to_string(index=False)) + + +def analyze_sweep_config( + sweep_dir: Path, + output_dir: Optional[Path] = None, + use_geo_mean: bool = False, + verbose: bool = False, +) -> Optional[Path]: + """ + Process GPU timeline data from all individual reports in a sweep. 
+ + Args: + sweep_dir: Path to sweep directory containing tracelens_analysis/ + output_dir: Output directory (default: sweep_dir/tracelens_analysis/) + use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean + verbose: Whether to print verbose output + + Returns: + Path to output Excel file or None if no data processed + """ + sweep_path = Path(sweep_dir) + tracelens_dir = sweep_path / "tracelens_analysis" + + if not tracelens_dir.exists(): + raise FileNotFoundError( + f"tracelens_analysis directory not found in {sweep_dir}" + ) + + agg_method = "Geometric Mean" if use_geo_mean else "Arithmetic Mean" + print("=" * 80) + print(f"Processing GPU Timeline data from: {sweep_dir}") + print(f"Aggregation method: {agg_method}") + print("=" * 80) + + # Find all thread configurations + thread_configs = [ + d.name for d in tracelens_dir.iterdir() if d.is_dir() and "thread" in d.name + ] + + if not thread_configs: + raise ValueError("No thread configuration directories found") + + print(f"\nFound thread configurations: {sorted(thread_configs)}") + + # Process all thread configurations + all_results = [] + for thread_config in sorted(thread_configs): + results = process_thread_config(thread_config, tracelens_dir, use_geo_mean, verbose) + all_results.extend(results) + + if not all_results: + print("\nError: No data was processed") + return None + + # Combine and format results + print("\n" + "=" * 80) + print("CREATING OUTPUT FILE") + print("=" * 80) + + final_df = pd.concat(all_results, ignore_index=True) + + # Reorder and sort + column_order = [ + "full_config", + "threads_num", + "thread_config", + "channels_num", + "channel_config", + "num_ranks", + "type", + "time ms", + "percent", + ] + final_df = final_df[column_order] + final_df = final_df.sort_values(["threads_num", "channels_num", "type"]) + + # Determine output path + if output_dir: + output_path = Path(output_dir) + else: + output_path = tracelens_dir + + method_suffix = "geomean" if use_geo_mean 
else "mean" + output_file = output_path / f"gpu_timeline_all_configs_{method_suffix}.xlsx" + + # Save to Excel with multiple sheets + with pd.ExcelWriter(output_file, engine="openpyxl") as writer: + final_df.to_excel(writer, sheet_name="All_Data", index=False) + create_pivot_sheet(final_df, "time ms").to_excel( + writer, sheet_name="Pivot_Time_ms" + ) + create_pivot_sheet(final_df, "percent").to_excel( + writer, sheet_name="Pivot_Percent" + ) + create_summary_sheet(final_df).to_excel( + writer, sheet_name="Summary_By_Config", index=False + ) + + print(f"[SAVED] {output_file}") + print(" Sheets created:") + print(" 1. All_Data - Complete dataset") + print(" 2. Pivot_Time_ms - Matrix view of time (ms)") + print(" 3. Pivot_Percent - Matrix view of percentages") + print(" 4. Summary_By_Config - Key metrics per configuration") + + # Print summary + print_summary_report(final_df, verbose) + + print("\n" + "=" * 80) + print("COMPLETE!") + print("=" * 80) + print(f"\nOutput file: {output_file}") + print("Open in Excel to create custom pivots and charts!") + + return output_file + diff --git a/src/aorta/report/analysis/tracelens_wrapper.py b/src/aorta/report/analysis/tracelens_wrapper.py new file mode 100644 index 0000000..3b857a6 --- /dev/null +++ b/src/aorta/report/analysis/tracelens_wrapper.py @@ -0,0 +1,342 @@ +""" +TraceLens wrapper with GEMM recognition patches. + +Applies patches to TraceLens for better ROCm Tensile kernel recognition +and provides a clean Python API for TraceLens commands. 
+""" + +import re +import sys +from pathlib import Path +from typing import List, Optional, Dict, Any + + +class TraceLensWrapper: + """GEMM-patched TraceLens wrapper.""" + + _patches_applied = False + + def __init__(self, verbose: bool = False): + """Initialize wrapper and apply GEMM patches.""" + self.verbose = verbose + if not TraceLensWrapper._patches_applied: + self._apply_gemm_patches() + TraceLensWrapper._patches_applied = True + + def _log(self, message: str) -> None: + """Log message if verbose mode is enabled.""" + if self.verbose: + print(message) + + def _apply_gemm_patches(self) -> None: + """Apply all GEMM recognition patches to TraceLens.""" + self._log("Applying TraceLens GEMM recognition patches...") + + # Patch kernel_name_parser for enhanced ROCm GEMM recognition + try: + from TraceLens.PerfModel import kernel_name_parser + + def patched_is_rocm_gemm(kernel_name): + """Enhanced ROCm GEMM pattern matching for Tensile kernels.""" + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + return bool(re.match(pattern, kernel_name)) + + def patched_parse_rocm_gemm(kernel_name): + """Parse ROCm GEMM kernel details.""" + trans_a, trans_b = None, None + if "_Ailk_" in kernel_name: + trans_a = False + elif "_Alik_" in kernel_name: + trans_a = True + if "_Bljk_" in kernel_name: + trans_b = False + elif "_Bjlk_" in kernel_name: + trans_b = True + + macro_tile_match = re.search(r"MT(\d+)x(\d+)x(\d+)", kernel_name) + if macro_tile_match: + mt_m = int(macro_tile_match.group(1)) + mt_n = int(macro_tile_match.group(2)) + depth_u = int(macro_tile_match.group(3)) + else: + mt_m, mt_n, depth_u = None, None, None + + return { + "transpose": (trans_a, trans_b), + "mt_m": mt_m, + "mt_n": mt_n, + "depth_u": depth_u, + } + + def patched_gemm_name_parser(kernel_name): + """Enhanced GEMM name parser with better ROCm support.""" + if patched_is_rocm_gemm(kernel_name): + return patched_parse_rocm_gemm(kernel_name) + elif kernel_name_parser.is_cuda_gemm(kernel_name): + return 
kernel_name_parser.parse_cuda_gemm(kernel_name) + return None + + kernel_name_parser.is_rocm_gemm = patched_is_rocm_gemm + kernel_name_parser.parse_rocm_gemm = patched_parse_rocm_gemm + kernel_name_parser.gemm_name_parser = patched_gemm_name_parser + + self._log(" [OK] Patched kernel_name_parser (ROCm GEMM recognition)") + except ImportError as e: + self._log(f" [WARN] Could not patch kernel_name_parser: {e}") + + # Patch Trace2Tree util for is_gemm_kernel function + try: + from TraceLens.Trace2Tree import util as trace_util + + def patched_is_gemm_kernel(kernel_event: dict) -> bool: + """Enhanced GEMM kernel detection.""" + assert kernel_event["cat"] == "kernel" + kernel_name = kernel_event["name"] + + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + is_rocm_gemm = bool(re.match(pattern, kernel_name)) + is_cuda_gemm = kernel_name.startswith("nvjet") or "cublasLt" in kernel_name + + return is_rocm_gemm or is_cuda_gemm + + trace_util.is_gemm_kernel = patched_is_gemm_kernel + self._log(" [OK] Patched Trace2Tree.util (is_gemm_kernel)") + except ImportError as e: + self._log(f" [WARN] Could not patch Trace2Tree.util: {e}") + + # Patch TraceEventUtils to enhance GEMM keys + try: + from TraceLens import util as tracelens_util + + if hasattr(tracelens_util, "TraceEventUtils"): + if hasattr(tracelens_util.TraceEventUtils, "JaxOpKeys"): + original_gemm_keys = tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys + enhanced_gemm_keys = [ + "Cijk", + "gemm", + "nvjet", + "cublasLt", + "C[a-z]{3}_A[a-z]{3}_B[a-z]{3}", + ] + all_keys = list(set(original_gemm_keys + enhanced_gemm_keys)) + tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys = all_keys + self._log(" [OK] Patched TraceEventUtils.JaxOpKeys (GEMM keys enhanced)") + except (ImportError, AttributeError) as e: + self._log(f" [WARN] Could not patch TraceEventUtils: {e}") + + # Patch torch_op_mapping for better categorization + try: + from TraceLens.PerfModel import torch_op_mapping + + original_categorize = 
torch_op_mapping.categorize_torch_op + + def patched_categorize_torch_op(row): + """Enhanced categorization with better GEMM detection.""" + result = original_categorize(row) + + if result == "other" and "kernel_details" in row and len(row["kernel_details"]) > 0: + kernel_name = row["kernel_details"][0]["name"] + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + if re.match(pattern, kernel_name): + return "GEMM" + + return result + + torch_op_mapping.categorize_torch_op = patched_categorize_torch_op + self._log(" [OK] Patched torch_op_mapping (categorize_torch_op)") + except ImportError as e: + self._log(f" [WARN] Could not patch torch_op_mapping: {e}") + + self._log("[OK] GEMM patches applied\n") + + def generate_perf_report( + self, + trace_path: Path, + output_path: Path, + include_unlinked_kernels: bool = True, + short_kernel_study: bool = True, + short_kernel_threshold_us: int = 50, + topk_ops: int = 100, + topk_roofline_ops: int = 100, + enable_kernel_summary: bool = False, + ) -> Path: + """ + Generate individual performance report from trace data. 
+ + Args: + trace_path: Path to the trace JSON file + output_path: Path for output Excel file + include_unlinked_kernels: Include unlinked kernels in report + short_kernel_study: Enable short kernel study + short_kernel_threshold_us: Threshold for short kernels (microseconds) + topk_ops: Number of top operations to include + topk_roofline_ops: Number of top roofline operations + enable_kernel_summary: Enable kernel summary sheet + + Returns: + Path to generated report + """ + from TraceLens.Reporting.generate_perf_report_pytorch import main as generate_main + + # Build argument list + args = [ + "--profile_json_path", str(trace_path), + "--output_xlsx_path", str(output_path), + ] + + if include_unlinked_kernels: + args.append("--include_unlinked_kernels") + if short_kernel_study: + args.append("--short_kernel_study") + args.extend(["--short_kernel_threshold_us", str(short_kernel_threshold_us)]) + if topk_ops: + args.extend(["--topk_ops", str(topk_ops)]) + if topk_roofline_ops: + args.extend(["--topk_roofline_ops", str(topk_roofline_ops)]) + if enable_kernel_summary: + args.append("--enable_kernel_summary") + + # Save original argv and replace + original_argv = sys.argv + sys.argv = ["generate_perf_report_pytorch"] + args + + try: + generate_main() + finally: + sys.argv = original_argv + + return output_path + + def generate_perf_report_rocprof( + self, + trace_path: Path, + output_path: Path, + kernel_details: bool = True, + short_kernel_study: bool = True, + short_kernel_threshold_us: int = 50, + topk_kernels: int = 100, + ) -> Path: + """ + Generate performance report from rocprof trace data. 
+ + Args: + trace_path: Path to the rocprof results JSON file + output_path: Path for output Excel file + kernel_details: Include kernel details + short_kernel_study: Enable short kernel study + short_kernel_threshold_us: Threshold for short kernels + topk_kernels: Number of top kernels to include + + Returns: + Path to generated report + """ + from TraceLens.Reporting.generate_perf_report_rocprof import main as generate_main + + args = [ + "--profile_json_path", str(trace_path), + "--output_xlsx_path", str(output_path), + ] + + if kernel_details: + args.append("--kernel_details") + if short_kernel_study: + args.append("--short_kernel_study") + args.extend(["--short_kernel_threshold_us", str(short_kernel_threshold_us)]) + if topk_kernels: + args.extend(["--topk_kernels", str(topk_kernels)]) + + original_argv = sys.argv + sys.argv = ["generate_perf_report_rocprof"] + args + + try: + generate_main() + finally: + sys.argv = original_argv + + return output_path + + def generate_collective_report( + self, + trace_pattern: str, + world_size: int, + output_path: Path, + detailed_analysis: bool = True, + use_multiprocessing: bool = True, + ) -> Path: + """ + Generate multi-rank collective report. 
+ + Args: + trace_pattern: Glob pattern for trace files (e.g., "rank*/trace.json") + world_size: Number of ranks + output_path: Path for output Excel file + detailed_analysis: Enable detailed analysis + use_multiprocessing: Use multiprocessing for parallel analysis + + Returns: + Path to generated report + """ + from TraceLens.Reporting.generate_multi_rank_collective_report_pytorch import ( + main as generate_main, + ) + + args = [ + "--trace_pattern", str(trace_pattern), + "--world_size", str(world_size), + "--output_xlsx_path", str(output_path), + ] + + if detailed_analysis: + args.append("--detailed_analysis") + if use_multiprocessing: + args.append("--use_multiprocessing") + + original_argv = sys.argv + sys.argv = ["generate_multi_rank_collective_report_pytorch"] + args + + try: + generate_main() + finally: + sys.argv = original_argv + + return output_path + + def compare_reports( + self, + report_paths: List[Path], + names: List[str], + output_path: Path, + sheets: Optional[List[str]] = None, + ) -> Path: + """ + Compare multiple performance reports. 
+ + Args: + report_paths: List of paths to Excel reports + names: Names for each report + output_path: Path for output comparison file + sheets: Sheets to compare (default: gpu_timeline, ops_summary) + + Returns: + Path to generated comparison report + """ + from TraceLens.Reporting.compare_perf_reports_pytorch import main as compare_main + + if sheets is None: + sheets = ["gpu_timeline", "ops_summary"] + + args = [str(p) for p in report_paths] + args.extend(["--names"] + names) + args.extend(["--sheets"] + sheets) + args.extend(["-o", str(output_path)]) + + original_argv = sys.argv + sys.argv = ["compare_perf_reports_pytorch"] + args + + try: + compare_main() + finally: + sys.argv = original_argv + + return output_path + diff --git a/src/aorta/report/cli.py b/src/aorta/report/cli.py index 33eadd9..e6e08b7 100644 --- a/src/aorta/report/cli.py +++ b/src/aorta/report/cli.py @@ -67,9 +67,15 @@ def analyze(ctx): @click.argument("trace_dir", type=click.Path(exists=True)) @click.option("--individual-only", is_flag=True, help="Generate only individual reports") @click.option("--collective-only", is_flag=True, help="Generate only collective report") +@click.option("--geo-mean", is_flag=True, help="Use geometric mean for timeline aggregation") +@click.option("--short-kernel-threshold", default=50, type=int, + help="Threshold for short kernel study (microseconds)") +@click.option("--topk-ops", default=100, type=int, + help="Number of top operations to include") @click.option("-o", "--output", type=click.Path(), help="Output directory") @click.pass_context -def analyze_single(ctx, trace_dir, individual_only, collective_only, output): +def analyze_single(ctx, trace_dir, individual_only, collective_only, geo_mean, + short_kernel_threshold, topk_ops, output): """Analyze a single configuration trace directory. TRACE_DIR: Path to the trace directory containing rank subdirectories. 
@@ -80,50 +86,117 @@ def analyze_single(ctx, trace_dir, individual_only, collective_only, output): aorta-report analyze single /path/to/traces --individual-only aorta-report analyze single /path/to/traces -o ./results """ - click.echo(f"[analyze single] trace_dir={trace_dir}") - click.echo(f" individual_only={individual_only}, collective_only={collective_only}") - click.echo(f" output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + from .analysis import analyze_single_config + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + + run_individual = not collective_only + run_collective = not individual_only + + try: + results = analyze_single_config( + input_dir=Path(trace_dir), + output_dir=Path(output) if output else None, + run_individual=run_individual, + run_collective=run_collective, + aggregate_timeline=run_individual, + use_geo_mean=geo_mean, + short_kernel_threshold_us=short_kernel_threshold, + topk_ops=topk_ops, + verbose=verbose, + ) + if not quiet: + click.echo(f"\nAnalysis complete: {results['output_dir']}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) @analyze.command("sweep") @click.argument("sweep_dir", type=click.Path(exists=True)) -@click.option("--rocprof", is_flag=True, help="Use rocprof traces instead of PyTorch profiler") +@click.option("--geo-mean", is_flag=True, help="Use geometric mean instead of arithmetic mean") @click.option("-o", "--output", type=click.Path(), help="Output directory") @click.pass_context -def analyze_sweep(ctx, sweep_dir, rocprof, output): +def analyze_sweep(ctx, sweep_dir, geo_mean, output): """Analyze a sweep directory with multiple configurations. - SWEEP_DIR: Path to the sweep directory containing multiple thread/channel configs. + SWEEP_DIR: Path to the sweep directory containing tracelens_analysis/ + with multiple thread/channel configs. 
\b Examples: aorta-report analyze sweep /path/to/sweep_20251124 - aorta-report analyze sweep /path/to/sweep --rocprof + aorta-report analyze sweep /path/to/sweep --geo-mean """ - click.echo(f"[analyze sweep] sweep_dir={sweep_dir}") - click.echo(f" rocprof={rocprof}, output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + from .analysis import analyze_sweep_config + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + + try: + output_path = analyze_sweep_config( + sweep_dir=Path(sweep_dir), + output_dir=Path(output) if output else None, + use_geo_mean=geo_mean, + verbose=verbose, + ) + if not quiet and output_path: + click.echo(f"\nAnalysis complete: {output_path}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) @analyze.command("gemm") @click.argument("reports_dir", type=click.Path(exists=True)) -@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract") -@click.option("-o", "--output", type=click.Path(), help="Output CSV file") +@click.option("--threads", "-t", multiple=True, type=int, default=(256, 512), + help="Thread configurations to analyze (can be specified multiple times)") +@click.option("--channels", "-c", multiple=True, type=int, default=(28, 42, 56, 70), + help="Channel configurations to analyze (can be specified multiple times)") +@click.option("--ranks", "-r", multiple=True, type=int, + help="Ranks to analyze (default: 0-7)") +@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract per file") +@click.option("-o", "--output", type=click.Path(), + default="top5_gemm_kernels_time_variance.csv", help="Output CSV file") @click.pass_context -def analyze_gemm(ctx, reports_dir, top_k, output): +def analyze_gemm(ctx, reports_dir, threads, channels, ranks, top_k, output): """Analyze GEMM kernels from TraceLens reports. - REPORTS_DIR: Path to directory containing TraceLens Excel reports. 
+ REPORTS_DIR: Path to tracelens_analysis directory containing + {threads}thread/individual_reports/ subdirectories. \b Examples: - aorta-report analyze gemm /path/to/reports + aorta-report analyze gemm /path/to/tracelens_analysis aorta-report analyze gemm /path/to/reports --top-k 10 -o gemm_analysis.csv + aorta-report analyze gemm /path/to/reports -t 256 -t 512 -c 28 -c 42 """ - click.echo(f"[analyze gemm] reports_dir={reports_dir}") - click.echo(f" top_k={top_k}, output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + from .analysis import analyze_gemm_reports + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + + # Convert tuples to lists, use defaults if not specified + threads_list = list(threads) if threads else [256, 512] + channels_list = list(channels) if channels else [28, 42, 56, 70] + ranks_list = list(ranks) if ranks else list(range(8)) + + try: + output_path = analyze_gemm_reports( + base_path=Path(reports_dir), + threads=threads_list, + channels=channels_list, + ranks=ranks_list, + top_k=top_k, + output_file=output, + verbose=verbose, + ) + if not quiet and output_path: + click.echo(f"\nAnalysis complete: {output_path}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) # ============================================================================= @@ -390,9 +463,55 @@ def process_gpu_timeline(ctx, input_dir, mode, geo_mean, output): aorta-report process gpu-timeline /path/to/individual_reports --mode single aorta-report process gpu-timeline /path/to/sweep --mode sweep --geo-mean """ - click.echo(f"[process gpu-timeline] input_dir={input_dir}") - click.echo(f" mode={mode}, geo_mean={geo_mean}, output={output}") - click.echo(" [NOT IMPLEMENTED]") + from pathlib import Path + + verbose = ctx.obj.get("verbose", False) + quiet = ctx.obj.get("quiet", False) + input_path = Path(input_dir) + + # Auto-detect mode + if mode == "auto": + # Check for sweep structure 
(tracelens_analysis with thread directories) + tracelens_dir = input_path / "tracelens_analysis" + if tracelens_dir.exists(): + thread_dirs = [d for d in tracelens_dir.iterdir() if d.is_dir() and "thread" in d.name] + if thread_dirs: + mode = "sweep" + else: + mode = "single" + elif input_path.name == "individual_reports" or list(input_path.glob("perf_rank*.xlsx")): + mode = "single" + elif list(input_path.glob("perf_*ch_rank*.xlsx")): + mode = "sweep" + else: + raise click.ClickException( + f"Could not auto-detect mode. Please specify --mode single or --mode sweep" + ) + + if verbose: + click.echo(f"Auto-detected mode: {mode}") + + try: + if mode == "single": + from .analysis.analyze_single import process_gpu_timeline as process_single + output_path = process_single( + reports_dir=input_path, + use_geo_mean=geo_mean, + verbose=verbose, + ) + else: # sweep + from .analysis import analyze_sweep_config + output_path = analyze_sweep_config( + sweep_dir=input_path, + output_dir=Path(output) if output else None, + use_geo_mean=geo_mean, + verbose=verbose, + ) + + if not quiet and output_path: + click.echo(f"\nProcessing complete: {output_path}") + except (ValueError, FileNotFoundError) as e: + raise click.ClickException(str(e)) @process.command("comms") From c1b620c651496e3eb39528bc54729c8d4e25ce1c Mon Sep 17 00:00:00 2001 From: prosenjitdhole Date: Wed, 28 Jan 2026 13:22:20 +0530 Subject: [PATCH 2/2] [WIP] AORTA-17 CLI command for report generation : Added processing scripts (#66) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * AORTA-17 CLI command for report generation : Added processing scripts * AORTA-17 CLI command for report generation : Adding combining scripts… (#67) * AORTA-17 CLI command for report generation : Adding combining scripts under compare subcommand * AORTA-17 CLI command for report generation : Excel generation (#68) * AORTA-17 CLI command for report generation : Generate the excel for single config runs 
* AORTA-17 CLI command for report generation : Generate all the plots (#69) * AORTA-17 CLI command for report generation : Generate all the plots * AORTA-17 CLI command for report generation : Implementation of pipeline (#70) * AORTA-17 CLI command for report generation : Implementation of pipeline * AORTA-17 CLI command for report generation : refactored cli (#71) Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --------- Co-authored-by: Dhole --- src/aorta/report/COMPARE_CMD_DEV_DOCS.md | 1058 +++++++++++++ src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md | 618 ++++++++ src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md | 1383 +++++++++++++++++ src/aorta/report/PIPELINE_DEV_DOCS.md | 1079 +++++++++++++ src/aorta/report/PROCESS_CMD_DEV_DOCS.md | 1009 ++++++++++++ src/aorta/report/USER_GUIDE.md | 826 ++++++++++ src/aorta/report/analysis/cli.py | 158 ++ src/aorta/report/aorta-report-detail-plan.md | 919 +++++------ .../report/aorta-report-functional-spec.md | 94 +- src/aorta/report/cli.py | 605 +------ src/aorta/report/comparison/__init__.py | 13 + src/aorta/report/comparison/cli.py | 254 +++ .../comparison/collective_comparison.py | 238 +++ src/aorta/report/comparison/combine.py | 135 ++ src/aorta/report/comparison/formatting.py | 144 ++ .../comparison/gpu_timeline_comparison.py | 222 +++ src/aorta/report/generators/__init__.py | 8 +- src/aorta/report/generators/cli.py | 291 ++++ src/aorta/report/generators/excel_report.py | 505 ++++++ src/aorta/report/generators/plot_generator.py | 185 +++ .../report/generators/plot_helper/__init__.py | 50 + .../report/generators/plot_helper/common.py | 69 + .../generators/plot_helper/gemm_boxplots.py | 108 ++ .../generators/plot_helper/gemm_data.py | 111 ++ .../plot_helper/gemm_interaction.py | 69 + .../generators/plot_helper/gemm_violin.py | 96 ++ .../generators/plot_helper/gpu_by_rank.py | 74 + .../generators/plot_helper/gpu_heatmap.py | 49 
+ .../plot_helper/gpu_percent_change.py | 65 + .../generators/plot_helper/nccl_charts.py | 140 ++ .../plot_helper/summary_dashboard.py | 98 ++ src/aorta/report/pipelines/__init__.py | 14 + src/aorta/report/pipelines/cli.py | 245 +++ src/aorta/report/pipelines/gemm_pipeline.py | 194 +++ .../report/pipelines/summary_pipeline.py | 412 +++++ src/aorta/report/processing/__init__.py | 13 + src/aorta/report/processing/cli.py | 188 +++ .../report/processing/gpu_timeline_single.py | 143 ++ .../report/processing/gpu_timeline_sweep.py | 435 ++++++ src/aorta/report/processing/process_comms.py | 291 ++++ .../processing/process_gemm_variance.py | 321 ++++ 41 files changed, 11815 insertions(+), 1114 deletions(-) create mode 100644 src/aorta/report/COMPARE_CMD_DEV_DOCS.md create mode 100644 src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md create mode 100644 src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md create mode 100644 src/aorta/report/PIPELINE_DEV_DOCS.md create mode 100644 src/aorta/report/PROCESS_CMD_DEV_DOCS.md create mode 100644 src/aorta/report/USER_GUIDE.md create mode 100644 src/aorta/report/analysis/cli.py create mode 100644 src/aorta/report/comparison/__init__.py create mode 100644 src/aorta/report/comparison/cli.py create mode 100644 src/aorta/report/comparison/collective_comparison.py create mode 100644 src/aorta/report/comparison/combine.py create mode 100644 src/aorta/report/comparison/formatting.py create mode 100644 src/aorta/report/comparison/gpu_timeline_comparison.py create mode 100644 src/aorta/report/generators/cli.py create mode 100644 src/aorta/report/generators/excel_report.py create mode 100644 src/aorta/report/generators/plot_generator.py create mode 100644 src/aorta/report/generators/plot_helper/__init__.py create mode 100644 src/aorta/report/generators/plot_helper/common.py create mode 100644 src/aorta/report/generators/plot_helper/gemm_boxplots.py create mode 100644 src/aorta/report/generators/plot_helper/gemm_data.py create mode 100644 
src/aorta/report/generators/plot_helper/gemm_interaction.py create mode 100644 src/aorta/report/generators/plot_helper/gemm_violin.py create mode 100644 src/aorta/report/generators/plot_helper/gpu_by_rank.py create mode 100644 src/aorta/report/generators/plot_helper/gpu_heatmap.py create mode 100644 src/aorta/report/generators/plot_helper/gpu_percent_change.py create mode 100644 src/aorta/report/generators/plot_helper/nccl_charts.py create mode 100644 src/aorta/report/generators/plot_helper/summary_dashboard.py create mode 100644 src/aorta/report/pipelines/__init__.py create mode 100644 src/aorta/report/pipelines/cli.py create mode 100644 src/aorta/report/pipelines/gemm_pipeline.py create mode 100644 src/aorta/report/pipelines/summary_pipeline.py create mode 100644 src/aorta/report/processing/__init__.py create mode 100644 src/aorta/report/processing/cli.py create mode 100644 src/aorta/report/processing/gpu_timeline_single.py create mode 100644 src/aorta/report/processing/gpu_timeline_sweep.py create mode 100644 src/aorta/report/processing/process_comms.py create mode 100644 src/aorta/report/processing/process_gemm_variance.py diff --git a/src/aorta/report/COMPARE_CMD_DEV_DOCS.md b/src/aorta/report/COMPARE_CMD_DEV_DOCS.md new file mode 100644 index 0000000..ecd0457 --- /dev/null +++ b/src/aorta/report/COMPARE_CMD_DEV_DOCS.md @@ -0,0 +1,1058 @@ +# `compare` Command Group - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Command Specification](#2-command-specification) +3. [Source Script Analysis](#3-source-script-analysis) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Data Flow](#6-data-flow) +7. [Implementation Order](#7-implementation-order) +8. [Expected Output](#8-expected-output) +9. [Testing Strategy](#9-testing-strategy) + +--- + +## 1. 
Overview + +The `compare` command provides functionality to compare baseline and test TraceLens reports. It supports two comparison types: + +| Type | Purpose | Source Scripts | +|------|---------|----------------| +| `gpu_timeline` | Compare GPU timeline reports | `combine_reports.py` + `add_comparison_sheets.py` | +| `collective` | Compare collective/NCCL reports | `combine_reports.py` + `add_collective_comparison.py` | + +### Key Design Decisions + +1. **Single command with positional type argument** - cleaner than separate commands +2. **Exact Excel file paths** - user specifies exact files, no auto-discovery +3. **2-way comparison only** - baseline vs test (N-way comparison deferred) +4. **Shared combine logic** - reuse same function for both types +5. **Match original behavior** - output same sheets and formatting as original scripts + +--- + +## 2. Command Specification + +### 2.1 `aorta-report compare gpu_timeline` + +Compare two GPU timeline reports. + +```bash +aorta-report compare gpu_timeline \ + --baseline /path/to/baseline/gpu_timeline_summary_mean.xlsx \ + --test /path/to/test/gpu_timeline_summary_mean.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + --output /path/to/gpu_comparison.xlsx +``` + +| Argument/Option | Required | Default | Description | +|-----------------|----------|---------|-------------| +| `--baseline`, `-b` | Yes | - | Path to baseline gpu_timeline_summary_mean.xlsx | +| `--test`, `-t` | Yes | - | Path to test gpu_timeline_summary_mean.xlsx | +| `--baseline-label` | No | grandparent dir name | Label for baseline in output | +| `--test-label` | No | grandparent dir name | Label for test in output | +| `--output`, `-o` | Yes | - | Output Excel file path | + +**Label Extraction Logic:** +- If `--baseline-label` not provided: extract grandparent directory name +- Example: `/path/to/56cu_256threads/tracelens_analysis/gpu_timeline.xlsx` → `56cu_256threads` +- Fallback: `"baseline"` if extraction fails + +**Output 
Sheets:** +| Sheet | Description | Source | +|-------|-------------|--------| +| Summary | Combined summaries with `source` column | Combined | +| All_Ranks_Combined | Combined raw data with `source` column | Combined | +| Per_Rank_Time_ms | Combined pivot (time) | Combined | +| Per_Rank_Percent | Combined pivot (percent) | Combined | +| **Comparison_By_Rank** | Per-rank comparison with metrics | NEW | +| **Summary_Comparison** | Overall comparison with metrics | NEW | + +--- + +### 2.2 `aorta-report compare collective` + +Compare two collective/NCCL reports. + +```bash +aorta-report compare collective \ + --baseline /path/to/baseline/collective_all_ranks.xlsx \ + --test /path/to/test/collective_all_ranks.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + --output /path/to/collective_comparison.xlsx +``` + +| Argument/Option | Required | Default | Description | +|-----------------|----------|---------|-------------| +| `--baseline`, `-b` | Yes | - | Path to baseline collective_all_ranks.xlsx | +| `--test`, `-t` | Yes | - | Path to test collective_all_ranks.xlsx | +| `--baseline-label` | No | grandparent dir name | Label for baseline in output | +| `--test-label` | No | grandparent dir name | Label for test in output | +| `--output`, `-o` | Yes | - | Output Excel file path | + +**Sheet Filtering (matches original):** +- Only sheets with `"summary"` in the name are kept +- Non-summary sheets are skipped + +**Output Sheets:** +| Sheet | Description | Source | +|-------|-------------|--------| +| nccl_summary_implicit_sync | Combined summary (implicit sync) | Combined | +| nccl_summary_long | Combined summary (long) | Combined | +| **nccl_implicit_sync_cmp** | Comparison for implicit sync | NEW | +| **nccl_long_cmp** | Comparison for long | NEW | + +--- + +## 3. 
Source Script Analysis + +### 3.1 `combine_reports.py` (72 lines) + +**Location:** `scripts/tracelens_single_config/combine_reports.py` + +**Purpose:** Combine two Excel files by adding a `source` column. + +**Key Logic:** +```python +def combine_collective_reports(baseline_path, test_path, output_path, baseline_label, test_label): + baseline_xl = pd.ExcelFile(baseline_path) + test_xl = pd.ExcelFile(test_path) + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + for sheet_name in baseline_xl.sheet_names: + if sheet_name not in test_xl.sheet_names: + continue # Skip sheets not in both files + + baseline_df = pd.read_excel(baseline_path, sheet_name=sheet_name) + test_df = pd.read_excel(test_path, sheet_name=sheet_name) + + baseline_df["source"] = baseline_label + test_df["source"] = test_label + + combined = pd.concat([baseline_df, test_df], ignore_index=True) + combined.to_excel(writer, sheet_name=sheet_name, index=False) +``` + +--- + +### 3.2 `add_comparison_sheets.py` (222 lines) + +**Location:** `scripts/tracelens_single_config/add_comparison_sheets.py` + +**Purpose:** Add GPU timeline comparison sheets to combined Excel file. + +**Key Logic:** + +```python +def add_comparison_sheets(input_path, output_path, baseline_label, test_label): + xl = pd.ExcelFile(input_path) + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + # 1. Copy all original sheets + for sheet_name in xl.sheet_names: + df = pd.read_excel(input_path, sheet_name=sheet_name) + df.to_excel(writer, sheet_name=sheet_name, index=False) + + # 2. Read combined data + all_combined = pd.read_excel(input_path, sheet_name="All_Ranks_Combined") + + # Get actual source values from dataframe + sources = all_combined['source'].unique() + actual_baseline = sources[0] + actual_test = sources[1] + + # 3. 
Create Comparison_By_Rank + baseline_data = all_combined[all_combined["source"] == actual_baseline] + test_data = all_combined[all_combined["source"] == actual_test] + + comparison_by_rank = pd.DataFrame() + for rank in sorted(baseline_data["rank"].unique()): + base_rank = baseline_data[baseline_data["rank"] == rank].set_index("type") + test_rank = test_data[test_data["rank"] == rank].set_index("type") + + for metric_type in base_rank.index: + if metric_type in test_rank.index: + base_time = base_rank.loc[metric_type, "time ms"] + test_time = test_rank.loc[metric_type, "time ms"] + + # percent_change: positive when test is faster (takes less time) + pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0 + + # Determine status + if pct_change > 1: + status = "Better" + elif pct_change < -1: + status = "Worse" + else: + status = "Similar" + + # Build row with all metrics + row = { + "rank": rank, + "type": metric_type, + f"{baseline_label}_time_ms": base_time, + f"{test_label}_time_ms": test_time, + "diff_time_ms": test_time - base_time, + "percent_change": pct_change, + "status": status, + "ratio": test_time / base_time if base_time != 0 else 0, + f"{baseline_label}_percent": base_rank.loc[metric_type, "percent"], + f"{test_label}_percent": test_rank.loc[metric_type, "percent"], + "diff_percent": test_rank.loc[metric_type, "percent"] - base_rank.loc[metric_type, "percent"], + } + comparison_by_rank = pd.concat([comparison_by_rank, pd.DataFrame([row])], ignore_index=True) + + comparison_by_rank.to_excel(writer, sheet_name="Comparison_By_Rank", index=False) + + # 4. Create Summary_Comparison (similar logic with Summary sheet) + # ... + + # 5. 
Apply conditional formatting + ws = writer.sheets["Comparison_By_Rank"] + # Find percent_change column and apply color scale + ws.conditional_formatting.add( + data_range, + ColorScaleRule( + start_type="min", start_color="F8696B", # Red + mid_type="num", mid_value=0, mid_color="FFFFFF", # White + end_type="max", end_color="63BE7B", # Green + ) + ) +``` + +**Comparison Columns Created:** +| Column | Formula | Description | +|--------|---------|-------------| +| `{baseline}_time_ms` | baseline value | Time from baseline | +| `{test}_time_ms` | test value | Time from test | +| `diff_time_ms` | test - baseline | Absolute difference | +| `percent_change` | (baseline - test) / baseline × 100 | Positive = faster | +| `status` | Based on percent_change | "Better", "Worse", or "Similar" | +| `ratio` | test / baseline | Ratio comparison | +| `{baseline}_percent` | baseline value | Percent from baseline | +| `{test}_percent` | test value | Percent from test | +| `diff_percent` | test - baseline | Difference in percent | + +--- + +### 3.3 `add_collective_comparison.py` (209 lines) + +**Location:** `scripts/tracelens_single_config/add_collective_comparison.py` + +**Purpose:** Add NCCL collective comparison sheets. + +**Key Differences from GPU Timeline:** + +1. **Sheet Filtering:** Only keeps sheets with "summary" in the name +2. **Grouping:** Groups by `['Collective name', 'dtype', 'In msg nelems']` +3. **Multiple Metrics:** Compares multiple NCCL-specific metrics +4. **Semantic Difference:** Latency vs Bandwidth have opposite "better" directions + +**Key Logic:** + +```python +def add_collective_comparison_sheets(input_path, output_path, baseline_label, test_label): + xl = pd.ExcelFile(input_path) + + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + # 1. 
Copy only summary sheets + for sheet_name in xl.sheet_names: + if "summary" not in sheet_name.lower(): + continue # Skip non-summary sheets + df = pd.read_excel(input_path, sheet_name=sheet_name) + df.to_excel(writer, sheet_name=sheet_name, index=False) + + # 2. Process each summary sheet + for sheet_name in ["nccl_summary_implicit_sync", "nccl_summary_long"]: + if sheet_name not in xl.sheet_names: + continue + + df = pd.read_excel(input_path, sheet_name=sheet_name) + + # Get actual source values + sources = df['source'].unique() + actual_baseline = sources[0] + actual_test = sources[1] + + baseline_df = df[df["source"] == actual_baseline] + test_df = df[df["source"] == actual_test] + + # Group columns + group_cols = ["Collective name", "dtype", "In msg nelems"] + + # Metrics to compare + numeric_cols = [ + "comm_latency_mean", + "algo bw (GB/s)_mean", + "bus bw (GB/s)_mean", + "Total comm latency (ms)", + "count", + ] + + comparison = pd.DataFrame() + + for name, base_group in baseline_df.groupby(group_cols): + # Find matching test group + # ... matching logic ... 
+ + comp_row = {} + + # Copy grouping columns + for col, val in zip(group_cols, name): + comp_row[col] = val + + # Compare each metric + for col in numeric_cols: + base_val = base_group[col].values[0] + test_val = test_group[col].values[0] + + comp_row[f"{actual_baseline}_{col}"] = base_val + comp_row[f"{actual_test}_{col}"] = test_val + comp_row[f"diff_{col}"] = test_val - base_val + + # percent_change semantics differ by metric type + if "latency" in col.lower() or "time" in col.lower(): + # Lower is better - positive when test is faster + pct_change = (base_val - test_val) / base_val * 100 + elif "bw" in col.lower() or "bandwidth" in col.lower(): + # Higher is better - positive when test is better + pct_change = (test_val - base_val) / base_val * 100 + else: + pct_change = 0 + + comp_row[f"percent_change_{col}"] = pct_change + comp_row[f"ratio_{col}"] = test_val / base_val if base_val != 0 else 0 + + comparison = pd.concat([comparison, pd.DataFrame([comp_row])], ignore_index=True) + + # Sheet name: nccl_summary_implicit_sync → nccl_implicit_sync_cmp + comparison_sheet_name = sheet_name.replace("nccl_summary_", "nccl_") + "_cmp" + comparison.to_excel(writer, sheet_name=comparison_sheet_name, index=False) + + # Apply formatting to all percent_change columns + # ... +``` + +**Metrics Compared:** +| Metric | Better Direction | percent_change Formula | +|--------|------------------|------------------------| +| `comm_latency_mean` | Lower | (base - test) / base × 100 | +| `algo bw (GB/s)_mean` | Higher | (test - base) / base × 100 | +| `bus bw (GB/s)_mean` | Higher | (test - base) / base × 100 | +| `Total comm latency (ms)` | Lower | (base - test) / base × 100 | +| `count` | N/A | No percent_change | + +--- + +## 4. 
Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +├── comparison/ # NEW: comparison module +│ ├── __init__.py # Package exports +│ ├── combine.py # Shared: combine two Excel files +│ ├── gpu_timeline_comparison.py # GPU timeline comparison logic +│ ├── collective_comparison.py # Collective/NCCL comparison logic +│ └── formatting.py # Shared Excel formatting utilities +├── cli.py # Update compare commands +└── ... (existing modules) +``` + +### 4.2 Module Responsibilities + +| Module | Responsibility | +|--------|----------------| +| `combine.py` | Combine two Excel files with source column | +| `gpu_timeline_comparison.py` | Add Comparison_By_Rank and Summary_Comparison sheets | +| `collective_comparison.py` | Add nccl_*_cmp sheets for each summary sheet | +| `formatting.py` | Color scale formatting, column letter conversion | + +### 4.3 Dependency Graph + +``` +cli.py + │ + ├── compare gpu_timeline ──► combine.py + │ │ + │ └──► gpu_timeline_comparison.py + │ │ + │ └──► formatting.py + │ + └── compare collective ───► combine.py + │ + └──► collective_comparison.py + │ + └──► formatting.py +``` + +--- + +## 5. 
Module Details + +### 5.1 `comparison/__init__.py` + +```python +"""Comparison modules for baseline vs test TraceLens reports.""" + +from .combine import combine_excel_files +from .gpu_timeline_comparison import add_gpu_timeline_comparison +from .collective_comparison import add_collective_comparison +from .formatting import save_with_formatting + +__all__ = [ + "combine_excel_files", + "add_gpu_timeline_comparison", + "add_collective_comparison", + "save_with_formatting", +] +``` + +--- + +### 5.2 `comparison/combine.py` + +```python +"""Shared functionality to combine two Excel files.""" + +from pathlib import Path +from typing import Dict, List, Optional + +import pandas as pd + + +def combine_excel_files( + baseline_path: Path, + test_path: Path, + baseline_label: str, + test_label: str, + sheets_to_combine: Optional[List[str]] = None, + filter_summary_only: bool = False, + verbose: bool = False, +) -> Dict[str, pd.DataFrame]: + """ + Combine two Excel files by adding a 'source' column. 
+ + Args: + baseline_path: Path to baseline Excel file + test_path: Path to test Excel file + baseline_label: Label for baseline rows in 'source' column + test_label: Label for test rows in 'source' column + sheets_to_combine: Specific sheets to combine (None = all common sheets) + filter_summary_only: If True, only keep sheets with 'summary' in name + verbose: Print progress messages + + Returns: + Dict mapping sheet_name to combined DataFrame + + Raises: + FileNotFoundError: If input files don't exist + ValueError: If no common sheets found + """ +``` + +**Implementation Notes:** +- Read both Excel files using `pd.ExcelFile` +- Find common sheets (intersection of sheet names) +- If `filter_summary_only`, filter to sheets containing "summary" +- For each sheet: add `source` column, concat, store in dict +- Return dict (don't save yet - let caller handle saving) + +--- + +### 5.3 `comparison/gpu_timeline_comparison.py` + +```python +"""GPU timeline comparison logic.""" + +from typing import Dict + +import pandas as pd + + +def add_gpu_timeline_comparison( + combined_data: Dict[str, pd.DataFrame], + baseline_label: str, + test_label: str, + verbose: bool = False, +) -> Dict[str, pd.DataFrame]: + """ + Add comparison sheets for GPU timeline data. 
+ + Args: + combined_data: Dict from combine_excel_files() + baseline_label: Label for baseline (for column naming) + test_label: Label for test (for column naming) + verbose: Print progress messages + + Returns: + Dict with original sheets + new comparison sheets: + - 'Comparison_By_Rank': Per-rank comparison + - 'Summary_Comparison': Overall comparison + + Expects combined_data to have: + - 'All_Ranks_Combined' sheet with: source, rank, type, time ms, percent + - 'Summary' sheet with: source, type, time ms, percent + + Comparison columns created: + - {baseline_label}_time_ms, {test_label}_time_ms + - diff_time_ms, percent_change, status, ratio + - {baseline_label}_percent, {test_label}_percent, diff_percent + + percent_change formula: (baseline - test) / baseline × 100 + - Positive = test is faster (better) + - Negative = test is slower (worse) + + status thresholds: + - "Better" if percent_change > 1 + - "Worse" if percent_change < -1 + - "Similar" otherwise + """ +``` + +**Implementation Notes:** +- Get actual source values from DataFrame (first = baseline, second = test) +- Create Comparison_By_Rank by iterating over ranks and types +- Create Summary_Comparison from Summary sheet +- Add to result dict and return + +--- + +### 5.4 `comparison/collective_comparison.py` + +```python +"""Collective/NCCL comparison logic.""" + +from typing import Dict + +import pandas as pd + + +# Metrics to compare +NCCL_NUMERIC_COLS = [ + "comm_latency_mean", + "algo bw (GB/s)_mean", + "bus bw (GB/s)_mean", + "Total comm latency (ms)", + "count", +] + +# Grouping columns for NCCL data +NCCL_GROUP_COLS = ["Collective name", "dtype", "In msg nelems"] + + +def add_collective_comparison( + combined_data: Dict[str, pd.DataFrame], + baseline_label: str, + test_label: str, + verbose: bool = False, +) -> Dict[str, pd.DataFrame]: + """ + Add comparison sheets for collective/NCCL data. 
+ + Args: + combined_data: Dict from combine_excel_files() + baseline_label: Label for baseline + test_label: Label for test + verbose: Print progress messages + + Returns: + Dict with summary sheets + new comparison sheets: + - 'nccl_implicit_sync_cmp': Comparison for nccl_summary_implicit_sync + - 'nccl_long_cmp': Comparison for nccl_summary_long + + Processes sheets: + - 'nccl_summary_implicit_sync' → 'nccl_implicit_sync_cmp' + - 'nccl_summary_long' → 'nccl_long_cmp' + + Groups by: ['Collective name', 'dtype', 'In msg nelems'] + + For each metric, creates columns: + - {baseline}_{metric}, {test}_{metric} + - diff_{metric}, percent_change_{metric}, ratio_{metric} + + percent_change semantics (positive = better): + - Latency/time: (baseline - test) / baseline × 100 + - Bandwidth: (test - baseline) / baseline × 100 + """ +``` + +**Implementation Notes:** +- Only process sheets in `["nccl_summary_implicit_sync", "nccl_summary_long"]` +- Use flexible grouping (fall back to just "Collective name" if other cols missing) +- Apply correct percent_change formula based on metric type +- Sheet name transformation: `nccl_summary_X` → `nccl_X_cmp` + +--- + +### 5.5 `comparison/formatting.py` + +```python +"""Shared Excel formatting utilities.""" + +from pathlib import Path +from typing import Dict, List + +import pandas as pd +from openpyxl.formatting.rule import ColorScaleRule + + +# Color constants +RED = "F8696B" +WHITE = "FFFFFF" +GREEN = "63BE7B" + + +def get_column_letter(col_idx: int) -> str: + """ + Convert 1-based column index to Excel column letter. + + Examples: + 1 → 'A', 26 → 'Z', 27 → 'AA', 28 → 'AB' + """ + + +def create_color_scale_rule() -> ColorScaleRule: + """ + Create standard red-white-green color scale rule. 
+ + Red (min/negative) → White (0) → Green (max/positive) + """ + return ColorScaleRule( + start_type="min", + start_color=RED, + mid_type="num", + mid_value=0, + mid_color=WHITE, + end_type="max", + end_color=GREEN, + ) + + +def apply_color_scale_to_column( + worksheet, + col_idx: int, + num_rows: int, +) -> None: + """ + Apply color scale formatting to a specific column. + + Args: + worksheet: openpyxl worksheet + col_idx: 1-based column index + num_rows: Number of data rows (excluding header) + """ + + +def save_with_formatting( + data: Dict[str, pd.DataFrame], + output_path: Path, + format_columns: Dict[str, List[str]], + verbose: bool = False, +) -> Path: + """ + Save DataFrames to Excel with conditional formatting. + + Args: + data: Dict[sheet_name, DataFrame] + output_path: Output file path + format_columns: Dict[sheet_name, list of column names to format] + verbose: Print progress + + Returns: + Path to saved file + + Example: + format_columns = { + "Comparison_By_Rank": ["percent_change"], + "Summary_Comparison": ["percent_change"], + "nccl_implicit_sync_cmp": [ + "percent_change_comm_latency_mean", + "percent_change_algo bw (GB/s)_mean", + ], + } + """ +``` + +--- + +## 6. 
Data Flow + +### 6.1 `compare gpu_timeline` Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ compare gpu_timeline │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ INPUT: │ +│ ├── baseline.xlsx (gpu_timeline_summary_mean.xlsx) │ +│ │ ├── Summary │ +│ │ ├── All_Ranks_Combined │ +│ │ ├── Per_Rank_Time_ms │ +│ │ └── Per_Rank_Percent │ +│ │ │ +│ └── test.xlsx (gpu_timeline_summary_mean.xlsx) │ +│ └── (same sheets) │ +│ │ +│ STEP 1: combine_excel_files() │ +│ ──────────────────────────────── │ +│ For each sheet, add 'source' column and concat: │ +│ baseline rows → source = baseline_label │ +│ test rows → source = test_label │ +│ │ +│ STEP 2: add_gpu_timeline_comparison() │ +│ ──────────────────────────────────────── │ +│ Create new sheets: │ +│ Comparison_By_Rank: Per-rank comparison │ +│ Summary_Comparison: Overall comparison │ +│ │ +│ STEP 3: save_with_formatting() │ +│ ───────────────────────────────── │ +│ Save all sheets to Excel with color formatting on percent_change │ +│ │ +│ OUTPUT: │ +│ └── output.xlsx │ +│ ├── Summary (combined) │ +│ ├── All_Ranks_Combined (combined) │ +│ ├── Per_Rank_Time_ms (combined) │ +│ ├── Per_Rank_Percent (combined) │ +│ ├── Comparison_By_Rank (NEW - with formatting) │ +│ └── Summary_Comparison (NEW - with formatting) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 6.2 `compare collective` Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ compare collective │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ INPUT: │ +│ ├── baseline.xlsx (collective_all_ranks.xlsx) │ +│ │ ├── nccl_summary_implicit_sync │ +│ │ ├── nccl_summary_long │ +│ │ └── (other non-summary sheets - SKIPPED) │ +│ │ │ +│ └── test.xlsx (collective_all_ranks.xlsx) │ +│ └── (same sheets) │ +│ │ +│ STEP 1: 
combine_excel_files(filter_summary_only=True) │ +│ ──────────────────────────────────────────────────────── │ +│ Only combine sheets with "summary" in name │ +│ Add 'source' column and concat │ +│ │ +│ STEP 2: add_collective_comparison() │ +│ ────────────────────────────────────── │ +│ For each summary sheet, create comparison sheet: │ +│ nccl_summary_implicit_sync → nccl_implicit_sync_cmp │ +│ nccl_summary_long → nccl_long_cmp │ +│ │ +│ STEP 3: save_with_formatting() │ +│ ───────────────────────────────── │ +│ Save with color formatting on all percent_change_* columns │ +│ │ +│ OUTPUT: │ +│ └── output.xlsx │ +│ ├── nccl_summary_implicit_sync (combined) │ +│ ├── nccl_summary_long (combined) │ +│ ├── nccl_implicit_sync_cmp (NEW - with formatting) │ +│ └── nccl_long_cmp (NEW - with formatting) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Implementation Order + +| Phase | Task | Est. Time | Dependencies | +|-------|------|-----------|--------------| +| **1** | Create `comparison/` directory and `__init__.py` | 5 min | None | +| **2** | Implement `formatting.py` | 25 min | Phase 1 | +| **3** | Implement `combine.py` | 20 min | Phase 1 | +| **4** | Implement `gpu_timeline_comparison.py` | 40 min | Phase 2, 3 | +| **5** | Implement `collective_comparison.py` | 40 min | Phase 2, 3 | +| **6** | Update `cli.py` with compare commands | 25 min | Phase 4, 5 | +| **7** | Testing | 30 min | Phase 6 | + +**Total estimated time: ~3 hours** + +--- + +## 8. 
Expected Output + +### 8.1 `compare gpu_timeline` Console Output + +``` +============================================================ +GPU Timeline Comparison +============================================================ +Baseline: /path/to/56cu_256threads/tracelens_analysis/gpu_timeline_summary_mean.xlsx +Test: /path/to/37cu_384threads/tracelens_analysis/gpu_timeline_summary_mean.xlsx +Baseline label: 56cu_256threads +Test label: 37cu_384threads + +Step 1: Combining Excel files + Loading baseline (56cu_256threads)... + Loading test (37cu_384threads)... + Combining sheets: + Summary: 10 + 10 = 20 rows + All_Ranks_Combined: 80 + 80 = 160 rows + Per_Rank_Time_ms: 10 + 10 = 20 rows + Per_Rank_Percent: 10 + 10 = 20 rows + +Step 2: Adding comparison sheets + Creating Comparison_By_Rank... + Processing 8 ranks × 10 types = 80 comparisons + Creating Summary_Comparison... + Processing 10 types + +Step 3: Saving with formatting + Applying color scale to Comparison_By_Rank.percent_change + Applying color scale to Summary_Comparison.percent_change + +============================================================ +Comparison Complete! 
+============================================================ +Output: /path/to/gpu_comparison.xlsx + +Sheets: + - Summary (combined data) + - All_Ranks_Combined (combined data) + - Per_Rank_Time_ms (combined data) + - Per_Rank_Percent (combined data) + - Comparison_By_Rank (per-rank comparison) + - Summary_Comparison (overall comparison) + +percent_change interpretation: + Positive = test is faster/better + Negative = test is slower/worse +``` + +### 8.2 `compare collective` Console Output + +``` +============================================================ +Collective/NCCL Comparison +============================================================ +Baseline: /path/to/56cu_256threads/tracelens_analysis/collective_reports/collective_all_ranks.xlsx +Test: /path/to/37cu_384threads/tracelens_analysis/collective_reports/collective_all_ranks.xlsx +Baseline label: 56cu_256threads +Test label: 37cu_384threads + +Step 1: Combining Excel files + Loading baseline (56cu_256threads)... + Loading test (37cu_384threads)... + Filtering to summary sheets only... + Combining sheets: + nccl_summary_implicit_sync: 15 + 15 = 30 rows + nccl_summary_long: 15 + 15 = 30 rows + Skipped sheets (non-summary): + - per_rank_comm_details + - raw_data + +Step 2: Adding comparison sheets + Processing nccl_summary_implicit_sync... + Grouping by: ['Collective name', 'dtype', 'In msg nelems'] + Created nccl_implicit_sync_cmp (15 rows) + Processing nccl_summary_long... + Created nccl_long_cmp (15 rows) + +Step 3: Saving with formatting + Applying color scale to nccl_implicit_sync_cmp: + - percent_change_comm_latency_mean + - percent_change_algo bw (GB/s)_mean + - percent_change_bus bw (GB/s)_mean + - percent_change_Total comm latency (ms) + Applying color scale to nccl_long_cmp: + - (same columns) + +============================================================ +Comparison Complete! 
+============================================================ +Output: /path/to/collective_comparison.xlsx + +Sheets: + - nccl_summary_implicit_sync (combined data) + - nccl_summary_long (combined data) + - nccl_implicit_sync_cmp (comparison) + - nccl_long_cmp (comparison) + +percent_change interpretation: + For latency/time: Positive = faster (better) + For bandwidth: Positive = higher bandwidth (better) +``` + +--- + +## 9. Testing Strategy + +### 9.1 Unit Tests + +```python +# tests/test_comparison/test_combine.py + +def test_combine_excel_files_basic(): + """Test combining two Excel files adds source column.""" + +def test_combine_excel_files_filter_summary(): + """Test filter_summary_only option works.""" + +def test_combine_excel_files_missing_sheet(): + """Test handling when sheet only exists in one file.""" + + +# tests/test_comparison/test_gpu_timeline.py + +def test_add_gpu_timeline_comparison_creates_sheets(): + """Test that Comparison_By_Rank and Summary_Comparison are created.""" + +def test_percent_change_calculation(): + """Test percent_change formula is correct.""" + +def test_status_thresholds(): + """Test Better/Worse/Similar status logic.""" + + +# tests/test_comparison/test_collective.py + +def test_add_collective_comparison_creates_sheets(): + """Test comparison sheets are created for each summary sheet.""" + +def test_latency_percent_change(): + """Test latency metrics use (base-test)/base formula.""" + +def test_bandwidth_percent_change(): + """Test bandwidth metrics use (test-base)/base formula.""" + + +# tests/test_comparison/test_formatting.py + +def test_get_column_letter(): + """Test column index to letter conversion.""" + assert get_column_letter(1) == "A" + assert get_column_letter(26) == "Z" + assert get_column_letter(27) == "AA" + +def test_color_scale_applied(): + """Test that color scale formatting is applied to correct columns.""" +``` + +### 9.2 Integration Tests + +```python +# tests/test_comparison/test_cli_integration.py + 
+
+def test_compare_gpu_timeline_cli():
+    """Test full CLI flow for gpu_timeline comparison."""
+
+def test_compare_collective_cli():
+    """Test full CLI flow for collective comparison."""
+
+def test_label_extraction_from_path():
+    """Test grandparent directory name extraction."""
+```
+
+---
+
+## Appendix A: Label Extraction Logic
+
+```python
+def extract_label_from_path(file_path: Path) -> str:
+    """
+    Extract label from file path using grandparent directory name.
+
+    Examples:
+        /path/to/56cu_256threads/tracelens_analysis/gpu_timeline.xlsx
+        → "56cu_256threads"
+
+        /path/to/run1/tracelens_analysis/collective_reports/collective.xlsx
+        → "run1" (or "tracelens_analysis" depending on depth)
+
+    Returns None if extraction fails; the caller then falls back to a
+    default label ("baseline" or "test").
+    """
+    try:
+        # Go up to grandparent (skip filename and parent directory)
+        grandparent = file_path.parent.parent.name
+        if grandparent and grandparent not in [".", "..", ""]:
+            return grandparent
+    except Exception:
+        pass
+    return None  # Let caller provide default
+```
+
+---
+
+## Appendix B: CLI Help Text
+
+```
+$ aorta-report compare --help
+Usage: aorta-report compare [OPTIONS] COMMAND [ARGS]...
+
+  Compare baseline and test TraceLens reports.
+
+  Supported comparison types:
+    gpu_timeline - Compare GPU timeline reports
+    collective   - Compare collective/NCCL reports
+
+Commands:
+  collective    Compare two collective/NCCL reports.
+  gpu_timeline  Compare two GPU timeline reports.
+
+
+$ aorta-report compare gpu_timeline --help
+Usage: aorta-report compare gpu_timeline [OPTIONS]
+
+  Compare two GPU timeline reports.
+
+  Combines baseline and test files, then adds comparison sheets with diff,
+  percent_change, and status columns.
+ + Output sheets: + - Summary, All_Ranks_Combined, Per_Rank_* (combined data) + - Comparison_By_Rank (per-rank comparison) + - Summary_Comparison (overall comparison) + + Examples: + aorta-report compare gpu_timeline \ + -b baseline/gpu_timeline_summary_mean.xlsx \ + -t test/gpu_timeline_summary_mean.xlsx \ + -o comparison.xlsx + +Options: + -b, --baseline PATH Path to baseline gpu_timeline_summary_mean.xlsx + [required] + -t, --test PATH Path to test gpu_timeline_summary_mean.xlsx + [required] + --baseline-label TEXT Label for baseline (default: grandparent dir name) + --test-label TEXT Label for test (default: grandparent dir name) + -o, --output PATH Output Excel file path [required] + --help Show this message and exit. +``` + +--- + +## Appendix C: Migration from Original Scripts + +| Original | New CLI Equivalent | +|----------|-------------------| +| `python combine_reports.py --baseline b.xlsx --test t.xlsx --output combined.xlsx` | (intermediate step, now internal) | +| `python add_comparison_sheets.py --input combined.xlsx --output comparison.xlsx` | `aorta-report compare gpu_timeline -b b.xlsx -t t.xlsx -o comparison.xlsx` | +| `python add_collective_comparison.py --input combined.xlsx --output comparison.xlsx` | `aorta-report compare collective -b b.xlsx -t t.xlsx -o comparison.xlsx` | + +The new CLI combines both steps (combine + add comparison) into a single command. + diff --git a/src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md b/src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md new file mode 100644 index 0000000..91ca20b --- /dev/null +++ b/src/aorta/report/GENERATE_EXCEL_DEV_DOCS.md @@ -0,0 +1,618 @@ +# `generate excel` Command - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Command Specification](#2-command-specification) +3. [Source Script Analysis](#3-source-script-analysis) +4. 
[Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Data Flow](#6-data-flow) +7. [Implementation Order](#7-implementation-order) +8. [Expected Output](#8-expected-output) + +--- + +## 1. Overview + +The `generate excel` command creates a comprehensive final report by combining GPU timeline and collective comparison data into a single, well-organized Excel file. + +### Key Features + +| Feature | Description | +|---------|-------------| +| **Summary Dashboard** | First visible sheet with key metrics and status | +| **Comparison Sheets** | Visible sheets with comparison data | +| **Hidden Raw Data** | Original data hidden but accessible | +| **Excel Tables** | All data formatted as tables with filters | +| **Color Coding** | Red-white-green scale on percent_change columns | + +### Source Script + +**Location:** `scripts/tracelens_single_config/create_final_report.py` (346 lines) + +--- + +## 2. Command Specification + +### Current CLI (Stub) + +```bash +aorta-report generate excel \ + --gpu-combined gpu_timeline_combined.xlsx \ + --gpu-comparison gpu_timeline_comparison.xlsx \ + --coll-combined collective_combined.xlsx \ + --coll-comparison collective_comparison.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + -o final_analysis_report.xlsx +``` + +### Arguments + +| Option | Required | Description | +|--------|----------|-------------| +| `--gpu-combined` | Yes | GPU timeline combined file (output of `compare gpu_timeline` without comparison sheets) | +| `--gpu-comparison` | Yes | GPU timeline comparison file (output of `compare gpu_timeline`) | +| `--coll-combined` | Yes | Collective combined file (intermediate) | +| `--coll-comparison` | Yes | Collective comparison file (output of `compare collective`) | +| `--baseline-label` | No | Label for baseline (default: "Baseline") | +| `--test-label` | No | Label for test (default: "Test") | +| `-o, --output` | Yes | Output Excel file path | + +### 
Alternative Simplified Interface + +Since `compare gpu_timeline` and `compare collective` now produce combined comparison files directly, we could simplify: + +```bash +aorta-report generate excel \ + --gpu-comparison gpu_comparison.xlsx \ + --coll-comparison collective_comparison.xlsx \ + --baseline-label "ROCm 6.0" \ + --test-label "ROCm 7.0" \ + -o final_report.xlsx +``` + +**Decision:** Keep original 4-file interface for now. Can refactor later. + +--- + +## 3. Source Script Analysis + +### 3.1 Input Files + +The script requires 4 Excel files: + +| File | Contents | Source | +|------|----------|--------| +| `gpu_combined` | GPU summary + raw data with source column | `combine_reports.py` | +| `gpu_comparison` | GPU comparison sheets (Summary_Comparison, Comparison_By_Rank) | `add_comparison_sheets.py` | +| `coll_combined` | NCCL summary data with source column | `combine_reports.py` | +| `coll_comparison` | NCCL comparison sheets (nccl_*_cmp) | `add_collective_comparison.py` | + +### 3.2 Processing Steps + +```python +def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file): + # 1. Create workbook and add sheets + with pd.ExcelWriter(output_file, engine="openpyxl") as writer: + + # 2. Add GPU Timeline sheets (raw → hidden) + for sheet in gpu_combined: + rename_and_add(sheet, "GPU_*_Raw") + mark_as_hidden(sheet) + + # 3. Add GPU Comparison sheets (visible) + for sheet in gpu_comparison: + if "Comparison" in sheet: + rename_and_add(sheet, "GPU_*_Cmp") + + # 4. Add Collective sheets (raw → hidden) + for sheet in coll_combined: + if "summary" in sheet: + rename_and_add(sheet, "NCCL_*_Raw") + mark_as_hidden(sheet) + + # 5. Add Collective Comparison sheets (visible) + for sheet in coll_comparison: + rename_and_add(sheet, "NCCL_*") + + # 6. Create Summary Dashboard + create_dashboard_from_gpu_comparison() + + # 7. Post-processing with openpyxl + wb = load_workbook(output_file) + + # 8. 
Hide raw data sheets + for sheet in raw_sheets: + wb[sheet].sheet_state = "hidden" + + # 9. Convert all sheets to Excel tables + for sheet in wb.sheetnames: + add_excel_table(sheet) + + # 10. Add conditional formatting to comparison sheets + for sheet in comparison_sheets: + apply_color_scale_to_percent_change_columns(sheet) + + # 11. Move Summary_Dashboard to first position + wb.move_sheet("Summary_Dashboard", offset=-(len(wb.sheetnames)-1)) + + wb.save(output_file) +``` + +### 3.3 Sheet Naming Convention + +| Original Sheet | Final Name | Visibility | +|----------------|------------|------------| +| Summary | GPU_Summary_Raw | Hidden | +| All_Ranks_Combined | GPU_AllRanks_Raw | Hidden | +| Per_Rank_Time_ms | GPU_Time_Raw | Hidden | +| Per_Rank_Percent | GPU_Pct_Raw | Hidden | +| Summary_Comparison | GPU_Summary_Cmp | Visible | +| Comparison_By_Rank | GPU_ByRank_Cmp | Visible | +| nccl_summary_implicit_sync | NCCL_ImplSync_Raw | Hidden | +| nccl_summary_long | NCCL_Long_Raw | Hidden | +| nccl_implicit_sync_cmp | NCCL_Implicit_sync_cmp | Visible | +| nccl_long_cmp | NCCL_Long_cmp | Visible | +| (generated) | Summary_Dashboard | Visible (1st) | + +### 3.4 Summary Dashboard Creation + +**Decision:** Include BOTH GPU and NCCL metrics in the Summary Dashboard. 
+
+```python
+dashboard_data = {
+    'Metric': [],
+    baseline_label: [],
+    test_label: [],
+    'Improvement (%)': [],
+    'Status': []
+}
+
+# For each GPU metric type (busy_time, idle_time, etc.):
+for row in gpu_summary_comparison:
+    dashboard_data['Metric'].append(f"GPU_{row['type']}")
+    dashboard_data[baseline_label].append(row[baseline_time_col])
+    dashboard_data[test_label].append(row[test_time_col])
+    dashboard_data['Improvement (%)'].append(row['percent_change'])
+    dashboard_data['Status'].append('Better' if pct > 1 else 'Worse' if pct < -1 else 'Similar')
+
+# Add NCCL metrics from collective comparison
+for sheet in ['nccl_implicit_sync_cmp', 'nccl_long_cmp']:
+    # Add latency and bandwidth metrics
+    for row in coll_comparison[sheet]:
+        # Add total comm latency metric
+        dashboard_data['Metric'].append(f"NCCL_{collective_name}_latency")
+        # ... add values
+```
+
+### 3.5 Excel Table Formatting
+
+```python
+def add_excel_table(worksheet, table_name):
+    # Create table reference: A1:Z100
+    table_ref = f"A1:{get_column_letter(max_col)}{max_row}"
+
+    tab = Table(displayName=table_name, ref=table_ref)
+    style = TableStyleInfo(
+        name="TableStyleMedium2",
+        showFirstColumn=False,
+        showLastColumn=False,
+        showRowStripes=True,
+        showColumnStripes=False,
+    )
+    tab.tableStyleInfo = style
+    worksheet.add_table(tab)
+```
+
+### 3.6 Conditional Formatting
+
+Applied to columns with "percent_change" in the header:
+
+```python
+ColorScaleRule(
+    start_type="min", start_color="F8696B",  # Red
+    mid_type="num", mid_value=0, mid_color="FFFFFF",  # White
+    end_type="max", end_color="63BE7B",  # Green
+)
+```
+
+---
+
+## 4. 
Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +├── generators/ # Existing +│ ├── __init__.py # Add export +│ ├── html_generator.py # Existing +│ ├── sweep_comparison.py # Existing +│ ├── performance_report.py # Existing +│ └── excel_report.py # NEW: Final Excel report generator +└── cli.py # Update generate excel command +``` + +### 4.2 Relationship with Existing Modules + +The `generate excel` command will use: +- `comparison/formatting.py` - For color scale formatting (already implemented) +- Excel table creation - New utility functions + +### 4.3 Simplification Consideration + +Since `compare gpu_timeline` and `compare collective` now produce files with BOTH combined data AND comparison sheets, we could: + +**Option A: Keep current interface (4 files)** +- Matches original script exactly +- More flexible but verbose + +**Option B: Simplified interface (2 files)** +- Only needs comparison files (they contain combined data too) +- Cleaner CLI but may need to extract raw data from comparison files + +**Recommendation:** Option B with backward compatibility for Option A + +--- + +## 5. Module Details + +### 5.1 `generators/excel_report.py` + +```python +"""Final Excel report generator. 
+ +Creates comprehensive report with: +- Summary Dashboard (first, visible) +- Comparison sheets (visible) +- Raw data sheets (hidden) +- Excel table formatting +- Color-coded percent_change columns +""" + +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import pandas as pd +from openpyxl import load_workbook +from openpyxl.worksheet.table import Table, TableStyleInfo + +from ..comparison.formatting import get_column_letter, create_color_scale_rule + + +# Sheet naming mappings +GPU_SHEET_MAPPING = { + "Summary": "GPU_Summary_Raw", + "All_Ranks_Combined": "GPU_AllRanks_Raw", + "Per_Rank_Time_ms": "GPU_Time_Raw", + "Per_Rank_Percent": "GPU_Pct_Raw", +} + +GPU_COMPARISON_MAPPING = { + "Summary_Comparison": "GPU_Summary_Cmp", + "Comparison_By_Rank": "GPU_ByRank_Cmp", +} + +COLL_SHEET_MAPPING = { + "nccl_summary_implicit_sync": "NCCL_ImplSync_Raw", + "nccl_summary_long": "NCCL_Long_Raw", +} + + +def create_final_excel_report( + gpu_comparison_path: Path, + coll_comparison_path: Path, + output_path: Path, + baseline_label: str = "Baseline", + test_label: str = "Test", + gpu_combined_path: Optional[Path] = None, + coll_combined_path: Optional[Path] = None, + verbose: bool = False, +) -> Path: + """ + Create comprehensive final Excel report. 
+ + Args: + gpu_comparison_path: Path to GPU comparison file + coll_comparison_path: Path to collective comparison file + output_path: Output path for final report + baseline_label: Label for baseline column + test_label: Label for test column + gpu_combined_path: Optional separate GPU combined file + coll_combined_path: Optional separate collective combined file + verbose: Print progress + + Returns: + Path to created report + """ + + +def _add_gpu_sheets( + writer: pd.ExcelWriter, + gpu_comparison_path: Path, + gpu_combined_path: Optional[Path], + verbose: bool, +) -> Tuple[List[str], List[str]]: + """Add GPU timeline sheets, return (raw_sheets, comparison_sheets).""" + + +def _add_collective_sheets( + writer: pd.ExcelWriter, + coll_comparison_path: Path, + coll_combined_path: Optional[Path], + verbose: bool, +) -> Tuple[List[str], List[str]]: + """Add collective sheets, return (raw_sheets, comparison_sheets).""" + + +def _create_summary_dashboard( + writer: pd.ExcelWriter, + gpu_comparison_path: Path, + baseline_label: str, + test_label: str, + verbose: bool, +) -> str: + """Create Summary_Dashboard sheet, return sheet name.""" + + +def _apply_post_processing( + output_path: Path, + raw_sheets: List[str], + comparison_sheets: List[str], + verbose: bool, +) -> None: + """Apply Excel formatting: hide sheets, add tables, color formatting.""" + + +def add_excel_table(worksheet, table_name: str, start_row: int = 1) -> None: + """Convert worksheet data to Excel table format.""" + + +def _sanitize_table_name(sheet_name: str) -> str: + """Create valid Excel table name from sheet name.""" +``` + +### 5.2 Updated `generators/__init__.py` + +```python +"""Report generators for HTML and Excel.""" + +from .html_generator import generate_html, image_to_base64 +from .excel_report import create_final_excel_report + +__all__ = [ + "generate_html", + "image_to_base64", + "create_final_excel_report", +] +``` + +--- + +## 6. 
Data Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ generate excel │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ INPUTS: │ +│ ├── gpu_comparison.xlsx │ +│ │ ├── Summary (combined) │ +│ │ ├── All_Ranks_Combined (combined) │ +│ │ ├── Per_Rank_Time_ms (combined) │ +│ │ ├── Per_Rank_Percent (combined) │ +│ │ ├── Comparison_By_Rank │ +│ │ └── Summary_Comparison │ +│ │ │ +│ └── collective_comparison.xlsx │ +│ ├── nccl_summary_implicit_sync (combined) │ +│ ├── nccl_summary_long (combined) │ +│ ├── nccl_implicit_sync_cmp │ +│ └── nccl_long_cmp │ +│ │ +│ PROCESSING: │ +│ ──────────── │ +│ 1. Read all sheets from input files │ +│ 2. Rename sheets according to naming convention │ +│ 3. Create Summary_Dashboard from GPU comparison data │ +│ 4. Write all sheets to new workbook │ +│ 5. Post-process with openpyxl: │ +│ - Hide raw data sheets │ +│ - Convert to Excel tables │ +│ - Apply color formatting │ +│ - Move Summary_Dashboard to first position │ +│ │ +│ OUTPUT: │ +│ └── final_analysis_report.xlsx │ +│ ├── Summary_Dashboard (visible, FIRST) │ +│ ├── GPU_Summary_Cmp (visible) │ +│ ├── GPU_ByRank_Cmp (visible) │ +│ ├── NCCL_Implicit_sync_cmp (visible) │ +│ ├── NCCL_Long_cmp (visible) │ +│ ├── GPU_Summary_Raw (HIDDEN) │ +│ ├── GPU_AllRanks_Raw (HIDDEN) │ +│ ├── GPU_Time_Raw (HIDDEN) │ +│ ├── GPU_Pct_Raw (HIDDEN) │ +│ ├── NCCL_ImplSync_Raw (HIDDEN) │ +│ └── NCCL_Long_Raw (HIDDEN) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Implementation Order + +| Phase | Task | Est. 
Time | +|-------|------|-----------| +| **1** | Create `generators/excel_report.py` with core functions | 45 min | +| **2** | Implement `_add_gpu_sheets()` | 20 min | +| **3** | Implement `_add_collective_sheets()` | 20 min | +| **4** | Implement `_create_summary_dashboard()` | 25 min | +| **5** | Implement `_apply_post_processing()` | 30 min | +| **6** | Update `generators/__init__.py` | 5 min | +| **7** | Update CLI command in `cli.py` | 15 min | +| **8** | Testing | 20 min | + +**Total estimated time: ~3 hours** + +--- + +## 8. Expected Output + +### Console Output + +``` +============================================================ +Creating Final Excel Report +============================================================ +Output: final_analysis_report.xlsx +Baseline label: ROCm 6.0 +Test label: ROCm 7.0 + +Step 1: Adding GPU Timeline sheets + Added GPU_Summary_Raw (will be hidden) + Added GPU_AllRanks_Raw (will be hidden) + Added GPU_Time_Raw (will be hidden) + Added GPU_Pct_Raw (will be hidden) + Added GPU_Summary_Cmp + Added GPU_ByRank_Cmp + +Step 2: Adding Collective/NCCL sheets + Added NCCL_ImplSync_Raw (will be hidden) + Added NCCL_Long_Raw (will be hidden) + Added NCCL_Implicit_sync_cmp + Added NCCL_Long_cmp + +Step 3: Creating Summary Dashboard + Added Summary_Dashboard + +Step 4: Applying formatting + Hidden: GPU_Summary_Raw + Hidden: GPU_AllRanks_Raw + Hidden: GPU_Time_Raw + Hidden: GPU_Pct_Raw + Hidden: NCCL_ImplSync_Raw + Hidden: NCCL_Long_Raw + Converted to table: Summary_Dashboard + Converted to table: GPU_Summary_Cmp + Converted to table: GPU_ByRank_Cmp + ... + Applied color scale to GPU_Summary_Cmp column percent_change + Applied color scale to GPU_ByRank_Cmp column percent_change + ... + Moved Summary_Dashboard to first position + +============================================================ +Report Complete! 
+============================================================ +Output: final_analysis_report.xlsx + +Report Structure: + Visible Sheets (Analysis): + - Summary_Dashboard + - GPU_Summary_Cmp + - GPU_ByRank_Cmp + - NCCL_Implicit_sync_cmp + - NCCL_Long_cmp + + Hidden Sheets (Raw Data): + - GPU_Summary_Raw + - GPU_AllRanks_Raw + - GPU_Time_Raw + - GPU_Pct_Raw + - NCCL_ImplSync_Raw + - NCCL_Long_Raw + +Features: + - All data formatted as Excel tables with filters + - Percent change columns are color-coded (green=better, red=worse) + - Unhide raw data: Right-click sheet tab → Unhide +``` + +### Summary Dashboard Content + +| Metric | ROCm 6.0 | ROCm 7.0 | Improvement (%) | Status | +|--------|----------|----------|-----------------|--------| +| GPU_busy_time | 125.45 | 118.32 | 5.68 | Better | +| GPU_idle_time | 21.78 | 19.45 | 10.70 | Better | +| GPU_computation_time | 98.34 | 95.12 | 3.27 | Better | +| GPU_exposed_comm_time | 27.11 | 23.20 | 14.42 | Better | +| GPU_total_time | 147.23 | 137.77 | 6.43 | Better | + +--- + +## Appendix A: CLI Update + +### Simplified Interface (Recommended) + +```python +@generate.command("excel") +@click.option("--gpu-comparison", required=True, type=click.Path(exists=True), + help="GPU timeline comparison file (from 'compare gpu_timeline')") +@click.option("--coll-comparison", required=True, type=click.Path(exists=True), + help="Collective comparison file (from 'compare collective')") +@click.option("--baseline-label", default="Baseline", + help="Label for baseline configuration") +@click.option("--test-label", default="Test", + help="Label for test configuration") +@click.option("-o", "--output", required=True, type=click.Path(), + help="Output Excel file path") +@click.pass_context +def generate_excel(ctx, gpu_comparison, coll_comparison, baseline_label, test_label, output): + """Generate comprehensive final Excel report. 
+ + Combines GPU timeline and collective comparison data into a single + well-organized Excel report with: + + \b + - Summary Dashboard (first sheet, key metrics at a glance) + - Comparison sheets (visible, with color-coded changes) + - Raw data sheets (hidden, accessible via Unhide) + - Excel table formatting with filters + + \b + Examples: + aorta-report generate excel \\ + --gpu-comparison gpu_comparison.xlsx \\ + --coll-comparison collective_comparison.xlsx \\ + -o final_report.xlsx + """ +``` + +### Full Interface (Backward Compatible) + +```python +@generate.command("excel") +@click.option("--gpu-comparison", required=True, type=click.Path(exists=True)) +@click.option("--coll-comparison", required=True, type=click.Path(exists=True)) +@click.option("--gpu-combined", type=click.Path(exists=True), + help="Optional: Separate GPU combined file") +@click.option("--coll-combined", type=click.Path(exists=True), + help="Optional: Separate collective combined file") +@click.option("--baseline-label", default="Baseline") +@click.option("--test-label", default="Test") +@click.option("-o", "--output", required=True, type=click.Path()) +``` + +--- + +## Appendix B: Design Decisions + +1. **Interface:** Keep original 4-file interface (can refactor later) + +2. **Dashboard Metrics:** Include both GPU and NCCL metrics in Summary Dashboard + +3. **Table Style:** Use `TableStyleMedium2` (standard) + +4. **Sheet Order:** Dashboard → GPU Comparison → NCCL Comparison → (hidden) + diff --git a/src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md b/src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md new file mode 100644 index 0000000..8de1a34 --- /dev/null +++ b/src/aorta/report/GENERATE_PLOTS_DEV_DOCS.md @@ -0,0 +1,1383 @@ +# `generate plots` Command - Developer Documentation + +**Version:** 1.1 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Source Scripts Analysis](#2-source-scripts-analysis) +3. 
[Command Specification](#3-command-specification) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Data Flow](#6-data-flow) +7. [Implementation Order](#7-implementation-order) +8. [Output Files](#8-output-files) + +--- + +## 1. Overview + +The `generate plots` command creates visualization plots from analysis reports. It merges functionality from two existing scripts into a unified interface with two plot types. + +### Plot Types + +| Type | Source Script | Input | Description | +|------|---------------|-------|-------------| +| `summary` | `create_final_plots.py` | Excel report | GPU timeline & NCCL comparison charts | +| `gemm` | `plot_gemm_variance.py` | CSV file | GEMM kernel variance distribution plots | + +### Scripts Being Merged + +| Script | Lines | Location | +|--------|-------|----------| +| `create_final_plots.py` | 333 | `scripts/tracelens_single_config/` | +| `plot_gemm_variance.py` | 423 | `scripts/gemm_analysis/` | +| **Total** | **756** | - | + +--- + +## 2. 
+Source Scripts Analysis
+
+### 2.1 `create_final_plots.py` → Plot Type: `summary`
+
+**Input:** Final Excel report (output of `generate excel`)
+**Required Sheets:** `Summary_Dashboard`, `GPU_ByRank_Cmp`, `NCCL_ImplicitSyncCmp`
+
+> **NOTE (review):** The Excel-report documentation elsewhere names the NCCL comparison sheet `NCCL_Implicit_sync_cmp`, while this document (and the plot code, which calls `pd.read_excel(..., sheet_name="NCCL_ImplicitSyncCmp")`) uses `NCCL_ImplicitSyncCmp`. Confirm the actual sheet name emitted by `generate excel` and align both documents, otherwise the summary plots will fail to load the sheet.
+
+#### Functions & Output Files
+
+| Function | Output File | Description |
+|----------|-------------|-------------|
+| `plot_improvement_chart()` | `improvement_chart.png` | Horizontal bar chart showing % improvement per metric |
+| `plot_abs_time_comparison()` | `abs_time_comparison.png` | Grouped bar chart: baseline vs test absolute times |
+| `create_gpu_time_accross_all_ranks()` | `{metric}_by_rank.png` | Line plots showing metric values across ranks (4 files) |
+| `create_gpu_time_change_percentage_summaryby_rank()` | `gpu_time_change_percentage_summary_by_rank.png` | 2×4 grid of bar charts per metric type |
+| `create_gpu_time_heatmap()` | `gpu_time_heatmap.png` | Seaborn heatmap: percent_change by (metric × rank) |
+| `create_nccl_charts()` | `NCCL_*.png` | 5 NCCL comparison charts |
+
+**Total Output Files:** ~13 PNG files
+
+---
+
+### 2.2 `plot_gemm_variance.py` → Plot Type: `gemm`
+
+**Input:** GEMM variance CSV (output of `analyze gemm` + optional `process gemm-variance`)
+**Required Columns:** `threads`, `channel`, `rank`, `time_diff_us`, `kernel_name`
+
+#### Functions & Output Files
+
+| Function | Output File | Description |
+|----------|-------------|-------------|
+| `create_boxplot_by_threads()` | `variance_by_threads_boxplot.png` | Box plot: variance distribution by thread count |
+| `create_boxplot_by_channels()` | `variance_by_channels_boxplot.png` | Box plot: variance distribution by channel count |
+| `create_boxplot_by_ranks()` | `variance_by_ranks_boxplot.png` | Box plot: variance distribution by rank |
+| `create_violin_plot_combined()` | `variance_violin_combined.png` | 1×3 grid: violin plots for all dimensions |
+| `create_interaction_plot()` | `variance_thread_channel_interaction.png` | Line plot: 
thread-channel interaction | + +**Total Output Files:** 5 PNG files + +--- + +## 3. Command Specification + +### CLI Interface + +```bash +# Summary plots (GPU timeline + NCCL from Excel) +aorta-report generate plots \ + -i final_report.xlsx \ + -o ./plots/ \ + --type summary + +# GEMM variance plots (from CSV) +aorta-report generate plots \ + -i gemm_variance.csv \ + -o ./plots/ \ + --type gemm + +# All plots (requires both inputs) +aorta-report generate plots \ + --excel-input final_report.xlsx \ + --gemm-csv gemm_variance.csv \ + -o ./plots/ \ + --type all +``` + +### Options + +| Option | Required | Default | Description | +|--------|----------|---------|-------------| +| `-i, --input` | Conditional | - | Input file (Excel for summary, CSV for gemm) | +| `--excel-input` | For `all` | - | Excel report file (for `--type all`) | +| `--gemm-csv` | For `all` | - | GEMM variance CSV (for `--type all`) | +| `-o, --output` | Yes | - | Output directory for PNG files | +| `--type` | No | `all` | Plot type: `summary`, `gemm`, or `all` | +| `--dpi` | No | `150` | DPI for output images | + +### Validation Rules + +1. If `--type summary`: `-i` must be an Excel file with required sheets +2. If `--type gemm`: `-i` must be a CSV file with required columns +3. If `--type all`: Both `--excel-input` and `--gemm-csv` must be provided + +--- + +## 4. 
Implementation Architecture + +### 4.1 File Structure + +``` +src/aorta/report/ +└── generators/ + ├── __init__.py # Update exports + ├── html_generator.py # Existing + ├── excel_report.py # Existing + ├── plot_generator.py # NEW: Thin orchestrator (~100 lines) + └── plot_helper/ # NEW: Internal package + ├── __init__.py # Exports all plot functions + ├── common.py # Shared utilities, colors, styles + │ + │ # Summary plots (from create_final_plots.py) + ├── summary_dashboard.py # improvement_chart, abs_time_comparison + ├── gpu_by_rank.py # GPU metrics by rank line plots + ├── gpu_percent_change.py # 2x4 grid of percent change bars + ├── gpu_heatmap.py # Seaborn heatmap + ├── nccl_charts.py # NCCL comparison charts + │ + │ # GEMM plots (from plot_gemm_variance.py) + ├── gemm_data.py # CSV reader, statistics + ├── gemm_boxplots.py # Boxplots by threads/channels/ranks + ├── gemm_violin.py # Combined violin plot + └── gemm_interaction.py # Thread-channel interaction plot +``` + +### 4.2 File Size Estimates + +| File | Functions | Lines (est.) 
| +|------|-----------|--------------| +| **Common** | | | +| `common.py` | `configure_style()`, `COLORS`, `save_figure()` | ~50 | +| **Summary Plots** | | | +| `summary_dashboard.py` | `plot_improvement_chart()`, `plot_abs_time_comparison()`, `get_labels()` | ~80 | +| `gpu_by_rank.py` | `plot_gpu_metrics_by_rank()` | ~70 | +| `gpu_percent_change.py` | `plot_gpu_percent_change_grid()` | ~60 | +| `gpu_heatmap.py` | `plot_gpu_heatmap()` | ~50 | +| `nccl_charts.py` | `plot_nccl_comparison()`, `plot_nccl_percent_change()` | ~120 | +| **GEMM Plots** | | | +| `gemm_data.py` | `read_gemm_csv_data()`, `print_statistics()` | ~60 | +| `gemm_boxplots.py` | `create_boxplot()`, `plot_by_threads()`, `plot_by_channels()`, `plot_by_ranks()` | ~100 | +| `gemm_violin.py` | `plot_variance_violin_combined()` | ~80 | +| `gemm_interaction.py` | `plot_thread_channel_interaction()` | ~60 | +| **Orchestrator** | | | +| `plot_generator.py` | `generate_summary_plots()`, `generate_gemm_plots()`, `generate_plots()` | ~100 | +| **Total** | | **~830** | + +--- + +## 5. Module Details + +### 5.1 `plot_helper/common.py` + +Shared utilities, colors, and styling for all plots. 
+ +```python +"""Common utilities for plot generation.""" + +from pathlib import Path +from typing import Tuple + +import matplotlib.pyplot as plt +import seaborn as sns + + +# ============================================================================= +# Color Palette +# ============================================================================= + +COLORS = { + "positive": "#2ecc71", # Green - improvements + "negative": "#e74c3c", # Red - regressions + "baseline": "#3498db", # Blue - baseline data + "test": "#e67e22", # Orange - test data + "neutral": "#95a5a6", # Gray - neutral +} + +# Extended palette for multi-series +PALETTE_MULTI = ["#3498db", "#e67e22", "#2ecc71", "#e74c3c", "#9b59b6", "#1abc9c"] + + +# ============================================================================= +# Plot Configuration +# ============================================================================= + +DEFAULT_DPI = 150 +DEFAULT_FIGSIZE = (10, 6) + + +def configure_style() -> None: + """Configure matplotlib/seaborn style for consistent plots.""" + sns.set_style("whitegrid") + plt.rcParams.update({ + "figure.dpi": DEFAULT_DPI, + "savefig.dpi": DEFAULT_DPI, + "font.size": 12, + "axes.titlesize": 14, + "axes.labelsize": 12, + }) + + +def remove_spines(ax) -> None: + """Remove all spines from an axis.""" + for spine in ["top", "right", "bottom", "left"]: + ax.spines[spine].set_visible(False) + + +def save_figure( + fig, + output_path: Path, + dpi: int = DEFAULT_DPI, + close: bool = True, +) -> Path: + """Save figure and optionally close it.""" + output_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(output_path, dpi=dpi, bbox_inches="tight") + if close: + plt.close(fig) + return output_path + + +def get_improvement_colors(values) -> list: + """Return green/red colors based on positive/negative values.""" + return [COLORS["positive"] if v > 0 else COLORS["negative"] for v in values] +``` + +--- + +### 5.2 `plot_helper/summary_dashboard.py` + +Dashboard-level plots 
from Summary_Dashboard sheet. + +```python +"""Summary dashboard plots: improvement chart and absolute time comparison.""" + +from pathlib import Path +from typing import List + +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +from .common import ( + COLORS, DEFAULT_DPI, DEFAULT_FIGSIZE, + remove_spines, save_figure, get_improvement_colors, +) + + +def get_labels_from_excel(excel_path: Path) -> List[str]: + """Extract baseline/test labels from Summary_Dashboard sheet.""" + df = pd.read_excel(excel_path, sheet_name="Summary_Dashboard") + cols = df.columns.tolist() + return [cols[1], cols[2]] # Baseline and Test column names + + +def plot_improvement_chart( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create horizontal bar chart of percent improvement. + + Reads Summary_Dashboard sheet, plots Metric vs Improvement (%). + Green bars for positive (better), red for negative (worse). + """ + df = pd.read_excel(excel_path, sheet_name="Summary_Dashboard") + + fig, ax = plt.subplots(figsize=DEFAULT_FIGSIZE) + + colors = get_improvement_colors(df["Improvement (%)"]) + ax.barh(df["Metric"], df["Improvement (%)"], color=colors) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + remove_spines(ax) + + ax.set_ylabel("Metric", fontsize=12) + ax.set_xlabel("Change (%)", fontsize=12) + ax.set_title( + "GPU Metrics Percentage Change (Test vs Baseline)\n(Positive = Test is better)", + fontsize=14, fontweight="bold", + ) + + plt.tight_layout() + return save_figure(fig, output_dir / "improvement_chart.png", dpi) + + +def plot_abs_time_comparison( + excel_path: Path, + output_dir: Path, + labels: List[str], + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create grouped bar chart of baseline vs test absolute times. + + Reads Summary_Dashboard sheet, plots side-by-side bars for each metric. 
+ """ + df = pd.read_excel(excel_path, sheet_name="Summary_Dashboard") + + fig, ax = plt.subplots(figsize=DEFAULT_FIGSIZE) + + x = np.arange(len(df)) + width = 0.35 + colors = [COLORS["baseline"], COLORS["test"]] + + for i, label in enumerate(labels): + offset = (i - len(labels) / 2 + 0.5) * width + ax.bar(x + offset, df[label], width, label=label, color=colors[i]) + + ax.xaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + remove_spines(ax) + + ax.set_xlabel("Metric Type", fontsize=12) + ax.set_ylabel("Time (ms)", fontsize=12) + ax.set_title("GPU Metrics Absolute Time Comparison", fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels(df["Metric"], rotation=45, ha="right") + ax.legend() + + plt.tight_layout() + return save_figure(fig, output_dir / "abs_time_comparison.png", dpi) +``` + +--- + +### 5.3 `plot_helper/gpu_by_rank.py` + +Line plots showing GPU metrics across ranks. + +```python +"""GPU metrics by rank line plots.""" + +from pathlib import Path +from typing import List + +import pandas as pd +import matplotlib.pyplot as plt + +from .common import COLORS, DEFAULT_DPI, save_figure + + +METRICS_TO_PLOT = ["total_time", "computation_time", "total_comm_time", "idle_time"] + + +def plot_gpu_metrics_by_rank( + excel_path: Path, + output_dir: Path, + labels: List[str], + metrics: List[str] = None, + dpi: int = DEFAULT_DPI, +) -> List[Path]: + """ + Create line plots for GPU metrics across ranks. + + Reads GPU_ByRank_Cmp sheet, creates one plot per metric type. + Each plot shows baseline vs test values across all ranks. + + Returns list of generated file paths. 
+ """ + df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp") + metrics = metrics or METRICS_TO_PLOT + + output_files = [] + colors = [COLORS["baseline"], COLORS["test"]] + markers = ["o", "s"] + + for metric in metrics: + metric_df = df[df["type"] == metric] + if metric_df.empty: + continue + + fig, ax = plt.subplots(figsize=(12, 6)) + + for i, label in enumerate(labels): + col_name = f"{label}_time_ms" + if col_name in metric_df.columns: + ax.plot( + metric_df["rank"], + metric_df[col_name], + marker=markers[i], + linewidth=2, + markersize=8, + color=colors[i], + label=label, + ) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + + ax.set_xlabel("Rank", fontsize=12) + ax.set_ylabel("Time (ms)", fontsize=12) + ax.set_title(f"{metric} Comparison across all ranks", fontsize=14, fontweight="bold") + ax.legend() + + plt.tight_layout() + output_path = save_figure(fig, output_dir / f"{metric}_by_rank.png", dpi) + output_files.append(output_path) + + return output_files +``` + +--- + +### 5.4 `plot_helper/gpu_percent_change.py` + +2×4 grid of percent change bar charts. + +```python +"""GPU percent change grid plot.""" + +from pathlib import Path + +import pandas as pd +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure, get_improvement_colors + + +METRIC_TYPES = [ + "busy_time", "computation_time", "exposed_comm_time", "exposed_memcpy_time", + "idle_time", "total_comm_time", "total_memcpy_time", "total_time", +] + + +def plot_gpu_percent_change_grid( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create 2x4 grid of percent change bar charts by rank. + + Reads GPU_ByRank_Cmp sheet, creates one subplot per metric type. + Each subplot shows percent_change for all ranks as bar chart. 
+ """ + df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp") + + fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(14, 8)) + + for i, metric_type in enumerate(METRIC_TYPES): + ax = axes[i // 4, i % 4] + type_df = df[df["type"] == metric_type] + + if type_df.empty: + ax.set_visible(False) + continue + + colors = get_improvement_colors(type_df["percent_change"]) + ax.bar(type_df["rank"].astype(str), type_df["percent_change"], color=colors) + + ax.axhline(y=0, color="black", linestyle="-", linewidth=0.5) + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + ax.set_xlabel("Rank") + ax.set_ylabel("Percent Change (%)") + ax.set_title(metric_type, fontsize=10) + + fig.suptitle( + "GPU Metrics Percent Change by Rank\n(Positive = Better)", + fontsize=14, fontweight="bold", + ) + plt.tight_layout() + return save_figure(fig, output_dir / "gpu_time_change_percentage_summary_by_rank.png", dpi) +``` + +--- + +### 5.5 `plot_helper/gpu_heatmap.py` + +Seaborn heatmap of percent change. + +```python +"""GPU percent change heatmap.""" + +from pathlib import Path + +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +from .common import DEFAULT_DPI, save_figure + + +def plot_gpu_heatmap( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create heatmap of percent_change by metric type and rank. + + Reads GPU_ByRank_Cmp sheet, pivots to (metric × rank) matrix, + and creates color-coded heatmap (green=better, red=worse). 
+ """ + df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp") + pivot_df = df.pivot(index="type", columns="rank", values="percent_change") + + fig, ax = plt.subplots(figsize=(12, 8)) + + sns.heatmap( + pivot_df, + annot=True, + fmt=".1f", + cmap="RdYlGn", + center=0, + linewidths=0.5, + cbar_kws={"label": "Percent Change (%)"}, + ax=ax, + ) + + ax.set_title( + "GPU Metric Percentage Change by Rank (HeatMap)\n(Positive = Better Test)", + fontsize=14, fontweight="bold", + ) + ax.set_xlabel("Rank", fontsize=12) + ax.set_ylabel("Metric Type", fontsize=12) + + plt.tight_layout() + return save_figure(fig, output_dir / "gpu_time_heatmap.png", dpi) +``` + +--- + +### 5.6 `plot_helper/nccl_charts.py` + +NCCL comparison charts. + +```python +"""NCCL comparison charts.""" + +from pathlib import Path +from typing import List + +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +from .common import COLORS, DEFAULT_DPI, save_figure, get_improvement_colors + + +NCCL_METRICS = { + "NCCL Communication Latency": { + "y_col": "comm_latency_mean", + "y_label": "Communication Latency (ms)", + }, + "NCCL Algorithm Bandwidth": { + "y_col": "algo bw (GB/s)_mean", + "y_label": "Algorithm Bandwidth (GB/s)", + }, + "NCCL Bus Bandwidth": { + "y_col": "bus bw (GB/s)_mean", + "y_label": "Bus Bandwidth (GB/s)", + }, + "NCCL Total Communication Latency": { + "y_col": "Total comm latency (ms)", + "y_label": "Total Communication Latency (ms)", + }, +} + +NCCL_PERCENT_METRICS = { + "Comm Latency": "percent_change_comm_latency_mean", + "Algo BW": "percent_change_algo bw (GB/s)_mean", + "Bus BW": "percent_change_bus bw (GB/s)_mean", +} + + +def plot_nccl_comparison( + excel_path: Path, + output_dir: Path, + labels: List[str], + dpi: int = DEFAULT_DPI, +) -> List[Path]: + """ + Create NCCL metric comparison bar charts. + + Reads NCCL_ImplicitSyncCmp sheet, creates grouped bar charts + for each metric (latency, bandwidth). 
+ """ + df = pd.read_excel(excel_path, sheet_name="NCCL_ImplicitSyncCmp") + df["label"] = df["Collective name"] + "\n" + df["In msg nelems"].astype(str) + + x = np.arange(len(df)) + width = 0.35 + colors = [COLORS["baseline"], COLORS["test"]] + output_files = [] + + for title, config in NCCL_METRICS.items(): + fig, ax = plt.subplots(figsize=(14, 6)) + + for i, label in enumerate(labels): + col_name = f"{label}_{config['y_col']}" + if col_name in df.columns: + offset = (i - len(labels) / 2 + 0.5) * width + ax.bar(x + offset, df[col_name], width, label=label, color=colors[i]) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + ax.set_xticks(x) + ax.set_xticklabels(df["label"], rotation=45, ha="right", fontsize=8) + ax.set_xlabel("Collective Operation (Message Size)", fontsize=12) + ax.set_ylabel(config["y_label"], fontsize=12) + ax.set_title(f"{title} Comparison", fontsize=14, fontweight="bold") + ax.legend() + + plt.tight_layout() + filename = f'{title.replace(" ", "_")}_comparison.png' + output_files.append(save_figure(fig, output_dir / filename, dpi)) + + return output_files + + +def plot_nccl_percent_change( + excel_path: Path, + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """ + Create 1x3 grid of NCCL percent change horizontal bar charts. 
+ """ + df = pd.read_excel(excel_path, sheet_name="NCCL_ImplicitSyncCmp") + + fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 6)) + + for i, (title, col_name) in enumerate(NCCL_PERCENT_METRICS.items()): + ax = axes[i] + if col_name not in df.columns: + ax.set_visible(False) + continue + + colors = get_improvement_colors(df[col_name]) + ax.barh(df["In msg nelems"].astype(str), df[col_name], color=colors) + + ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray") + ax.set_axisbelow(True) + ax.set_xlabel("Percent Change (%)") + ax.set_title(f"{title}\nPercent Change (Positive = better)") + + fig.suptitle( + "NCCL Performance Percentage Change By Message Size", + fontsize=16, fontweight="bold", + ) + plt.tight_layout() + return save_figure(fig, output_dir / "NCCL_Performance_Percentage_Change_comparison.png", dpi) +``` + +--- + +### 5.7 `plot_helper/gemm_data.py` + +GEMM CSV reader and statistics. + +```python +"""GEMM variance data loading and statistics.""" + +import csv +from pathlib import Path +from typing import Dict, List, Any +from collections import defaultdict + + +def read_gemm_csv_data(csv_path: Path) -> Dict[str, Any]: + """ + Read GEMM variance CSV and organize by dimensions. 
+ + Returns: + { + "threads": {256: [values], 512: [values]}, + "channels": {28: [values], 42: [values], ...}, + "ranks": {0: [values], 1: [values], ...}, + "all": [list of row dicts], + } + """ + data = { + "threads": defaultdict(list), + "channels": defaultdict(list), + "ranks": defaultdict(list), + "all": [], + } + + with open(csv_path, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + try: + threads = int(row["threads"]) + channel = int(row["channel"]) + rank = int(row["rank"]) + time_diff = float(row["time_diff_us"]) + + data["threads"][threads].append(time_diff) + data["channels"][channel].append(time_diff) + data["ranks"][rank].append(time_diff) + data["all"].append({ + "threads": threads, + "channel": channel, + "rank": rank, + "time_diff": time_diff, + "kernel_name": row["kernel_name"], + }) + except (ValueError, KeyError) as e: + continue + + return data + + +def _calculate_median(values: List[float]) -> float: + """Calculate median of a list of values.""" + sorted_vals = sorted(values) + n = len(sorted_vals) + if n % 2 == 1: + return sorted_vals[n // 2] + return (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2 + + +def print_gemm_statistics(data: Dict[str, Any], verbose: bool = True) -> Dict[str, Any]: + """Print and return summary statistics.""" + stats = {} + + if verbose: + print("\n" + "=" * 70) + print("VARIANCE DISTRIBUTION STATISTICS") + print("=" * 70) + + for dimension, label_fmt in [ + ("threads", "{} threads"), + ("channels", "{}ch"), + ("ranks", "Rank {}"), + ]: + stats[dimension] = {} + if verbose: + print(f"\nBy {dimension.title()}:") + + for key in sorted(data[dimension].keys()): + values = data[dimension][key] + mean_val = sum(values) / len(values) + median_val = _calculate_median(values) + + stats[dimension][key] = { + "mean": mean_val, + "median": median_val, + "max": max(values), + "count": len(values), + } + + if verbose: + label = label_fmt.format(key) + print(f" {label}: mean={mean_val:.2f}us, 
median={median_val:.2f}us, " + f"max={max(values):.2f}us, n={len(values)}") + + if verbose: + print("=" * 70 + "\n") + + return stats +``` + +--- + +### 5.8 `plot_helper/gemm_boxplots.py` + +GEMM variance boxplots. + +```python +"""GEMM variance boxplot generators.""" + +from pathlib import Path +from typing import Dict, List, Any, Tuple + +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure + + +def _create_boxplot( + data_dict: Dict[int, List[float]], + output_path: Path, + label_fmt: str, + xlabel: str, + title: str, + colors: List[str], + figsize: Tuple[int, int] = (10, 6), + dpi: int = DEFAULT_DPI, +) -> Path: + """Generic boxplot creation helper.""" + fig, ax = plt.subplots(figsize=figsize) + + keys_list = sorted(data_dict.keys()) + plot_data = [data_dict[k] for k in keys_list] + labels = [label_fmt.format(k) for k in keys_list] + + bp = ax.boxplot( + plot_data, + tick_labels=labels, + patch_artist=True, + showmeans=True, + meanline=True, + ) + + # Handle color assignment + if colors == "viridis": + colors = plt.cm.viridis([i / len(keys_list) for i in range(len(keys_list))]) + + for patch, color in zip(bp["boxes"], colors): + patch.set_facecolor(color) + + ax.set_ylabel("Time Difference (us)", fontsize=14, fontweight="bold") + ax.set_xlabel(xlabel, fontsize=14, fontweight="bold") + ax.set_title(title, fontsize=16, fontweight="bold", pad=20) + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return save_figure(fig, output_path, dpi) + + +def plot_variance_by_threads( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create boxplot of variance by thread count.""" + return _create_boxplot( + data_dict=data["threads"], + output_path=output_dir / "variance_by_threads_boxplot.png", + label_fmt="{} threads", + xlabel="Thread Configuration", + title="GEMM Kernel Time Variance by Thread Count", + colors=["lightblue", "lightcoral"], + figsize=(10, 6), + dpi=dpi, + ) + + +def plot_variance_by_channels( + 
data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create boxplot of variance by channel count.""" + return _create_boxplot( + data_dict=data["channels"], + output_path=output_dir / "variance_by_channels_boxplot.png", + label_fmt="{}ch", + xlabel="Channel Configuration", + title="GEMM Kernel Time Variance by Channel Count", + colors=["#e6f2ff", "#99ccff", "#4da6ff", "#0073e6"], + figsize=(12, 6), + dpi=dpi, + ) + + +def plot_variance_by_ranks( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create boxplot of variance by rank.""" + return _create_boxplot( + data_dict=data["ranks"], + output_path=output_dir / "variance_by_ranks_boxplot.png", + label_fmt="Rank {}", + xlabel="Rank", + title="GEMM Kernel Time Variance by Rank", + colors="viridis", + figsize=(14, 6), + dpi=dpi, + ) +``` + +--- + +### 5.9 `plot_helper/gemm_violin.py` + +Combined violin plot. + +```python +"""GEMM variance violin plot.""" + +from pathlib import Path +from typing import Dict, List, Any + +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure + + +def _prepare_violin_data(data_dict: Dict[int, List[float]], label_fmt: str) -> List[Dict]: + """Prepare data for violin plot from a dictionary.""" + result = [] + for key, values in sorted(data_dict.items()): + for val in values: + result.append({"config": label_fmt.format(key), "time_diff": val}) + return result + + +def plot_variance_violin_combined( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create combined violin plot (1x3 grid) for all dimensions.""" + fig, axes = plt.subplots(1, 3, figsize=(20, 6)) + + configs = [ + { + "data": _prepare_violin_data(data["threads"], "{}t"), + "sort_key": lambda x: int(x[:-1]), + "color": "lightblue", + "xlabel": "Threads", + "title": "By Thread Count", + }, + { + "data": _prepare_violin_data(data["channels"], "{}ch"), + "sort_key": lambda x: int(x[:-2]), + "color": 
"lightcoral", + "xlabel": "Channels", + "title": "By Channel Count", + }, + { + "data": _prepare_violin_data(data["ranks"], "R{}"), + "sort_key": lambda x: int(x[1:]), + "color": "lightgreen", + "xlabel": "Ranks", + "title": "By Rank", + }, + ] + + for ax, cfg in zip(axes, configs): + violin_data = cfg["data"] + configs_list = sorted(set(d["config"] for d in violin_data), key=cfg["sort_key"]) + values = [[d["time_diff"] for d in violin_data if d["config"] == c] for c in configs_list] + + parts = ax.violinplot( + values, + positions=range(len(configs_list)), + showmeans=True, + showmedians=True, + ) + for pc in parts["bodies"]: + pc.set_facecolor(cfg["color"]) + pc.set_alpha(0.7) + + ax.set_xticks(range(len(configs_list))) + ax.set_xticklabels(configs_list) + ax.set_ylabel("Time Difference (us)", fontsize=12, fontweight="bold") + ax.set_xlabel(cfg["xlabel"], fontsize=12, fontweight="bold") + ax.set_title(cfg["title"], fontsize=14, fontweight="bold") + ax.grid(True, alpha=0.3, axis="y") + + fig.suptitle( + "GEMM Kernel Time Variance Distribution", + fontsize=18, fontweight="bold", y=1.02, + ) + + plt.tight_layout() + return save_figure(fig, output_dir / "variance_violin_combined.png", dpi) +``` + +--- + +### 5.10 `plot_helper/gemm_interaction.py` + +Thread-channel interaction plot. 
+ +```python +"""GEMM thread-channel interaction plot.""" + +from pathlib import Path +from typing import Dict, Any +from collections import defaultdict + +import matplotlib.pyplot as plt + +from .common import DEFAULT_DPI, save_figure + + +def plot_thread_channel_interaction( + data: Dict[str, Any], + output_dir: Path, + dpi: int = DEFAULT_DPI, +) -> Path: + """Create thread-channel interaction line plot.""" + fig, ax = plt.subplots(figsize=(12, 7)) + + # Organize data by threads and channels + thread_channel_data = defaultdict(lambda: defaultdict(list)) + for row in data["all"]: + thread_channel_data[row["threads"]][row["channel"]].append(row["time_diff"]) + + threads = sorted(thread_channel_data.keys()) + channels = sorted(set( + ch for t_data in thread_channel_data.values() for ch in t_data.keys() + )) + + markers = ["o", "s", "^", "D"] + + for i, thread in enumerate(threads): + means = [] + for channel in channels: + if channel in thread_channel_data[thread]: + values = thread_channel_data[thread][channel] + means.append(sum(values) / len(values)) + else: + means.append(0) + + ax.plot( + channels, means, + marker=markers[i % len(markers)], + linewidth=2, + markersize=10, + label=f"{thread} threads", + ) + + ax.set_xlabel("Channel Count", fontsize=14, fontweight="bold") + ax.set_ylabel("Mean Time Difference (us)", fontsize=14, fontweight="bold") + ax.set_title( + "Thread-Channel Interaction: Mean Variance", + fontsize=16, fontweight="bold", pad=20, + ) + ax.set_xticks(channels) + ax.set_xticklabels([f"{c}ch" for c in channels]) + ax.legend(fontsize=12, loc="best") + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return save_figure(fig, output_dir / "variance_thread_channel_interaction.png", dpi) +``` + +--- + +### 5.11 `plot_helper/__init__.py` + +Package exports. 
+ +```python +"""Plot helper functions for summary and GEMM visualizations.""" + +from .common import configure_style, COLORS, save_figure, get_improvement_colors + +# Summary plots +from .summary_dashboard import ( + get_labels_from_excel, + plot_improvement_chart, + plot_abs_time_comparison, +) +from .gpu_by_rank import plot_gpu_metrics_by_rank +from .gpu_percent_change import plot_gpu_percent_change_grid +from .gpu_heatmap import plot_gpu_heatmap +from .nccl_charts import plot_nccl_comparison, plot_nccl_percent_change + +# GEMM plots +from .gemm_data import read_gemm_csv_data, print_gemm_statistics +from .gemm_boxplots import ( + plot_variance_by_threads, + plot_variance_by_channels, + plot_variance_by_ranks, +) +from .gemm_violin import plot_variance_violin_combined +from .gemm_interaction import plot_thread_channel_interaction + +__all__ = [ + # Common + "configure_style", + "COLORS", + "save_figure", + "get_improvement_colors", + # Summary + "get_labels_from_excel", + "plot_improvement_chart", + "plot_abs_time_comparison", + "plot_gpu_metrics_by_rank", + "plot_gpu_percent_change_grid", + "plot_gpu_heatmap", + "plot_nccl_comparison", + "plot_nccl_percent_change", + # GEMM + "read_gemm_csv_data", + "print_gemm_statistics", + "plot_variance_by_threads", + "plot_variance_by_channels", + "plot_variance_by_ranks", + "plot_variance_violin_combined", + "plot_thread_channel_interaction", +] +``` + +--- + +### 5.12 `generators/plot_generator.py` + +Main orchestrator (thin wrapper). + +```python +"""Plot generation orchestrator. + +Provides unified interface for generating summary and GEMM plots. 
+
"""

from pathlib import Path
from typing import Dict, List, Optional

from .plot_helper import (
    configure_style,
    # Summary
    get_labels_from_excel,
    plot_improvement_chart,
    plot_abs_time_comparison,
    plot_gpu_metrics_by_rank,
    plot_gpu_percent_change_grid,
    plot_gpu_heatmap,
    plot_nccl_comparison,
    plot_nccl_percent_change,
    # GEMM
    read_gemm_csv_data,
    print_gemm_statistics,
    plot_variance_by_threads,
    plot_variance_by_channels,
    plot_variance_by_ranks,
    plot_variance_violin_combined,
    plot_thread_channel_interaction,
)


def generate_summary_plots(
    excel_path: Path,
    output_dir: Path,
    dpi: int = 150,
    verbose: bool = False,
) -> List[Path]:
    """
    Generate all summary plots from Excel report.

    Args:
        excel_path: Excel report the labels and plot data are read from.
        output_dir: Destination directory for PNG files; created if missing.
        dpi: Resolution forwarded to every plot helper.
        verbose: Print per-step progress to stdout.

    Returns list of generated file paths.
    """
    # mkdir -p semantics: create the whole tree, tolerate an existing dir.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_files = []

    if verbose:
        print(f"\nGenerating summary plots from: {excel_path}")

    # Labels are read once from the workbook and shared by the comparison plots.
    labels = get_labels_from_excel(excel_path)
    if verbose:
        print(f"  Labels: {labels}")

    # Dashboard plots
    output_files.append(plot_improvement_chart(excel_path, output_dir, dpi))
    output_files.append(plot_abs_time_comparison(excel_path, output_dir, labels, dpi))

    # GPU plots (the by-rank helper returns several files, hence extend)
    output_files.extend(plot_gpu_metrics_by_rank(excel_path, output_dir, labels, dpi=dpi))
    output_files.append(plot_gpu_percent_change_grid(excel_path, output_dir, dpi))
    output_files.append(plot_gpu_heatmap(excel_path, output_dir, dpi))

    # NCCL plots (the comparison helper also returns several files)
    output_files.extend(plot_nccl_comparison(excel_path, output_dir, labels, dpi))
    output_files.append(plot_nccl_percent_change(excel_path, output_dir, dpi))

    if verbose:
        print(f"  Generated {len(output_files)} summary plots")

    return output_files


def generate_gemm_plots(
    csv_path: Path,
    output_dir: Path,
    dpi: int = 150,
    verbose: bool = False,
) -> List[Path]:
    """
    Generate all GEMM variance plots from CSV.

    Args:
        csv_path: GEMM variance CSV parsed by ``read_gemm_csv_data``.
        output_dir: Destination directory for PNG files; created if missing.
        dpi: Resolution forwarded to every plot helper.
        verbose: Print data statistics and progress to stdout.

    Returns list of generated file paths.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_files = []

    if verbose:
        print(f"\nGenerating GEMM plots from: {csv_path}")

    # data["all"] holds every data point; remaining keys feed the grouped
    # plots below — presumably threads/channels/ranks groupings, confirm
    # against gemm_data.py.
    data = read_gemm_csv_data(csv_path)

    if verbose:
        print(f"  Total data points: {len(data['all'])}")
        print_gemm_statistics(data)

    # Boxplots
    output_files.append(plot_variance_by_threads(data, output_dir, dpi))
    output_files.append(plot_variance_by_channels(data, output_dir, dpi))
    output_files.append(plot_variance_by_ranks(data, output_dir, dpi))

    # Violin and interaction
    output_files.append(plot_variance_violin_combined(data, output_dir, dpi))
    output_files.append(plot_thread_channel_interaction(data, output_dir, dpi))

    if verbose:
        print(f"  Generated {len(output_files)} GEMM plots")

    return output_files


def generate_plots(
    plot_type: str,
    output_dir: Path,
    excel_input: Optional[Path] = None,
    gemm_csv: Optional[Path] = None,
    dpi: int = 150,
    verbose: bool = False,
) -> Dict[str, List[Path]]:
    """
    Generate plots based on type.

    Args:
        plot_type: "summary", "gemm", or "all"
        output_dir: Output directory for PNG files
        excel_input: Path to Excel report (for summary/all)
        gemm_csv: Path to GEMM CSV (for gemm/all)
        dpi: DPI for output images
        verbose: Print progress

    Returns:
        Dict mapping category to list of generated file paths

    Raises:
        ValueError: If required inputs not provided for plot_type
        FileNotFoundError: If input files don't exist
    """
    # Apply the shared plot style once before any figures are created.
    configure_style()
    results = {}

    # "all" enters both branches, so it requires excel_input AND gemm_csv.
    if plot_type in ("summary", "all"):
        if excel_input is None:
            raise ValueError("Excel input required for summary plots")
        if not excel_input.exists():
            raise FileNotFoundError(f"Excel file not found: {excel_input}")
        results["summary"] = generate_summary_plots(excel_input, output_dir, dpi, verbose)

    # NOTE(review): with plot_type="all", summary plots are generated before
    # the GEMM CSV is validated, so a missing CSV leaves partial output behind.
    if plot_type in ("gemm", "all"):
        if gemm_csv is None:
            raise ValueError("GEMM CSV required for gemm plots")
        if not gemm_csv.exists():
            raise FileNotFoundError(f"CSV file not found: {gemm_csv}")
        results["gemm"] = generate_gemm_plots(gemm_csv, output_dir, dpi, verbose)

    return results
```

---

## 6.
Data Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ generate plots │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ --type summary │ +│ ──────────────── │ +│ INPUT: final_report.xlsx │ +│ ├── Summary_Dashboard → summary_dashboard.py │ +│ ├── GPU_ByRank_Cmp → gpu_by_rank.py, gpu_percent_change.py, gpu_heatmap.py│ +│ └── NCCL_ImplicitSyncCmp → nccl_charts.py │ +│ │ +│ OUTPUT: ./plots/ (13 files) │ +│ ├── improvement_chart.png │ +│ ├── abs_time_comparison.png │ +│ ├── {metric}_by_rank.png (4 files) │ +│ ├── gpu_time_change_percentage_summary_by_rank.png │ +│ ├── gpu_time_heatmap.png │ +│ └── NCCL_*.png (5 files) │ +│ │ +│ --type gemm │ +│ ────────────── │ +│ INPUT: gemm_variance.csv │ +│ └── gemm_data.py → gemm_boxplots.py, gemm_violin.py, gemm_interaction.py │ +│ │ +│ OUTPUT: ./plots/ (5 files) │ +│ ├── variance_by_threads_boxplot.png │ +│ ├── variance_by_channels_boxplot.png │ +│ ├── variance_by_ranks_boxplot.png │ +│ ├── variance_violin_combined.png │ +│ └── variance_thread_channel_interaction.png │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Implementation Order + +| Phase | Task | Est. 
Time | +|-------|------|-----------| +| **1** | Create `plot_helper/` package structure | 5 min | +| **2** | Implement `common.py` | 10 min | +| **3** | Implement `summary_dashboard.py` | 15 min | +| **4** | Implement `gpu_by_rank.py` | 10 min | +| **5** | Implement `gpu_percent_change.py` | 10 min | +| **6** | Implement `gpu_heatmap.py` | 10 min | +| **7** | Implement `nccl_charts.py` | 20 min | +| **8** | Implement `gemm_data.py` | 10 min | +| **9** | Implement `gemm_boxplots.py` | 15 min | +| **10** | Implement `gemm_violin.py` | 15 min | +| **11** | Implement `gemm_interaction.py` | 10 min | +| **12** | Implement `plot_helper/__init__.py` | 5 min | +| **13** | Implement `plot_generator.py` orchestrator | 15 min | +| **14** | Update `generators/__init__.py` | 5 min | +| **15** | Update CLI in `cli.py` | 15 min | +| **16** | Testing | 20 min | + +**Total estimated time: ~3 hours** + +--- + +## 8. Output Files + +### Summary Plots (13 files) + +| File | Source Module | Description | +|------|---------------|-------------| +| `improvement_chart.png` | `summary_dashboard.py` | Horizontal bar chart | +| `abs_time_comparison.png` | `summary_dashboard.py` | Grouped bar chart | +| `total_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `computation_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `total_comm_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `idle_time_by_rank.png` | `gpu_by_rank.py` | Line plot | +| `gpu_time_change_percentage_summary_by_rank.png` | `gpu_percent_change.py` | 2×4 grid | +| `gpu_time_heatmap.png` | `gpu_heatmap.py` | Seaborn heatmap | +| `NCCL_Communication_Latency_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Algorithm_Bandwidth_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Bus_Bandwidth_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Total_Communication_Latency_comparison.png` | `nccl_charts.py` | Grouped bars | +| `NCCL_Performance_Percentage_Change_comparison.png` | 
`nccl_charts.py` | 1×3 grid | + +### GEMM Plots (5 files) + +| File | Source Module | Description | +|------|---------------|-------------| +| `variance_by_threads_boxplot.png` | `gemm_boxplots.py` | Boxplot | +| `variance_by_channels_boxplot.png` | `gemm_boxplots.py` | Boxplot | +| `variance_by_ranks_boxplot.png` | `gemm_boxplots.py` | Boxplot | +| `variance_violin_combined.png` | `gemm_violin.py` | 1×3 violin | +| `variance_thread_channel_interaction.png` | `gemm_interaction.py` | Line plot | + +--- + +## Appendix A: Design Decisions + +1. **Modular Structure:** One file per logical group of plots (~50-120 lines each) +2. **Plot Types:** `summary` and `gemm` as requested +3. **Internal Package:** `plot_helper/` keeps implementation details separate from public API +4. **Thin Orchestrator:** `plot_generator.py` imports from `plot_helper/` and provides CLI-facing API +5. **Consistent Style:** All plots use shared `common.py` utilities +6. **Easy Extension:** Adding new plot types = new file in `plot_helper/` diff --git a/src/aorta/report/PIPELINE_DEV_DOCS.md b/src/aorta/report/PIPELINE_DEV_DOCS.md new file mode 100644 index 0000000..821023f --- /dev/null +++ b/src/aorta/report/PIPELINE_DEV_DOCS.md @@ -0,0 +1,1079 @@ +# Pipeline Commands - Developer Documentation + +**Version:** 1.0 +**Date:** January 2026 +**Status:** ✅ Implemented + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Pipeline Summary](#2-pipeline-summary) +3. [Pipeline GEMM](#3-pipeline-gemm) +4. [Implementation Architecture](#4-implementation-architecture) +5. [Module Details](#5-module-details) +6. [Implementation Plan](#6-implementation-plan) + +--- + +## 1. Overview + +The pipeline commands orchestrate multi-step analysis workflows, combining existing commands into end-to-end automation. 
+ +### Pipeline Commands + +| Command | Description | Steps | +|---------|-------------|-------| +| `pipeline summary` | Complete TraceLens analysis (GPU + NCCL) | 7 steps | +| `pipeline gemm` | GEMM kernel variance analysis | 3 steps | + +### Design Principles + +1. **Reuse Existing Functions**: Call existing module functions directly (no subprocess) +2. **Configurable Steps**: Enable/disable individual steps via flags +3. **Progress Reporting**: Clear step-by-step progress output +4. **Error Handling**: Continue on non-critical errors, fail fast on critical ones +5. **Dataclass Config**: Clean configuration management + +--- + +## 2. Pipeline Summary + +### 2.1 Source Script + +**Location:** `scripts/tracelens_single_config/run_full_analysis.py` (529 lines) + +### 2.2 Pipeline Steps + +| Step | Description | Existing Function | Skippable | +|------|-------------|-------------------|-----------| +| 1 | TraceLens Analysis | `analyze_single_config()` | Yes (`--skip-tracelens`) | +| 2 | Process GPU Timelines | `process_single_config()` | No | +| 3 | Compare GPU Timelines | `compare gpu_timeline` logic | Yes (`--no-gpu-timeline`) | +| 4 | Compare Collective | `compare collective` logic | Yes (`--no-collective`) | +| 5 | Generate Final Excel | `create_final_excel_report()` | Yes (`--no-final-report`) | +| 6 | Generate Plots | `generate_summary_plots()` | Yes (`--no-plots`) | +| 7 | Generate HTML | `generate_html(mode="performance")` | Yes (`--no-html`) | + +### 2.3 CLI Specification + +```bash +aorta-report pipeline summary \ + -b/--baseline # Required: Baseline trace directory + -t/--test # Required: Test trace directory + -o/--output # Required: Output directory + [--baseline-label