diff --git a/docs/QUICK_RUN_GUIDE.md b/docs/QUICK_RUN_GUIDE.md
new file mode 100644
index 00000000..14b29d7b
--- /dev/null
+++ b/docs/QUICK_RUN_GUIDE.md
@@ -0,0 +1,224 @@
+# Quick Run Guide: aorta-report Pipelines
+
+This guide demonstrates how to use the `aorta-report` CLI to analyze PyTorch profiler traces.
+
+---
+
+## 1. Input Directory Structures
+
+### GEMM Sweep Directory (`gemm-sweep/`)
+
+Used for analyzing GEMM kernel variance across multiple thread/channel configurations.
+
+```
+experiments/2026-01-10/gemm-sweep/
+├── 256thread/
+│   ├── nccl_28channels/
+│   │   └── torch_profiler/
+│   │       ├── rank0/trace/pt.trace.json
+│   │       ├── rank1/trace/pt.trace.json
+│   │       └── ... (rank2-7)
+│   └── nccl_56channels/
+│       └── torch_profiler/
+│           └── rank*/trace/pt.trace.json
+├── 512thread/
+│   ├── nccl_28channels/
+│   │   └── torch_profiler/rank*/...
+│   └── nccl_56channels/
+│       └── torch_profiler/rank*/...
+└── tracelens_analysis/           # Generated by TraceLens
+    ├── 256thread/individual_reports/
+    └── 512thread/individual_reports/
+```
+
+### RCCL Warp Speed Directory (`rccl-warp-speed/`)
+
+Used for comparing baseline vs test configurations (A/B comparison).
+
+```
+experiments/2026-01-10/rccl-warp-speed/
+├── 32cu_512threads/              # Baseline configuration
+│   ├── torch_profiler/
+│   │   ├── rank0/*.json
+│   │   ├── rank1/*.json
+│   │   └── ... (rank2-7)
+│   └── tracelens_analysis/       # Generated by TraceLens
+│       ├── individual_reports/
+│       └── collective_reports/
+├── 37cu_384threads/              # Test configuration
+│   ├── torch_profiler/rank*/...
+│   └── tracelens_analysis/...
+└── 56cu_256threads/              # Another configuration
+    └── ...
+```
+
+---
+
+## 2. Pipeline Commands
+
+### GEMM Variance Analysis Pipeline
+
+Analyzes GEMM kernel time variance across thread/channel configurations.
+
+```bash
+aorta-report pipeline gemm \
+    --sweep-dir ./experiments/2026-01-10/gemm-sweep/ \
+    -o ./comparison_gemm_1/
+```
+
+**Options:**
+- `--sweep-dir` - Path to sweep directory with thread/channel subdirectories
+- `-o, --output` - Output directory for results
+- `--skip-tracelens` - Skip the TraceLens step and reuse existing TraceLens reports
+- `--top-k` - Number of top GEMM kernels to extract (default: 5)
+- `-t, --threads` - Thread configs to analyze (default: 256, 512)
+- `-c, --channels` - Channel configs to analyze (default: 28, 42, 56, 70)
+- `--no-plots` - Skip plot generation
+- `--no-html` - Skip HTML report generation
+
+**Example with options:**
+```bash
+aorta-report pipeline gemm \
+    --sweep-dir ./experiments/2026-01-10/gemm-sweep/ \
+    -o ./comparison_gemm/ \
+    --skip-tracelens \
+    --top-k 10 \
+    -t 256 -t 512 \
+    -c 28 -c 56
+```
+
+---
+
+### Summary Comparison Pipeline
+
+Compares two configurations (baseline vs test) with comprehensive analysis.
+
+```bash
+aorta-report pipeline summary \
+    --baseline ./experiments/2026-01-10/rccl-warp-speed/32cu_512threads/ \
+    --test ./experiments/2026-01-10/rccl-warp-speed/37cu_384threads/ \
+    --baseline-label 32c_512t \
+    --test-label 37c_384t \
+    --output ./comparison_rccl/
+```
+
+**Options:**
+- `--baseline` - Path to baseline trace directory
+- `--test` - Path to test trace directory
+- `--baseline-label` - Label for baseline in reports
+- `--test-label` - Label for test in reports
+- `--output` - Output directory for results
+- `--skip-tracelens` - Skip TraceLens analysis if reports already exist
+- `--gpu-timeline/--no-gpu-timeline` - Include GPU timeline comparison
+- `--collective/--no-collective` - Include collective/NCCL comparison
+
+**Example with options:**
+```bash
+aorta-report pipeline summary \
+    --baseline ./experiments/2026-01-10/rccl-warp-speed/32cu_512threads/ \
+    --test ./experiments/2026-01-10/rccl-warp-speed/56cu_256threads/ \
+    --baseline-label baseline_32cu \
+    --test-label test_56cu \
+    --output ./comparison_output/ \
+    --skip-tracelens
+```
+
+---
+
+## 3. Output Directory Structures
+
+### GEMM Pipeline Output (`comparison_gemm_1/`)
+
+```
+comparison_gemm_1/
+├── top5_gemm_kernels_time_variance.csv           # Raw GEMM variance data
+├── top5_gemm_kernels_time_variance_with_timestamps.csv  # Enhanced with timestamps
+├── plots/
+│   ├── variance_by_threads_boxplot.png           # Variance by thread config
+│   ├── variance_by_channels_boxplot.png          # Variance by channel config
+│   ├── variance_by_ranks_boxplot.png             # Variance by rank
+│   ├── variance_thread_channel_interaction.png   # Thread × Channel interaction
+│   └── variance_violin_combined.png              # Combined violin plot
+└── gemm_variance_report.html                     # Self-contained HTML report
+```
+
+**Key outputs:**
+- **CSV files**: Raw data for further analysis
+- **Boxplots**: Identify which configs have highest variance
+- **HTML report**: Share with team (includes all plots embedded)
+
+---
+
+### Summary Pipeline Output (`comparison_rccl/`)
+
+```
+comparison_rccl/
+├── gpu_timeline_comparison.xlsx                  # GPU timeline comparison
+├── gpu_timeline_combined.xlsx                    # Combined timeline data
+├── collective_comparison.xlsx                    # NCCL collective comparison
+├── collective_combined.xlsx                      # Combined collective data
+├── final_analysis_report.xlsx                    # Comprehensive analysis
+├── plots/
+│   ├── abs_time_comparison.png                   # Absolute time comparison
+│   ├── computation_time_by_rank.png              # Computation time per rank
+│   ├── idle_time_by_rank.png                     # Idle time per rank
+│   ├── total_time_by_rank.png                    # Total time per rank
+│   ├── total_comm_time_by_rank.png               # Communication time per rank
+│   ├── gpu_time_heatmap.png                      # GPU time heatmap
+│   ├── gpu_time_change_percentage_summary_by_rank.png  # % change summary
+│   ├── improvement_chart.png                     # Overall improvement chart
+│   ├── NCCL_Algorithm_Bandwidth_comparison.png   # NCCL bandwidth comparison
+│   ├── NCCL_Bus_Bandwidth_comparison.png         # Bus bandwidth comparison
+│   ├── NCCL_Communication_Latency_comparison.png # Latency comparison
+│   ├── NCCL_Total_Communication_Latency_comparison.png
+│   └── NCCL_Performance_Percentage_Change_comparison.png
+└── performance_analysis_report.html              # Self-contained HTML report
+```
+
+**Key outputs:**
+- **Excel files**: Detailed data for spreadsheet analysis
+- **Plots**: Visual comparisons between baseline and test
+- **HTML report**: Share comprehensive results with team
+
+---
+
+## 4. Quick Start Examples
+
+### Analyze a new sweep directory
+```bash
+# Full pipeline (runs TraceLens + GEMM analysis)
+aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output/
+
+# If TraceLens was already run
+aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output/ --skip-tracelens
+```
+
+### Compare two configurations
+```bash
+# Full comparison (runs TraceLens + comparison)
+aorta-report pipeline summary \
+    --baseline /path/to/baseline \
+    --test /path/to/test \
+    --baseline-label "Baseline" \
+    --test-label "Test" \
+    --output ./comparison/
+```
+
+### Run only TraceLens analysis
+```bash
+# Single configuration
+aorta-report analyze single /path/to/traces
+
+# Sweep directory (multiple configs)
+aorta-report analyze sweep /path/to/sweep
+```
+
+---
+
+## 5. Tips
+
+1. **First run**: Let the pipeline run TraceLens (don't use `--skip-tracelens`)
+2. **Subsequent runs**: Use `--skip-tracelens` to save time
+3. **Large datasets**: Use `--no-plots --no-html` for faster processing
+4. **Custom analysis**: Use the CSV/Excel outputs for custom visualization
+
diff --git a/scripts/gemm_analysis/run_tracelens_analysis.sh b/scripts/gemm_analysis/run_tracelens_analysis.sh
index c3b654c1..1154ec9d 100755
--- a/scripts/gemm_analysis/run_tracelens_analysis.sh
+++ b/scripts/gemm_analysis/run_tracelens_analysis.sh
@@ -264,7 +264,10 @@ else
             # trace file in the rank folder to the canonical `trace/pt.trace.json` path.
             # This will satisfy TraceLens's requirement of only one `*` being present in the trace pattern
             # while also avoiding FileNotFoundErrors due to different filenames.
-            find $TRACE_DIR/rank* -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \;
+            # Only move JSON files directly inside each rank directory. Bounding the depth keeps this
+            # idempotent (rank*/trace/pt.trace.json is never revisited) and, unlike a "*/trace/*"
+            # path filter, still works when $TRACE_DIR itself contains a "trace" path component.
+            find "$TRACE_DIR"/rank* -mindepth 1 -maxdepth 1 -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \;
 
             TraceLens_generate_multi_rank_collective_report_pytorch \
                 --trace_pattern "$TRACE_DIR/rank*/trace/pt.trace.json" \
diff --git a/src/aorta/report/analysis/__init__.py b/src/aorta/report/analysis/__init__.py
index e0bd3246..d8fc0031 100644
--- a/src/aorta/report/analysis/__init__.py
+++ b/src/aorta/report/analysis/__init__.py
@@ -3,12 +3,13 @@
 from .tracelens_wrapper import TraceLensWrapper
 from .analyze_gemm import analyze_gemm_reports
 from .analyze_single import analyze_single_config
-from .analyze_sweep import analyze_sweep_config
+from .analyze_sweep import analyze_sweep_config, discover_and_run_tracelens
 
 __all__ = [
     "TraceLensWrapper",
     "analyze_gemm_reports",
     "analyze_single_config",
     "analyze_sweep_config",
+    "discover_and_run_tracelens",
 ]
 
diff --git a/src/aorta/report/analysis/analyze_gemm.py b/src/aorta/report/analysis/analyze_gemm.py
index a724019e..a8112022 100644
--- a/src/aorta/report/analysis/analyze_gemm.py
+++ b/src/aorta/report/analysis/analyze_gemm.py
@@ -38,12 +38,48 @@ def extract_name_from_kernel_info(kernel_info_str: str) -> Optional[str]:
         return None
 
 
-def column_letter_to_index(letter: str) -> int:
-    """Convert Excel column letter to 0-based index."""
-    index = 0
-    for i, char in enumerate(reversed(letter.upper())):
-        index += (ord(char) - ord("A") + 1) * (26**i)
-    return index - 1
+def find_column_indices(
+    header_row: List[Any],
+    required_columns: Dict[str, str],
+) -> Dict[str, int]:
+    """
+    Find column indices by matching column names in header row.
+
+    Args:
+        header_row: List of column header values
+        required_columns: Dict mapping logical names to expected column names
+                         e.g., {"kernel_info": "kernel_details__summarize_kernel_stats"}
+
+    Returns:
+        Dict mapping logical names to column indices (0-based)
+
+    Raises:
+        ValueError: If any required column is not found
+    """
+    # Create a mapping of column name -> index
+    header_map = {}
+    for idx, col_name in enumerate(header_row):
+        if col_name is not None:
+            header_map[str(col_name)] = idx
+
+    # Find indices for required columns
+    column_indices = {}
+    missing_columns = []
+
+    for logical_name, expected_name in required_columns.items():
+        if expected_name in header_map:
+            column_indices[logical_name] = header_map[expected_name]
+        else:
+            missing_columns.append(f"'{expected_name}' (for {logical_name})")
+
+    if missing_columns:
+        available = list(header_map.keys())[:20]  # Show first 20 columns
+        raise ValueError(
+            f"Required columns not found: {', '.join(missing_columns)}\n"
+            f"Available columns (first 20): {available}"
+        )
+
+    return column_indices
 
 
 def process_excel_file(
@@ -66,6 +102,13 @@ def process_excel_file(
     Returns:
         List of dictionaries containing kernel data
     """
+    # Define required columns by their expected names
+    REQUIRED_COLUMNS = {
+        "kernel_info": "kernel_details__summarize_kernel_stats",
+        "time_min": "Kernel Time (µs)_min",
+        "time_max": "Kernel Time (µs)_max",
+    }
+
     try:
         # Open the workbook
         wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
@@ -77,62 +120,24 @@ def process_excel_file(
 
         sheet = wb["GEMM"]
 
-        # Expected column positions (0-based indices)
-        col_kernel_info = column_letter_to_index("X")  # Column X
-        col_time_min = column_letter_to_index("AG")  # Column AG
-        col_time_max = column_letter_to_index("AH")  # Column AH
-
-        # Read header row to validate column names
         rows_data = []
         header_row = None
+        col_indices = None
 
         for i, row in enumerate(sheet.iter_rows(values_only=True)):
             if i == 0:
-                # This is the header - validate column names match expectations
+                # Parse header row and find column indices dynamically
                 header_row = list(row)
-
-                # Expected column names (match what TraceLens generates)
-                expected_x = "kernel_details__summarize_kernel_stats"
-                expected_ag = "Kernel Time (µs)_min"
-                expected_ah = "Kernel Time (µs)_max"
-
-                # Validate each expected column
-                errors = []
-
-                if col_kernel_info < len(header_row):
-                    header_x = str(header_row[col_kernel_info]) if header_row[col_kernel_info] else ""
-                    if header_x != expected_x:
-                        errors.append(f"Column X: expected '{expected_x}', found '{header_x}'")
-                else:
-                    errors.append(f"Column X: not found (only {len(header_row)} columns)")
-
-                if col_time_min < len(header_row):
-                    header_ag = str(header_row[col_time_min]) if header_row[col_time_min] else ""
-                    if header_ag != expected_ag:
-                        errors.append(f"Column AG: expected '{expected_ag}', found '{header_ag}'")
-                else:
-                    errors.append(f"Column AG: not found (only {len(header_row)} columns)")
-
-                if col_time_max < len(header_row):
-                    header_ah = str(header_row[col_time_max]) if header_row[col_time_max] else ""
-                    if header_ah != expected_ah:
-                        errors.append(f"Column AH: expected '{expected_ah}', found '{header_ah}'")
-                else:
-                    errors.append(f"Column AH: not found (only {len(header_row)} columns)")
-
-                if errors:
-                    raise ValueError(
-                        f"Column validation failed in {file_path}:\n  " + "\n  ".join(errors)
-                    )
-
+                col_indices = find_column_indices(header_row, REQUIRED_COLUMNS)
                 continue
 
-            if row is None or len(row) <= max(col_kernel_info, col_time_min, col_time_max):
+            if row is None or col_indices is None:
                 continue
 
-            kernel_info = row[col_kernel_info] if col_kernel_info < len(row) else None
-            kernel_time_min = row[col_time_min] if col_time_min < len(row) else None
-            kernel_time_max = row[col_time_max] if col_time_max < len(row) else None
+            # Extract values using dynamically found indices
+            kernel_info = row[col_indices["kernel_info"]] if col_indices["kernel_info"] < len(row) else None
+            kernel_time_min = row[col_indices["time_min"]] if col_indices["time_min"] < len(row) else None
+            kernel_time_max = row[col_indices["time_max"]] if col_indices["time_max"] < len(row) else None
 
             # Extract kernel name
             kernel_name = extract_name_from_kernel_info(kernel_info)
diff --git a/src/aorta/report/analysis/analyze_single.py b/src/aorta/report/analysis/analyze_single.py
index f961cf33..869fa138 100644
--- a/src/aorta/report/analysis/analyze_single.py
+++ b/src/aorta/report/analysis/analyze_single.py
@@ -34,14 +34,15 @@ def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]:
         ValueError: If directory structure cannot be determined
     """
     # Check if input_dir contains rank directories (i.e., it IS torch_profiler/)
-    rank_dirs = list(input_dir.glob("rank*"))
+    # Use is_dir() to filter out files matching rank* pattern (e.g., rank0.log)
+    rank_dirs = [p for p in input_dir.glob("rank*") if p.is_dir()]
     if rank_dirs:
         return input_dir, input_dir.parent
 
     # Check if input_dir contains torch_profiler/ subdirectory
     torch_prof_dir = input_dir / "torch_profiler"
     if torch_prof_dir.exists():
-        rank_dirs = list(torch_prof_dir.glob("rank*"))
+        rank_dirs = [p for p in torch_prof_dir.glob("rank*") if p.is_dir()]
         if rank_dirs:
             return torch_prof_dir, input_dir
 
@@ -55,10 +56,30 @@ def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]:
 
 
 def find_trace_file(rank_dir: Path) -> Optional[Path]:
-    """Find trace file in a rank directory."""
+    """Find trace file in a rank directory.
+    
+    Searches for JSON trace files in the following order:
+    1. Directly in rank_dir (e.g., rank0/*.json)
+    2. In trace/ subdirectory (e.g., rank0/trace/pt.trace.json)
+    3. Recursively in any subdirectory (e.g., rank0/**/*.json)
+    """
+    # First, look directly in the rank directory
     json_files = list(rank_dir.glob("*.json"))
     if json_files:
         return json_files[0]
+    
+    # Then check trace/ subdirectory (common after collective report prep)
+    trace_subdir = rank_dir / "trace"
+    if trace_subdir.exists():
+        json_files = list(trace_subdir.glob("*.json"))
+        if json_files:
+            return json_files[0]
+    
+    # Finally, search recursively
+    json_files = list(rank_dir.glob("**/*.json"))
+    if json_files:
+        return json_files[0]
+    
     return None
 
 
@@ -155,6 +176,7 @@ def analyze_single_config(
     short_kernel_threshold_us: int = 50,
     topk_ops: int = 100,
     verbose: bool = False,
+    output_prefix: Optional[str] = None,
 ) -> dict:
     """
     Run TraceLens analysis on a single configuration trace directory.
@@ -169,6 +191,7 @@ def analyze_single_config(
         short_kernel_threshold_us: Threshold for short kernel study
         topk_ops: Number of top operations to include
         verbose: Whether to print verbose output
+        output_prefix: Custom prefix for output files (e.g., "28ch" -> perf_28ch_rank0.xlsx)
 
     Returns:
         Dictionary with paths to generated reports
@@ -239,7 +262,11 @@ def analyze_single_config(
                 print(f"  Skip {rank_name} - no trace file found")
                 continue
 
-            output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx"
+            # Use custom prefix if provided (for sweep mode), otherwise default naming
+            if output_prefix:
+                output_file = individual_reports_dir / f"perf_{output_prefix}_rank{rank_num}.xlsx"
+            else:
+                output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx"
 
             print(f"\nProcessing {rank_name}...")
             print(f"  Trace: {trace_file.name}")
@@ -274,8 +301,11 @@ def analyze_single_config(
                 symlink_path = rank_dir / "trace.json"
                 if not symlink_path.exists():
                     try:
-                        symlink_path.symlink_to(trace_file.name)
-                    except (OSError, FileExistsError):
+                        # Use relative path from rank_dir to trace_file
+                        # This handles cases where trace is in subdirectory (e.g., trace/pt.trace.json)
+                        relative_path = trace_file.relative_to(rank_dir)
+                        symlink_path.symlink_to(relative_path)
+                    except (OSError, FileExistsError, ValueError):
                         pass  # Symlink already exists or cannot be created
 
         trace_pattern = str(torch_prof_dir / "rank*" / "trace.json")
diff --git a/src/aorta/report/analysis/analyze_sweep.py b/src/aorta/report/analysis/analyze_sweep.py
index 9626769a..5db88549 100644
--- a/src/aorta/report/analysis/analyze_sweep.py
+++ b/src/aorta/report/analysis/analyze_sweep.py
@@ -3,15 +3,131 @@
 
 Processes GPU timeline data from TraceLens individual reports across multiple
 thread and channel configurations, aggregating across ranks.
+
+Supports two modes:
+1. Run TraceLens on all configurations then aggregate (default)
+2. Aggregate existing TraceLens reports only (--skip-tracelens)
 """
 
 import glob
+import re
 from pathlib import Path
 from typing import Dict, List, Optional, Any
 
 import numpy as np
 import pandas as pd
 
+from .analyze_single import analyze_single_config
+
+
+def discover_and_run_tracelens(
+    sweep_dir: Path,
+    short_kernel_threshold_us: int = 50,
+    topk_ops: int = 100,
+    verbose: bool = False,
+) -> Path:
+    """
+    Discover thread/channel configs and run TraceLens on each.
+
+    Expected input structure:
+        sweep_dir/
+        ├── 256thread/
+        │   ├── nccl_28channels/
+        │   │   └── torch_profiler/rank*/
+        │   └── nccl_42channels/
+        └── 512thread/
+            └── ...
+
+    Output structure:
+        sweep_dir/
+        └── tracelens_analysis/
+            ├── 256thread/
+            │   └── individual_reports/
+            │       ├── perf_28ch_rank0.xlsx
+            │       └── ...
+            └── 512thread/
+                └── ...
+
+    Args:
+        sweep_dir: Path to sweep directory with thread/channel subdirectories
+        short_kernel_threshold_us: Threshold for short kernel study
+        topk_ops: Number of top operations to include
+        verbose: Whether to print verbose output
+
+    Returns:
+        Path to tracelens_analysis output directory
+    """
+    sweep_path = Path(sweep_dir)
+    output_base = sweep_path / "tracelens_analysis"
+
+    # Discover thread configurations (e.g., "256thread", "512thread")
+    thread_dirs = sorted([
+        d for d in sweep_path.iterdir()
+        if d.is_dir() and "thread" in d.name
+    ])
+
+    if not thread_dirs:
+        raise ValueError(f"No thread configurations found in {sweep_dir}")
+
+    print("=" * 80)
+    print("Step 0: Running TraceLens on All Configurations")
+    print("=" * 80)
+    print(f"\nDiscovered thread configs: {[d.name for d in thread_dirs]}")
+
+    for thread_dir in thread_dirs:
+        thread_name = thread_dir.name  # e.g., "256thread"
+
+        # Find channel configs (e.g., "nccl_28channels")
+        channel_dirs = sorted([
+            d for d in thread_dir.iterdir()
+            if d.is_dir() and "channel" in d.name
+        ])
+
+        if not channel_dirs:
+            print(f"  [WARN] No channel configs in {thread_name}")
+            continue
+
+        print(f"\n{thread_name}: {[d.name for d in channel_dirs]}")
+
+        for channel_dir in channel_dirs:
+            # Extract channel number (e.g., "nccl_28channels" -> "28")
+            channel_name = channel_dir.name
+            channel_match = re.search(r"(\d+)", channel_name)
+            channel_num = channel_match.group(1) if channel_match else "0"
+
+            # Look for torch_profiler directory
+            trace_dir = channel_dir / "torch_profiler"
+            if not trace_dir.exists():
+                print(f"    [SKIP] {channel_name} - no torch_profiler/")
+                continue
+
+            # Output to: tracelens_analysis/{thread}/individual_reports/
+            output_dir = output_base / thread_name
+
+            print(f"  Processing {channel_name}...")
+
+            try:
+                analyze_single_config(
+                    input_dir=trace_dir,
+                    output_dir=output_dir,
+                    run_individual=True,
+                    run_collective=False,  # Skip collective for sweep
+                    aggregate_timeline=False,  # Will aggregate at sweep level
+                    short_kernel_threshold_us=short_kernel_threshold_us,
+                    topk_ops=topk_ops,
+                    verbose=verbose,
+                    output_prefix=f"{channel_num}ch",  # e.g., "28ch"
+                )
+                print(f"    [OK] {channel_name}")
+            except Exception as e:
+                print(f"    [ERROR] {channel_name}: {e}")
+
+    print("\n" + "=" * 80)
+    print("TraceLens Analysis Complete")
+    print("=" * 80)
+
+    return output_base
+
 
 def geometric_mean(values: np.ndarray) -> float:
     """Calculate geometric mean, handling zeros."""
@@ -300,15 +416,25 @@ def analyze_sweep_config(
     sweep_dir: Path,
     output_dir: Optional[Path] = None,
     use_geo_mean: bool = False,
+    skip_tracelens: bool = False,
+    short_kernel_threshold_us: int = 50,
+    topk_ops: int = 100,
     verbose: bool = False,
 ) -> Optional[Path]:
     """
-    Process GPU timeline data from all individual reports in a sweep.
+    Analyze a sweep directory: run TraceLens on all configs and aggregate results.
+
+    By default, runs TraceLens analysis on all thread/channel configurations first,
+    then aggregates GPU timeline data. Use skip_tracelens=True to only aggregate
+    existing reports.
 
     Args:
-        sweep_dir: Path to sweep directory containing tracelens_analysis/
+        sweep_dir: Path to sweep directory with thread/channel subdirectories
         output_dir: Output directory (default: sweep_dir/tracelens_analysis/)
         use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean
+        skip_tracelens: If True, skip TraceLens analysis (only aggregate existing)
+        short_kernel_threshold_us: Threshold for short kernel study
+        topk_ops: Number of top operations to include
         verbose: Whether to print verbose output
 
     Returns:
@@ -317,6 +443,16 @@ def analyze_sweep_config(
     sweep_path = Path(sweep_dir)
     tracelens_dir = sweep_path / "tracelens_analysis"
 
+    # Step 1: Run TraceLens on all configurations (unless skipped)
+    if not skip_tracelens:
+        discover_and_run_tracelens(
+            sweep_dir=sweep_path,
+            short_kernel_threshold_us=short_kernel_threshold_us,
+            topk_ops=topk_ops,
+            verbose=verbose,
+        )
+
+    # Step 2: Aggregate results
     if not tracelens_dir.exists():
         raise FileNotFoundError(
             f"tracelens_analysis directory not found in {sweep_dir}"
diff --git a/src/aorta/report/analysis/cli.py b/src/aorta/report/analysis/cli.py
index 232f8d16..e51205c3 100644
--- a/src/aorta/report/analysis/cli.py
+++ b/src/aorta/report/analysis/cli.py
@@ -75,18 +75,43 @@ def analyze_single(ctx, trace_dir, individual_only, collective_only, geo_mean,
 
 @analyze.command("sweep")
 @click.argument("sweep_dir", type=click.Path(exists=True))
+@click.option("--skip-tracelens", is_flag=True,
+              help="Skip TraceLens analysis, only aggregate existing reports")
 @click.option("--geo-mean", is_flag=True, help="Use geometric mean instead of arithmetic mean")
+@click.option("--short-kernel-threshold", default=50, type=int,
+              help="Threshold for short kernel study (microseconds)")
+@click.option("--topk-ops", default=100, type=int,
+              help="Number of top operations to include")
 @click.option("-o", "--output", type=click.Path(), help="Output directory")
 @click.pass_context
-def analyze_sweep(ctx, sweep_dir, geo_mean, output):
+def analyze_sweep(ctx, sweep_dir, skip_tracelens, geo_mean, short_kernel_threshold,
+                  topk_ops, output):
     """Analyze a sweep directory with multiple configurations.
 
-    SWEEP_DIR: Path to the sweep directory containing tracelens_analysis/
-    with multiple thread/channel configs.
+    SWEEP_DIR: Path to the sweep directory containing thread/channel subdirectories.
+
+    By default, runs TraceLens analysis on all configurations first, then
+    aggregates the results. Use --skip-tracelens to only aggregate existing reports.
+
+    \b
+    Expected directory structure:
+      sweep_dir/
+      ├── 256thread/
+      │   ├── nccl_28channels/
+      │   │   └── torch_profiler/rank*/
+      │   └── nccl_42channels/
+      └── 512thread/
+          └── ...
 
     \b
     Examples:
+      # Run TraceLens + aggregate (default)
       aorta-report analyze sweep /path/to/sweep_20251124
+
+      # Only aggregate existing reports
+      aorta-report analyze sweep /path/to/sweep --skip-tracelens
+
+      # With geometric mean aggregation
       aorta-report analyze sweep /path/to/sweep --geo-mean
     """
     from . import analyze_sweep_config
@@ -99,6 +124,9 @@ def analyze_sweep(ctx, sweep_dir, geo_mean, output):
             sweep_dir=Path(sweep_dir),
             output_dir=Path(output) if output else None,
             use_geo_mean=geo_mean,
+            skip_tracelens=skip_tracelens,
+            short_kernel_threshold_us=short_kernel_threshold,
+            topk_ops=topk_ops,
             verbose=verbose,
         )
         if not quiet and output_path:
diff --git a/src/aorta/report/pipelines/cli.py b/src/aorta/report/pipelines/cli.py
index a8059968..e8cb79d7 100644
--- a/src/aorta/report/pipelines/cli.py
+++ b/src/aorta/report/pipelines/cli.py
@@ -165,12 +165,29 @@ def pipeline_summary(
     "--sweep-dir",
     required=True,
     type=click.Path(exists=True),
-    help="Sweep directory containing tracelens_analysis/",
+    help="Sweep directory with thread/channel subdirectories",
 )
 @click.option(
     "-o", "--output", required=True, type=click.Path(), help="Output directory for results"
 )
-@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract (default: 5)")
+@click.option(
+    "--skip-tracelens",
+    is_flag=True,
+    help="Skip TraceLens analysis, use existing reports",
+)
+@click.option(
+    "--short-kernel-threshold",
+    default=50,
+    type=int,
+    help="Threshold for short kernel study in microseconds (default: 50)",
+)
+@click.option(
+    "--topk-ops",
+    default=100,
+    type=int,
+    help="Number of top operations to include in TraceLens analysis (default: 100)",
+)
+@click.option("--top-k", default=5, type=int, help="Number of top GEMM kernels to extract (default: 5)")
 @click.option(
     "--threads",
     "-t",
@@ -193,22 +210,29 @@ def pipeline_summary(
 @click.option("--plots/--no-plots", default=True, help="Generate plots (default: True)")
 @click.option("--html/--no-html", default=True, help="Generate HTML report (default: True)")
 @click.pass_context
-def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, plots, html):
+def pipeline_gemm(ctx, sweep_dir, output, skip_tracelens, short_kernel_threshold, topk_ops,
+                  top_k, threads, channels, timestamps, plots, html):
     """Run GEMM variance analysis pipeline.
 
-    Analyzes GEMM kernel time variance across configurations:
+    By default, runs TraceLens on all configurations first, then analyzes GEMM kernels.
+    Use --skip-tracelens to use existing TraceLens reports.
 
     \b
-    1. Analyze GEMM reports to extract top-K kernels with highest variance
-    2. Enhance with timestamps (optional)
-    3. Generate variance plots (optional)
-    4. Generate HTML report (optional)
+    Steps:
+      0. Run TraceLens on sweep (default, skip with --skip-tracelens)
+      1. Analyze GEMM reports to extract top-K kernels with highest variance
+      2. Enhance with timestamps (optional)
+      3. Generate variance plots (optional)
+      4. Generate HTML report (optional)
 
     \b
     Examples:
-      # Full pipeline
+      # Full pipeline (runs TraceLens + GEMM analysis)
       aorta-report pipeline gemm --sweep-dir /path/to/sweep -o /path/to/output
 
+      # Skip TraceLens, use existing reports
+      aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --skip-tracelens
+
       # Custom top-k
       aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --top-k 10
 
@@ -234,6 +258,9 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps,
         top_k=top_k,
         threads=list(threads),
         channels=list(channels),
+        skip_tracelens=skip_tracelens,
+        short_kernel_threshold_us=short_kernel_threshold,
+        topk_ops=topk_ops,
         timestamps=timestamps,
         plots=plots,
         html=html,
@@ -246,7 +273,8 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps,
         click.echo("=" * 60)
         click.echo(f"Sweep dir: {sweep_dir}")
         click.echo(f"Output: {output}")
-        click.echo(f"Top-K: {top_k}")
+        click.echo(f"TraceLens: {'skip' if skip_tracelens else 'run'}")
+        click.echo(f"Top-K GEMM kernels: {top_k}")
         click.echo(f"Threads: {list(threads)}")
         click.echo(f"Channels: {list(channels)}")
         click.echo(f"Options: timestamps={timestamps}, plots={plots}, html={html}")
diff --git a/src/aorta/report/pipelines/gemm_pipeline.py b/src/aorta/report/pipelines/gemm_pipeline.py
index f9ee58ab..3be4dda5 100644
--- a/src/aorta/report/pipelines/gemm_pipeline.py
+++ b/src/aorta/report/pipelines/gemm_pipeline.py
@@ -1,9 +1,11 @@
 """GEMM variance analysis pipeline.
 
 Orchestrates GEMM kernel variance analysis:
+0. Run TraceLens on sweep (optional, default: enabled)
 1. Analyze GEMM Reports
 2. Enhance with Timestamps (optional)
 3. Generate GEMM Plots (optional)
+4. Generate HTML Report (optional)
 """
 
 from pathlib import Path
@@ -21,6 +23,9 @@ class GemmPipelineConfig:
     threads: List[int] = field(default_factory=lambda: [256, 512])
     channels: List[int] = field(default_factory=lambda: [28, 42, 56, 70])
     ranks: List[int] = field(default_factory=lambda: list(range(8)))
+    skip_tracelens: bool = False  # Run TraceLens by default
+    short_kernel_threshold_us: int = 50
+    topk_ops: int = 100
     timestamps: bool = True
     plots: bool = True
     html: bool = True
@@ -56,6 +61,12 @@ def run_gemm_pipeline(config: GemmPipelineConfig) -> GemmPipelineResult:
     config.output_dir.mkdir(parents=True, exist_ok=True)
 
     try:
+        # Step 0: Run TraceLens on sweep (unless skipped)
+        if not config.skip_tracelens:
+            _step_run_tracelens(config, result)
+        else:
+            result.steps_skipped.append("tracelens")
+
         # Step 1: Analyze GEMM Reports
         _step_analyze_gemm(config, result)
 
@@ -90,6 +101,31 @@ def run_gemm_pipeline(config: GemmPipelineConfig) -> GemmPipelineResult:
     return result
 
 
+def _step_run_tracelens(config: GemmPipelineConfig, result: GemmPipelineResult) -> None:
+    """Step 0: Run TraceLens analysis on all configurations in the sweep."""
+    from ..analysis import analyze_sweep_config
+
+    if config.verbose:
+        print("\n" + "=" * 60)
+        print("STEP 0: Run TraceLens Analysis")
+        print("=" * 60)
+
+    try:
+        analyze_sweep_config(
+            sweep_dir=config.sweep_dir,
+            skip_tracelens=False,  # Always run TraceLens in this step
+            short_kernel_threshold_us=config.short_kernel_threshold_us,
+            topk_ops=config.topk_ops,
+            verbose=config.verbose,
+        )
+        result.steps_completed.append("tracelens")
+    except Exception as e:
+        # Don't fail the whole pipeline if TraceLens fails
+        # The subsequent steps will check if tracelens_analysis/ exists
+        result.errors.append(f"TraceLens analysis failed: {e}")
+        result.steps_skipped.append("tracelens (failed)")
+
+
 def _step_analyze_gemm(config: GemmPipelineConfig, result: GemmPipelineResult) -> None:
     """Step 1: Analyze GEMM reports."""
     from ..analysis import analyze_gemm_reports
diff --git a/src/aorta/source_diff.path b/src/aorta/source_diff.path
new file mode 100644
index 00000000..ecfaff04
--- /dev/null
+++ b/src/aorta/source_diff.path
@@ -0,0 +1,555 @@
+diff --git a/scripts/gemm_analysis/run_tracelens_analysis.sh b/scripts/gemm_analysis/run_tracelens_analysis.sh
+index c3b654c..1154ec9 100755
+--- a/scripts/gemm_analysis/run_tracelens_analysis.sh
++++ b/scripts/gemm_analysis/run_tracelens_analysis.sh
+@@ -264,7 +264,10 @@ else
+             # trace file in the rank folder to the canonical `trace/pt.trace.json` path.
+             # This will satisfy TraceLens's requirement of only one `*` being present in the trace pattern
+             # while also avoiding FileNotFoundErrors due to different filenames.
+-            find $TRACE_DIR/rank* -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \;
++            # OLD (not idempotent - running twice creates trace/trace/pt.trace.json):
++            # find $TRACE_DIR/rank* -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \;
++            # NEW: -not -path "*/trace/*" ensures this is idempotent (safe to run multiple times)
++            find $TRACE_DIR/rank* -name "*.json" -not -path "*/trace/*" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \;
+ 
+             TraceLens_generate_multi_rank_collective_report_pytorch \
+                 --trace_pattern "$TRACE_DIR/rank*/trace/pt.trace.json" \
+diff --git a/src/aorta/report/analysis/__init__.py b/src/aorta/report/analysis/__init__.py
+index e0bd324..d8fc003 100644
+--- a/src/aorta/report/analysis/__init__.py
++++ b/src/aorta/report/analysis/__init__.py
+@@ -3,12 +3,13 @@
+ from .tracelens_wrapper import TraceLensWrapper
+ from .analyze_gemm import analyze_gemm_reports
+ from .analyze_single import analyze_single_config
+-from .analyze_sweep import analyze_sweep_config
++from .analyze_sweep import analyze_sweep_config, discover_and_run_tracelens
+ 
+ __all__ = [
+     "TraceLensWrapper",
+     "analyze_gemm_reports",
+     "analyze_single_config",
+     "analyze_sweep_config",
++    "discover_and_run_tracelens",
+ ]
+ 
+diff --git a/src/aorta/report/analysis/analyze_gemm.py b/src/aorta/report/analysis/analyze_gemm.py
+index a724019..56fd19b 100644
+--- a/src/aorta/report/analysis/analyze_gemm.py
++++ b/src/aorta/report/analysis/analyze_gemm.py
+@@ -78,9 +78,9 @@ def process_excel_file(
+         sheet = wb["GEMM"]
+ 
+         # Expected column positions (0-based indices)
+-        col_kernel_info = column_letter_to_index("X")  # Column X
+-        col_time_min = column_letter_to_index("AG")  # Column AG
+-        col_time_max = column_letter_to_index("AH")  # Column AH
++        col_kernel_info = column_letter_to_index("Y")  # Column Y
++        col_time_min = column_letter_to_index("AH")  # Column AH
++        col_time_max = column_letter_to_index("AI")  # Column AI
+ 
+         # Read header row to validate column names
+         rows_data = []
+diff --git a/src/aorta/report/analysis/analyze_single.py b/src/aorta/report/analysis/analyze_single.py
+index f961cf3..869fa13 100644
+--- a/src/aorta/report/analysis/analyze_single.py
++++ b/src/aorta/report/analysis/analyze_single.py
+@@ -34,14 +34,15 @@ def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]:
+         ValueError: If directory structure cannot be determined
+     """
+     # Check if input_dir contains rank directories (i.e., it IS torch_profiler/)
+-    rank_dirs = list(input_dir.glob("rank*"))
++    # Use is_dir() to filter out files matching rank* pattern (e.g., rank0.log)
++    rank_dirs = [p for p in input_dir.glob("rank*") if p.is_dir()]
+     if rank_dirs:
+         return input_dir, input_dir.parent
+ 
+     # Check if input_dir contains torch_profiler/ subdirectory
+     torch_prof_dir = input_dir / "torch_profiler"
+     if torch_prof_dir.exists():
+-        rank_dirs = list(torch_prof_dir.glob("rank*"))
++        rank_dirs = [p for p in torch_prof_dir.glob("rank*") if p.is_dir()]
+         if rank_dirs:
+             return torch_prof_dir, input_dir
+ 
+@@ -55,10 +56,30 @@ def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]:
+ 
+ 
+ def find_trace_file(rank_dir: Path) -> Optional[Path]:
+-    """Find trace file in a rank directory."""
++    """Find trace file in a rank directory.
++    
++    Searches for JSON trace files in the following order:
++    1. Directly in rank_dir (e.g., rank0/*.json)
++    2. In trace/ subdirectory (e.g., rank0/trace/pt.trace.json)
++    3. Recursively in any subdirectory (e.g., rank0/**/*.json)
++    """
++    # First, look directly in the rank directory
+     json_files = list(rank_dir.glob("*.json"))
+     if json_files:
+         return json_files[0]
++    
++    # Then check trace/ subdirectory (common after collective report prep)
++    trace_subdir = rank_dir / "trace"
++    if trace_subdir.exists():
++        json_files = list(trace_subdir.glob("*.json"))
++        if json_files:
++            return json_files[0]
++    
++    # Finally, search recursively
++    json_files = list(rank_dir.glob("**/*.json"))
++    if json_files:
++        return json_files[0]
++    
+     return None
+ 
+ 
+@@ -155,6 +176,7 @@ def analyze_single_config(
+     short_kernel_threshold_us: int = 50,
+     topk_ops: int = 100,
+     verbose: bool = False,
++    output_prefix: Optional[str] = None,
+ ) -> dict:
+     """
+     Run TraceLens analysis on a single configuration trace directory.
+@@ -169,6 +191,7 @@ def analyze_single_config(
+         short_kernel_threshold_us: Threshold for short kernel study
+         topk_ops: Number of top operations to include
+         verbose: Whether to print verbose output
++        output_prefix: Custom prefix for output files (e.g., "28ch" -> perf_28ch_rank0.xlsx)
+ 
+     Returns:
+         Dictionary with paths to generated reports
+@@ -239,7 +262,11 @@ def analyze_single_config(
+                 print(f"  Skip {rank_name} - no trace file found")
+                 continue
+ 
+-            output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx"
++            # Use custom prefix if provided (for sweep mode), otherwise default naming
++            if output_prefix:
++                output_file = individual_reports_dir / f"perf_{output_prefix}_rank{rank_num}.xlsx"
++            else:
++                output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx"
+ 
+             print(f"\nProcessing {rank_name}...")
+             print(f"  Trace: {trace_file.name}")
+@@ -274,8 +301,11 @@ def analyze_single_config(
+                 symlink_path = rank_dir / "trace.json"
+                 if not symlink_path.exists():
+                     try:
+-                        symlink_path.symlink_to(trace_file.name)
+-                    except (OSError, FileExistsError):
++                        # Use relative path from rank_dir to trace_file
++                        # This handles cases where trace is in subdirectory (e.g., trace/pt.trace.json)
++                        relative_path = trace_file.relative_to(rank_dir)
++                        symlink_path.symlink_to(relative_path)
++                    except (OSError, FileExistsError, ValueError):
+                         pass  # Symlink already exists or cannot be created
+ 
+         trace_pattern = str(torch_prof_dir / "rank*" / "trace.json")
+diff --git a/src/aorta/report/analysis/analyze_sweep.py b/src/aorta/report/analysis/analyze_sweep.py
+index 9626769..5db8854 100644
+--- a/src/aorta/report/analysis/analyze_sweep.py
++++ b/src/aorta/report/analysis/analyze_sweep.py
+@@ -3,15 +3,131 @@ Sweep configuration analysis - analyze traces from parameter sweep experiments.
+ 
+ Processes GPU timeline data from TraceLens individual reports across multiple
+ thread and channel configurations, aggregating across ranks.
++
++Supports two modes:
++1. Run TraceLens on all configurations then aggregate (default)
++2. Aggregate existing TraceLens reports only (--skip-tracelens)
+ """
+ 
+ import glob
++import re
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any
+ 
+ import numpy as np
+ import pandas as pd
+ 
++from .analyze_single import analyze_single_config
++
++
++def discover_and_run_tracelens(
++    sweep_dir: Path,
++    short_kernel_threshold_us: int = 50,
++    topk_ops: int = 100,
++    verbose: bool = False,
++) -> Path:
++    """
++    Discover thread/channel configs and run TraceLens on each.
++
++    Expected input structure:
++        sweep_dir/
++        ├── 256thread/
++        │   ├── nccl_28channels/
++        │   │   └── torch_profiler/rank*/
++        │   └── nccl_42channels/
++        └── 512thread/
++            └── ...
++
++    Output structure:
++        sweep_dir/
++        └── tracelens_analysis/
++            ├── 256thread/
++            │   └── individual_reports/
++            │       ├── perf_28ch_rank0.xlsx
++            │       └── ...
++            └── 512thread/
++                └── ...
++
++    Args:
++        sweep_dir: Path to sweep directory with thread/channel subdirectories
++        short_kernel_threshold_us: Threshold for short kernel study
++        topk_ops: Number of top operations to include
++        verbose: Whether to print verbose output
++
++    Returns:
++        Path to tracelens_analysis output directory
++    """
++    sweep_path = Path(sweep_dir)
++    output_base = sweep_path / "tracelens_analysis"
++
++    # Discover thread configurations (e.g., "256thread", "512thread")
++    thread_dirs = sorted([
++        d for d in sweep_path.iterdir()
++        if d.is_dir() and "thread" in d.name
++    ])
++
++    if not thread_dirs:
++        raise ValueError(f"No thread configurations found in {sweep_dir}")
++
++    print("=" * 80)
++    print("Step 0: Running TraceLens on All Configurations")
++    print("=" * 80)
++    print(f"\nDiscovered thread configs: {[d.name for d in thread_dirs]}")
++
++    for thread_dir in thread_dirs:
++        thread_name = thread_dir.name  # e.g., "256thread"
++
++        # Find channel configs (e.g., "nccl_28channels")
++        channel_dirs = sorted([
++            d for d in thread_dir.iterdir()
++            if d.is_dir() and "channel" in d.name
++        ])
++
++        if not channel_dirs:
++            print(f"  [WARN] No channel configs in {thread_name}")
++            continue
++
++        print(f"\n{thread_name}: {[d.name for d in channel_dirs]}")
++
++        for channel_dir in channel_dirs:
++            # Extract channel number (e.g., "nccl_28channels" -> "28")
++            channel_name = channel_dir.name
++            channel_match = re.search(r"(\d+)", channel_name)
++            channel_num = channel_match.group(1) if channel_match else "0"
++
++            # Look for torch_profiler directory
++            trace_dir = channel_dir / "torch_profiler"
++            if not trace_dir.exists():
++                print(f"    [SKIP] {channel_name} - no torch_profiler/")
++                continue
++
++            # Output to: tracelens_analysis/{thread}/individual_reports/
++            output_dir = output_base / thread_name
++
++            print(f"  Processing {channel_name}...")
++
++            try:
++                analyze_single_config(
++                    input_dir=trace_dir,
++                    output_dir=output_dir,
++                    run_individual=True,
++                    run_collective=False,  # Skip collective for sweep
++                    aggregate_timeline=False,  # Will aggregate at sweep level
++                    short_kernel_threshold_us=short_kernel_threshold_us,
++                    topk_ops=topk_ops,
++                    verbose=verbose,
++                    output_prefix=f"{channel_num}ch",  # e.g., "28ch"
++                )
++                print(f"    [OK] {channel_name}")
++            except Exception as e:
++                print(f"    [ERROR] {channel_name}: {e}")
++
++    print("\n" + "=" * 80)
++    print("TraceLens Analysis Complete")
++    print("=" * 80)
++
++    return output_base
++
+ 
+ def geometric_mean(values: np.ndarray) -> float:
+     """Calculate geometric mean, handling zeros."""
+@@ -300,15 +416,25 @@ def analyze_sweep_config(
+     sweep_dir: Path,
+     output_dir: Optional[Path] = None,
+     use_geo_mean: bool = False,
++    skip_tracelens: bool = False,
++    short_kernel_threshold_us: int = 50,
++    topk_ops: int = 100,
+     verbose: bool = False,
+ ) -> Optional[Path]:
+     """
+-    Process GPU timeline data from all individual reports in a sweep.
++    Analyze a sweep directory: run TraceLens on all configs and aggregate results.
++
++    By default, runs TraceLens analysis on all thread/channel configurations first,
++    then aggregates GPU timeline data. Use skip_tracelens=True to only aggregate
++    existing reports.
+ 
+     Args:
+-        sweep_dir: Path to sweep directory containing tracelens_analysis/
++        sweep_dir: Path to sweep directory with thread/channel subdirectories
+         output_dir: Output directory (default: sweep_dir/tracelens_analysis/)
+         use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean
++        skip_tracelens: If True, skip TraceLens analysis (only aggregate existing)
++        short_kernel_threshold_us: Threshold for short kernel study
++        topk_ops: Number of top operations to include
+         verbose: Whether to print verbose output
+ 
+     Returns:
+@@ -317,6 +443,16 @@ def analyze_sweep_config(
+     sweep_path = Path(sweep_dir)
+     tracelens_dir = sweep_path / "tracelens_analysis"
+ 
++    # Step 1: Run TraceLens on all configurations (unless skipped)
++    if not skip_tracelens:
++        discover_and_run_tracelens(
++            sweep_dir=sweep_path,
++            short_kernel_threshold_us=short_kernel_threshold_us,
++            topk_ops=topk_ops,
++            verbose=verbose,
++        )
++
++    # Step 2: Aggregate results
+     if not tracelens_dir.exists():
+         raise FileNotFoundError(
+             f"tracelens_analysis directory not found in {sweep_dir}"
+diff --git a/src/aorta/report/analysis/cli.py b/src/aorta/report/analysis/cli.py
+index 232f8d1..e51205c 100644
+--- a/src/aorta/report/analysis/cli.py
++++ b/src/aorta/report/analysis/cli.py
+@@ -75,18 +75,43 @@ def analyze_single(ctx, trace_dir, individual_only, collective_only, geo_mean,
+ 
+ @analyze.command("sweep")
+ @click.argument("sweep_dir", type=click.Path(exists=True))
++@click.option("--skip-tracelens", is_flag=True,
++              help="Skip TraceLens analysis, only aggregate existing reports")
+ @click.option("--geo-mean", is_flag=True, help="Use geometric mean instead of arithmetic mean")
++@click.option("--short-kernel-threshold", default=50, type=int,
++              help="Threshold for short kernel study (microseconds)")
++@click.option("--topk-ops", default=100, type=int,
++              help="Number of top operations to include")
+ @click.option("-o", "--output", type=click.Path(), help="Output directory")
+ @click.pass_context
+-def analyze_sweep(ctx, sweep_dir, geo_mean, output):
++def analyze_sweep(ctx, sweep_dir, skip_tracelens, geo_mean, short_kernel_threshold,
++                  topk_ops, output):
+     """Analyze a sweep directory with multiple configurations.
+ 
+-    SWEEP_DIR: Path to the sweep directory containing tracelens_analysis/
+-    with multiple thread/channel configs.
++    SWEEP_DIR: Path to the sweep directory containing thread/channel subdirectories.
++
++    By default, runs TraceLens analysis on all configurations first, then
++    aggregates the results. Use --skip-tracelens to only aggregate existing reports.
++
++    \b
++    Expected directory structure:
++      sweep_dir/
++      ├── 256thread/
++      │   ├── nccl_28channels/
++      │   │   └── torch_profiler/rank*/
++      │   └── nccl_42channels/
++      └── 512thread/
++          └── ...
+ 
+     \b
+     Examples:
++      # Run TraceLens + aggregate (default)
+       aorta-report analyze sweep /path/to/sweep_20251124
++
++      # Only aggregate existing reports
++      aorta-report analyze sweep /path/to/sweep --skip-tracelens
++
++      # With geometric mean aggregation
+       aorta-report analyze sweep /path/to/sweep --geo-mean
+     """
+     from . import analyze_sweep_config
+@@ -99,6 +124,9 @@ def analyze_sweep(ctx, sweep_dir, geo_mean, output):
+             sweep_dir=Path(sweep_dir),
+             output_dir=Path(output) if output else None,
+             use_geo_mean=geo_mean,
++            skip_tracelens=skip_tracelens,
++            short_kernel_threshold_us=short_kernel_threshold,
++            topk_ops=topk_ops,
+             verbose=verbose,
+         )
+         if not quiet and output_path:
+diff --git a/src/aorta/report/pipelines/cli.py b/src/aorta/report/pipelines/cli.py
+index a805996..e8cb79d 100644
+--- a/src/aorta/report/pipelines/cli.py
++++ b/src/aorta/report/pipelines/cli.py
+@@ -165,12 +165,29 @@ def pipeline_summary(
+     "--sweep-dir",
+     required=True,
+     type=click.Path(exists=True),
+-    help="Sweep directory containing tracelens_analysis/",
++    help="Sweep directory with thread/channel subdirectories",
+ )
+ @click.option(
+     "-o", "--output", required=True, type=click.Path(), help="Output directory for results"
+ )
+-@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract (default: 5)")
++@click.option(
++    "--skip-tracelens",
++    is_flag=True,
++    help="Skip TraceLens analysis, use existing reports",
++)
++@click.option(
++    "--short-kernel-threshold",
++    default=50,
++    type=int,
++    help="Threshold for short kernel study in microseconds (default: 50)",
++)
++@click.option(
++    "--topk-ops",
++    default=100,
++    type=int,
++    help="Number of top operations to include in TraceLens analysis (default: 100)",
++)
++@click.option("--top-k", default=5, type=int, help="Number of top GEMM kernels to extract (default: 5)")
+ @click.option(
+     "--threads",
+     "-t",
+@@ -193,22 +210,29 @@ def pipeline_summary(
+ @click.option("--plots/--no-plots", default=True, help="Generate plots (default: True)")
+ @click.option("--html/--no-html", default=True, help="Generate HTML report (default: True)")
+ @click.pass_context
+-def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, plots, html):
++def pipeline_gemm(ctx, sweep_dir, output, skip_tracelens, short_kernel_threshold, topk_ops,
++                  top_k, threads, channels, timestamps, plots, html):
+     """Run GEMM variance analysis pipeline.
+ 
+-    Analyzes GEMM kernel time variance across configurations:
++    By default, runs TraceLens on all configurations first, then analyzes GEMM kernels.
++    Use --skip-tracelens to use existing TraceLens reports.
+ 
+     \b
+-    1. Analyze GEMM reports to extract top-K kernels with highest variance
+-    2. Enhance with timestamps (optional)
+-    3. Generate variance plots (optional)
+-    4. Generate HTML report (optional)
++    Steps:
++      0. Run TraceLens on sweep (default, skip with --skip-tracelens)
++      1. Analyze GEMM reports to extract top-K kernels with highest variance
++      2. Enhance with timestamps (optional)
++      3. Generate variance plots (optional)
++      4. Generate HTML report (optional)
+ 
+     \b
+     Examples:
+-      # Full pipeline
++      # Full pipeline (runs TraceLens + GEMM analysis)
+       aorta-report pipeline gemm --sweep-dir /path/to/sweep -o /path/to/output
+ 
++      # Skip TraceLens, use existing reports
++      aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --skip-tracelens
++
+       # Custom top-k
+       aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --top-k 10
+ 
+@@ -234,6 +258,9 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps,
+         top_k=top_k,
+         threads=list(threads),
+         channels=list(channels),
++        skip_tracelens=skip_tracelens,
++        short_kernel_threshold_us=short_kernel_threshold,
++        topk_ops=topk_ops,
+         timestamps=timestamps,
+         plots=plots,
+         html=html,
+@@ -246,7 +273,8 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps,
+         click.echo("=" * 60)
+         click.echo(f"Sweep dir: {sweep_dir}")
+         click.echo(f"Output: {output}")
+-        click.echo(f"Top-K: {top_k}")
++        click.echo(f"TraceLens: {'skip' if skip_tracelens else 'run'}")
++        click.echo(f"Top-K GEMM kernels: {top_k}")
+         click.echo(f"Threads: {list(threads)}")
+         click.echo(f"Channels: {list(channels)}")
+         click.echo(f"Options: timestamps={timestamps}, plots={plots}, html={html}")
+diff --git a/src/aorta/report/pipelines/gemm_pipeline.py b/src/aorta/report/pipelines/gemm_pipeline.py
+index f9ee58a..3be4dda 100644
+--- a/src/aorta/report/pipelines/gemm_pipeline.py
++++ b/src/aorta/report/pipelines/gemm_pipeline.py
+@@ -1,9 +1,11 @@
+ """GEMM variance analysis pipeline.
+ 
+ Orchestrates GEMM kernel variance analysis:
++0. Run TraceLens on sweep (optional, default: enabled)
+ 1. Analyze GEMM Reports
+ 2. Enhance with Timestamps (optional)
+ 3. Generate GEMM Plots (optional)
++4. Generate HTML Report (optional)
+ """
+ 
+ from pathlib import Path
+@@ -21,6 +23,9 @@ class GemmPipelineConfig:
+     threads: List[int] = field(default_factory=lambda: [256, 512])
+     channels: List[int] = field(default_factory=lambda: [28, 42, 56, 70])
+     ranks: List[int] = field(default_factory=lambda: list(range(8)))
++    skip_tracelens: bool = False  # Run TraceLens by default
++    short_kernel_threshold_us: int = 50
++    topk_ops: int = 100
+     timestamps: bool = True
+     plots: bool = True
+     html: bool = True
+@@ -56,6 +61,12 @@ def run_gemm_pipeline(config: GemmPipelineConfig) -> GemmPipelineResult:
+     config.output_dir.mkdir(parents=True, exist_ok=True)
+ 
+     try:
++        # Step 0: Run TraceLens on sweep (unless skipped)
++        if not config.skip_tracelens:
++            _step_run_tracelens(config, result)
++        else:
++            result.steps_skipped.append("tracelens")
++
+         # Step 1: Analyze GEMM Reports
+         _step_analyze_gemm(config, result)
+ 
+@@ -90,6 +101,31 @@ def run_gemm_pipeline(config: GemmPipelineConfig) -> GemmPipelineResult:
+     return result
+ 
+ 
++def _step_run_tracelens(config: GemmPipelineConfig, result: GemmPipelineResult) -> None:
++    """Step 0: Run TraceLens analysis on all configurations in the sweep."""
++    from ..analysis import analyze_sweep_config
++
++    if config.verbose:
++        print("\n" + "=" * 60)
++        print("STEP 0: Run TraceLens Analysis")
++        print("=" * 60)
++
++    try:
++        analyze_sweep_config(
++            sweep_dir=config.sweep_dir,
++            skip_tracelens=False,  # Always run TraceLens in this step
++            short_kernel_threshold_us=config.short_kernel_threshold_us,
++            topk_ops=config.topk_ops,
++            verbose=config.verbose,
++        )
++        result.steps_completed.append("tracelens")
++    except Exception as e:
++        # Don't fail the whole pipeline if TraceLens fails
++        # The subsequent steps will check if tracelens_analysis/ exists
++        result.errors.append(f"TraceLens analysis failed: {e}")
++        result.steps_skipped.append("tracelens (failed)")
++
++
+ def _step_analyze_gemm(config: GemmPipelineConfig, result: GemmPipelineResult) -> None:
+     """Step 1: Analyze GEMM reports."""
+     from ..analysis import analyze_gemm_reports