diff --git a/docs/QUICK_RUN_GUIDE.md b/docs/QUICK_RUN_GUIDE.md new file mode 100644 index 00000000..14b29d7b --- /dev/null +++ b/docs/QUICK_RUN_GUIDE.md @@ -0,0 +1,224 @@ +# Quick Run Guide: aorta-report Pipelines + +This guide demonstrates how to use the `aorta-report` CLI to analyze PyTorch profiler traces. + +--- + +## 1. Input Directory Structures + +### GEMM Sweep Directory (`gemm-sweep/`) + +Used for analyzing GEMM kernel variance across multiple thread/channel configurations. + +``` +experiments/2026-01-10/gemm-sweep/ +├── 256thread/ +│ ├── nccl_28channels/ +│ │ └── torch_profiler/ +│ │ ├── rank0/trace/pt.trace.json +│ │ ├── rank1/trace/pt.trace.json +│ │ └── ... (rank2-7) +│ └── nccl_56channels/ +│ └── torch_profiler/ +│ └── rank*/trace/pt.trace.json +├── 512thread/ +│ ├── nccl_28channels/ +│ │ └── torch_profiler/rank*/... +│ └── nccl_56channels/ +│ └── torch_profiler/rank*/... +└── tracelens_analysis/ # Generated by TraceLens + ├── 256thread/individual_reports/ + └── 512thread/individual_reports/ +``` + +### RCCL Warp Speed Directory (`rccl-warp-speed/`) + +Used for comparing baseline vs test configurations (A/B comparison). + +``` +experiments/2026-01-10/rccl-warp-speed/ +├── 32cu_512threads/ # Baseline configuration +│ ├── torch_profiler/ +│ │ ├── rank0/*.json +│ │ ├── rank1/*.json +│ │ └── ... (rank2-7) +│ └── tracelens_analysis/ # Generated by TraceLens +│ ├── individual_reports/ +│ └── collective_reports/ +├── 37cu_384threads/ # Test configuration +│ ├── torch_profiler/rank*/... +│ └── tracelens_analysis/... +└── 56cu_256threads/ # Another configuration + └── ... +``` + +--- + +## 2. Pipeline Commands + +### GEMM Variance Analysis Pipeline + +Analyzes GEMM kernel time variance across thread/channel configurations. 
+ +```bash +aorta-report pipeline gemm \ + --sweep-dir ./experiments/2026-01-10/gemm-sweep/ \ + -o ./comparison_gemm_1/ +``` + +**Options:** +- `--sweep-dir` - Path to sweep directory with thread/channel subdirectories +- `-o, --output` - Output directory for results +- `--skip-tracelens` - Skip TraceLens analysis if reports already exist +- `--top-k` - Number of top GEMM kernels to extract (default: 5) +- `-t, --threads` - Thread configs to analyze (default: 256, 512) +- `-c, --channels` - Channel configs to analyze (default: 28, 42, 56, 70) +- `--no-plots` - Skip plot generation +- `--no-html` - Skip HTML report generation + +**Example with options:** +```bash +aorta-report pipeline gemm \ + --sweep-dir ./experiments/2026-01-10/gemm-sweep/ \ + -o ./comparison_gemm/ \ + --skip-tracelens \ + --top-k 10 \ + -t 256 -t 512 \ + -c 28 -c 56 +``` + +--- + +### Summary Comparison Pipeline + +Compares two configurations (baseline vs test) with comprehensive analysis. + +```bash +aorta-report pipeline summary \ + --baseline ./experiments/2026-01-10/rccl-warp-speed/32cu_512threads/ \ + --test ./experiments/2026-01-10/rccl-warp-speed/37cu_384threads/ \ + --baseline-label 32c_512t \ + --test-label 37c_384t \ + --output ./comparison_rccl/ +``` + +**Options:** +- `--baseline` - Path to baseline trace directory +- `--test` - Path to test trace directory +- `--baseline-label` - Label for baseline in reports +- `--test-label` - Label for test in reports +- `--output` - Output directory for results +- `--skip-tracelens` - Skip TraceLens analysis if reports already exist +- `--gpu-timeline/--no-gpu-timeline` - Include GPU timeline comparison +- `--collective/--no-collective` - Include collective/NCCL comparison + +**Example with options:** +```bash +aorta-report pipeline summary \ + --baseline ./experiments/2026-01-10/rccl-warp-speed/32cu_512threads/ \ + --test ./experiments/2026-01-10/rccl-warp-speed/56cu_256threads/ \ + --baseline-label baseline_32cu \ + --test-label test_56cu \ + 
--output ./comparison_output/ \ + --skip-tracelens +``` + +--- + +## 3. Output Directory Structures + +### GEMM Pipeline Output (`comparison_gemm_1/`) + +``` +comparison_gemm_1/ +├── top5_gemm_kernels_time_variance.csv # Raw GEMM variance data +├── top5_gemm_kernels_time_variance_with_timestamps.csv # Enhanced with timestamps +├── plots/ +│ ├── variance_by_threads_boxplot.png # Variance by thread config +│ ├── variance_by_channels_boxplot.png # Variance by channel config +│ ├── variance_by_ranks_boxplot.png # Variance by rank +│ ├── variance_thread_channel_interaction.png # Thread × Channel interaction +│ └── variance_violin_combined.png # Combined violin plot +└── gemm_variance_report.html # Self-contained HTML report +``` + +**Key outputs:** +- **CSV files**: Raw data for further analysis +- **Boxplots**: Identify which configs have highest variance +- **HTML report**: Share with team (includes all plots embedded) + +--- + +### Summary Pipeline Output (`comparison_rccl/`) + +``` +comparison_rccl/ +├── gpu_timeline_comparison.xlsx # GPU timeline comparison +├── gpu_timeline_combined.xlsx # Combined timeline data +├── collective_comparison.xlsx # NCCL collective comparison +├── collective_combined.xlsx # Combined collective data +├── final_analysis_report.xlsx # Comprehensive analysis +├── plots/ +│ ├── abs_time_comparison.png # Absolute time comparison +│ ├── computation_time_by_rank.png # Computation time per rank +│ ├── idle_time_by_rank.png # Idle time per rank +│ ├── total_time_by_rank.png # Total time per rank +│ ├── total_comm_time_by_rank.png # Communication time per rank +│ ├── gpu_time_heatmap.png # GPU time heatmap +│ ├── gpu_time_change_percentage_summary_by_rank.png # % change summary +│ ├── improvement_chart.png # Overall improvement chart +│ ├── NCCL_Algorithm_Bandwidth_comparison.png # NCCL bandwidth comparison +│ ├── NCCL_Bus_Bandwidth_comparison.png # Bus bandwidth comparison +│ ├── NCCL_Communication_Latency_comparison.png # Latency comparison +│ 
├── NCCL_Total_Communication_Latency_comparison.png +│ └── NCCL_Performance_Percentage_Change_comparison.png +└── performance_analysis_report.html # Self-contained HTML report +``` + +**Key outputs:** +- **Excel files**: Detailed data for spreadsheet analysis +- **Plots**: Visual comparisons between baseline and test +- **HTML report**: Share comprehensive results with team + +--- + +## 4. Quick Start Examples + +### Analyze a new sweep directory +```bash +# Full pipeline (runs TraceLens + GEMM analysis) +aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output/ + +# If TraceLens was already run +aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output/ --skip-tracelens +``` + +### Compare two configurations +```bash +# Full comparison (runs TraceLens + comparison) +aorta-report pipeline summary \ + --baseline /path/to/baseline \ + --test /path/to/test \ + --baseline-label "Baseline" \ + --test-label "Test" \ + --output ./comparison/ +``` + +### Run only TraceLens analysis +```bash +# Single configuration +aorta-report analyze single /path/to/traces + +# Sweep directory (multiple configs) +aorta-report analyze sweep /path/to/sweep +``` + +--- + +## 5. Tips + +1. **First run**: Let the pipeline run TraceLens (don't use `--skip-tracelens`) +2. **Subsequent runs**: Use `--skip-tracelens` to save time +3. **Large datasets**: Use `--no-plots --no-html` for faster processing +4. **Custom analysis**: Use the CSV/Excel outputs for custom visualization + diff --git a/scripts/gemm_analysis/run_tracelens_analysis.sh b/scripts/gemm_analysis/run_tracelens_analysis.sh index c3b654c1..1154ec9d 100755 --- a/scripts/gemm_analysis/run_tracelens_analysis.sh +++ b/scripts/gemm_analysis/run_tracelens_analysis.sh @@ -264,7 +264,10 @@ else # trace file in the rank folder to the canonical `trace/pt.trace.json` path. 
# This will satisfy TraceLens's requirement of only one `*` being present in the trace pattern # while also avoiding FileNotFoundErrors due to different filenames. - find $TRACE_DIR/rank* -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \; + # OLD (not idempotent - running twice creates trace/trace/pt.trace.json): + # find $TRACE_DIR/rank* -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \; + # NEW: -not -path "*/trace/*" ensures this is idempotent (safe to run multiple times) + find $TRACE_DIR/rank* -name "*.json" -not -path "*/trace/*" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \; TraceLens_generate_multi_rank_collective_report_pytorch \ --trace_pattern "$TRACE_DIR/rank*/trace/pt.trace.json" \ diff --git a/src/aorta/report/analysis/__init__.py b/src/aorta/report/analysis/__init__.py index e0bd3246..d8fc0031 100644 --- a/src/aorta/report/analysis/__init__.py +++ b/src/aorta/report/analysis/__init__.py @@ -3,12 +3,13 @@ from .tracelens_wrapper import TraceLensWrapper from .analyze_gemm import analyze_gemm_reports from .analyze_single import analyze_single_config -from .analyze_sweep import analyze_sweep_config +from .analyze_sweep import analyze_sweep_config, discover_and_run_tracelens __all__ = [ "TraceLensWrapper", "analyze_gemm_reports", "analyze_single_config", "analyze_sweep_config", + "discover_and_run_tracelens", ] diff --git a/src/aorta/report/analysis/analyze_gemm.py b/src/aorta/report/analysis/analyze_gemm.py index a724019e..a8112022 100644 --- a/src/aorta/report/analysis/analyze_gemm.py +++ b/src/aorta/report/analysis/analyze_gemm.py @@ -38,12 +38,48 @@ def extract_name_from_kernel_info(kernel_info_str: str) -> Optional[str]: return None -def column_letter_to_index(letter: str) -> int: - """Convert Excel column letter to 0-based index.""" - index = 0 - for i, char in 
enumerate(reversed(letter.upper())): - index += (ord(char) - ord("A") + 1) * (26**i) - return index - 1 +def find_column_indices( + header_row: List[Any], + required_columns: Dict[str, str], +) -> Dict[str, int]: + """ + Find column indices by matching column names in header row. + + Args: + header_row: List of column header values + required_columns: Dict mapping logical names to expected column names + e.g., {"kernel_info": "kernel_details__summarize_kernel_stats"} + + Returns: + Dict mapping logical names to column indices (0-based) + + Raises: + ValueError: If any required column is not found + """ + # Create a mapping of column name -> index + header_map = {} + for idx, col_name in enumerate(header_row): + if col_name is not None: + header_map[str(col_name)] = idx + + # Find indices for required columns + column_indices = {} + missing_columns = [] + + for logical_name, expected_name in required_columns.items(): + if expected_name in header_map: + column_indices[logical_name] = header_map[expected_name] + else: + missing_columns.append(f"'{expected_name}' (for {logical_name})") + + if missing_columns: + available = list(header_map.keys())[:20] # Show first 20 columns + raise ValueError( + f"Required columns not found: {', '.join(missing_columns)}\n" + f"Available columns (first 20): {available}" + ) + + return column_indices def process_excel_file( @@ -66,6 +102,13 @@ def process_excel_file( Returns: List of dictionaries containing kernel data """ + # Define required columns by their expected names + REQUIRED_COLUMNS = { + "kernel_info": "kernel_details__summarize_kernel_stats", + "time_min": "Kernel Time (µs)_min", + "time_max": "Kernel Time (µs)_max", + } + try: # Open the workbook wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True) @@ -77,62 +120,24 @@ def process_excel_file( sheet = wb["GEMM"] - # Expected column positions (0-based indices) - col_kernel_info = column_letter_to_index("X") # Column X - col_time_min = 
column_letter_to_index("AG") # Column AG - col_time_max = column_letter_to_index("AH") # Column AH - - # Read header row to validate column names rows_data = [] header_row = None + col_indices = None for i, row in enumerate(sheet.iter_rows(values_only=True)): if i == 0: - # This is the header - validate column names match expectations + # Parse header row and find column indices dynamically header_row = list(row) - - # Expected column names (match what TraceLens generates) - expected_x = "kernel_details__summarize_kernel_stats" - expected_ag = "Kernel Time (µs)_min" - expected_ah = "Kernel Time (µs)_max" - - # Validate each expected column - errors = [] - - if col_kernel_info < len(header_row): - header_x = str(header_row[col_kernel_info]) if header_row[col_kernel_info] else "" - if header_x != expected_x: - errors.append(f"Column X: expected '{expected_x}', found '{header_x}'") - else: - errors.append(f"Column X: not found (only {len(header_row)} columns)") - - if col_time_min < len(header_row): - header_ag = str(header_row[col_time_min]) if header_row[col_time_min] else "" - if header_ag != expected_ag: - errors.append(f"Column AG: expected '{expected_ag}', found '{header_ag}'") - else: - errors.append(f"Column AG: not found (only {len(header_row)} columns)") - - if col_time_max < len(header_row): - header_ah = str(header_row[col_time_max]) if header_row[col_time_max] else "" - if header_ah != expected_ah: - errors.append(f"Column AH: expected '{expected_ah}', found '{header_ah}'") - else: - errors.append(f"Column AH: not found (only {len(header_row)} columns)") - - if errors: - raise ValueError( - f"Column validation failed in {file_path}:\n " + "\n ".join(errors) - ) - + col_indices = find_column_indices(header_row, REQUIRED_COLUMNS) continue - if row is None or len(row) <= max(col_kernel_info, col_time_min, col_time_max): + if row is None or col_indices is None: continue - kernel_info = row[col_kernel_info] if col_kernel_info < len(row) else None - 
kernel_time_min = row[col_time_min] if col_time_min < len(row) else None - kernel_time_max = row[col_time_max] if col_time_max < len(row) else None + # Extract values using dynamically found indices + kernel_info = row[col_indices["kernel_info"]] if col_indices["kernel_info"] < len(row) else None + kernel_time_min = row[col_indices["time_min"]] if col_indices["time_min"] < len(row) else None + kernel_time_max = row[col_indices["time_max"]] if col_indices["time_max"] < len(row) else None # Extract kernel name kernel_name = extract_name_from_kernel_info(kernel_info) diff --git a/src/aorta/report/analysis/analyze_single.py b/src/aorta/report/analysis/analyze_single.py index f961cf33..869fa138 100644 --- a/src/aorta/report/analysis/analyze_single.py +++ b/src/aorta/report/analysis/analyze_single.py @@ -34,14 +34,15 @@ def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]: ValueError: If directory structure cannot be determined """ # Check if input_dir contains rank directories (i.e., it IS torch_profiler/) - rank_dirs = list(input_dir.glob("rank*")) + # Use is_dir() to filter out files matching rank* pattern (e.g., rank0.log) + rank_dirs = [p for p in input_dir.glob("rank*") if p.is_dir()] if rank_dirs: return input_dir, input_dir.parent # Check if input_dir contains torch_profiler/ subdirectory torch_prof_dir = input_dir / "torch_profiler" if torch_prof_dir.exists(): - rank_dirs = list(torch_prof_dir.glob("rank*")) + rank_dirs = [p for p in torch_prof_dir.glob("rank*") if p.is_dir()] if rank_dirs: return torch_prof_dir, input_dir @@ -55,10 +56,30 @@ def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]: def find_trace_file(rank_dir: Path) -> Optional[Path]: - """Find trace file in a rank directory.""" + """Find trace file in a rank directory. + + Searches for JSON trace files in the following order: + 1. Directly in rank_dir (e.g., rank0/*.json) + 2. In trace/ subdirectory (e.g., rank0/trace/pt.trace.json) + 3. 
Recursively in any subdirectory (e.g., rank0/**/*.json) + """ + # First, look directly in the rank directory json_files = list(rank_dir.glob("*.json")) if json_files: return json_files[0] + + # Then check trace/ subdirectory (common after collective report prep) + trace_subdir = rank_dir / "trace" + if trace_subdir.exists(): + json_files = list(trace_subdir.glob("*.json")) + if json_files: + return json_files[0] + + # Finally, search recursively + json_files = list(rank_dir.glob("**/*.json")) + if json_files: + return json_files[0] + return None @@ -155,6 +176,7 @@ def analyze_single_config( short_kernel_threshold_us: int = 50, topk_ops: int = 100, verbose: bool = False, + output_prefix: Optional[str] = None, ) -> dict: """ Run TraceLens analysis on a single configuration trace directory. @@ -169,6 +191,7 @@ def analyze_single_config( short_kernel_threshold_us: Threshold for short kernel study topk_ops: Number of top operations to include verbose: Whether to print verbose output + output_prefix: Custom prefix for output files (e.g., "28ch" -> perf_28ch_rank0.xlsx) Returns: Dictionary with paths to generated reports @@ -239,7 +262,11 @@ def analyze_single_config( print(f" Skip {rank_name} - no trace file found") continue - output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx" + # Use custom prefix if provided (for sweep mode), otherwise default naming + if output_prefix: + output_file = individual_reports_dir / f"perf_{output_prefix}_rank{rank_num}.xlsx" + else: + output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx" print(f"\nProcessing {rank_name}...") print(f" Trace: {trace_file.name}") @@ -274,8 +301,11 @@ def analyze_single_config( symlink_path = rank_dir / "trace.json" if not symlink_path.exists(): try: - symlink_path.symlink_to(trace_file.name) - except (OSError, FileExistsError): + # Use relative path from rank_dir to trace_file + # This handles cases where trace is in subdirectory (e.g., trace/pt.trace.json) + relative_path = 
trace_file.relative_to(rank_dir) + symlink_path.symlink_to(relative_path) + except (OSError, FileExistsError, ValueError): pass # Symlink already exists or cannot be created trace_pattern = str(torch_prof_dir / "rank*" / "trace.json") diff --git a/src/aorta/report/analysis/analyze_sweep.py b/src/aorta/report/analysis/analyze_sweep.py index 9626769a..5db88549 100644 --- a/src/aorta/report/analysis/analyze_sweep.py +++ b/src/aorta/report/analysis/analyze_sweep.py @@ -3,15 +3,131 @@ Processes GPU timeline data from TraceLens individual reports across multiple thread and channel configurations, aggregating across ranks. + +Supports two modes: +1. Run TraceLens on all configurations then aggregate (default) +2. Aggregate existing TraceLens reports only (--skip-tracelens) """ import glob +import re from pathlib import Path from typing import Dict, List, Optional, Any import numpy as np import pandas as pd +from .analyze_single import analyze_single_config + + +def discover_and_run_tracelens( + sweep_dir: Path, + short_kernel_threshold_us: int = 50, + topk_ops: int = 100, + verbose: bool = False, +) -> Path: + """ + Discover thread/channel configs and run TraceLens on each. + + Expected input structure: + sweep_dir/ + ├── 256thread/ + │ ├── nccl_28channels/ + │ │ └── torch_profiler/rank*/ + │ └── nccl_42channels/ + └── 512thread/ + └── ... + + Output structure: + sweep_dir/ + └── tracelens_analysis/ + ├── 256thread/ + │ └── individual_reports/ + │ ├── perf_28ch_rank0.xlsx + │ └── ... + └── 512thread/ + └── ... 
+ + Args: + sweep_dir: Path to sweep directory with thread/channel subdirectories + short_kernel_threshold_us: Threshold for short kernel study + topk_ops: Number of top operations to include + verbose: Whether to print verbose output + + Returns: + Path to tracelens_analysis output directory + """ + sweep_path = Path(sweep_dir) + output_base = sweep_path / "tracelens_analysis" + + # Discover thread configurations (e.g., "256thread", "512thread") + thread_dirs = sorted([ + d for d in sweep_path.iterdir() + if d.is_dir() and "thread" in d.name + ]) + + if not thread_dirs: + raise ValueError(f"No thread configurations found in {sweep_dir}") + + print("=" * 80) + print("Step 0: Running TraceLens on All Configurations") + print("=" * 80) + print(f"\nDiscovered thread configs: {[d.name for d in thread_dirs]}") + + for thread_dir in thread_dirs: + thread_name = thread_dir.name # e.g., "256thread" + + # Find channel configs (e.g., "nccl_28channels") + channel_dirs = sorted([ + d for d in thread_dir.iterdir() + if d.is_dir() and "channel" in d.name + ]) + + if not channel_dirs: + print(f" [WARN] No channel configs in {thread_name}") + continue + + print(f"\n{thread_name}: {[d.name for d in channel_dirs]}") + + for channel_dir in channel_dirs: + # Extract channel number (e.g., "nccl_28channels" -> "28") + channel_name = channel_dir.name + channel_match = re.search(r"(\d+)", channel_name) + channel_num = channel_match.group(1) if channel_match else "0" + + # Look for torch_profiler directory + trace_dir = channel_dir / "torch_profiler" + if not trace_dir.exists(): + print(f" [SKIP] {channel_name} - no torch_profiler/") + continue + + # Output to: tracelens_analysis/{thread}/individual_reports/ + output_dir = output_base / thread_name + + print(f" Processing {channel_name}...") + + try: + analyze_single_config( + input_dir=trace_dir, + output_dir=output_dir, + run_individual=True, + run_collective=False, # Skip collective for sweep + aggregate_timeline=False, # Will aggregate 
at sweep level + short_kernel_threshold_us=short_kernel_threshold_us, + topk_ops=topk_ops, + verbose=verbose, + output_prefix=f"{channel_num}ch", # e.g., "28ch" + ) + print(f" [OK] {channel_name}") + except Exception as e: + print(f" [ERROR] {channel_name}: {e}") + + print("\n" + "=" * 80) + print("TraceLens Analysis Complete") + print("=" * 80) + + return output_base + def geometric_mean(values: np.ndarray) -> float: """Calculate geometric mean, handling zeros.""" @@ -300,15 +416,25 @@ def analyze_sweep_config( sweep_dir: Path, output_dir: Optional[Path] = None, use_geo_mean: bool = False, + skip_tracelens: bool = False, + short_kernel_threshold_us: int = 50, + topk_ops: int = 100, verbose: bool = False, ) -> Optional[Path]: """ - Process GPU timeline data from all individual reports in a sweep. + Analyze a sweep directory: run TraceLens on all configs and aggregate results. + + By default, runs TraceLens analysis on all thread/channel configurations first, + then aggregates GPU timeline data. Use skip_tracelens=True to only aggregate + existing reports. 
Args: - sweep_dir: Path to sweep directory containing tracelens_analysis/ + sweep_dir: Path to sweep directory with thread/channel subdirectories output_dir: Output directory (default: sweep_dir/tracelens_analysis/) use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean + skip_tracelens: If True, skip TraceLens analysis (only aggregate existing) + short_kernel_threshold_us: Threshold for short kernel study + topk_ops: Number of top operations to include verbose: Whether to print verbose output Returns: @@ -317,6 +443,16 @@ def analyze_sweep_config( sweep_path = Path(sweep_dir) tracelens_dir = sweep_path / "tracelens_analysis" + # Step 1: Run TraceLens on all configurations (unless skipped) + if not skip_tracelens: + discover_and_run_tracelens( + sweep_dir=sweep_path, + short_kernel_threshold_us=short_kernel_threshold_us, + topk_ops=topk_ops, + verbose=verbose, + ) + + # Step 2: Aggregate results if not tracelens_dir.exists(): raise FileNotFoundError( f"tracelens_analysis directory not found in {sweep_dir}" diff --git a/src/aorta/report/analysis/cli.py b/src/aorta/report/analysis/cli.py index 232f8d16..e51205c3 100644 --- a/src/aorta/report/analysis/cli.py +++ b/src/aorta/report/analysis/cli.py @@ -75,18 +75,43 @@ def analyze_single(ctx, trace_dir, individual_only, collective_only, geo_mean, @analyze.command("sweep") @click.argument("sweep_dir", type=click.Path(exists=True)) +@click.option("--skip-tracelens", is_flag=True, + help="Skip TraceLens analysis, only aggregate existing reports") @click.option("--geo-mean", is_flag=True, help="Use geometric mean instead of arithmetic mean") +@click.option("--short-kernel-threshold", default=50, type=int, + help="Threshold for short kernel study (microseconds)") +@click.option("--topk-ops", default=100, type=int, + help="Number of top operations to include") @click.option("-o", "--output", type=click.Path(), help="Output directory") @click.pass_context -def analyze_sweep(ctx, sweep_dir, geo_mean, output): 
+def analyze_sweep(ctx, sweep_dir, skip_tracelens, geo_mean, short_kernel_threshold, + topk_ops, output): """Analyze a sweep directory with multiple configurations. - SWEEP_DIR: Path to the sweep directory containing tracelens_analysis/ - with multiple thread/channel configs. + SWEEP_DIR: Path to the sweep directory containing thread/channel subdirectories. + + By default, runs TraceLens analysis on all configurations first, then + aggregates the results. Use --skip-tracelens to only aggregate existing reports. + + \b + Expected directory structure: + sweep_dir/ + ├── 256thread/ + │ ├── nccl_28channels/ + │ │ └── torch_profiler/rank*/ + │ └── nccl_42channels/ + └── 512thread/ + └── ... \b Examples: + # Run TraceLens + aggregate (default) aorta-report analyze sweep /path/to/sweep_20251124 + + # Only aggregate existing reports + aorta-report analyze sweep /path/to/sweep --skip-tracelens + + # With geometric mean aggregation aorta-report analyze sweep /path/to/sweep --geo-mean """ from . import analyze_sweep_config @@ -99,6 +124,9 @@ def analyze_sweep(ctx, sweep_dir, geo_mean, output): sweep_dir=Path(sweep_dir), output_dir=Path(output) if output else None, use_geo_mean=geo_mean, + skip_tracelens=skip_tracelens, + short_kernel_threshold_us=short_kernel_threshold, + topk_ops=topk_ops, verbose=verbose, ) if not quiet and output_path: diff --git a/src/aorta/report/pipelines/cli.py b/src/aorta/report/pipelines/cli.py index a8059968..e8cb79d7 100644 --- a/src/aorta/report/pipelines/cli.py +++ b/src/aorta/report/pipelines/cli.py @@ -165,12 +165,29 @@ def pipeline_summary( "--sweep-dir", required=True, type=click.Path(exists=True), - help="Sweep directory containing tracelens_analysis/", + help="Sweep directory with thread/channel subdirectories", ) @click.option( "-o", "--output", required=True, type=click.Path(), help="Output directory for results" ) -@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract (default: 5)") +@click.option( + 
"--skip-tracelens", + is_flag=True, + help="Skip TraceLens analysis, use existing reports", +) +@click.option( + "--short-kernel-threshold", + default=50, + type=int, + help="Threshold for short kernel study in microseconds (default: 50)", +) +@click.option( + "--topk-ops", + default=100, + type=int, + help="Number of top operations to include in TraceLens analysis (default: 100)", +) +@click.option("--top-k", default=5, type=int, help="Number of top GEMM kernels to extract (default: 5)") @click.option( "--threads", "-t", @@ -193,22 +210,29 @@ def pipeline_summary( @click.option("--plots/--no-plots", default=True, help="Generate plots (default: True)") @click.option("--html/--no-html", default=True, help="Generate HTML report (default: True)") @click.pass_context -def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, plots, html): +def pipeline_gemm(ctx, sweep_dir, output, skip_tracelens, short_kernel_threshold, topk_ops, + top_k, threads, channels, timestamps, plots, html): """Run GEMM variance analysis pipeline. - Analyzes GEMM kernel time variance across configurations: + By default, runs TraceLens on all configurations first, then analyzes GEMM kernels. + Use --skip-tracelens to use existing TraceLens reports. \b - 1. Analyze GEMM reports to extract top-K kernels with highest variance - 2. Enhance with timestamps (optional) - 3. Generate variance plots (optional) - 4. Generate HTML report (optional) + Steps: + 0. Run TraceLens on sweep (default, skip with --skip-tracelens) + 1. Analyze GEMM reports to extract top-K kernels with highest variance + 2. Enhance with timestamps (optional) + 3. Generate variance plots (optional) + 4. 
Generate HTML report (optional) \b Examples: - # Full pipeline + # Full pipeline (runs TraceLens + GEMM analysis) aorta-report pipeline gemm --sweep-dir /path/to/sweep -o /path/to/output + # Skip TraceLens, use existing reports + aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --skip-tracelens + # Custom top-k aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --top-k 10 @@ -234,6 +258,9 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, top_k=top_k, threads=list(threads), channels=list(channels), + skip_tracelens=skip_tracelens, + short_kernel_threshold_us=short_kernel_threshold, + topk_ops=topk_ops, timestamps=timestamps, plots=plots, html=html, @@ -246,7 +273,8 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, click.echo("=" * 60) click.echo(f"Sweep dir: {sweep_dir}") click.echo(f"Output: {output}") - click.echo(f"Top-K: {top_k}") + click.echo(f"TraceLens: {'skip' if skip_tracelens else 'run'}") + click.echo(f"Top-K GEMM kernels: {top_k}") click.echo(f"Threads: {list(threads)}") click.echo(f"Channels: {list(channels)}") click.echo(f"Options: timestamps={timestamps}, plots={plots}, html={html}") diff --git a/src/aorta/report/pipelines/gemm_pipeline.py b/src/aorta/report/pipelines/gemm_pipeline.py index f9ee58ab..3be4dda5 100644 --- a/src/aorta/report/pipelines/gemm_pipeline.py +++ b/src/aorta/report/pipelines/gemm_pipeline.py @@ -1,9 +1,11 @@ """GEMM variance analysis pipeline. Orchestrates GEMM kernel variance analysis: +0. Run TraceLens on sweep (optional, default: enabled) 1. Analyze GEMM Reports 2. Enhance with Timestamps (optional) 3. Generate GEMM Plots (optional) +4. 
Generate HTML Report (optional) """ from pathlib import Path @@ -21,6 +23,9 @@ class GemmPipelineConfig: threads: List[int] = field(default_factory=lambda: [256, 512]) channels: List[int] = field(default_factory=lambda: [28, 42, 56, 70]) ranks: List[int] = field(default_factory=lambda: list(range(8))) + skip_tracelens: bool = False # Run TraceLens by default + short_kernel_threshold_us: int = 50 + topk_ops: int = 100 timestamps: bool = True plots: bool = True html: bool = True @@ -56,6 +61,12 @@ def run_gemm_pipeline(config: GemmPipelineConfig) -> GemmPipelineResult: config.output_dir.mkdir(parents=True, exist_ok=True) try: + # Step 0: Run TraceLens on sweep (unless skipped) + if not config.skip_tracelens: + _step_run_tracelens(config, result) + else: + result.steps_skipped.append("tracelens") + # Step 1: Analyze GEMM Reports _step_analyze_gemm(config, result) @@ -90,6 +101,31 @@ def run_gemm_pipeline(config: GemmPipelineConfig) -> GemmPipelineResult: return result +def _step_run_tracelens(config: GemmPipelineConfig, result: GemmPipelineResult) -> None: + """Step 0: Run TraceLens analysis on all configurations in the sweep.""" + from ..analysis import analyze_sweep_config + + if config.verbose: + print("\n" + "=" * 60) + print("STEP 0: Run TraceLens Analysis") + print("=" * 60) + + try: + analyze_sweep_config( + sweep_dir=config.sweep_dir, + skip_tracelens=False, # Always run TraceLens in this step + short_kernel_threshold_us=config.short_kernel_threshold_us, + topk_ops=config.topk_ops, + verbose=config.verbose, + ) + result.steps_completed.append("tracelens") + except Exception as e: + # Don't fail the whole pipeline if TraceLens fails + # The subsequent steps will check if tracelens_analysis/ exists + result.errors.append(f"TraceLens analysis failed: {e}") + result.steps_skipped.append("tracelens (failed)") + + def _step_analyze_gemm(config: GemmPipelineConfig, result: GemmPipelineResult) -> None: """Step 1: Analyze GEMM reports.""" from ..analysis import 
analyze_gemm_reports diff --git a/src/aorta/source_diff.path b/src/aorta/source_diff.path new file mode 100644 index 00000000..ecfaff04 --- /dev/null +++ b/src/aorta/source_diff.path @@ -0,0 +1,555 @@ +diff --git a/scripts/gemm_analysis/run_tracelens_analysis.sh b/scripts/gemm_analysis/run_tracelens_analysis.sh +index c3b654c..1154ec9 100755 +--- a/scripts/gemm_analysis/run_tracelens_analysis.sh ++++ b/scripts/gemm_analysis/run_tracelens_analysis.sh +@@ -264,7 +264,10 @@ else + # trace file in the rank folder to the canonical `trace/pt.trace.json` path. + # This will satisfy TraceLens's requirement of only one `*` being present in the trace pattern + # while also avoiding FileNotFoundErrors due to different filenames. +- find $TRACE_DIR/rank* -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \; ++ # OLD (not idempotent - running twice creates trace/trace/pt.trace.json): ++ # find $TRACE_DIR/rank* -name "*.json" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \; ++ # NEW: -not -path "*/trace/*" ensures this is idempotent (safe to run multiple times) ++ find $TRACE_DIR/rank* -name "*.json" -not -path "*/trace/*" -exec sh -c 'mkdir -p "$(dirname "$0")/trace" && mv "$0" "$(dirname "$0")/trace/pt.trace.json"' {} \; + + TraceLens_generate_multi_rank_collective_report_pytorch \ + --trace_pattern "$TRACE_DIR/rank*/trace/pt.trace.json" \ +diff --git a/src/aorta/report/analysis/__init__.py b/src/aorta/report/analysis/__init__.py +index e0bd324..d8fc003 100644 +--- a/src/aorta/report/analysis/__init__.py ++++ b/src/aorta/report/analysis/__init__.py +@@ -3,12 +3,13 @@ + from .tracelens_wrapper import TraceLensWrapper + from .analyze_gemm import analyze_gemm_reports + from .analyze_single import analyze_single_config +-from .analyze_sweep import analyze_sweep_config ++from .analyze_sweep import analyze_sweep_config, discover_and_run_tracelens + + __all__ = [ + 
"TraceLensWrapper", + "analyze_gemm_reports", + "analyze_single_config", + "analyze_sweep_config", ++ "discover_and_run_tracelens", + ] + +diff --git a/src/aorta/report/analysis/analyze_gemm.py b/src/aorta/report/analysis/analyze_gemm.py +index a724019..56fd19b 100644 +--- a/src/aorta/report/analysis/analyze_gemm.py ++++ b/src/aorta/report/analysis/analyze_gemm.py +@@ -78,9 +78,9 @@ def process_excel_file( + sheet = wb["GEMM"] + + # Expected column positions (0-based indices) +- col_kernel_info = column_letter_to_index("X") # Column X +- col_time_min = column_letter_to_index("AG") # Column AG +- col_time_max = column_letter_to_index("AH") # Column AH ++ col_kernel_info = column_letter_to_index("Y") # Column Y ++ col_time_min = column_letter_to_index("AH") # Column AH ++ col_time_max = column_letter_to_index("AI") # Column AI + + # Read header row to validate column names + rows_data = [] +diff --git a/src/aorta/report/analysis/analyze_single.py b/src/aorta/report/analysis/analyze_single.py +index f961cf3..869fa13 100644 +--- a/src/aorta/report/analysis/analyze_single.py ++++ b/src/aorta/report/analysis/analyze_single.py +@@ -34,14 +34,15 @@ def detect_trace_directory(input_dir: Path) -> Tuple[Path, Path]: + ValueError: If directory structure cannot be determined + """ + # Check if input_dir contains rank directories (i.e., it IS torch_profiler/) +- rank_dirs = list(input_dir.glob("rank*")) ++ # Use is_dir() to filter out files matching rank* pattern (e.g., rank0.log) ++ rank_dirs = [p for p in input_dir.glob("rank*") if p.is_dir()] + if rank_dirs: + return input_dir, input_dir.parent + + # Check if input_dir contains torch_profiler/ subdirectory + torch_prof_dir = input_dir / "torch_profiler" + if torch_prof_dir.exists(): +- rank_dirs = list(torch_prof_dir.glob("rank*")) ++ rank_dirs = [p for p in torch_prof_dir.glob("rank*") if p.is_dir()] + if rank_dirs: + return torch_prof_dir, input_dir + +@@ -55,10 +56,30 @@ def detect_trace_directory(input_dir: Path) -> 
def find_trace_file(rank_dir: Path) -> Optional[Path]:
    """Locate the JSON trace file that belongs to one rank directory.

    Locations are probed from most to least specific; the first location that
    contains a match wins:

    1. ``rank_dir/*.json``        (e.g. ``rank0/foo.json``)
    2. ``rank_dir/trace/*.json``  (e.g. ``rank0/trace/pt.trace.json``)
    3. ``rank_dir/**/*.json``     (any depth below ``rank_dir``)

    Within a location the candidates are sorted and the first one is returned,
    so the choice is reproducible when several JSON files are present instead
    of depending on directory enumeration order. Only regular files are
    considered: a directory whose name ends in ``.json`` or a dangling symlink
    is never returned.

    Args:
        rank_dir: Directory holding the profiler output of a single rank.

    Returns:
        Path of the selected trace file, or ``None`` when no JSON file exists
        under ``rank_dir`` (including when ``rank_dir`` itself is missing).
    """
    # Ordered by priority; the recursive pattern is the catch-all fallback.
    search_patterns = ("*.json", "trace/*.json", "**/*.json")

    for pattern in search_patterns:
        # is_file() follows symlinks, so broken links and directories drop out.
        candidates = sorted(p for p in rank_dir.glob(pattern) if p.is_file())
        if candidates:
            return candidates[0]

    return None
+@@ -169,6 +191,7 @@ def analyze_single_config( + short_kernel_threshold_us: Threshold for short kernel study + topk_ops: Number of top operations to include + verbose: Whether to print verbose output ++ output_prefix: Custom prefix for output files (e.g., "28ch" -> perf_28ch_rank0.xlsx) + + Returns: + Dictionary with paths to generated reports +@@ -239,7 +262,11 @@ def analyze_single_config( + print(f" Skip {rank_name} - no trace file found") + continue + +- output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx" ++ # Use custom prefix if provided (for sweep mode), otherwise default naming ++ if output_prefix: ++ output_file = individual_reports_dir / f"perf_{output_prefix}_rank{rank_num}.xlsx" ++ else: ++ output_file = individual_reports_dir / f"perf_rank{rank_num}.xlsx" + + print(f"\nProcessing {rank_name}...") + print(f" Trace: {trace_file.name}") +@@ -274,8 +301,11 @@ def analyze_single_config( + symlink_path = rank_dir / "trace.json" + if not symlink_path.exists(): + try: +- symlink_path.symlink_to(trace_file.name) +- except (OSError, FileExistsError): ++ # Use relative path from rank_dir to trace_file ++ # This handles cases where trace is in subdirectory (e.g., trace/pt.trace.json) ++ relative_path = trace_file.relative_to(rank_dir) ++ symlink_path.symlink_to(relative_path) ++ except (OSError, FileExistsError, ValueError): + pass # Symlink already exists or cannot be created + + trace_pattern = str(torch_prof_dir / "rank*" / "trace.json") +diff --git a/src/aorta/report/analysis/analyze_sweep.py b/src/aorta/report/analysis/analyze_sweep.py +index 9626769..5db8854 100644 +--- a/src/aorta/report/analysis/analyze_sweep.py ++++ b/src/aorta/report/analysis/analyze_sweep.py +@@ -3,15 +3,131 @@ Sweep configuration analysis - analyze traces from parameter sweep experiments. + + Processes GPU timeline data from TraceLens individual reports across multiple + thread and channel configurations, aggregating across ranks. ++ ++Supports two modes: ++1. 
def discover_and_run_tracelens(
    sweep_dir: Path,
    short_kernel_threshold_us: int = 50,
    topk_ops: int = 100,
    verbose: bool = False,
) -> Path:
    """Run TraceLens individual reports for every thread/channel config of a sweep.

    Input layout that is walked::

        sweep_dir/<name containing "thread">/<name containing "channel">/torch_profiler/

    Reports for every channel config of a thread config are written beneath
    ``sweep_dir/tracelens_analysis/<thread dir name>/`` and are told apart by
    an ``output_prefix`` of ``"<N>ch"``, where ``N`` is the first run of digits
    in the channel directory name (``"0"`` when the name has no digits).

    A channel config without a ``torch_profiler/`` directory is skipped, and an
    exception raised while analysing one config is printed and does not stop
    the remaining configs.

    Args:
        sweep_dir: Sweep root containing the thread directories.
        short_kernel_threshold_us: Forwarded to ``analyze_single_config``.
        topk_ops: Forwarded to ``analyze_single_config``.
        verbose: Forwarded to ``analyze_single_config``; progress lines of this
            function are printed regardless.

    Returns:
        ``sweep_dir / "tracelens_analysis"``.

    Raises:
        ValueError: If ``sweep_dir`` has no sub-directory whose name contains
            ``"thread"``.
    """
    banner = "=" * 80
    root = Path(sweep_dir)
    reports_root = root / "tracelens_analysis"

    # Thread configurations, e.g. "256thread", "512thread".
    thread_configs = sorted(
        entry for entry in root.iterdir()
        if entry.is_dir() and "thread" in entry.name
    )
    if not thread_configs:
        raise ValueError(f"No thread configurations found in {sweep_dir}")

    print(banner)
    print("Step 0: Running TraceLens on All Configurations")
    print(banner)
    print(f"\nDiscovered thread configs: {[cfg.name for cfg in thread_configs]}")

    for thread_config in thread_configs:
        thread_label = thread_config.name

        # Channel configurations, e.g. "nccl_28channels".
        channel_configs = sorted(
            entry for entry in thread_config.iterdir()
            if entry.is_dir() and "channel" in entry.name
        )
        if not channel_configs:
            print(f" [WARN] No channel configs in {thread_label}")
            continue

        print(f"\n{thread_label}: {[cfg.name for cfg in channel_configs]}")

        for channel_config in channel_configs:
            channel_label = channel_config.name

            # "nccl_28channels" -> "28"; names without digits fall back to "0".
            digits = re.search(r"(\d+)", channel_label)
            channel_id = "0" if digits is None else digits.group(1)

            profiler_dir = channel_config / "torch_profiler"
            if not profiler_dir.exists():
                print(f" [SKIP] {channel_label} - no torch_profiler/")
                continue

            print(f" Processing {channel_label}...")

            try:
                analyze_single_config(
                    input_dir=profiler_dir,
                    # Shared per thread config; the prefix keeps files apart.
                    output_dir=reports_root / thread_label,
                    run_individual=True,
                    run_collective=False,  # collective reports are not produced for sweeps
                    aggregate_timeline=False,  # aggregation happens at sweep level
                    short_kernel_threshold_us=short_kernel_threshold_us,
                    topk_ops=topk_ops,
                    verbose=verbose,
                    output_prefix=f"{channel_id}ch",
                )
                print(f" [OK] {channel_label}")
            except Exception as exc:
                # Keep going so one broken config does not abort the sweep.
                print(f" [ERROR] {channel_label}: {exc}")

    print("\n" + banner)
    print("TraceLens Analysis Complete")
    print(banner)

    return reports_root
++ ++ By default, runs TraceLens analysis on all thread/channel configurations first, ++ then aggregates GPU timeline data. Use skip_tracelens=True to only aggregate ++ existing reports. + + Args: +- sweep_dir: Path to sweep directory containing tracelens_analysis/ ++ sweep_dir: Path to sweep directory with thread/channel subdirectories + output_dir: Output directory (default: sweep_dir/tracelens_analysis/) + use_geo_mean: If True, use geometric mean; otherwise use arithmetic mean ++ skip_tracelens: If True, skip TraceLens analysis (only aggregate existing) ++ short_kernel_threshold_us: Threshold for short kernel study ++ topk_ops: Number of top operations to include + verbose: Whether to print verbose output + + Returns: +@@ -317,6 +443,16 @@ def analyze_sweep_config( + sweep_path = Path(sweep_dir) + tracelens_dir = sweep_path / "tracelens_analysis" + ++ # Step 1: Run TraceLens on all configurations (unless skipped) ++ if not skip_tracelens: ++ discover_and_run_tracelens( ++ sweep_dir=sweep_path, ++ short_kernel_threshold_us=short_kernel_threshold_us, ++ topk_ops=topk_ops, ++ verbose=verbose, ++ ) ++ ++ # Step 2: Aggregate results + if not tracelens_dir.exists(): + raise FileNotFoundError( + f"tracelens_analysis directory not found in {sweep_dir}" +diff --git a/src/aorta/report/analysis/cli.py b/src/aorta/report/analysis/cli.py +index 232f8d1..e51205c 100644 +--- a/src/aorta/report/analysis/cli.py ++++ b/src/aorta/report/analysis/cli.py +@@ -75,18 +75,43 @@ def analyze_single(ctx, trace_dir, individual_only, collective_only, geo_mean, + + @analyze.command("sweep") + @click.argument("sweep_dir", type=click.Path(exists=True)) ++@click.option("--skip-tracelens", is_flag=True, ++ help="Skip TraceLens analysis, only aggregate existing reports") + @click.option("--geo-mean", is_flag=True, help="Use geometric mean instead of arithmetic mean") ++@click.option("--short-kernel-threshold", default=50, type=int, ++ help="Threshold for short kernel study (microseconds)") 
++@click.option("--topk-ops", default=100, type=int, ++ help="Number of top operations to include") + @click.option("-o", "--output", type=click.Path(), help="Output directory") + @click.pass_context +-def analyze_sweep(ctx, sweep_dir, geo_mean, output): ++def analyze_sweep(ctx, sweep_dir, skip_tracelens, geo_mean, short_kernel_threshold, ++ topk_ops, output): + """Analyze a sweep directory with multiple configurations. + +- SWEEP_DIR: Path to the sweep directory containing tracelens_analysis/ +- with multiple thread/channel configs. ++ SWEEP_DIR: Path to the sweep directory containing thread/channel subdirectories. ++ ++ By default, runs TraceLens analysis on all configurations first, then ++ aggregates the results. Use --skip-tracelens to only aggregate existing reports. ++ ++ \b ++ Expected directory structure: ++ sweep_dir/ ++ ├── 256thread/ ++ │ ├── nccl_28channels/ ++ │ │ └── torch_profiler/rank*/ ++ │ └── nccl_42channels/ ++ └── 512thread/ ++ └── ... + + \b + Examples: ++ # Run TraceLens + aggregate (default) + aorta-report analyze sweep /path/to/sweep_20251124 ++ ++ # Only aggregate existing reports ++ aorta-report analyze sweep /path/to/sweep --skip-tracelens ++ ++ # With geometric mean aggregation + aorta-report analyze sweep /path/to/sweep --geo-mean + """ + from . 
import analyze_sweep_config +@@ -99,6 +124,9 @@ def analyze_sweep(ctx, sweep_dir, geo_mean, output): + sweep_dir=Path(sweep_dir), + output_dir=Path(output) if output else None, + use_geo_mean=geo_mean, ++ skip_tracelens=skip_tracelens, ++ short_kernel_threshold_us=short_kernel_threshold, ++ topk_ops=topk_ops, + verbose=verbose, + ) + if not quiet and output_path: +diff --git a/src/aorta/report/pipelines/cli.py b/src/aorta/report/pipelines/cli.py +index a805996..e8cb79d 100644 +--- a/src/aorta/report/pipelines/cli.py ++++ b/src/aorta/report/pipelines/cli.py +@@ -165,12 +165,29 @@ def pipeline_summary( + "--sweep-dir", + required=True, + type=click.Path(exists=True), +- help="Sweep directory containing tracelens_analysis/", ++ help="Sweep directory with thread/channel subdirectories", + ) + @click.option( + "-o", "--output", required=True, type=click.Path(), help="Output directory for results" + ) +-@click.option("--top-k", default=5, type=int, help="Number of top kernels to extract (default: 5)") ++@click.option( ++ "--skip-tracelens", ++ is_flag=True, ++ help="Skip TraceLens analysis, use existing reports", ++) ++@click.option( ++ "--short-kernel-threshold", ++ default=50, ++ type=int, ++ help="Threshold for short kernel study in microseconds (default: 50)", ++) ++@click.option( ++ "--topk-ops", ++ default=100, ++ type=int, ++ help="Number of top operations to include in TraceLens analysis (default: 100)", ++) ++@click.option("--top-k", default=5, type=int, help="Number of top GEMM kernels to extract (default: 5)") + @click.option( + "--threads", + "-t", +@@ -193,22 +210,29 @@ def pipeline_summary( + @click.option("--plots/--no-plots", default=True, help="Generate plots (default: True)") + @click.option("--html/--no-html", default=True, help="Generate HTML report (default: True)") + @click.pass_context +-def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, plots, html): ++def pipeline_gemm(ctx, sweep_dir, output, skip_tracelens, 
short_kernel_threshold, topk_ops, ++ top_k, threads, channels, timestamps, plots, html): + """Run GEMM variance analysis pipeline. + +- Analyzes GEMM kernel time variance across configurations: ++ By default, runs TraceLens on all configurations first, then analyzes GEMM kernels. ++ Use --skip-tracelens to use existing TraceLens reports. + + \b +- 1. Analyze GEMM reports to extract top-K kernels with highest variance +- 2. Enhance with timestamps (optional) +- 3. Generate variance plots (optional) +- 4. Generate HTML report (optional) ++ Steps: ++ 0. Run TraceLens on sweep (default, skip with --skip-tracelens) ++ 1. Analyze GEMM reports to extract top-K kernels with highest variance ++ 2. Enhance with timestamps (optional) ++ 3. Generate variance plots (optional) ++ 4. Generate HTML report (optional) + + \b + Examples: +- # Full pipeline ++ # Full pipeline (runs TraceLens + GEMM analysis) + aorta-report pipeline gemm --sweep-dir /path/to/sweep -o /path/to/output + ++ # Skip TraceLens, use existing reports ++ aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --skip-tracelens ++ + # Custom top-k + aorta-report pipeline gemm --sweep-dir /path/to/sweep -o ./output --top-k 10 + +@@ -234,6 +258,9 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, + top_k=top_k, + threads=list(threads), + channels=list(channels), ++ skip_tracelens=skip_tracelens, ++ short_kernel_threshold_us=short_kernel_threshold, ++ topk_ops=topk_ops, + timestamps=timestamps, + plots=plots, + html=html, +@@ -246,7 +273,8 @@ def pipeline_gemm(ctx, sweep_dir, output, top_k, threads, channels, timestamps, + click.echo("=" * 60) + click.echo(f"Sweep dir: {sweep_dir}") + click.echo(f"Output: {output}") +- click.echo(f"Top-K: {top_k}") ++ click.echo(f"TraceLens: {'skip' if skip_tracelens else 'run'}") ++ click.echo(f"Top-K GEMM kernels: {top_k}") + click.echo(f"Threads: {list(threads)}") + click.echo(f"Channels: {list(channels)}") + click.echo(f"Options: 
def _step_run_tracelens(config: GemmPipelineConfig, result: GemmPipelineResult) -> None:
    """Step 0: produce TraceLens reports for every configuration in the sweep.

    Delegates to ``analyze_sweep_config`` with ``skip_tracelens=False`` and the
    thresholds carried on ``config``. The outcome is recorded on ``result``
    instead of being raised:

    * success -> ``"tracelens"`` is appended to ``result.steps_completed``
    * failure -> a message is appended to ``result.errors`` and
      ``"tracelens (failed)"`` to ``result.steps_skipped``

    Args:
        config: Pipeline settings; reads ``sweep_dir``, ``verbose``,
            ``short_kernel_threshold_us`` and ``topk_ops``.
        result: Accumulator updated in place with the step outcome.
    """
    # Imported lazily, at call time, like the other pipeline steps.
    from ..analysis import analyze_sweep_config

    if config.verbose:
        rule = "=" * 60
        print("\n" + rule)
        print("STEP 0: Run TraceLens Analysis")
        print(rule)

    try:
        analyze_sweep_config(
            sweep_dir=config.sweep_dir,
            # This step exists to run TraceLens, so it is never skipped here;
            # the caller decides whether to invoke the step at all.
            skip_tracelens=False,
            short_kernel_threshold_us=config.short_kernel_threshold_us,
            topk_ops=config.topk_ops,
            verbose=config.verbose,
        )
        result.steps_completed.append("tracelens")
    except Exception as exc:
        # Best effort: a TraceLens failure is reported but does not abort the
        # pipeline.
        # NOTE(review): later steps are presumed to detect a missing
        # tracelens_analysis/ directory themselves - confirm.
        result.errors.append(f"TraceLens analysis failed: {exc}")
        result.steps_skipped.append("tracelens (failed)")