diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py
new file mode 100644
index 000000000..9dd5de14b
--- /dev/null
+++ b/.github/scripts/benchmark_validate.py
@@ -0,0 +1,404 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+"""
+Benchmark Validator for GitHub Actions
+
+Compares two benchmark JSON outputs (baseline vs target), checks thresholds,
+writes a Markdown summary, and optionally posts a PR comment on failure.
+
+This single script replaces the previous three-step pipeline:
+  compare_disk_index_json_output.py → csv_to_markdown.py → benchmark_result_parse.py
+
+Usage:
+    # PR mode (directional thresholds, posts PR comment on failure)
+    python benchmark_validate.py --mode pr --baseline baseline.json --target target.json
+
+    # A/A mode (symmetric thresholds)
+    python benchmark_validate.py --mode aa --baseline baseline.json --target target.json
+
+Environment Variables (for PR comments):
+    GITHUB_TOKEN: GitHub token for API access
+    GITHUB_REPOSITORY: Owner/repo (e.g., "microsoft/DiskANN")
+    GITHUB_PR_NUMBER: Pull request number
+    GITHUB_RUN_ID: Workflow run ID for linking to logs
+    GITHUB_STEP_SUMMARY: Path to step summary file
+"""
+
+import json
+import os
+import sys
+import argparse
+from typing import Any
+from urllib.request import urlopen, Request
+from urllib.error import URLError
+
+
+# =============================================================================
+# JSON Extraction
+# =============================================================================
+
+def load_json(path: str) -> list[dict[str, Any]]:  # parse one benchmark output file as UTF-8 JSON
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)  # NOTE(review): the top level is not checked to be a list as annotated; main() relies on len()
+
+
+def extract_build_metrics(results: dict) -> dict[str, float]:
+    """Collect build timings (seconds) from a job's ``results``; absent values are omitted."""
+    build = results.get("build", {})
+    if not build:
+        return {}
+    # Span name found in the benchmark output -> metric name used in the report.
+    span_to_metric = {
+        "DiskIndexBuild-PqConstruction": "pq_construction_time",
+        "DiskIndexBuild-InmemIndexBuild": "inmem_index_build_time",
+        "DiskIndexBuild-DiskLayout": "disk_layout_time",
+    }
+    metrics: dict[str, float] = {}
+    build_time = build.get("build_time")
+    if build_time is not None:  # keep a legitimate 0 instead of silently dropping it
+        metrics["total_time"] = build_time / 1e6  # μs → s
+
+    for span in build.get("span_metrics", {}).get("spans", []):
+        metric = span_to_metric.get(span.get("span_name", ""))
+        duration = span.get("metrics", {}).get("duration_seconds")
+        if metric is not None and duration is not None:
+            metrics[metric] = duration  # never store None: the declared value type is float
+    return metrics
+
+
+def extract_search_metrics(results: dict, search_l: int, beam_width: int) -> dict[str, float]:
+    """Collect search metrics for one (search_l, beam_width) run; absent values are omitted.
+
+    Values come from the matching ``search_results_per_l`` entry and are then
+    overridden by the ``search-with-L=<L>-bw=<bw>`` span when it reports them.
+    """
+    search = results.get("search", {})
+    if not search:
+        return {}
+
+    # Report metric name -> field name in a ``search_results_per_l`` entry.
+    field_for = {
+        "qps": "qps", "recall": "recall", "mean_latency": "mean_latency", "mean_ios": "mean_ios",
+        "mean_comps": "mean_comparisons", "mean_hops": "mean_hops", "mean_io_time": "mean_io_time",
+        "mean_cpus": "mean_cpu_time", "latency_95": "p95_latency",
+    }
+    metrics: dict[str, float] = {}
+    for sr in search.get("search_results_per_l", []):
+        if sr.get("search_l") == search_l:
+            metrics = {k: sr[f] for k, f in field_for.items() if sr.get(f) is not None}
+            break
+
+    # Span values take precedence when present; latency_95 is never read from the span.
+    span_name = f"search-with-L={search_l}-bw={beam_width}"
+    for span in search.get("span_metrics", {}).get("spans", []):
+        if span.get("span_name") == span_name:
+            data = span.get("metrics", {})
+            for key in field_for:
+                if key != "latency_95" and data.get(key) is not None:
+                    metrics[key] = data[key]  # a null span value must not erase a real one
+            break
+
+    return metrics
+
+
+def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dict]:
+    """
+    Compare baseline and target JSONs, pairing their entries by position.
+
+    Search parameters (first ``search_list`` value, ``beam_width``) are read from
+    the target entry's ``input``; missing values default to L=200, bw=4.
+    Metrics absent from either side are skipped.
+
+    Returns a flat list of metric diffs:
+        [{category, metric, baseline, target, deviation}, ...]
+    where ``deviation`` is the percent change of target relative to baseline.
+    """
+    build_keys = ("total_time", "pq_construction_time", "inmem_index_build_time", "disk_layout_time")
+    search_keys = ("qps", "recall", "mean_latency", "latency_95", "mean_ios",
+                   "mean_comps", "mean_hops", "mean_io_time", "mean_cpus")
+    rows = []
+
+    for baseline, target in zip(baseline_json, target_json):
+        b_results = baseline.get("results", {})
+        t_results = target.get("results", {})
+
+        search_phase = target.get("input", {}).get("content", {}).get("search_phase", {})
+        search_list = search_phase.get("search_list", [200])
+        beam_width = search_phase.get("beam_width", 4)
+        primary_l = search_list[0] if search_list else 200
+
+        sections = (
+            ("index-build statistics", build_keys,
+             extract_build_metrics(b_results), extract_build_metrics(t_results)),
+            (f"search-with-L={primary_l}-bw={beam_width}", search_keys,
+             extract_search_metrics(b_results, primary_l, beam_width),
+             extract_search_metrics(t_results, primary_l, beam_width)),
+        )
+        for category, keys, b_metrics, t_metrics in sections:
+            for key in keys:
+                bv, tv = b_metrics.get(key), t_metrics.get(key)
+                if bv is None or tv is None:
+                    continue  # skip metrics missing from either side
+                rows.append({
+                    "category": category,
+                    "metric": key,
+                    "baseline": bv,
+                    "target": tv,
+                    "deviation": _percent_change(bv, tv),
+                })
+
+    return rows
+
+
+def _percent_change(baseline: float, target: float) -> float:
+    """Percent change of target vs baseline; signed infinity when baseline is 0 but target is not."""
+    if baseline:
+        return (target - baseline) / baseline * 100
+    # No finite relative change exists for a zero baseline; reporting 0 would hide the move.
+    return 0.0 if target == baseline else float("inf") * (1 if target > baseline else -1)
+
+
+# =============================================================================
+# Thresholds
+# =============================================================================
+
+# Format: [max_deviation_%, direction, contract_value]
+#   direction: 'GT' = higher is better, 'LT' = lower is better
+#   contract_value: absolute limit checked against the target value (empty string = none)
+THRESHOLDS: dict[str, dict[str, list]] = {  # diff category -> metric name -> [pct, direction, contract]
+    "DiskIndexBuild-PqConstruction": {  # NOTE(review): compute_diff() never emits this category, so it is never consulted
+        "duration_seconds": [10, "LT", ""],
+        "peak_memory_usage": [10, "LT", ""],
+    },
+    "DiskIndexBuild-InmemIndexBuild": {  # NOTE(review): never emitted by compute_diff() either
+        "duration_seconds": [10, "LT", ""],
+        "peak_memory_usage": [10, "LT", ""],
+    },
+    "search_disk_index-search_completed": {  # NOTE(review): never emitted by compute_diff() either
+        "duration_seconds": [10, "LT", ""],
+        "peak_memory_usage": [10, "LT", 1.42],
+    },
+    "disk_index_perf_test": {  # NOTE(review): never emitted by compute_diff() either
+        "total_duration_seconds": [10, "LT", ""],
+    },
+    "index-build statistics": {
+        # Calibrated from 5 GitHub runner runs (10 observations):
+        #   Wikipedia: 35.9–37.2s, OpenAI: 23.0–76.4s (SQ_1_2.0 variance)
+        #   Contract: worst × 1.5 to absorb shared-runner variance
+        "total_time": [10, "LT", 115],  # the only build metric from compute_diff() that has a threshold here
+        "total_comparisons": [1, "LT", ""],  # NOTE(review): compute_diff() emits no metric with this name
+        "search_hops": [1, "LT", ""],  # NOTE(review): compute_diff() emits no metric with this name
+    },
+    "search-with-L=200-bw=4": {  # matches only when the input's first search_list value is 200 and beam_width is 4
+        "latency_95": [15, "LT", ""],   # wider — p95 latency is noisy on shared runners
+        "mean_latency": [10, "LT", ""],
+        "mean_io_time": [10, "LT", ""],
+        "mean_cpus": [15, "LT", ""],
+        "qps": [10, "GT", ""],
+        "mean_ios": [10, "LT", ""],
+        "mean_comps": [10, "LT", ""],
+        "mean_hops": [10, "LT", ""],
+        "recall": [1, "GT", ""],
+    },
+}
+
+
+def allowed_range(threshold: float, direction: str, mode: str) -> tuple[float, float]:
+    """Return the (low, high) percent-change window: symmetric in "aa" mode, one-sided otherwise."""
+    inf = float("inf")
+    if mode == "aa":
+        return (-threshold, threshold)
+    # Other modes bound only the "worse" side: a drop for GT, a rise for anything else.
+    return (-threshold, inf) if direction == "GT" else (-inf, threshold)
+
+
+def fmt_range(lo: float, hi: float) -> str:
+    """Format a range as "(lo% – hi%)", writing infinite bounds as -inf / inf without a percent sign."""
+    bounds = ("-inf" if lo == float("-inf") else f"{lo}%", "inf" if hi == float("inf") else f"{hi}%")
+    return "({} – {})".format(*bounds)
+
+
+def check_contract(value: float, contract: Any, direction: str) -> tuple[bool, str]:
+    """Test value against a hard limit; returns (broken, contract text). An empty string means no limit."""
+    if contract == "":
+        return False, "N/A"
+    limit = float(contract)
+    if direction == "GT":
+        broken, sign = value < limit, ">"
+    else:
+        broken, sign = direction == "LT" and value > limit, "<"
+    return (True, f"{sign} {limit}") if broken else (False, str(limit))
+
+
+# =============================================================================
+# Validation
+# =============================================================================
+
+def validate(diffs: list[dict], mode: str, run_id: str | None) -> tuple[bool, str]:
+    """
+    Check every diff that has a THRESHOLDS entry; diffs without one are ignored.
+
+    A diff fails when its deviation leaves the allowed range for ``mode`` or when
+    its target value breaks the metric's contract. Each failure is printed to stdout.
+    Returns (has_failures, markdown_report); the report is "" when nothing failed.
+    """
+    table_rows: list[str] = []
+
+    for entry in diffs:
+        category, name = entry["category"], entry["metric"]
+        limits = THRESHOLDS.get(category, {}).get(name)
+        if limits is None:
+            continue
+
+        # Each entry is [max_deviation_%, direction, contract_value].
+        max_pct, direction, contract = limits
+        low, high = allowed_range(max_pct, direction, mode)
+        change = entry["deviation"]
+        out_of_range = change < low or change > high
+        contract_broken, contract_text = check_contract(entry["target"], contract, direction)
+
+        # Console lines keep their order: threshold message first, then contract message.
+        outcomes = []
+        if out_of_range:
+            print(f"THRESHOLD FAILED: {category}/{name} change={change:.2f}% allowed={fmt_range(low, high)}")
+            outcomes.append("Regression detected")
+        if contract_broken:
+            print(f"CONTRACT BROKEN:  {category}/{name} value={entry['target']} required={contract_text}")
+            outcomes.append("Contract broken")
+        if outcomes:
+            table_rows.append(
+                f"| {category}/{name} | {entry['baseline']:.4g} | {entry['target']:.4g} | "
+                f"{contract_text} | {change:.2f}% | {fmt_range(low, high)} | {', '.join(outcomes)} |"
+            )
+
+    if not table_rows:
+        return False, ""
+
+    sections = ["### ❌ Benchmark Check Failed\n\n"]
+    if run_id:
+        # Link to this run's logs; the repository falls back to microsoft/DiskANN when unset.
+        repo = os.getenv("GITHUB_REPOSITORY", "microsoft/DiskANN")
+        logs_link = f"https://github.com/{repo}/actions/runs/{run_id}"
+        sections.append(f"Please investigate the [workflow logs]({logs_link}) to determine if the failure is due to your changes.\n\n")
+    sections.append("| Metric | Baseline | Current | Contract | Change | Allowed | Outcome |\n")
+    sections.append("|--------|----------|---------|----------|--------|---------|--------|\n")
+    sections.append("\n".join(table_rows))
+
+    return True, "".join(sections)
+
+
+# =============================================================================
+# Markdown output
+# =============================================================================
+
+def diffs_to_markdown(diffs: list[dict], title: str) -> str:
+    """Render all diffs as a Markdown table under a level-3 heading.
+
+    Baseline/target use 4 significant digits; the change is a signed percentage.
+    """
+    header = f"### {title}\n\n"
+    header += "| Category | Metric | Baseline | Current | Change |\n"
+    header += "|----------|--------|----------|---------|--------|"
+    body = [
+        f"| {row['category']} | {row['metric']} | {row['baseline']:.4g} | "
+        f"{row['target']:.4g} | {row['deviation']:+.2f}% |"
+        for row in diffs
+    ]
+    return "\n".join([header, *body])
+
+
+# =============================================================================
+# GitHub helpers (stdlib only — no requests dependency)
+# =============================================================================
+
+def post_pr_comment(body: str) -> bool:
+    """Post *body* as a comment on the pull request named by the GitHub env vars; True on success."""
+    token = os.getenv("GITHUB_TOKEN")
+    repo = os.getenv("GITHUB_REPOSITORY")
+    pr = os.getenv("GITHUB_PR_NUMBER")
+    if not all([token, repo, pr]):
+        print("WARNING: Missing GitHub env vars for PR comment "
+              f"(TOKEN={'set' if token else 'missing'}, REPO={repo or 'missing'}, PR={pr or 'missing'})")
+        return False
+    url = f"https://api.github.com/repos/{repo}/issues/{pr}/comments"
+    data = json.dumps({"body": body}).encode()
+    req = Request(url, data=data, method="POST", headers={
+        "Accept": "application/vnd.github+json",
+        "Authorization": f"Bearer {token}",
+        "X-GitHub-Api-Version": "2022-11-28",
+        "Content-Type": "application/json",
+    })
+    try:
+        with urlopen(req, timeout=30) as resp:
+            if resp.status < 300:
+                print(f"Posted comment to PR #{pr}")
+                return True
+    except OSError as e:  # URLError/HTTPError are OSError subclasses; also covers raw socket timeouts/resets
+        print(f"ERROR posting PR comment: {e}")
+    return False
+
+
+def write_step_summary(content: str) -> None:
+    """Append content plus a newline to the file named by GITHUB_STEP_SUMMARY; no-op when unset or empty."""
+    if summary_path := os.getenv("GITHUB_STEP_SUMMARY"):
+        with open(summary_path, "a", encoding="utf-8") as out:
+            out.write(content + "\n")
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main() -> int:
+    """Run the CLI and return the exit code: 0 = all checks passed, 1 = failure or unusable input."""
+    parser = argparse.ArgumentParser(
+        description="Compare two benchmark JSONs, validate thresholds, output Markdown."
+    )
+    parser.add_argument("--mode", choices=["aa", "pr"], default="aa",
+                        help="aa = symmetric thresholds, pr = directional")
+    parser.add_argument("--baseline", required=True, help="Baseline JSON path")
+    parser.add_argument("--target", required=True, help="Target JSON path")
+    parser.add_argument("--title", default="Benchmark Results",
+                        help="Title for the Markdown summary table")
+    parser.add_argument("--no-comment", action="store_true",
+                        help="Skip posting PR comment on failure")
+    args = parser.parse_args()
+
+    print(f"Mode: {args.mode}")
+    print(f"Baseline: {args.baseline}")
+    print(f"Target:   {args.target}")
+
+    baseline = load_json(args.baseline)
+    target = load_json(args.target)
+    # Entries are paired by position, so both files must hold the same number of entries.
+    if len(baseline) != len(target):
+        print(f"ERROR: JSON arrays differ in length: {len(baseline)} vs {len(target)}")
+        return 1
+
+    diffs = compute_diff(baseline, target)
+    print(f"\nCompared {len(diffs)} metrics")
+    if not diffs:
+        # Nothing comparable (e.g. output schema drift) must not be reported as a pass.
+        print("ERROR: No comparable metrics found in baseline/target")
+        write_step_summary("### ❌ Benchmark Check Failed\n\nNo comparable metrics found.")
+        return 1
+    write_step_summary(diffs_to_markdown(diffs, args.title))
+
+    # Validate thresholds
+    has_failures, report = validate(diffs, args.mode, os.getenv("GITHUB_RUN_ID"))
+    if has_failures:
+        print("\n" + report)
+        write_step_summary(report)
+        if args.mode == "pr" and not args.no_comment:
+            post_pr_comment(report)  # best-effort: its result does not change the exit code
+        return 1
+
+    print("\n✅ All metrics within thresholds")
+    write_step_summary("### ✅ Benchmark Check Passed\n\nAll metrics within acceptable thresholds.")
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the process exit status
+    sys.exit(main())
diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
new file mode 100644
index 000000000..c8256bd59
--- /dev/null
+++ b/.github/workflows/benchmarks-aa.yml
@@ -0,0 +1,241 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+# DiskANN Daily A/A Benchmark Stability Test
+#
+# Runs main vs main at 9 AM UTC every day to detect environment noise.
+# If any threshold is breached, a GitHub issue is created to notify @microsoft/diskann-admin.
+# Can also be triggered manually for debugging.
+
+name: Benchmarks (A/A)
+
+on:
+  schedule:
+    # Daily at 9 AM UTC
+    - cron: '0 9 * * *'
+  workflow_dispatch:  # Allow manual trigger for debugging
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+  issues: write  # Required for creating failure notification issues
+
+jobs:
+  # A/A benchmark: Wikipedia-100K dataset (main vs main)
+  aa-wikipedia-100K:
+    name: A/A - Wikipedia 100K
+    runs-on: ubuntu-latest
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout main (target)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout main (baseline)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: baseline
+          lfs: true
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: stable
+
+      - name: Cache Rust dependencies (target)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: aa-target
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: aa-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config
+
+      # Download pre-packaged Wikipedia-100K dataset from GitHub Release (fails fast on HTTP errors, retries transient ones)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
+      - name: Download wikipedia-100K dataset
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          curl --fail --retry 3 --retry-delay 5 -L -o wikipedia-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/wikipedia-100K.tar.gz
+          tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+
+      - name: Run target benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
+
+      - name: Validate benchmark results
+        run: |
+          python diskann_rust/.github/scripts/benchmark_validate.py \
+            --mode aa \
+            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
+            --title 'A/A Results: Wikipedia-100K Dataset'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: aa-results-wikipedia-100K
+          path: |
+            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
+            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+          retention-days: 30
+
+  # A/A benchmark: OpenAI ArXiv 100K dataset (main vs main)
+  aa-openai-100K:
+    name: A/A - OAI ArXiv 100K
+    runs-on: ubuntu-latest
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout main (target)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout main (baseline)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: baseline
+          lfs: true
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: stable
+
+      - name: Cache Rust dependencies (target)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: aa-target
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: aa-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config
+
+      # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release (fails fast on HTTP errors, retries transient ones)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
+      - name: Download openai-100K dataset
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          curl --fail --retry 3 --retry-delay 5 -L -o openai-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/openai-100K.tar.gz
+          tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_baseline.json
+
+      - name: Run target benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_target.json
+
+      - name: Validate benchmark results
+        run: |
+          python diskann_rust/.github/scripts/benchmark_validate.py \
+            --mode aa \
+            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
+            --title 'A/A Results: OpenAI ArXiv 100K Dataset'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: aa-results-openai-100K
+          path: |
+            diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json
+            baseline/target/tmp/openai-100K_benchmark_crate_baseline.json
+          retention-days: 30
+
+  # Notify diskann-admin when either A/A job fails (for any reason, not only threshold breaches)
+  notify-on-failure:
+    name: Notify on A/A Failure
+    needs: [aa-wikipedia-100K, aa-openai-100K]
+    runs-on: ubuntu-latest
+    if: failure()
+    steps:
+      - name: Create GitHub issue for A/A failure
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const date = new Date().toISOString().split('T')[0];
+            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: `[Benchmark A/A] Daily stability test failed – ${date}`,
+              body: [
+                `## Daily A/A Benchmark Failure`,
+                ``,
+                `The scheduled A/A benchmark run (main vs main) **failed** on ${date}.`,
+                `This may indicate environment noise exceeding the configured thresholds, or a build/download failure.`,
+                ``,
+                `**Run:** ${runUrl}`,
+                ``,
+                `Please review the benchmark artifacts and determine if thresholds need tuning`,
+                `or if there is a runner environment issue.`,
+                ``,
+                `/cc @microsoft/diskann-admin`,
+              ].join('\n'),
+              labels: ['benchmark', 'A/A-failure'],
+            });
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 000000000..d75e0efe7
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,237 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+# DiskANN Benchmarks Workflow
+#
+# This workflow runs macro benchmarks comparing the current branch against a baseline.
+# It runs on pull requests to main that touch the listed paths (baseline: main) and can be triggered manually with a baseline reference (branch, tag, or commit).
+
+name: Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      baseline_ref:
+        description: 'A branch, commit SHA, or tag name to compare the current branch with'
+        required: true
+        default: 'main'
+        type: string
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'diskann/**'
+      - 'diskann-disk/**'
+      - 'diskann-linalg/**'
+      - 'diskann-providers/**'
+      - 'diskann-quantization/**'
+      - 'diskann-vector/**'
+      - 'diskann-wide/**'
+      - 'diskann-utils/**'
+      - 'diskann-platform/**'
+      - 'diskann-label-filter/**'
+      - 'diskann-benchmark/**'
+      - '.github/workflows/benchmarks.yml'
+      - '.github/scripts/benchmark_validate.py'
+
+# Cancel in-progress runs when a new run is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+  pull-requests: write  # Required for posting PR comments
+
+jobs:
+  # Macro benchmark: Wikipedia-100K dataset
+  macro-benchmark-wikipedia-100K:
+    name: Macro Benchmark - Wikipedia 100K
+    runs-on: ubuntu-latest
+    # TODO: For production benchmarks, consider using a self-hosted runner with:
+    # - NVMe storage for consistent I/O performance
+    # - CPU pinning (taskset) for reduced variance
+    # - Dedicated hardware to avoid noisy neighbor effects
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout current branch
+        uses: actions/checkout@v4
+        with:
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout baseline (${{ inputs.baseline_ref || 'main' }})
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.baseline_ref || 'main' }}
+          path: baseline
+          lfs: true
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: stable
+
+      - name: Cache Rust dependencies (current)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: benchmark-current
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: benchmark-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config
+
+      # Download pre-packaged Wikipedia-100K dataset from GitHub Release (fails fast on HTTP errors, retries transient ones)
+      # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
+      - name: Download wikipedia-100K dataset
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          curl --fail --retry 3 --retry-delay 5 -L -o wikipedia-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/wikipedia-100K.tar.gz
+          tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          # Note: For accurate benchmarks, consider using CPU pinning on self-hosted runners:
+          # sudo taskset -c 0,2,4,6 ionice -c 1 -n 0 cargo run ...
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+
+      - name: Run current branch benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
+
+      - name: Validate benchmark results
+        run: |
+          python diskann_rust/.github/scripts/benchmark_validate.py \
+            --mode pr \
+            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
+            --title 'Benchmark Results: Wikipedia-100K Dataset'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()  # Upload even if validation fails
+        with:
+          name: benchmark-results-wikipedia-100K
+          path: |
+            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
+            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+          retention-days: 30
+
+  # Macro benchmark: OpenAI ArXiv dataset
+  macro-benchmark-oai-large:
+    name: Macro Benchmark - OAI ArXiv 100K
+    runs-on: ubuntu-latest
+    # TODO: For production benchmarks, consider using a self-hosted runner
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout current branch
+        uses: actions/checkout@v4
+        with:
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout baseline (${{ inputs.baseline_ref || 'main' }})
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.baseline_ref || 'main' }}
+          path: baseline
+          lfs: true
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: stable
+
+      - name: Cache Rust dependencies (current)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: benchmark-current
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: benchmark-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config
+
+      # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release (fails fast on HTTP errors, retries transient ones)
+      # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
+      - name: Download openai-100K dataset
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          curl --fail --retry 3 --retry-delay 5 -L -o openai-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/openai-100K.tar.gz
+          tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_baseline.json
+
+      - name: Run current branch benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_target.json
+
+      - name: Validate benchmark results
+        run: |
+          python diskann_rust/.github/scripts/benchmark_validate.py \
+            --mode pr \
+            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
+            --title 'Benchmark Results: OpenAI ArXiv 100K Dataset'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()  # Upload even if validation fails
+        with:
+          name: benchmark-results-openai-100K
+          path: |
+            diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json
+            baseline/target/tmp/openai-100K_benchmark_crate_baseline.json
+          retention-days: 30
diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
new file mode 100644
index 000000000..d021640fc
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -0,0 +1,39 @@
+{
+    "search_directories": [
+        "target/tmp"
+    ],
+    "jobs": [
+        {
+            "type": "disk-index",
+            "content": {
+                "source": {
+                    "disk-index-source": "Build",
+                    "data_type": "float32",
+                    "data": "OpenAIArXiv/openai_base.bin.crop_nb_100000",
+                    "distance": "squared_l2",
+                    "dim": 1536,
+                    "max_degree": 59,
+                    "l_build": 80,
+                    "num_threads": 4,
+                    "build_ram_limit_gb": 4.0,
+                    "num_pq_chunks": 384,
+                    "quantization_type": "SQ_1_2.0",
+                    "save_path": "openai_100k_benchmark_index"
+                },
+                "search_phase": {
+                    "queries": "OpenAIArXiv/openai_query.bin",
+                    "groundtruth": "OpenAIArXiv/openai-100K",
+                    "search_list": [
+                        200
+                    ],
+                    "beam_width": 4,
+                    "recall_at": 100,
+                    "num_threads": 4,
+                    "is_flat_search": false,
+                    "distance": "squared_l2",
+                    "vector_filters_file": null
+                }
+            }
+        }
+    ]
+}
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
new file mode 100644
index 000000000..e5f06aa1b
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -0,0 +1,39 @@
+{
+    "search_directories": [
+        "target/tmp"
+    ],
+    "jobs": [
+        {
+            "type": "disk-index",
+            "content": {
+                "source": {
+                    "disk-index-source": "Build",
+                    "data_type": "float32",
+                    "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000",
+                    "distance": "inner_product",
+                    "dim": 768,
+                    "max_degree": 59,
+                    "l_build": 72,
+                    "num_threads": 4,
+                    "build_ram_limit_gb": 4.0,
+                    "num_pq_chunks": 192,
+                    "quantization_type": "FP",
+                    "save_path": "wikipedia_100k_benchmark_index"
+                },
+                "search_phase": {
+                    "queries": "wikipedia_cohere/wikipedia_query.bin",
+                    "groundtruth": "wikipedia_cohere/wikipedia-100K",
+                    "search_list": [
+                        200
+                    ],
+                    "beam_width": 4,
+                    "recall_at": 100,
+                    "num_threads": 4,
+                    "is_flat_search": false,
+                    "distance": "inner_product",
+                    "vector_filters_file": null
+                }
+            }
+        }
+    ]
+}