From a3f7aec1d4cebbd538614d7b5977c822d05696ac Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <gewenyu99@gmail.com>
Date: Wed, 11 Feb 2026 15:34:55 -0500
Subject: [PATCH] benchmarking

---
 README.md                       |  35 ++++-
 mprocs.yaml                     |   6 +
 services/wizard-ci/benchmark.ts | 229 ++++++++++++++++++++++++++++++++
 services/wizard-ci/index.ts     |  33 +++++
 services/wizard-ci/utils.ts     |  12 +-
 5 files changed, 313 insertions(+), 2 deletions(-)
 create mode 100644 services/wizard-ci/benchmark.ts
diff --git a/README.md b/README.md
index 61d87fdb7..63bc2568d 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,7 @@ Use keyboard shortcuts in mprocs: `s` to start, `x` to stop, `r` to restart, `q`
 | `wizard-tail-run` | Tail the wizard's verbose output (`/tmp/posthog-wizard.log`) |
 | `wizard-ci-run` | Full CI flow: run wizard, create PR, evaluate |
 | `wizard-ci-local-run` | CI flow with local evaluation (no PR) |
+| `wizard-ci-benchmark` | Benchmark only: run wizard with per-phase tracking (no evaluation) |
 | `wizard-ci-create-pr` | Push branch and create PR only (skip wizard run) |
 | `wizard-ci-evaluate-pr` | Evaluate an existing PR or local branch |
 
@@ -160,4 +161,36 @@ You can activate `wizard-ci.yml` in a few ways:
 
 1. **Manual** - Run from GitHub Actions UI
 2. **Schedule** - Runs on cron
-3. **Dispatch** - Webhook call via `repository_dispatch` with event type `wizard-ci-trigger` 
+3. **Dispatch** - Webhook call via `repository_dispatch` with event type `wizard-ci-trigger`
+
+---
+
+## Benchmarking
+
+Wizard CI runs automatically collect per-phase token usage, cost, and timing data. The CI runner always turns on the wizard's benchmark mode (`--benchmark`; the runner enables it by setting `POSTHOG_WIZARD_BENCHMARK=true` when it spawns the wizard), which breaks execution into separate agent calls per workflow phase (setup, 1.0-begin, 1.1-edit, 1.2-revise, 1.3-conclude).
+
+### Running a benchmark
+
+In mprocs, start **`wizard-ci-benchmark`** (press `s`), pick your test app, and the benchmark table prints after the wizard completes:
+
+```
+┌─────────────┬──────────┬──────────┬───────────┬───────┬─────────┐
+│ Phase       │ Input    │ Output   │ Cost      │ Turns │ Time    │
+├─────────────┼──────────┼──────────┼───────────┼───────┼─────────┤
+│ Setup       │   1,234  │     567  │ $0.0234   │     5 │   42.3s │
+│ 1.0-begin   │  12,345  │   2,345  │ $0.1234   │    12 │  123.4s │
+│ 1.1-edit    │  34,567  │   8,901  │ $0.3456   │    25 │  234.5s │
+│ 1.2-revise  │   8,901  │   1,234  │ $0.0890   │     8 │   67.8s │
+│ 1.3-conclude│   5,678  │   2,345  │ $0.0567   │    10 │   89.0s │
+├─────────────┼──────────┼──────────┼───────────┼───────┼─────────┤
+│ TOTAL       │  62,725  │  15,392  │ $0.6381   │    60 │  557.0s │
+└─────────────┴──────────┴──────────┴───────────┴───────┴─────────┘
+```
+
+Benchmark data is also saved to `test-evaluations/<run-name>/benchmark.json`.
+
+Use **`wizard-ci-run`** instead if you want the full flow with a GitHub PR — the benchmark table is included in the PR body as markdown.
+
+### Raw data
+
+The wizard writes `/tmp/posthog-wizard-benchmark.json` with per-step token counts, cost, duration, and model usage. The CI runner reads and cleans up this file automatically.
diff --git a/mprocs.yaml b/mprocs.yaml
index 1f3c01e13..2504081fb 100644
--- a/mprocs.yaml
+++ b/mprocs.yaml
@@ -59,6 +59,12 @@ procs:
     autostart: false
     env_file: .env
 
+  wizard-ci-benchmark:
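+    # Runs the wizard through the --local flow (no PR) and prints the benchmark table. NOTE(review): --local appears to run local evaluation too - confirm the "no evaluation" claim.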
+    shell: "pnpm wizard-ci --local"
+    autostart: false
+    env_file: .env
+
   wizard-ci-create-pr:
     # Only pushes branch and creates PR on GH
     shell: "pnpm wizard-ci --push-only --branch"
diff --git a/services/wizard-ci/benchmark.ts b/services/wizard-ci/benchmark.ts
new file mode 100644
index 000000000..5ee40b0cb
--- /dev/null
+++ b/services/wizard-ci/benchmark.ts
@@ -0,0 +1,229 @@
+/**
+ * Benchmark data types and formatting for wizard CI runs.
+ * Reads per-phase token usage data written by the wizard's --benchmark mode.
+ */
+import { readFileSync, unlinkSync, existsSync } from "fs";
+
+export const BENCHMARK_FILE_PATH = "/tmp/posthog-wizard-benchmark.json";
+
+export interface StepUsage { // usage record for one wizard phase (one table row)
+  name: string; // phase label, printed in the "Phase" column
+  usage: { // raw token counters; the three *input* fields are summed for the "Input" column
+    input_tokens: number;
+    output_tokens: number; // printed as the "Output" column
+    cache_creation_input_tokens: number;
+    cache_read_input_tokens: number;
+  };
+  modelUsage: Record<string, unknown>; // opaque here; not read by the formatters in this file
+  totalCostUsd: number; // US dollars, rendered with four decimals
+  durationMs: number; // milliseconds; rendered as seconds in the "Time" column
+  durationApiMs: number; // presumably time spent in API calls (ms) - not read in this file, confirm with the wizard
+  numTurns: number; // printed in the "Turns" column
+  contextTokensIn?: number; // "Ctx In" column; shown as "-" when absent
+  contextTokensOut?: number; // "Ctx Out" column; its presence on any step enables both context columns
+  compactions?: number; // a value > 0 lists this step in the compaction note
+  compactionPreTokens?: number[]; // values listed as "pre-tokens" in the compaction note
+}
+
+export interface BenchmarkData { // whole-run payload parsed from the wizard's benchmark JSON file
+  timestamp: string; // format not visible here - presumably ISO-8601, confirm with the wizard
+  steps: StepUsage[]; // one entry per phase, rendered in array order
+  totals: { // run-wide figures shown in the TOTAL row (used as-is, not recomputed from steps)
+    totalCostUsd: number;
+    durationMs: number;
+    inputTokens: number;
+    outputTokens: number;
+    numTurns: number;
+  };
+}
+
+/**
+ * Read and parse the benchmark file written by the wizard.
+ * Returns null if the file is missing, unreadable, not valid JSON, or lacks the
+ * `steps` array / `totals` object that the formatters dereference unconditionally.
+ * With `cleanup` (the default) the temp file is removed even when parsing fails.
+ */
+export function readBenchmarkFile(cleanup = true): BenchmarkData | null {
+  if (!existsSync(BENCHMARK_FILE_PATH)) {
+    return null;
+  }
+  try {
+    const parsed: unknown = JSON.parse(readFileSync(BENCHMARK_FILE_PATH, "utf-8"));
+    const candidate = parsed as Partial<BenchmarkData> | null;
+    const wellFormed = candidate != null && Array.isArray(candidate.steps) &&
+      typeof candidate.totals === "object" && candidate.totals !== null;
+    return wellFormed ? (candidate as BenchmarkData) : null;
+  } catch {
+    return null; // unreadable or malformed JSON: treat as "no benchmark data"
+  } finally {
+    if (cleanup) {
+      try {
+        unlinkSync(BENCHMARK_FILE_PATH);
+      } catch { /* best-effort: a leftover temp file is harmless */ }
+    }
+  }
+}
+
+// ============================================================================
+// Formatting helpers
+// ============================================================================
+
+function formatNumber(n: number): string { // en-US digit grouping, e.g. 1234567 -> "1,234,567"
+  return new Intl.NumberFormat("en-US").format(n);
+}
+
+function formatCost(usd: number): string { // dollar amount with four decimals, e.g. "$0.1234"
+  return "$" + usd.toFixed(4);
+}
+
+function formatDuration(ms: number): string { // milliseconds -> seconds with one decimal, e.g. "42.3s"
+  return (ms / 1000).toFixed(1) + "s";
+}
+
+function formatContext(tokens: number | undefined): string { // compact token count: "-", "9,999", "12K", "1.5M"
+  if (tokens == null) return "-";
+  // Hand over to "M" at 999,500: from there Math.round(tokens / 1000) would print "1000K".
+  if (tokens >= 999_500) return `${(tokens / 1_000_000).toFixed(1)}M`;
+  return tokens >= 10_000 ? `${Math.round(tokens / 1000)}K` : formatNumber(tokens);
+}
+
+function padRight(str: string, len: number): string { // left-align: append spaces up to `len`; never truncates
+  return str.padEnd(len, " ");
+}
+
+function padLeft(str: string, len: number): string { // right-align: prepend spaces up to `len`; never truncates
+  return str.padStart(len, " ");
+}
+
+/**
+ * Format benchmark data as a box-drawing console table: one row per step, a
+ * separator, then a TOTAL row taken from `data.totals`. "Input" sums a step's
+ * fresh, cache-read and cache-creation input tokens. "Ctx In" / "Ctx Out" appear
+ * only when some step reports `contextTokensOut`; the TOTAL row leaves "Ctx In"
+ * blank and repeats the last step's "Ctx Out". Columns keep their historical
+ * widths as a minimum and grow to fit the widest cell, so a long phase name or
+ * an eight-digit token count cannot misalign the borders. Steps that report
+ * compactions are listed in a note below the table.
+ *
+ * @param data Parsed benchmark payload (per-step usage plus run totals).
+ * @returns The table and optional compaction note, joined with newlines.
+ */
+export function formatBenchmarkConsole(data: BenchmarkData): string {
+  const hasContext = data.steps.some((s) => s.contextTokensOut != null);
+  const lastStep = data.steps[data.steps.length - 1];
+
+  // Collect unpadded cell text first; padding waits until final widths are known.
+  const headers = ["Phase", "Input", "Output", "Cost", "Turns", "Time"];
+  const minWidths = [13, 10, 10, 11, 7, 9];
+  if (hasContext) {
+    headers.push("Ctx In", "Ctx Out");
+    minWidths.push(9, 9);
+  }
+
+  const stepRows = data.steps.map((step) => {
+    const { input_tokens, cache_read_input_tokens, cache_creation_input_tokens } = step.usage;
+    const cells = [
+      step.name,
+      formatNumber(input_tokens + cache_read_input_tokens + cache_creation_input_tokens),
+      formatNumber(step.usage.output_tokens),
+      formatCost(step.totalCostUsd),
+      String(step.numTurns),
+      formatDuration(step.durationMs),
+    ];
+    if (hasContext) cells.push(formatContext(step.contextTokensIn), formatContext(step.contextTokensOut));
+    return cells;
+  });
+
+  const totalCells = [
+    "TOTAL", formatNumber(data.totals.inputTokens), formatNumber(data.totals.outputTokens),
+    formatCost(data.totals.totalCostUsd), String(data.totals.numTurns), formatDuration(data.totals.durationMs),
+  ];
+  if (hasContext) totalCells.push("", formatContext(lastStep?.contextTokensOut));
+
+  // Every cell carries one space of margin, hence "+ 1" when measuring content.
+  const widths = minWidths.map((min, col) =>
+    Math.max(min, ...[headers, ...stepRows, totalCells].map((row) => row[col].length + 1)),
+  );
+
+  const rule = (left: string, mid: string, right: string) =>
+    left + widths.map((w) => "\u2500".repeat(w)).join(mid) + right;
+  // Headers and the phase column are left-aligned; other data cells right-aligned.
+  const cell = (text: string, col: number, leftAlign: boolean) =>
+    leftAlign ? padRight(` ${text}`, widths[col]) : padLeft(text, widths[col] - 1) + " ";
+  const renderRow = (cells: string[], isHeader = false) =>
+    "\u2502" + cells.map((c, col) => cell(c, col, isHeader || col === 0)).join("\u2502") + "\u2502";
+
+  const parts = [
+    rule("\u250c", "\u252c", "\u2510"),
+    renderRow(headers, true),
+    rule("\u251c", "\u253c", "\u2524"),
+    ...stepRows.map((cells) => renderRow(cells)),
+    rule("\u251c", "\u253c", "\u2524"),
+    renderRow(totalCells),
+    rule("\u2514", "\u2534", "\u2518"),
+  ];
+
+  // Compaction note: only steps that report at least one compaction.
+  const compactedSteps = data.steps.filter((s) => (s.compactions ?? 0) > 0);
+  if (compactedSteps.length > 0) {
+    const totalCompactions = compactedSteps.reduce((sum, s) => sum + (s.compactions ?? 0), 0);
+    parts.push("", `\u26a0 ${totalCompactions} compaction(s) detected:`);
+    for (const step of compactedSteps) {
+      const pre = (step.compactionPreTokens ?? []).map((t) => formatContext(t)).join(", ");
+      // Omit the parenthetical when no pre-compaction token counts were recorded.
+      parts.push(`  ${step.name}: ${step.compactions}x` + (pre ? ` (pre-tokens: ${pre})` : ""));
+    }
+  }
+
+  return parts.join("\n");
+}
+
+/**
+ * Format benchmark data as a markdown table for PR bodies: a "## Benchmark"
+ * heading, one row per step and a bold TOTAL row taken from `data.totals`.
+ * "Ctx In" / "Ctx Out" are added only when some step reports `contextTokensOut`;
+ * the TOTAL row leaves "Ctx In" empty and repeats the last step's "Ctx Out".
+ * Steps that report compactions are appended as a blockquote list.
+ */
+export function formatBenchmarkMarkdown(data: BenchmarkData): string {
+  const hasContext = data.steps.some((s) => s.contextTokensOut != null);
+  const lastStep = data.steps[data.steps.length - 1];
+  const bold = (text: string | number) => `**${text}**`;
+
+  const out = [
+    "## Benchmark",
+    "",
+    "| Phase | Input | Output | Cost | Turns | Time |" + (hasContext ? " Ctx In | Ctx Out |" : ""),
+    "|-------|------:|-------:|-----:|------:|-----:|" + (hasContext ? "------:|-------:|" : ""),
+  ];
+
+  for (const step of data.steps) {
+    const { input_tokens, cache_read_input_tokens, cache_creation_input_tokens } = step.usage;
+    const cells = [
+      step.name.replace(/\|/g, "\\|"), // a raw "|" would otherwise start a new table cell
+      formatNumber(input_tokens + cache_read_input_tokens + cache_creation_input_tokens), formatNumber(step.usage.output_tokens),
+      formatCost(step.totalCostUsd), String(step.numTurns), formatDuration(step.durationMs),
+    ];
+    if (hasContext) cells.push(formatContext(step.contextTokensIn), formatContext(step.contextTokensOut));
+    out.push(`| ${cells.join(" | ")} |`);
+  }
+
+  const totals = [
+    "TOTAL", formatNumber(data.totals.inputTokens), formatNumber(data.totals.outputTokens),
+    formatCost(data.totals.totalCostUsd), data.totals.numTurns, formatDuration(data.totals.durationMs),
+  ].map((cell) => bold(cell));
+  const ctxTotal = hasContext ? ` | ${bold(formatContext(lastStep?.contextTokensOut))} |` : "";
+  out.push(`| ${totals.join(" | ")} |${ctxTotal}`);
+
+  const compactedSteps = data.steps.filter((s) => (s.compactions ?? 0) > 0);
+  if (compactedSteps.length > 0) {
+    const totalCompactions = compactedSteps.reduce((sum, s) => sum + (s.compactions ?? 0), 0);
+    out.push("", `> **${totalCompactions} compaction(s)** detected during run:`);
+    for (const step of compactedSteps) {
+      const pre = (step.compactionPreTokens ?? []).map((t) => formatContext(t)).join(", ");
+      out.push(`> - **${step.name}**: ${step.compactions}x` + (pre ? ` (pre-tokens: ${pre})` : ""));
+    }
+  }
+
+  return out.join("\n");
+}
diff --git a/services/wizard-ci/index.ts b/services/wizard-ci/index.ts
index b952009c7..cb81ce736 100644
--- a/services/wizard-ci/index.ts
+++ b/services/wizard-ci/index.ts
@@ -11,7 +11,9 @@
  */
 import "dotenv/config";
 import { createInterface } from "readline";
+import { writeFileSync, mkdirSync } from "fs";
 import { join, relative } from "path";
+import { formatBenchmarkConsole, formatBenchmarkMarkdown, type BenchmarkData } from "./benchmark.js";
 import {
   findApps,
   resetApp,
@@ -71,6 +73,7 @@ interface PRMetadata {
   posthogRef?: string;
   source?: string;
   sourceUrl?: string;
+  benchmarkMarkdown?: string;
 }
 
 function getDependencyRefs(): Pick<PRMetadata, "wizardRef" | "contextMillRef" | "posthogRef"> {
@@ -125,6 +128,9 @@ function buildPRBody(meta: PRMetadata): string {
   if (meta.duration !== undefined) {
     lines.push(`Duration: ${formatMs(meta.duration)}`);
   }
+  if (meta.benchmarkMarkdown) {
+    lines.push("", meta.benchmarkMarkdown);
+  }
   return lines.join("\n");
 }
 
@@ -523,6 +529,11 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
   }
   console.log(`      Completed in ${formatMs(result.duration)}\n`);
 
+  // Log benchmark data if available
+  if (result.benchmark) {
+    console.log("\n" + formatBenchmarkConsole(result.benchmark) + "\n");
+  }
+
   // 3. Check changes in app directory only
   console.log("[3/5] Checking changes...");
   if (!hasChangesInPath(repoRoot, appRelativePath)) {
@@ -562,6 +573,11 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
       const testRunName = `local-${triggerId}-${app.name.replace(/\//g, "-")}`;
       const evalInfo = await runLocalEvaluation(branchName, opts.base, testRunName);
 
+      // Save benchmark data to evaluation directory if available
+      if (result.benchmark && testRunName) {
+        saveBenchmarkToEvalDir(testRunName, result.benchmark);
+      }
+
       // Return to original branch
       checkout(repoRoot, originalBranch);
 
@@ -623,6 +639,7 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
     shortId: triggerId,
     branch: branchName,
     duration: result.duration,
+    benchmarkMarkdown: result.benchmark ? formatBenchmarkMarkdown(result.benchmark) : undefined,
     ...getDependencyRefs(),
     ...getSourceInfo(),
   };
@@ -674,6 +691,22 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
   return true;
 }
 
+// ============================================================================
+// Benchmark helpers
+// ============================================================================
+
+function saveBenchmarkToEvalDir(testRunName: string, benchmark: BenchmarkData): void {
+  try { // best-effort: a failed save is reported as a warning, never thrown
+    const targetDir = join(process.cwd(), "test-evaluations", testRunName);
+    const targetFile = join(targetDir, "benchmark.json");
+    mkdirSync(targetDir, { recursive: true });
+    writeFileSync(targetFile, JSON.stringify(benchmark, null, 2));
+    console.log(`      Benchmark saved: ${targetFile}\n`);
+  } catch (e) {
+    console.warn(`      Failed to save benchmark data: ${e}\n`);
+  }
+}
+
 // ============================================================================
 // Main
 // ============================================================================
diff --git a/services/wizard-ci/utils.ts b/services/wizard-ci/utils.ts
index 4adc77804..f2e0ceb50 100644
--- a/services/wizard-ci/utils.ts
+++ b/services/wizard-ci/utils.ts
@@ -7,6 +7,7 @@
 import { spawn } from "child_process";
 import { existsSync, readdirSync, statSync } from "fs";
 import { join } from "path";
+import { readBenchmarkFile, type BenchmarkData } from "./benchmark.js";
 
 // Re-export git operations from shared service
 export {
@@ -133,6 +134,7 @@ export interface WizardResult {
   success: boolean;
   duration: number;
   error?: string;
+  benchmark?: BenchmarkData;
 }
 
 export interface WizardOptions {
@@ -181,14 +183,22 @@ export function runWizard(appPath: string, options: WizardOptions = {}): Promise
     const child = spawn("node", args, {
       cwd: appPath,
       stdio: "inherit",
-      env: process.env,
+      env: {
+        ...process.env,
+        // Always enable benchmark mode in CI to collect usage data
+        POSTHOG_WIZARD_BENCHMARK: "true",
+      },
     });
 
     child.on("close", (code) => {
+      // Read benchmark data written by the wizard (if any)
+      const benchmark = readBenchmarkFile() ?? undefined;
+
       resolve({
         success: code === 0,
         duration: Date.now() - start,
         error: code !== 0 ? `Exit code: ${code}` : undefined,
+        benchmark,
       });
     });