From a3f7aec1d4cebbd538614d7b5977c822d05696ac Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" Date: Wed, 11 Feb 2026 15:34:55 -0500 Subject: [PATCH] benchmarking --- README.md | 35 ++++- mprocs.yaml | 6 + services/wizard-ci/benchmark.ts | 229 ++++++++++++++++++++++++++++++++ services/wizard-ci/index.ts | 33 +++++ services/wizard-ci/utils.ts | 12 +- 5 files changed, 313 insertions(+), 2 deletions(-) create mode 100644 services/wizard-ci/benchmark.ts diff --git a/README.md b/README.md index 61d87fdb7..63bc2568d 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ Use keyboard shortcuts in mprocs: `s` to start, `x` to stop, `r` to restart, `q` | `wizard-tail-run` | Tail the wizard's verbose output (`/tmp/posthog-wizard.log`) | | `wizard-ci-run` | Full CI flow: run wizard, create PR, evaluate | | `wizard-ci-local-run` | CI flow with local evaluation (no PR) | +| `wizard-ci-benchmark` | Benchmark only: run wizard with per-phase tracking (no evaluation) | | `wizard-ci-create-pr` | Push branch and create PR only (skip wizard run) | | `wizard-ci-evaluate-pr` | Evaluate an existing PR or local branch | @@ -160,4 +161,36 @@ You can activate `wizard-ci.yml` in a few ways: 1. **Manual** - Run from GitHub Actions UI 2. **Schedule** - Runs on cron -3. **Dispatch** - Webhook call via `repository_dispatch` with event type `wizard-ci-trigger` +3. **Dispatch** - Webhook call via `repository_dispatch` with event type `wizard-ci-trigger` + +--- + +## Benchmarking + +Wizard CI runs automatically collect per-phase token usage, cost, and timing data. The wizard's `--benchmark` flag is always enabled in CI mode, breaking execution into separate agent calls per workflow phase (setup, 1.0-begin, 1.1-edit, 1.2-revise, 1.3-conclude). 
+ +### Running a benchmark + +In mprocs, start **`wizard-ci-benchmark`** (press `s`), pick your test app, and the benchmark table prints after the wizard completes: + +``` +┌─────────────┬──────────┬──────────┬───────────┬───────┬─────────┐ +│ Phase │ Input │ Output │ Cost │ Turns │ Time │ +├─────────────┼──────────┼──────────┼───────────┼───────┼─────────┤ +│ Setup │ 1,234 │ 567 │ $0.0234 │ 5 │ 42.3s │ +│ 1.0-begin │ 12,345 │ 2,345 │ $0.1234 │ 12 │ 123.4s │ +│ 1.1-edit │ 34,567 │ 8,901 │ $0.3456 │ 25 │ 234.5s │ +│ 1.2-revise │ 8,901 │ 1,234 │ $0.0890 │ 8 │ 67.8s │ +│ 1.3-conclude│ 5,678 │ 2,345 │ $0.0567 │ 10 │ 89.0s │ +├─────────────┼──────────┼──────────┼───────────┼───────┼─────────┤ +│ TOTAL │ 62,725 │ 15,392 │ $0.6381 │ 60 │ 557.0s │ +└─────────────┴──────────┴──────────┴───────────┴───────┴─────────┘ +``` + +Benchmark data is also saved to `test-evaluations//benchmark.json`. + +Use **`wizard-ci-run`** instead if you want the full flow with a GitHub PR — the benchmark table is included in the PR body as markdown. + +### Raw data + +The wizard writes `/tmp/posthog-wizard-benchmark.json` with per-step token counts, cost, duration, and model usage. The CI runner reads and cleans up this file automatically. diff --git a/mprocs.yaml b/mprocs.yaml index 1f3c01e13..2504081fb 100644 --- a/mprocs.yaml +++ b/mprocs.yaml @@ -59,6 +59,12 @@ procs: autostart: false env_file: .env + wizard-ci-benchmark: + # Runs wizard with benchmark mode only (no evaluation) + shell: "pnpm wizard-ci --local" + autostart: false + env_file: .env + wizard-ci-create-pr: # Only pushes branch and creates PR on GH shell: "pnpm wizard-ci --push-only --branch" diff --git a/services/wizard-ci/benchmark.ts b/services/wizard-ci/benchmark.ts new file mode 100644 index 000000000..5ee40b0cb --- /dev/null +++ b/services/wizard-ci/benchmark.ts @@ -0,0 +1,229 @@ +/** + * Benchmark data types and formatting for wizard CI runs. + * Reads per-phase token usage data written by the wizard's --benchmark mode. 
+ */
+import { readFileSync, unlinkSync, existsSync } from "fs";
+
+/** Path of the JSON file that `readBenchmarkFile` reads and, by default, deletes. */
+export const BENCHMARK_FILE_PATH = "/tmp/posthog-wizard-benchmark.json";
+
+/** Usage figures for a single benchmark step; each step becomes one table row. */
+export interface StepUsage {
+  name: string;
+  usage: {
+    input_tokens: number;
+    output_tokens: number;
+    cache_creation_input_tokens: number;
+    cache_read_input_tokens: number;
+  };
+  // NOTE(review): the type arguments of this record were missing in the reviewed
+  // text. `unknown` values are used because nothing in this module reads the
+  // entries; confirm the real value shape against the wizard's output.
+  modelUsage: Record<string, unknown>;
+  totalCostUsd: number;
+  durationMs: number;
+  durationApiMs: number;
+  numTurns: number;
+  contextTokensIn?: number;
+  contextTokensOut?: number;
+  compactions?: number;
+  compactionPreTokens?: number[];
+}
+
+/** Whole-run benchmark payload: per-step usage plus the totals shown in the TOTAL row. */
+export interface BenchmarkData {
+  timestamp: string;
+  steps: StepUsage[];
+  totals: {
+    totalCostUsd: number;
+    durationMs: number;
+    inputTokens: number;
+    outputTokens: number;
+    numTurns: number;
+  };
+}
+
+// ============================================================================
+// Validation helpers
+// ============================================================================
+
+const USAGE_NUMBER_KEYS = [
+  "input_tokens",
+  "output_tokens",
+  "cache_creation_input_tokens",
+  "cache_read_input_tokens",
+] as const;
+const STEP_NUMBER_KEYS = ["totalCostUsd", "durationMs", "durationApiMs", "numTurns"] as const;
+const TOTALS_NUMBER_KEYS = ["totalCostUsd", "durationMs", "inputTokens", "outputTokens", "numTurns"] as const;
+
+/** True for non-null, non-array objects. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+
+/** True when every listed key of `obj` holds a number. */
+function hasNumbers(obj: Record<string, unknown>, keys: readonly string[]): boolean {
+  return keys.every((key) => typeof obj[key] === "number");
+}
+
+/** Checks the required fields of `StepUsage`; optional fields are not inspected. */
+function isStepUsage(value: unknown): value is StepUsage {
+  if (!isRecord(value)) return false;
+  const usage = value.usage;
+  return (
+    typeof value.name === "string" &&
+    isRecord(usage) &&
+    hasNumbers(usage, USAGE_NUMBER_KEYS) &&
+    isRecord(value.modelUsage) &&
+    hasNumbers(value, STEP_NUMBER_KEYS)
+  );
+}
+
+/** Checks that parsed JSON has the required fields of `BenchmarkData`. */
+function isBenchmarkData(value: unknown): value is BenchmarkData {
+  if (!isRecord(value)) return false;
+  const { steps, totals } = value;
+  return (
+    typeof value.timestamp === "string" &&
+    Array.isArray(steps) &&
+    steps.every(isStepUsage) &&
+    isRecord(totals) &&
+    hasNumbers(totals, TOTALS_NUMBER_KEYS)
+  );
+}
+
+/**
+ * Read and parse the benchmark file at `BENCHMARK_FILE_PATH`.
+ *
+ * @param cleanup - When true (the default), the file is deleted after the read
+ *   attempt, including when its content is unreadable or invalid, so a stale
+ *   file cannot be picked up by a later call.
+ * @returns The parsed data, or null if the file does not exist, cannot be read
+ *   or parsed, or lacks the required `BenchmarkData` fields.
+ */
+export function readBenchmarkFile(cleanup = true): BenchmarkData | null {
+  if (!existsSync(BENCHMARK_FILE_PATH)) {
+    return null;
+  }
+  try {
+    const parsed: unknown = JSON.parse(readFileSync(BENCHMARK_FILE_PATH, "utf-8"));
+    // Validate instead of asserting: the formatters dereference these fields.
+    return isBenchmarkData(parsed) ? parsed : null;
+  } catch {
+    return null;
+  } finally {
+    if (cleanup) {
+      try {
+        unlinkSync(BENCHMARK_FILE_PATH);
+      } catch {
+        // Best-effort cleanup: a failed delete must not mask the read result.
+      }
+    }
+  }
+}
+
+// ============================================================================
+// Formatting helpers
+// ============================================================================
+
+/** Number with en-US thousands separators. */
+function formatNumber(n: number): string {
+  return n.toLocaleString("en-US");
+}
+
+/** US-dollar amount with four decimals. */
+function formatCost(usd: number): string {
+  return `$${usd.toFixed(4)}`;
+}
+
+/** Milliseconds rendered as seconds with one decimal. */
+function formatDuration(ms: number): string {
+  return `${(ms / 1000).toFixed(1)}s`;
+}
+
+/**
+ * Compact token count: "-" when absent, "1.2M" for millions, "34K" from
+ * 10,000 upwards, otherwise the grouped number.
+ */
+function formatContext(tokens: number | undefined): string {
+  if (tokens == null) return "-";
+  // 999,500 and above would round to "1000K", so they switch to the M form early.
+  if (tokens >= 999_500) return `${(tokens / 1_000_000).toFixed(1)}M`;
+  if (tokens >= 10_000) return `${Math.round(tokens / 1000)}K`;
+  return formatNumber(tokens);
+}
+
+/** Pad with trailing spaces up to `len`; longer strings are returned unchanged. */
+function padRight(str: string, len: number): string {
+  return str.length >= len ? str : str + " ".repeat(len - str.length);
+}
+
+/** Pad with leading spaces up to `len`; longer strings are returned unchanged. */
+function padLeft(str: string, len: number): string {
+  return str.length >= len ? str : " ".repeat(len - str.length) + str;
+}
+
+/** Input tokens of a step including cache reads and cache writes. */
+function totalInputTokens(step: StepUsage): number {
+  return step.usage.input_tokens + step.usage.cache_read_input_tokens + step.usage.cache_creation_input_tokens;
+}
+
+/** True when any step carries a context-size figure, i.e. the Ctx columns are needed. */
+function hasContextColumns(data: BenchmarkData): boolean {
+  return data.steps.some((s) => s.contextTokensIn != null || s.contextTokensOut != null);
+}
+
+/** Steps with at least one compaction, plus the compaction count summed over them. */
+function summarizeCompactions(steps: readonly StepUsage[]): { total: number; compacted: StepUsage[] } {
+  const compacted = steps.filter((s) => (s.compactions ?? 0) > 0);
+  const total = compacted.reduce((sum, s) => sum + (s.compactions ?? 0), 0);
+  return { total, compacted };
+}
+
+/** "2x (pre-tokens: 150K, 180K)"; the parenthetical is dropped when no pre-token data exists. */
+function formatCompaction(step: StepUsage): string {
+  const preTokens = step.compactionPreTokens ?? [];
+  const detail =
+    preTokens.length > 0 ? ` (pre-tokens: ${preTokens.map((t) => formatContext(t)).join(", ")})` : "";
+  return `${step.compactions}x${detail}`;
+}
+
+/**
+ * Format benchmark data as a box-drawn console table.
+ *
+ * One row per step, then a TOTAL row taken from `data.totals`. The Ctx In and
+ * Ctx Out columns appear only when some step has context figures; in the TOTAL
+ * row Ctx In is blank and Ctx Out is the last step's value. Compaction notes
+ * follow the table when any step reports compactions.
+ *
+ * @param data - Benchmark payload to render.
+ * @returns The table (and optional notes) joined with newlines.
+ */
+export function formatBenchmarkConsole(data: BenchmarkData): string {
+  const hasContext = hasContextColumns(data);
+  const COL = { phase: 13, input: 10, output: 10, cost: 11, turns: 7, time: 9, ctxIn: 9, ctxOut: 9 };
+
+  const cols = [COL.phase, COL.input, COL.output, COL.cost, COL.turns, COL.time];
+  const headers = [" Phase", " Input", " Output", " Cost", " Turns", " Time"];
+  if (hasContext) {
+    cols.push(COL.ctxIn, COL.ctxOut);
+    headers.push(" Ctx In", " Ctx Out");
+  }
+
+  // Horizontal rule: `fill` repeated per column, joined by the junction glyphs.
+  const line = (left: string, mid: string, right: string, fill: string) =>
+    left + cols.map((w) => fill.repeat(w)).join(mid) + right;
+  // Right-aligned cell that keeps one space of padding before the column border.
+  const cell = (text: string, width: number) => padLeft(text, width - 1) + " ";
+  const row = (cells: string[]) => "\u2502" + cells.join("\u2502") + "\u2502";
+
+  const header =
+    line("\u250c", "\u252c", "\u2510", "\u2500") +
+    "\n" +
+    row(headers.map((h, i) => padRight(h, cols[i] ?? h.length))) +
+    "\n" +
+    line("\u251c", "\u253c", "\u2524", "\u2500");
+
+  const rows = data.steps.map((step) => {
+    const cells = [
+      padRight(` ${step.name}`, COL.phase),
+      cell(formatNumber(totalInputTokens(step)), COL.input),
+      cell(formatNumber(step.usage.output_tokens), COL.output),
+      cell(formatCost(step.totalCostUsd), COL.cost),
+      cell(String(step.numTurns), COL.turns),
+      cell(formatDuration(step.durationMs), COL.time),
+    ];
+    if (hasContext) {
+      cells.push(
+        cell(formatContext(step.contextTokensIn), COL.ctxIn),
+        cell(formatContext(step.contextTokensOut), COL.ctxOut),
+      );
+    }
+    return row(cells);
+  });
+
+  const separator = line("\u251c", "\u253c", "\u2524", "\u2500");
+
+  const totalCells = [
+    padRight(" TOTAL", COL.phase),
+    cell(formatNumber(data.totals.inputTokens), COL.input),
+    cell(formatNumber(data.totals.outputTokens), COL.output),
+    cell(formatCost(data.totals.totalCostUsd), COL.cost),
+    cell(String(data.totals.numTurns), COL.turns),
+    cell(formatDuration(data.totals.durationMs), COL.time),
+  ];
+  if (hasContext) {
+    const lastStep = data.steps[data.steps.length - 1];
+    totalCells.push(cell("", COL.ctxIn), cell(formatContext(lastStep?.contextTokensOut), COL.ctxOut));
+  }
+
+  const footer = line("\u2514", "\u2534", "\u2518", "\u2500");
+
+  const parts = [header, ...rows, separator, row(totalCells), footer];
+
+  // Add compaction notes below the table if any occurred
+  const { total, compacted } = summarizeCompactions(data.steps);
+  if (compacted.length > 0) {
+    parts.push("");
+    parts.push(`\u26a0 ${total} compaction(s) detected:`);
+    for (const step of compacted) {
+      parts.push(` ${step.name}: ${formatCompaction(step)}`);
+    }
+  }
+
+  return parts.join("\n");
+}
+
+/**
+ * Format benchmark data as a markdown table for PR bodies.
+ *
+ * Emits a "## Benchmark" heading, one row per step and a bold TOTAL row. The
+ * Ctx In and Ctx Out columns follow the same rules as the console table.
+ * Compaction notes are appended as a blockquote when any step reports them.
+ *
+ * @param data - Benchmark payload to render.
+ * @returns The markdown joined with newlines.
+ */
+export function formatBenchmarkMarkdown(data: BenchmarkData): string {
+  const hasContext = hasContextColumns(data);
+  const ctxHeaders = hasContext ? " Ctx In | Ctx Out |" : "";
+  const ctxAlign = hasContext ? "------:|-------:|" : "";
+
+  const lines = [
+    "## Benchmark",
+    "",
+    `| Phase | Input | Output | Cost | Turns | Time |${ctxHeaders}`,
+    `|-------|------:|-------:|-----:|------:|-----:|${ctxAlign}`,
+  ];
+
+  for (const step of data.steps) {
+    const ctxCols = hasContext
+      ? ` ${formatContext(step.contextTokensIn)} | ${formatContext(step.contextTokensOut)} |`
+      : "";
+    lines.push(
+      `| ${step.name} | ${formatNumber(totalInputTokens(step))} | ${formatNumber(step.usage.output_tokens)} | ${formatCost(step.totalCostUsd)} | ${step.numTurns} | ${formatDuration(step.durationMs)} |${ctxCols}`,
+    );
+  }
+
+  // TOTAL row: the Ctx In cell stays empty, Ctx Out repeats the last step's value.
+  const lastStep = data.steps[data.steps.length - 1];
+  const ctxTotalCols = hasContext ? ` | **${formatContext(lastStep?.contextTokensOut)}** |` : "";
+  lines.push(
+    `| **TOTAL** | **${formatNumber(data.totals.inputTokens)}** | **${formatNumber(data.totals.outputTokens)}** | **${formatCost(data.totals.totalCostUsd)}** | **${data.totals.numTurns}** | **${formatDuration(data.totals.durationMs)}** |${ctxTotalCols}`,
+  );
+
+  // Add compaction notes if any occurred
+  const { total, compacted } = summarizeCompactions(data.steps);
+  if (compacted.length > 0) {
+    lines.push("");
+    lines.push(`> **${total} compaction(s)** detected during run:`);
+    for (const step of compacted) {
+      lines.push(`> - **${step.name}**: ${formatCompaction(step)}`);
+    }
+  }
+
+  return lines.join("\n");
+}
diff --git a/services/wizard-ci/index.ts b/services/wizard-ci/index.ts index b952009c7..cb81ce736 100644 --- a/services/wizard-ci/index.ts +++ b/services/wizard-ci/index.ts @@ -11,7 +11,9 @@ */ import "dotenv/config"; import { createInterface } from "readline"; +import { writeFileSync, mkdirSync } from "fs"; import { join, relative } from "path"; +import { formatBenchmarkConsole, formatBenchmarkMarkdown, type BenchmarkData } from "./benchmark.js"; import { findApps, resetApp, @@ -71,6 +73,7 @@ interface PRMetadata { posthogRef?: string; source?: string; sourceUrl?: string; + benchmarkMarkdown?: string; } function getDependencyRefs(): Pick { @@ -125,6 +128,9 @@ function buildPRBody(meta: PRMetadata): string { if (meta.duration !== undefined) { lines.push(`Duration: ${formatMs(meta.duration)}`); } + if (meta.benchmarkMarkdown) { + lines.push("", meta.benchmarkMarkdown); + } return lines.join("\n"); } @@ -523,6 +529,11 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise { + // Read benchmark data written by the wizard (if any) + const benchmark = readBenchmarkFile() ?? undefined; + resolve({ success: code === 0, duration: Date.now() - start, error: code !== 0 ? `Exit code: ${code}` : undefined, + benchmark, }); });