diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs
index 35222fd5..93d72be2 100644
--- a/plugins/codex/scripts/codex-companion.mjs
+++ b/plugins/codex/scripts/codex-companion.mjs
@@ -66,6 +66,16 @@ const ROOT_DIR = path.resolve(fileURLToPath(new URL("..", import.meta.url)));
 const REVIEW_SCHEMA = path.join(ROOT_DIR, "schemas", "review-output.schema.json");
 const DEFAULT_STATUS_WAIT_TIMEOUT_MS = 240000;
 const DEFAULT_STATUS_POLL_INTERVAL_MS = 2000;
+// Full per-turn budget for background/detached jobs (no external Bash ceiling
+// to collide with). codex.mjs reads CODEX_TURN_TIMEOUT_MS; this is the value we
+// thread down for background runs when the user hasn't overridden it.
+const DEFAULT_TURN_TIMEOUT_MS = 600000;
+// Foreground runs are invoked by Claude Code's Bash tool, which SIGKILLs node
+// at its own timeout (default 120000ms) and returns nothing. Set the runtime
+// turn budget just below that so the foreground turn fails fast with a
+// structured "turn timed out — re-run with --background" message that carries
+// any partial output, instead of being killed with an empty result.
+const FOREGROUND_TURN_TIMEOUT_MS = 110000;
 const VALID_REASONING_EFFORTS = new Set(["none", "minimal", "low", "medium", "high", "xhigh"]);
 const MODEL_ALIASES = new Map([["spark", "gpt-5.3-codex-spark"]]);
 const STOP_REVIEW_TASK_MARKER = "Run a stop-gate review of the previous Claude turn.";
@@ -679,9 +689,32 @@ function enqueueBackgroundTask(cwd, job, request) {
   };
 }
 
+// Set the per-turn budget for a FOREGROUND command (codex.mjs reads
+// CODEX_TURN_TIMEOUT_MS at call time). Precedence: explicit --turn-timeout-ms
+// flag > a valid pre-set CODEX_TURN_TIMEOUT_MS env (e.g. settings.json) > the
+// foreground default just under the host Bash ceiling, so a stalled turn
+// returns a structured timeout instead of being SIGKILLed with no output.
+// Only call this on a foreground path: a detached background worker inherits
+// the parent env, so capping it here would shrink the background budget too.
+function applyForegroundTurnBudget(options) {
+  const explicit = Number(options["turn-timeout-ms"]);
+  if (Number.isFinite(explicit) && explicit > 0) {
+    process.env.CODEX_TURN_TIMEOUT_MS = String(explicit);
+    return;
+  }
+  // Validate the pre-set env the same way codex.mjs does. A set-but-unusable
+  // value ("abc", "0", "-5") is rejected there and would fall back to the
+  // 600000ms default, which is above the Bash ceiling this budget must stay
+  // under, so replace it with the foreground default instead of keeping it.
+  const preset = Number(process.env.CODEX_TURN_TIMEOUT_MS);
+  if (!Number.isFinite(preset) || preset <= 0) {
+    process.env.CODEX_TURN_TIMEOUT_MS = String(FOREGROUND_TURN_TIMEOUT_MS);
+  }
+}
+
 async function handleReviewCommand(argv, config) {
   const { options, positionals } = parseCommandInput(argv, {
-    valueOptions: ["base", "scope", "model", "cwd"],
+    valueOptions: ["base", "scope", "model", "cwd", "turn-timeout-ms"],
     booleanOptions: ["json", "background", "wait"],
     aliasMap: {
       m: "model"
@@ -706,6 +739,12 @@ async function handleReviewCommand(argv, config) {
     jobClass: "review",
     summary: metadata.summary
   });
+  // Review turns run foreground (--wait) through the same path as tasks; give
+  // them the same foreground budget so a stall returns a structured timeout
+  // instead of hitting the host Bash ceiling with an empty result.
+  if (!options.background) {
+    applyForegroundTurnBudget(options);
+  }
   await runForegroundCommand(
     job,
     (progress) =>
@@ -731,7 +770,7 @@ async function handleReview(argv) {
 
 async function handleTask(argv) {
   const { options, positionals } = parseCommandInput(argv, {
-    valueOptions: ["model", "effort", "cwd", "prompt-file"],
+    valueOptions: ["model", "effort", "cwd", "prompt-file", "turn-timeout-ms"],
     booleanOptions: ["json", "write", "resume-last", "resume", "fresh", "background"],
     aliasMap: {
       m: "model"
@@ -774,6 +813,11 @@ async function handleTask(argv) {
     return;
   }
 
+  // Foreground turn budget (see applyForegroundTurnBudget). Runs only on the
+  // foreground path; the background branch returned above and its detached
+  // worker inherits the parent env unchanged (full default budget).
+  applyForegroundTurnBudget(options);
+
   const job = buildTaskJob(workspaceRoot, taskMetadata, write);
   await runForegroundCommand(
     job,
diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs
index f2fe88bd..d57b6cb5 100644
--- a/plugins/codex/scripts/lib/codex.mjs
+++ b/plugins/codex/scripts/lib/codex.mjs
@@ -43,6 +43,35 @@ const SERVICE_NAME = "claude_code_codex_plugin";
 const TASK_THREAD_PREFIX = "Codex Companion Task";
 const DEFAULT_CONTINUE_PROMPT =
   "Continue from the current thread state. Pick the next highest-value step and follow through until the task is resolved.";
+// Hard upper bound on a single Codex turn. Without this, the completion await
+// at the end of captureTurn is unbounded: it is resolved ONLY by completeTurn()
+// and is never rejected on a stalled/dead process (rejectCompletion was dead
+// code). The foreground budget is set below the external Bash ceiling by the
+// companion so timeouts surface as structured errors instead of a SIGKILL.
+const DEFAULT_TURN_TIMEOUT_MS = 600000;
+// Largest delay setTimeout honors. Node replaces any delay above this value
+// with 1ms, which would turn an oversized budget into an immediate timeout
+// instead of a long one.
+const MAX_TURN_TIMEOUT_MS = 2147483647;
+
+// Resolve the per-turn budget at CALL time, not import time. The companion sets
+// CODEX_TURN_TIMEOUT_MS (e.g. the foreground budget, below the Bash ceiling)
+// AFTER this module is imported; reading it at import froze the value at the
+// default and made --turn-timeout-ms / the foreground budget inert. Reading it
+// when the turn actually starts lets the option/env override take effect.
+// Only finite, positive values are accepted (anything else falls through to
+// the next source), and accepted values are capped at MAX_TURN_TIMEOUT_MS.
+function resolveTurnTimeoutMs(options = {}) {
+  const fromOptions = Number(options.turnTimeoutMs);
+  if (Number.isFinite(fromOptions) && fromOptions > 0) {
+    return Math.min(fromOptions, MAX_TURN_TIMEOUT_MS);
+  }
+  const fromEnv = Number(process.env.CODEX_TURN_TIMEOUT_MS);
+  if (Number.isFinite(fromEnv) && fromEnv > 0) {
+    return Math.min(fromEnv, MAX_TURN_TIMEOUT_MS);
+  }
+  return DEFAULT_TURN_TIMEOUT_MS;
+}
 
 function cleanCodexStderr(stderr) {
   return stderr
@@ -597,7 +626,43 @@ async function captureTurn(client, threadId, startRequest, options = {}) {
       completeTurn(state, response.turn);
     }
 
-    return await state.completion;
+    // Bound the await so it can never outlast a dead process or a runaway turn:
+    //   1. state.completion — resolves on turn/completed (or inferred). Wire the
+    //      previously-dead rejectCompletion to the client exit so an app-server
+    //      death AFTER startRequest resolved rejects the await immediately
+    //      instead of hanging until the deadline. Registered HERE rather than
+    //      before startRequest: if startRequest itself rejects (e.g. broker-busy
+    //      from turn/start, or the app-server exiting while that request is
+    //      pending), it propagates directly and state.completion is never
+    //      observed — wiring the exit earlier would reject an unobserved promise
+    //      and surface as an unhandled rejection.
+    //   2. deadline — hard per-turn budget (resolveTurnTimeoutMs: option >
+    //      CODEX_TURN_TIMEOUT_MS env > default, resolved at call time).
+    let exitRaceSettled = false;
+    client.exitPromise.then(() => {
+      if (exitRaceSettled || state.completed) {
+        return;
+      }
+      exitRaceSettled = true;
+      state.rejectCompletion(
+        client.exitError ?? new Error("codex app-server exited before the turn completed.")
+      );
+    });
+    const turnTimeoutMs = resolveTurnTimeoutMs(options);
+    let deadlineTimer = null;
+    const deadline = new Promise((_resolve, reject) => {
+      deadlineTimer = setTimeout(() => {
+        reject(new Error(`codex turn exceeded the ${turnTimeoutMs}ms turn budget.`));
+      }, turnTimeoutMs);
+      deadlineTimer.unref?.();
+    });
+    try {
+      return await Promise.race([state.completion, deadline]);
+    } finally {
+      if (deadlineTimer) {
+        clearTimeout(deadlineTimer);
+      }
+    }
   } finally {
     clearCompletionTimer(state);
     client.setNotificationHandler(previousHandler ?? null);