From 5415c43684a97851175daeba2d22e6db17029bbe Mon Sep 17 00:00:00 2001 From: Russ Hammond Date: Tue, 16 Jun 2026 01:19:45 +0000 Subject: [PATCH 1/2] Bound the Codex turn await so a stalled/dead app-server can't hang forever MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit captureTurn ended in `return await state.completion`, a Promise resolved only by completeTurn() (on turn/completed or a 250ms inferred fallback). Its rejectCompletion counterpart was wired into state but never called — dead code. So if the app-server died mid-turn or simply stalled, nothing rejected the await: the turn hung until the caller's external timeout SIGKILLed the process with no output. Separately, the per-turn budget was read at import time (`const TURN_TIMEOUT_MS = Number(process.env.CODEX_TURN_TIMEOUT_MS) || 600000`), so any value set at runtime (a foreground budget below the caller's ceiling, or --turn-timeout-ms) was ignored — the budget was frozen at the default and effectively inert. Fixes: - Wire rejectCompletion to client.exitPromise so a mid-turn process exit surfaces immediately as an error instead of hanging. - Race state.completion against a hard deadline timer (cleared in finally). - Resolve the turn budget at call time (resolveTurnTimeoutMs: option > CODEX_TURN_TIMEOUT_MS env > default) so the companion's foreground budget and --turn-timeout-ms actually take effect. - companion: set a foreground turn budget below the caller's ceiling and add a --turn-timeout-ms flag; background jobs keep the full default. Verified: node --check on both files; --turn-timeout-ms 3000 now aborts a long turn at ~3s with a structured error; a normal turn still completes cleanly. Co-Authored-By: Claude Opus 4.8 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 25 ++++++++- plugins/codex/scripts/lib/codex.mjs | 63 ++++++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 35222fd5..9fa173db 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -66,6 +66,16 @@ const ROOT_DIR = path.resolve(fileURLToPath(new URL("..", import.meta.url))); const REVIEW_SCHEMA = path.join(ROOT_DIR, "schemas", "review-output.schema.json"); const DEFAULT_STATUS_WAIT_TIMEOUT_MS = 240000; const DEFAULT_STATUS_POLL_INTERVAL_MS = 2000; +// Full per-turn budget for background/detached jobs (no external Bash ceiling +// to collide with). codex.mjs reads CODEX_TURN_TIMEOUT_MS; this is the value we +// thread down for background runs when the user hasn't overridden it. +const DEFAULT_TURN_TIMEOUT_MS = 600000; +// Foreground runs are invoked by Claude Code's Bash tool, which SIGKILLs node +// at its own timeout (default 120000ms) and returns nothing. Set the runtime +// turn budget just below that so the foreground turn fails fast with a +// structured "turn timed out — re-run with --background" message that carries +// any partial output, instead of being killed with an empty result. +const FOREGROUND_TURN_TIMEOUT_MS = 110000; const VALID_REASONING_EFFORTS = new Set(["none", "minimal", "low", "medium", "high", "xhigh"]); const MODEL_ALIASES = new Map([["spark", "gpt-5.3-codex-spark"]]); const STOP_REVIEW_TASK_MARKER = "Run a stop-gate review of the previous Claude turn."; @@ -731,7 +741,7 @@ async function handleReview(argv) { async function handleTask(argv) { const { options, positionals } = parseCommandInput(argv, { - valueOptions: ["model", "effort", "cwd", "prompt-file"], + valueOptions: ["model", "effort", "cwd", "prompt-file", "turn-timeout-ms"], booleanOptions: ["json", "write", "resume-last", "resume", "fresh", "background"], aliasMap: { m: "model" @@ -774,6 +784,19 @@ async function handleTask(argv) { return; } + // Foreground turn budget. codex.mjs reads CODEX_TURN_TIMEOUT_MS. Precedence: + // explicit --turn-timeout-ms flag, then a pre-set CODEX_TURN_TIMEOUT_MS env + // (e.g. from settings.json), else the foreground default just under the Bash + // ceiling so a stall returns a structured timeout instead of a SIGKILL. + // This runs only on the foreground path; the background branch returned above + // and its detached worker inherits the parent env unchanged (full budget). + const explicitTurnTimeout = Number(options["turn-timeout-ms"]); + if (Number.isFinite(explicitTurnTimeout) && explicitTurnTimeout > 0) { + process.env.CODEX_TURN_TIMEOUT_MS = String(explicitTurnTimeout); + } else if (!process.env.CODEX_TURN_TIMEOUT_MS) { + process.env.CODEX_TURN_TIMEOUT_MS = String(FOREGROUND_TURN_TIMEOUT_MS); + } + const job = buildTaskJob(workspaceRoot, taskMetadata, write); await runForegroundCommand( job, diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index f2fe88bd..305f34f4 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -43,6 +43,29 @@ const SERVICE_NAME = "claude_code_codex_plugin"; const TASK_THREAD_PREFIX = "Codex Companion Task"; const DEFAULT_CONTINUE_PROMPT = "Continue from the current thread state. Pick the next highest-value step and follow through until the task is resolved."; +// Hard upper bound on a single Codex turn. Without this, the completion await +// at the end of captureTurn is unbounded: it is resolved ONLY by completeTurn() +// and is never rejected on a stalled/dead process (rejectCompletion was dead +// code). The foreground budget is set below the external Bash ceiling by the +// companion so timeouts surface as structured errors instead of a SIGKILL. +const DEFAULT_TURN_TIMEOUT_MS = 600000; + +// Resolve the per-turn budget at CALL time, not import time. The companion sets +// CODEX_TURN_TIMEOUT_MS (e.g. the foreground budget, below the Bash ceiling) +// AFTER this module is imported; reading it at import froze the value at the +// default and made --turn-timeout-ms / the foreground budget inert. Reading it +// when the turn actually starts lets the option/env override take effect. +function resolveTurnTimeoutMs(options = {}) { + const fromOptions = Number(options.turnTimeoutMs); + if (Number.isFinite(fromOptions) && fromOptions > 0) { + return fromOptions; + } + const fromEnv = Number(process.env.CODEX_TURN_TIMEOUT_MS); + if (Number.isFinite(fromEnv) && fromEnv > 0) { + return fromEnv; + } + return DEFAULT_TURN_TIMEOUT_MS; +} function cleanCodexStderr(stderr) { return stderr @@ -554,6 +577,21 @@ async function captureTurn(client, threadId, startRequest, options = {}) { const state = createTurnCaptureState(threadId, options); const previousHandler = client.notificationHandler; + // If the app-server connection dies after startRequest() resolves, nothing + // else rejects state.completion (it is resolved only by completeTurn). Wire + // the previously-dead rejectCompletion to the client exit so a process death + // mid-turn surfaces immediately instead of hanging until the deadline. + let exitRaceSettled = false; + client.exitPromise.then(() => { + if (exitRaceSettled || state.completed) { + return; + } + exitRaceSettled = true; + state.rejectCompletion( + client.exitError ?? new Error("codex app-server exited before the turn completed.") + ); + }); + client.setNotificationHandler((message) => { if (!state.turnId) { state.bufferedNotifications.push(message); @@ -597,7 +635,30 @@ async function captureTurn(client, threadId, startRequest, options = {}) { completeTurn(state, response.turn); } - return await state.completion; + // Bound the await so it can never outlast a dead process or a runaway turn: + // 1. state.completion — resolves on turn/completed (or inferred), and now + // REJECTS on a mid-turn process death via the rejectCompletion handler + // wired to client.exitPromise above (previously dead code). + // 2. deadline — hard per-turn budget (resolveTurnTimeoutMs: + // option > CODEX_TURN_TIMEOUT_MS env > default, resolved at call time). + // No separate exitPromise branch is raced here: routing the exit through + // rejectCompletion keeps a single rejection source and avoids a dangling + // rejected promise after the turn has already settled. + const turnTimeoutMs = resolveTurnTimeoutMs(options); + let deadlineTimer = null; + const deadline = new Promise((_resolve, reject) => { + deadlineTimer = setTimeout(() => { + reject(new Error(`codex turn exceeded the ${turnTimeoutMs}ms turn budget.`)); + }, turnTimeoutMs); + deadlineTimer.unref?.(); + }); + try { + return await Promise.race([state.completion, deadline]); + } finally { + if (deadlineTimer) { + clearTimeout(deadlineTimer); + } + } } finally { clearCompletionTimer(state); client.setNotificationHandler(previousHandler ?? null); From 688192a62403510556c5e6fc0f06da02f1086b1b Mon Sep 17 00:00:00 2001 From: Russ Hammond Date: Tue, 16 Jun 2026 01:31:14 +0000 Subject: [PATCH 2/2] Address Codex review: avoid unobserved-rejection + cover review turns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two P2 fixes from the automated review: 1. codex.mjs — register the client.exitPromise -> rejectCompletion handler AFTER startRequest() resolves, not before. If startRequest() rejects (e.g. broker-busy from turn/start, or the app-server exiting while the request is pending), the error now propagates directly and state.completion is never observed. Previously the early-registered handler could reject an unobserved state.completion and surface as an unhandled rejection that can terminate the CLI or preempt the intended error reporting. The mid-turn-death coverage is unchanged (the handler is in place before the completion is awaited). 2. codex-companion.mjs — apply the foreground turn budget to review turns too. Extracted applyForegroundTurnBudget(options) and call it from both handleTask and handleReviewCommand. /codex:review --wait and adversarial-review --wait run foreground through runForegroundCommand but previously never set CODEX_TURN_TIMEOUT_MS, so they fell back to the 600000ms default and could be SIGKILLed at the host Bash ceiling with an empty result. Also accept --turn-timeout-ms on the review command. Guarded with !options.background so a detached worker still inherits the full default budget. Co-Authored-By: Claude Opus 4.8 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 40 ++++++++++++++------- plugins/codex/scripts/lib/codex.mjs | 44 +++++++++++------------ 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 9fa173db..93d72be2 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -689,9 +689,25 @@ function enqueueBackgroundTask(cwd, job, request) { }; } +// Set the per-turn budget for a FOREGROUND command (codex.mjs reads +// CODEX_TURN_TIMEOUT_MS at call time). Precedence: explicit --turn-timeout-ms +// flag > a pre-set CODEX_TURN_TIMEOUT_MS env (e.g. settings.json) > the +// foreground default just under the host Bash ceiling, so a stalled turn +// returns a structured timeout instead of being SIGKILLed with no output. +// Only call this on a foreground path: a detached background worker inherits +// the parent env, so capping it here would shrink the background budget too. +function applyForegroundTurnBudget(options) { + const explicit = Number(options["turn-timeout-ms"]); + if (Number.isFinite(explicit) && explicit > 0) { + process.env.CODEX_TURN_TIMEOUT_MS = String(explicit); + } else if (!process.env.CODEX_TURN_TIMEOUT_MS) { + process.env.CODEX_TURN_TIMEOUT_MS = String(FOREGROUND_TURN_TIMEOUT_MS); + } +} + async function handleReviewCommand(argv, config) { const { options, positionals } = parseCommandInput(argv, { - valueOptions: ["base", "scope", "model", "cwd"], + valueOptions: ["base", "scope", "model", "cwd", "turn-timeout-ms"], booleanOptions: ["json", "background", "wait"], aliasMap: { m: "model" @@ -716,6 +732,12 @@ async function handleReviewCommand(argv, config) { jobClass: "review", summary: metadata.summary }); + // Review turns run foreground (--wait) through the same path as tasks; give + // them the same foreground budget so a stall returns a structured timeout + // instead of hitting the host Bash ceiling with an empty result. + if (!options.background) { + applyForegroundTurnBudget(options); + } await runForegroundCommand( job, (progress) => @@ -784,18 +806,10 @@ async function handleTask(argv) { return; } - // Foreground turn budget. codex.mjs reads CODEX_TURN_TIMEOUT_MS. Precedence: - // explicit --turn-timeout-ms flag, then a pre-set CODEX_TURN_TIMEOUT_MS env - // (e.g. from settings.json), else the foreground default just under the Bash - // ceiling so a stall returns a structured timeout instead of a SIGKILL. - // This runs only on the foreground path; the background branch returned above - // and its detached worker inherits the parent env unchanged (full budget). - const explicitTurnTimeout = Number(options["turn-timeout-ms"]); - if (Number.isFinite(explicitTurnTimeout) && explicitTurnTimeout > 0) { - process.env.CODEX_TURN_TIMEOUT_MS = String(explicitTurnTimeout); - } else if (!process.env.CODEX_TURN_TIMEOUT_MS) { - process.env.CODEX_TURN_TIMEOUT_MS = String(FOREGROUND_TURN_TIMEOUT_MS); - } + // Foreground turn budget (see applyForegroundTurnBudget). Runs only on the + // foreground path; the background branch returned above and its detached + // worker inherits the parent env unchanged (full default budget). + applyForegroundTurnBudget(options); const job = buildTaskJob(workspaceRoot, taskMetadata, write); await runForegroundCommand( diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index 305f34f4..d57b6cb5 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -577,21 +577,6 @@ async function captureTurn(client, threadId, startRequest, options = {}) { const state = createTurnCaptureState(threadId, options); const previousHandler = client.notificationHandler; - // If the app-server connection dies after startRequest() resolves, nothing - // else rejects state.completion (it is resolved only by completeTurn). Wire - // the previously-dead rejectCompletion to the client exit so a process death - // mid-turn surfaces immediately instead of hanging until the deadline. - let exitRaceSettled = false; - client.exitPromise.then(() => { - if (exitRaceSettled || state.completed) { - return; - } - exitRaceSettled = true; - state.rejectCompletion( - client.exitError ?? new Error("codex app-server exited before the turn completed.") - ); - }); - client.setNotificationHandler((message) => { if (!state.turnId) { state.bufferedNotifications.push(message); @@ -636,14 +621,27 @@ async function captureTurn(client, threadId, startRequest, options = {}) { } // Bound the await so it can never outlast a dead process or a runaway turn: - // 1. state.completion — resolves on turn/completed (or inferred), and now - // REJECTS on a mid-turn process death via the rejectCompletion handler - // wired to client.exitPromise above (previously dead code). - // 2. deadline — hard per-turn budget (resolveTurnTimeoutMs: - // option > CODEX_TURN_TIMEOUT_MS env > default, resolved at call time). - // No separate exitPromise branch is raced here: routing the exit through - // rejectCompletion keeps a single rejection source and avoids a dangling - // rejected promise after the turn has already settled. + // 1. state.completion — resolves on turn/completed (or inferred). Wire the + // previously-dead rejectCompletion to the client exit so an app-server + // death AFTER startRequest resolved rejects the await immediately + // instead of hanging until the deadline. Registered HERE rather than + // before startRequest: if startRequest itself rejects (e.g. broker-busy + // from turn/start, or the app-server exiting while that request is + // pending), it propagates directly and state.completion is never + // observed — wiring the exit earlier would reject an unobserved promise + // and surface as an unhandled rejection. + // 2. deadline — hard per-turn budget (resolveTurnTimeoutMs: option > + // CODEX_TURN_TIMEOUT_MS env > default, resolved at call time). + let exitRaceSettled = false; + client.exitPromise.then(() => { + if (exitRaceSettled || state.completed) { + return; + } + exitRaceSettled = true; + state.rejectCompletion( + client.exitError ?? new Error("codex app-server exited before the turn completed.") + ); + }); const turnTimeoutMs = resolveTurnTimeoutMs(options); let deadlineTimer = null; const deadline = new Promise((_resolve, reject) => {