From 8faf9bb846887c01a6d526ffc19526dfe30822d1 Mon Sep 17 00:00:00 2001
From: Matt Apperson <me@mattapperson.com>
Date: Tue, 5 May 2026 18:54:32 -0400
Subject: [PATCH 1/5] feat(tools): add human-in-the-loop tool type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a fourth client tool kind that extends manual-tool semantics with two
async hooks. `onToolCalled` runs when the model invokes the tool — returning
a value short-circuits like `execute`, returning `null` pauses the loop like
a manual tool. `onResponseReceived` runs on a later turn when an incoming
`FunctionCallOutputItem` matches (by callId → function_call.name) a HITL
tool, letting the tool post-process caller-supplied results before the
model sees them.

Keeps HITL control flow local to the tool definition instead of smeared
across the caller.
---
 packages/agent/src/index.ts                 |   5 +
 packages/agent/src/lib/model-result.ts      |  61 +++-
 packages/agent/src/lib/tool-executor.ts     | 183 ++++++++++-
 packages/agent/src/lib/tool-orchestrator.ts |  12 +-
 packages/agent/src/lib/tool-types.ts        |  74 ++++-
 packages/agent/src/lib/tool.ts              |  93 ++++++
 packages/agent/tests/unit/hitl-tool.test.ts | 319 ++++++++++++++++++++
 7 files changed, 729 insertions(+), 18 deletions(-)
 create mode 100644 packages/agent/tests/unit/hitl-tool.test.ts
diff --git a/packages/agent/src/index.ts b/packages/agent/src/index.ts
index 0bd90cf..50abf3d 100644
--- a/packages/agent/src/index.ts
+++ b/packages/agent/src/index.ts
@@ -138,6 +138,8 @@ export type {
   ConversationState,
   ConversationStatus,
   HasApprovalTools,
+  HITLTool,
+  HITLToolFunction,
   InferToolEvent,
   InferToolEventsUnion,
   InferToolInput,
@@ -184,8 +186,11 @@ export type {
 export {
   hasApprovalRequiredTools,
   hasExecuteFunction,
+  isAutoResolvableTool,
   isClientTool,
   isGeneratorTool,
+  isHITLTool,
+  isManualTool,
   isRegularExecuteTool,
   isServerTool,
   isToolCallOutputEvent,
diff --git a/packages/agent/src/lib/model-result.ts b/packages/agent/src/lib/model-result.ts
index e706fe0..01eacc4 100644
--- a/packages/agent/src/lib/model-result.ts
+++ b/packages/agent/src/lib/model-result.ts
@@ -51,7 +51,7 @@ import {
 import type { ContextInput } from './tool-context.js';
 import { resolveContext, ToolContextStore } from './tool-context.js';
 import { ToolEventBroadcaster } from './tool-event-broadcaster.js';
-import { executeTool } from './tool-executor.js';
+import { applyOnResponseReceivedHooks, executeTool } from './tool-executor.js';
 import type {
   ConversationState,
   InferToolEventsUnion,
@@ -73,6 +73,7 @@ import type {
 } from './tool-types.js';
 import {
   hasExecuteFunction,
+  isAutoResolvableTool,
   isClientTool,
   isServerTool,
   isToolCallOutputEvent,
@@ -624,7 +625,7 @@ export class ModelResult<
   ): Promise<UnsentToolResult<TTools>[]> {
     const toolCallPromises = toolCalls.map(async (tc) => {
       const tool = this.options.tools?.find((t) => isClientTool(t) && t.function.name === tc.name);
-      if (!tool || !hasExecuteFunction(tool)) {
+      if (!tool || !isAutoResolvableTool(tool)) {
         return null;
       }
 
@@ -637,6 +638,11 @@ export class ModelResult<
         this.options.sharedContextSchema,
       );
 
+      if (result === null) {
+        // HITL tool paused — no unsent result for this call in this round
+        return null;
+      }
+
       if (result.error) {
         return createRejectedResult(tc.id, String(tc.name), result.error.message);
       }
@@ -748,7 +754,7 @@ export class ModelResult<
       const tool = this.options.tools?.find(
         (t) => isClientTool(t) && t.function.name === toolCall.name,
       );
-      if (!tool || !hasExecuteFunction(tool)) {
+      if (!tool || !isAutoResolvableTool(tool)) {
         return null;
       }
 
@@ -799,6 +805,14 @@ export class ModelResult<
         this.options.sharedContextSchema,
       );
 
+      if (result === null) {
+        // HITL tool paused — surface as manual (no output this round)
+        return {
+          type: 'paused' as const,
+          toolCall,
+        };
+      }
+
       return {
         type: 'execution' as const,
         toolCall,
@@ -858,6 +872,13 @@ export class ModelResult<
         continue;
       }
 
+      if (value.type === 'paused') {
+        // HITL tool returned null — emit nothing; the call surfaces to the caller
+        // for manual resume. The outer loop will end because toolResults lacks
+        // an entry for this call.
+        continue;
+      }
+
       const toolResult = (
         value.result.error
           ? {
@@ -1227,6 +1248,21 @@ export class ModelResult<
         }
       }
 
+      // Apply onResponseReceived hooks to caller-supplied tool-output items
+      if (baseRequest.input !== undefined) {
+        const hookedInput = await applyOnResponseReceivedHooks(
+          baseRequest.input,
+          this.options.tools,
+          initialContext,
+          this.contextStore ?? undefined,
+          this.options.sharedContextSchema,
+        );
+        baseRequest = {
+          ...baseRequest,
+          input: hookedInput,
+        };
+      }
+
       // Store resolved request with stream mode
       this.resolvedRequest = {
         ...baseRequest,
@@ -1293,7 +1329,7 @@ export class ModelResult<
       const tool = this.options.tools?.find(
         (t) => isClientTool(t) && t.function.name === toolCall.name,
       );
-      if (!tool || !hasExecuteFunction(tool)) {
+      if (!tool || !isAutoResolvableTool(tool)) {
         // Can't execute, create error result
         unsentResults.push(
           createRejectedResult(callId, String(toolCall.name), 'Tool not found or not executable'),
@@ -1310,6 +1346,11 @@ export class ModelResult<
         this.options.sharedContextSchema,
       );
 
+      if (result === null) {
+        // HITL tool paused on approval — no unsent result; caller resumes later
+        continue;
+      }
+
       if (result.error) {
         unsentResults.push(
           createRejectedResult(callId, String(toolCall.name), result.error.message),
@@ -1408,10 +1449,20 @@ export class ModelResult<
 
     const baseRequest = await this.resolveRequestForContext(turnContext);
 
+    // Apply onResponseReceived hooks to caller-supplied tool-output items
+    // (unsentToolResults merged into newInput above may include caller-provided outputs)
+    const hookedInput = await applyOnResponseReceivedHooks(
+      newInput,
+      this.options.tools,
+      turnContext,
+      this.contextStore ?? undefined,
+      this.options.sharedContextSchema,
+    );
+
     // Create request with the accumulated messages
     const request: models.ResponsesRequest = {
       ...baseRequest,
-      input: newInput,
+      input: hookedInput,
       stream: true,
     };
 
diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts
index 42b1dab..52d2303 100644
--- a/packages/agent/src/lib/tool-executor.ts
+++ b/packages/agent/src/lib/tool-executor.ts
@@ -1,11 +1,13 @@
 import type * as models from '@openrouter/sdk/models';
 import * as z4 from 'zod/v4';
 import type { $ZodObject, $ZodShape, $ZodType } from 'zod/v4/core';
+import { isFunctionCallItem, isFunctionCallOutputItem } from './stream-type-guards.js';
 import type { ToolContextStore } from './tool-context.js';
 import { buildToolExecuteContext } from './tool-context.js';
 import type {
   APITool,
   ClientTool,
+  HITLTool,
   ParsedToolCall,
   Tool,
   ToolExecuteContext,
@@ -15,6 +17,7 @@ import type {
 import {
   hasExecuteFunction,
   isGeneratorTool,
+  isHITLTool,
   isRegularExecuteTool,
   isServerTool,
 } from './tool-types.js';
@@ -322,8 +325,69 @@ export async function executeGeneratorTool(
 }
 
 /**
- * Execute a tool call
- * Automatically detects if it's a regular or generator tool
+ * Execute a HITL tool's `onToolCalled` hook.
+ *
+ * Returns:
+ * - `ToolExecutionResult` if the hook produced a value (short-circuit, send to model)
+ * - `null` if the hook returned `null` (pause — treat as manual tool)
+ * - `ToolExecutionResult` with `error` set if the hook threw
+ */
+// biome-ignore lint: parameters match the internal API shape
+export async function executeHITLTool(
+  tool: Tool,
+  toolCall: ParsedToolCall<Tool>,
+  context: TurnContext,
+  contextStore?: ToolContextStore,
+  sharedSchema?: $ZodObject<$ZodShape>,
+): Promise<ToolExecutionResult<Tool> | null> {
+  if (!isHITLTool(tool)) {
+    throw new Error(`Tool "${toolCall.name}" is not a HITL tool`);
+  }
+
+  try {
+    const validatedInput = validateToolInput(tool.function.inputSchema, toolCall.arguments);
+    const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema);
+
+    const result = await Promise.resolve(
+      tool.function.onToolCalled(validatedInput, executeContext),
+    );
+
+    if (result === null) {
+      // Pause — treat as manual tool
+      return null;
+    }
+
+    if (tool.function.outputSchema) {
+      const validatedOutput = validateToolOutput(tool.function.outputSchema, result);
+      return {
+        toolCallId: toolCall.id,
+        toolName: toolCall.name,
+        result: validatedOutput,
+      };
+    }
+
+    return {
+      toolCallId: toolCall.id,
+      toolName: toolCall.name,
+      result,
+    };
+  } catch (error) {
+    return {
+      toolCallId: toolCall.id,
+      toolName: toolCall.name,
+      result: null,
+      error: error instanceof Error ? error : new Error(String(error)),
+    };
+  }
+}
+
+/**
+ * Execute a tool call.
+ * Automatically detects if it's a regular, generator, or HITL tool.
+ *
+ * Returns `null` only for HITL tools whose `onToolCalled` returned `null`
+ * (signaling a manual-style pause). All other tools always return a
+ * `ToolExecutionResult` (with `error` set on failure).
  */
 // biome-ignore lint: parameters match the internal API shape
 export async function executeTool(
@@ -333,7 +397,11 @@ export async function executeTool(
   onPreliminaryResult?: (toolCallId: string, result: unknown) => void,
   contextStore?: ToolContextStore,
   sharedSchema?: $ZodObject<$ZodShape>,
-): Promise<ToolExecutionResult<Tool>> {
+): Promise<ToolExecutionResult<Tool> | null> {
+  if (isHITLTool(tool)) {
+    return executeHITLTool(tool, toolCall, context, contextStore, sharedSchema);
+  }
+
   if (!hasExecuteFunction(tool)) {
     throw new Error(`Tool "${toolCall.name}" has no execute function. Use manual tool execution.`);
   }
@@ -391,3 +459,112 @@ export function formatToolExecutionError(error: Error, toolCall: ParsedToolCall<
 
   return `Tool "${toolCall.name}" execution error: ${error.message}`;
 }
+
+/**
+ * Typeguard: input is a plain array of items. Narrows the InputsUnion's
+ * "string | Array<...>" shape so we can walk the array.
+ */
+function isItemArray(
+  input: models.InputsUnion,
+): input is Extract<models.InputsUnion, readonly unknown[]> {
+  return Array.isArray(input);
+}
+
+/**
+ * Walk the input items and apply `onResponseReceived` hooks for HITL tools.
+ *
+ * For each `function_call_output` item in `input`, locate the matching
+ * `function_call` (by `callId`) to identify the tool name. If that tool is a
+ * HITL tool with an `onResponseReceived` hook, invoke it with the parsed output
+ * and replace the item's `output` with the hook's return value (stringified).
+ *
+ * If a hook throws, the output is replaced with `{"error": "<message>"}` so the
+ * model sees a tool error. Items that don't match a HITL tool are left untouched.
+ *
+ * @returns a new input array when any item was rewritten, otherwise the original input.
+ */
+// biome-ignore lint: parameters match the internal API shape
+export async function applyOnResponseReceivedHooks(
+  input: models.InputsUnion,
+  tools: readonly Tool[] | undefined,
+  context: TurnContext,
+  contextStore?: ToolContextStore,
+  sharedSchema?: $ZodObject<$ZodShape>,
+): Promise<models.InputsUnion> {
+  if (!tools || tools.length === 0 || !isItemArray(input)) {
+    return input;
+  }
+
+  const hitlTools = tools.filter((t): t is HITLTool => isHITLTool(t));
+  if (hitlTools.length === 0) {
+    return input;
+  }
+  const hookByName = new Map<string, HITLTool>();
+  for (const t of hitlTools) {
+    if (t.function.onResponseReceived) {
+      hookByName.set(t.function.name, t);
+    }
+  }
+  if (hookByName.size === 0) {
+    return input;
+  }
+
+  // Build callId -> name map from function_call items in the array
+  const callIdToName = new Map<string, string>();
+  for (const item of input) {
+    if (isFunctionCallItem(item)) {
+      callIdToName.set(item.callId, item.name);
+    }
+  }
+
+  let changed = false;
+  const rewritten: unknown[] = [];
+  for (const item of input) {
+    if (!isFunctionCallOutputItem(item)) {
+      rewritten.push(item);
+      continue;
+    }
+    const toolName = callIdToName.get(item.callId);
+    if (!toolName) {
+      rewritten.push(item);
+      continue;
+    }
+    const tool = hookByName.get(toolName);
+    if (!tool || !tool.function.onResponseReceived) {
+      rewritten.push(item);
+      continue;
+    }
+
+    const raw = item.output;
+    let parsed: unknown = raw;
+    if (typeof raw === 'string') {
+      try {
+        parsed = JSON.parse(raw);
+      } catch {
+        parsed = raw;
+      }
+    }
+
+    const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema);
+
+    let newOutput: string;
+    try {
+      const hookResult = await Promise.resolve(
+        tool.function.onResponseReceived(parsed, executeContext),
+      );
+      newOutput = JSON.stringify(hookResult);
+    } catch (err) {
+      newOutput = JSON.stringify({
+        error: err instanceof Error ? err.message : String(err),
+      });
+    }
+
+    rewritten.push({
+      ...item,
+      output: newOutput,
+    });
+    changed = true;
+  }
+
+  return changed ? (rewritten as models.InputsUnion) : input;
+}
diff --git a/packages/agent/src/lib/tool-orchestrator.ts b/packages/agent/src/lib/tool-orchestrator.ts
index cd0df4a..c27d3e2 100644
--- a/packages/agent/src/lib/tool-orchestrator.ts
+++ b/packages/agent/src/lib/tool-orchestrator.ts
@@ -7,7 +7,7 @@ import { extractToolCallsFromResponse, responseHasToolCalls } from './stream-tra
 import { isFunctionCallItem } from './stream-type-guards.js';
 import { executeTool, findToolByName } from './tool-executor.js';
 import type { APITool, Tool, ToolExecutionResult } from './tool-types.js';
-import { hasExecuteFunction } from './tool-types.js';
+import { isAutoResolvableTool } from './tool-types.js';
 import { buildTurnContext } from './turn-context.js';
 
 /**
@@ -75,13 +75,13 @@ export async function executeToolLoop(
       break;
     }
 
-    // Check if any tools have execute functions
+    // Check if any tools can be auto-resolved (execute or HITL onToolCalled)
     const hasExecutableTools = toolCalls.some((toolCall) => {
       const tool = findToolByName(tools, toolCall.name);
-      return tool && hasExecuteFunction(tool);
+      return tool && isAutoResolvableTool(tool);
     });
 
-    // If no executable tools, return (manual execution mode)
+    // If no auto-resolvable tools, return (manual execution mode)
     if (!hasExecutableTools) {
       break;
     }
@@ -100,8 +100,8 @@ export async function executeToolLoop(
         } as ToolExecutionResult<Tool>;
       }
 
-      if (!hasExecuteFunction(tool)) {
-        // Tool has no execute function - return null to filter out
+      if (!isAutoResolvableTool(tool)) {
+        // Tool has no execute/onToolCalled - return null to filter out
         return null;
       }
 
diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts
index f4794a8..2713776 100644
--- a/packages/agent/src/lib/tool-types.ts
+++ b/packages/agent/src/lib/tool-types.ts
@@ -278,6 +278,36 @@ export interface ManualToolFunction<
   outputSchema?: TOutput;
 }
 
+/**
+ * Human-in-the-loop tool. Extends manual-tool semantics with two async hooks.
+ *
+ * `onToolCalled` fires when the model invokes the tool. Returning a value feeds
+ * the model directly (like regular `execute`); returning `null` pauses the loop
+ * like a manual tool, letting the caller resume later with a FunctionCallOutputItem.
+ *
+ * `onResponseReceived` fires on the next turn when an incoming FunctionCallOutputItem
+ * corresponds to a prior call of this tool (matched by callId → function_call.name).
+ * It receives the caller-supplied raw result and returns the value sent to the model.
+ * Throwing surfaces as a tool error to the model.
+ */
+export interface HITLToolFunction<
+  TInput extends $ZodObject<$ZodShape>,
+  TOutput extends $ZodType = $ZodType<unknown>,
+  TContext extends Record<string, unknown> = Record<string, unknown>,
+  TName extends string = string,
+> extends BaseToolFunction<TInput> {
+  outputSchema?: TOutput;
+  onToolCalled: (
+    params: zodInfer<TInput>,
+    context?: ToolExecuteContext<TName, TContext>,
+  ) => Promise<zodInfer<TOutput> | null> | zodInfer<TOutput> | null;
+  onResponseReceived?: (
+    rawResult: unknown,
+    context?: ToolExecuteContext<TName, TContext>,
+  ) => Promise<zodInfer<TOutput>> | zodInfer<TOutput>;
+  toModelOutput?: ToModelOutputFunction<zodInfer<TInput>, zodInfer<TOutput>>;
+}
+
 /**
  * Tool with execute function (regular or generator)
  */
@@ -315,13 +345,26 @@ export type ManualTool<
 };
 
 /**
- * Union type of all client-executed tool shapes (function, generator, manual).
+ * Human-in-the-loop tool (with onToolCalled / onResponseReceived hooks)
+ */
+export type HITLTool<
+  TInput extends $ZodObject<$ZodShape> = $ZodObject<$ZodShape>,
+  TOutput extends $ZodType = $ZodType<unknown>,
+  TContext extends Record<string, unknown> = Record<string, unknown>,
+> = {
+  type: ToolType.Function;
+  function: HITLToolFunction<TInput, TOutput, TContext>;
+};
+
+/**
+ * Union type of all client-executed tool shapes (function, generator, manual, HITL).
  * These run in the user's process via the agent SDK's tool execution loop.
  */
 export type ClientTool =
   | ToolWithExecute<$ZodObject<$ZodShape>, $ZodType<unknown>>
   | ToolWithGenerator<$ZodObject<$ZodShape>, $ZodType<unknown>, $ZodType<unknown>>
-  | ManualTool<$ZodObject<$ZodShape>, $ZodType<unknown>>;
+  | ManualTool<$ZodObject<$ZodShape>, $ZodType<unknown>>
+  | HITLTool<$ZodObject<$ZodShape>, $ZodType<unknown>>;
 
 /**
  * Config payload for an OpenRouter server-executed tool. Derived directly
@@ -551,13 +594,36 @@ export function isRegularExecuteTool(tool: Tool): tool is ToolWithExecute {
 }
 
 /**
- * Type guard to check if a tool is a manual tool (no execute function)
+ * Type guard to check if a tool is a manual tool (no execute and no onToolCalled)
  */
 export function isManualTool(tool: Tool): tool is ManualTool {
   if (isServerTool(tool)) {
     return false;
   }
-  return !('execute' in tool.function);
+  return !('execute' in tool.function) && !('onToolCalled' in tool.function);
+}
+
+/**
+ * Type guard to check if a tool is a human-in-the-loop tool (has onToolCalled)
+ */
+export function isHITLTool(tool: Tool): tool is HITLTool {
+  if (isServerTool(tool)) {
+    return false;
+  }
+  return (
+    'onToolCalled' in tool.function && typeof tool.function.onToolCalled === 'function'
+  );
+}
+
+/**
+ * Type guard: true if the tool can be auto-resolved within a turn — either
+ * through a client execute/generator function or through a HITL onToolCalled hook.
+ * Returns false for manual tools (which always pause) and server tools.
+ */
+export function isAutoResolvableTool(
+  tool: Tool,
+): tool is ToolWithExecute | ToolWithGenerator | HITLTool {
+  return hasExecuteFunction(tool) || isHITLTool(tool);
 }
 
 /**
diff --git a/packages/agent/src/lib/tool.ts b/packages/agent/src/lib/tool.ts
index 00ea165..043b4df 100644
--- a/packages/agent/src/lib/tool.ts
+++ b/packages/agent/src/lib/tool.ts
@@ -1,5 +1,6 @@
 import type { $ZodObject, $ZodShape, $ZodType, infer as zodInfer } from 'zod/v4/core';
 import type {
+  HITLTool,
   ManualTool,
   NextTurnParamsFunctions,
   ServerTool,
@@ -109,6 +110,45 @@ type ManualToolConfig<TInput extends $ZodObject<$ZodShape>> = {
   execute: false;
 };
 
+/**
+ * Configuration for a human-in-the-loop tool.
+ * Discriminated by the presence of `onToolCalled`. No `execute` or `eventSchema`.
+ *
+ * `onToolCalled` returning `null` pauses the loop (manual-tool semantics).
+ * Any non-null return is treated as the tool's result for the model.
+ *
+ * `onResponseReceived` is invoked on a later turn when an incoming
+ * `FunctionCallOutputItem` corresponds to a prior call of this tool; the
+ * returned value replaces what the model ultimately sees.
+ */
+type HITLToolConfig<
+  TInput extends $ZodObject<$ZodShape>,
+  TOutput extends $ZodType,
+  TContext extends Record<string, unknown> = Record<string, unknown>,
+  TName extends string = string,
+> = {
+  name: TName;
+  description?: string;
+  inputSchema: TInput;
+  outputSchema?: TOutput;
+  eventSchema?: undefined;
+  execute?: undefined;
+  /** Zod schema declaring the context data this tool needs */
+  contextSchema?: $ZodObject<$ZodShape>;
+  nextTurnParams?: NextTurnParamsFunctions<zodInfer<TInput>>;
+  requireApproval?: boolean | ToolApprovalCheck<zodInfer<TInput>>;
+  onToolCalled: (
+    params: zodInfer<TInput>,
+    context?: ToolExecuteContext<TName, TContext>,
+  ) => Promise<zodInfer<TOutput> | null> | zodInfer<TOutput> | null;
+  onResponseReceived?: (
+    rawResult: unknown,
+    context?: ToolExecuteContext<TName, TContext>,
+  ) => Promise<zodInfer<TOutput>> | zodInfer<TOutput>;
+  /** Convert tool execution output to model-facing output */
+  toModelOutput?: ToModelOutputFunction<zodInfer<TInput>, zodInfer<TOutput>>;
+};
+
 /**
  * Loose config type for the `tool<TShared>()` overload.
  * Accepts any valid tool config while typing `ctx.shared` from TShared.
@@ -194,6 +234,16 @@ export function tool<
   config: GeneratorToolConfig<TInput, TEvent, TOutput, TContext, TName>,
 ): ToolWithGenerator<TInput, TEvent, TOutput, TContext>;
 
+// Overload for HITL tools (when onToolCalled is provided)
+export function tool<
+  TInput extends $ZodObject<$ZodShape>,
+  TOutput extends $ZodType,
+  TContext extends Record<string, unknown> = Record<string, unknown>,
+  TName extends string = string,
+>(
+  config: HITLToolConfig<TInput, TOutput, TContext, TName>,
+): HITLTool<TInput, TOutput, TContext>;
+
 // Overload for manual tools (execute: false)
 export function tool<TInput extends $ZodObject<$ZodShape>>(
   config: ManualToolConfig<TInput>,
@@ -233,6 +283,7 @@ export function tool(
     | GeneratorToolConfig<$ZodObject<$ZodShape>, $ZodType, $ZodType>
     | RegularToolConfig<$ZodObject<$ZodShape>, $ZodType, unknown>
     | ManualToolConfig<$ZodObject<$ZodShape>>
+    | HITLToolConfig<$ZodObject<$ZodShape>, $ZodType>
     | ToolConfigWithSharedContext<Record<string, unknown>>,
 ): Tool {
   // 'shared' is reserved for shared context — forbid it as a tool name
@@ -242,6 +293,48 @@ export function tool(
     );
   }
 
+  // Check for HITL tool (has onToolCalled hook)
+  if ('onToolCalled' in config && typeof config.onToolCalled === 'function') {
+    const fn: HITLTool<$ZodObject<$ZodShape>, $ZodType>['function'] = {
+      name: config.name,
+      inputSchema: config.inputSchema,
+      onToolCalled: config.onToolCalled,
+    };
+
+    if (config.description !== undefined) {
+      fn.description = config.description;
+    }
+
+    if (config.outputSchema !== undefined) {
+      fn.outputSchema = config.outputSchema;
+    }
+
+    if (config.contextSchema !== undefined) {
+      fn.contextSchema = config.contextSchema;
+    }
+
+    if (config.nextTurnParams !== undefined) {
+      fn.nextTurnParams = config.nextTurnParams;
+    }
+
+    if (config.requireApproval !== undefined) {
+      fn.requireApproval = config.requireApproval;
+    }
+
+    if (config.onResponseReceived !== undefined) {
+      fn.onResponseReceived = config.onResponseReceived;
+    }
+
+    if (config.toModelOutput !== undefined) {
+      fn.toModelOutput = config.toModelOutput;
+    }
+
+    return {
+      type: ToolType.Function,
+      function: fn,
+    };
+  }
+
   // Check for manual tool first (execute === false)
   if (config.execute === false) {
     const fn: ManualTool<$ZodObject<$ZodShape>>['function'] = {
diff --git a/packages/agent/tests/unit/hitl-tool.test.ts b/packages/agent/tests/unit/hitl-tool.test.ts
new file mode 100644
index 0000000..69c117f
--- /dev/null
+++ b/packages/agent/tests/unit/hitl-tool.test.ts
@@ -0,0 +1,319 @@
+import type * as models from '@openrouter/sdk/models';
+import { describe, expect, it, vi } from 'vitest';
+import { z } from 'zod/v4';
+import {
+  applyOnResponseReceivedHooks,
+  executeHITLTool,
+  executeTool,
+} from '../../src/lib/tool-executor.js';
+import { tool } from '../../src/lib/tool.js';
+import type {
+  HITLTool,
+  ParsedToolCall,
+  Tool,
+  TurnContext,
+} from '../../src/lib/tool-types.js';
+import {
+  isAutoResolvableTool,
+  isHITLTool,
+  isManualTool,
+  ToolType,
+} from '../../src/lib/tool-types.js';
+
+const turnContext: TurnContext = { numberOfTurns: 1 };
+
+function makeToolCall(name: string, id: string, args: unknown): ParsedToolCall<Tool> {
+  return { id, name, arguments: args } as ParsedToolCall<Tool>;
+}
+
+describe('tool() factory — HITL tools', () => {
+  it('creates a HITL tool when onToolCalled is present', () => {
+    const approve = tool({
+      name: 'approve_payment',
+      description: 'Approve a payment',
+      inputSchema: z.object({ amount: z.number() }),
+      outputSchema: z.object({ ok: z.boolean() }),
+      onToolCalled: async (input) => {
+        return input.amount < 100 ? { ok: true } : null;
+      },
+      onResponseReceived: async (raw) => {
+        return raw as { ok: boolean };
+      },
+    });
+
+    expect(approve.type).toBe(ToolType.Function);
+    expect(approve.function.name).toBe('approve_payment');
+    expect(isHITLTool(approve)).toBe(true);
+    expect(isManualTool(approve)).toBe(false);
+    expect(isAutoResolvableTool(approve)).toBe(true);
+    expect('execute' in approve.function).toBe(false);
+  });
+
+  it('omits onResponseReceived when not provided', () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ x: z.number() }),
+      onToolCalled: async () => ({ ok: true }),
+    });
+    expect('onResponseReceived' in t.function).toBe(false);
+  });
+
+  it('isManualTool returns true only for tools with neither execute nor onToolCalled', () => {
+    const manual = tool({
+      name: 'manual',
+      inputSchema: z.object({ x: z.number() }),
+      execute: false,
+    });
+    const hitl = tool({
+      name: 'hitl',
+      inputSchema: z.object({ x: z.number() }),
+      onToolCalled: async () => null,
+    });
+    const regular = tool({
+      name: 'regular',
+      inputSchema: z.object({ x: z.number() }),
+      execute: async () => ({ y: 1 }),
+    });
+
+    expect(isManualTool(manual)).toBe(true);
+    expect(isManualTool(hitl)).toBe(false);
+    expect(isManualTool(regular)).toBe(false);
+  });
+});
+
+describe('executeHITLTool', () => {
+  it('returns a ToolExecutionResult when onToolCalled returns a value', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      outputSchema: z.object({ ok: z.boolean() }),
+      onToolCalled: async (input) => ({ ok: input.amount < 100 }),
+    });
+
+    const result = await executeHITLTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext);
+    expect(result).not.toBeNull();
+    expect(result?.error).toBeUndefined();
+    expect(result?.result).toEqual({ ok: true });
+  });
+
+  it('returns null when onToolCalled returns null (pause)', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+    });
+
+    const result = await executeHITLTool(t, makeToolCall('approve', 'c2', { amount: 5 }), turnContext);
+    expect(result).toBeNull();
+  });
+
+  it('captures thrown errors into the ToolExecutionResult', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => {
+        throw new Error('boom');
+      },
+    });
+
+    const result = await executeHITLTool(t, makeToolCall('approve', 'c3', { amount: 5 }), turnContext);
+    expect(result).not.toBeNull();
+    expect(result?.error).toBeInstanceOf(Error);
+    expect(result?.error?.message).toBe('boom');
+  });
+
+  it('validates onToolCalled return value against outputSchema', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      outputSchema: z.object({ ok: z.boolean() }),
+      // Return value doesn't match schema to force validation error
+      onToolCalled: async () => ({ ok: 'yes' as unknown as boolean }),
+    });
+
+    const result = await executeHITLTool(t, makeToolCall('approve', 'c4', { amount: 5 }), turnContext);
+    expect(result?.error).toBeDefined();
+  });
+});
+
+describe('executeTool dispatcher with HITL tools', () => {
+  it('routes HITL tools through executeHITLTool', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => ({ approved: true }),
+    });
+
+    const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext);
+    expect(result).not.toBeNull();
+    expect(result?.result).toEqual({ approved: true });
+  });
+
+  it('returns null for HITL pause through the dispatcher', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+    });
+
+    const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext);
+    expect(result).toBeNull();
+  });
+});
+
+describe('applyOnResponseReceivedHooks', () => {
+  function callItem(
+    callId: string,
+    name: string,
+    args = '{}',
+  ): models.OutputFunctionCallItem {
+    return {
+      type: 'function_call',
+      id: `fc_${callId}`,
+      callId,
+      name,
+      arguments: args,
+      status: 'completed',
+    };
+  }
+
+  function outputItem(callId: string, output: string): models.FunctionCallOutputItem {
+    return {
+      type: 'function_call_output',
+      id: `output_${callId}`,
+      callId,
+      output,
+    };
+  }
+
+  it('transforms a FunctionCallOutputItem when its tool has onResponseReceived', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+      onResponseReceived: async (raw) => {
+        const obj = raw as { ok: boolean };
+        return { ...obj, reviewedAt: 1234 };
+      },
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', JSON.stringify({ ok: true })),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    expect(Array.isArray(result)).toBe(true);
+    const arr = result as unknown[];
+    const out = arr[1] as models.FunctionCallOutputItem;
+    expect(out.type).toBe('function_call_output');
+    expect(out.output).toBe(JSON.stringify({ ok: true, reviewedAt: 1234 }));
+  });
+
+  it('leaves output unchanged when no matching tool has a hook', async () => {
+    const t = tool({
+      name: 'regular',
+      inputSchema: z.object({ x: z.number() }),
+      execute: async () => ({ y: 1 }),
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'regular'),
+      outputItem('c1', JSON.stringify({ y: 1 })),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    expect(result).toBe(input); // same reference, no rewrite
+  });
+
+  it('replaces output with an error object when the hook throws', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+      onResponseReceived: async () => {
+        throw new Error('invalid result');
+      },
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', JSON.stringify({ ok: true })),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    const arr = result as unknown[];
+    const out = arr[1] as models.FunctionCallOutputItem;
+    const parsed = JSON.parse(out.output as string) as { error: string };
+    expect(parsed.error).toBe('invalid result');
+  });
+
+  it('passes the parsed raw result (not the raw string) to the hook', async () => {
+    const spy = vi.fn(async (raw: unknown) => raw);
+    const t: HITLTool = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+      onResponseReceived: spy,
+    });
+
+    const payload = { ok: true, note: 'hi' };
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', JSON.stringify(payload)),
+    ];
+
+    await applyOnResponseReceivedHooks(input, [t], turnContext);
+    expect(spy).toHaveBeenCalledTimes(1);
+    expect(spy.mock.calls[0]?.[0]).toEqual(payload);
+  });
+
+  it('passes the raw string through to the hook when output is not JSON', async () => {
+    const spy = vi.fn(async (raw: unknown) => raw);
+    const t: HITLTool = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+      onResponseReceived: spy,
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', 'not-json'),
+    ];
+
+    await applyOnResponseReceivedHooks(input, [t], turnContext);
+    expect(spy.mock.calls[0]?.[0]).toBe('not-json');
+  });
+
+  it('leaves outputs whose callId has no matching function_call untouched', async () => {
+    const t: HITLTool = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+      onResponseReceived: async (raw) => ({ ...(raw as object), tagged: true }),
+    });
+
+    // No function_call in the input at all — just an orphan output
+    const input: models.InputsUnion = [outputItem('orphan', JSON.stringify({ ok: true }))];
+
+    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    expect(result).toBe(input);
+  });
+
+  it('ignores tools without onResponseReceived even if they are HITL', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({ amount: z.number() }),
+      onToolCalled: async () => null,
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', JSON.stringify({ ok: true })),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    expect(result).toBe(input);
+  });
+});

From 5155f4456ee3e0aed8734ad15a6949dbb068ee68 Mon Sep 17 00:00:00 2001
From: Matt Apperson <me@mattapperson.com>
Date: Tue, 5 May 2026 18:55:17 -0400
Subject: [PATCH 2/5] style: apply biome formatting and optional-chain
 suggestion

---
 packages/agent/src/lib/tool-executor.ts     |   2 +-
 packages/agent/src/lib/tool-types.ts        |   4 +-
 packages/agent/src/lib/tool.ts              |   4 +-
 packages/agent/tests/unit/hitl-tool.test.ts | 322 +++++++++++++++-----
 4 files changed, 256 insertions(+), 76 deletions(-)

diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts
index 52d2303..8929dea 100644
--- a/packages/agent/src/lib/tool-executor.ts
+++ b/packages/agent/src/lib/tool-executor.ts
@@ -530,7 +530,7 @@ export async function applyOnResponseReceivedHooks(
       continue;
     }
     const tool = hookByName.get(toolName);
-    if (!tool || !tool.function.onResponseReceived) {
+    if (!tool?.function.onResponseReceived) {
       rewritten.push(item);
       continue;
     }
diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts
index 2713776..f43bebc 100644
--- a/packages/agent/src/lib/tool-types.ts
+++ b/packages/agent/src/lib/tool-types.ts
@@ -610,9 +610,7 @@ export function isHITLTool(tool: Tool): tool is HITLTool {
   if (isServerTool(tool)) {
     return false;
   }
-  return (
-    'onToolCalled' in tool.function && typeof tool.function.onToolCalled === 'function'
-  );
+  return 'onToolCalled' in tool.function && typeof tool.function.onToolCalled === 'function';
 }
 
 /**
diff --git a/packages/agent/src/lib/tool.ts b/packages/agent/src/lib/tool.ts
index 043b4df..cb40c05 100644
--- a/packages/agent/src/lib/tool.ts
+++ b/packages/agent/src/lib/tool.ts
@@ -240,9 +240,7 @@ export function tool<
   TOutput extends $ZodType,
   TContext extends Record<string, unknown> = Record<string, unknown>,
   TName extends string = string,
->(
-  config: HITLToolConfig<TInput, TOutput, TContext, TName>,
-): HITLTool<TInput, TOutput, TContext>;
+>(config: HITLToolConfig<TInput, TOutput, TContext, TName>): HITLTool<TInput, TOutput, TContext>;
 
 // Overload for manual tools (execute: false)
 export function tool<TInput extends $ZodObject<$ZodShape>>(
diff --git a/packages/agent/tests/unit/hitl-tool.test.ts b/packages/agent/tests/unit/hitl-tool.test.ts
index 69c117f..6594917 100644
--- a/packages/agent/tests/unit/hitl-tool.test.ts
+++ b/packages/agent/tests/unit/hitl-tool.test.ts
@@ -1,18 +1,13 @@
 import type * as models from '@openrouter/sdk/models';
 import { describe, expect, it, vi } from 'vitest';
 import { z } from 'zod/v4';
+import { tool } from '../../src/lib/tool.js';
 import {
   applyOnResponseReceivedHooks,
   executeHITLTool,
   executeTool,
 } from '../../src/lib/tool-executor.js';
-import { tool } from '../../src/lib/tool.js';
-import type {
-  HITLTool,
-  ParsedToolCall,
-  Tool,
-  TurnContext,
-} from '../../src/lib/tool-types.js';
+import type { HITLTool, ParsedToolCall, Tool, TurnContext } from '../../src/lib/tool-types.js';
 import {
   isAutoResolvableTool,
   isHITLTool,
@@ -20,10 +15,16 @@ import {
   ToolType,
 } from '../../src/lib/tool-types.js';
 
-const turnContext: TurnContext = { numberOfTurns: 1 };
+const turnContext: TurnContext = {
+  numberOfTurns: 1,
+};
 
 function makeToolCall(name: string, id: string, args: unknown): ParsedToolCall<Tool> {
-  return { id, name, arguments: args } as ParsedToolCall<Tool>;
+  return {
+    id,
+    name,
+    arguments: args,
+  } as ParsedToolCall<Tool>;
 }
 
 describe('tool() factory — HITL tools', () => {
@@ -31,13 +32,23 @@ describe('tool() factory — HITL tools', () => {
     const approve = tool({
       name: 'approve_payment',
       description: 'Approve a payment',
-      inputSchema: z.object({ amount: z.number() }),
-      outputSchema: z.object({ ok: z.boolean() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async (input) => {
-        return input.amount < 100 ? { ok: true } : null;
+        return input.amount < 100
+          ? {
+              ok: true,
+            }
+          : null;
       },
       onResponseReceived: async (raw) => {
-        return raw as { ok: boolean };
+        return raw as {
+          ok: boolean;
+        };
       },
     });
 
@@ -52,8 +63,12 @@ describe('tool() factory — HITL tools', () => {
   it('omits onResponseReceived when not provided', () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ x: z.number() }),
-      onToolCalled: async () => ({ ok: true }),
+      inputSchema: z.object({
+        x: z.number(),
+      }),
+      onToolCalled: async () => ({
+        ok: true,
+      }),
     });
     expect('onResponseReceived' in t.function).toBe(false);
   });
@@ -61,18 +76,26 @@ describe('tool() factory — HITL tools', () => {
   it('isManualTool returns true only for tools with neither execute nor onToolCalled', () => {
     const manual = tool({
       name: 'manual',
-      inputSchema: z.object({ x: z.number() }),
+      inputSchema: z.object({
+        x: z.number(),
+      }),
       execute: false,
     });
     const hitl = tool({
       name: 'hitl',
-      inputSchema: z.object({ x: z.number() }),
+      inputSchema: z.object({
+        x: z.number(),
+      }),
       onToolCalled: async () => null,
     });
     const regular = tool({
       name: 'regular',
-      inputSchema: z.object({ x: z.number() }),
-      execute: async () => ({ y: 1 }),
+      inputSchema: z.object({
+        x: z.number(),
+      }),
+      execute: async () => ({
+        y: 1,
+      }),
     });
 
     expect(isManualTool(manual)).toBe(true);
@@ -85,38 +108,68 @@ describe('executeHITLTool', () => {
   it('returns a ToolExecutionResult when onToolCalled returns a value', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
-      outputSchema: z.object({ ok: z.boolean() }),
-      onToolCalled: async (input) => ({ ok: input.amount < 100 }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
+      onToolCalled: async (input) => ({
+        ok: input.amount < 100,
+      }),
     });
 
-    const result = await executeHITLTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext);
+    const result = await executeHITLTool(
+      t,
+      makeToolCall('approve', 'c1', {
+        amount: 5,
+      }),
+      turnContext,
+    );
     expect(result).not.toBeNull();
     expect(result?.error).toBeUndefined();
-    expect(result?.result).toEqual({ ok: true });
+    expect(result?.result).toEqual({
+      ok: true,
+    });
   });
 
   it('returns null when onToolCalled returns null (pause)', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
     });
 
-    const result = await executeHITLTool(t, makeToolCall('approve', 'c2', { amount: 5 }), turnContext);
+    const result = await executeHITLTool(
+      t,
+      makeToolCall('approve', 'c2', {
+        amount: 5,
+      }),
+      turnContext,
+    );
     expect(result).toBeNull();
   });
 
   it('captures thrown errors into the ToolExecutionResult', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => {
         throw new Error('boom');
       },
     });
 
-    const result = await executeHITLTool(t, makeToolCall('approve', 'c3', { amount: 5 }), turnContext);
+    const result = await executeHITLTool(
+      t,
+      makeToolCall('approve', 'c3', {
+        amount: 5,
+      }),
+      turnContext,
+    );
     expect(result).not.toBeNull();
     expect(result?.error).toBeInstanceOf(Error);
     expect(result?.error?.message).toBe('boom');
@@ -125,13 +178,25 @@ describe('executeHITLTool', () => {
   it('validates onToolCalled return value against outputSchema', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
-      outputSchema: z.object({ ok: z.boolean() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       // Return value doesn't match schema to force validation error
-      onToolCalled: async () => ({ ok: 'yes' as unknown as boolean }),
+      onToolCalled: async () => ({
+        ok: 'yes' as unknown as boolean,
+      }),
     });
 
-    const result = await executeHITLTool(t, makeToolCall('approve', 'c4', { amount: 5 }), turnContext);
+    const result = await executeHITLTool(
+      t,
+      makeToolCall('approve', 'c4', {
+        amount: 5,
+      }),
+      turnContext,
+    );
     expect(result?.error).toBeDefined();
   });
 });
@@ -140,33 +205,49 @@ describe('executeTool dispatcher with HITL tools', () => {
   it('routes HITL tools through executeHITLTool', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
-      onToolCalled: async () => ({ approved: true }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      onToolCalled: async () => ({
+        approved: true,
+      }),
     });
 
-    const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext);
+    const result = await executeTool(
+      t,
+      makeToolCall('approve', 'c1', {
+        amount: 5,
+      }),
+      turnContext,
+    );
     expect(result).not.toBeNull();
-    expect(result?.result).toEqual({ approved: true });
+    expect(result?.result).toEqual({
+      approved: true,
+    });
   });
 
   it('returns null for HITL pause through the dispatcher', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
     });
 
-    const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext);
+    const result = await executeTool(
+      t,
+      makeToolCall('approve', 'c1', {
+        amount: 5,
+      }),
+      turnContext,
+    );
     expect(result).toBeNull();
   });
 });
 
 describe('applyOnResponseReceivedHooks', () => {
-  function callItem(
-    callId: string,
-    name: string,
-    args = '{}',
-  ): models.OutputFunctionCallItem {
+  function callItem(callId: string, name: string, args = '{}'): models.OutputFunctionCallItem {
     return {
       type: 'function_call',
       id: `fc_${callId}`,
@@ -189,47 +270,87 @@ describe('applyOnResponseReceivedHooks', () => {
   it('transforms a FunctionCallOutputItem when its tool has onResponseReceived', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
       onResponseReceived: async (raw) => {
-        const obj = raw as { ok: boolean };
-        return { ...obj, reviewedAt: 1234 };
+        const obj = raw as {
+          ok: boolean;
+        };
+        return {
+          ...obj,
+          reviewedAt: 1234,
+        };
       },
     });
 
     const input: models.InputsUnion = [
       callItem('c1', 'approve'),
-      outputItem('c1', JSON.stringify({ ok: true })),
+      outputItem(
+        'c1',
+        JSON.stringify({
+          ok: true,
+        }),
+      ),
     ];
 
-    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
     expect(Array.isArray(result)).toBe(true);
     const arr = result as unknown[];
     const out = arr[1] as models.FunctionCallOutputItem;
     expect(out.type).toBe('function_call_output');
-    expect(out.output).toBe(JSON.stringify({ ok: true, reviewedAt: 1234 }));
+    expect(out.output).toBe(
+      JSON.stringify({
+        ok: true,
+        reviewedAt: 1234,
+      }),
+    );
   });
 
   it('leaves output unchanged when no matching tool has a hook', async () => {
     const t = tool({
       name: 'regular',
-      inputSchema: z.object({ x: z.number() }),
-      execute: async () => ({ y: 1 }),
+      inputSchema: z.object({
+        x: z.number(),
+      }),
+      execute: async () => ({
+        y: 1,
+      }),
     });
 
     const input: models.InputsUnion = [
       callItem('c1', 'regular'),
-      outputItem('c1', JSON.stringify({ y: 1 })),
+      outputItem(
+        'c1',
+        JSON.stringify({
+          y: 1,
+        }),
+      ),
     ];
 
-    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
     expect(result).toBe(input); // same reference, no rewrite
   });
 
   it('replaces output with an error object when the hook throws', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
       onResponseReceived: async () => {
         throw new Error('invalid result');
@@ -238,13 +359,26 @@ describe('applyOnResponseReceivedHooks', () => {
 
     const input: models.InputsUnion = [
       callItem('c1', 'approve'),
-      outputItem('c1', JSON.stringify({ ok: true })),
+      outputItem(
+        'c1',
+        JSON.stringify({
+          ok: true,
+        }),
+      ),
     ];
 
-    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
     const arr = result as unknown[];
     const out = arr[1] as models.FunctionCallOutputItem;
-    const parsed = JSON.parse(out.output as string) as { error: string };
+    const parsed = JSON.parse(out.output as string) as {
+      error: string;
+    };
     expect(parsed.error).toBe('invalid result');
   });
 
@@ -252,18 +386,29 @@ describe('applyOnResponseReceivedHooks', () => {
     const spy = vi.fn(async (raw: unknown) => raw);
     const t: HITLTool = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
       onResponseReceived: spy,
     });
 
-    const payload = { ok: true, note: 'hi' };
+    const payload = {
+      ok: true,
+      note: 'hi',
+    };
     const input: models.InputsUnion = [
       callItem('c1', 'approve'),
       outputItem('c1', JSON.stringify(payload)),
     ];
 
-    await applyOnResponseReceivedHooks(input, [t], turnContext);
+    await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
     expect(spy).toHaveBeenCalledTimes(1);
     expect(spy.mock.calls[0]?.[0]).toEqual(payload);
   });
@@ -272,7 +417,9 @@ describe('applyOnResponseReceivedHooks', () => {
     const spy = vi.fn(async (raw: unknown) => raw);
     const t: HITLTool = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
       onResponseReceived: spy,
     });
@@ -282,38 +429,75 @@ describe('applyOnResponseReceivedHooks', () => {
       outputItem('c1', 'not-json'),
     ];
 
-    await applyOnResponseReceivedHooks(input, [t], turnContext);
+    await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
     expect(spy.mock.calls[0]?.[0]).toBe('not-json');
   });
 
   it('leaves outputs whose callId has no matching function_call untouched', async () => {
     const t: HITLTool = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
-      onResponseReceived: async (raw) => ({ ...(raw as object), tagged: true }),
+      onResponseReceived: async (raw) => ({
+        ...(raw as object),
+        tagged: true,
+      }),
     });
 
     // No function_call in the input at all — just an orphan output
-    const input: models.InputsUnion = [outputItem('orphan', JSON.stringify({ ok: true }))];
+    const input: models.InputsUnion = [
+      outputItem(
+        'orphan',
+        JSON.stringify({
+          ok: true,
+        }),
+      ),
+    ];
 
-    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
     expect(result).toBe(input);
   });
 
   it('ignores tools without onResponseReceived even if they are HITL', async () => {
     const t = tool({
       name: 'approve',
-      inputSchema: z.object({ amount: z.number() }),
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
       onToolCalled: async () => null,
     });
 
     const input: models.InputsUnion = [
       callItem('c1', 'approve'),
-      outputItem('c1', JSON.stringify({ ok: true })),
+      outputItem(
+        'c1',
+        JSON.stringify({
+          ok: true,
+        }),
+      ),
     ];
 
-    const result = await applyOnResponseReceivedHooks(input, [t], turnContext);
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
     expect(result).toBe(input);
   });
 });

From dc2ba48b88662be93c0ccd3e1e17ced61a2a30b8 Mon Sep 17 00:00:00 2001
From: Matt Apperson <me@mattapperson.com>
Date: Tue, 5 May 2026 19:47:31 -0400
Subject: [PATCH 3/5] fix(tools): close HITL execution gaps surfaced in review

- Route auto-resolvable checks through isAutoResolvableTool so pure-HITL
  turns actually enter the execution loop and invoke onToolCalled.
- Propagate HITL pauses out of executeToolRound; the outer loop now
  persists pending calls under a new awaiting_hitl status and returns
  before issuing a follow-up request with missing outputs.
- Scope onResponseReceived hooks to freshly-supplied outputs on resume
  so caller-supplied outputs hooked at init aren't re-hooked.
- Preserve paused HITL calls in pendingToolCalls when they occur during
  approval resume instead of silently dropping them.
- Include originalOutput alongside error when onResponseReceived throws,
  so the model can distinguish hook failure from tool-reported error.
- Replace unsafe InputsUnion cast with a structurally-typed rewritten
  array.
---
 packages/agent/src/lib/model-result.ts      | 283 +++++-
 packages/agent/src/lib/tool-executor.ts     |  16 +-
 packages/agent/src/lib/tool-types.ts        |  19 +-
 packages/agent/tests/unit/hitl-tool.test.ts | 960 +++++++++++++++++++-
 4 files changed, 1214 insertions(+), 64 deletions(-)

diff --git a/packages/agent/src/lib/model-result.ts b/packages/agent/src/lib/model-result.ts
index 01eacc4..03bdca7 100644
--- a/packages/agent/src/lib/model-result.ts
+++ b/packages/agent/src/lib/model-result.ts
@@ -54,6 +54,7 @@ import { ToolEventBroadcaster } from './tool-event-broadcaster.js';
 import { applyOnResponseReceivedHooks, executeTool } from './tool-executor.js';
 import type {
   ConversationState,
+  ConversationStatus,
   InferToolEventsUnion,
   InferToolOutputsUnion,
   ParsedToolCall,
@@ -72,7 +73,6 @@ import type {
   UnsentToolResult,
 } from './tool-types.js';
 import {
-  hasExecuteFunction,
   isAutoResolvableTool,
   isClientTool,
   isServerTool,
@@ -592,24 +592,35 @@ export class ModelResult<
   }
 
   /**
-   * Check if any tool calls have execute functions.
+   * Check if any tool calls can be auto-resolved in the current turn.
    * Used to determine if automatic tool execution should be attempted.
    *
+   * A tool call is auto-resolvable if its tool has either an `execute` function
+   * (regular or generator) or an `onToolCalled` hook (HITL). HITL tools are
+   * included here because their hook fires before the model's follow-up request,
+   * even when the hook ultimately decides to pause by returning `null`.
+   *
    * @param toolCalls - The tool calls to check
-   * @returns True if at least one tool call has an executable function
+   * @returns True if at least one tool call is auto-resolvable
    */
   private hasExecutableToolCalls(toolCalls: ParsedToolCall<Tool>[]): boolean {
     return toolCalls.some((toolCall) => {
       const tool = this.options.tools?.find(
         (t) => isClientTool(t) && t.function.name === toolCall.name,
       );
-      return tool && hasExecuteFunction(tool);
+      return tool && isAutoResolvableTool(tool);
     });
   }
 
+  /**
+   * A manual tool call is one whose tool has neither an `execute` function nor
+   * an `onToolCalled` hook — i.e. the caller is expected to produce the output
+   * externally. HITL tools are auto-resolvable even when they pause, so they
+   * are not classified as manual here.
+   */
   private isManualToolCall(item: models.OutputFunctionCallItem): boolean {
     const tool = this.options.tools?.find((t) => isClientTool(t) && t.function.name === item.name);
-    return !!tool && !hasExecuteFunction(tool);
+    return !!tool && !isAutoResolvableTool(tool);
   }
 
   /**
@@ -738,18 +749,58 @@ export class ModelResult<
     return true; // Pause for approval
   }
 
+  /**
+   * Persist state when one or more HITL tools paused during a round.
+   *
+   * Mirrors `handleApprovalCheck` so paused HITL calls are surfaced through
+   * `pendingToolCalls` (visible via `getPendingToolCalls()` / `getState()`).
+   * Sets the status to `awaiting_hitl` so the caller can discriminate HITL
+   * pauses from approval pauses.
+   *
+   * Already-executed results from the same round are persisted on the turn's
+   * message history via `saveToolResultsToState` (called by the outer loop
+   * before this helper) — no need to duplicate them in `unsentToolResults`.
+   *
+   * @param currentResponse - The response that produced the paused tool calls
+   * @param pausedCalls - HITL tool calls whose `onToolCalled` returned `null`
+   */
+  private async persistHitlPause(
+    currentResponse: models.OpenResponsesResult,
+    pausedCalls: ParsedToolCall<Tool>[],
+  ): Promise<void> {
+    this.finalResponse = currentResponse;
+
+    if (!this.stateAccessor) {
+      return;
+    }
+
+    const stateUpdates: Partial<Omit<ConversationState<TTools>, 'id' | 'createdAt' | 'updatedAt'>> =
+      {
+        pendingToolCalls: pausedCalls as ParsedToolCall<TTools[number]>[],
+        status: 'awaiting_hitl',
+      };
+    await this.saveStateSafely(stateUpdates);
+  }
+
   /**
    * Execute all tools in a single round in parallel.
    * Emits tool.result events after tool execution completes.
    *
    * @param toolCalls - The tool calls to execute
    * @param turnContext - The current turn context
-   * @returns Array of function call outputs formatted for the API
+   * @returns Object with the function call outputs formatted for the API and
+   *   the list of HITL tool calls that paused (returned `null` from
+   *   `onToolCalled`). Callers should break out of the execution loop when
+   *   `pausedCalls` is non-empty rather than sending an incomplete set of
+   *   outputs back to the model.
    */
   private async executeToolRound(
     toolCalls: ParsedToolCall<Tool>[],
     turnContext: TurnContext,
-  ): Promise<models.FunctionCallOutputItem[]> {
+  ): Promise<{
+    toolResults: models.FunctionCallOutputItem[];
+    pausedCalls: ParsedToolCall<Tool>[];
+  }> {
     const toolCallPromises = toolCalls.map(async (toolCall) => {
       const tool = this.options.tools?.find(
         (t) => isClientTool(t) && t.function.name === toolCall.name,
@@ -824,6 +875,7 @@ export class ModelResult<
 
     const settledResults = await Promise.allSettled(toolCallPromises);
     const toolResults: models.FunctionCallOutputItem[] = [];
+    const pausedCalls: ParsedToolCall<Tool>[] = [];
 
     for (let i = 0; i < settledResults.length; i++) {
       const settled = settledResults[i];
@@ -873,9 +925,11 @@ export class ModelResult<
       }
 
       if (value.type === 'paused') {
-        // HITL tool returned null — emit nothing; the call surfaces to the caller
-        // for manual resume. The outer loop will end because toolResults lacks
-        // an entry for this call.
+        // HITL tool returned null — record the pause so the caller can break
+        // out of the outer loop before attempting a follow-up request with an
+        // incomplete set of outputs. The call will be surfaced via state
+        // (pendingToolCalls + status='awaiting_hitl') for manual resume.
+        pausedCalls.push(value.toolCall);
         continue;
       }
 
@@ -938,7 +992,10 @@ export class ModelResult<
       } satisfies ToolCallOutputEvent);
     }
 
-    return toolResults;
+    return {
+      toolResults,
+      pausedCalls,
+    };
   }
 
   /**
@@ -1100,6 +1157,81 @@ export class ModelResult<
     return rest as ResolvedCallModelInput;
   }
 
+  /**
+   * Apply `onResponseReceived` hooks to the freshly-produced tool outputs
+   * only, without re-hooking caller-supplied outputs that were already hooked
+   * during init. Uses the existing `function_call` items from message history
+   * for name resolution but drops them from the returned array.
+   *
+   * @param freshToolOutputs - Outputs produced in this turn (not yet hooked)
+   * @param messageHistory - Existing conversation messages; scanned for
+   *   matching `function_call` items to resolve tool names
+   * @param turnContext - Turn context for hook invocation
+   * @returns The tool outputs with hook-rewritten `output` values where a
+   *   matching HITL tool defines `onResponseReceived`
+   */
+  private async hookFreshToolOutputs(
+    freshToolOutputs: models.FunctionCallOutputItem[],
+    messageHistory: models.InputsUnion,
+    turnContext: TurnContext,
+  ): Promise<models.FunctionCallOutputItem[]> {
+    if (freshToolOutputs.length === 0) {
+      return freshToolOutputs;
+    }
+
+    // Collect function_call items from history so `applyOnResponseReceivedHooks`
+    // can resolve callId -> toolName. We don't mutate history here — the items
+    // are only used to power the internal callIdToName map.
+    const historyArray = Array.isArray(messageHistory)
+      ? messageHistory
+      : [
+          messageHistory,
+        ];
+    const functionCallItems: models.BaseInputsUnion[] = [];
+    for (const item of historyArray) {
+      if (item && typeof item === 'object' && 'type' in item && item.type === 'function_call') {
+        functionCallItems.push(item as models.BaseInputsUnion);
+      }
+    }
+
+    const syntheticInput: models.InputsUnion = [
+      ...functionCallItems,
+      ...freshToolOutputs,
+    ];
+
+    const hookedInput = await applyOnResponseReceivedHooks(
+      syntheticInput,
+      this.options.tools,
+      turnContext,
+      this.contextStore ?? undefined,
+      this.options.sharedContextSchema,
+    );
+
+    if (hookedInput === syntheticInput) {
+      // No rewrites; return the originals unchanged.
+      return freshToolOutputs;
+    }
+
+    // Extract the rewritten tool outputs (they sit after the function_calls).
+    const hookedArray = Array.isArray(hookedInput)
+      ? hookedInput
+      : [
+          hookedInput,
+        ];
+    const extracted: models.FunctionCallOutputItem[] = [];
+    for (const item of hookedArray) {
+      if (
+        item &&
+        typeof item === 'object' &&
+        'type' in item &&
+        item.type === 'function_call_output'
+      ) {
+        extracted.push(item as models.FunctionCallOutputItem);
+      }
+    }
+    return extracted.length === freshToolOutputs.length ? extracted : freshToolOutputs;
+  }
+
   /**
    * Safely persist state with error handling.
    * Wraps state save operations to ensure failures are properly reported.
@@ -1164,9 +1296,16 @@ export class ModelResult<
         if (loadedState) {
           this.currentState = loadedState;
 
-          // Check if we're resuming from awaiting_approval with decisions
+          // Check if we're resuming from awaiting_approval or awaiting_hitl
+          // with decisions. `awaiting_hitl` reuses `processApprovalDecisions`
+          // because the resume mechanism is identical — the caller supplies
+          // `approveToolCalls`/`rejectToolCalls` for paused call IDs, and we
+          // re-invoke `executeTool` on approved calls (which re-runs
+          // `onToolCalled` for HITL tools).
+          const isResumableStatus =
+            loadedState.status === 'awaiting_approval' || loadedState.status === 'awaiting_hitl';
           if (
-            loadedState.status === 'awaiting_approval' &&
+            isResumableStatus &&
             (this.approvedToolCalls.length > 0 || this.rejectedToolCalls.length > 0)
           ) {
             // Initialize context store before resuming so tools have access
@@ -1319,6 +1458,10 @@ export class ModelResult<
       // context is handled via contextStore, not on TurnContext
     };
 
+    // Track approved HITL calls that paused (onToolCalled returned null) —
+    // these stay in pendingToolCalls so the caller can resume them later.
+    const hitlPausedIds = new Set<string>();
+
     // Process approvals - execute the approved tools
     for (const callId of this.approvedToolCalls) {
       const toolCall = pendingCalls.find((tc) => tc.id === callId);
@@ -1347,7 +1490,9 @@ export class ModelResult<
       );
 
       if (result === null) {
-        // HITL tool paused on approval — no unsent result; caller resumes later
+        // HITL tool paused on approval — keep the call visible to the caller
+        // via pendingToolCalls (status becomes 'awaiting_hitl' below).
+        hitlPausedIds.add(callId);
         continue;
       }
 
@@ -1370,17 +1515,35 @@ export class ModelResult<
       unsentResults.push(createRejectedResult(callId, String(toolCall.name), 'Rejected by user'));
     }
 
-    // Remove processed calls from pending
-    const processedIds = new Set([
-      ...this.approvedToolCalls,
-      ...this.rejectedToolCalls,
-    ]);
+    // Remove processed calls from pending. Approved HITL calls that paused are
+    // NOT considered processed — they stay on pendingToolCalls so getPendingToolCalls()
+    // still surfaces them to the caller on resume.
+    const processedIds = new Set(
+      [
+        ...this.approvedToolCalls,
+        ...this.rejectedToolCalls,
+      ].filter((id) => !hitlPausedIds.has(id)),
+    );
     const remainingPending = pendingCalls.filter((tc) => !processedIds.has(tc.id));
 
+    // Determine status:
+    //   - Any still-unprocessed approval-required call keeps us in 'awaiting_approval'
+    //   - Otherwise, any HITL paused call moves us to 'awaiting_hitl'
+    //   - Otherwise, we continue with 'in_progress'
+    const remainingUnresolvedApprovals = remainingPending.filter((tc) => !hitlPausedIds.has(tc.id));
+    let nextStatus: ConversationStatus;
+    if (remainingUnresolvedApprovals.length > 0) {
+      nextStatus = 'awaiting_approval';
+    } else if (hitlPausedIds.size > 0) {
+      nextStatus = 'awaiting_hitl';
+    } else {
+      nextStatus = 'in_progress';
+    }
+
     // Update state - conditionally include optional properties only if they have values
     const stateUpdates: Partial<Omit<ConversationState<TTools>, 'id' | 'createdAt' | 'updatedAt'>> =
       {
-        status: remainingPending.length > 0 ? 'awaiting_approval' : 'in_progress',
+        status: nextStatus,
       };
     if (remainingPending.length > 0) {
       stateUpdates.pendingToolCalls = remainingPending;
@@ -1403,8 +1566,8 @@ export class ModelResult<
       await this.saveStateSafely();
     }
 
-    // If we still have pending approvals, stop here
-    if (remainingPending.length > 0) {
+    // If we are paused (for approval or for HITL), stop here
+    if (nextStatus !== 'in_progress') {
       return;
     }
 
@@ -1428,9 +1591,25 @@ export class ModelResult<
     // Convert to API format
     const toolOutputs = unsentResultsToAPIFormat(unsentResults);
 
-    // Build new input with tool results
+    // Build turn context for hook resolution
+    // numberOfTurns represents the current turn number (1-indexed after initial)
+    const turnContext: TurnContext = {
+      numberOfTurns: this.allToolExecutionRounds.length + 1,
+    };
+
+    // Apply onResponseReceived hooks ONLY to the newly-added tool outputs.
+    // `currentMessages` already contains caller-supplied outputs that were
+    // hooked during init (see `initStream` -> `applyOnResponseReceivedHooks`);
+    // re-hooking them would double-invoke non-idempotent hooks.
     const currentMessages = this.currentState.messages;
-    const newInput = appendToMessages(currentMessages, toolOutputs);
+    const hookedToolOutputs = await this.hookFreshToolOutputs(
+      toolOutputs,
+      currentMessages,
+      turnContext,
+    );
+
+    // Build new input with the hooked tool results appended
+    const newInput = appendToMessages(currentMessages, hookedToolOutputs);
 
     // Clear unsent results from state
     this.currentState = updateState(this.currentState, {
@@ -1442,27 +1621,15 @@ export class ModelResult<
     await this.saveStateSafely();
 
     // Build request with the updated input
-    // numberOfTurns represents the current turn number (1-indexed after initial)
-    const turnContext: TurnContext = {
-      numberOfTurns: this.allToolExecutionRounds.length + 1,
-    };
-
     const baseRequest = await this.resolveRequestForContext(turnContext);
 
-    // Apply onResponseReceived hooks to caller-supplied tool-output items
-    // (unsentToolResults merged into newInput above may include caller-provided outputs)
-    const hookedInput = await applyOnResponseReceivedHooks(
-      newInput,
-      this.options.tools,
-      turnContext,
-      this.contextStore ?? undefined,
-      this.options.sharedContextSchema,
-    );
-
-    // Create request with the accumulated messages
+    // Create request with the accumulated messages. No full-input re-hooking:
+    // the fresh tool outputs were hooked above via `hookFreshToolOutputs`, and
+    // previously-hooked items in `newInput` (carried over from init) must not
+    // be re-hooked here (non-idempotent hooks would double-fire).
     const request: models.ResponsesRequest = {
       ...baseRequest,
-      input: hookedInput,
+      input: newInput,
       stream: true,
     };
 
@@ -1504,8 +1671,14 @@ export class ModelResult<
     this.toolExecutionPromise = (async () => {
       await this.initStream();
 
-      // If resuming from approval and still pending, don't continue
-      if (this.isResumingFromApproval && this.currentState?.status === 'awaiting_approval') {
+      // If resuming from approval or HITL pause and still pending, don't continue.
+      // `processApprovalDecisions` runs in initStream for resumes; if it left us
+      // paused (any remaining pending calls), the outer loop should not execute.
+      if (
+        this.isResumingFromApproval &&
+        (this.currentState?.status === 'awaiting_approval' ||
+          this.currentState?.status === 'awaiting_hitl')
+      ) {
         return;
       }
 
@@ -1580,7 +1753,10 @@ export class ModelResult<
         await this.resolveAsyncFunctionsForTurn(turnContext);
 
         // Execute tools
-        const toolResults = await this.executeToolRound(currentToolCalls, turnContext);
+        const { toolResults, pausedCalls } = await this.executeToolRound(
+          currentToolCalls,
+          turnContext,
+        );
 
         // Server-tool output items are already-executed results in the
         // response; collect them so toolResults presents a unified list.
@@ -1619,6 +1795,14 @@ export class ModelResult<
         // Save tool results to state
         await this.saveToolResultsToState(toolResults);
 
+        // If any HITL tools paused this round, stop here without making a
+        // follow-up request — sending an incomplete set of outputs would be
+        // incorrect. Persist the paused calls so the caller can resume later.
+        if (pausedCalls.length > 0) {
+          await this.persistHitlPause(currentResponse, pausedCalls);
+          return;
+        }
+
         // Apply nextTurnParams
         await this.applyNextTurnParams(currentToolCalls);
 
@@ -2244,14 +2428,17 @@ export class ModelResult<
   // =========================================================================
 
   /**
-   * Check if the conversation requires human approval to continue.
-   * Returns true if there are pending tool calls awaiting approval.
+   * Check if the conversation requires human input to continue.
+   * Returns true when the conversation is paused waiting for caller-supplied
+   * decisions — either approval/rejection (`awaiting_approval`) or HITL tool
+   * resume (`awaiting_hitl`). Also returns true whenever `pendingToolCalls`
+   * is populated regardless of status.
    */
   async requiresApproval(): Promise<boolean> {
     await this.initStream();
 
-    // If we have pending tool calls in state, approval is required
-    if (this.currentState?.status === 'awaiting_approval') {
+    const status = this.currentState?.status;
+    if (status === 'awaiting_approval' || status === 'awaiting_hitl') {
       return true;
     }
 
diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts
index 8929dea..3ea3a74 100644
--- a/packages/agent/src/lib/tool-executor.ts
+++ b/packages/agent/src/lib/tool-executor.ts
@@ -517,8 +517,12 @@ export async function applyOnResponseReceivedHooks(
     }
   }
 
+  // Element type of the array form of InputsUnion — use this so `rewritten`
+  // is structurally assignable back to InputsUnion without an `as` cast.
+  type InputsArrayItem = Extract<models.InputsUnion, readonly unknown[]>[number];
+
   let changed = false;
-  const rewritten: unknown[] = [];
+  const rewritten: InputsArrayItem[] = [];
   for (const item of input) {
     if (!isFunctionCallOutputItem(item)) {
       rewritten.push(item);
@@ -554,17 +558,21 @@ export async function applyOnResponseReceivedHooks(
       );
       newOutput = JSON.stringify(hookResult);
     } catch (err) {
+      // Preserve the caller's original output alongside the hook error so the
+      // model can distinguish a hook failure from a tool-reported error.
       newOutput = JSON.stringify({
         error: err instanceof Error ? err.message : String(err),
+        originalOutput: parsed,
       });
     }
 
-    rewritten.push({
+    const replaced: models.FunctionCallOutputItem = {
       ...item,
       output: newOutput,
-    });
+    };
+    rewritten.push(replaced);
     changed = true;
   }
 
-  return changed ? (rewritten as models.InputsUnion) : input;
+  return changed ? rewritten : input;
 }
diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts
index f43bebc..d34d269 100644
--- a/packages/agent/src/lib/tool-types.ts
+++ b/packages/agent/src/lib/tool-types.ts
@@ -939,9 +939,22 @@ export interface PartialResponse<TTools extends readonly Tool[] = readonly Tool[
 }
 
 /**
- * Status of a conversation state
- */
-export type ConversationStatus = 'complete' | 'interrupted' | 'awaiting_approval' | 'in_progress';
+ * Status of a conversation state.
+ *
+ * - `in_progress`: conversation is actively executing
+ * - `complete`: conversation finished successfully
+ * - `interrupted`: execution was externally interrupted
+ * - `awaiting_approval`: tool calls are waiting for caller to approve/reject
+ * - `awaiting_hitl`: one or more HITL tools returned `null` from `onToolCalled`,
+ *   pausing execution so the caller can supply outputs for the paused calls
+ *   before resuming
+ */
+export type ConversationStatus =
+  | 'complete'
+  | 'interrupted'
+  | 'awaiting_approval'
+  | 'awaiting_hitl'
+  | 'in_progress';
 
 /**
  * State for multi-turn conversations with persistence and approval gates
diff --git a/packages/agent/tests/unit/hitl-tool.test.ts b/packages/agent/tests/unit/hitl-tool.test.ts
index 6594917..10b7bba 100644
--- a/packages/agent/tests/unit/hitl-tool.test.ts
+++ b/packages/agent/tests/unit/hitl-tool.test.ts
@@ -1,5 +1,6 @@
+import type { OpenRouterCore } from '@openrouter/sdk/core';
 import type * as models from '@openrouter/sdk/models';
-import { describe, expect, it, vi } from 'vitest';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
 import { z } from 'zod/v4';
 import { tool } from '../../src/lib/tool.js';
 import {
@@ -7,7 +8,14 @@ import {
   executeHITLTool,
   executeTool,
 } from '../../src/lib/tool-executor.js';
-import type { HITLTool, ParsedToolCall, Tool, TurnContext } from '../../src/lib/tool-types.js';
+import type {
+  ConversationState,
+  HITLTool,
+  ParsedToolCall,
+  StateAccessor,
+  Tool,
+  TurnContext,
+} from '../../src/lib/tool-types.js';
 import {
   isAutoResolvableTool,
   isHITLTool,
@@ -15,6 +23,19 @@ import {
   ToolType,
 } from '../../src/lib/tool-types.js';
 
+// Hoisted mock for the SDK's API transport. Any test that imports
+// ModelResult below will go through this mock instead of hitting the
+// network. Individual tests provide per-call canned responses with
+// `mockResolvedValueOnce`.
+const mockBetaResponsesSend = vi.hoisted(() => vi.fn());
+
+vi.mock('@openrouter/sdk/funcs/betaResponsesSend', () => ({
+  betaResponsesSend: mockBetaResponsesSend,
+}));
+
+// Import ModelResult AFTER vi.mock so the mock is wired in.
+const { ModelResult } = await import('../../src/lib/model-result.js');
+
 const turnContext: TurnContext = {
   numberOfTurns: 1,
 };
@@ -345,7 +366,7 @@ describe('applyOnResponseReceivedHooks', () => {
     expect(result).toBe(input); // same reference, no rewrite
   });
 
-  it('replaces output with an error object when the hook throws', async () => {
+  it('replaces output with an error object preserving the original output when the hook throws', async () => {
     const t = tool({
       name: 'approve',
       inputSchema: z.object({
@@ -357,14 +378,13 @@ describe('applyOnResponseReceivedHooks', () => {
       },
     });
 
+    const originalPayload = {
+      ok: true,
+      note: 'carry-me-through',
+    };
     const input: models.InputsUnion = [
       callItem('c1', 'approve'),
-      outputItem(
-        'c1',
-        JSON.stringify({
-          ok: true,
-        }),
-      ),
+      outputItem('c1', JSON.stringify(originalPayload)),
     ];
 
     const result = await applyOnResponseReceivedHooks(
@@ -378,8 +398,46 @@ describe('applyOnResponseReceivedHooks', () => {
     const out = arr[1] as models.FunctionCallOutputItem;
     const parsed = JSON.parse(out.output as string) as {
       error: string;
+      originalOutput: unknown;
     };
     expect(parsed.error).toBe('invalid result');
+    // Original caller-supplied output must be preserved (parsed form) so the
+    // model can distinguish a hook failure from a tool-reported error.
+    expect(parsed.originalOutput).toEqual(originalPayload);
+  });
+
+  it('preserves the raw string as originalOutput when the output is not JSON and the hook throws', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      onToolCalled: async () => null,
+      onResponseReceived: async () => {
+        throw new Error('invalid result');
+      },
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', 'not-json'),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
+    const arr = result as unknown[];
+    const out = arr[1] as models.FunctionCallOutputItem;
+    const parsed = JSON.parse(out.output as string) as {
+      error: string;
+      originalOutput: unknown;
+    };
+    expect(parsed.error).toBe('invalid result');
+    expect(parsed.originalOutput).toBe('not-json');
   });
 
   it('passes the parsed raw result (not the raw string) to the hook', async () => {
@@ -501,3 +559,887 @@ describe('applyOnResponseReceivedHooks', () => {
     expect(result).toBe(input);
   });
 });
+
+// ---------------------------------------------------------------------------
+// Integration tests — full HITL flow through ModelResult
+// ---------------------------------------------------------------------------
+//
+// These tests drive the HITL loop end-to-end: onToolCalled fires from
+// ModelResult.executeToolsIfNeeded, and the follow-up turn sends the
+// tool output (possibly transformed by onResponseReceived) back to the
+// mocked API. Everything below the describe block uses the vi.hoisted
+// mock for `@openrouter/sdk/funcs/betaResponsesSend` declared at the
+// top of this file.
+
+/**
+ * Build a completed OpenResponsesResult with the given output items.
+ * Intentionally fills only the fields ModelResult actually reads — the
+ * full schema is huge and any missing fields would fail the SDK's own
+ * inbound validation, but we never push through that path from a mock.
+ */
+function makeResponse(
+  id: string,
+  output: models.OpenResponsesResult['output'],
+): models.OpenResponsesResult {
+  return {
+    id,
+    object: 'response',
+    createdAt: 0,
+    model: 'test-model',
+    status: 'completed',
+    completedAt: 0,
+    output,
+    error: null,
+    incompleteDetails: null,
+    temperature: null,
+    topP: null,
+    presencePenalty: null,
+    frequencyPenalty: null,
+    metadata: null,
+    instructions: null,
+    tools: [],
+    toolChoice: 'auto',
+    parallelToolCalls: false,
+  };
+}
+
+/**
+ * Wrap a completed response in a one-shot ReadableStream of StreamEvents,
+ * mirroring what betaResponsesSend returns when stream: true is requested.
+ * Note: the single event satisfies `StreamEventsResponseCompleted` which
+ * is a member of the `StreamEvents` union — no cast needed.
+ */
+function makeCompletedStream(
+  response: models.OpenResponsesResult,
+): ReadableStream<models.StreamEvents> {
+  const completedEvent: models.StreamEventsResponseCompleted = {
+    type: 'response.completed',
+    response,
+    sequenceNumber: 0,
+  };
+  return new ReadableStream<models.StreamEvents>({
+    start(controller) {
+      controller.enqueue(completedEvent);
+      controller.close();
+    },
+  });
+}
+
+function makeFunctionCallItem(
+  callId: string,
+  name: string,
+  args: string,
+): models.OutputFunctionCallItem {
+  return {
+    type: 'function_call',
+    id: `fc_${callId}`,
+    callId,
+    name,
+    arguments: args,
+    status: 'completed',
+  };
+}
+
+function makeMessageItem(id: string, text: string): models.OutputMessage {
+  return {
+    id,
+    type: 'message',
+    role: 'assistant',
+    status: 'completed',
+    content: [
+      {
+        type: 'output_text',
+        text,
+        annotations: [],
+      },
+    ],
+  };
+}
+
+/**
+ * Typeguard for a completed betaResponsesSend invocation — narrows the
+ * loosely-typed `vi.fn()` mock call payload to the subset we need
+ * (the `responsesRequest` wrapper with the `input` / `tools` fields).
+ */
+function isSendCallArg(value: unknown): value is {
+  responsesRequest: {
+    input?: models.InputsUnion;
+    tools?: models.ResponsesRequestToolUnion[];
+  };
+} {
+  if (typeof value !== 'object' || value === null) {
+    return false;
+  }
+  if (!('responsesRequest' in value)) {
+    return false;
+  }
+  const rr = value.responsesRequest;
+  return typeof rr === 'object' && rr !== null;
+}
+
+/**
+ * Typeguard narrowing an InputsUnion array element to a function_call_output.
+ */
+function isFunctionCallOutput(item: unknown): item is models.FunctionCallOutputItem {
+  if (typeof item !== 'object' || item === null) {
+    return false;
+  }
+  if (!('type' in item)) {
+    return false;
+  }
+  return item.type === 'function_call_output';
+}
+
+/**
+ * Typeguard for a plain-object record (used to narrow the raw tool
+ * output inside onResponseReceived without a type assertion).
+ */
+function isRawRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value);
+}
+
+describe('HITL flow through ModelResult (integration)', () => {
+  beforeEach(() => {
+    mockBetaResponsesSend.mockReset();
+  });
+
+  it('auto-resolve: onToolCalled returning a value feeds the model on the next turn', async () => {
+    // Turn 0: model invokes the HITL tool.
+    const turn0 = makeResponse('resp_turn_0', [
+      makeFunctionCallItem(
+        'c1',
+        'approve',
+        JSON.stringify({
+          amount: 5,
+        }),
+      ),
+    ]);
+    // Turn 1: model replies after seeing the tool output.
+    const turn1 = makeResponse('resp_turn_1', [
+      makeMessageItem('msg_1', 'approved'),
+    ]);
+
+    mockBetaResponsesSend
+      .mockResolvedValueOnce({
+        ok: true,
+        value: makeCompletedStream(turn0),
+      })
+      .mockResolvedValueOnce({
+        ok: true,
+        value: makeCompletedStream(turn1),
+      });
+
+    const approve = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
+      onToolCalled: async () => ({
+        ok: true,
+      }),
+    });
+
+    const tools = [
+      approve,
+    ] as const;
+
+    const result = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: 'approve 5',
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+    });
+
+    const finalResponse = await result.getResponse();
+
+    // Both turns should have fired — pause did NOT trigger because
+    // onToolCalled returned a value.
+    expect(mockBetaResponsesSend).toHaveBeenCalledTimes(2);
+    expect(finalResponse.id).toBe('resp_turn_1');
+
+    // Second request's input must contain the function_call_output with
+    // the hook's return value.
+    const secondCallArg = mockBetaResponsesSend.mock.calls[1]?.[1];
+    if (!isSendCallArg(secondCallArg)) {
+      throw new Error('Second send call arg did not match expected shape');
+    }
+    const input = secondCallArg.responsesRequest.input;
+    if (!Array.isArray(input)) {
+      throw new Error('Expected second-turn input to be an array');
+    }
+    const output = input.find(isFunctionCallOutput);
+    if (!output) {
+      throw new Error('Expected a function_call_output in second-turn input');
+    }
+    expect(output.callId).toBe('c1');
+    const rawOutput = output.output;
+    if (typeof rawOutput !== 'string') {
+      throw new Error('Expected function_call_output.output to be a string');
+    }
+    expect(JSON.parse(rawOutput)).toEqual({
+      ok: true,
+    });
+  });
+
+  it('pause: onToolCalled returning null breaks the loop cleanly', async () => {
+    // Turn 0: model invokes the HITL tool. No turn 1 should follow.
+    const turn0 = makeResponse('resp_turn_0', [
+      makeFunctionCallItem(
+        'c1',
+        'approve',
+        JSON.stringify({
+          amount: 5,
+        }),
+      ),
+    ]);
+
+    mockBetaResponsesSend.mockResolvedValueOnce({
+      ok: true,
+      value: makeCompletedStream(turn0),
+    });
+
+    const approve = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      onToolCalled: async () => null,
+    });
+
+    const tools = [
+      approve,
+    ] as const;
+
+    const result = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: 'approve 5',
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+    });
+
+    const finalResponse = await result.getResponse();
+
+    // The loop must NOT have made a follow-up request — the caller is
+    // expected to resume by invoking callModel again with a
+    // function_call_output they produce externally.
+    expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1);
+
+    // The final response surfaces the pending function_call so the
+    // caller can see what needs to be resolved.
+    const toolCalls = await result.getToolCalls();
+    expect(toolCalls).toHaveLength(1);
+    expect(toolCalls[0]?.id).toBe('c1');
+    expect(toolCalls[0]?.name).toBe('approve');
+
+    // The response output should retain the paused function_call item.
+    const outputs = Array.isArray(finalResponse.output)
+      ? finalResponse.output
+      : [
+          finalResponse.output,
+        ];
+    const pendingCall = outputs.find(
+      (item): item is models.OutputFunctionCallItem =>
+        typeof item === 'object' &&
+        item !== null &&
+        'type' in item &&
+        item.type === 'function_call',
+    );
+    expect(pendingCall).toBeDefined();
+    expect(pendingCall?.callId).toBe('c1');
+  });
+
+  it('resume: caller-supplied function_call_output is transformed by onResponseReceived before the model sees it', async () => {
+    // Single turn: the caller resumes by supplying both the prior
+    // function_call and its function_call_output; the model replies
+    // after the hook rewrites the output.
+    const resumedTurn = makeResponse('resp_resumed', [
+      makeMessageItem('msg_1', 'approved after review'),
+    ]);
+
+    mockBetaResponsesSend.mockResolvedValueOnce({
+      ok: true,
+      value: makeCompletedStream(resumedTurn),
+    });
+
+    const onToolCalled = vi.fn(async () => null);
+    const approve = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      onToolCalled,
+      onResponseReceived: async (raw) => {
+        // Transform the caller's raw output before the model sees it.
+        if (!isRawRecord(raw)) {
+          return {
+            ok: false,
+            reviewedAt: 1234,
+          };
+        }
+        return {
+          ...raw,
+          reviewedAt: 1234,
+        };
+      },
+    });
+
+    const tools = [
+      approve,
+    ] as const;
+
+    // Caller-supplied input: the prior function_call plus the caller's
+    // raw function_call_output for c1.
+    const resumeInput: models.InputsUnion = [
+      makeFunctionCallItem(
+        'c1',
+        'approve',
+        JSON.stringify({
+          amount: 5,
+        }),
+      ),
+      {
+        type: 'function_call_output',
+        id: 'output_c1',
+        callId: 'c1',
+        output: JSON.stringify({
+          ok: true,
+        }),
+      },
+    ];
+
+    const result = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: resumeInput,
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+    });
+
+    const text = await result.getText();
+
+    // Only one API call — resume is a single turn because the caller
+    // pre-baked the function_call_output.
+    expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1);
+
+    // onToolCalled should NOT fire on resume — we are not reacting to a
+    // fresh function_call from the model, we are supplying a prior one.
+    expect(onToolCalled).not.toHaveBeenCalled();
+
+    // The request shipped to the mocked API must carry the hook's
+    // transformed output, not the caller's raw payload.
+    const firstCallArg = mockBetaResponsesSend.mock.calls[0]?.[1];
+    if (!isSendCallArg(firstCallArg)) {
+      throw new Error('First send call arg did not match expected shape');
+    }
+    const input = firstCallArg.responsesRequest.input;
+    if (!Array.isArray(input)) {
+      throw new Error('Expected resume input to be an array');
+    }
+    const output = input.find(isFunctionCallOutput);
+    if (!output) {
+      throw new Error('Expected a function_call_output in the resume input');
+    }
+    const rawOutput = output.output;
+    if (typeof rawOutput !== 'string') {
+      throw new Error('Expected function_call_output.output to be a string');
+    }
+    const parsed: unknown = JSON.parse(rawOutput);
+    expect(parsed).toEqual({
+      ok: true,
+      reviewedAt: 1234,
+    });
+
+    // The final text reflects the model's reply after it saw the
+    // transformed tool output.
+    expect(text).toBe('approved after review');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// ModelResult state-machine tests for HITL pause + resume fixes
+// (verify ConversationStatus transitions and pendingToolCalls visibility)
+// ---------------------------------------------------------------------------
+
+/**
+ * Minimal in-memory StateAccessor for driving ModelResult through
+ * awaiting_approval / awaiting_hitl transitions without a real DB.
+ */
+function createMemoryAccessor<TTools extends readonly Tool[]>(
+  initial: ConversationState<TTools> | null = null,
+): {
+  accessor: StateAccessor<TTools>;
+  get: () => ConversationState<TTools> | null;
+} {
+  let state = initial;
+  const accessor: StateAccessor<TTools> = {
+    load: async () => state,
+    save: async (next) => {
+      state = next;
+    },
+  };
+  return {
+    accessor,
+    get: () => state,
+  };
+}
+
+describe('HITL pause persists state (Fix #2)', () => {
+  beforeEach(() => {
+    mockBetaResponsesSend.mockReset();
+  });
+
+  it('HITL pause sets status=awaiting_hitl and populates pendingToolCalls', async () => {
+    const turn0 = makeResponse('resp_turn_0', [
+      makeFunctionCallItem(
+        'c1',
+        'approve',
+        JSON.stringify({
+          amount: 5,
+        }),
+      ),
+    ]);
+
+    mockBetaResponsesSend.mockResolvedValueOnce({
+      ok: true,
+      value: makeCompletedStream(turn0),
+    });
+
+    const approve = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      onToolCalled: async () => null,
+    });
+
+    const tools = [
+      approve,
+    ] as const;
+
+    const { accessor, get } = createMemoryAccessor<typeof tools>();
+
+    const result = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: 'approve 5',
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+      state: accessor,
+    });
+
+    // Drive the loop to completion (pause).
+    await result.getResponse();
+
+    // Only the initial turn was sent — no follow-up after pause.
+    expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1);
+
+    const saved = get();
+    expect(saved).not.toBeNull();
+    expect(saved?.status).toBe('awaiting_hitl');
+    expect(saved?.pendingToolCalls).toHaveLength(1);
+    expect(saved?.pendingToolCalls?.[0]?.id).toBe('c1');
+
+    // getPendingToolCalls() surfaces paused calls to the caller.
+    const pending = await result.getPendingToolCalls();
+    expect(pending).toHaveLength(1);
+    expect(pending[0]?.id).toBe('c1');
+
+    // requiresApproval() returns true for HITL pauses too.
+    expect(await result.requiresApproval()).toBe(true);
+  });
+});
+
+describe('Approved HITL pauses retain pendingToolCalls (Fix #9)', () => {
+  beforeEach(() => {
+    mockBetaResponsesSend.mockReset();
+  });
+
+  it('resume: approved HITL that returns null stays in pendingToolCalls with status awaiting_hitl', async () => {
+    // This tool pauses every time — simulating a caller that must
+    // explicitly supply the output externally later.
+    const onToolCalled = vi.fn(async () => null);
+    const approve = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      requireApproval: true,
+      onToolCalled,
+    });
+
+    const tools = [
+      approve,
+    ] as const;
+
+    // Pre-populate state as if the prior run left us awaiting_approval
+    // with one pending call c1.
+    const pending: ParsedToolCall<(typeof tools)[number]> = {
+      id: 'c1',
+      name: 'approve',
+      arguments: {
+        amount: 5,
+      },
+    };
+    const savedState: ConversationState<typeof tools> = {
+      id: 'conv_test',
+      messages: [
+        makeFunctionCallItem(
+          'c1',
+          'approve',
+          JSON.stringify({
+            amount: 5,
+          }),
+        ),
+      ],
+      pendingToolCalls: [
+        pending,
+      ],
+      status: 'awaiting_approval',
+      createdAt: 0,
+      updatedAt: 0,
+    };
+
+    const { accessor, get } = createMemoryAccessor<typeof tools>(savedState);
+
+    const result = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: 'approve 5',
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+      state: accessor,
+      // Caller approves c1 — but onToolCalled returns null, so the call
+      // must STAY on pendingToolCalls instead of silently disappearing.
+      approveToolCalls: [
+        'c1',
+      ],
+    });
+
+    // Drive initStream + processApprovalDecisions.
+    const pausedCalls = await result.getPendingToolCalls();
+
+    // The approved HITL call paused, so onToolCalled should have been
+    // invoked once during processApprovalDecisions.
+    expect(onToolCalled).toHaveBeenCalledTimes(1);
+
+    // No API request should have been made — we paused before any
+    // follow-up turn.
+    expect(mockBetaResponsesSend).not.toHaveBeenCalled();
+
+    // The call must remain in pendingToolCalls (Fix #9).
+    expect(pausedCalls).toHaveLength(1);
+    expect(pausedCalls[0]?.id).toBe('c1');
+
+    const saved = get();
+    expect(saved?.status).toBe('awaiting_hitl');
+    expect(saved?.pendingToolCalls).toHaveLength(1);
+    expect(saved?.pendingToolCalls?.[0]?.id).toBe('c1');
+  });
+});
+
+describe('hasExecutableToolCalls treats HITL tools as auto-resolvable (Fix #1)', () => {
+  beforeEach(() => {
+    mockBetaResponsesSend.mockReset();
+  });
+
+  it('HITL-only tool calls invoke onToolCalled instead of being classified as manual', async () => {
+    // If hasExecutableToolCalls still used hasExecuteFunction, the loop
+    // would exit immediately without invoking onToolCalled and no
+    // follow-up turn would be made. This test locks in that HITL tools
+    // DO drive the loop forward.
+    const onToolCalled = vi.fn(async () => ({
+      ok: true,
+    }));
+
+    const turn0 = makeResponse('resp_turn_0', [
+      makeFunctionCallItem(
+        'c1',
+        'approve',
+        JSON.stringify({
+          amount: 5,
+        }),
+      ),
+    ]);
+    const turn1 = makeResponse('resp_turn_1', [
+      makeMessageItem('msg_1', 'approved'),
+    ]);
+
+    mockBetaResponsesSend
+      .mockResolvedValueOnce({
+        ok: true,
+        value: makeCompletedStream(turn0),
+      })
+      .mockResolvedValueOnce({
+        ok: true,
+        value: makeCompletedStream(turn1),
+      });
+
+    const approve = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
+      onToolCalled,
+    });
+
+    const tools = [
+      approve,
+    ] as const;
+
+    const result = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: 'approve 5',
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+    });
+
+    const final = await result.getResponse();
+
+    // Two turns means the HITL tool WAS recognized as executable.
+    expect(mockBetaResponsesSend).toHaveBeenCalledTimes(2);
+    expect(onToolCalled).toHaveBeenCalledTimes(1);
+    expect(final.id).toBe('resp_turn_1');
+  });
+});
+
+describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix #8)', () => {
+  beforeEach(() => {
+    mockBetaResponsesSend.mockReset();
+  });
+
+  it('onResponseReceived fires once per caller-supplied output across init + resume', async () => {
+    // Scenario: caller supplies a function_call_output for a HITL tool in
+    // the initial input, the model requests another (approval-required)
+    // tool call in response, and after the caller approves that second
+    // call the loop resumes via continueWithUnsentResults. Before Fix #8
+    // the initial caller-supplied output would be re-hooked during
+    // resume because applyOnResponseReceivedHooks walked the full
+    // accumulated newInput.
+    const onResponseReceived = vi.fn(async (raw: unknown) => {
+      if (!isRawRecord(raw)) {
+        return {
+          hooked: true,
+        };
+      }
+      return {
+        ...raw,
+        hooked: true,
+      };
+    });
+
+    const approve = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      onToolCalled: async () => ({
+        ok: true,
+      }),
+      onResponseReceived,
+    });
+
+    const needsApproval = tool({
+      name: 'dangerous',
+      inputSchema: z.object({
+        x: z.number(),
+      }),
+      requireApproval: true,
+      execute: async () => ({
+        done: true,
+      }),
+    });
+
+    const tools = [
+      approve,
+      needsApproval,
+    ] as const;
+
+    // Initial model turn requests `dangerous`, forcing an approval pause.
+    const turn0 = makeResponse('resp_turn_0', [
+      makeFunctionCallItem(
+        'd1',
+        'dangerous',
+        JSON.stringify({
+          x: 1,
+        }),
+      ),
+    ]);
+    // After the caller approves d1 and execution resumes, the model
+    // replies with a plain message.
+    const turn1 = makeResponse('resp_turn_1', [
+      makeMessageItem('msg_1', 'done'),
+    ]);
+
+    mockBetaResponsesSend
+      .mockResolvedValueOnce({
+        ok: true,
+        value: makeCompletedStream(turn0),
+      })
+      .mockResolvedValueOnce({
+        ok: true,
+        value: makeCompletedStream(turn1),
+      });
+
+    // Caller-supplied initial input: a function_call_output for `approve`
+    // that will be hooked during init (via applyOnResponseReceivedHooks).
+    const initialInput: models.InputsUnion = [
+      makeFunctionCallItem(
+        'c0',
+        'approve',
+        JSON.stringify({
+          amount: 5,
+        }),
+      ),
+      {
+        type: 'function_call_output',
+        id: 'output_c0',
+        callId: 'c0',
+        output: JSON.stringify({
+          ok: true,
+        }),
+      },
+    ];
+
+    const { accessor } = createMemoryAccessor<typeof tools>();
+
+    // First pass: initial run. The initial caller-supplied output gets
+    // hooked once during init.
+    const result1 = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: initialInput,
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+          {
+            type: 'function',
+            name: 'dangerous',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+      state: accessor,
+    });
+
+    await result1.getResponse();
+    expect(onResponseReceived).toHaveBeenCalledTimes(1);
+
+    // Second pass: resume by approving d1. continueWithUnsentResults
+    // must NOT re-hook the earlier c0 output (which lives in the saved
+    // messages from the first run).
+    onResponseReceived.mockClear();
+
+    const result2 = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: initialInput,
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+          {
+            type: 'function',
+            name: 'dangerous',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+      state: accessor,
+      approveToolCalls: [
+        'd1',
+      ],
+    });
+
+    await result2.getResponse();
+
+    // On resume, `dangerous` is an execute tool (no hook) and the c0
+    // output lives in message history — the hook must NOT run again.
+    expect(onResponseReceived).not.toHaveBeenCalled();
+  });
+});

From 51a983d256d3594060283e6a95b3c2fac0e9cfff Mon Sep 17 00:00:00 2001
From: Matt Apperson <me@mattapperson.com>
Date: Tue, 5 May 2026 20:19:13 -0400
Subject: [PATCH 4/5] fix(tools): address HITL review feedback

- Preserve content-array FunctionCallOutputItem shapes in the HITL
  onResponseReceived pipeline instead of JSON.stringify'ing them; export
  isContentArray from conversation-state for reuse.
- Require outputSchema on HITL tool configs and use it to validate
  caller-supplied responses. Validation failures (and thrown hooks)
  replace the output with {error, originalOutput}; executeHITLTool now
  validates onToolCalled's return unconditionally.
- Stop re-processing historical function_call_output items at initStream.
  When resuming from saved state, only freshly-supplied input items are
  hooked; history's function_call items still power callId->name
  resolution. Refactored hookFreshToolOutputs into applyHooksToFreshItems.
- Drop the onResponseReceived call on SDK-generated outputs in
  continueWithUnsentResults. The hook is now strictly for caller-supplied
  outputs, matching its documented semantics.
---
 packages/agent/src/lib/conversation-state.ts |   4 +-
 packages/agent/src/lib/model-result.ts       | 172 +++--
 packages/agent/src/lib/tool-executor.ts      | 153 +++-
 packages/agent/src/lib/tool-types.ts         |   8 +-
 packages/agent/src/lib/tool.ts               |  22 +-
 packages/agent/tests/unit/hitl-tool.test.ts  | 724 ++++++++++++++++---
 6 files changed, 851 insertions(+), 232 deletions(-)

diff --git a/packages/agent/src/lib/conversation-state.ts b/packages/agent/src/lib/conversation-state.ts
index d1845b6..d672643 100644
--- a/packages/agent/src/lib/conversation-state.ts
+++ b/packages/agent/src/lib/conversation-state.ts
@@ -244,7 +244,9 @@ export function createRejectedResult<TTools extends readonly Tool[] = readonly T
  * Check if a value is a valid content array (array of input_text, input_image, or input_file blocks).
  * These can be passed directly as tool output without JSON.stringify.
  */
-function isContentArray(value: unknown): value is models.FunctionCallOutputItemOutputUnion1[] {
+export function isContentArray(
+  value: unknown,
+): value is models.FunctionCallOutputItemOutputUnion1[] {
   if (!Array.isArray(value)) {
     return false;
   }
diff --git a/packages/agent/src/lib/model-result.ts b/packages/agent/src/lib/model-result.ts
index 03bdca7..26d3500 100644
--- a/packages/agent/src/lib/model-result.ts
+++ b/packages/agent/src/lib/model-result.ts
@@ -1158,45 +1158,58 @@ export class ModelResult<
   }
 
   /**
-   * Apply `onResponseReceived` hooks to the freshly-produced tool outputs
-   * only, without re-hooking caller-supplied outputs that were already hooked
-   * during init. Uses the existing `function_call` items from message history
-   * for name resolution but drops them from the returned array.
+   * Apply `onResponseReceived` hooks to the freshly-supplied input items
+   * only, without re-hooking historical items that live in
+   * `currentState.messages`. Historical `function_call` items are passed to
+   * `applyOnResponseReceivedHooks` purely as callId → toolName
+   * name-resolution context and are dropped from the returned array.
    *
-   * @param freshToolOutputs - Outputs produced in this turn (not yet hooked)
-   * @param messageHistory - Existing conversation messages; scanned for
-   *   matching `function_call` items to resolve tool names
+   * This keeps hooks idempotent across `callModel` invocations on the same
+   * conversation: the first call hooks the caller-supplied output, and
+   * subsequent calls (which rehydrate state) do not re-fire it.
+   *
+   * @param freshItems - Items newly supplied this turn (not yet hooked).
+   *   May contain any mix of InputsUnion array members — only
+   *   `function_call_output` items are affected by hooks; everything else
+   *   is returned unchanged.
+   * @param historicalItems - Existing messages from loaded state. Only
+   *   `function_call` entries are consulted for name resolution; no other
+   *   items are inspected and none are mutated.
    * @param turnContext - Turn context for hook invocation
-   * @returns The tool outputs with hook-rewritten `output` values where a
-   *   matching HITL tool defines `onResponseReceived`
+   * @returns The fresh items in original order, with `output` rewritten on
+   *   any `function_call_output` whose matching HITL tool defines
+   *   `onResponseReceived`.
    */
-  private async hookFreshToolOutputs(
-    freshToolOutputs: models.FunctionCallOutputItem[],
-    messageHistory: models.InputsUnion,
+  private async applyHooksToFreshItems(
+    freshItems: models.BaseInputsUnion[],
+    historicalItems: models.InputsUnion,
     turnContext: TurnContext,
-  ): Promise<models.FunctionCallOutputItem[]> {
-    if (freshToolOutputs.length === 0) {
-      return freshToolOutputs;
+  ): Promise<models.BaseInputsUnion[]> {
+    if (freshItems.length === 0) {
+      return freshItems;
     }
 
-    // Collect function_call items from history so `applyOnResponseReceivedHooks`
-    // can resolve callId -> toolName. We don't mutate history here — the items
-    // are only used to power the internal callIdToName map.
-    const historyArray = Array.isArray(messageHistory)
-      ? messageHistory
+    // Collect function_call items from history so the hook executor can
+    // resolve callId -> toolName without us having to mirror that logic.
+    const historyArray = Array.isArray(historicalItems)
+      ? historicalItems
       : [
-          messageHistory,
+          historicalItems,
         ];
     const functionCallItems: models.BaseInputsUnion[] = [];
     for (const item of historyArray) {
-      if (item && typeof item === 'object' && 'type' in item && item.type === 'function_call') {
-        functionCallItems.push(item as models.BaseInputsUnion);
+      if (isFunctionCallItem(item)) {
+        functionCallItems.push(item);
       }
     }
 
+    // Build a synthetic input that puts the historical function_calls
+    // BEFORE the fresh items. `applyOnResponseReceivedHooks` only rewrites
+    // function_call_output items, so the function_call items are seen only
+    // as name-resolution context.
     const syntheticInput: models.InputsUnion = [
       ...functionCallItems,
-      ...freshToolOutputs,
+      ...freshItems,
     ];
 
     const hookedInput = await applyOnResponseReceivedHooks(
@@ -1209,27 +1222,21 @@ export class ModelResult<
 
     if (hookedInput === syntheticInput) {
       // No rewrites; return the originals unchanged.
-      return freshToolOutputs;
+      return freshItems;
     }
 
-    // Extract the rewritten tool outputs (they sit after the function_calls).
+    // Drop the leading function_call items we prepended; what remains is
+    // the fresh items in their original order (some with rewritten outputs).
     const hookedArray = Array.isArray(hookedInput)
       ? hookedInput
       : [
           hookedInput,
         ];
-    const extracted: models.FunctionCallOutputItem[] = [];
-    for (const item of hookedArray) {
-      if (
-        item &&
-        typeof item === 'object' &&
-        'type' in item &&
-        item.type === 'function_call_output'
-      ) {
-        extracted.push(item as models.FunctionCallOutputItem);
-      }
+    if (hookedArray.length !== syntheticInput.length) {
+      // Shouldn't happen (hooks only rewrite in-place), but be conservative.
+      return freshItems;
     }
-    return extracted.length === freshToolOutputs.length ? extracted : freshToolOutputs;
+    return hookedArray.slice(functionCallItems.length);
   }
 
   /**
@@ -1358,37 +1365,54 @@ export class ModelResult<
       // Resolve any async functions first
       let baseRequest = await this.resolveRequestForContext(initialContext);
 
-      // If we have state with existing messages, use those as input
-      if (
-        this.currentState?.messages &&
+      // Split input into "historical" (already in state.messages) and "fresh"
+      // (newly supplied this turn). `onResponseReceived` must fire only for
+      // fresh items — re-hooking historical outputs on every callModel call
+      // would double-invoke non-idempotent hooks.
+      const hasLoadedHistory =
+        !!this.currentState?.messages &&
         Array.isArray(this.currentState.messages) &&
-        this.currentState.messages.length > 0
-      ) {
-        // Append new input to existing messages
+        this.currentState.messages.length > 0;
+
+      if (hasLoadedHistory && this.currentState) {
+        // `currentState.messages` is InputsUnion — keep it as that union so
+        // appendToMessages (which expects InputsUnion) accepts it directly.
+        const historicalMessages: models.InputsUnion = this.currentState.messages;
+
+        // Normalize the caller-supplied input for this turn into an array of
+        // fresh items. Undefined stays undefined (no new items). The widening
+        // to BaseInputsUnion[] matches the signature of appendToMessages and
+        // mirrors the pre-existing pattern elsewhere in this file; the two
+        // union shapes (InputsUnion1 vs BaseInputsUnion1) describe the same
+        // SDK input items with different nominal types, and BaseInputsUnion
+        // already includes `any` in its element type, so the runtime shape
+        // is preserved either way.
         const newInput = baseRequest.input;
-        if (newInput) {
-          const inputArray = Array.isArray(newInput)
-            ? newInput
+        let freshItems: models.BaseInputsUnion[] | undefined;
+        if (newInput !== undefined) {
+          freshItems = Array.isArray(newInput)
+            ? (newInput as models.BaseInputsUnion[])
             : [
                 newInput,
               ];
-          baseRequest = {
-            ...baseRequest,
-            input: appendToMessages(
-              this.currentState.messages,
-              inputArray as models.BaseInputsUnion[],
-            ),
-          };
-        } else {
-          baseRequest = {
-            ...baseRequest,
-            input: this.currentState.messages,
-          };
         }
-      }
 
-      // Apply onResponseReceived hooks to caller-supplied tool-output items
-      if (baseRequest.input !== undefined) {
+        // Hook fresh items only (historical function_calls serve as
+        // name-resolution context). Leave historical items untouched.
+        const hookedFresh = freshItems
+          ? await this.applyHooksToFreshItems(freshItems, historicalMessages, initialContext)
+          : undefined;
+
+        baseRequest = {
+          ...baseRequest,
+          input: hookedFresh
+            ? appendToMessages(historicalMessages, hookedFresh)
+            : historicalMessages,
+        };
+      } else if (baseRequest.input !== undefined) {
+        // No loaded history — everything in input is fresh. Hook the whole
+        // thing (non-array inputs pass through applyOnResponseReceivedHooks
+        // unchanged).
         const hookedInput = await applyOnResponseReceivedHooks(
           baseRequest.input,
           this.options.tools,
@@ -1597,19 +1621,13 @@ export class ModelResult<
       numberOfTurns: this.allToolExecutionRounds.length + 1,
     };
 
-    // Apply onResponseReceived hooks ONLY to the newly-added tool outputs.
-    // `currentMessages` already contains caller-supplied outputs that were
-    // hooked during init (see `initStream` -> `applyOnResponseReceivedHooks`);
-    // re-hooking them would double-invoke non-idempotent hooks.
+    // Append SDK-generated tool outputs directly — `onResponseReceived` is
+    // reserved for caller-supplied outputs (the resume-with-function-call-
+    // output path, hooked during init). SDK-produced outputs from auto-
+    // executed tools already went through the tool's own execute/generator
+    // pipeline and must not be mutated by the resume hook.
     const currentMessages = this.currentState.messages;
-    const hookedToolOutputs = await this.hookFreshToolOutputs(
-      toolOutputs,
-      currentMessages,
-      turnContext,
-    );
-
-    // Build new input with the hooked tool results appended
-    const newInput = appendToMessages(currentMessages, hookedToolOutputs);
+    const newInput = appendToMessages(currentMessages, toolOutputs);
 
     // Clear unsent results from state
     this.currentState = updateState(this.currentState, {
@@ -1623,10 +1641,10 @@ export class ModelResult<
     // Build request with the updated input
     const baseRequest = await this.resolveRequestForContext(turnContext);
 
-    // Create request with the accumulated messages. No full-input re-hooking:
-    // the fresh tool outputs were hooked above via `hookFreshToolOutputs`, and
-    // previously-hooked items in `newInput` (carried over from init) must not
-    // be re-hooked here (non-idempotent hooks would double-fire).
+    // No hooking here: SDK-generated outputs are appended as-is and any
+    // caller-supplied items in `newInput` (carried over from init) were
+    // already hooked during `initStream` — re-hooking would double-fire
+    // non-idempotent hooks.
     const request: models.ResponsesRequest = {
       ...baseRequest,
       input: newInput,
diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts
index 3ea3a74..9b7ab46 100644
--- a/packages/agent/src/lib/tool-executor.ts
+++ b/packages/agent/src/lib/tool-executor.ts
@@ -1,6 +1,7 @@
 import type * as models from '@openrouter/sdk/models';
 import * as z4 from 'zod/v4';
 import type { $ZodObject, $ZodShape, $ZodType } from 'zod/v4/core';
+import { isContentArray } from './conversation-state.js';
 import { isFunctionCallItem, isFunctionCallOutputItem } from './stream-type-guards.js';
 import type { ToolContextStore } from './tool-context.js';
 import { buildToolExecuteContext } from './tool-context.js';
@@ -357,19 +358,12 @@ export async function executeHITLTool(
       return null;
     }
 
-    if (tool.function.outputSchema) {
-      const validatedOutput = validateToolOutput(tool.function.outputSchema, result);
-      return {
-        toolCallId: toolCall.id,
-        toolName: toolCall.name,
-        result: validatedOutput,
-      };
-    }
-
+    // outputSchema is required on HITL tools — validate unconditionally.
+    const validatedOutput = validateToolOutput(tool.function.outputSchema, result);
     return {
       toolCallId: toolCall.id,
       toolName: toolCall.name,
-      result,
+      result: validatedOutput,
     };
   } catch (error) {
     return {
@@ -471,15 +465,72 @@ function isItemArray(
 }
 
 /**
- * Walk the input items and apply `onResponseReceived` hooks for HITL tools.
+ * Serialize a value for `FunctionCallOutputItem.output`, preserving the
+ * content-array shape (input_text/input_image/input_file blocks) verbatim so
+ * multimodal outputs are not corrupted by JSON.stringify. All other values
+ * are stringified. Matches the detection used elsewhere in the SDK
+ * (`unsentResultsToAPIFormat`).
+ */
+function toFunctionCallOutputValue(
+  value: unknown,
+): string | models.FunctionCallOutputItemOutputUnion1[] {
+  if (isContentArray(value)) {
+    return value;
+  }
+  return JSON.stringify(value);
+}
+
+/**
+ * Format a validation/hook-error payload alongside the caller's original output.
+ * The wrapper itself is a synthetic object and is always JSON. The
+ * `originalOutput` slot preserves a content-array shape verbatim if the
+ * caller supplied one, so the model can still see the rich payload that
+ * triggered the error.
+ */
+function formatHookError(
+  message: string,
+  originalOutput: unknown,
+): string | models.FunctionCallOutputItemOutputUnion1[] {
+  if (isContentArray(originalOutput)) {
+    // When the caller supplied a content array, we cannot cram both the
+    // error wrapper and the content blocks into a single content array —
+    // the `FunctionCallOutputItemOutputUnion1` members are visible-to-model
+    // blocks, not arbitrary JSON. Emit a text block carrying the error plus
+    // the original content blocks so the model sees both.
+    return [
+      {
+        type: 'input_text',
+        text: JSON.stringify({
+          error: message,
+        }),
+      },
+      ...originalOutput,
+    ];
+  }
+  return JSON.stringify({
+    error: message,
+    originalOutput,
+  });
+}
+
+/**
+ * Walk the input items and apply HITL per-tool validation plus any
+ * `onResponseReceived` hooks.
  *
  * For each `function_call_output` item in `input`, locate the matching
  * `function_call` (by `callId`) to identify the tool name. If that tool is a
- * HITL tool with an `onResponseReceived` hook, invoke it with the parsed output
- * and replace the item's `output` with the hook's return value (stringified).
+ * HITL tool, validate the value the model will see against the tool's
+ * `outputSchema`:
+ *
+ * - With `onResponseReceived`: invoke it with the parsed raw output, validate
+ *   the hook's return against `outputSchema`, then serialize (preserving
+ *   content-array shapes). Hook throws or validation failures are replaced
+ *   with `{error, originalOutput}` so the model sees a tool error.
+ * - Without `onResponseReceived`: validate the parsed raw output against
+ *   `outputSchema`. If it matches, leave the item untouched. Otherwise
+ *   replace with `{error, originalOutput}` using the same shape.
  *
- * If a hook throws, the output is replaced with `{"error": "<message>"}` so the
- * model sees a tool error. Items that don't match a HITL tool are left untouched.
+ * Items that don't match a HITL tool are left untouched.
  *
  * @returns a new input array when any item was rewritten, otherwise the original input.
  */
@@ -499,14 +550,10 @@ export async function applyOnResponseReceivedHooks(
   if (hitlTools.length === 0) {
     return input;
   }
-  const hookByName = new Map<string, HITLTool>();
+  // All HITL tools participate now (validation runs even without a hook).
+  const hitlByName = new Map<string, HITLTool>();
   for (const t of hitlTools) {
-    if (t.function.onResponseReceived) {
-      hookByName.set(t.function.name, t);
-    }
-  }
-  if (hookByName.size === 0) {
-    return input;
+    hitlByName.set(t.function.name, t);
   }
 
   // Build callId -> name map from function_call items in the array
@@ -533,13 +580,15 @@ export async function applyOnResponseReceivedHooks(
       rewritten.push(item);
       continue;
     }
-    const tool = hookByName.get(toolName);
-    if (!tool?.function.onResponseReceived) {
+    const tool = hitlByName.get(toolName);
+    if (!tool) {
       rewritten.push(item);
       continue;
     }
 
     const raw = item.output;
+    // Parse string outputs as JSON when possible; leave content arrays and
+    // non-string inputs (e.g. already-parsed objects) alone.
     let parsed: unknown = raw;
     if (typeof raw === 'string') {
       try {
@@ -549,21 +598,49 @@ export async function applyOnResponseReceivedHooks(
       }
     }
 
-    const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema);
+    let newOutput: string | models.FunctionCallOutputItemOutputUnion1[];
 
-    let newOutput: string;
-    try {
-      const hookResult = await Promise.resolve(
-        tool.function.onResponseReceived(parsed, executeContext),
-      );
-      newOutput = JSON.stringify(hookResult);
-    } catch (err) {
-      // Preserve the caller's original output alongside the hook error so the
-      // model can distinguish a hook failure from a tool-reported error.
-      newOutput = JSON.stringify({
-        error: err instanceof Error ? err.message : String(err),
-        originalOutput: parsed,
-      });
+    if (tool.function.onResponseReceived) {
+      const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema);
+      try {
+        const hookResult = await Promise.resolve(
+          tool.function.onResponseReceived(parsed, executeContext),
+        );
+        // Validate the hook's return against outputSchema — the model must
+        // only see schema-conformant values. A validation failure here is
+        // surfaced the same way a hook throw is.
+        try {
+          validateToolOutput(tool.function.outputSchema, hookResult);
+        } catch (validationErr) {
+          newOutput = formatHookError(
+            validationErr instanceof Error ? validationErr.message : String(validationErr),
+            parsed,
+          );
+          const replaced: models.FunctionCallOutputItem = {
+            ...item,
+            output: newOutput,
+          };
+          rewritten.push(replaced);
+          changed = true;
+          continue;
+        }
+        newOutput = toFunctionCallOutputValue(hookResult);
+      } catch (err) {
+        // Preserve the caller's original output alongside the hook error so
+        // the model can distinguish a hook failure from a tool-reported error.
+        newOutput = formatHookError(err instanceof Error ? err.message : String(err), parsed);
+      }
+    } else {
+      // No hook — validate the parsed raw output against outputSchema.
+      // On success, leave the item untouched (same as the pre-change
+      // behavior). On failure, replace with an error wrapper so the model
+      // sees why the caller-supplied payload was rejected.
+      const parseResult = z4.safeParse(tool.function.outputSchema, parsed);
+      if (parseResult.success) {
+        rewritten.push(item);
+        continue;
+      }
+      newOutput = formatHookError(parseResult.error.message, parsed);
     }
 
     const replaced: models.FunctionCallOutputItem = {
diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts
index d34d269..e768358 100644
--- a/packages/agent/src/lib/tool-types.ts
+++ b/packages/agent/src/lib/tool-types.ts
@@ -296,7 +296,13 @@ export interface HITLToolFunction<
   TContext extends Record<string, unknown> = Record<string, unknown>,
   TName extends string = string,
 > extends BaseToolFunction<TInput> {
-  outputSchema?: TOutput;
+  /**
+   * Required for HITL tools. Used to validate both the `onToolCalled` return
+   * value (when non-null) and the caller-supplied response that comes back via
+   * a matching `function_call_output` — whether transformed by
+   * `onResponseReceived` or passed through directly when no hook is defined.
+   */
+  outputSchema: TOutput;
   onToolCalled: (
     params: zodInfer<TInput>,
     context?: ToolExecuteContext<TName, TContext>,
diff --git a/packages/agent/src/lib/tool.ts b/packages/agent/src/lib/tool.ts
index cb40c05..2d09bae 100644
--- a/packages/agent/src/lib/tool.ts
+++ b/packages/agent/src/lib/tool.ts
@@ -130,7 +130,13 @@ type HITLToolConfig<
   name: TName;
   description?: string;
   inputSchema: TInput;
-  outputSchema?: TOutput;
+  /**
+   * Required for HITL tools. Used to validate both the `onToolCalled` return
+   * value (when non-null) and the caller-supplied response that comes back via
+   * a matching `function_call_output` — whether transformed by
+   * `onResponseReceived` or passed through directly when no hook is defined.
+   */
+  outputSchema: TOutput;
   eventSchema?: undefined;
   execute?: undefined;
   /** Zod schema declaring the context data this tool needs */
@@ -293,9 +299,19 @@ export function tool(
 
   // Check for HITL tool (has onToolCalled hook)
   if ('onToolCalled' in config && typeof config.onToolCalled === 'function') {
+    // outputSchema is required at the type level for HITL configs, but
+    // defensively check at runtime too — JavaScript callers can bypass types.
+    const hitlName = config.name;
+    if (!('outputSchema' in config) || config.outputSchema === undefined) {
+      throw new Error(
+        `HITL tool "${hitlName}" must declare an outputSchema. HITL tools require a schema so caller-supplied responses can be validated before the model sees them.`,
+      );
+    }
+
     const fn: HITLTool<$ZodObject<$ZodShape>, $ZodType>['function'] = {
       name: config.name,
       inputSchema: config.inputSchema,
+      outputSchema: config.outputSchema,
       onToolCalled: config.onToolCalled,
     };
 
@@ -303,10 +319,6 @@ export function tool(
       fn.description = config.description;
     }
 
-    if (config.outputSchema !== undefined) {
-      fn.outputSchema = config.outputSchema;
-    }
-
     if (config.contextSchema !== undefined) {
       fn.contextSchema = config.contextSchema;
     }
diff --git a/packages/agent/tests/unit/hitl-tool.test.ts b/packages/agent/tests/unit/hitl-tool.test.ts
index 10b7bba..30c6830 100644
--- a/packages/agent/tests/unit/hitl-tool.test.ts
+++ b/packages/agent/tests/unit/hitl-tool.test.ts
@@ -87,6 +87,9 @@ describe('tool() factory — HITL tools', () => {
       inputSchema: z.object({
         x: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => ({
         ok: true,
       }),
@@ -94,6 +97,21 @@ describe('tool() factory — HITL tools', () => {
     expect('onResponseReceived' in t.function).toBe(false);
   });
 
+  it('throws when a HITL tool is created without an outputSchema', () => {
+    expect(() =>
+      tool({
+        name: 'approve',
+        inputSchema: z.object({
+          x: z.number(),
+        }),
+        // @ts-expect-error outputSchema is now required for HITL tools
+        onToolCalled: async () => ({
+          ok: true,
+        }),
+      }),
+    ).toThrow(/outputSchema/);
+  });
+
   it('isManualTool returns true only for tools with neither execute nor onToolCalled', () => {
     const manual = tool({
       name: 'manual',
@@ -107,6 +125,9 @@ describe('tool() factory — HITL tools', () => {
       inputSchema: z.object({
         x: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
     });
     const regular = tool({
@@ -160,6 +181,9 @@ describe('executeHITLTool', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
     });
 
@@ -179,6 +203,9 @@ describe('executeHITLTool', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => {
         throw new Error('boom');
       },
@@ -229,6 +256,9 @@ describe('executeTool dispatcher with HITL tools', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        approved: z.boolean(),
+      }),
       onToolCalled: async () => ({
         approved: true,
       }),
@@ -253,6 +283,9 @@ describe('executeTool dispatcher with HITL tools', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
     });
 
@@ -294,13 +327,20 @@ describe('applyOnResponseReceivedHooks', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+        reviewedAt: z.number(),
+      }),
       onToolCalled: async () => null,
       onResponseReceived: async (raw) => {
-        const obj = raw as {
-          ok: boolean;
-        };
+        if (!isRawRecord(raw)) {
+          return {
+            ok: false,
+            reviewedAt: 1234,
+          };
+        }
         return {
-          ...obj,
+          ...raw,
           reviewedAt: 1234,
         };
       },
@@ -372,6 +412,9 @@ describe('applyOnResponseReceivedHooks', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
       onResponseReceived: async () => {
         throw new Error('invalid result');
@@ -412,6 +455,9 @@ describe('applyOnResponseReceivedHooks', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
       onResponseReceived: async () => {
         throw new Error('invalid result');
@@ -441,12 +487,20 @@ describe('applyOnResponseReceivedHooks', () => {
   });
 
   it('passes the parsed raw result (not the raw string) to the hook', async () => {
-    const spy = vi.fn(async (raw: unknown) => raw);
+    // Loose outputSchema accepting any object keeps the focus of this
+    // test on argument propagation, not schema validation.
+    const spy = vi.fn(async (raw: unknown) => {
+      if (!isRawRecord(raw)) {
+        return {};
+      }
+      return raw;
+    });
     const t: HITLTool = tool({
       name: 'approve',
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.record(z.string(), z.unknown()),
       onToolCalled: async () => null,
       onResponseReceived: spy,
     });
@@ -472,12 +526,19 @@ describe('applyOnResponseReceivedHooks', () => {
   });
 
   it('passes the raw string through to the hook when output is not JSON', async () => {
-    const spy = vi.fn(async (raw: unknown) => raw);
+    // Permissive outputSchema so the string hook-return passes validation.
+    const spy = vi.fn(async (raw: unknown) => {
+      if (typeof raw !== 'string') {
+        return 'fallback';
+      }
+      return raw;
+    });
     const t: HITLTool = tool({
       name: 'approve',
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.string(),
       onToolCalled: async () => null,
       onResponseReceived: spy,
     });
@@ -503,11 +564,19 @@ describe('applyOnResponseReceivedHooks', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.record(z.string(), z.unknown()),
       onToolCalled: async () => null,
-      onResponseReceived: async (raw) => ({
-        ...(raw as object),
-        tagged: true,
-      }),
+      onResponseReceived: async (raw) => {
+        if (!isRawRecord(raw)) {
+          return {
+            tagged: true,
+          };
+        }
+        return {
+          ...raw,
+          tagged: true,
+        };
+      },
     });
 
     // No function_call in the input at all — just an orphan output
@@ -530,12 +599,15 @@ describe('applyOnResponseReceivedHooks', () => {
     expect(result).toBe(input);
   });
 
-  it('ignores tools without onResponseReceived even if they are HITL', async () => {
+  it('validates raw output against outputSchema when no onResponseReceived hook is defined (passes)', async () => {
     const t = tool({
       name: 'approve',
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
     });
 
@@ -549,6 +621,264 @@ describe('applyOnResponseReceivedHooks', () => {
       ),
     ];
 
+    // Schema-conformant raw outputs should leave the item (and the array
+    // reference) untouched.
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
+    expect(result).toBe(input);
+  });
+
+  it('replaces output with an error object when raw does not match outputSchema and no hook is defined', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
+      onToolCalled: async () => null,
+    });
+
+    const badPayload = {
+      ok: 'nope',
+    };
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', JSON.stringify(badPayload)),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
+    const arr = result as unknown[];
+    const out = arr[1] as models.FunctionCallOutputItem;
+    const parsed = JSON.parse(out.output as string) as {
+      error: string;
+      originalOutput: unknown;
+    };
+    expect(typeof parsed.error).toBe('string');
+    expect(parsed.error.length).toBeGreaterThan(0);
+    expect(parsed.originalOutput).toEqual(badPayload);
+  });
+
+  it('replaces output with an error object when the hook return does not match outputSchema', async () => {
+    const t = tool({
+      name: 'approve',
+      inputSchema: z.object({
+        amount: z.number(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
+      onToolCalled: async () => null,
+      // Intentionally return a value that does not match outputSchema.
+      onResponseReceived: async () => ({
+        ok: 'yes' as unknown as boolean,
+      }),
+    });
+
+    const originalPayload = {
+      ok: true,
+    };
+    const input: models.InputsUnion = [
+      callItem('c1', 'approve'),
+      outputItem('c1', JSON.stringify(originalPayload)),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
+    const arr = result as unknown[];
+    const out = arr[1] as models.FunctionCallOutputItem;
+    const parsed = JSON.parse(out.output as string) as {
+      error: string;
+      originalOutput: unknown;
+    };
+    expect(typeof parsed.error).toBe('string');
+    expect(parsed.error.length).toBeGreaterThan(0);
+    expect(parsed.originalOutput).toEqual(originalPayload);
+  });
+
+  it('preserves a content-array hook return verbatim (no JSON.stringify)', async () => {
+    const contentArray = [
+      {
+        type: 'input_text' as const,
+        text: 'hello',
+      },
+      {
+        type: 'input_image' as const,
+        detail: 'auto' as const,
+        imageUrl: 'https://example.com/img.png',
+      },
+    ];
+
+    const t = tool({
+      name: 'describe',
+      inputSchema: z.object({
+        id: z.string(),
+      }),
+      // Loose schema — we care about shape preservation, not the schema type.
+      outputSchema: z.array(z.record(z.string(), z.unknown())),
+      onToolCalled: async () => null,
+      onResponseReceived: async () => contentArray,
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'describe'),
+      outputItem(
+        'c1',
+        JSON.stringify({
+          ok: true,
+        }),
+      ),
+    ];
+
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
+    const arr = result as unknown[];
+    const out = arr[1] as models.FunctionCallOutputItem;
+    // output should be the content-array verbatim, not a JSON string.
+    expect(Array.isArray(out.output)).toBe(true);
+    expect(out.output).toEqual(contentArray);
+  });
+
+  it('preserves a caller-supplied content-array output across no-hook validation', async () => {
+    const contentArray = [
+      {
+        type: 'input_text' as const,
+        text: 'plain text',
+      },
+    ];
+
+    const t = tool({
+      name: 'describe',
+      inputSchema: z.object({
+        id: z.string(),
+      }),
+      outputSchema: z.array(z.record(z.string(), z.unknown())),
+      onToolCalled: async () => null,
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'describe'),
+      {
+        type: 'function_call_output',
+        id: 'output_c1',
+        callId: 'c1',
+        output: contentArray,
+      },
+    ];
+
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
+    // Schema-conformant content-array output should be untouched.
+    expect(result).toBe(input);
+  });
+
+  it('preserves a caller-supplied content-array in originalOutput when the hook throws', async () => {
+    const contentArray = [
+      {
+        type: 'input_text' as const,
+        text: 'caller payload',
+      },
+    ];
+
+    const t = tool({
+      name: 'describe',
+      inputSchema: z.object({
+        id: z.string(),
+      }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
+      onToolCalled: async () => null,
+      onResponseReceived: async () => {
+        throw new Error('bad hook');
+      },
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'describe'),
+      {
+        type: 'function_call_output',
+        id: 'output_c1',
+        callId: 'c1',
+        output: contentArray,
+      },
+    ];
+
+    const result = await applyOnResponseReceivedHooks(
+      input,
+      [
+        t,
+      ],
+      turnContext,
+    );
+    const arr = result as unknown[];
+    const out = arr[1] as models.FunctionCallOutputItem;
+    // A content-array originalOutput cannot be embedded inside a JSON
+    // wrapper (the output union is a content array of visible blocks, not
+    // arbitrary JSON). The replacement uses a content-array form: the
+    // error as an input_text block followed by the original blocks.
+    if (!Array.isArray(out.output)) {
+      throw new Error('Expected content-array output on hook failure');
+    }
+    const errBlock = out.output[0];
+    expect(errBlock?.type).toBe('input_text');
+    if (!errBlock || errBlock.type !== 'input_text') {
+      throw new Error('Expected first block to be input_text');
+    }
+    const errPayload = JSON.parse(errBlock.text) as {
+      error: string;
+    };
+    expect(errPayload.error).toBe('bad hook');
+    // Remaining blocks are the caller's original content-array payload.
+    expect(out.output.slice(1)).toEqual(contentArray);
+  });
+
+  it('ignores manual tools (no onToolCalled) — they are not HITL', async () => {
+    const t = tool({
+      name: 'manual',
+      inputSchema: z.object({
+        x: z.number(),
+      }),
+      execute: false,
+    });
+
+    const input: models.InputsUnion = [
+      callItem('c1', 'manual'),
+      outputItem(
+        'c1',
+        JSON.stringify({
+          y: 1,
+        }),
+      ),
+    ];
+
     const result = await applyOnResponseReceivedHooks(
       input,
       [
@@ -817,6 +1147,9 @@ describe('HITL flow through ModelResult (integration)', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
     });
 
@@ -892,6 +1225,10 @@ describe('HITL flow through ModelResult (integration)', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+        reviewedAt: z.number(),
+      }),
       onToolCalled,
       onResponseReceived: async (raw) => {
         // Transform the caller's raw output before the model sees it.
@@ -902,7 +1239,7 @@ describe('HITL flow through ModelResult (integration)', () => {
           };
         }
         return {
-          ...raw,
+          ok: raw.ok === true,
           reviewedAt: 1234,
         };
       },
@@ -1044,6 +1381,9 @@ describe('HITL pause persists state (Fix #2)', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       onToolCalled: async () => null,
     });
 
@@ -1108,6 +1448,9 @@ describe('Approved HITL pauses retain pendingToolCalls (Fix #9)', () => {
       inputSchema: z.object({
         amount: z.number(),
       }),
+      outputSchema: z.object({
+        ok: z.boolean(),
+      }),
       requireApproval: true,
       onToolCalled,
     });
@@ -1271,20 +1614,29 @@ describe('hasExecutableToolCalls treats HITL tools as auto-resolvable (Fix #1)',
   });
 });
 
-describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix #8)', () => {
+describe('continueWithUnsentResults does not hook SDK-generated outputs (Fix #4)', () => {
   beforeEach(() => {
     mockBetaResponsesSend.mockReset();
   });
 
-  it('onResponseReceived fires once per caller-supplied output across init + resume', async () => {
-    // Scenario: caller supplies a function_call_output for a HITL tool in
-    // the initial input, the model requests another (approval-required)
-    // tool call in response, and after the caller approves that second
-    // call the loop resumes via continueWithUnsentResults. Before Fix #8
-    // the initial caller-supplied output would be re-hooked during
-    // resume because applyOnResponseReceivedHooks walked the full
-    // accumulated newInput.
+  it('onResponseReceived does NOT fire for SDK-generated outputs produced during continueWithUnsentResults', async () => {
+    // Scenario: a HITL tool requires approval AND defines both onToolCalled
+    // (which produces an output on approval) and onResponseReceived (which
+    // is documented to apply only to CALLER-supplied outputs — i.e. the
+    // resume-by-passing-a-function_call_output path). When the caller
+    // approves the pending call, processApprovalDecisions invokes
+    // onToolCalled, queues the result in unsentToolResults, and
+    // continueWithUnsentResults stitches it onto the message history and
+    // sends the follow-up request. The SDK-generated output must NOT be
+    // run through onResponseReceived — that hook is reserved for outputs
+    // the caller produces externally.
+    const onToolCalled = vi.fn(async () => ({
+      ok: true,
+    }));
     const onResponseReceived = vi.fn(async (raw: unknown) => {
+      // Should never run in this test. If it does, the assertion below
+      // catches it; returning a distinct payload makes misbehaviour
+      // easy to diagnose in the wire capture as well.
       if (!isRawRecord(raw)) {
         return {
           hooked: true,
@@ -1301,82 +1653,229 @@ describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix
       inputSchema: z.object({
         amount: z.number(),
       }),
-      onToolCalled: async () => ({
-        ok: true,
+      outputSchema: z.object({
+        ok: z.boolean(),
       }),
+      requireApproval: true,
+      onToolCalled,
       onResponseReceived,
     });
 
-    const needsApproval = tool({
-      name: 'dangerous',
+    const tools = [
+      approve,
+    ] as const;
+
+    // Pre-populate state as if a prior run left us awaiting_approval with
+    // one pending call c1. This bypasses the initial model turn (we are
+    // jumping straight to the resume path) so the only invocation of a
+    // hook we care about happens inside continueWithUnsentResults.
+    const pending: ParsedToolCall<(typeof tools)[number]> = {
+      id: 'c1',
+      name: 'approve',
+      arguments: {
+        amount: 5,
+      },
+    };
+    const savedState: ConversationState<typeof tools> = {
+      id: 'conv_test',
+      messages: [
+        makeFunctionCallItem(
+          'c1',
+          'approve',
+          JSON.stringify({
+            amount: 5,
+          }),
+        ),
+      ],
+      pendingToolCalls: [
+        pending,
+      ],
+      status: 'awaiting_approval',
+      createdAt: 0,
+      updatedAt: 0,
+    };
+
+    // After the approval resolves and the follow-up request fires, the
+    // model replies with a plain message.
+    const followupTurn = makeResponse('resp_followup', [
+      makeMessageItem('msg_1', 'done'),
+    ]);
+    mockBetaResponsesSend.mockResolvedValueOnce({
+      ok: true,
+      value: makeCompletedStream(followupTurn),
+    });
+
+    const { accessor } = createMemoryAccessor<typeof tools>(savedState);
+
+    const result = new ModelResult<typeof tools>({
+      request: {
+        model: 'test-model',
+        input: 'approve 5',
+        tools: [
+          {
+            type: 'function',
+            name: 'approve',
+            description: null,
+            strict: null,
+            parameters: {},
+          },
+        ],
+      },
+      client: {} as OpenRouterCore,
+      tools,
+      state: accessor,
+      // Caller approves c1 — onToolCalled will produce {ok: true} and the
+      // follow-up request will carry that SDK-generated output.
+      approveToolCalls: [
+        'c1',
+      ],
+    });
+
+    await result.getResponse();
+
+    // onToolCalled fired exactly once (during processApprovalDecisions).
+    expect(onToolCalled).toHaveBeenCalledTimes(1);
+
+    // The critical assertion: onResponseReceived must NOT have fired.
+    // SDK-generated outputs are appended verbatim; the hook is reserved
+    // for caller-supplied outputs only.
+    expect(onResponseReceived).not.toHaveBeenCalled();
+
+    // And the wire request must carry the raw SDK-generated output, not
+    // a hook-transformed version.
+    expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1);
+    const callArg = mockBetaResponsesSend.mock.calls[0]?.[1];
+    if (!isSendCallArg(callArg)) {
+      throw new Error('Send call arg did not match expected shape');
+    }
+    const input = callArg.responsesRequest.input;
+    if (!Array.isArray(input)) {
+      throw new Error('Expected follow-up input to be an array');
+    }
+    const output = input.find(isFunctionCallOutput);
+    if (!output) {
+      throw new Error('Expected a function_call_output in the follow-up input');
+    }
+    const rawOutput = output.output;
+    if (typeof rawOutput !== 'string') {
+      throw new Error('Expected function_call_output.output to be a string');
+    }
+    expect(JSON.parse(rawOutput)).toEqual({
+      ok: true,
+    });
+  });
+});
+
+describe('initStream does not re-hook historical outputs (Fix #3)', () => {
+  beforeEach(() => {
+    mockBetaResponsesSend.mockReset();
+  });
+
+  it('onResponseReceived fires only for fresh caller-supplied outputs, not historical ones loaded from state', async () => {
+    // Scenario: state already contains a function_call / function_call_output
+    // pair from a prior run (the output was hooked when it was first
+    // supplied). On a new callModel invocation, the caller passes fresh
+    // input that includes ANOTHER function_call / function_call_output pair
+    // for the same HITL tool. The hook must fire for the fresh output
+    // ONLY — the historical output must not be re-hooked (non-idempotent
+    // hooks would double-fire otherwise).
+    const onResponseReceived = vi.fn(async (raw: unknown) => {
+      if (!isRawRecord(raw)) {
+        return {
+          hooked: true,
+        };
+      }
+      return {
+        ...raw,
+        hooked: true,
+      };
+    });
+
+    const approve = tool({
+      name: 'approve',
       inputSchema: z.object({
-        x: z.number(),
+        amount: z.number(),
       }),
-      requireApproval: true,
-      execute: async () => ({
-        done: true,
+      outputSchema: z.object({
+        ok: z.boolean(),
       }),
+      onToolCalled: async () => null,
+      onResponseReceived,
     });
 
     const tools = [
       approve,
-      needsApproval,
     ] as const;
 
-    // Initial model turn requests `dangerous`, forcing an approval pause.
-    const turn0 = makeResponse('resp_turn_0', [
-      makeFunctionCallItem(
-        'd1',
-        'dangerous',
-        JSON.stringify({
-          x: 1,
-        }),
-      ),
-    ]);
-    // After the caller approves d1 and execution resumes, the model
-    // replies with a plain message.
-    const turn1 = makeResponse('resp_turn_1', [
-      makeMessageItem('msg_1', 'done'),
-    ]);
-
-    mockBetaResponsesSend
-      .mockResolvedValueOnce({
-        ok: true,
-        value: makeCompletedStream(turn0),
-      })
-      .mockResolvedValueOnce({
+    // Pre-populated state: a historical function_call + function_call_output
+    // pair (c0) that was already hooked on the prior run. Its `output`
+    // field here simulates an already-hooked payload. If initStream
+    // re-hooks it, the mock will observe a second invocation for c0.
+    const historicalCall = makeFunctionCallItem(
+      'c0',
+      'approve',
+      JSON.stringify({
+        amount: 1,
+      }),
+    );
+    const historicalOutput: models.FunctionCallOutputItem = {
+      type: 'function_call_output',
+      id: 'output_c0',
+      callId: 'c0',
+      output: JSON.stringify({
         ok: true,
-        value: makeCompletedStream(turn1),
-      });
+        hooked: true,
+      }),
+    };
+
+    const savedState: ConversationState<typeof tools> = {
+      id: 'conv_test',
+      messages: [
+        historicalCall,
+        historicalOutput,
+      ],
+      status: 'in_progress',
+      createdAt: 0,
+      updatedAt: 0,
+    };
 
-    // Caller-supplied initial input: a function_call_output for `approve`
-    // that will be hooked during init (via applyOnResponseReceivedHooks).
-    const initialInput: models.InputsUnion = [
+    // Fresh input: a new function_call / function_call_output pair (c1)
+    // that the caller is supplying this turn. The hook should fire for
+    // c1 but NOT for c0.
+    const freshInput: models.InputsUnion = [
       makeFunctionCallItem(
-        'c0',
+        'c1',
         'approve',
         JSON.stringify({
-          amount: 5,
+          amount: 2,
         }),
       ),
       {
         type: 'function_call_output',
-        id: 'output_c0',
-        callId: 'c0',
+        id: 'output_c1',
+        callId: 'c1',
         output: JSON.stringify({
           ok: true,
         }),
       },
     ];
 
-    const { accessor } = createMemoryAccessor<typeof tools>();
+    // After the fresh output is hooked and appended to history, the model
+    // replies with a plain message.
+    const replyTurn = makeResponse('resp_reply', [
+      makeMessageItem('msg_1', 'noted'),
+    ]);
+    mockBetaResponsesSend.mockResolvedValueOnce({
+      ok: true,
+      value: makeCompletedStream(replyTurn),
+    });
+
+    const { accessor } = createMemoryAccessor<typeof tools>(savedState);
 
-    // First pass: initial run. The initial caller-supplied output gets
-    // hooked once during init.
-    const result1 = new ModelResult<typeof tools>({
+    const result = new ModelResult<typeof tools>({
       request: {
         model: 'test-model',
-        input: initialInput,
+        input: freshInput,
         tools: [
           {
             type: 'function',
@@ -1385,13 +1884,6 @@ describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix
             strict: null,
             parameters: {},
           },
-          {
-            type: 'function',
-            name: 'dangerous',
-            description: null,
-            strict: null,
-            parameters: {},
-          },
         ],
       },
       client: {} as OpenRouterCore,
@@ -1399,47 +1891,59 @@ describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix
       state: accessor,
     });
 
-    await result1.getResponse();
+    await result.getResponse();
+
+    // Hook fired exactly once — for the fresh c1 output — and received
+    // the fresh payload, NOT the already-hooked historical one.
     expect(onResponseReceived).toHaveBeenCalledTimes(1);
+    const hookArg = onResponseReceived.mock.calls[0]?.[0];
+    expect(hookArg).toEqual({
+      ok: true,
+    });
 
-    // Second pass: resume by approving d1. continueWithUnsentResults
-    // must NOT re-hook the earlier c0 output (which lives in the saved
-    // messages from the first run).
-    onResponseReceived.mockClear();
+    // The wire request must carry both outputs with the correct values:
+    //   - c0: unchanged historical output (not re-hooked)
+    //   - c1: hooked fresh output
+    expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1);
+    const callArg = mockBetaResponsesSend.mock.calls[0]?.[1];
+    if (!isSendCallArg(callArg)) {
+      throw new Error('Send call arg did not match expected shape');
+    }
+    const input = callArg.responsesRequest.input;
+    if (!Array.isArray(input)) {
+      throw new Error('Expected input to be an array');
+    }
+    const outputs = input.filter(isFunctionCallOutput);
+    expect(outputs).toHaveLength(2);
 
-    const result2 = new ModelResult<typeof tools>({
-      request: {
-        model: 'test-model',
-        input: initialInput,
-        tools: [
-          {
-            type: 'function',
-            name: 'approve',
-            description: null,
-            strict: null,
-            parameters: {},
-          },
-          {
-            type: 'function',
-            name: 'dangerous',
-            description: null,
-            strict: null,
-            parameters: {},
-          },
-        ],
-      },
-      client: {} as OpenRouterCore,
-      tools,
-      state: accessor,
-      approveToolCalls: [
-        'd1',
-      ],
+    const c0Output = outputs.find((o) => o.callId === 'c0');
+    if (!c0Output) {
+      throw new Error('Expected c0 output to be preserved');
+    }
+    const c0Raw = c0Output.output;
+    if (typeof c0Raw !== 'string') {
+      throw new Error('Expected c0 output.output to be a string');
+    }
+    // c0 preserved verbatim — NOT re-hooked. The hooked: true marker is
+    // from the historical (prior-run) payload, not a re-hook.
+    expect(JSON.parse(c0Raw)).toEqual({
+      ok: true,
+      hooked: true,
     });
 
-    await result2.getResponse();
-
-    // On resume, `dangerous` is an execute tool (no hook) and the c0
-    // output lives in message history — the hook must NOT run again.
-    expect(onResponseReceived).not.toHaveBeenCalled();
+    const c1Output = outputs.find((o) => o.callId === 'c1');
+    if (!c1Output) {
+      throw new Error('Expected c1 output to be present');
+    }
+    const c1Raw = c1Output.output;
+    if (typeof c1Raw !== 'string') {
+      throw new Error('Expected c1 output.output to be a string');
+    }
+    // c1 was hooked — the fresh payload plus hooked: true added by the
+    // onResponseReceived implementation above.
+    expect(JSON.parse(c1Raw)).toEqual({
+      ok: true,
+      hooked: true,
+    });
   });
 });

From 396d44b3371df99ddf660288a8d53cd4fc810728 Mon Sep 17 00:00:00 2001
From: Matt Apperson <me@mattapperson.com>
Date: Tue, 5 May 2026 20:34:12 -0400
Subject: [PATCH 5/5] refactor(tools): extract helpers to keep cc under
 structural gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

applyOnResponseReceivedHooks and executeToolRound grew cyclomatic
complexity above 15 when HITL logic landed, tripping sentrux gate
(complex functions 9 -> 11). Extract per-item helpers so the gate
matches baseline again:

- applyOnResponseReceivedHooks: move per-item hook/validate logic into
  computeHitlItemOutput + invokeOnResponseReceived; split map builders
  into buildHitlToolMap / buildCallIdToNameMap; parseRawFunctionCallOutput
  replaces the inline try/JSON.parse branch.
- executeToolRound: move the output-for-model branching into
  computeToolOutputForModel; describeNonRecord factors out the nested
  typeof/Array.isArray ternary.

Pure refactor — no behavior change. All 280 unit tests still pass;
sentrux gate reports "No degradation detected".
---
 packages/agent/src/lib/model-result.ts  |  92 +++++++----
 packages/agent/src/lib/tool-executor.ts | 199 ++++++++++++++----------
 2 files changed, 178 insertions(+), 113 deletions(-)

diff --git a/packages/agent/src/lib/model-result.ts b/packages/agent/src/lib/model-result.ts
index 26d3500..9a3670d 100644
--- a/packages/agent/src/lib/model-result.ts
+++ b/packages/agent/src/lib/model-result.ts
@@ -92,6 +92,20 @@ function isRecord(value: unknown): value is Record<string, unknown> {
   return typeof value === 'object' && value !== null && !Array.isArray(value);
 }
 
+/**
+ * Human-readable label for a value that failed the `isRecord` check. Used
+ * exclusively to make `toModelOutput` misuse errors specific.
+ */
+function describeNonRecord(value: unknown): string {
+  if (value === null) {
+    return 'null';
+  }
+  if (Array.isArray(value)) {
+    return 'array';
+  }
+  return typeof value;
+}
+
 /**
  * Type guard for stream event responses
  * Checks constructor name and readable stream behavior
@@ -782,6 +796,52 @@ export class ModelResult<
     await this.saveStateSafely(stateUpdates);
   }
 
+  /**
+   * Compute the `output` payload sent to the model for a successfully
+   * settled tool execution. Routes through `toModelOutput` when the tool
+   * defines one (which may itself throw to surface an error), falls back to
+   * `JSON.stringify(result)` otherwise, and emits an error envelope when the
+   * executor itself reported an error.
+   */
+  private async computeToolOutputForModel(value: {
+    toolCall: ParsedToolCall<Tool>;
+    tool: Tool;
+    result: {
+      result: unknown;
+      error?: Error;
+    };
+  }): Promise<string | models.FunctionCallOutputItemOutputUnion1[]> {
+    if (value.result.error) {
+      return JSON.stringify({
+        error: value.result.error.message,
+      });
+    }
+
+    if (!isAutoResolvableTool(value.tool) || !value.tool.function.toModelOutput) {
+      return JSON.stringify(value.result.result);
+    }
+
+    // Arguments have already been validated upstream by the tool's Zod
+    // inputSchema (which must be a ZodObject), so the runtime shape is
+    // always a record here. A non-record value here signals a real upstream
+    // bug we want surfaced, not a case to paper over with `{}`.
+    const rawArgs: unknown = value.toolCall.arguments;
+    if (!isRecord(rawArgs)) {
+      throw new Error(
+        `toolCall.arguments for "${value.toolCall.name}" must be an object after Zod validation, got ${describeNonRecord(rawArgs)}`,
+      );
+    }
+
+    const modelOutputResult = await value.tool.function.toModelOutput({
+      output: value.result.result,
+      input: rawArgs,
+    });
+    if (modelOutputResult.type === 'content') {
+      return modelOutputResult.value;
+    }
+    return JSON.stringify(value.result.result);
+  }
+
   /**
    * Execute all tools in a single round in parallel.
    * Emits tool.result events after tool execution completes.
@@ -946,37 +1006,7 @@ export class ModelResult<
         value.preliminaryResultsForCall.length > 0 ? value.preliminaryResultsForCall : undefined,
       );
 
-      let outputForModel: string | models.FunctionCallOutputItemOutputUnion1[];
-
-      if (value.result.error) {
-        outputForModel = JSON.stringify({
-          error: value.result.error.message,
-        });
-      } else if (value.tool.function.toModelOutput) {
-        // toModelOutput exists - call it (may throw, which surfaces the error).
-        // Arguments have already been validated upstream by the tool's Zod
-        // inputSchema (which must be a ZodObject), so the runtime shape is
-        // always a record here. The `Tool` union widening loses the specific
-        // InferToolInput type, so we re-narrow defensively — a non-record
-        // value here signals a real upstream bug we want surfaced, not a
-        // case to paper over with `{}`.
-        const rawArgs: unknown = value.toolCall.arguments;
-        if (!isRecord(rawArgs)) {
-          throw new Error(
-            `toolCall.arguments for "${value.toolCall.name}" must be an object after Zod validation, got ${rawArgs === null ? 'null' : Array.isArray(rawArgs) ? 'array' : typeof rawArgs}`,
-          );
-        }
-        const modelOutputResult = await value.tool.function.toModelOutput({
-          output: value.result.result,
-          input: rawArgs,
-        });
-        outputForModel =
-          modelOutputResult.type === 'content'
-            ? modelOutputResult.value
-            : JSON.stringify(value.result.result);
-      } else {
-        outputForModel = JSON.stringify(value.result.result);
-      }
+      const outputForModel = await this.computeToolOutputForModel(value);
 
       const executedOutput: models.FunctionCallOutputItem = {
         type: 'function_call_output' as const,
diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts
index 9b7ab46..572fa0d 100644
--- a/packages/agent/src/lib/tool-executor.ts
+++ b/packages/agent/src/lib/tool-executor.ts
@@ -513,6 +513,111 @@ function formatHookError(
   });
 }
 
+/**
+ * Parse a raw `FunctionCallOutputItem.output` value for HITL processing.
+ * JSON-parses string payloads when possible; leaves content arrays and
+ * non-string inputs untouched.
+ */
+function parseRawFunctionCallOutput(raw: unknown): unknown {
+  if (typeof raw !== 'string') {
+    return raw;
+  }
+  try {
+    return JSON.parse(raw);
+  } catch {
+    return raw;
+  }
+}
+
+/** Build a name → HITL tool map from the registered tools. */
+function buildHitlToolMap(tools: readonly Tool[]): Map<string, HITLTool> {
+  const map = new Map<string, HITLTool>();
+  for (const t of tools) {
+    if (isHITLTool(t)) {
+      map.set(t.function.name, t);
+    }
+  }
+  return map;
+}
+
+/**
+ * Build a callId → tool-name map from `function_call` items in an input
+ * array, so `function_call_output` items can be associated with their tool.
+ */
+function buildCallIdToNameMap(
+  input: Extract<models.InputsUnion, readonly unknown[]>,
+): Map<string, string> {
+  const map = new Map<string, string>();
+  for (const item of input) {
+    if (isFunctionCallItem(item)) {
+      map.set(item.callId, item.name);
+    }
+  }
+  return map;
+}
+
+type HookOutput = string | models.FunctionCallOutputItemOutputUnion1[];
+
+/**
+ * Invoke `onResponseReceived`, validate the return against the tool's
+ * `outputSchema`, and convert the result (or any error) to a
+ * `FunctionCallOutputItem.output` payload.
+ */
+// biome-ignore lint: parameters match the internal API shape
+async function invokeOnResponseReceived(
+  tool: HITLTool,
+  parsed: unknown,
+  context: TurnContext,
+  contextStore?: ToolContextStore,
+  sharedSchema?: $ZodObject<$ZodShape>,
+): Promise<HookOutput> {
+  const hook = tool.function.onResponseReceived;
+  if (!hook) {
+    throw new Error('invokeOnResponseReceived called without onResponseReceived hook');
+  }
+  const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema);
+  try {
+    const hookResult = await Promise.resolve(hook(parsed, executeContext));
+    const validation = z4.safeParse(tool.function.outputSchema, hookResult);
+    if (!validation.success) {
+      return formatHookError(validation.error.message, parsed);
+    }
+    return toFunctionCallOutputValue(hookResult);
+  } catch (err) {
+    // Preserve the caller's original output alongside the hook error so
+    // the model can distinguish a hook failure from a tool-reported error.
+    return formatHookError(err instanceof Error ? err.message : String(err), parsed);
+  }
+}
+
+/**
+ * Compute the (optional) replacement output for a single HITL
+ * `function_call_output` item. Returns `null` when the caller-supplied output
+ * is schema-conformant and should be left untouched.
+ */
+// biome-ignore lint: parameters match the internal API shape
+async function computeHitlItemOutput(
+  tool: HITLTool,
+  item: models.FunctionCallOutputItem,
+  context: TurnContext,
+  contextStore?: ToolContextStore,
+  sharedSchema?: $ZodObject<$ZodShape>,
+): Promise<HookOutput | null> {
+  const parsed = parseRawFunctionCallOutput(item.output);
+
+  if (tool.function.onResponseReceived) {
+    return invokeOnResponseReceived(tool, parsed, context, contextStore, sharedSchema);
+  }
+
+  // No hook — validate the parsed raw output against outputSchema. On
+  // success, leave the item untouched; on failure, surface an error wrapper.
+  const validation = z4.safeParse(tool.function.outputSchema, parsed);
+  if (validation.success) {
+    return null;
+  }
+  return formatHookError(validation.error.message, parsed);
+}
+
 /**
  * Walk the input items and apply HITL per-tool validation plus any
  * `onResponseReceived` hooks.
@@ -546,23 +651,12 @@ export async function applyOnResponseReceivedHooks(
     return input;
   }
 
-  const hitlTools = tools.filter((t): t is HITLTool => isHITLTool(t));
-  if (hitlTools.length === 0) {
+  const hitlByName = buildHitlToolMap(tools);
+  if (hitlByName.size === 0) {
     return input;
   }
-  // All HITL tools participate now (validation runs even without a hook).
-  const hitlByName = new Map<string, HITLTool>();
-  for (const t of hitlTools) {
-    hitlByName.set(t.function.name, t);
-  }
 
-  // Build callId -> name map from function_call items in the array
-  const callIdToName = new Map<string, string>();
-  for (const item of input) {
-    if (isFunctionCallItem(item)) {
-      callIdToName.set(item.callId, item.name);
-    }
-  }
+  const callIdToName = buildCallIdToNameMap(input);
 
   // Element type of the array form of InputsUnion — use this so `rewritten`
   // is structurally assignable back to InputsUnion without an `as` cast.
@@ -571,83 +665,24 @@ export async function applyOnResponseReceivedHooks(
   let changed = false;
   const rewritten: InputsArrayItem[] = [];
   for (const item of input) {
-    if (!isFunctionCallOutputItem(item)) {
+    const tool = isFunctionCallOutputItem(item)
+      ? hitlByName.get(callIdToName.get(item.callId) ?? '')
+      : undefined;
+    if (!tool || !isFunctionCallOutputItem(item)) {
       rewritten.push(item);
       continue;
     }
-    const toolName = callIdToName.get(item.callId);
-    if (!toolName) {
-      rewritten.push(item);
-      continue;
-    }
-    const tool = hitlByName.get(toolName);
-    if (!tool) {
+
+    const newOutput = await computeHitlItemOutput(tool, item, context, contextStore, sharedSchema);
+    if (newOutput === null) {
       rewritten.push(item);
       continue;
     }
 
-    const raw = item.output;
-    // Parse string outputs as JSON when possible; leave content arrays and
-    // non-string inputs (e.g. already-parsed objects) alone.
-    let parsed: unknown = raw;
-    if (typeof raw === 'string') {
-      try {
-        parsed = JSON.parse(raw);
-      } catch {
-        parsed = raw;
-      }
-    }
-
-    let newOutput: string | models.FunctionCallOutputItemOutputUnion1[];
-
-    if (tool.function.onResponseReceived) {
-      const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema);
-      try {
-        const hookResult = await Promise.resolve(
-          tool.function.onResponseReceived(parsed, executeContext),
-        );
-        // Validate the hook's return against outputSchema — the model must
-        // only see schema-conformant values. A validation failure here is
-        // surfaced the same way a hook throw is.
-        try {
-          validateToolOutput(tool.function.outputSchema, hookResult);
-        } catch (validationErr) {
-          newOutput = formatHookError(
-            validationErr instanceof Error ? validationErr.message : String(validationErr),
-            parsed,
-          );
-          const replaced: models.FunctionCallOutputItem = {
-            ...item,
-            output: newOutput,
-          };
-          rewritten.push(replaced);
-          changed = true;
-          continue;
-        }
-        newOutput = toFunctionCallOutputValue(hookResult);
-      } catch (err) {
-        // Preserve the caller's original output alongside the hook error so
-        // the model can distinguish a hook failure from a tool-reported error.
-        newOutput = formatHookError(err instanceof Error ? err.message : String(err), parsed);
-      }
-    } else {
-      // No hook — validate the parsed raw output against outputSchema.
-      // On success, leave the item untouched (same as the pre-change
-      // behavior). On failure, replace with an error wrapper so the model
-      // sees why the caller-supplied payload was rejected.
-      const parseResult = z4.safeParse(tool.function.outputSchema, parsed);
-      if (parseResult.success) {
-        rewritten.push(item);
-        continue;
-      }
-      newOutput = formatHookError(parseResult.error.message, parsed);
-    }
-
-    const replaced: models.FunctionCallOutputItem = {
+    rewritten.push({
       ...item,
       output: newOutput,
-    };
-    rewritten.push(replaced);
+    });
     changed = true;
   }