From 8faf9bb846887c01a6d526ffc19526dfe30822d1 Mon Sep 17 00:00:00 2001 From: Matt Apperson Date: Tue, 5 May 2026 18:54:32 -0400 Subject: [PATCH 1/5] feat(tools): add human-in-the-loop tool type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a fourth client tool kind that extends manual-tool semantics with two async hooks. `onToolCalled` runs when the model invokes the tool — returning a value short-circuits like `execute`, returning `null` pauses the loop like a manual tool. `onResponseReceived` runs on a later turn when an incoming `FunctionCallOutputItem` matches (by callId → function_call.name) a HITL tool, letting the tool post-process caller-supplied results before the model sees them. Keeps HITL control flow local to the tool definition instead of smeared across the caller. --- packages/agent/src/index.ts | 5 + packages/agent/src/lib/model-result.ts | 61 +++- packages/agent/src/lib/tool-executor.ts | 183 ++++++++++- packages/agent/src/lib/tool-orchestrator.ts | 12 +- packages/agent/src/lib/tool-types.ts | 74 ++++- packages/agent/src/lib/tool.ts | 93 ++++++ packages/agent/tests/unit/hitl-tool.test.ts | 319 ++++++++++++++++++++ 7 files changed, 729 insertions(+), 18 deletions(-) create mode 100644 packages/agent/tests/unit/hitl-tool.test.ts diff --git a/packages/agent/src/index.ts b/packages/agent/src/index.ts index 0bd90cf..50abf3d 100644 --- a/packages/agent/src/index.ts +++ b/packages/agent/src/index.ts @@ -138,6 +138,8 @@ export type { ConversationState, ConversationStatus, HasApprovalTools, + HITLTool, + HITLToolFunction, InferToolEvent, InferToolEventsUnion, InferToolInput, @@ -184,8 +186,11 @@ export type { export { hasApprovalRequiredTools, hasExecuteFunction, + isAutoResolvableTool, isClientTool, isGeneratorTool, + isHITLTool, + isManualTool, isRegularExecuteTool, isServerTool, isToolCallOutputEvent, diff --git a/packages/agent/src/lib/model-result.ts b/packages/agent/src/lib/model-result.ts index e706fe0..01eacc4 100644 --- a/packages/agent/src/lib/model-result.ts +++ b/packages/agent/src/lib/model-result.ts @@ -51,7 +51,7 @@ import { import type { ContextInput } from './tool-context.js'; import { resolveContext, ToolContextStore } from './tool-context.js'; import { ToolEventBroadcaster } from './tool-event-broadcaster.js'; -import { executeTool } from './tool-executor.js'; +import { applyOnResponseReceivedHooks, executeTool } from './tool-executor.js'; import type { ConversationState, InferToolEventsUnion, @@ -73,6 +73,7 @@ import type { } from './tool-types.js'; import { hasExecuteFunction, + isAutoResolvableTool, isClientTool, isServerTool, isToolCallOutputEvent, @@ -624,7 +625,7 @@ export class ModelResult< ): Promise[]> { const toolCallPromises = toolCalls.map(async (tc) => { const tool = this.options.tools?.find((t) => isClientTool(t) && t.function.name === tc.name); - if (!tool || !hasExecuteFunction(tool)) { + if (!tool || !isAutoResolvableTool(tool)) { return null; } @@ -637,6 +638,11 @@ export class ModelResult< this.options.sharedContextSchema, ); + if (result === null) { + // HITL tool paused — no unsent result for this call in this round + return null; + } + if (result.error) { return createRejectedResult(tc.id, String(tc.name), result.error.message); } @@ -748,7 +754,7 @@ export class ModelResult< const tool = this.options.tools?.find( (t) => isClientTool(t) && t.function.name === toolCall.name, ); - if (!tool || !hasExecuteFunction(tool)) { + if (!tool || !isAutoResolvableTool(tool)) { return null; } @@ -799,6 +805,14 @@ export class ModelResult< this.options.sharedContextSchema, ); + if (result === null) { + // HITL tool paused — surface as manual (no output this round) + return { + type: 'paused' as const, + toolCall, + }; + } + return { type: 'execution' as const, toolCall, @@ -858,6 +872,13 @@ export class ModelResult< continue; } + if (value.type === 'paused') { + // HITL tool returned null — emit nothing; the call surfaces to the caller + // for manual resume. The outer loop will end because toolResults lacks + // an entry for this call. + continue; + } + const toolResult = ( value.result.error ? { @@ -1227,6 +1248,21 @@ export class ModelResult< } } + // Apply onResponseReceived hooks to caller-supplied tool-output items + if (baseRequest.input !== undefined) { + const hookedInput = await applyOnResponseReceivedHooks( + baseRequest.input, + this.options.tools, + initialContext, + this.contextStore ?? undefined, + this.options.sharedContextSchema, + ); + baseRequest = { + ...baseRequest, + input: hookedInput, + }; + } + // Store resolved request with stream mode this.resolvedRequest = { ...baseRequest, @@ -1293,7 +1329,7 @@ export class ModelResult< const tool = this.options.tools?.find( (t) => isClientTool(t) && t.function.name === toolCall.name, ); - if (!tool || !hasExecuteFunction(tool)) { + if (!tool || !isAutoResolvableTool(tool)) { // Can't execute, create error result unsentResults.push( createRejectedResult(callId, String(toolCall.name), 'Tool not found or not executable'), @@ -1310,6 +1346,11 @@ export class ModelResult< this.options.sharedContextSchema, ); + if (result === null) { + // HITL tool paused on approval — no unsent result; caller resumes later + continue; + } + if (result.error) { unsentResults.push( createRejectedResult(callId, String(toolCall.name), result.error.message), @@ -1408,10 +1449,20 @@ export class ModelResult< const baseRequest = await this.resolveRequestForContext(turnContext); + // Apply onResponseReceived hooks to caller-supplied tool-output items + // (unsentToolResults merged into newInput above may include caller-provided outputs) + const hookedInput = await applyOnResponseReceivedHooks( + newInput, + this.options.tools, + turnContext, + this.contextStore ?? undefined, + this.options.sharedContextSchema, + ); + // Create request with the accumulated messages const request: models.ResponsesRequest = { ...baseRequest, - input: newInput, + input: hookedInput, stream: true, }; diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts index 42b1dab..52d2303 100644 --- a/packages/agent/src/lib/tool-executor.ts +++ b/packages/agent/src/lib/tool-executor.ts @@ -1,11 +1,13 @@ import type * as models from '@openrouter/sdk/models'; import * as z4 from 'zod/v4'; import type { $ZodObject, $ZodShape, $ZodType } from 'zod/v4/core'; +import { isFunctionCallItem, isFunctionCallOutputItem } from './stream-type-guards.js'; import type { ToolContextStore } from './tool-context.js'; import { buildToolExecuteContext } from './tool-context.js'; import type { APITool, ClientTool, + HITLTool, ParsedToolCall, Tool, ToolExecuteContext, @@ -15,6 +17,7 @@ import type { import { hasExecuteFunction, isGeneratorTool, + isHITLTool, isRegularExecuteTool, isServerTool, } from './tool-types.js'; @@ -322,8 +325,69 @@ export async function executeGeneratorTool( } /** - * Execute a tool call - * Automatically detects if it's a regular or generator tool + * Execute a HITL tool's `onToolCalled` hook. + * + * Returns: + * - `ToolExecutionResult` if the hook produced a value (short-circuit, send to model) + * - `null` if the hook returned `null` (pause — treat as manual tool) + * - `ToolExecutionResult` with `error` set if the hook threw + */ +// biome-ignore lint: parameters match the internal API shape +export async function executeHITLTool( + tool: Tool, + toolCall: ParsedToolCall, + context: TurnContext, + contextStore?: ToolContextStore, + sharedSchema?: $ZodObject<$ZodShape>, +): Promise | null> { + if (!isHITLTool(tool)) { + throw new Error(`Tool "${toolCall.name}" is not a HITL tool`); + } + + try { + const validatedInput = validateToolInput(tool.function.inputSchema, toolCall.arguments); + const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema); + + const result = await Promise.resolve( + tool.function.onToolCalled(validatedInput, executeContext), + ); + + if (result === null) { + // Pause — treat as manual tool + return null; + } + + if (tool.function.outputSchema) { + const validatedOutput = validateToolOutput(tool.function.outputSchema, result); + return { + toolCallId: toolCall.id, + toolName: toolCall.name, + result: validatedOutput, + }; + } + + return { + toolCallId: toolCall.id, + toolName: toolCall.name, + result, + }; + } catch (error) { + return { + toolCallId: toolCall.id, + toolName: toolCall.name, + result: null, + error: error instanceof Error ? error : new Error(String(error)), + }; + } +} + +/** + * Execute a tool call. + * Automatically detects if it's a regular, generator, or HITL tool. + * + * Returns `null` only for HITL tools whose `onToolCalled` returned `null` + * (signaling a manual-style pause). All other tools always return a + * `ToolExecutionResult` (with `error` set on failure). */ // biome-ignore lint: parameters match the internal API shape export async function executeTool( @@ -333,7 +397,11 @@ export async function executeTool( onPreliminaryResult?: (toolCallId: string, result: unknown) => void, contextStore?: ToolContextStore, sharedSchema?: $ZodObject<$ZodShape>, -): Promise> { +): Promise | null> { + if (isHITLTool(tool)) { + return executeHITLTool(tool, toolCall, context, contextStore, sharedSchema); + } + if (!hasExecuteFunction(tool)) { throw new Error(`Tool "${toolCall.name}" has no execute function. Use manual tool execution.`); } @@ -391,3 +459,112 @@ export function formatToolExecutionError(error: Error, toolCall: ParsedToolCall< return `Tool "${toolCall.name}" execution error: ${error.message}`; } + +/** + * Typeguard: input is a plain array of items. Narrows the InputsUnion's + * "string | Array<...>" shape so we can walk the array. + */ +function isItemArray( + input: models.InputsUnion, +): input is Extract { + return Array.isArray(input); +} + +/** + * Walk the input items and apply `onResponseReceived` hooks for HITL tools. + * + * For each `function_call_output` item in `input`, locate the matching + * `function_call` (by `callId`) to identify the tool name. If that tool is a + * HITL tool with an `onResponseReceived` hook, invoke it with the parsed output + * and replace the item's `output` with the hook's return value (stringified). + * + * If a hook throws, the output is replaced with `{"error": ""}` so the + * model sees a tool error. Items that don't match a HITL tool are left untouched. + * + * @returns a new input array when any item was rewritten, otherwise the original input. + */ +// biome-ignore lint: parameters match the internal API shape +export async function applyOnResponseReceivedHooks( + input: models.InputsUnion, + tools: readonly Tool[] | undefined, + context: TurnContext, + contextStore?: ToolContextStore, + sharedSchema?: $ZodObject<$ZodShape>, +): Promise { + if (!tools || tools.length === 0 || !isItemArray(input)) { + return input; + } + + const hitlTools = tools.filter((t): t is HITLTool => isHITLTool(t)); + if (hitlTools.length === 0) { + return input; + } + const hookByName = new Map(); + for (const t of hitlTools) { + if (t.function.onResponseReceived) { + hookByName.set(t.function.name, t); + } + } + if (hookByName.size === 0) { + return input; + } + + // Build callId -> name map from function_call items in the array + const callIdToName = new Map(); + for (const item of input) { + if (isFunctionCallItem(item)) { + callIdToName.set(item.callId, item.name); + } + } + + let changed = false; + const rewritten: unknown[] = []; + for (const item of input) { + if (!isFunctionCallOutputItem(item)) { + rewritten.push(item); + continue; + } + const toolName = callIdToName.get(item.callId); + if (!toolName) { + rewritten.push(item); + continue; + } + const tool = hookByName.get(toolName); + if (!tool || !tool.function.onResponseReceived) { + rewritten.push(item); + continue; + } + + const raw = item.output; + let parsed: unknown = raw; + if (typeof raw === 'string') { + try { + parsed = JSON.parse(raw); + } catch { + parsed = raw; + } + } + + const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema); + + let newOutput: string; + try { + const hookResult = await Promise.resolve( + tool.function.onResponseReceived(parsed, executeContext), + ); + newOutput = JSON.stringify(hookResult); + } catch (err) { + newOutput = JSON.stringify({ + error: err instanceof Error ? err.message : String(err), + }); + } + + rewritten.push({ + ...item, + output: newOutput, + }); + changed = true; + } + + return changed ? (rewritten as models.InputsUnion) : input; +} diff --git a/packages/agent/src/lib/tool-orchestrator.ts b/packages/agent/src/lib/tool-orchestrator.ts index cd0df4a..c27d3e2 100644 --- a/packages/agent/src/lib/tool-orchestrator.ts +++ b/packages/agent/src/lib/tool-orchestrator.ts @@ -7,7 +7,7 @@ import { extractToolCallsFromResponse, responseHasToolCalls } from './stream-tra import { isFunctionCallItem } from './stream-type-guards.js'; import { executeTool, findToolByName } from './tool-executor.js'; import type { APITool, Tool, ToolExecutionResult } from './tool-types.js'; -import { hasExecuteFunction } from './tool-types.js'; +import { isAutoResolvableTool } from './tool-types.js'; import { buildTurnContext } from './turn-context.js'; /** @@ -75,13 +75,13 @@ export async function executeToolLoop( break; } - // Check if any tools have execute functions + // Check if any tools can be auto-resolved (execute or HITL onToolCalled) const hasExecutableTools = toolCalls.some((toolCall) => { const tool = findToolByName(tools, toolCall.name); - return tool && hasExecuteFunction(tool); + return tool && isAutoResolvableTool(tool); }); - // If no executable tools, return (manual execution mode) + // If no auto-resolvable tools, return (manual execution mode) if (!hasExecutableTools) { break; } @@ -100,8 +100,8 @@ export async function executeToolLoop( } as ToolExecutionResult; } - if (!hasExecuteFunction(tool)) { - // Tool has no execute function - return null to filter out + if (!isAutoResolvableTool(tool)) { + // Tool has no execute/onToolCalled - return null to filter out return null; } diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts index f4794a8..2713776 100644 --- a/packages/agent/src/lib/tool-types.ts +++ b/packages/agent/src/lib/tool-types.ts @@ -278,6 +278,36 @@ export interface ManualToolFunction< outputSchema?: TOutput; } +/** + * Human-in-the-loop tool. Extends manual-tool semantics with two async hooks. + * + * `onToolCalled` fires when the model invokes the tool. Returning a value feeds + * the model directly (like regular `execute`); returning `null` pauses the loop + * like a manual tool, letting the caller resume later with a FunctionCallOutputItem. + * + * `onResponseReceived` fires on the next turn when an incoming FunctionCallOutputItem + * corresponds to a prior call of this tool (matched by callId → function_call.name). + * It receives the caller-supplied raw result and returns the value sent to the model. + * Throwing surfaces as a tool error to the model. + */ +export interface HITLToolFunction< + TInput extends $ZodObject<$ZodShape>, + TOutput extends $ZodType = $ZodType, + TContext extends Record = Record, + TName extends string = string, +> extends BaseToolFunction { + outputSchema?: TOutput; + onToolCalled: ( + params: zodInfer, + context?: ToolExecuteContext, + ) => Promise | null> | zodInfer | null; + onResponseReceived?: ( + rawResult: unknown, + context?: ToolExecuteContext, + ) => Promise> | zodInfer; + toModelOutput?: ToModelOutputFunction, zodInfer>; +} + /** * Tool with execute function (regular or generator) */ @@ -315,13 +345,26 @@ export type ManualTool< }; /** - * Union type of all client-executed tool shapes (function, generator, manual). + * Human-in-the-loop tool (with onToolCalled / onResponseReceived hooks) + */ +export type HITLTool< + TInput extends $ZodObject<$ZodShape> = $ZodObject<$ZodShape>, + TOutput extends $ZodType = $ZodType, + TContext extends Record = Record, +> = { + type: ToolType.Function; + function: HITLToolFunction; +}; + +/** + * Union type of all client-executed tool shapes (function, generator, manual, HITL). * These run in the user's process via the agent SDK's tool execution loop. */ export type ClientTool = | ToolWithExecute<$ZodObject<$ZodShape>, $ZodType> | ToolWithGenerator<$ZodObject<$ZodShape>, $ZodType, $ZodType> - | ManualTool<$ZodObject<$ZodShape>, $ZodType>; + | ManualTool<$ZodObject<$ZodShape>, $ZodType> + | HITLTool<$ZodObject<$ZodShape>, $ZodType>; /** * Config payload for an OpenRouter server-executed tool. Derived directly @@ -551,13 +594,36 @@ export function isRegularExecuteTool(tool: Tool): tool is ToolWithExecute { } /** - * Type guard to check if a tool is a manual tool (no execute function) + * Type guard to check if a tool is a manual tool (no execute and no onToolCalled) */ export function isManualTool(tool: Tool): tool is ManualTool { if (isServerTool(tool)) { return false; } - return !('execute' in tool.function); + return !('execute' in tool.function) && !('onToolCalled' in tool.function); +} + +/** + * Type guard to check if a tool is a human-in-the-loop tool (has onToolCalled) + */ +export function isHITLTool(tool: Tool): tool is HITLTool { + if (isServerTool(tool)) { + return false; + } + return ( + 'onToolCalled' in tool.function && typeof tool.function.onToolCalled === 'function' + ); +} + +/** + * Type guard: true if the tool can be auto-resolved within a turn — either + * through a client execute/generator function or through a HITL onToolCalled hook. + * Returns false for manual tools (which always pause) and server tools. + */ +export function isAutoResolvableTool( + tool: Tool, +): tool is ToolWithExecute | ToolWithGenerator | HITLTool { + return hasExecuteFunction(tool) || isHITLTool(tool); } /** diff --git a/packages/agent/src/lib/tool.ts b/packages/agent/src/lib/tool.ts index 00ea165..043b4df 100644 --- a/packages/agent/src/lib/tool.ts +++ b/packages/agent/src/lib/tool.ts @@ -1,5 +1,6 @@ import type { $ZodObject, $ZodShape, $ZodType, infer as zodInfer } from 'zod/v4/core'; import type { + HITLTool, ManualTool, NextTurnParamsFunctions, ServerTool, @@ -109,6 +110,45 @@ type ManualToolConfig> = { execute: false; }; +/** + * Configuration for a human-in-the-loop tool. + * Discriminated by the presence of `onToolCalled`. No `execute` or `eventSchema`. + * + * `onToolCalled` returning `null` pauses the loop (manual-tool semantics). + * Any non-null return is treated as the tool's result for the model. + * + * `onResponseReceived` is invoked on a later turn when an incoming + * `FunctionCallOutputItem` corresponds to a prior call of this tool; the + * returned value replaces what the model ultimately sees. + */ +type HITLToolConfig< + TInput extends $ZodObject<$ZodShape>, + TOutput extends $ZodType, + TContext extends Record = Record, + TName extends string = string, +> = { + name: TName; + description?: string; + inputSchema: TInput; + outputSchema?: TOutput; + eventSchema?: undefined; + execute?: undefined; + /** Zod schema declaring the context data this tool needs */ + contextSchema?: $ZodObject<$ZodShape>; + nextTurnParams?: NextTurnParamsFunctions>; + requireApproval?: boolean | ToolApprovalCheck>; + onToolCalled: ( + params: zodInfer, + context?: ToolExecuteContext, + ) => Promise | null> | zodInfer | null; + onResponseReceived?: ( + rawResult: unknown, + context?: ToolExecuteContext, + ) => Promise> | zodInfer; + /** Convert tool execution output to model-facing output */ + toModelOutput?: ToModelOutputFunction, zodInfer>; +}; + /** * Loose config type for the `tool()` overload. * Accepts any valid tool config while typing `ctx.shared` from TShared. @@ -194,6 +234,16 @@ export function tool< config: GeneratorToolConfig, ): ToolWithGenerator; +// Overload for HITL tools (when onToolCalled is provided) +export function tool< + TInput extends $ZodObject<$ZodShape>, + TOutput extends $ZodType, + TContext extends Record = Record, + TName extends string = string, +>( + config: HITLToolConfig, +): HITLTool; + // Overload for manual tools (execute: false) export function tool>( config: ManualToolConfig, @@ -233,6 +283,7 @@ export function tool( | GeneratorToolConfig<$ZodObject<$ZodShape>, $ZodType, $ZodType> | RegularToolConfig<$ZodObject<$ZodShape>, $ZodType, unknown> | ManualToolConfig<$ZodObject<$ZodShape>> + | HITLToolConfig<$ZodObject<$ZodShape>, $ZodType> | ToolConfigWithSharedContext>, ): Tool { // 'shared' is reserved for shared context — forbid it as a tool name @@ -242,6 +293,48 @@ export function tool( ); } + // Check for HITL tool (has onToolCalled hook) + if ('onToolCalled' in config && typeof config.onToolCalled === 'function') { + const fn: HITLTool<$ZodObject<$ZodShape>, $ZodType>['function'] = { + name: config.name, + inputSchema: config.inputSchema, + onToolCalled: config.onToolCalled, + }; + + if (config.description !== undefined) { + fn.description = config.description; + } + + if (config.outputSchema !== undefined) { + fn.outputSchema = config.outputSchema; + } + + if (config.contextSchema !== undefined) { + fn.contextSchema = config.contextSchema; + } + + if (config.nextTurnParams !== undefined) { + fn.nextTurnParams = config.nextTurnParams; + } + + if (config.requireApproval !== undefined) { + fn.requireApproval = config.requireApproval; + } + + if (config.onResponseReceived !== undefined) { + fn.onResponseReceived = config.onResponseReceived; + } + + if (config.toModelOutput !== undefined) { + fn.toModelOutput = config.toModelOutput; + } + + return { + type: ToolType.Function, + function: fn, + }; + } + // Check for manual tool first (execute === false) if (config.execute === false) { const fn: ManualTool<$ZodObject<$ZodShape>>['function'] = { diff --git a/packages/agent/tests/unit/hitl-tool.test.ts b/packages/agent/tests/unit/hitl-tool.test.ts new file mode 100644 index 0000000..69c117f --- /dev/null +++ b/packages/agent/tests/unit/hitl-tool.test.ts @@ -0,0 +1,319 @@ +import type * as models from '@openrouter/sdk/models'; +import { describe, expect, it, vi } from 'vitest'; +import { z } from 'zod/v4'; +import { + applyOnResponseReceivedHooks, + executeHITLTool, + executeTool, +} from '../../src/lib/tool-executor.js'; +import { tool } from '../../src/lib/tool.js'; +import type { + HITLTool, + ParsedToolCall, + Tool, + TurnContext, +} from '../../src/lib/tool-types.js'; +import { + isAutoResolvableTool, + isHITLTool, + isManualTool, + ToolType, +} from '../../src/lib/tool-types.js'; + +const turnContext: TurnContext = { numberOfTurns: 1 }; + +function makeToolCall(name: string, id: string, args: unknown): ParsedToolCall { + return { id, name, arguments: args } as ParsedToolCall; +} + +describe('tool() factory — HITL tools', () => { + it('creates a HITL tool when onToolCalled is present', () => { + const approve = tool({ + name: 'approve_payment', + description: 'Approve a payment', + inputSchema: z.object({ amount: z.number() }), + outputSchema: z.object({ ok: z.boolean() }), + onToolCalled: async (input) => { + return input.amount < 100 ? { ok: true } : null; + }, + onResponseReceived: async (raw) => { + return raw as { ok: boolean }; + }, + }); + + expect(approve.type).toBe(ToolType.Function); + expect(approve.function.name).toBe('approve_payment'); + expect(isHITLTool(approve)).toBe(true); + expect(isManualTool(approve)).toBe(false); + expect(isAutoResolvableTool(approve)).toBe(true); + expect('execute' in approve.function).toBe(false); + }); + + it('omits onResponseReceived when not provided', () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ x: z.number() }), + onToolCalled: async () => ({ ok: true }), + }); + expect('onResponseReceived' in t.function).toBe(false); + }); + + it('isManualTool returns true only for tools with neither execute nor onToolCalled', () => { + const manual = tool({ + name: 'manual', + inputSchema: z.object({ x: z.number() }), + execute: false, + }); + const hitl = tool({ + name: 'hitl', + inputSchema: z.object({ x: z.number() }), + onToolCalled: async () => null, + }); + const regular = tool({ + name: 'regular', + inputSchema: z.object({ x: z.number() }), + execute: async () => ({ y: 1 }), + }); + + expect(isManualTool(manual)).toBe(true); + expect(isManualTool(hitl)).toBe(false); + expect(isManualTool(regular)).toBe(false); + }); +}); + +describe('executeHITLTool', () => { + it('returns a ToolExecutionResult when onToolCalled returns a value', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + outputSchema: z.object({ ok: z.boolean() }), + onToolCalled: async (input) => ({ ok: input.amount < 100 }), + }); + + const result = await executeHITLTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext); + expect(result).not.toBeNull(); + expect(result?.error).toBeUndefined(); + expect(result?.result).toEqual({ ok: true }); + }); + + it('returns null when onToolCalled returns null (pause)', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + }); + + const result = await executeHITLTool(t, makeToolCall('approve', 'c2', { amount: 5 }), turnContext); + expect(result).toBeNull(); + }); + + it('captures thrown errors into the ToolExecutionResult', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => { + throw new Error('boom'); + }, + }); + + const result = await executeHITLTool(t, makeToolCall('approve', 'c3', { amount: 5 }), turnContext); + expect(result).not.toBeNull(); + expect(result?.error).toBeInstanceOf(Error); + expect(result?.error?.message).toBe('boom'); + }); + + it('validates onToolCalled return value against outputSchema', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + outputSchema: z.object({ ok: z.boolean() }), + // Return value doesn't match schema to force validation error + onToolCalled: async () => ({ ok: 'yes' as unknown as boolean }), + }); + + const result = await executeHITLTool(t, makeToolCall('approve', 'c4', { amount: 5 }), turnContext); + expect(result?.error).toBeDefined(); + }); +}); + +describe('executeTool dispatcher with HITL tools', () => { + it('routes HITL tools through executeHITLTool', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => ({ approved: true }), + }); + + const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext); + expect(result).not.toBeNull(); + expect(result?.result).toEqual({ approved: true }); + }); + + it('returns null for HITL pause through the dispatcher', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + }); + + const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext); + expect(result).toBeNull(); + }); +}); + +describe('applyOnResponseReceivedHooks', () => { + function callItem( + callId: string, + name: string, + args = '{}', + ): models.OutputFunctionCallItem { + return { + type: 'function_call', + id: `fc_${callId}`, + callId, + name, + arguments: args, + status: 'completed', + }; + } + + function outputItem(callId: string, output: string): models.FunctionCallOutputItem { + return { + type: 'function_call_output', + id: `output_${callId}`, + callId, + output, + }; + } + + it('transforms a FunctionCallOutputItem when its tool has onResponseReceived', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + onResponseReceived: async (raw) => { + const obj = raw as { ok: boolean }; + return { ...obj, reviewedAt: 1234 }; + }, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', JSON.stringify({ ok: true })), + ]; + + const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + expect(Array.isArray(result)).toBe(true); + const arr = result as unknown[]; + const out = arr[1] as models.FunctionCallOutputItem; + expect(out.type).toBe('function_call_output'); + expect(out.output).toBe(JSON.stringify({ ok: true, reviewedAt: 1234 })); + }); + + it('leaves output unchanged when no matching tool has a hook', async () => { + const t = tool({ + name: 'regular', + inputSchema: z.object({ x: z.number() }), + execute: async () => ({ y: 1 }), + }); + + const input: models.InputsUnion = [ + callItem('c1', 'regular'), + outputItem('c1', JSON.stringify({ y: 1 })), + ]; + + const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + expect(result).toBe(input); // same reference, no rewrite + }); + + it('replaces output with an error object when the hook throws', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + onResponseReceived: async () => { + throw new Error('invalid result'); + }, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', JSON.stringify({ ok: true })), + ]; + + const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + const arr = result as unknown[]; + const out = arr[1] as models.FunctionCallOutputItem; + const parsed = JSON.parse(out.output as string) as { error: string }; + expect(parsed.error).toBe('invalid result'); + }); + + it('passes the parsed raw result (not the raw string) to the hook', async () => { + const spy = vi.fn(async (raw: unknown) => raw); + const t: HITLTool = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + onResponseReceived: spy, + }); + + const payload = { ok: true, note: 'hi' }; + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', JSON.stringify(payload)), + ]; + + await applyOnResponseReceivedHooks(input, [t], turnContext); + expect(spy).toHaveBeenCalledTimes(1); + expect(spy.mock.calls[0]?.[0]).toEqual(payload); + }); + + it('passes the raw string through to the hook when output is not JSON', async () => { + const spy = vi.fn(async (raw: unknown) => raw); + const t: HITLTool = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + onResponseReceived: spy, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', 'not-json'), + ]; + + await applyOnResponseReceivedHooks(input, [t], turnContext); + expect(spy.mock.calls[0]?.[0]).toBe('not-json'); + }); + + it('leaves outputs whose callId has no matching function_call untouched', async () => { + const t: HITLTool = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + onResponseReceived: async (raw) => ({ ...(raw as object), tagged: true }), + }); + + // No function_call in the input at all — just an orphan output + const input: models.InputsUnion = [outputItem('orphan', JSON.stringify({ ok: true }))]; + + const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + expect(result).toBe(input); + }); + + it('ignores tools without onResponseReceived even if they are HITL', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ amount: z.number() }), + onToolCalled: async () => null, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', JSON.stringify({ ok: true })), + ]; + + const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + expect(result).toBe(input); + }); +}); From 5155f4456ee3e0aed8734ad15a6949dbb068ee68 Mon Sep 17 00:00:00 2001 From: Matt Apperson Date: Tue, 5 May 2026 18:55:17 -0400 Subject: [PATCH 2/5] style: apply biome formatting and optional-chain suggestion --- packages/agent/src/lib/tool-executor.ts | 2 +- packages/agent/src/lib/tool-types.ts | 4 +- packages/agent/src/lib/tool.ts | 4 +- packages/agent/tests/unit/hitl-tool.test.ts | 322 +++++++++++++++----- 4 files changed, 256 insertions(+), 76 deletions(-) diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts index 52d2303..8929dea 100644 --- a/packages/agent/src/lib/tool-executor.ts +++ b/packages/agent/src/lib/tool-executor.ts @@ -530,7 +530,7 @@ export async function applyOnResponseReceivedHooks( continue; } const tool = hookByName.get(toolName); - if (!tool || !tool.function.onResponseReceived) { + if (!tool?.function.onResponseReceived) { rewritten.push(item); continue; } diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts index 2713776..f43bebc 100644 --- a/packages/agent/src/lib/tool-types.ts +++ b/packages/agent/src/lib/tool-types.ts @@ -610,9 +610,7 @@ export function isHITLTool(tool: Tool): tool is HITLTool { if (isServerTool(tool)) { return false; } - return ( - 'onToolCalled' in tool.function && typeof tool.function.onToolCalled === 'function' - ); + return 'onToolCalled' in tool.function && typeof tool.function.onToolCalled === 'function'; } /** diff --git a/packages/agent/src/lib/tool.ts b/packages/agent/src/lib/tool.ts index 043b4df..cb40c05 100644 --- a/packages/agent/src/lib/tool.ts +++ b/packages/agent/src/lib/tool.ts @@ -240,9 +240,7 @@ export function tool< TOutput extends $ZodType, TContext extends Record = Record, TName extends string = string, ->( - config: HITLToolConfig, -): HITLTool; +>(config: HITLToolConfig): HITLTool; // Overload for manual tools (execute: false) export function tool>( diff --git a/packages/agent/tests/unit/hitl-tool.test.ts b/packages/agent/tests/unit/hitl-tool.test.ts index 69c117f..6594917 100644 --- a/packages/agent/tests/unit/hitl-tool.test.ts +++ b/packages/agent/tests/unit/hitl-tool.test.ts @@ -1,18 +1,13 @@ import type * as models from '@openrouter/sdk/models'; import { describe, expect, it, vi } from 'vitest'; import { z } from 'zod/v4'; +import { tool } from '../../src/lib/tool.js'; import { applyOnResponseReceivedHooks, executeHITLTool, executeTool, } from '../../src/lib/tool-executor.js'; -import { tool } from '../../src/lib/tool.js'; -import type { - HITLTool, - ParsedToolCall, - Tool, - TurnContext, -} from '../../src/lib/tool-types.js'; +import type { HITLTool, ParsedToolCall, Tool, TurnContext } from '../../src/lib/tool-types.js'; import { isAutoResolvableTool, isHITLTool, @@ -20,10 +15,16 @@ import { ToolType, } from '../../src/lib/tool-types.js'; -const turnContext: TurnContext = { numberOfTurns: 1 }; +const turnContext: TurnContext = { + numberOfTurns: 1, +}; function makeToolCall(name: string, id: string, args: unknown): ParsedToolCall { - return { id, name, arguments: args } as ParsedToolCall; + return { + id, + name, + arguments: args, + } as ParsedToolCall; } describe('tool() factory — HITL tools', () => { @@ -31,13 +32,23 @@ describe('tool() factory — HITL tools', () => { const approve = tool({ name: 'approve_payment', description: 'Approve a payment', - inputSchema: z.object({ amount: z.number() }), - outputSchema: z.object({ ok: z.boolean() }), + inputSchema: z.object({ + amount: z.number(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async (input) => { - return input.amount < 100 ? { ok: true } : null; + return input.amount < 100 + ? { + ok: true, + } + : null; }, onResponseReceived: async (raw) => { - return raw as { ok: boolean }; + return raw as { + ok: boolean; + }; }, }); @@ -52,8 +63,12 @@ describe('tool() factory — HITL tools', () => { it('omits onResponseReceived when not provided', () => { const t = tool({ name: 'approve', - inputSchema: z.object({ x: z.number() }), - onToolCalled: async () => ({ ok: true }), + inputSchema: z.object({ + x: z.number(), + }), + onToolCalled: async () => ({ + ok: true, + }), }); expect('onResponseReceived' in t.function).toBe(false); }); @@ -61,18 +76,26 @@ describe('tool() factory — HITL tools', () => { it('isManualTool returns true only for tools with neither execute nor onToolCalled', () => { const manual = tool({ name: 'manual', - inputSchema: z.object({ x: z.number() }), + inputSchema: z.object({ + x: z.number(), + }), execute: false, }); const hitl = tool({ name: 'hitl', - inputSchema: z.object({ x: z.number() }), + inputSchema: z.object({ + x: z.number(), + }), onToolCalled: async () => null, }); const regular = tool({ name: 'regular', - inputSchema: z.object({ x: z.number() }), - execute: async () => ({ y: 1 }), + inputSchema: z.object({ + x: z.number(), + }), + execute: async () => ({ + y: 1, + }), }); expect(isManualTool(manual)).toBe(true); @@ -85,38 +108,68 @@ describe('executeHITLTool', () => { it('returns a ToolExecutionResult when onToolCalled returns a value', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), - outputSchema: z.object({ ok: z.boolean() }), - onToolCalled: async (input) => ({ ok: input.amount < 100 }), + inputSchema: z.object({ + amount: z.number(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), + onToolCalled: async (input) => ({ + ok: input.amount < 100, + }), }); - const result = await executeHITLTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext); + const result = await executeHITLTool( + t, + makeToolCall('approve', 'c1', { + amount: 5, + }), + turnContext, + ); expect(result).not.toBeNull(); expect(result?.error).toBeUndefined(); - expect(result?.result).toEqual({ ok: true }); + expect(result?.result).toEqual({ + ok: true, + }); }); it('returns null when onToolCalled returns null (pause)', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, }); - const result = await executeHITLTool(t, makeToolCall('approve', 'c2', { amount: 5 }), turnContext); + const result = await executeHITLTool( + t, + makeToolCall('approve', 'c2', { + amount: 5, + }), + turnContext, + ); expect(result).toBeNull(); }); it('captures thrown errors into the ToolExecutionResult', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => { throw new Error('boom'); }, }); - const result = await executeHITLTool(t, makeToolCall('approve', 'c3', { amount: 5 }), turnContext); + const result = await executeHITLTool( + t, + makeToolCall('approve', 'c3', { + amount: 5, + }), + turnContext, + ); expect(result).not.toBeNull(); expect(result?.error).toBeInstanceOf(Error); expect(result?.error?.message).toBe('boom'); @@ -125,13 +178,25 @@ describe('executeHITLTool', () => { it('validates onToolCalled return value against outputSchema', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), - outputSchema: z.object({ ok: z.boolean() }), + inputSchema: z.object({ + amount: z.number(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), // Return value doesn't match schema to force validation error - onToolCalled: async () => ({ ok: 'yes' as unknown as boolean }), + onToolCalled: async () => ({ + ok: 'yes' as unknown as boolean, + }), }); - const result = await executeHITLTool(t, makeToolCall('approve', 'c4', { amount: 5 }), turnContext); + const result = await executeHITLTool( + t, + makeToolCall('approve', 'c4', { + amount: 5, + }), + turnContext, + ); expect(result?.error).toBeDefined(); }); }); @@ -140,33 +205,49 @@ describe('executeTool dispatcher with HITL tools', () => { it('routes HITL tools through executeHITLTool', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), - onToolCalled: async () => ({ approved: true }), + inputSchema: z.object({ + amount: z.number(), + }), + onToolCalled: async () => ({ + approved: true, + }), }); - const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext); + const result = await executeTool( + t, + makeToolCall('approve', 'c1', { + amount: 5, + }), + turnContext, + ); expect(result).not.toBeNull(); - expect(result?.result).toEqual({ approved: true }); + expect(result?.result).toEqual({ + approved: true, + }); }); it('returns null for HITL pause through the dispatcher', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, }); - const result = await executeTool(t, makeToolCall('approve', 'c1', { amount: 5 }), turnContext); + const result = await executeTool( + t, + makeToolCall('approve', 'c1', { + amount: 5, + }), + turnContext, + ); expect(result).toBeNull(); }); }); describe('applyOnResponseReceivedHooks', () => { - function callItem( - callId: string, - name: string, - args = '{}', - ): models.OutputFunctionCallItem { + function callItem(callId: string, name: string, args = '{}'): models.OutputFunctionCallItem { return { type: 'function_call', id: `fc_${callId}`, @@ -189,47 +270,87 @@ describe('applyOnResponseReceivedHooks', () => { it('transforms a FunctionCallOutputItem when its tool has onResponseReceived', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, onResponseReceived: async (raw) => { - const obj = raw as { ok: boolean }; - return { ...obj, reviewedAt: 1234 }; + const obj = raw as { + ok: boolean; + }; + return { + ...obj, + reviewedAt: 1234, + }; }, }); const input: models.InputsUnion = [ callItem('c1', 'approve'), - outputItem('c1', JSON.stringify({ ok: true })), + outputItem( + 'c1', + JSON.stringify({ + ok: true, + }), + ), ]; - const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); expect(Array.isArray(result)).toBe(true); const arr = result as unknown[]; const out = arr[1] as models.FunctionCallOutputItem; expect(out.type).toBe('function_call_output'); - expect(out.output).toBe(JSON.stringify({ ok: true, reviewedAt: 1234 })); + expect(out.output).toBe( + JSON.stringify({ + ok: true, + reviewedAt: 1234, + }), + ); }); it('leaves output unchanged when no matching tool has a hook', async () => { const t = tool({ name: 'regular', - inputSchema: z.object({ x: z.number() }), - execute: async () => ({ y: 1 }), + inputSchema: z.object({ + x: z.number(), + }), + execute: async () => ({ + y: 1, + }), }); const input: models.InputsUnion = [ callItem('c1', 'regular'), - outputItem('c1', JSON.stringify({ y: 1 })), + outputItem( + 'c1', + JSON.stringify({ + y: 1, + }), + ), ]; - const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); expect(result).toBe(input); // same reference, no rewrite }); it('replaces output with an error object when the hook throws', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, onResponseReceived: async () => { throw new Error('invalid result'); @@ -238,13 +359,26 @@ describe('applyOnResponseReceivedHooks', () => { const input: models.InputsUnion = [ callItem('c1', 'approve'), - outputItem('c1', JSON.stringify({ ok: true })), + outputItem( + 'c1', + JSON.stringify({ + ok: true, + }), + ), ]; - const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); const arr = result as unknown[]; const out = arr[1] as models.FunctionCallOutputItem; - const parsed = JSON.parse(out.output as string) as { error: string }; + const parsed = JSON.parse(out.output as string) as { + error: string; + }; expect(parsed.error).toBe('invalid result'); }); @@ -252,18 +386,29 @@ describe('applyOnResponseReceivedHooks', () => { const spy = vi.fn(async (raw: unknown) => raw); const t: HITLTool = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, onResponseReceived: spy, }); - const payload = { ok: true, note: 'hi' }; + const payload = { + ok: true, + note: 'hi', + }; const input: models.InputsUnion = [ callItem('c1', 'approve'), outputItem('c1', JSON.stringify(payload)), ]; - await applyOnResponseReceivedHooks(input, [t], turnContext); + await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); expect(spy).toHaveBeenCalledTimes(1); expect(spy.mock.calls[0]?.[0]).toEqual(payload); }); @@ -272,7 +417,9 @@ describe('applyOnResponseReceivedHooks', () => { const spy = vi.fn(async (raw: unknown) => raw); const t: HITLTool = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, onResponseReceived: spy, }); @@ -282,38 +429,75 @@ describe('applyOnResponseReceivedHooks', () => { outputItem('c1', 'not-json'), ]; - await applyOnResponseReceivedHooks(input, [t], turnContext); + await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); expect(spy.mock.calls[0]?.[0]).toBe('not-json'); }); it('leaves outputs whose callId has no matching function_call untouched', async () => { const t: HITLTool = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, - onResponseReceived: async (raw) => ({ ...(raw as object), tagged: true }), + onResponseReceived: async (raw) => ({ + ...(raw as object), + tagged: true, + }), }); // No function_call in the input at all — just an orphan output - const input: models.InputsUnion = [outputItem('orphan', JSON.stringify({ ok: true }))]; + const input: models.InputsUnion = [ + outputItem( + 'orphan', + JSON.stringify({ + ok: true, + }), + ), + ]; - const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); expect(result).toBe(input); }); it('ignores tools without onResponseReceived even if they are HITL', async () => { const t = tool({ name: 'approve', - inputSchema: z.object({ amount: z.number() }), + inputSchema: z.object({ + amount: z.number(), + }), onToolCalled: async () => null, }); const input: models.InputsUnion = [ callItem('c1', 'approve'), - outputItem('c1', JSON.stringify({ ok: true })), + outputItem( + 'c1', + JSON.stringify({ + ok: true, + }), + ), ]; - const result = await applyOnResponseReceivedHooks(input, [t], turnContext); + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); expect(result).toBe(input); }); }); From dc2ba48b88662be93c0ccd3e1e17ced61a2a30b8 Mon Sep 17 00:00:00 2001 From: Matt Apperson Date: Tue, 5 May 2026 19:47:31 -0400 Subject: [PATCH 3/5] fix(tools): close HITL execution gaps surfaced in review - Route auto-resolvable checks through isAutoResolvableTool so pure-HITL turns actually enter the execution loop and invoke onToolCalled. - Propagate HITL pauses out of executeToolRound; the outer loop now persists pending calls under a new awaiting_hitl status and returns before issuing a follow-up request with missing outputs. - Scope onResponseReceived hooks to freshly-supplied outputs on resume so caller-supplied outputs hooked at init aren't re-hooked. - Preserve paused HITL calls in pendingToolCalls when they occur during approval resume instead of silently dropping them. - Include originalOutput alongside error when onResponseReceived throws, so the model can distinguish hook failure from tool-reported error. - Replace unsafe InputsUnion cast with a structurally-typed rewritten array. --- packages/agent/src/lib/model-result.ts | 283 +++++- packages/agent/src/lib/tool-executor.ts | 16 +- packages/agent/src/lib/tool-types.ts | 19 +- packages/agent/tests/unit/hitl-tool.test.ts | 960 +++++++++++++++++++- 4 files changed, 1214 insertions(+), 64 deletions(-) diff --git a/packages/agent/src/lib/model-result.ts b/packages/agent/src/lib/model-result.ts index 01eacc4..03bdca7 100644 --- a/packages/agent/src/lib/model-result.ts +++ b/packages/agent/src/lib/model-result.ts @@ -54,6 +54,7 @@ import { ToolEventBroadcaster } from './tool-event-broadcaster.js'; import { applyOnResponseReceivedHooks, executeTool } from './tool-executor.js'; import type { ConversationState, + ConversationStatus, InferToolEventsUnion, InferToolOutputsUnion, ParsedToolCall, @@ -72,7 +73,6 @@ import type { UnsentToolResult, } from './tool-types.js'; import { - hasExecuteFunction, isAutoResolvableTool, isClientTool, isServerTool, @@ -592,24 +592,35 @@ export class ModelResult< } /** - * Check if any tool calls have execute functions. + * Check if any tool calls can be auto-resolved in the current turn. * Used to determine if automatic tool execution should be attempted. * + * A tool call is auto-resolvable if its tool has either an `execute` function + * (regular or generator) or an `onToolCalled` hook (HITL). HITL tools are + * included here because their hook fires before the model's follow-up request, + * even when the hook ultimately decides to pause by returning `null`. + * * @param toolCalls - The tool calls to check - * @returns True if at least one tool call has an executable function + * @returns True if at least one tool call is auto-resolvable */ private hasExecutableToolCalls(toolCalls: ParsedToolCall[]): boolean { return toolCalls.some((toolCall) => { const tool = this.options.tools?.find( (t) => isClientTool(t) && t.function.name === toolCall.name, ); - return tool && hasExecuteFunction(tool); + return tool && isAutoResolvableTool(tool); }); } + /** + * A manual tool call is one whose tool has neither an `execute` function nor + * an `onToolCalled` hook — i.e. the caller is expected to produce the output + * externally. HITL tools are auto-resolvable even when they pause, so they + * are not classified as manual here. + */ private isManualToolCall(item: models.OutputFunctionCallItem): boolean { const tool = this.options.tools?.find((t) => isClientTool(t) && t.function.name === item.name); - return !!tool && !hasExecuteFunction(tool); + return !!tool && !isAutoResolvableTool(tool); } /** @@ -738,18 +749,58 @@ export class ModelResult< return true; // Pause for approval } + /** + * Persist state when one or more HITL tools paused during a round. + * + * Mirrors `handleApprovalCheck` so paused HITL calls are surfaced through + * `pendingToolCalls` (visible via `getPendingToolCalls()` / `getState()`). + * Sets the status to `awaiting_hitl` so the caller can discriminate HITL + * pauses from approval pauses. + * + * Already-executed results from the same round are persisted on the turn's + * message history via `saveToolResultsToState` (called by the outer loop + * before this helper) — no need to duplicate them in `unsentToolResults`. + * + * @param currentResponse - The response that produced the paused tool calls + * @param pausedCalls - HITL tool calls whose `onToolCalled` returned `null` + */ + private async persistHitlPause( + currentResponse: models.OpenResponsesResult, + pausedCalls: ParsedToolCall[], + ): Promise { + this.finalResponse = currentResponse; + + if (!this.stateAccessor) { + return; + } + + const stateUpdates: Partial, 'id' | 'createdAt' | 'updatedAt'>> = + { + pendingToolCalls: pausedCalls as ParsedToolCall[], + status: 'awaiting_hitl', + }; + await this.saveStateSafely(stateUpdates); + } + /** * Execute all tools in a single round in parallel. * Emits tool.result events after tool execution completes. * * @param toolCalls - The tool calls to execute * @param turnContext - The current turn context - * @returns Array of function call outputs formatted for the API + * @returns Object with the function call outputs formatted for the API and + * the list of HITL tool calls that paused (returned `null` from + * `onToolCalled`). Callers should break out of the execution loop when + * `pausedCalls` is non-empty rather than sending an incomplete set of + * outputs back to the model. */ private async executeToolRound( toolCalls: ParsedToolCall[], turnContext: TurnContext, - ): Promise { + ): Promise<{ + toolResults: models.FunctionCallOutputItem[]; + pausedCalls: ParsedToolCall[]; + }> { const toolCallPromises = toolCalls.map(async (toolCall) => { const tool = this.options.tools?.find( (t) => isClientTool(t) && t.function.name === toolCall.name, @@ -824,6 +875,7 @@ export class ModelResult< const settledResults = await Promise.allSettled(toolCallPromises); const toolResults: models.FunctionCallOutputItem[] = []; + const pausedCalls: ParsedToolCall[] = []; for (let i = 0; i < settledResults.length; i++) { const settled = settledResults[i]; @@ -873,9 +925,11 @@ export class ModelResult< } if (value.type === 'paused') { - // HITL tool returned null — emit nothing; the call surfaces to the caller - // for manual resume. The outer loop will end because toolResults lacks - // an entry for this call. + // HITL tool returned null — record the pause so the caller can break + // out of the outer loop before attempting a follow-up request with an + // incomplete set of outputs. The call will be surfaced via state + // (pendingToolCalls + status='awaiting_hitl') for manual resume. + pausedCalls.push(value.toolCall); continue; } @@ -938,7 +992,10 @@ export class ModelResult< } satisfies ToolCallOutputEvent); } - return toolResults; + return { + toolResults, + pausedCalls, + }; } /** @@ -1100,6 +1157,81 @@ export class ModelResult< return rest as ResolvedCallModelInput; } + /** + * Apply `onResponseReceived` hooks to the freshly-produced tool outputs + * only, without re-hooking caller-supplied outputs that were already hooked + * during init. Uses the existing `function_call` items from message history + * for name resolution but drops them from the returned array. + * + * @param freshToolOutputs - Outputs produced in this turn (not yet hooked) + * @param messageHistory - Existing conversation messages; scanned for + * matching `function_call` items to resolve tool names + * @param turnContext - Turn context for hook invocation + * @returns The tool outputs with hook-rewritten `output` values where a + * matching HITL tool defines `onResponseReceived` + */ + private async hookFreshToolOutputs( + freshToolOutputs: models.FunctionCallOutputItem[], + messageHistory: models.InputsUnion, + turnContext: TurnContext, + ): Promise { + if (freshToolOutputs.length === 0) { + return freshToolOutputs; + } + + // Collect function_call items from history so `applyOnResponseReceivedHooks` + // can resolve callId -> toolName. We don't mutate history here — the items + // are only used to power the internal callIdToName map. + const historyArray = Array.isArray(messageHistory) + ? messageHistory + : [ + messageHistory, + ]; + const functionCallItems: models.BaseInputsUnion[] = []; + for (const item of historyArray) { + if (item && typeof item === 'object' && 'type' in item && item.type === 'function_call') { + functionCallItems.push(item as models.BaseInputsUnion); + } + } + + const syntheticInput: models.InputsUnion = [ + ...functionCallItems, + ...freshToolOutputs, + ]; + + const hookedInput = await applyOnResponseReceivedHooks( + syntheticInput, + this.options.tools, + turnContext, + this.contextStore ?? undefined, + this.options.sharedContextSchema, + ); + + if (hookedInput === syntheticInput) { + // No rewrites; return the originals unchanged. + return freshToolOutputs; + } + + // Extract the rewritten tool outputs (they sit after the function_calls). + const hookedArray = Array.isArray(hookedInput) + ? hookedInput + : [ + hookedInput, + ]; + const extracted: models.FunctionCallOutputItem[] = []; + for (const item of hookedArray) { + if ( + item && + typeof item === 'object' && + 'type' in item && + item.type === 'function_call_output' + ) { + extracted.push(item as models.FunctionCallOutputItem); + } + } + return extracted.length === freshToolOutputs.length ? extracted : freshToolOutputs; + } + /** * Safely persist state with error handling. * Wraps state save operations to ensure failures are properly reported. @@ -1164,9 +1296,16 @@ export class ModelResult< if (loadedState) { this.currentState = loadedState; - // Check if we're resuming from awaiting_approval with decisions + // Check if we're resuming from awaiting_approval or awaiting_hitl + // with decisions. `awaiting_hitl` reuses `processApprovalDecisions` + // because the resume mechanism is identical — the caller supplies + // `approveToolCalls`/`rejectToolCalls` for paused call IDs, and we + // re-invoke `executeTool` on approved calls (which re-runs + // `onToolCalled` for HITL tools). + const isResumableStatus = + loadedState.status === 'awaiting_approval' || loadedState.status === 'awaiting_hitl'; if ( - loadedState.status === 'awaiting_approval' && + isResumableStatus && (this.approvedToolCalls.length > 0 || this.rejectedToolCalls.length > 0) ) { // Initialize context store before resuming so tools have access @@ -1319,6 +1458,10 @@ export class ModelResult< // context is handled via contextStore, not on TurnContext }; + // Track approved HITL calls that paused (onToolCalled returned null) — + // these stay in pendingToolCalls so the caller can resume them later. + const hitlPausedIds = new Set(); + // Process approvals - execute the approved tools for (const callId of this.approvedToolCalls) { const toolCall = pendingCalls.find((tc) => tc.id === callId); @@ -1347,7 +1490,9 @@ export class ModelResult< ); if (result === null) { - // HITL tool paused on approval — no unsent result; caller resumes later + // HITL tool paused on approval — keep the call visible to the caller + // via pendingToolCalls (status becomes 'awaiting_hitl' below). + hitlPausedIds.add(callId); continue; } @@ -1370,17 +1515,35 @@ export class ModelResult< unsentResults.push(createRejectedResult(callId, String(toolCall.name), 'Rejected by user')); } - // Remove processed calls from pending - const processedIds = new Set([ - ...this.approvedToolCalls, - ...this.rejectedToolCalls, - ]); + // Remove processed calls from pending. Approved HITL calls that paused are + // NOT considered processed — they stay on pendingToolCalls so getPendingToolCalls() + // still surfaces them to the caller on resume. + const processedIds = new Set( + [ + ...this.approvedToolCalls, + ...this.rejectedToolCalls, + ].filter((id) => !hitlPausedIds.has(id)), + ); const remainingPending = pendingCalls.filter((tc) => !processedIds.has(tc.id)); + // Determine status: + // - Any still-unprocessed approval-required call keeps us in 'awaiting_approval' + // - Otherwise, any HITL paused call moves us to 'awaiting_hitl' + // - Otherwise, we continue with 'in_progress' + const remainingUnresolvedApprovals = remainingPending.filter((tc) => !hitlPausedIds.has(tc.id)); + let nextStatus: ConversationStatus; + if (remainingUnresolvedApprovals.length > 0) { + nextStatus = 'awaiting_approval'; + } else if (hitlPausedIds.size > 0) { + nextStatus = 'awaiting_hitl'; + } else { + nextStatus = 'in_progress'; + } + // Update state - conditionally include optional properties only if they have values const stateUpdates: Partial, 'id' | 'createdAt' | 'updatedAt'>> = { - status: remainingPending.length > 0 ? 'awaiting_approval' : 'in_progress', + status: nextStatus, }; if (remainingPending.length > 0) { stateUpdates.pendingToolCalls = remainingPending; @@ -1403,8 +1566,8 @@ export class ModelResult< await this.saveStateSafely(); } - // If we still have pending approvals, stop here - if (remainingPending.length > 0) { + // If we are paused (for approval or for HITL), stop here + if (nextStatus !== 'in_progress') { return; } @@ -1428,9 +1591,25 @@ export class ModelResult< // Convert to API format const toolOutputs = unsentResultsToAPIFormat(unsentResults); - // Build new input with tool results + // Build turn context for hook resolution + // numberOfTurns represents the current turn number (1-indexed after initial) + const turnContext: TurnContext = { + numberOfTurns: this.allToolExecutionRounds.length + 1, + }; + + // Apply onResponseReceived hooks ONLY to the newly-added tool outputs. + // `currentMessages` already contains caller-supplied outputs that were + // hooked during init (see `initStream` -> `applyOnResponseReceivedHooks`); + // re-hooking them would double-invoke non-idempotent hooks. const currentMessages = this.currentState.messages; - const newInput = appendToMessages(currentMessages, toolOutputs); + const hookedToolOutputs = await this.hookFreshToolOutputs( + toolOutputs, + currentMessages, + turnContext, + ); + + // Build new input with the hooked tool results appended + const newInput = appendToMessages(currentMessages, hookedToolOutputs); // Clear unsent results from state this.currentState = updateState(this.currentState, { @@ -1442,27 +1621,15 @@ export class ModelResult< await this.saveStateSafely(); // Build request with the updated input - // numberOfTurns represents the current turn number (1-indexed after initial) - const turnContext: TurnContext = { - numberOfTurns: this.allToolExecutionRounds.length + 1, - }; - const baseRequest = await this.resolveRequestForContext(turnContext); - // Apply onResponseReceived hooks to caller-supplied tool-output items - // (unsentToolResults merged into newInput above may include caller-provided outputs) - const hookedInput = await applyOnResponseReceivedHooks( - newInput, - this.options.tools, - turnContext, - this.contextStore ?? undefined, - this.options.sharedContextSchema, - ); - - // Create request with the accumulated messages + // Create request with the accumulated messages. No full-input re-hooking: + // the fresh tool outputs were hooked above via `hookFreshToolOutputs`, and + // previously-hooked items in `newInput` (carried over from init) must not + // be re-hooked here (non-idempotent hooks would double-fire). const request: models.ResponsesRequest = { ...baseRequest, - input: hookedInput, + input: newInput, stream: true, }; @@ -1504,8 +1671,14 @@ export class ModelResult< this.toolExecutionPromise = (async () => { await this.initStream(); - // If resuming from approval and still pending, don't continue - if (this.isResumingFromApproval && this.currentState?.status === 'awaiting_approval') { + // If resuming from approval or HITL pause and still pending, don't continue. + // `processApprovalDecisions` runs in initStream for resumes; if it left us + // paused (any remaining pending calls), the outer loop should not execute. + if ( + this.isResumingFromApproval && + (this.currentState?.status === 'awaiting_approval' || + this.currentState?.status === 'awaiting_hitl') + ) { return; } @@ -1580,7 +1753,10 @@ export class ModelResult< await this.resolveAsyncFunctionsForTurn(turnContext); // Execute tools - const toolResults = await this.executeToolRound(currentToolCalls, turnContext); + const { toolResults, pausedCalls } = await this.executeToolRound( + currentToolCalls, + turnContext, + ); // Server-tool output items are already-executed results in the // response; collect them so toolResults presents a unified list. @@ -1619,6 +1795,14 @@ export class ModelResult< // Save tool results to state await this.saveToolResultsToState(toolResults); + // If any HITL tools paused this round, stop here without making a + // follow-up request — sending an incomplete set of outputs would be + // incorrect. Persist the paused calls so the caller can resume later. + if (pausedCalls.length > 0) { + await this.persistHitlPause(currentResponse, pausedCalls); + return; + } + // Apply nextTurnParams await this.applyNextTurnParams(currentToolCalls); @@ -2244,14 +2428,17 @@ export class ModelResult< // ========================================================================= /** - * Check if the conversation requires human approval to continue. - * Returns true if there are pending tool calls awaiting approval. + * Check if the conversation requires human input to continue. + * Returns true when the conversation is paused waiting for caller-supplied + * decisions — either approval/rejection (`awaiting_approval`) or HITL tool + * resume (`awaiting_hitl`). Also returns true whenever `pendingToolCalls` + * is populated regardless of status. */ async requiresApproval(): Promise { await this.initStream(); - // If we have pending tool calls in state, approval is required - if (this.currentState?.status === 'awaiting_approval') { + const status = this.currentState?.status; + if (status === 'awaiting_approval' || status === 'awaiting_hitl') { return true; } diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts index 8929dea..3ea3a74 100644 --- a/packages/agent/src/lib/tool-executor.ts +++ b/packages/agent/src/lib/tool-executor.ts @@ -517,8 +517,12 @@ export async function applyOnResponseReceivedHooks( } } + // Element type of the array form of InputsUnion — use this so `rewritten` + // is structurally assignable back to InputsUnion without an `as` cast. + type InputsArrayItem = Extract[number]; + let changed = false; - const rewritten: unknown[] = []; + const rewritten: InputsArrayItem[] = []; for (const item of input) { if (!isFunctionCallOutputItem(item)) { rewritten.push(item); @@ -554,17 +558,21 @@ export async function applyOnResponseReceivedHooks( ); newOutput = JSON.stringify(hookResult); } catch (err) { + // Preserve the caller's original output alongside the hook error so the + // model can distinguish a hook failure from a tool-reported error. newOutput = JSON.stringify({ error: err instanceof Error ? err.message : String(err), + originalOutput: parsed, }); } - rewritten.push({ + const replaced: models.FunctionCallOutputItem = { ...item, output: newOutput, - }); + }; + rewritten.push(replaced); changed = true; } - return changed ? (rewritten as models.InputsUnion) : input; + return changed ? rewritten : input; } diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts index f43bebc..d34d269 100644 --- a/packages/agent/src/lib/tool-types.ts +++ b/packages/agent/src/lib/tool-types.ts @@ -939,9 +939,22 @@ export interface PartialResponse vi.fn()); + +vi.mock('@openrouter/sdk/funcs/betaResponsesSend', () => ({ + betaResponsesSend: mockBetaResponsesSend, +})); + +// Import ModelResult AFTER vi.mock so the mock is wired in. +const { ModelResult } = await import('../../src/lib/model-result.js'); + const turnContext: TurnContext = { numberOfTurns: 1, }; @@ -345,7 +366,7 @@ describe('applyOnResponseReceivedHooks', () => { expect(result).toBe(input); // same reference, no rewrite }); - it('replaces output with an error object when the hook throws', async () => { + it('replaces output with an error object preserving the original output when the hook throws', async () => { const t = tool({ name: 'approve', inputSchema: z.object({ @@ -357,14 +378,13 @@ describe('applyOnResponseReceivedHooks', () => { }, }); + const originalPayload = { + ok: true, + note: 'carry-me-through', + }; const input: models.InputsUnion = [ callItem('c1', 'approve'), - outputItem( - 'c1', - JSON.stringify({ - ok: true, - }), - ), + outputItem('c1', JSON.stringify(originalPayload)), ]; const result = await applyOnResponseReceivedHooks( @@ -378,8 +398,46 @@ describe('applyOnResponseReceivedHooks', () => { const out = arr[1] as models.FunctionCallOutputItem; const parsed = JSON.parse(out.output as string) as { error: string; + originalOutput: unknown; }; expect(parsed.error).toBe('invalid result'); + // Original caller-supplied output must be preserved (parsed form) so the + // model can distinguish a hook failure from a tool-reported error. + expect(parsed.originalOutput).toEqual(originalPayload); + }); + + it('preserves the raw string as originalOutput when the output is not JSON and the hook throws', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + onToolCalled: async () => null, + onResponseReceived: async () => { + throw new Error('invalid result'); + }, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', 'not-json'), + ]; + + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); + const arr = result as unknown[]; + const out = arr[1] as models.FunctionCallOutputItem; + const parsed = JSON.parse(out.output as string) as { + error: string; + originalOutput: unknown; + }; + expect(parsed.error).toBe('invalid result'); + expect(parsed.originalOutput).toBe('not-json'); }); it('passes the parsed raw result (not the raw string) to the hook', async () => { @@ -501,3 +559,887 @@ describe('applyOnResponseReceivedHooks', () => { expect(result).toBe(input); }); }); + +// --------------------------------------------------------------------------- +// Integration tests — full HITL flow through ModelResult +// --------------------------------------------------------------------------- +// +// These tests drive the HITL loop end-to-end: onToolCalled fires from +// ModelResult.executeToolsIfNeeded, and the follow-up turn sends the +// tool output (possibly transformed by onResponseReceived) back to the +// mocked API. Everything below the describe block uses the vi.hoisted +// mock for `@openrouter/sdk/funcs/betaResponsesSend` declared at the +// top of this file. + +/** + * Build a completed OpenResponsesResult with the given output items. + * Intentionally fills only the fields ModelResult actually reads — the + * full schema is huge and any missing fields would fail the SDK's own + * inbound validation, but we never push through that path from a mock. + */ +function makeResponse( + id: string, + output: models.OpenResponsesResult['output'], +): models.OpenResponsesResult { + return { + id, + object: 'response', + createdAt: 0, + model: 'test-model', + status: 'completed', + completedAt: 0, + output, + error: null, + incompleteDetails: null, + temperature: null, + topP: null, + presencePenalty: null, + frequencyPenalty: null, + metadata: null, + instructions: null, + tools: [], + toolChoice: 'auto', + parallelToolCalls: false, + }; +} + +/** + * Wrap a completed response in a one-shot ReadableStream of StreamEvents, + * mirroring what betaResponsesSend returns when stream: true is requested. + * Note: the single event satisfies `StreamEventsResponseCompleted` which + * is a member of the `StreamEvents` union — no cast needed. + */ +function makeCompletedStream( + response: models.OpenResponsesResult, +): ReadableStream { + const completedEvent: models.StreamEventsResponseCompleted = { + type: 'response.completed', + response, + sequenceNumber: 0, + }; + return new ReadableStream({ + start(controller) { + controller.enqueue(completedEvent); + controller.close(); + }, + }); +} + +function makeFunctionCallItem( + callId: string, + name: string, + args: string, +): models.OutputFunctionCallItem { + return { + type: 'function_call', + id: `fc_${callId}`, + callId, + name, + arguments: args, + status: 'completed', + }; +} + +function makeMessageItem(id: string, text: string): models.OutputMessage { + return { + id, + type: 'message', + role: 'assistant', + status: 'completed', + content: [ + { + type: 'output_text', + text, + annotations: [], + }, + ], + }; +} + +/** + * Typeguard for a completed betaResponsesSend invocation — narrows the + * loosely-typed `vi.fn()` mock call payload to the subset we need + * (the `responsesRequest` wrapper with the `input` / `tools` fields). + */ +function isSendCallArg(value: unknown): value is { + responsesRequest: { + input?: models.InputsUnion; + tools?: models.ResponsesRequestToolUnion[]; + }; +} { + if (typeof value !== 'object' || value === null) { + return false; + } + if (!('responsesRequest' in value)) { + return false; + } + const rr = value.responsesRequest; + return typeof rr === 'object' && rr !== null; +} + +/** + * Typeguard narrowing an InputsUnion array element to a function_call_output. + */ +function isFunctionCallOutput(item: unknown): item is models.FunctionCallOutputItem { + if (typeof item !== 'object' || item === null) { + return false; + } + if (!('type' in item)) { + return false; + } + return item.type === 'function_call_output'; +} + +/** + * Typeguard for a plain-object record (used to narrow the raw tool + * output inside onResponseReceived without a type assertion). + */ +function isRawRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +describe('HITL flow through ModelResult (integration)', () => { + beforeEach(() => { + mockBetaResponsesSend.mockReset(); + }); + + it('auto-resolve: onToolCalled returning a value feeds the model on the next turn', async () => { + // Turn 0: model invokes the HITL tool. + const turn0 = makeResponse('resp_turn_0', [ + makeFunctionCallItem( + 'c1', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + ]); + // Turn 1: model replies after seeing the tool output. + const turn1 = makeResponse('resp_turn_1', [ + makeMessageItem('msg_1', 'approved'), + ]); + + mockBetaResponsesSend + .mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn0), + }) + .mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn1), + }); + + const approve = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), + onToolCalled: async () => ({ + ok: true, + }), + }); + + const tools = [ + approve, + ] as const; + + const result = new ModelResult({ + request: { + model: 'test-model', + input: 'approve 5', + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + }); + + const finalResponse = await result.getResponse(); + + // Both turns should have fired — pause did NOT trigger because + // onToolCalled returned a value. + expect(mockBetaResponsesSend).toHaveBeenCalledTimes(2); + expect(finalResponse.id).toBe('resp_turn_1'); + + // Second request's input must contain the function_call_output with + // the hook's return value. + const secondCallArg = mockBetaResponsesSend.mock.calls[1]?.[1]; + if (!isSendCallArg(secondCallArg)) { + throw new Error('Second send call arg did not match expected shape'); + } + const input = secondCallArg.responsesRequest.input; + if (!Array.isArray(input)) { + throw new Error('Expected second-turn input to be an array'); + } + const output = input.find(isFunctionCallOutput); + if (!output) { + throw new Error('Expected a function_call_output in second-turn input'); + } + expect(output.callId).toBe('c1'); + const rawOutput = output.output; + if (typeof rawOutput !== 'string') { + throw new Error('Expected function_call_output.output to be a string'); + } + expect(JSON.parse(rawOutput)).toEqual({ + ok: true, + }); + }); + + it('pause: onToolCalled returning null breaks the loop cleanly', async () => { + // Turn 0: model invokes the HITL tool. No turn 1 should follow. + const turn0 = makeResponse('resp_turn_0', [ + makeFunctionCallItem( + 'c1', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + ]); + + mockBetaResponsesSend.mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn0), + }); + + const approve = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + onToolCalled: async () => null, + }); + + const tools = [ + approve, + ] as const; + + const result = new ModelResult({ + request: { + model: 'test-model', + input: 'approve 5', + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + }); + + const finalResponse = await result.getResponse(); + + // The loop must NOT have made a follow-up request — the caller is + // expected to resume by invoking callModel again with a + // function_call_output they produce externally. + expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1); + + // The final response surfaces the pending function_call so the + // caller can see what needs to be resolved. + const toolCalls = await result.getToolCalls(); + expect(toolCalls).toHaveLength(1); + expect(toolCalls[0]?.id).toBe('c1'); + expect(toolCalls[0]?.name).toBe('approve'); + + // The response output should retain the paused function_call item. + const outputs = Array.isArray(finalResponse.output) + ? finalResponse.output + : [ + finalResponse.output, + ]; + const pendingCall = outputs.find( + (item): item is models.OutputFunctionCallItem => + typeof item === 'object' && + item !== null && + 'type' in item && + item.type === 'function_call', + ); + expect(pendingCall).toBeDefined(); + expect(pendingCall?.callId).toBe('c1'); + }); + + it('resume: caller-supplied function_call_output is transformed by onResponseReceived before the model sees it', async () => { + // Single turn: the caller resumes by supplying both the prior + // function_call and its function_call_output; the model replies + // after the hook rewrites the output. + const resumedTurn = makeResponse('resp_resumed', [ + makeMessageItem('msg_1', 'approved after review'), + ]); + + mockBetaResponsesSend.mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(resumedTurn), + }); + + const onToolCalled = vi.fn(async () => null); + const approve = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + onToolCalled, + onResponseReceived: async (raw) => { + // Transform the caller's raw output before the model sees it. + if (!isRawRecord(raw)) { + return { + ok: false, + reviewedAt: 1234, + }; + } + return { + ...raw, + reviewedAt: 1234, + }; + }, + }); + + const tools = [ + approve, + ] as const; + + // Caller-supplied input: the prior function_call plus the caller's + // raw function_call_output for c1. + const resumeInput: models.InputsUnion = [ + makeFunctionCallItem( + 'c1', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + { + type: 'function_call_output', + id: 'output_c1', + callId: 'c1', + output: JSON.stringify({ + ok: true, + }), + }, + ]; + + const result = new ModelResult({ + request: { + model: 'test-model', + input: resumeInput, + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + }); + + const text = await result.getText(); + + // Only one API call — resume is a single turn because the caller + // pre-baked the function_call_output. + expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1); + + // onToolCalled should NOT fire on resume — we are not reacting to a + // fresh function_call from the model, we are supplying a prior one. + expect(onToolCalled).not.toHaveBeenCalled(); + + // The request shipped to the mocked API must carry the hook's + // transformed output, not the caller's raw payload. + const firstCallArg = mockBetaResponsesSend.mock.calls[0]?.[1]; + if (!isSendCallArg(firstCallArg)) { + throw new Error('First send call arg did not match expected shape'); + } + const input = firstCallArg.responsesRequest.input; + if (!Array.isArray(input)) { + throw new Error('Expected resume input to be an array'); + } + const output = input.find(isFunctionCallOutput); + if (!output) { + throw new Error('Expected a function_call_output in the resume input'); + } + const rawOutput = output.output; + if (typeof rawOutput !== 'string') { + throw new Error('Expected function_call_output.output to be a string'); + } + const parsed: unknown = JSON.parse(rawOutput); + expect(parsed).toEqual({ + ok: true, + reviewedAt: 1234, + }); + + // The final text reflects the model's reply after it saw the + // transformed tool output. + expect(text).toBe('approved after review'); + }); +}); + +// --------------------------------------------------------------------------- +// ModelResult state-machine tests for HITL pause + resume fixes +// (verify ConversationStatus transitions and pendingToolCalls visibility) +// --------------------------------------------------------------------------- + +/** + * Minimal in-memory StateAccessor for driving ModelResult through + * awaiting_approval / awaiting_hitl transitions without a real DB. + */ +function createMemoryAccessor( + initial: ConversationState | null = null, +): { + accessor: StateAccessor; + get: () => ConversationState | null; +} { + let state = initial; + const accessor: StateAccessor = { + load: async () => state, + save: async (next) => { + state = next; + }, + }; + return { + accessor, + get: () => state, + }; +} + +describe('HITL pause persists state (Fix #2)', () => { + beforeEach(() => { + mockBetaResponsesSend.mockReset(); + }); + + it('HITL pause sets status=awaiting_hitl and populates pendingToolCalls', async () => { + const turn0 = makeResponse('resp_turn_0', [ + makeFunctionCallItem( + 'c1', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + ]); + + mockBetaResponsesSend.mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn0), + }); + + const approve = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + onToolCalled: async () => null, + }); + + const tools = [ + approve, + ] as const; + + const { accessor, get } = createMemoryAccessor(); + + const result = new ModelResult({ + request: { + model: 'test-model', + input: 'approve 5', + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + state: accessor, + }); + + // Drive the loop to completion (pause). + await result.getResponse(); + + // Only the initial turn was sent — no follow-up after pause. + expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1); + + const saved = get(); + expect(saved).not.toBeNull(); + expect(saved?.status).toBe('awaiting_hitl'); + expect(saved?.pendingToolCalls).toHaveLength(1); + expect(saved?.pendingToolCalls?.[0]?.id).toBe('c1'); + + // getPendingToolCalls() surfaces paused calls to the caller. + const pending = await result.getPendingToolCalls(); + expect(pending).toHaveLength(1); + expect(pending[0]?.id).toBe('c1'); + + // requiresApproval() returns true for HITL pauses too. + expect(await result.requiresApproval()).toBe(true); + }); +}); + +describe('Approved HITL pauses retain pendingToolCalls (Fix #9)', () => { + beforeEach(() => { + mockBetaResponsesSend.mockReset(); + }); + + it('resume: approved HITL that returns null stays in pendingToolCalls with status awaiting_hitl', async () => { + // This tool pauses every time — simulating a caller that must + // explicitly supply the output externally later. + const onToolCalled = vi.fn(async () => null); + const approve = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + requireApproval: true, + onToolCalled, + }); + + const tools = [ + approve, + ] as const; + + // Pre-populate state as if the prior run left us awaiting_approval + // with one pending call c1. + const pending: ParsedToolCall<(typeof tools)[number]> = { + id: 'c1', + name: 'approve', + arguments: { + amount: 5, + }, + }; + const savedState: ConversationState = { + id: 'conv_test', + messages: [ + makeFunctionCallItem( + 'c1', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + ], + pendingToolCalls: [ + pending, + ], + status: 'awaiting_approval', + createdAt: 0, + updatedAt: 0, + }; + + const { accessor, get } = createMemoryAccessor(savedState); + + const result = new ModelResult({ + request: { + model: 'test-model', + input: 'approve 5', + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + state: accessor, + // Caller approves c1 — but onToolCalled returns null, so the call + // must STAY on pendingToolCalls instead of silently disappearing. + approveToolCalls: [ + 'c1', + ], + }); + + // Drive initStream + processApprovalDecisions. + const pausedCalls = await result.getPendingToolCalls(); + + // The approved HITL call paused, so onToolCalled should have been + // invoked once during processApprovalDecisions. + expect(onToolCalled).toHaveBeenCalledTimes(1); + + // No API request should have been made — we paused before any + // follow-up turn. + expect(mockBetaResponsesSend).not.toHaveBeenCalled(); + + // The call must remain in pendingToolCalls (Fix #9). + expect(pausedCalls).toHaveLength(1); + expect(pausedCalls[0]?.id).toBe('c1'); + + const saved = get(); + expect(saved?.status).toBe('awaiting_hitl'); + expect(saved?.pendingToolCalls).toHaveLength(1); + expect(saved?.pendingToolCalls?.[0]?.id).toBe('c1'); + }); +}); + +describe('hasExecutableToolCalls treats HITL tools as auto-resolvable (Fix #1)', () => { + beforeEach(() => { + mockBetaResponsesSend.mockReset(); + }); + + it('HITL-only tool calls invoke onToolCalled instead of being classified as manual', async () => { + // If hasExecutableToolCalls still used hasExecuteFunction, the loop + // would exit immediately without invoking onToolCalled and no + // follow-up turn would be made. This test locks in that HITL tools + // DO drive the loop forward. + const onToolCalled = vi.fn(async () => ({ + ok: true, + })); + + const turn0 = makeResponse('resp_turn_0', [ + makeFunctionCallItem( + 'c1', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + ]); + const turn1 = makeResponse('resp_turn_1', [ + makeMessageItem('msg_1', 'approved'), + ]); + + mockBetaResponsesSend + .mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn0), + }) + .mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn1), + }); + + const approve = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), + onToolCalled, + }); + + const tools = [ + approve, + ] as const; + + const result = new ModelResult({ + request: { + model: 'test-model', + input: 'approve 5', + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + }); + + const final = await result.getResponse(); + + // Two turns means the HITL tool WAS recognized as executable. + expect(mockBetaResponsesSend).toHaveBeenCalledTimes(2); + expect(onToolCalled).toHaveBeenCalledTimes(1); + expect(final.id).toBe('resp_turn_1'); + }); +}); + +describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix #8)', () => { + beforeEach(() => { + mockBetaResponsesSend.mockReset(); + }); + + it('onResponseReceived fires once per caller-supplied output across init + resume', async () => { + // Scenario: caller supplies a function_call_output for a HITL tool in + // the initial input, the model requests another (approval-required) + // tool call in response, and after the caller approves that second + // call the loop resumes via continueWithUnsentResults. Before Fix #8 + // the initial caller-supplied output would be re-hooked during + // resume because applyOnResponseReceivedHooks walked the full + // accumulated newInput. + const onResponseReceived = vi.fn(async (raw: unknown) => { + if (!isRawRecord(raw)) { + return { + hooked: true, + }; + } + return { + ...raw, + hooked: true, + }; + }); + + const approve = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + onToolCalled: async () => ({ + ok: true, + }), + onResponseReceived, + }); + + const needsApproval = tool({ + name: 'dangerous', + inputSchema: z.object({ + x: z.number(), + }), + requireApproval: true, + execute: async () => ({ + done: true, + }), + }); + + const tools = [ + approve, + needsApproval, + ] as const; + + // Initial model turn requests `dangerous`, forcing an approval pause. + const turn0 = makeResponse('resp_turn_0', [ + makeFunctionCallItem( + 'd1', + 'dangerous', + JSON.stringify({ + x: 1, + }), + ), + ]); + // After the caller approves d1 and execution resumes, the model + // replies with a plain message. + const turn1 = makeResponse('resp_turn_1', [ + makeMessageItem('msg_1', 'done'), + ]); + + mockBetaResponsesSend + .mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn0), + }) + .mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(turn1), + }); + + // Caller-supplied initial input: a function_call_output for `approve` + // that will be hooked during init (via applyOnResponseReceivedHooks). + const initialInput: models.InputsUnion = [ + makeFunctionCallItem( + 'c0', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + { + type: 'function_call_output', + id: 'output_c0', + callId: 'c0', + output: JSON.stringify({ + ok: true, + }), + }, + ]; + + const { accessor } = createMemoryAccessor(); + + // First pass: initial run. The initial caller-supplied output gets + // hooked once during init. + const result1 = new ModelResult({ + request: { + model: 'test-model', + input: initialInput, + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + { + type: 'function', + name: 'dangerous', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + state: accessor, + }); + + await result1.getResponse(); + expect(onResponseReceived).toHaveBeenCalledTimes(1); + + // Second pass: resume by approving d1. continueWithUnsentResults + // must NOT re-hook the earlier c0 output (which lives in the saved + // messages from the first run). + onResponseReceived.mockClear(); + + const result2 = new ModelResult({ + request: { + model: 'test-model', + input: initialInput, + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + { + type: 'function', + name: 'dangerous', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + state: accessor, + approveToolCalls: [ + 'd1', + ], + }); + + await result2.getResponse(); + + // On resume, `dangerous` is an execute tool (no hook) and the c0 + // output lives in message history — the hook must NOT run again. + expect(onResponseReceived).not.toHaveBeenCalled(); + }); +}); From 51a983d256d3594060283e6a95b3c2fac0e9cfff Mon Sep 17 00:00:00 2001 From: Matt Apperson Date: Tue, 5 May 2026 20:19:13 -0400 Subject: [PATCH 4/5] fix(tools): address HITL review feedback - Preserve content-array FunctionCallOutputItem shapes in the HITL onResponseReceived pipeline instead of JSON.stringify'ing them; export isContentArray from conversation-state for reuse. - Require outputSchema on HITL tool configs and use it to validate caller-supplied responses. Validation failures (and thrown hooks) replace the output with {error, originalOutput}; executeHITLTool now validates onToolCalled's return unconditionally. - Stop re-processing historical function_call_output items at initStream. When resuming from saved state, only freshly-supplied input items are hooked; history's function_call items still power callId->name resolution. Refactored hookFreshToolOutputs into applyHooksToFreshItems. - Drop the onResponseReceived call on SDK-generated outputs in continueWithUnsentResults. The hook is now strictly for caller-supplied outputs, matching its documented semantics. --- packages/agent/src/lib/conversation-state.ts | 4 +- packages/agent/src/lib/model-result.ts | 172 +++-- packages/agent/src/lib/tool-executor.ts | 153 +++- packages/agent/src/lib/tool-types.ts | 8 +- packages/agent/src/lib/tool.ts | 22 +- packages/agent/tests/unit/hitl-tool.test.ts | 724 ++++++++++++++++--- 6 files changed, 851 insertions(+), 232 deletions(-) diff --git a/packages/agent/src/lib/conversation-state.ts b/packages/agent/src/lib/conversation-state.ts index d1845b6..d672643 100644 --- a/packages/agent/src/lib/conversation-state.ts +++ b/packages/agent/src/lib/conversation-state.ts @@ -244,7 +244,9 @@ export function createRejectedResult { - if (freshToolOutputs.length === 0) { - return freshToolOutputs; + ): Promise { + if (freshItems.length === 0) { + return freshItems; } - // Collect function_call items from history so `applyOnResponseReceivedHooks` - // can resolve callId -> toolName. We don't mutate history here — the items - // are only used to power the internal callIdToName map. - const historyArray = Array.isArray(messageHistory) - ? messageHistory + // Collect function_call items from history so the hook executor can + // resolve callId -> toolName without us having to mirror that logic. + const historyArray = Array.isArray(historicalItems) + ? historicalItems : [ - messageHistory, + historicalItems, ]; const functionCallItems: models.BaseInputsUnion[] = []; for (const item of historyArray) { - if (item && typeof item === 'object' && 'type' in item && item.type === 'function_call') { - functionCallItems.push(item as models.BaseInputsUnion); + if (isFunctionCallItem(item)) { + functionCallItems.push(item); } } + // Build a synthetic input that puts the historical function_calls + // BEFORE the fresh items. `applyOnResponseReceivedHooks` only rewrites + // function_call_output items, so the function_call items are seen only + // as name-resolution context. const syntheticInput: models.InputsUnion = [ ...functionCallItems, - ...freshToolOutputs, + ...freshItems, ]; const hookedInput = await applyOnResponseReceivedHooks( @@ -1209,27 +1222,21 @@ export class ModelResult< if (hookedInput === syntheticInput) { // No rewrites; return the originals unchanged. - return freshToolOutputs; + return freshItems; } - // Extract the rewritten tool outputs (they sit after the function_calls). + // Drop the leading function_call items we prepended; what remains is + // the fresh items in their original order (some with rewritten outputs). const hookedArray = Array.isArray(hookedInput) ? hookedInput : [ hookedInput, ]; - const extracted: models.FunctionCallOutputItem[] = []; - for (const item of hookedArray) { - if ( - item && - typeof item === 'object' && - 'type' in item && - item.type === 'function_call_output' - ) { - extracted.push(item as models.FunctionCallOutputItem); - } + if (hookedArray.length !== syntheticInput.length) { + // Shouldn't happen (hooks only rewrite in-place), but be conservative. + return freshItems; } - return extracted.length === freshToolOutputs.length ? extracted : freshToolOutputs; + return hookedArray.slice(functionCallItems.length); } /** @@ -1358,37 +1365,54 @@ export class ModelResult< // Resolve any async functions first let baseRequest = await this.resolveRequestForContext(initialContext); - // If we have state with existing messages, use those as input - if ( - this.currentState?.messages && + // Split input into "historical" (already in state.messages) and "fresh" + // (newly supplied this turn). `onResponseReceived` must fire only for + // fresh items — re-hooking historical outputs on every callModel call + // would double-invoke non-idempotent hooks. + const hasLoadedHistory = + !!this.currentState?.messages && Array.isArray(this.currentState.messages) && - this.currentState.messages.length > 0 - ) { - // Append new input to existing messages + this.currentState.messages.length > 0; + + if (hasLoadedHistory && this.currentState) { + // `currentState.messages` is InputsUnion — keep it as that union so + // appendToMessages (which expects InputsUnion) accepts it directly. + const historicalMessages: models.InputsUnion = this.currentState.messages; + + // Normalize the caller-supplied input for this turn into an array of + // fresh items. Undefined stays undefined (no new items). The widening + // to BaseInputsUnion[] matches the signature of appendToMessages and + // mirrors the pre-existing pattern elsewhere in this file; the two + // union shapes (InputsUnion1 vs BaseInputsUnion1) describe the same + // SDK input items with different nominal types, and BaseInputsUnion + // already includes `any` in its element type, so the runtime shape + // is preserved either way. const newInput = baseRequest.input; - if (newInput) { - const inputArray = Array.isArray(newInput) - ? newInput + let freshItems: models.BaseInputsUnion[] | undefined; + if (newInput !== undefined) { + freshItems = Array.isArray(newInput) + ? (newInput as models.BaseInputsUnion[]) : [ newInput, ]; - baseRequest = { - ...baseRequest, - input: appendToMessages( - this.currentState.messages, - inputArray as models.BaseInputsUnion[], - ), - }; - } else { - baseRequest = { - ...baseRequest, - input: this.currentState.messages, - }; } - } - // Apply onResponseReceived hooks to caller-supplied tool-output items - if (baseRequest.input !== undefined) { + // Hook fresh items only (historical function_calls serve as + // name-resolution context). Leave historical items untouched. + const hookedFresh = freshItems + ? await this.applyHooksToFreshItems(freshItems, historicalMessages, initialContext) + : undefined; + + baseRequest = { + ...baseRequest, + input: hookedFresh + ? appendToMessages(historicalMessages, hookedFresh) + : historicalMessages, + }; + } else if (baseRequest.input !== undefined) { + // No loaded history — everything in input is fresh. Hook the whole + // thing (non-array inputs pass through applyOnResponseReceivedHooks + // unchanged). const hookedInput = await applyOnResponseReceivedHooks( baseRequest.input, this.options.tools, @@ -1597,19 +1621,13 @@ export class ModelResult< numberOfTurns: this.allToolExecutionRounds.length + 1, }; - // Apply onResponseReceived hooks ONLY to the newly-added tool outputs. - // `currentMessages` already contains caller-supplied outputs that were - // hooked during init (see `initStream` -> `applyOnResponseReceivedHooks`); - // re-hooking them would double-invoke non-idempotent hooks. + // Append SDK-generated tool outputs directly — `onResponseReceived` is + // reserved for caller-supplied outputs (the resume-with-function-call- + // output path, hooked during init). SDK-produced outputs from auto- + // executed tools already went through the tool's own execute/generator + // pipeline and must not be mutated by the resume hook. const currentMessages = this.currentState.messages; - const hookedToolOutputs = await this.hookFreshToolOutputs( - toolOutputs, - currentMessages, - turnContext, - ); - - // Build new input with the hooked tool results appended - const newInput = appendToMessages(currentMessages, hookedToolOutputs); + const newInput = appendToMessages(currentMessages, toolOutputs); // Clear unsent results from state this.currentState = updateState(this.currentState, { @@ -1623,10 +1641,10 @@ export class ModelResult< // Build request with the updated input const baseRequest = await this.resolveRequestForContext(turnContext); - // Create request with the accumulated messages. No full-input re-hooking: - // the fresh tool outputs were hooked above via `hookFreshToolOutputs`, and - // previously-hooked items in `newInput` (carried over from init) must not - // be re-hooked here (non-idempotent hooks would double-fire). + // No hooking here: SDK-generated outputs are appended as-is and any + // caller-supplied items in `newInput` (carried over from init) were + // already hooked during `initStream` — re-hooking would double-fire + // non-idempotent hooks. const request: models.ResponsesRequest = { ...baseRequest, input: newInput, diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts index 3ea3a74..9b7ab46 100644 --- a/packages/agent/src/lib/tool-executor.ts +++ b/packages/agent/src/lib/tool-executor.ts @@ -1,6 +1,7 @@ import type * as models from '@openrouter/sdk/models'; import * as z4 from 'zod/v4'; import type { $ZodObject, $ZodShape, $ZodType } from 'zod/v4/core'; +import { isContentArray } from './conversation-state.js'; import { isFunctionCallItem, isFunctionCallOutputItem } from './stream-type-guards.js'; import type { ToolContextStore } from './tool-context.js'; import { buildToolExecuteContext } from './tool-context.js'; @@ -357,19 +358,12 @@ export async function executeHITLTool( return null; } - if (tool.function.outputSchema) { - const validatedOutput = validateToolOutput(tool.function.outputSchema, result); - return { - toolCallId: toolCall.id, - toolName: toolCall.name, - result: validatedOutput, - }; - } - + // outputSchema is required on HITL tools — validate unconditionally. + const validatedOutput = validateToolOutput(tool.function.outputSchema, result); return { toolCallId: toolCall.id, toolName: toolCall.name, - result, + result: validatedOutput, }; } catch (error) { return { @@ -471,15 +465,72 @@ function isItemArray( } /** - * Walk the input items and apply `onResponseReceived` hooks for HITL tools. + * Serialize a value for `FunctionCallOutputItem.output`, preserving the + * content-array shape (input_text/input_image/input_file blocks) verbatim so + * multimodal outputs are not corrupted by JSON.stringify. All other values + * are stringified. Matches the detection used elsewhere in the SDK + * (`unsentResultsToAPIFormat`). + */ +function toFunctionCallOutputValue( + value: unknown, +): string | models.FunctionCallOutputItemOutputUnion1[] { + if (isContentArray(value)) { + return value; + } + return JSON.stringify(value); +} + +/** + * Format a validation/hook-error payload alongside the caller's original output. + * The wrapper itself is a synthetic object and is always JSON. The + * `originalOutput` slot preserves a content-array shape verbatim if the + * caller supplied one, so the model can still see the rich payload that + * triggered the error. + */ +function formatHookError( + message: string, + originalOutput: unknown, +): string | models.FunctionCallOutputItemOutputUnion1[] { + if (isContentArray(originalOutput)) { + // When the caller supplied a content array, we cannot cram both the + // error wrapper and the content blocks into a single content array — + // the `FunctionCallOutputItemOutputUnion1` members are visible-to-model + // blocks, not arbitrary JSON. Emit a text block carrying the error plus + // the original content blocks so the model sees both. + return [ + { + type: 'input_text', + text: JSON.stringify({ + error: message, + }), + }, + ...originalOutput, + ]; + } + return JSON.stringify({ + error: message, + originalOutput, + }); +} + +/** + * Walk the input items and apply HITL per-tool validation plus any + * `onResponseReceived` hooks. * * For each `function_call_output` item in `input`, locate the matching * `function_call` (by `callId`) to identify the tool name. If that tool is a - * HITL tool with an `onResponseReceived` hook, invoke it with the parsed output - * and replace the item's `output` with the hook's return value (stringified). + * HITL tool, validate the value the model will see against the tool's + * `outputSchema`: + * + * - With `onResponseReceived`: invoke it with the parsed raw output, validate + * the hook's return against `outputSchema`, then serialize (preserving + * content-array shapes). Hook throws or validation failures are replaced + * with `{error, originalOutput}` so the model sees a tool error. + * - Without `onResponseReceived`: validate the parsed raw output against + * `outputSchema`. If it matches, leave the item untouched. Otherwise + * replace with `{error, originalOutput}` using the same shape. * - * If a hook throws, the output is replaced with `{"error": ""}` so the - * model sees a tool error. Items that don't match a HITL tool are left untouched. + * Items that don't match a HITL tool are left untouched. * * @returns a new input array when any item was rewritten, otherwise the original input. */ @@ -499,14 +550,10 @@ export async function applyOnResponseReceivedHooks( if (hitlTools.length === 0) { return input; } - const hookByName = new Map(); + // All HITL tools participate now (validation runs even without a hook). + const hitlByName = new Map(); for (const t of hitlTools) { - if (t.function.onResponseReceived) { - hookByName.set(t.function.name, t); - } - } - if (hookByName.size === 0) { - return input; + hitlByName.set(t.function.name, t); } // Build callId -> name map from function_call items in the array @@ -533,13 +580,15 @@ export async function applyOnResponseReceivedHooks( rewritten.push(item); continue; } - const tool = hookByName.get(toolName); - if (!tool?.function.onResponseReceived) { + const tool = hitlByName.get(toolName); + if (!tool) { rewritten.push(item); continue; } const raw = item.output; + // Parse string outputs as JSON when possible; leave content arrays and + // non-string inputs (e.g. already-parsed objects) alone. let parsed: unknown = raw; if (typeof raw === 'string') { try { @@ -549,21 +598,49 @@ export async function applyOnResponseReceivedHooks( } } - const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema); + let newOutput: string | models.FunctionCallOutputItemOutputUnion1[]; - let newOutput: string; - try { - const hookResult = await Promise.resolve( - tool.function.onResponseReceived(parsed, executeContext), - ); - newOutput = JSON.stringify(hookResult); - } catch (err) { - // Preserve the caller's original output alongside the hook error so the - // model can distinguish a hook failure from a tool-reported error. - newOutput = JSON.stringify({ - error: err instanceof Error ? err.message : String(err), - originalOutput: parsed, - }); + if (tool.function.onResponseReceived) { + const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema); + try { + const hookResult = await Promise.resolve( + tool.function.onResponseReceived(parsed, executeContext), + ); + // Validate the hook's return against outputSchema — the model must + // only see schema-conformant values. A validation failure here is + // surfaced the same way a hook throw is. + try { + validateToolOutput(tool.function.outputSchema, hookResult); + } catch (validationErr) { + newOutput = formatHookError( + validationErr instanceof Error ? validationErr.message : String(validationErr), + parsed, + ); + const replaced: models.FunctionCallOutputItem = { + ...item, + output: newOutput, + }; + rewritten.push(replaced); + changed = true; + continue; + } + newOutput = toFunctionCallOutputValue(hookResult); + } catch (err) { + // Preserve the caller's original output alongside the hook error so + // the model can distinguish a hook failure from a tool-reported error. + newOutput = formatHookError(err instanceof Error ? err.message : String(err), parsed); + } + } else { + // No hook — validate the parsed raw output against outputSchema. + // On success, leave the item untouched (same as the pre-change + // behavior). On failure, replace with an error wrapper so the model + // sees why the caller-supplied payload was rejected. + const parseResult = z4.safeParse(tool.function.outputSchema, parsed); + if (parseResult.success) { + rewritten.push(item); + continue; + } + newOutput = formatHookError(parseResult.error.message, parsed); } const replaced: models.FunctionCallOutputItem = { diff --git a/packages/agent/src/lib/tool-types.ts b/packages/agent/src/lib/tool-types.ts index d34d269..e768358 100644 --- a/packages/agent/src/lib/tool-types.ts +++ b/packages/agent/src/lib/tool-types.ts @@ -296,7 +296,13 @@ export interface HITLToolFunction< TContext extends Record = Record, TName extends string = string, > extends BaseToolFunction { - outputSchema?: TOutput; + /** + * Required for HITL tools. Used to validate both the `onToolCalled` return + * value (when non-null) and the caller-supplied response that comes back via + * a matching `function_call_output` — whether transformed by + * `onResponseReceived` or passed through directly when no hook is defined. + */ + outputSchema: TOutput; onToolCalled: ( params: zodInfer, context?: ToolExecuteContext, diff --git a/packages/agent/src/lib/tool.ts b/packages/agent/src/lib/tool.ts index cb40c05..2d09bae 100644 --- a/packages/agent/src/lib/tool.ts +++ b/packages/agent/src/lib/tool.ts @@ -130,7 +130,13 @@ type HITLToolConfig< name: TName; description?: string; inputSchema: TInput; - outputSchema?: TOutput; + /** + * Required for HITL tools. Used to validate both the `onToolCalled` return + * value (when non-null) and the caller-supplied response that comes back via + * a matching `function_call_output` — whether transformed by + * `onResponseReceived` or passed through directly when no hook is defined. + */ + outputSchema: TOutput; eventSchema?: undefined; execute?: undefined; /** Zod schema declaring the context data this tool needs */ @@ -293,9 +299,19 @@ export function tool( // Check for HITL tool (has onToolCalled hook) if ('onToolCalled' in config && typeof config.onToolCalled === 'function') { + // outputSchema is required at the type level for HITL configs, but + // defensively check at runtime too — JavaScript callers can bypass types. + const hitlName = config.name; + if (!('outputSchema' in config) || config.outputSchema === undefined) { + throw new Error( + `HITL tool "${hitlName}" must declare an outputSchema. HITL tools require a schema so caller-supplied responses can be validated before the model sees them.`, + ); + } + const fn: HITLTool<$ZodObject<$ZodShape>, $ZodType>['function'] = { name: config.name, inputSchema: config.inputSchema, + outputSchema: config.outputSchema, onToolCalled: config.onToolCalled, }; @@ -303,10 +319,6 @@ export function tool( fn.description = config.description; } - if (config.outputSchema !== undefined) { - fn.outputSchema = config.outputSchema; - } - if (config.contextSchema !== undefined) { fn.contextSchema = config.contextSchema; } diff --git a/packages/agent/tests/unit/hitl-tool.test.ts b/packages/agent/tests/unit/hitl-tool.test.ts index 10b7bba..30c6830 100644 --- a/packages/agent/tests/unit/hitl-tool.test.ts +++ b/packages/agent/tests/unit/hitl-tool.test.ts @@ -87,6 +87,9 @@ describe('tool() factory — HITL tools', () => { inputSchema: z.object({ x: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => ({ ok: true, }), @@ -94,6 +97,21 @@ describe('tool() factory — HITL tools', () => { expect('onResponseReceived' in t.function).toBe(false); }); + it('throws when a HITL tool is created without an outputSchema', () => { + expect(() => + tool({ + name: 'approve', + inputSchema: z.object({ + x: z.number(), + }), + // @ts-expect-error outputSchema is now required for HITL tools + onToolCalled: async () => ({ + ok: true, + }), + }), + ).toThrow(/outputSchema/); + }); + it('isManualTool returns true only for tools with neither execute nor onToolCalled', () => { const manual = tool({ name: 'manual', @@ -107,6 +125,9 @@ describe('tool() factory — HITL tools', () => { inputSchema: z.object({ x: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, }); const regular = tool({ @@ -160,6 +181,9 @@ describe('executeHITLTool', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, }); @@ -179,6 +203,9 @@ describe('executeHITLTool', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => { throw new Error('boom'); }, @@ -229,6 +256,9 @@ describe('executeTool dispatcher with HITL tools', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + approved: z.boolean(), + }), onToolCalled: async () => ({ approved: true, }), @@ -253,6 +283,9 @@ describe('executeTool dispatcher with HITL tools', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, }); @@ -294,13 +327,20 @@ describe('applyOnResponseReceivedHooks', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + reviewedAt: z.number(), + }), onToolCalled: async () => null, onResponseReceived: async (raw) => { - const obj = raw as { - ok: boolean; - }; + if (!isRawRecord(raw)) { + return { + ok: false, + reviewedAt: 1234, + }; + } return { - ...obj, + ...raw, reviewedAt: 1234, }; }, @@ -372,6 +412,9 @@ describe('applyOnResponseReceivedHooks', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, onResponseReceived: async () => { throw new Error('invalid result'); @@ -412,6 +455,9 @@ describe('applyOnResponseReceivedHooks', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, onResponseReceived: async () => { throw new Error('invalid result'); @@ -441,12 +487,20 @@ describe('applyOnResponseReceivedHooks', () => { }); it('passes the parsed raw result (not the raw string) to the hook', async () => { - const spy = vi.fn(async (raw: unknown) => raw); + // Loose outputSchema accepting any object keeps the focus of this + // test on argument propagation, not schema validation. + const spy = vi.fn(async (raw: unknown) => { + if (!isRawRecord(raw)) { + return {}; + } + return raw; + }); const t: HITLTool = tool({ name: 'approve', inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.record(z.string(), z.unknown()), onToolCalled: async () => null, onResponseReceived: spy, }); @@ -472,12 +526,19 @@ describe('applyOnResponseReceivedHooks', () => { }); it('passes the raw string through to the hook when output is not JSON', async () => { - const spy = vi.fn(async (raw: unknown) => raw); + // Permissive outputSchema so the string hook-return passes validation. + const spy = vi.fn(async (raw: unknown) => { + if (typeof raw !== 'string') { + return 'fallback'; + } + return raw; + }); const t: HITLTool = tool({ name: 'approve', inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.string(), onToolCalled: async () => null, onResponseReceived: spy, }); @@ -503,11 +564,19 @@ describe('applyOnResponseReceivedHooks', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.record(z.string(), z.unknown()), onToolCalled: async () => null, - onResponseReceived: async (raw) => ({ - ...(raw as object), - tagged: true, - }), + onResponseReceived: async (raw) => { + if (!isRawRecord(raw)) { + return { + tagged: true, + }; + } + return { + ...raw, + tagged: true, + }; + }, }); // No function_call in the input at all — just an orphan output @@ -530,12 +599,15 @@ describe('applyOnResponseReceivedHooks', () => { expect(result).toBe(input); }); - it('ignores tools without onResponseReceived even if they are HITL', async () => { + it('validates raw output against outputSchema when no onResponseReceived hook is defined (passes)', async () => { const t = tool({ name: 'approve', inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, }); @@ -549,6 +621,264 @@ describe('applyOnResponseReceivedHooks', () => { ), ]; + // Schema-conformant raw outputs should leave the item (and the array + // reference) untouched. + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); + expect(result).toBe(input); + }); + + it('replaces output with an error object when raw does not match outputSchema and no hook is defined', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), + onToolCalled: async () => null, + }); + + const badPayload = { + ok: 'nope', + }; + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', JSON.stringify(badPayload)), + ]; + + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); + const arr = result as unknown[]; + const out = arr[1] as models.FunctionCallOutputItem; + const parsed = JSON.parse(out.output as string) as { + error: string; + originalOutput: unknown; + }; + expect(typeof parsed.error).toBe('string'); + expect(parsed.error.length).toBeGreaterThan(0); + expect(parsed.originalOutput).toEqual(badPayload); + }); + + it('replaces output with an error object when the hook return does not match outputSchema', async () => { + const t = tool({ + name: 'approve', + inputSchema: z.object({ + amount: z.number(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), + onToolCalled: async () => null, + // Intentionally return a value that does not match outputSchema. + onResponseReceived: async () => ({ + ok: 'yes' as unknown as boolean, + }), + }); + + const originalPayload = { + ok: true, + }; + const input: models.InputsUnion = [ + callItem('c1', 'approve'), + outputItem('c1', JSON.stringify(originalPayload)), + ]; + + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); + const arr = result as unknown[]; + const out = arr[1] as models.FunctionCallOutputItem; + const parsed = JSON.parse(out.output as string) as { + error: string; + originalOutput: unknown; + }; + expect(typeof parsed.error).toBe('string'); + expect(parsed.error.length).toBeGreaterThan(0); + expect(parsed.originalOutput).toEqual(originalPayload); + }); + + it('preserves a content-array hook return verbatim (no JSON.stringify)', async () => { + const contentArray = [ + { + type: 'input_text' as const, + text: 'hello', + }, + { + type: 'input_image' as const, + detail: 'auto' as const, + imageUrl: 'https://example.com/img.png', + }, + ]; + + const t = tool({ + name: 'describe', + inputSchema: z.object({ + id: z.string(), + }), + // Loose schema — we care about shape preservation, not the schema type. + outputSchema: z.array(z.record(z.string(), z.unknown())), + onToolCalled: async () => null, + onResponseReceived: async () => contentArray, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'describe'), + outputItem( + 'c1', + JSON.stringify({ + ok: true, + }), + ), + ]; + + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); + const arr = result as unknown[]; + const out = arr[1] as models.FunctionCallOutputItem; + // output should be the content-array verbatim, not a JSON string. + expect(Array.isArray(out.output)).toBe(true); + expect(out.output).toEqual(contentArray); + }); + + it('preserves a caller-supplied content-array output across no-hook validation', async () => { + const contentArray = [ + { + type: 'input_text' as const, + text: 'plain text', + }, + ]; + + const t = tool({ + name: 'describe', + inputSchema: z.object({ + id: z.string(), + }), + outputSchema: z.array(z.record(z.string(), z.unknown())), + onToolCalled: async () => null, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'describe'), + { + type: 'function_call_output', + id: 'output_c1', + callId: 'c1', + output: contentArray, + }, + ]; + + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); + // Schema-conformant content-array output should be untouched. + expect(result).toBe(input); + }); + + it('preserves a caller-supplied content-array in originalOutput when the hook throws', async () => { + const contentArray = [ + { + type: 'input_text' as const, + text: 'caller payload', + }, + ]; + + const t = tool({ + name: 'describe', + inputSchema: z.object({ + id: z.string(), + }), + outputSchema: z.object({ + ok: z.boolean(), + }), + onToolCalled: async () => null, + onResponseReceived: async () => { + throw new Error('bad hook'); + }, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'describe'), + { + type: 'function_call_output', + id: 'output_c1', + callId: 'c1', + output: contentArray, + }, + ]; + + const result = await applyOnResponseReceivedHooks( + input, + [ + t, + ], + turnContext, + ); + const arr = result as unknown[]; + const out = arr[1] as models.FunctionCallOutputItem; + // A content-array originalOutput cannot be embedded inside a JSON + // wrapper (the output union is a content array of visible blocks, not + // arbitrary JSON). The replacement uses a content-array form: the + // error as an input_text block followed by the original blocks. + if (!Array.isArray(out.output)) { + throw new Error('Expected content-array output on hook failure'); + } + const errBlock = out.output[0]; + expect(errBlock?.type).toBe('input_text'); + if (!errBlock || errBlock.type !== 'input_text') { + throw new Error('Expected first block to be input_text'); + } + const errPayload = JSON.parse(errBlock.text) as { + error: string; + }; + expect(errPayload.error).toBe('bad hook'); + // Remaining blocks are the caller's original content-array payload. + expect(out.output.slice(1)).toEqual(contentArray); + }); + + it('ignores manual tools (no onToolCalled) — they are not HITL', async () => { + const t = tool({ + name: 'manual', + inputSchema: z.object({ + x: z.number(), + }), + execute: false, + }); + + const input: models.InputsUnion = [ + callItem('c1', 'manual'), + outputItem( + 'c1', + JSON.stringify({ + y: 1, + }), + ), + ]; + const result = await applyOnResponseReceivedHooks( input, [ @@ -817,6 +1147,9 @@ describe('HITL flow through ModelResult (integration)', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, }); @@ -892,6 +1225,10 @@ describe('HITL flow through ModelResult (integration)', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + reviewedAt: z.number(), + }), onToolCalled, onResponseReceived: async (raw) => { // Transform the caller's raw output before the model sees it. @@ -902,7 +1239,7 @@ describe('HITL flow through ModelResult (integration)', () => { }; } return { - ...raw, + ok: raw.ok === true, reviewedAt: 1234, }; }, @@ -1044,6 +1381,9 @@ describe('HITL pause persists state (Fix #2)', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), onToolCalled: async () => null, }); @@ -1108,6 +1448,9 @@ describe('Approved HITL pauses retain pendingToolCalls (Fix #9)', () => { inputSchema: z.object({ amount: z.number(), }), + outputSchema: z.object({ + ok: z.boolean(), + }), requireApproval: true, onToolCalled, }); @@ -1271,20 +1614,29 @@ describe('hasExecutableToolCalls treats HITL tools as auto-resolvable (Fix #1)', }); }); -describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix #8)', () => { +describe('continueWithUnsentResults does not hook SDK-generated outputs (Fix #4)', () => { beforeEach(() => { mockBetaResponsesSend.mockReset(); }); - it('onResponseReceived fires once per caller-supplied output across init + resume', async () => { - // Scenario: caller supplies a function_call_output for a HITL tool in - // the initial input, the model requests another (approval-required) - // tool call in response, and after the caller approves that second - // call the loop resumes via continueWithUnsentResults. Before Fix #8 - // the initial caller-supplied output would be re-hooked during - // resume because applyOnResponseReceivedHooks walked the full - // accumulated newInput. + it('onResponseReceived does NOT fire for SDK-generated outputs produced during continueWithUnsentResults', async () => { + // Scenario: a HITL tool requires approval AND defines both onToolCalled + // (which produces an output on approval) and onResponseReceived (which + // is documented to apply only to CALLER-supplied outputs — i.e. the + // resume-by-passing-a-function_call_output path). When the caller + // approves the pending call, processApprovalDecisions invokes + // onToolCalled, queues the result in unsentToolResults, and + // continueWithUnsentResults stitches it onto the message history and + // sends the follow-up request. The SDK-generated output must NOT be + // run through onResponseReceived — that hook is reserved for outputs + // the caller produces externally. + const onToolCalled = vi.fn(async () => ({ + ok: true, + })); const onResponseReceived = vi.fn(async (raw: unknown) => { + // Should never run in this test. If it does, the assertion below + // catches it; returning a distinct payload makes misbehaviour + // easy to diagnose in the wire capture as well. if (!isRawRecord(raw)) { return { hooked: true, @@ -1301,82 +1653,229 @@ describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix inputSchema: z.object({ amount: z.number(), }), - onToolCalled: async () => ({ - ok: true, + outputSchema: z.object({ + ok: z.boolean(), }), + requireApproval: true, + onToolCalled, onResponseReceived, }); - const needsApproval = tool({ - name: 'dangerous', + const tools = [ + approve, + ] as const; + + // Pre-populate state as if a prior run left us awaiting_approval with + // one pending call c1. This bypasses the initial model turn (we are + // jumping straight to the resume path) so the only invocation of a + // hook we care about happens inside continueWithUnsentResults. + const pending: ParsedToolCall<(typeof tools)[number]> = { + id: 'c1', + name: 'approve', + arguments: { + amount: 5, + }, + }; + const savedState: ConversationState = { + id: 'conv_test', + messages: [ + makeFunctionCallItem( + 'c1', + 'approve', + JSON.stringify({ + amount: 5, + }), + ), + ], + pendingToolCalls: [ + pending, + ], + status: 'awaiting_approval', + createdAt: 0, + updatedAt: 0, + }; + + // After the approval resolves and the follow-up request fires, the + // model replies with a plain message. + const followupTurn = makeResponse('resp_followup', [ + makeMessageItem('msg_1', 'done'), + ]); + mockBetaResponsesSend.mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(followupTurn), + }); + + const { accessor } = createMemoryAccessor(savedState); + + const result = new ModelResult({ + request: { + model: 'test-model', + input: 'approve 5', + tools: [ + { + type: 'function', + name: 'approve', + description: null, + strict: null, + parameters: {}, + }, + ], + }, + client: {} as OpenRouterCore, + tools, + state: accessor, + // Caller approves c1 — onToolCalled will produce {ok: true} and the + // follow-up request will carry that SDK-generated output. + approveToolCalls: [ + 'c1', + ], + }); + + await result.getResponse(); + + // onToolCalled fired exactly once (during processApprovalDecisions). + expect(onToolCalled).toHaveBeenCalledTimes(1); + + // The critical assertion: onResponseReceived must NOT have fired. + // SDK-generated outputs are appended verbatim; the hook is reserved + // for caller-supplied outputs only. + expect(onResponseReceived).not.toHaveBeenCalled(); + + // And the wire request must carry the raw SDK-generated output, not + // a hook-transformed version. + expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1); + const callArg = mockBetaResponsesSend.mock.calls[0]?.[1]; + if (!isSendCallArg(callArg)) { + throw new Error('Send call arg did not match expected shape'); + } + const input = callArg.responsesRequest.input; + if (!Array.isArray(input)) { + throw new Error('Expected follow-up input to be an array'); + } + const output = input.find(isFunctionCallOutput); + if (!output) { + throw new Error('Expected a function_call_output in the follow-up input'); + } + const rawOutput = output.output; + if (typeof rawOutput !== 'string') { + throw new Error('Expected function_call_output.output to be a string'); + } + expect(JSON.parse(rawOutput)).toEqual({ + ok: true, + }); + }); +}); + +describe('initStream does not re-hook historical outputs (Fix #3)', () => { + beforeEach(() => { + mockBetaResponsesSend.mockReset(); + }); + + it('onResponseReceived fires only for fresh caller-supplied outputs, not historical ones loaded from state', async () => { + // Scenario: state already contains a function_call / function_call_output + // pair from a prior run (the output was hooked when it was first + // supplied). On a new callModel invocation, the caller passes fresh + // input that includes ANOTHER function_call / function_call_output pair + // for the same HITL tool. The hook must fire for the fresh output + // ONLY — the historical output must not be re-hooked (non-idempotent + // hooks would double-fire otherwise). + const onResponseReceived = vi.fn(async (raw: unknown) => { + if (!isRawRecord(raw)) { + return { + hooked: true, + }; + } + return { + ...raw, + hooked: true, + }; + }); + + const approve = tool({ + name: 'approve', inputSchema: z.object({ - x: z.number(), + amount: z.number(), }), - requireApproval: true, - execute: async () => ({ - done: true, + outputSchema: z.object({ + ok: z.boolean(), }), + onToolCalled: async () => null, + onResponseReceived, }); const tools = [ approve, - needsApproval, ] as const; - // Initial model turn requests `dangerous`, forcing an approval pause. - const turn0 = makeResponse('resp_turn_0', [ - makeFunctionCallItem( - 'd1', - 'dangerous', - JSON.stringify({ - x: 1, - }), - ), - ]); - // After the caller approves d1 and execution resumes, the model - // replies with a plain message. - const turn1 = makeResponse('resp_turn_1', [ - makeMessageItem('msg_1', 'done'), - ]); - - mockBetaResponsesSend - .mockResolvedValueOnce({ - ok: true, - value: makeCompletedStream(turn0), - }) - .mockResolvedValueOnce({ + // Pre-populated state: a historical function_call + function_call_output + // pair (c0) that was already hooked on the prior run. Its `output` + // field here simulates an already-hooked payload. If initStream + // re-hooks it, the mock will observe a second invocation for c0. + const historicalCall = makeFunctionCallItem( + 'c0', + 'approve', + JSON.stringify({ + amount: 1, + }), + ); + const historicalOutput: models.FunctionCallOutputItem = { + type: 'function_call_output', + id: 'output_c0', + callId: 'c0', + output: JSON.stringify({ ok: true, - value: makeCompletedStream(turn1), - }); + hooked: true, + }), + }; + + const savedState: ConversationState = { + id: 'conv_test', + messages: [ + historicalCall, + historicalOutput, + ], + status: 'in_progress', + createdAt: 0, + updatedAt: 0, + }; - // Caller-supplied initial input: a function_call_output for `approve` - // that will be hooked during init (via applyOnResponseReceivedHooks). - const initialInput: models.InputsUnion = [ + // Fresh input: a new function_call / function_call_output pair (c1) + // that the caller is supplying this turn. The hook should fire for + // c1 but NOT for c0. + const freshInput: models.InputsUnion = [ makeFunctionCallItem( - 'c0', + 'c1', 'approve', JSON.stringify({ - amount: 5, + amount: 2, }), ), { type: 'function_call_output', - id: 'output_c0', - callId: 'c0', + id: 'output_c1', + callId: 'c1', output: JSON.stringify({ ok: true, }), }, ]; - const { accessor } = createMemoryAccessor(); + // After the fresh output is hooked and appended to history, the model + // replies with a plain message. + const replyTurn = makeResponse('resp_reply', [ + makeMessageItem('msg_1', 'noted'), + ]); + mockBetaResponsesSend.mockResolvedValueOnce({ + ok: true, + value: makeCompletedStream(replyTurn), + }); + + const { accessor } = createMemoryAccessor(savedState); - // First pass: initial run. The initial caller-supplied output gets - // hooked once during init. - const result1 = new ModelResult({ + const result = new ModelResult({ request: { model: 'test-model', - input: initialInput, + input: freshInput, tools: [ { type: 'function', @@ -1385,13 +1884,6 @@ describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix strict: null, parameters: {}, }, - { - type: 'function', - name: 'dangerous', - description: null, - strict: null, - parameters: {}, - }, ], }, client: {} as OpenRouterCore, @@ -1399,47 +1891,59 @@ describe('continueWithUnsentResults does not re-hook caller-supplied items (Fix state: accessor, }); - await result1.getResponse(); + await result.getResponse(); + + // Hook fired exactly once — for the fresh c1 output — and received + // the fresh payload, NOT the already-hooked historical one. expect(onResponseReceived).toHaveBeenCalledTimes(1); + const hookArg = onResponseReceived.mock.calls[0]?.[0]; + expect(hookArg).toEqual({ + ok: true, + }); - // Second pass: resume by approving d1. continueWithUnsentResults - // must NOT re-hook the earlier c0 output (which lives in the saved - // messages from the first run). - onResponseReceived.mockClear(); + // The wire request must carry both outputs with the correct values: + // - c0: unchanged historical output (not re-hooked) + // - c1: hooked fresh output + expect(mockBetaResponsesSend).toHaveBeenCalledTimes(1); + const callArg = mockBetaResponsesSend.mock.calls[0]?.[1]; + if (!isSendCallArg(callArg)) { + throw new Error('Send call arg did not match expected shape'); + } + const input = callArg.responsesRequest.input; + if (!Array.isArray(input)) { + throw new Error('Expected input to be an array'); + } + const outputs = input.filter(isFunctionCallOutput); + expect(outputs).toHaveLength(2); - const result2 = new ModelResult({ - request: { - model: 'test-model', - input: initialInput, - tools: [ - { - type: 'function', - name: 'approve', - description: null, - strict: null, - parameters: {}, - }, - { - type: 'function', - name: 'dangerous', - description: null, - strict: null, - parameters: {}, - }, - ], - }, - client: {} as OpenRouterCore, - tools, - state: accessor, - approveToolCalls: [ - 'd1', - ], + const c0Output = outputs.find((o) => o.callId === 'c0'); + if (!c0Output) { + throw new Error('Expected c0 output to be preserved'); + } + const c0Raw = c0Output.output; + if (typeof c0Raw !== 'string') { + throw new Error('Expected c0 output.output to be a string'); + } + // c0 preserved verbatim — NOT re-hooked. The hooked: true marker is + // from the historical (prior-run) payload, not a re-hook. + expect(JSON.parse(c0Raw)).toEqual({ + ok: true, + hooked: true, }); - await result2.getResponse(); - - // On resume, `dangerous` is an execute tool (no hook) and the c0 - // output lives in message history — the hook must NOT run again. - expect(onResponseReceived).not.toHaveBeenCalled(); + const c1Output = outputs.find((o) => o.callId === 'c1'); + if (!c1Output) { + throw new Error('Expected c1 output to be present'); + } + const c1Raw = c1Output.output; + if (typeof c1Raw !== 'string') { + throw new Error('Expected c1 output.output to be a string'); + } + // c1 was hooked — the fresh payload plus hooked: true added by the + // onResponseReceived implementation above. + expect(JSON.parse(c1Raw)).toEqual({ + ok: true, + hooked: true, + }); }); }); From 396d44b3371df99ddf660288a8d53cd4fc810728 Mon Sep 17 00:00:00 2001 From: Matt Apperson Date: Tue, 5 May 2026 20:34:12 -0400 Subject: [PATCH 5/5] refactor(tools): extract helpers to keep cc under structural gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit applyOnResponseReceivedHooks and executeToolRound grew cyclomatic complexity above 15 when HITL logic landed, tripping sentrux gate (complex functions 9 -> 11). Extract per-item helpers so the gate matches baseline again: - applyOnResponseReceivedHooks: move per-item hook/validate logic into computeHitlItemOutput + invokeOnResponseReceived; split map builders into buildHitlToolMap / buildCallIdToNameMap; parseRawFunctionCallOutput replaces the inline try/JSON.parse branch. - executeToolRound: move the output-for-model branching into computeToolOutputForModel; describeNonRecord factors out the nested typeof/Array.isArray ternary. Pure refactor — no behavior change. All 280 unit tests still pass; sentrux gate reports "No degradation detected". --- packages/agent/src/lib/model-result.ts | 92 +++++++---- packages/agent/src/lib/tool-executor.ts | 199 ++++++++++++++---------- 2 files changed, 178 insertions(+), 113 deletions(-) diff --git a/packages/agent/src/lib/model-result.ts b/packages/agent/src/lib/model-result.ts index 26d3500..9a3670d 100644 --- a/packages/agent/src/lib/model-result.ts +++ b/packages/agent/src/lib/model-result.ts @@ -92,6 +92,20 @@ function isRecord(value: unknown): value is Record { return typeof value === 'object' && value !== null && !Array.isArray(value); } +/** + * Human-readable label for a value that failed the `isRecord` check. Used + * exclusively to make `toModelOutput` misuse errors specific. + */ +function describeNonRecord(value: unknown): string { + if (value === null) { + return 'null'; + } + if (Array.isArray(value)) { + return 'array'; + } + return typeof value; +} + /** * Type guard for stream event responses * Checks constructor name and readable stream behavior @@ -782,6 +796,52 @@ export class ModelResult< await this.saveStateSafely(stateUpdates); } + /** + * Compute the `output` payload sent to the model for a successfully + * settled tool execution. Routes through `toModelOutput` when the tool + * defines one (which may itself throw to surface an error), falls back to + * `JSON.stringify(result)` otherwise, and emits an error envelope when the + * executor itself reported an error. + */ + private async computeToolOutputForModel(value: { + toolCall: ParsedToolCall; + tool: Tool; + result: { + result: unknown; + error?: Error; + }; + }): Promise { + if (value.result.error) { + return JSON.stringify({ + error: value.result.error.message, + }); + } + + if (!isAutoResolvableTool(value.tool) || !value.tool.function.toModelOutput) { + return JSON.stringify(value.result.result); + } + + // Arguments have already been validated upstream by the tool's Zod + // inputSchema (which must be a ZodObject), so the runtime shape is + // always a record here. A non-record value here signals a real upstream + // bug we want surfaced, not a case to paper over with `{}`. + const rawArgs: unknown = value.toolCall.arguments; + if (!isRecord(rawArgs)) { + throw new Error( + `toolCall.arguments for "${value.toolCall.name}" must be an object after Zod validation, got ${describeNonRecord(rawArgs)}`, + ); + } + + const modelOutputResult = await value.tool.function.toModelOutput({ + output: value.result.result, + input: rawArgs, + }); + if (modelOutputResult.type === 'content') { + return modelOutputResult.value; + } + return JSON.stringify(value.result.result); + } + /** * Execute all tools in a single round in parallel. * Emits tool.result events after tool execution completes. @@ -946,37 +1006,7 @@ export class ModelResult< value.preliminaryResultsForCall.length > 0 ? value.preliminaryResultsForCall : undefined, ); - let outputForModel: string | models.FunctionCallOutputItemOutputUnion1[]; - - if (value.result.error) { - outputForModel = JSON.stringify({ - error: value.result.error.message, - }); - } else if (value.tool.function.toModelOutput) { - // toModelOutput exists - call it (may throw, which surfaces the error). - // Arguments have already been validated upstream by the tool's Zod - // inputSchema (which must be a ZodObject), so the runtime shape is - // always a record here. The `Tool` union widening loses the specific - // InferToolInput type, so we re-narrow defensively — a non-record - // value here signals a real upstream bug we want surfaced, not a - // case to paper over with `{}`. - const rawArgs: unknown = value.toolCall.arguments; - if (!isRecord(rawArgs)) { - throw new Error( - `toolCall.arguments for "${value.toolCall.name}" must be an object after Zod validation, got ${rawArgs === null ? 'null' : Array.isArray(rawArgs) ? 'array' : typeof rawArgs}`, - ); - } - const modelOutputResult = await value.tool.function.toModelOutput({ - output: value.result.result, - input: rawArgs, - }); - outputForModel = - modelOutputResult.type === 'content' - ? modelOutputResult.value - : JSON.stringify(value.result.result); - } else { - outputForModel = JSON.stringify(value.result.result); - } + const outputForModel = await this.computeToolOutputForModel(value); const executedOutput: models.FunctionCallOutputItem = { type: 'function_call_output' as const, diff --git a/packages/agent/src/lib/tool-executor.ts b/packages/agent/src/lib/tool-executor.ts index 9b7ab46..572fa0d 100644 --- a/packages/agent/src/lib/tool-executor.ts +++ b/packages/agent/src/lib/tool-executor.ts @@ -513,6 +513,111 @@ function formatHookError( }); } +/** + * Parse a raw `FunctionCallOutputItem.output` value for HITL processing. + * JSON-parses string payloads when possible; leaves content arrays and + * non-string inputs untouched. + */ +function parseRawFunctionCallOutput(raw: unknown): unknown { + if (typeof raw !== 'string') { + return raw; + } + try { + return JSON.parse(raw); + } catch { + return raw; + } +} + +/** Build a name → HITL tool map from the registered tools. */ +function buildHitlToolMap(tools: readonly Tool[]): Map { + const map = new Map(); + for (const t of tools) { + if (isHITLTool(t)) { + map.set(t.function.name, t); + } + } + return map; +} + +/** + * Build a callId → tool-name map from `function_call` items in an input + * array, so `function_call_output` items can be associated with their tool. + */ +function buildCallIdToNameMap( + input: Extract, +): Map { + const map = new Map(); + for (const item of input) { + if (isFunctionCallItem(item)) { + map.set(item.callId, item.name); + } + } + return map; +} + +type HookOutput = string | models.FunctionCallOutputItemOutputUnion1[]; + +/** + * Invoke `onResponseReceived`, validate the return against the tool's + * `outputSchema`, and convert the result (or any error) to a + * `FunctionCallOutputItem.output` payload. + */ +// biome-ignore lint: parameters match the internal API shape +async function invokeOnResponseReceived( + tool: HITLTool, + parsed: unknown, + context: TurnContext, + contextStore?: ToolContextStore, + sharedSchema?: $ZodObject<$ZodShape>, +): Promise { + const hook = tool.function.onResponseReceived; + if (!hook) { + throw new Error('invokeOnResponseReceived called without onResponseReceived hook'); + } + const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema); + try { + const hookResult = await Promise.resolve(hook(parsed, executeContext)); + const validation = z4.safeParse(tool.function.outputSchema, hookResult); + if (!validation.success) { + return formatHookError(validation.error.message, parsed); + } + return toFunctionCallOutputValue(hookResult); + } catch (err) { + // Preserve the caller's original output alongside the hook error so + // the model can distinguish a hook failure from a tool-reported error. + return formatHookError(err instanceof Error ? err.message : String(err), parsed); + } +} + +/** + * Compute the (optional) replacement output for a single HITL + * `function_call_output` item. Returns `null` when the caller-supplied output + * is schema-conformant and should be left untouched. + */ +// biome-ignore lint: parameters match the internal API shape +async function computeHitlItemOutput( + tool: HITLTool, + item: models.FunctionCallOutputItem, + context: TurnContext, + contextStore?: ToolContextStore, + sharedSchema?: $ZodObject<$ZodShape>, +): Promise { + const parsed = parseRawFunctionCallOutput(item.output); + + if (tool.function.onResponseReceived) { + return invokeOnResponseReceived(tool, parsed, context, contextStore, sharedSchema); + } + + // No hook — validate the parsed raw output against outputSchema. On + // success, leave the item untouched; on failure, surface an error wrapper. + const validation = z4.safeParse(tool.function.outputSchema, parsed); + if (validation.success) { + return null; + } + return formatHookError(validation.error.message, parsed); +} + /** * Walk the input items and apply HITL per-tool validation plus any * `onResponseReceived` hooks. @@ -546,23 +651,12 @@ export async function applyOnResponseReceivedHooks( return input; } - const hitlTools = tools.filter((t): t is HITLTool => isHITLTool(t)); - if (hitlTools.length === 0) { + const hitlByName = buildHitlToolMap(tools); + if (hitlByName.size === 0) { return input; } - // All HITL tools participate now (validation runs even without a hook). - const hitlByName = new Map(); - for (const t of hitlTools) { - hitlByName.set(t.function.name, t); - } - // Build callId -> name map from function_call items in the array - const callIdToName = new Map(); - for (const item of input) { - if (isFunctionCallItem(item)) { - callIdToName.set(item.callId, item.name); - } - } + const callIdToName = buildCallIdToNameMap(input); // Element type of the array form of InputsUnion — use this so `rewritten` // is structurally assignable back to InputsUnion without an `as` cast. @@ -571,83 +665,24 @@ export async function applyOnResponseReceivedHooks( let changed = false; const rewritten: InputsArrayItem[] = []; for (const item of input) { - if (!isFunctionCallOutputItem(item)) { + const tool = isFunctionCallOutputItem(item) + ? hitlByName.get(callIdToName.get(item.callId) ?? '') + : undefined; + if (!tool || !isFunctionCallOutputItem(item)) { rewritten.push(item); continue; } - const toolName = callIdToName.get(item.callId); - if (!toolName) { - rewritten.push(item); - continue; - } - const tool = hitlByName.get(toolName); - if (!tool) { + + const newOutput = await computeHitlItemOutput(tool, item, context, contextStore, sharedSchema); + if (newOutput === null) { rewritten.push(item); continue; } - const raw = item.output; - // Parse string outputs as JSON when possible; leave content arrays and - // non-string inputs (e.g. already-parsed objects) alone. - let parsed: unknown = raw; - if (typeof raw === 'string') { - try { - parsed = JSON.parse(raw); - } catch { - parsed = raw; - } - } - - let newOutput: string | models.FunctionCallOutputItemOutputUnion1[]; - - if (tool.function.onResponseReceived) { - const executeContext = buildExecuteCtx(tool, context, contextStore, sharedSchema); - try { - const hookResult = await Promise.resolve( - tool.function.onResponseReceived(parsed, executeContext), - ); - // Validate the hook's return against outputSchema — the model must - // only see schema-conformant values. A validation failure here is - // surfaced the same way a hook throw is. - try { - validateToolOutput(tool.function.outputSchema, hookResult); - } catch (validationErr) { - newOutput = formatHookError( - validationErr instanceof Error ? validationErr.message : String(validationErr), - parsed, - ); - const replaced: models.FunctionCallOutputItem = { - ...item, - output: newOutput, - }; - rewritten.push(replaced); - changed = true; - continue; - } - newOutput = toFunctionCallOutputValue(hookResult); - } catch (err) { - // Preserve the caller's original output alongside the hook error so - // the model can distinguish a hook failure from a tool-reported error. - newOutput = formatHookError(err instanceof Error ? err.message : String(err), parsed); - } - } else { - // No hook — validate the parsed raw output against outputSchema. - // On success, leave the item untouched (same as the pre-change - // behavior). On failure, replace with an error wrapper so the model - // sees why the caller-supplied payload was rejected. - const parseResult = z4.safeParse(tool.function.outputSchema, parsed); - if (parseResult.success) { - rewritten.push(item); - continue; - } - newOutput = formatHookError(parseResult.error.message, parsed); - } - - const replaced: models.FunctionCallOutputItem = { + rewritten.push({ ...item, output: newOutput, - }; - rewritten.push(replaced); + }); changed = true; }