From 5cc02278283ec858f2a729ca5b56f08e4622d001 Mon Sep 17 00:00:00 2001 From: shaun0927 <70629228+shaun0927@users.noreply.github.com> Date: Wed, 13 May 2026 00:31:08 +0900 Subject: [PATCH] Recover failed plans with opt-in safe candidates Add bounded read-only recovery attempts for compiled plans so safe state-refresh candidates can repair stale or blocked steps under explicit opt-in budgets.\n\nConstraint: Live browser recovery must be opt-in, budgeted, and limited to read-only candidates.\nRejected: General MCTS/live browser branching | unsafe for authenticated sessions and unnecessary for the current harness goal.\nConfidence: medium\nScope-risk: moderate\nDirective: Do not widen candidate execution beyond read-only tools without an explicit destructive-action gate and live verification.\nTested: npm test -- --runTestsByPath tests/orchestration/plan-executor-bounded-recovery.test.ts; npm run build\nNot-tested: Full execute_plan MCP transcript against Chrome fixture. --- src/orchestration/plan-executor.ts | 244 +++++++++++++----- src/tools/orchestration.ts | 12 +- src/types/plan-cache.ts | 35 ++- .../plan-executor-bounded-recovery.test.ts | 72 ++++++ 4 files changed, 293 insertions(+), 70 deletions(-) create mode 100644 tests/orchestration/plan-executor-bounded-recovery.test.ts diff --git a/src/orchestration/plan-executor.ts b/src/orchestration/plan-executor.ts index 3138164f..3da99142 100644 --- a/src/orchestration/plan-executor.ts +++ b/src/orchestration/plan-executor.ts @@ -8,11 +8,14 @@ import { MCPResult, ToolHandler } from '../types/mcp'; import { CompiledPlan, - CompiledStep, - PlanErrorHandler, - PlanExecutionResult, -} from '../types/plan-cache'; -import { withTimeout } from '../utils/with-timeout'; + CompiledStep, + PlanErrorHandler, + PlanExecutionOptions, + PlanExecutionResult, + PlanRecoveryAttempt, +} from '../types/plan-cache'; +import { rankRecoveryCandidates } from '../recovery'; +import { withTimeout } from '../utils/with-timeout'; /** * Recursively substitute ${varName} templates in a value using the params map. @@ -137,13 +140,15 @@ export class PlanExecutor { this.toolResolver = toolResolver; } - async execute( - plan: CompiledPlan, - sessionId: string, - runtimeParams: Record - ): Promise { - const startTime = Date.now(); - let stepsExecuted = 0; + async execute( + plan: CompiledPlan, + sessionId: string, + runtimeParams: Record, + options: PlanExecutionOptions = {} + ): Promise { + const startTime = Date.now(); + let stepsExecuted = 0; + const recoveryAttempts: PlanRecoveryAttempt[] = []; // 1. Build params map: plan defaults first, runtime overrides on top const params: Record = {}; @@ -154,14 +159,25 @@ export class PlanExecutor { } Object.assign(params, runtimeParams); - const failure = (error: string): PlanExecutionResult => ({ - success: false, - planId: plan.id, - error, - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - }); + const withRecovery = (result: T): T => { + if (options.boundedRecovery?.enabled) { + result.recovery = { + enabled: true, + attempts: recoveryAttempts, + exhausted: countExecutedRecoveryAttempts(recoveryAttempts) >= (options.boundedRecovery.maxToolCalls ?? 2), + }; + } + return result; + }; + + const failure = (error: string): PlanExecutionResult => withRecovery({ + success: false, + planId: plan.id, + error, + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + }); // 2. Execute each step sequentially for (const step of plan.steps) { @@ -200,14 +216,18 @@ export class PlanExecutor { params, stepsExecuted ); - if (recovered !== null) { - stepsExecuted = recovered.stepsExecuted; - // Merge any params updates from recovery into our params - Object.assign(params, recovered.params); - continue; - } - - return failure(`Step ${step.order} (${step.tool}) failed: ${errMsg}`); + if (recovered !== null) { + stepsExecuted = recovered.stepsExecuted; + // Merge any params updates from recovery into our params + Object.assign(params, recovered.params); + continue; + } + + const bounded = await this.tryBoundedRecovery(step, errMsg, sessionId, params, options, recoveryAttempts); + stepsExecuted += bounded.stepsExecuted; + if (bounded.recovered) continue; + + return failure(`Step ${step.order} (${step.tool}) failed: ${errMsg}`); } // d. Check for error result @@ -223,13 +243,17 @@ export class PlanExecutor { params, stepsExecuted ); - if (recovered !== null) { - stepsExecuted = recovered.stepsExecuted; - Object.assign(params, recovered.params); - continue; - } - - return failure(`Step ${step.order} (${step.tool}) returned error: ${errMsg}`); + if (recovered !== null) { + stepsExecuted = recovered.stepsExecuted; + Object.assign(params, recovered.params); + continue; + } + + const bounded = await this.tryBoundedRecovery(step, errMsg, sessionId, params, options, recoveryAttempts); + stepsExecuted += bounded.stepsExecuted; + if (bounded.recovered) continue; + + return failure(`Step ${step.order} (${step.tool}) returned error: ${errMsg}`); } // e. Check for empty result (before storing) — may trigger empty_result handler @@ -242,13 +266,16 @@ export class PlanExecutor { params, stepsExecuted ); - if (recovered !== null) { - stepsExecuted = recovered.stepsExecuted; - Object.assign(params, recovered.params); - continue; - } - // No handler for empty — treat as non-fatal, just skip storing - } + if (recovered !== null) { + stepsExecuted = recovered.stepsExecuted; + Object.assign(params, recovered.params); + continue; + } + const bounded = await this.tryBoundedRecovery(step, 'empty result', sessionId, params, options, recoveryAttempts); + stepsExecuted += bounded.stepsExecuted; + if (bounded.recovered) continue; + // No handler for empty — treat as non-fatal, just skip storing + } // f. Parse and store result if requested if (step.parseResult && step.parseResult.storeAs) { @@ -270,29 +297,103 @@ export class PlanExecutor { const criteriaError = validateSuccessCriteria(plan.successCriteria, params); if (criteriaError) { console.error(`[PlanExecutor] Success criteria failed for plan=${plan.id}: ${criteriaError}`); - return { - success: false, - planId: plan.id, - error: `Success criteria not met: ${criteriaError}`, - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - }; + return withRecovery({ + success: false, + planId: plan.id, + error: `Success criteria not met: ${criteriaError}`, + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + }); } // 4. Return success with all collected params as data - return { - success: true, - planId: plan.id, - data: params, - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - }; - } - - /** - * Attempt to find and run a recovery handler for a given condition. + return withRecovery({ + success: true, + planId: plan.id, + data: params, + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + }); + } + + private async tryBoundedRecovery( + failedStep: CompiledStep, + errorText: string, + sessionId: string, + params: Record, + options: PlanExecutionOptions, + recoveryAttempts: PlanRecoveryAttempt[], + ): Promise<{ recovered: boolean; stepsExecuted: number }> { + const config = options.boundedRecovery; + if (!config?.enabled) return { recovered: false, stepsExecuted: 0 }; + + const maxToolCalls = Math.max(0, config.maxToolCalls ?? 2); + if (countExecutedRecoveryAttempts(recoveryAttempts) >= maxToolCalls) { + return { recovered: false, stepsExecuted: 0 }; + } + + const candidates = rankRecoveryCandidates({ + toolName: failedStep.tool, + resultText: errorText, + isError: true, + recentCalls: [{ toolName: failedStep.tool, result: 'error', error: errorText }], + maxCandidates: config.maxCandidates ?? 3, + }); + + let executed = 0; + for (const candidate of candidates) { + if (countExecutedRecoveryAttempts(recoveryAttempts) >= maxToolCalls) break; + if (candidate.risk !== 'read_only' || candidate.blockedReason) { + recoveryAttempts.push({ + tool: candidate.tool, + status: 'blocked', + reason: candidate.blockedReason ?? `risk ${candidate.risk} is not allowed for bounded recovery`, + }); + continue; + } + + const handler = this.toolResolver(candidate.tool); + if (!handler) { + recoveryAttempts.push({ tool: candidate.tool, status: 'failed', reason: 'tool handler not found' }); + continue; + } + + const args = buildSafeRecoveryArgs(candidate.tool, params); + if (!args) { + recoveryAttempts.push({ tool: candidate.tool, status: 'blocked', reason: 'no safe argument template available' }); + continue; + } + + try { + const result = await withTimeout( + handler(sessionId, args), + config.perCandidateTimeoutMs ?? 5000, + `bounded recovery tool=${candidate.tool} for step=${failedStep.order}`, + ); + executed++; + if (!result.isError && !isEmptyResult(result)) { + recoveryAttempts.push({ tool: candidate.tool, status: 'success', reason: candidate.reason }); + return { recovered: true, stepsExecuted: executed }; + } + recoveryAttempts.push({ tool: candidate.tool, status: 'failed', reason: 'candidate returned empty or error result' }); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + recoveryAttempts.push({ + tool: candidate.tool, + status: /timed out/i.test(message) ? 'timeout' : 'failed', + reason: candidate.reason, + error: message, + }); + } + } + + return { recovered: false, stepsExecuted: executed }; + } + + /** + * Attempt to find and run a recovery handler for a given condition. * Returns updated stepsExecuted + params snapshot on success, null if no handler. */ private async tryRecovery( @@ -363,5 +464,20 @@ export class PlanExecutor { } return { stepsExecuted, params }; - } -} + } +} + +function countExecutedRecoveryAttempts(attempts: PlanRecoveryAttempt[]): number { + return attempts.filter((attempt) => attempt.status !== 'blocked').length; +} + +function buildSafeRecoveryArgs(tool: string, params: Record): Record | null { + const tabId = typeof params.tabId === 'string' ? params.tabId : undefined; + switch (tool) { + case 'read_page': + case 'tabs_context': + return tabId ? { tabId } : {}; + default: + return null; + } +} diff --git a/src/tools/orchestration.ts b/src/tools/orchestration.ts index 16923b33..0706af8b 100644 --- a/src/tools/orchestration.ts +++ b/src/tools/orchestration.ts @@ -666,6 +666,10 @@ const executePlanDefinition: MCPToolDefinition = { type: 'string', description: 'Tab ID to execute the plan against', }, + boundedRecovery: { + type: 'boolean', + description: 'Opt-in: try a small, safety-gated read-only recovery search when a plan step fails', + }, params: { type: 'object', description: 'Runtime params merged with plan defaults', @@ -684,6 +688,7 @@ const executePlanHandler: ToolHandler = async ( const planId = args.planId as string; const tabId = args.tabId as string; const runtimeParams = (args.params as Record) || {}; + const boundedRecovery = args.boundedRecovery === true; if (!planId || !tabId) { return { @@ -749,7 +754,11 @@ const executePlanHandler: ToolHandler = async ( // Execute the plan const mergedParams = { tabId, ...runtimeParams }; - const result = await executor.execute(plan, sessionId, mergedParams); + const result = await executor.execute(plan, sessionId, mergedParams, { + boundedRecovery: boundedRecovery + ? { enabled: true, maxCandidates: 3, maxToolCalls: 2, perCandidateTimeoutMs: 5000 } + : undefined, + }); // Update stats registry.updateStats(planId, result.success, result.durationMs); @@ -765,6 +774,7 @@ const executePlanHandler: ToolHandler = async ( durationMs: result.durationMs, data: result.data, error: result.error, + recovery: result.recovery, message: result.success ? `Plan "${planId}" executed successfully in ${result.durationMs}ms (${result.stepsExecuted}/${result.totalSteps} steps)` : `Plan "${planId}" failed: ${result.error}. Consider manual execution.`, diff --git a/src/types/plan-cache.ts b/src/types/plan-cache.ts index a52af178..7ba4046e 100644 --- a/src/types/plan-cache.ts +++ b/src/types/plan-cache.ts @@ -109,8 +109,31 @@ export interface PlanRegistryData { updatedAt: number; } -/** Result of plan execution */ -export interface PlanExecutionResult { +/** Bounded recovery telemetry for plan execution. */ +export interface PlanRecoveryAttempt { + tool: string; + status: 'success' | 'failed' | 'blocked' | 'timeout'; + reason: string; + error?: string; +} + +export interface PlanRecoverySummary { + enabled: boolean; + attempts: PlanRecoveryAttempt[]; + exhausted: boolean; +} + +export interface PlanExecutionOptions { + boundedRecovery?: { + enabled: boolean; + maxCandidates?: number; + maxToolCalls?: number; + perCandidateTimeoutMs?: number; + }; +} + +/** Result of plan execution */ +export interface PlanExecutionResult { /** Whether the plan executed successfully */ success: boolean; /** Plan ID that was executed */ @@ -123,6 +146,8 @@ export interface PlanExecutionResult { durationMs: number; /** Number of steps executed */ stepsExecuted: number; - /** Total steps in plan */ - totalSteps: number; -} + /** Total steps in plan */ + totalSteps: number; + /** Optional bounded recovery summary when opt-in recovery is enabled. */ + recovery?: PlanRecoverySummary; +} diff --git a/tests/orchestration/plan-executor-bounded-recovery.test.ts b/tests/orchestration/plan-executor-bounded-recovery.test.ts new file mode 100644 index 00000000..cc98a57e --- /dev/null +++ b/tests/orchestration/plan-executor-bounded-recovery.test.ts @@ -0,0 +1,72 @@ +import { PlanExecutor } from '../../src/orchestration/plan-executor'; +import type { CompiledPlan } from '../../src/types/plan-cache'; +import type { MCPResult, ToolHandler } from '../../src/types/mcp'; + +const text = (value: string, isError = false): MCPResult => ({ content: [{ type: 'text', text: value }], isError }); + +function plan(): CompiledPlan { + return { + id: 'test-plan', + version: '1', + description: 'test', + parameters: {}, + steps: [ + { order: 1, tool: 'interact', args: { ref: 'old', tabId: '${tabId}' }, timeout: 1000 }, + { order: 2, tool: 'read_page', args: { tabId: '${tabId}' }, timeout: 1000, parseResult: { format: 'text', storeAs: 'page' } }, + ], + errorHandlers: [], + successCriteria: { requiredFields: ['page'] }, + }; +} + +describe('PlanExecutor bounded recovery', () => { + it('is disabled by default and preserves original failure', async () => { + const handlers = new Map([ + ['interact', async () => text('ref is stale', true)], + ['read_page', async () => text('fresh page')], + ]); + const executor = new PlanExecutor((name) => handlers.get(name) ?? null); + + const result = await executor.execute(plan(), 's1', { tabId: 'tab-1' }); + + expect(result.success).toBe(false); + expect(result.recovery).toBeUndefined(); + }); + + it('tries bounded read-only recovery and continues on success', async () => { + const calls: string[] = []; + const handlers = new Map([ + ['interact', async () => { calls.push('interact'); return text('ref is stale', true); }], + ['read_page', async () => { calls.push('read_page'); return text('fresh page with refs'); }], + ]); + const executor = new PlanExecutor((name) => handlers.get(name) ?? null); + + const result = await executor.execute(plan(), 's1', { tabId: 'tab-1' }, { + boundedRecovery: { enabled: true, maxCandidates: 2, maxToolCalls: 1, perCandidateTimeoutMs: 1000 }, + }); + + expect(result.success).toBe(true); + expect(calls).toEqual(['interact', 'read_page', 'read_page']); + expect(result.recovery?.attempts[0]).toMatchObject({ tool: 'read_page', status: 'success' }); + }); + + it('blocks side-effect candidates and enforces max tool-call budget', async () => { + const handlers = new Map([ + ['click', async () => text('CAPTCHA Access Denied Login page detected', true)], + ['read_page', async () => text('', false)], + ['tabs_context', async () => text('tab state')], + ]); + const captchaPlan = plan(); + captchaPlan.steps[0] = { order: 1, tool: 'click', args: { tabId: '${tabId}' }, timeout: 1000 }; + const executor = new PlanExecutor((name) => handlers.get(name) ?? null); + + const result = await executor.execute(captchaPlan, 's1', { tabId: 'tab-1' }, { + boundedRecovery: { enabled: true, maxCandidates: 3, maxToolCalls: 1, perCandidateTimeoutMs: 1000 }, + }); + + expect(result.success).toBe(false); + expect(result.recovery?.attempts.length).toBe(1); + expect(result.recovery?.exhausted).toBe(true); + expect(result.recovery?.attempts[0].tool).toBe('read_page'); + }); +});