diff --git a/src/hints/hint-engine.ts b/src/hints/hint-engine.ts index 589c66681..fd44f975e 100644 --- a/src/hints/hint-engine.ts +++ b/src/hints/hint-engine.ts @@ -18,6 +18,7 @@ import type { ToolCallEvent } from '../dashboard/types'; import type { ActivityTracker } from '../dashboard/activity-tracker'; import { PatternLearner } from './pattern-learner'; import { ProgressTracker } from './progress-tracker.js'; +import { formatCandidateHint, rankRecoveryCandidates, type RecoveryCandidate } from '../recovery'; import { RepeatedCallDetector } from './repeated-call-detector.js'; import { errorRecoveryRules } from './rules/error-recovery'; import { blockingPageRules } from './rules/blocking-page'; @@ -85,6 +86,7 @@ export interface HintResult { tool?: string; reason: string; }; + candidates?: RecoveryCandidate[]; context?: { element?: string; coordinates?: string; @@ -238,9 +240,11 @@ export class HintEngine { const key = escalationKey('progress-tracker-stuck'); const fireCount = (this.hintEscalation.get(key) || 0) + 1; this.hintEscalation.set(key, fireCount); + const candidates = rankRecoveryCandidates({ toolName, resultText, isError, recentCalls }); const rawHintText = 'STOP — you are stuck. The last several tool calls made no meaningful progress ' + '(errors, stale refs, auth redirects, or timeouts). ' + 'Step back and try a completely different approach, or ask the user for help.' + + formatCandidateHint(candidates) + (ledgerHint ? ` ${ledgerHint}` : ''); const severity = fireCount >= 2 ? 'critical' as const : 'warning' as const; this.log({ timestamp: Date.now(), toolName, isError, matchedRule: 'progress-tracker-stuck', hint: rawHintText, severity, fireCount }); @@ -251,6 +255,7 @@ export class HintEngine { fireCount, hint: this.formatHintMessage(severity, rawHintText, fireCount), rawHint: rawHintText, + ...(candidates.length > 0 && { candidates }), }; } @@ -258,8 +263,10 @@ export class HintEngine { const key = escalationKey('progress-tracker-stalling'); const fireCount = (this.hintEscalation.get(key) || 0) + 1; this.hintEscalation.set(key, fireCount); + const candidates = rankRecoveryCandidates({ toolName, resultText, isError, recentCalls }); const rawHintText = 'Warning: recent tool calls are not making progress. ' + 'Consider trying a different approach before getting stuck.' + + formatCandidateHint(candidates) + (ledgerHint ? ` ${ledgerHint}` : ''); const severity = this.getSeverity(fireCount); this.log({ timestamp: Date.now(), toolName, isError, matchedRule: 'progress-tracker-stalling', hint: rawHintText, severity, fireCount }); @@ -270,6 +277,7 @@ export class HintEngine { fireCount, hint: this.formatHintMessage(severity, rawHintText, fireCount), rawHint: rawHintText, + ...(candidates.length > 0 && { candidates }), }; } diff --git a/src/mcp-server.ts b/src/mcp-server.ts index e465ae5d1..2b2e542bd 100644 --- a/src/mcp-server.ts +++ b/src/mcp-server.ts @@ -2050,6 +2050,7 @@ export class MCPServer { rule: hintResult.rule, fireCount: hintResult.fireCount, ...(hintResult.suggestion && { suggestion: hintResult.suggestion }), + ...(hintResult.candidates && { candidates: hintResult.candidates }), ...(hintResult.context && { context: hintResult.context }), }; const content = (result as Record).content; @@ -2258,6 +2259,7 @@ export class MCPServer { rule: hintResult.rule, fireCount: hintResult.fireCount, ...(hintResult.suggestion && { suggestion: hintResult.suggestion }), + ...(hintResult.candidates && { candidates: hintResult.candidates }), ...(hintResult.context && { context: hintResult.context }), }; if (Array.isArray(errResult.content)) { diff --git a/src/orchestration/plan-executor.ts b/src/orchestration/plan-executor.ts index 564c883e7..e51f5cf32 100644 --- a/src/orchestration/plan-executor.ts +++ b/src/orchestration/plan-executor.ts @@ -6,229 +6,247 @@ */ import { MCPResult, ToolHandler } from '../types/mcp'; -import { - CompiledPlan, - CompiledStep, - PlanErrorHandler, - PlanExecutionOptions, - PlanExecutionResult, - PlanFinalVerificationResult, - PlanStepEvidence, -} from '../types/plan-cache'; -import { evaluate } from '../contracts/evaluate'; -import type { EvalContext, NetworkLogEntry } from '../contracts/eval-context'; -import { evaluateTaskSignature, preflightAllowedTools } from '../contracts/task-signature'; -import type { TaskSignatureToolCallSummary } from '../contracts/task-signature'; -import { withTimeout } from '../utils/with-timeout'; - -const SAFE_CONTRACT_MAX_RECOVERY_STEPS = 10; - -function isSafeContractPlan(plan: CompiledPlan): boolean { - return plan.contractVersion === 2 || Array.isArray(plan.allowedTools); -} - -function validateCompiledPlanContract(plan: CompiledPlan, toolResolver: (toolName: string) => ToolHandler | null): string | null { - if (!isSafeContractPlan(plan)) return null; - const allowedTools = new Set(plan.allowedTools || []); - const allSteps = [ - ...plan.steps.map(step => ({ step, source: 'plan' })), - ...plan.errorHandlers.flatMap(handler => handler.steps.map(step => ({ step, source: `errorHandler:${handler.condition}` }))), - ]; - - if (plan.allowedTools && plan.allowedTools.length === 0) return 'allowedTools must not be empty for safe contract plans'; - if (!plan.successCriteria || Object.keys(plan.successCriteria).length === 0) { - return 'safe contract plans require explicit successCriteria'; - } - - for (const { step, source } of allSteps) { - if (!toolResolver(step.tool)) { - return `unknown tool "${step.tool}" in ${source}`; - } - if (allowedTools.size > 0 && !allowedTools.has(step.tool)) { - return `tool "${step.tool}" in ${source} is not in allowedTools`; - } - if (typeof step.timeout !== 'number' || !Number.isFinite(step.timeout) || step.timeout <= 0) { - return `step ${step.order} in ${source} is missing a positive timeout`; - } - const badTemplate = findMalformedTemplate(step.args); - if (badTemplate) return `malformed substitution "${badTemplate}" in step ${step.order}`; - } - - for (const handler of plan.errorHandlers) { - if (handler.steps.length > SAFE_CONTRACT_MAX_RECOVERY_STEPS) { - return `recovery handler "${handler.condition}" exceeds ${SAFE_CONTRACT_MAX_RECOVERY_STEPS} steps`; - } - } - - return null; -} - -function findMalformedTemplate(value: unknown): string | null { - if (typeof value === 'string') { - const matches = value.match(/\$\{[^}]*\}?/g) || []; - for (const match of matches) { - if (!/^\$\{[A-Za-z_][A-Za-z0-9_]*\}$/.test(match)) return match; - } - return null; - } - if (Array.isArray(value)) { - for (const item of value) { - const bad = findMalformedTemplate(item); - if (bad) return bad; - } - } else if (value !== null && typeof value === 'object') { - for (const item of Object.values(value as Record)) { - const bad = findMalformedTemplate(item); - if (bad) return bad; - } - } - return null; -} - -function summarizeMcpResult(mcpResult: MCPResult): string { - const text = mcpResult.content?.[0]?.text || ''; - return text.replace(/\s+/g, ' ').trim().slice(0, 240); -} - -interface SnapshotInput { - url?: string; - dom_text?: string | null | Record; - dom_count?: Record; - network?: NetworkLogEntry[]; - screenshot_png_base64?: string; - has_open_dialog?: boolean; - captured_at?: number; - timestamp?: number; -} - -function buildSnapshotEvalContext(snapshot: SnapshotInput): EvalContext { - const domTextBySelector = typeof snapshot.dom_text === 'object' && snapshot.dom_text !== null - ? snapshot.dom_text as Record - : undefined; - const bodyText = typeof snapshot.dom_text === 'string' || snapshot.dom_text === null - ? snapshot.dom_text - : undefined; - const screenshot = snapshot.screenshot_png_base64 ? Buffer.from(snapshot.screenshot_png_base64, 'base64') : null; - return { - async url() { return snapshot.url ?? ''; }, - async domText(selector) { - if (!domTextBySelector) return bodyText ?? null; - const requestedSelector = selector ?? 'body'; - if (requestedSelector === 'body') return domTextBySelector.body ?? null; - return Object.prototype.hasOwnProperty.call(domTextBySelector, requestedSelector) - ? domTextBySelector[requestedSelector] ?? null - : null; - }, - async domCount(selector) { return snapshot.dom_count?.[selector] ?? 0; }, - async networkSince() { return snapshot.network ?? []; }, - async screenshotPng() { return screenshot; }, - async hasOpenDialog() { return snapshot.has_open_dialog ?? false; }, - }; -} - -function isSnapshotInput(value: unknown): value is SnapshotInput { - return typeof value === 'object' && value !== null; -} - -async function runFinalVerification( - plan: CompiledPlan, - params: Record, -): Promise { - const gate = plan.finalVerification; - if (!gate) return null; - const snapshotParam = gate.snapshotParam || 'finalSnapshot'; - const snapshot = params[snapshotParam]; - if (!isSnapshotInput(snapshot)) { - return { - passed: false, - snapshotParam, - assertions: [], - error: `final verification snapshot param missing or invalid: ${snapshotParam}`, - }; - } - - const capturedAt = typeof snapshot.captured_at === 'number' - ? snapshot.captured_at - : typeof snapshot.timestamp === 'number' - ? snapshot.timestamp - : undefined; - if (gate.freshnessMs !== undefined && capturedAt !== undefined && Date.now() - capturedAt > gate.freshnessMs) { - return { - passed: false, - snapshotParam, - assertions: [], - error: `final verification snapshot is stale: age ${Date.now() - capturedAt}ms exceeds ${gate.freshnessMs}ms`, - }; - } - - const unsupportedEvidence = (gate.requiredEvidence || []).filter(kind => !['dom', 'url', 'network', 'screenshot'].includes(kind)); - if (unsupportedEvidence.length > 0) { - return { - passed: false, - snapshotParam, - assertions: [], - error: `unsupported finalVerification.requiredEvidence: ${unsupportedEvidence.join(', ')}`, - }; - } - - const assertions: PlanFinalVerificationResult['assertions'] = []; - const ctx = buildSnapshotEvalContext(snapshot); - for (let i = 0; i < gate.finalAssertions.length; i++) { - const assertion = gate.finalAssertions[i]; - const result = await evaluate(assertion, ctx); - assertions.push({ index: i, passed: result.passed, evidence: result.evidence }); - if (!result.passed) { - return { - passed: false, - snapshotParam, - assertions, - failedAssertion: { index: i, assertion, evidence: result.evidence }, - }; - } - } - return { passed: true, snapshotParam, assertions }; -} - -/** - * Resolve a dotted path from a parsed plan value. Numeric path segments address - * array indexes so plans can bind values like matches.login.submit.ref or - * results.0.ref without evaluating arbitrary expressions. - */ -function getPathValue(value: unknown, path: string): unknown { - if (!path) return value; - let current = value; - for (const segment of path.split('.')) { - if (current === null || current === undefined) return undefined; - if (Array.isArray(current)) { - if (!/^\d+$/.test(segment)) return undefined; - current = current[Number(segment)]; - continue; - } - if (typeof current !== 'object') return undefined; - current = (current as Record)[segment]; - } - return current; -} - -function resolveParamPath(params: Record, path: string): unknown { - if (Object.prototype.hasOwnProperty.call(params, path)) return params[path]; - const [root, ...rest] = path.split('.'); - if (!Object.prototype.hasOwnProperty.call(params, root)) return undefined; - return getPathValue(params[root], rest.join('.')); -} - -/** - * Recursively substitute ${varName} templates in a value using the params map. - * Handles strings, objects, arrays, and dotted paths. Non-string primitives are returned as-is. - * Missing vars are left as-is (no crash). - */ -function substituteParams(value: unknown, params: Record): unknown { - if (typeof value === 'string') { - return value.replace(/\$\{([^}]+)\}/g, (match, varName) => { - const resolved = resolveParamPath(params, varName); - if (resolved === undefined) return match; - if (typeof resolved === 'string') return resolved; - return JSON.stringify(resolved); +import { + CompiledPlan, + CompiledStep, + PlanErrorHandler, + PlanExecutionOptions, + PlanExecutionResult, + PlanFinalVerificationResult, + PlanRecoveryAttempt, + PlanStepEvidence, +} from '../types/plan-cache'; +import { evaluate } from '../contracts/evaluate'; +import type { EvalContext, NetworkLogEntry } from '../contracts/eval-context'; +import { evaluateTaskSignature, preflightAllowedTools } from '../contracts/task-signature'; +import { rankRecoveryCandidates } from '../recovery'; +import type { TaskSignatureToolCallSummary } from '../contracts/task-signature'; +import { withTimeout } from '../utils/with-timeout'; + +const SAFE_CONTRACT_MAX_RECOVERY_STEPS = 10; + +function isSafeContractPlan(plan: CompiledPlan): boolean { + return plan.contractVersion === 2 || Array.isArray(plan.allowedTools); +} + +function validateCompiledPlanContract(plan: CompiledPlan, toolResolver: (toolName: string) => ToolHandler | null): string | null { + if (!isSafeContractPlan(plan)) return null; + const allowedTools = new Set(plan.allowedTools || []); + const allSteps = [ + ...plan.steps.map(step => ({ step, source: 'plan' })), + ...plan.errorHandlers.flatMap(handler => handler.steps.map(step => ({ step, source: `errorHandler:${handler.condition}` }))), + ]; + + if (plan.allowedTools && plan.allowedTools.length === 0) return 'allowedTools must not be empty for safe contract plans'; + if (!plan.successCriteria || Object.keys(plan.successCriteria).length === 0) { + return 'safe contract plans require explicit successCriteria'; + } + + for (const { step, source } of allSteps) { + if (!toolResolver(step.tool)) { + return `unknown tool "${step.tool}" in ${source}`; + } + if (allowedTools.size > 0 && !allowedTools.has(step.tool)) { + return `tool "${step.tool}" in ${source} is not in allowedTools`; + } + if (typeof step.timeout !== 'number' || !Number.isFinite(step.timeout) || step.timeout <= 0) { + return `step ${step.order} in ${source} is missing a positive timeout`; + } + const badTemplate = findMalformedTemplate(step.args); + if (badTemplate) return `malformed substitution "${badTemplate}" in step ${step.order}`; + } + + for (const handler of plan.errorHandlers) { + if (handler.steps.length > SAFE_CONTRACT_MAX_RECOVERY_STEPS) { + return `recovery handler "${handler.condition}" exceeds ${SAFE_CONTRACT_MAX_RECOVERY_STEPS} steps`; + } + } + + return null; +} + +function findMalformedTemplate(value: unknown): string | null { + if (typeof value === 'string') { + const matches = value.match(/\$\{[^}]*\}?/g) || []; + for (const match of matches) { + if (!/^\$\{[A-Za-z_][A-Za-z0-9_]*\}$/.test(match)) return match; + } + return null; + } + if (Array.isArray(value)) { + for (const item of value) { + const bad = findMalformedTemplate(item); + if (bad) return bad; + } + } else if (value !== null && typeof value === 'object') { + for (const item of Object.values(value as Record)) { + const bad = findMalformedTemplate(item); + if (bad) return bad; + } + } + return null; +} + +function summarizeMcpResult(mcpResult: MCPResult): string { + const text = mcpResult.content?.[0]?.text || ''; + return text.replace(/\s+/g, ' ').trim().slice(0, 240); +} + +interface SnapshotInput { + url?: string; + dom_text?: string | null | Record; + dom_count?: Record; + network?: NetworkLogEntry[]; + screenshot_png_base64?: string; + has_open_dialog?: boolean; + captured_at?: number; + timestamp?: number; +} + +function buildSnapshotEvalContext(snapshot: SnapshotInput): EvalContext { + const domTextBySelector = typeof snapshot.dom_text === 'object' && snapshot.dom_text !== null + ? snapshot.dom_text as Record + : undefined; + const bodyText = typeof snapshot.dom_text === 'string' || snapshot.dom_text === null + ? snapshot.dom_text + : undefined; + const screenshot = snapshot.screenshot_png_base64 ? Buffer.from(snapshot.screenshot_png_base64, 'base64') : null; + return { + async url() { return snapshot.url ?? ''; }, + async domText(selector) { + if (!domTextBySelector) return bodyText ?? null; + const requestedSelector = selector ?? 'body'; + if (requestedSelector === 'body') return domTextBySelector.body ?? null; + return Object.prototype.hasOwnProperty.call(domTextBySelector, requestedSelector) + ? domTextBySelector[requestedSelector] ?? null + : null; + }, + async domCount(selector) { return snapshot.dom_count?.[selector] ?? 0; }, + async networkSince() { return snapshot.network ?? []; }, + async screenshotPng() { return screenshot; }, + async hasOpenDialog() { return snapshot.has_open_dialog ?? false; }, + }; +} + +function isSnapshotInput(value: unknown): value is SnapshotInput { + return typeof value === 'object' && value !== null; +} + +async function runFinalVerification( + plan: CompiledPlan, + params: Record, +): Promise { + const gate = plan.finalVerification; + if (!gate) return null; + const snapshotParam = gate.snapshotParam || 'finalSnapshot'; + const snapshot = params[snapshotParam]; + if (!isSnapshotInput(snapshot)) { + return { + passed: false, + snapshotParam, + assertions: [], + error: `final verification snapshot param missing or invalid: ${snapshotParam}`, + }; + } + + const capturedAt = typeof snapshot.captured_at === 'number' + ? snapshot.captured_at + : typeof snapshot.timestamp === 'number' + ? snapshot.timestamp + : undefined; + if (gate.freshnessMs !== undefined && capturedAt !== undefined && Date.now() - capturedAt > gate.freshnessMs) { + return { + passed: false, + snapshotParam, + assertions: [], + error: `final verification snapshot is stale: age ${Date.now() - capturedAt}ms exceeds ${gate.freshnessMs}ms`, + }; + } + + const unsupportedEvidence = (gate.requiredEvidence || []).filter(kind => !['dom', 'url', 'network', 'screenshot'].includes(kind)); + if (unsupportedEvidence.length > 0) { + return { + passed: false, + snapshotParam, + assertions: [], + error: `unsupported finalVerification.requiredEvidence: ${unsupportedEvidence.join(', ')}`, + }; + } + + const assertions: PlanFinalVerificationResult['assertions'] = []; + const ctx = buildSnapshotEvalContext(snapshot); + for (let i = 0; i < gate.finalAssertions.length; i++) { + const assertion = gate.finalAssertions[i]; + const result = await evaluate(assertion, ctx); + assertions.push({ index: i, passed: result.passed, evidence: result.evidence }); + if (!result.passed) { + return { + passed: false, + snapshotParam, + assertions, + failedAssertion: { index: i, assertion, evidence: result.evidence }, + }; + } + } + return { passed: true, snapshotParam, assertions }; +} + +/** + * Resolve a dotted path from a parsed plan value. Numeric path segments address + * array indexes so plans can bind values like matches.login.submit.ref or + * results.0.ref without evaluating arbitrary expressions. + */ +function getPathValue(value: unknown, path: string): unknown { + if (!path) return value; + let current = value; + for (const segment of path.split('.')) { + if (current === null || current === undefined) return undefined; + if (Array.isArray(current)) { + if (!/^\d+$/.test(segment)) return undefined; + current = current[Number(segment)]; + continue; + } + if (typeof current !== 'object') return undefined; + current = (current as Record)[segment]; + } + return current; +} + +function resolveParamPath(params: Record, path: string): unknown { + if (Object.prototype.hasOwnProperty.call(params, path)) return params[path]; + const [root, ...rest] = path.split('.'); + if (!Object.prototype.hasOwnProperty.call(params, root)) return undefined; + return getPathValue(params[root], rest.join('.')); +} + +/** + * Recursively substitute ${varName} templates in a value using the params map. + * Handles strings, objects, arrays, and dotted paths. Non-string primitives are returned as-is. + * Missing vars are left as-is (no crash). + */ + +function countExecutedRecoveryAttempts(attempts: PlanRecoveryAttempt[]): number { + return attempts.filter((attempt) => attempt.status !== 'blocked').length; +} + +function buildSafeRecoveryArgs(tool: string, params: Record): Record | null { + const tabId = typeof params.tabId === 'string' ? params.tabId : undefined; + switch (tool) { + case 'read_page': + case 'tabs_context': + return tabId ? { tabId } : {}; + default: + return null; + } +} + +function substituteParams(value: unknown, params: Record): unknown { + if (typeof value === 'string') { + return value.replace(/\$\{([^}]+)\}/g, (match, varName) => { + const resolved = resolveParamPath(params, varName); + if (resolved === undefined) return match; + if (typeof resolved === 'string') return resolved; + return JSON.stringify(resolved); }); } if (Array.isArray(value)) { @@ -267,10 +285,10 @@ function extractResult( } catch { parsed = text; } - - if (parseResult.extractField) { - parsed = getPathValue(parsed, parseResult.extractField); - } + + if (parseResult.extractField) { + parsed = getPathValue(parsed, parseResult.extractField); + } return parsed; } @@ -348,20 +366,21 @@ export class PlanExecutor { const startTime = Date.now(); let stepsExecuted = 0; const recentTools: TaskSignatureToolCallSummary[] = []; - const evidence: PlanStepEvidence[] = []; - - const contractError = validateCompiledPlanContract(plan, this.toolResolver); - if (contractError) { - return { - success: false, - planId: plan.id, - error: `Plan contract validation failed: ${contractError}`, - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - evidence, - }; - } + const evidence: PlanStepEvidence[] = []; + const recoveryAttempts: PlanRecoveryAttempt[] = []; + + const contractError = validateCompiledPlanContract(plan, this.toolResolver); + if (contractError) { + return { + success: false, + planId: plan.id, + error: `Plan contract validation failed: ${contractError}`, + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + evidence, + }; + } if (options.taskSignature) { const preflight = preflightAllowedTools( @@ -377,12 +396,12 @@ export class PlanExecutor { return { success: false, planId: plan.id, - error: preflight.reasons.join('; '), - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - evidence, - taskSignature: preflight, + error: preflight.reasons.join('; '), + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + evidence, + taskSignature: preflight, }; } } @@ -396,14 +415,25 @@ export class PlanExecutor { } Object.assign(params, runtimeParams); - const failure = (error: string, taskSignature?: PlanExecutionResult['taskSignature']): PlanExecutionResult => ({ + const withRecovery = (result: T): T => { + if (options.boundedRecovery?.enabled) { + result.recovery = { + enabled: true, + attempts: recoveryAttempts, + exhausted: countExecutedRecoveryAttempts(recoveryAttempts) >= (options.boundedRecovery.maxToolCalls ?? 2), + }; + } + return result; + }; + + const failure = (error: string, taskSignature?: PlanExecutionResult['taskSignature']): PlanExecutionResult => withRecovery({ success: false, planId: plan.id, error, durationMs: Date.now() - startTime, stepsExecuted, totalSteps: plan.steps.length, - evidence, + evidence, ...(taskSignature ? { taskSignature } : {}), }); @@ -424,24 +454,24 @@ export class PlanExecutor { // c. Call handler with timeout let mcpResult: MCPResult; - const stepStart = Date.now(); - try { - mcpResult = await withTimeout( - handler(sessionId, substitutedArgs), - step.timeout, - stepLabel - ); - stepsExecuted++; - const empty = isEmptyResult(mcpResult); - evidence.push({ - step: step.order, - tool: step.tool, - source: 'plan', - outcome: mcpResult.isError ? 'error' : empty ? 'empty' : 'success', - durationMs: Date.now() - stepStart, - summary: summarizeMcpResult(mcpResult), - }); - recentTools.push({ tool: step.tool, progressed: !empty && !mcpResult.isError }); + const stepStart = Date.now(); + try { + mcpResult = await withTimeout( + handler(sessionId, substitutedArgs), + step.timeout, + stepLabel + ); + stepsExecuted++; + const empty = isEmptyResult(mcpResult); + evidence.push({ + step: step.order, + tool: step.tool, + source: 'plan', + outcome: mcpResult.isError ? 'error' : empty ? 'empty' : 'success', + durationMs: Date.now() - stepStart, + summary: summarizeMcpResult(mcpResult), + }); + recentTools.push({ tool: step.tool, progressed: !empty && !mcpResult.isError }); if (options.taskSignature) { const taskStatus = await evaluateTaskSignature({ signature: options.taskSignature, @@ -453,12 +483,12 @@ export class PlanExecutor { return { success: true, planId: plan.id, - data: params, - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - taskSignature: taskStatus, - evidence, + data: params, + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + taskSignature: taskStatus, + evidence, }; } if (taskStatus.status !== 'continue') { @@ -467,25 +497,25 @@ export class PlanExecutor { } } catch (err) { const errMsg = err instanceof Error ? err.message : String(err); - evidence.push({ - step: step.order, - tool: step.tool, - source: 'plan', - outcome: 'error', - durationMs: Date.now() - stepStart, - summary: errMsg.slice(0, 240), - }); - console.error(`[PlanExecutor] Step failed at ${stepLabel}: ${errMsg}`); - - // Check for a matching error handler - const conditionKey = `step${step.order}_error`; - const recovered = await this.tryRecovery( - conditionKey, - plan.errorHandlers, - sessionId, - params, - stepsExecuted, - evidence + evidence.push({ + step: step.order, + tool: step.tool, + source: 'plan', + outcome: 'error', + durationMs: Date.now() - stepStart, + summary: errMsg.slice(0, 240), + }); + console.error(`[PlanExecutor] Step failed at ${stepLabel}: ${errMsg}`); + + // Check for a matching error handler + const conditionKey = `step${step.order}_error`; + const recovered = await this.tryRecovery( + conditionKey, + plan.errorHandlers, + sessionId, + params, + stepsExecuted, + evidence ); if (recovered !== null) { stepsExecuted = recovered.stepsExecuted; @@ -494,6 +524,10 @@ export class PlanExecutor { continue; } + const bounded = await this.tryBoundedRecovery(step, errMsg, sessionId, params, options, recoveryAttempts); + stepsExecuted += bounded.stepsExecuted; + if (bounded.recovered) continue; + return failure(`Step ${step.order} (${step.tool}) failed: ${errMsg}`); } @@ -505,18 +539,22 @@ export class PlanExecutor { const conditionKey = `step${step.order}_error`; const recovered = await this.tryRecovery( conditionKey, - plan.errorHandlers, - sessionId, - params, - stepsExecuted, - evidence - ); + plan.errorHandlers, + sessionId, + params, + stepsExecuted, + evidence + ); if (recovered !== null) { stepsExecuted = recovered.stepsExecuted; Object.assign(params, recovered.params); continue; } + const bounded = await this.tryBoundedRecovery(step, errMsg, sessionId, params, options, recoveryAttempts); + stepsExecuted += bounded.stepsExecuted; + if (bounded.recovered) continue; + return failure(`Step ${step.order} (${step.tool}) returned error: ${errMsg}`); } @@ -525,17 +563,20 @@ export class PlanExecutor { const conditionKey = `step${step.order}_empty_result`; const recovered = await this.tryRecovery( conditionKey, - plan.errorHandlers, - sessionId, - params, - stepsExecuted, - evidence - ); + plan.errorHandlers, + sessionId, + params, + stepsExecuted, + evidence + ); if (recovered !== null) { stepsExecuted = recovered.stepsExecuted; Object.assign(params, recovered.params); continue; } + const bounded = await this.tryBoundedRecovery(step, 'empty result', sessionId, params, options, recoveryAttempts); + stepsExecuted += bounded.stepsExecuted; + if (bounded.recovered) continue; // Compatibility/fail-safe (#1031): pre-existing cached plans treated an // empty, non-error tool result as a completed step unless the plan // explicitly supplied `stepN_empty_result` recovery. Keep that contract @@ -563,14 +604,14 @@ export class PlanExecutor { const criteriaError = validateSuccessCriteria(plan.successCriteria, params); if (criteriaError) { console.error(`[PlanExecutor] Success criteria failed for plan=${plan.id}: ${criteriaError}`); - return { + return withRecovery({ success: false, planId: plan.id, error: `Success criteria not met: ${criteriaError}`, durationMs: Date.now() - startTime, stepsExecuted, totalSteps: plan.steps.length, - evidence, + evidence, ...(options.taskSignature ? { taskSignature: await evaluateTaskSignature({ signature: options.taskSignature, @@ -579,51 +620,129 @@ export class PlanExecutor { toolCount: stepsExecuted, }) } : {}), - }; + }); + } + + // 4. Optional final Outcome Contract verification gate + const finalVerification = await runFinalVerification(plan, params); + if (finalVerification && !finalVerification.passed) { + return withRecovery({ + success: false, + planId: plan.id, + error: finalVerification.error || `Final verification failed at assertion ${finalVerification.failedAssertion?.index ?? 'unknown'}`, + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + evidence, + finalVerification, + ...(options.taskSignature + ? { taskSignature: await evaluateTaskSignature({ + signature: options.taskSignature, + recentTools, + elapsedMs: Date.now() - startTime, + toolCount: stepsExecuted, + }) } + : {}), + }); } - // 4. Optional final Outcome Contract verification gate - const finalVerification = await runFinalVerification(plan, params); - if (finalVerification && !finalVerification.passed) { - return { - success: false, - planId: plan.id, - error: finalVerification.error || `Final verification failed at assertion ${finalVerification.failedAssertion?.index ?? 'unknown'}`, - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - evidence, - finalVerification, - ...(options.taskSignature - ? { taskSignature: await evaluateTaskSignature({ - signature: options.taskSignature, - recentTools, - elapsedMs: Date.now() - startTime, - toolCount: stepsExecuted, - }) } - : {}), - }; - } - - // 5. Return success with all collected params as data - return { - success: true, - planId: plan.id, - data: params, - durationMs: Date.now() - startTime, - stepsExecuted, - totalSteps: plan.steps.length, - evidence, - ...(finalVerification ? { finalVerification } : {}), - ...(options.taskSignature - ? { taskSignature: await evaluateTaskSignature({ - signature: options.taskSignature, + // 5. Return success with all collected params as data + return withRecovery({ + success: true, + planId: plan.id, + data: params, + durationMs: Date.now() - startTime, + stepsExecuted, + totalSteps: plan.steps.length, + evidence, + ...(finalVerification ? { finalVerification } : {}), + ...(options.taskSignature + ? { taskSignature: await evaluateTaskSignature({ + signature: options.taskSignature, recentTools, elapsedMs: Date.now() - startTime, toolCount: stepsExecuted, }) } : {}), - }; + }); + } + + private async tryBoundedRecovery( + failedStep: CompiledStep, + errorText: string, + sessionId: string, + params: Record, + options: PlanExecutionOptions, + recoveryAttempts: PlanRecoveryAttempt[], + ): Promise<{ recovered: boolean; stepsExecuted: number }> { + const config = options.boundedRecovery; + if (!config?.enabled) return { recovered: false, stepsExecuted: 0 }; + + const maxToolCalls = Math.max(0, config.maxToolCalls ?? 2); + if (countExecutedRecoveryAttempts(recoveryAttempts) >= maxToolCalls) { + return { recovered: false, stepsExecuted: 0 }; + } + + const candidates = rankRecoveryCandidates({ + toolName: failedStep.tool, + resultText: errorText, + isError: true, + recentCalls: [{ toolName: failedStep.tool, result: 'error', error: errorText }], + maxCandidates: config.maxCandidates ?? 3, + }); + + let executed = 0; + for (const candidate of candidates) { + if (countExecutedRecoveryAttempts(recoveryAttempts) >= maxToolCalls) break; + if (candidate.risk !== 'read_only' || candidate.blockedReason) { + recoveryAttempts.push({ + tool: candidate.tool, + status: 'blocked', + reason: candidate.blockedReason ?? `risk ${candidate.risk} is not allowed for bounded recovery`, + }); + continue; + } + + const handler = this.toolResolver(candidate.tool); + if (!handler) { + recoveryAttempts.push({ tool: candidate.tool, status: 'failed', reason: 'tool handler not found' }); + continue; + } + + const args = buildSafeRecoveryArgs(candidate.tool, params); + if (!args) { + recoveryAttempts.push({ tool: candidate.tool, status: 'blocked', reason: 'no safe argument template available' }); + continue; + } + + try { + const result = await withTimeout( + handler(sessionId, args), + config.perCandidateTimeoutMs ?? 5000, + `bounded recovery tool=${candidate.tool} for step=${failedStep.order}`, + ); + executed++; + if (!result.isError && !isEmptyResult(result)) { + recoveryAttempts.push({ + tool: candidate.tool, + status: 'success', + reason: `${candidate.reason}; original step still requires retry`, + }); + return { recovered: false, stepsExecuted: executed }; + } + recoveryAttempts.push({ tool: candidate.tool, status: 'failed', reason: 'candidate returned empty or error result' }); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + recoveryAttempts.push({ + tool: candidate.tool, + status: /timed out/i.test(message) ? 'timeout' : 'failed', + reason: candidate.reason, + error: message, + }); + } + } + + return { recovered: false, stepsExecuted: executed }; } /** @@ -635,8 +754,8 @@ export class PlanExecutor { errorHandlers: PlanErrorHandler[], sessionId: string, params: Record, - currentStepsExecuted: number, - evidence?: PlanStepEvidence[] + currentStepsExecuted: number, + evidence?: PlanStepEvidence[] ): Promise<{ stepsExecuted: number; params: Record } | null> { const handler = errorHandlers.find((h) => h.condition === conditionKey); if (!handler) return null; @@ -659,23 +778,23 @@ export class PlanExecutor { const substitutedArgs = substituteParams(step.args, params) as Record; let mcpResult: MCPResult; - const stepStart = Date.now(); - try { - mcpResult = await withTimeout( - toolHandler(sessionId, substitutedArgs), - step.timeout, - stepLabel - ); - stepsExecuted++; - const empty = isEmptyResult(mcpResult); - evidence?.push({ - step: step.order, - tool: step.tool, - source: 'recovery', - outcome: mcpResult.isError ? 'error' : empty ? 'empty' : 'success', - durationMs: Date.now() - stepStart, - summary: summarizeMcpResult(mcpResult), - }); + const stepStart = Date.now(); + try { + mcpResult = await withTimeout( + toolHandler(sessionId, substitutedArgs), + step.timeout, + stepLabel + ); + stepsExecuted++; + const empty = isEmptyResult(mcpResult); + evidence?.push({ + step: step.order, + tool: step.tool, + source: 'recovery', + outcome: mcpResult.isError ? 'error' : empty ? 'empty' : 'success', + durationMs: Date.now() - stepStart, + summary: summarizeMcpResult(mcpResult), + }); } catch (err) { console.error( `[PlanExecutor] Recovery step failed at ${stepLabel}: ${ diff --git a/src/recovery/candidate-ranker.ts b/src/recovery/candidate-ranker.ts new file mode 100644 index 000000000..31ba9bc27 --- /dev/null +++ b/src/recovery/candidate-ranker.ts @@ -0,0 +1,188 @@ +/** Advisory recovery candidate ranking for stuck/stalling hints. */ +import { policyRankBoost, type RecoveryPolicyRecord } from './policy-learner'; +import { scoreRecoveryOutcome } from './reward-scorer'; + +export type RecoveryCandidateRisk = 'read_only' | 'reversible' | 'side_effect_possible'; + +export interface RecentToolCallLike { + toolName: string; + result: 'pending' | 'success' | 'error' | 'aborted'; + error?: string; + args?: Record; +} + +export interface RecoveryCandidate { + tool: string; + reason: string; + score: number; + risk: RecoveryCandidateRisk; + argsTemplate?: Record; + blockedReason?: string; +} + +export interface RecoveryCandidateRankInput { + toolName: string; + resultText: string; + isError: boolean; + recentCalls: RecentToolCallLike[]; + maxCandidates?: number; + policies?: RecoveryPolicyRecord[]; +} + +const BLIND_INTERACTION_TOOLS = new Set(['click', 'interact', 'computer', 'form_input', 'fill_form', 'javascript_tool']); +const READ_TOOLS = new Set(['read_page', 'tabs_context', 'find', 'query_dom']); + +export function rankRecoveryCandidates(input: RecoveryCandidateRankInput): RecoveryCandidate[] { + const text = `${input.resultText}\n${input.recentCalls.map((c) => c.error ?? '').join('\n')}`.toLowerCase(); + const candidates: RecoveryCandidate[] = []; + + const add = (candidate: Omit & { baseScore: number }) => { + if (candidate.blockedReason) { + candidates.push({ ...candidate, score: -1 }); + return; + } + const repeatedPenalty = repeatedToolCount(input.recentCalls, candidate.tool) * 0.08; + const sameFailedPenalty = candidate.tool === input.toolName && input.isError ? 0.25 : 0; + const riskPenalty = candidate.risk === 'read_only' ? 0 : candidate.risk === 'reversible' ? 0.08 : 0.25; + const evidence = scoreRecoveryOutcome({ + toolName: candidate.tool, + errorText: input.isError ? input.resultText : undefined, + resultText: input.resultText, + freshRefsDiscovered: candidate.tool === 'read_page' && isStaleOrElementFailure(text), + observationOnly: READ_TOOLS.has(candidate.tool), + repeatedFailureCount: repeatedFailureCount(input.recentCalls, input.toolName), + }); + const learnedBoost = policyRankBoost(input.policies, candidate.tool, candidate.risk); + const score = clamp(candidate.baseScore + evidence.score * 0.25 + learnedBoost - repeatedPenalty - sameFailedPenalty - riskPenalty); + candidates.push({ ...candidate, score }); + }; + + if (isBlockingOrAuth(text)) { + add({ + tool: 'read_page', + reason: 'Classify the blocking/auth state before any more interactions.', + risk: 'read_only', + baseScore: 0.78, + }); + add({ + tool: 'tabs_context', + reason: 'Verify the active tab and URL after the blocking redirect.', + risk: 'read_only', + baseScore: 0.62, + }); + if (BLIND_INTERACTION_TOOLS.has(input.toolName)) { + add({ + tool: input.toolName, + reason: 'Blind retry is unsafe on auth, CAPTCHA, or blocking pages.', + risk: 'side_effect_possible', + baseScore: -0.2, + blockedReason: 'blocking/auth signal present', + }); + } + return topCandidates(candidates, input.maxCandidates); + } + + if (isStaleOrElementFailure(text)) { + add({ + tool: 'read_page', + reason: 'Refresh actionable refs after stale/missing element evidence.', + risk: 'read_only', + baseScore: 0.9, + }); + add({ + tool: 'find', + reason: 'Re-resolve the target by accessible text or role instead of reusing an old ref.', + risk: 'read_only', + baseScore: 0.74, + }); + } + + if (isTimeoutOrNetwork(text)) { + add({ + tool: 'tabs_context', + reason: 'Check whether the tab survived the timeout before retrying.', + risk: 'read_only', + baseScore: 0.72, + }); + add({ + tool: 'read_page', + reason: 'Inspect partial page state; the navigation may have produced usable content.', + risk: 'read_only', + baseScore: 0.68, + }); + } + + if (input.isError && BLIND_INTERACTION_TOOLS.has(input.toolName)) { + add({ + tool: 'read_page', + reason: 'Observe the current page before another interaction attempt.', + risk: 'read_only', + baseScore: 0.7, + }); + } + + if (candidates.length === 0) { + add({ + tool: 'tabs_context', + reason: 'Step back and verify tab, URL, and page state before continuing.', + risk: 'read_only', + baseScore: 0.55, + }); + add({ + tool: 'read_page', + reason: 'Collect fresh page evidence instead of repeating the same call.', + risk: 'read_only', + baseScore: 0.5, + }); + } + + return topCandidates(candidates, input.maxCandidates); +} + +export function formatCandidateHint(candidates: RecoveryCandidate[]): string { + if (candidates.length === 0) return ''; + const actionable = candidates.filter((c) => !c.blockedReason).slice(0, 3); + if (actionable.length === 0) return ''; + return ' Suggested recovery order: ' + actionable + .map((c, i) => `${i + 1}) ${c.tool} — ${c.reason}`) + .join(' '); +} + +function topCandidates(candidates: RecoveryCandidate[], max = 3): RecoveryCandidate[] { + const deduped = new Map(); + for (const candidate of candidates) { + const current = deduped.get(candidate.tool); + if (!current || candidate.score > current.score) deduped.set(candidate.tool, candidate); + } + return Array.from(deduped.values()) + .sort((a, b) => { + if (a.blockedReason && !b.blockedReason) return 1; + if (!a.blockedReason && b.blockedReason) return -1; + return b.score - a.score; + }) + .slice(0, max); +} + +function repeatedToolCount(calls: RecentToolCallLike[], tool: string): number { + return calls.filter((call) => call.toolName === tool).length; +} + +function repeatedFailureCount(calls: RecentToolCallLike[], tool: string): number { + return calls.filter((call) => call.toolName === tool && (call.result === 'error' || call.result === 'aborted')).length; +} + +function isStaleOrElementFailure(text: string): boolean { + return /stale|element not found|ref .*not found|ref .*expired|no longer available|not interactive/.test(text); +} + +function isBlockingOrAuth(text: string): boolean { + return /captcha|access denied|forbidden|authredirect|login page detected|blocking page detected|bot-check|blocked by|network security/.test(text); +} + +function isTimeoutOrNetwork(text: string): boolean { + return /timed out|timeout|net::err_|protocol error|target closed/.test(text); +} + +function clamp(value: number): number { + return Math.max(-1, Math.min(1, Number(value.toFixed(3)))); +} diff --git a/src/recovery/index.ts b/src/recovery/index.ts index 66c8a869f..17aa619e2 100644 --- a/src/recovery/index.ts +++ b/src/recovery/index.ts @@ -17,3 +17,10 @@ export type { RecoveryTrajectoryNode, RecoveryTrajectoryNodeInput, } from './trajectory-ledger'; + + +export { formatCandidateHint, rankRecoveryCandidates } from './candidate-ranker'; +export type { RecoveryCandidate, RecoveryCandidateRankInput, RecoveryCandidateRisk, RecentToolCallLike } from './candidate-ranker'; + +export { policyRankBoost, RecoveryPolicyLearner } from './policy-learner'; +export type { RecoveryPolicyOutcome, RecoveryPolicyRecord, RecoveryPolicyLearnerOptions } from './policy-learner'; diff --git a/src/recovery/policy-learner.ts b/src/recovery/policy-learner.ts new file mode 100644 index 000000000..cbaea6420 --- /dev/null +++ b/src/recovery/policy-learner.ts @@ -0,0 +1,179 @@ +/** Evidence-backed recovery policy learning. Advisory only. */ +import * as fs from 'node:fs'; +import * as path from 'node:path'; + +export interface RecoveryPolicyOutcome { + failureFingerprint: string; + domain?: string; + triggerTool: string; + recoveryTool: string; + safetyClass: 'read_only' | 'reversible' | 'side_effect_possible'; + evidenceBacked: boolean; + succeeded: boolean; +} + +export interface RecoveryPolicyRecord { + id: string; + failureFingerprint: string; + domain?: string; + triggerTool: string; + recoveryTool: string; + safetyClass: RecoveryPolicyOutcome['safetyClass']; + attempts: number; + successes: number; + failures: number; + confidence: number; + promoted: boolean; + firstSeen: number; + lastSeen: number; +} + +interface PolicyStoreFile { + version: number; + updatedAt: number; + policies: RecoveryPolicyRecord[]; +} + +export interface RecoveryPolicyLearnerOptions { + filePath?: string; + minAttempts?: number; + minConfidence?: number; + maxPolicies?: number; +} + +export class RecoveryPolicyLearner { + private readonly filePath: string; + private readonly minAttempts: number; + private readonly minConfidence: number; + private readonly maxPolicies: number; + private policies = new Map(); + + constructor(options: RecoveryPolicyLearnerOptions = {}) { + this.filePath = options.filePath ?? path.join(process.cwd(), '.openchrome', 'recovery', 'learned-policies.json'); + this.minAttempts = options.minAttempts ?? 3; + this.minConfidence = options.minConfidence ?? 0.67; + this.maxPolicies = options.maxPolicies ?? 500; + this.load(); + } + + record(outcome: RecoveryPolicyOutcome): RecoveryPolicyRecord | null { + if (!outcome.evidenceBacked) return null; + if (!outcome.failureFingerprint || !outcome.triggerTool || !outcome.recoveryTool) return null; + + const key = policyKey(outcome); + const now = Date.now(); + let record = this.policies.get(key); + if (!record) { + record = { + id: key, + failureFingerprint: outcome.failureFingerprint, + domain: sanitizeDomain(outcome.domain), + triggerTool: outcome.triggerTool, + recoveryTool: outcome.recoveryTool, + safetyClass: outcome.safetyClass, + attempts: 0, + successes: 0, + failures: 0, + confidence: 0, + promoted: false, + firstSeen: now, + lastSeen: now, + }; + this.policies.set(key, record); + } + + record.attempts++; + if (outcome.succeeded) record.successes++; + else record.failures++; + record.lastSeen = now; + record.confidence = round(record.successes / record.attempts); + record.promoted = record.attempts >= this.minAttempts && record.confidence >= this.minConfidence; + this.enforceCap(); + this.save(); + return { ...record }; + } + + getPolicies(filter: { failureFingerprint?: string; domain?: string; triggerTool?: string } = {}): RecoveryPolicyRecord[] { + const domain = sanitizeDomain(filter.domain); + return Array.from(this.policies.values()) + .filter((policy) => policy.promoted) + .filter((policy) => !filter.failureFingerprint || policy.failureFingerprint === filter.failureFingerprint) + .filter((policy) => !domain || policy.domain === domain || policy.domain === undefined) + .filter((policy) => !filter.triggerTool || policy.triggerTool === filter.triggerTool) + .sort((a, b) => b.confidence - a.confidence || b.successes - a.successes) + .map((policy) => ({ ...policy })); + } + + clear(): void { + this.policies.clear(); + this.save(); + } + + private load(): void { + try { + const parsed = JSON.parse(fs.readFileSync(this.filePath, 'utf8')) as PolicyStoreFile; + if (!Array.isArray(parsed.policies)) return; + for (const policy of parsed.policies.slice(-this.maxPolicies)) { + this.policies.set(policy.id, policy); + } + } catch { + // No persisted policies yet. + } + } + + private save(): void { + try { + fs.mkdirSync(path.dirname(this.filePath), { recursive: true }); + const payload: PolicyStoreFile = { + version: 1, + updatedAt: Date.now(), + policies: Array.from(this.policies.values()), + }; + fs.writeFileSync(this.filePath, JSON.stringify(payload, null, 2), 'utf8'); + } catch (err) { + console.error(`[RecoveryPolicyLearner] save skipped: ${err instanceof Error ? err.message : String(err)}`); + } + } + + private enforceCap(): void { + if (this.policies.size <= this.maxPolicies) return; + const sorted = Array.from(this.policies.values()).sort((a, b) => a.lastSeen - b.lastSeen); + for (const policy of sorted.slice(0, this.policies.size - this.maxPolicies)) { + this.policies.delete(policy.id); + } + } +} + +export function policyRankBoost( + policies: RecoveryPolicyRecord[] | undefined, + recoveryTool: string, + safetyClass: RecoveryPolicyOutcome['safetyClass'], +): number { + if (!policies || policies.length === 0) return 0; + const policy = policies.find((item) => item.recoveryTool === recoveryTool && item.safetyClass === safetyClass); + if (!policy) return 0; + return Math.min(0.25, policy.confidence * 0.2 + Math.min(policy.successes, 5) * 0.01); +} + +function policyKey(outcome: RecoveryPolicyOutcome): string { + return [ + outcome.failureFingerprint, + sanitizeDomain(outcome.domain) ?? '*', + outcome.triggerTool, + outcome.recoveryTool, + outcome.safetyClass, + ].join('|'); +} + +function sanitizeDomain(domain: string | undefined): string | undefined { + if (!domain) return undefined; + try { + return new URL(domain.includes('://') ? domain : `https://${domain}`).hostname.toLowerCase(); + } catch { + return domain.toLowerCase().replace(/[^a-z0-9.-]/g, '').slice(0, 120) || undefined; + } +} + +function round(value: number): number { + return Number(value.toFixed(3)); +} diff --git a/src/tools/orchestration.ts b/src/tools/orchestration.ts index 2b638781c..9dec0d0c2 100644 --- a/src/tools/orchestration.ts +++ b/src/tools/orchestration.ts @@ -693,6 +693,10 @@ const executePlanDefinition: MCPToolDefinition = { type: 'string', description: 'Tab ID to execute the plan against', }, + boundedRecovery: { + type: 'boolean', + description: 'Opt-in: try a small, safety-gated read-only recovery search when a plan step fails', + }, params: { type: 'object', description: 'Runtime params merged with plan defaults', @@ -729,6 +733,7 @@ const executePlanHandler: ToolHandler = async ( const planId = args.planId as string; const tabId = args.tabId as string; const runtimeParams = (args.params as Record) || {}; + const boundedRecovery = args.boundedRecovery === true; const rawTaskSignature = args.taskSignature; const rawReflectionStrategy = args.reflectionStrategy; @@ -822,12 +827,12 @@ const executePlanHandler: ToolHandler = async ( // Execute the plan const mergedParams = { tabId, ...runtimeParams }; - const result = await executor.execute( - plan, - sessionId, - mergedParams, - taskSignatureResult?.ok ? { taskSignature: taskSignatureResult.value } : {} - ); + const result = await executor.execute(plan, sessionId, mergedParams, { + ...(taskSignatureResult?.ok ? { taskSignature: taskSignatureResult.value } : {}), + ...(boundedRecovery + ? { boundedRecovery: { enabled: true, maxCandidates: 3, maxToolCalls: 2, perCandidateTimeoutMs: 5000 } } + : {}), + }); // Update stats registry.updateStats(planId, result.success, result.durationMs); @@ -851,6 +856,7 @@ const executePlanHandler: ToolHandler = async ( durationMs: result.durationMs, data: result.data, error: result.error, + recovery: result.recovery, evidence: result.evidence, ...(result.taskSignature ? { taskSignature: result.taskSignature } : {}), ...(reflectionStrategy ? { reflectionStrategy } : {}), diff --git a/src/types/plan-cache.ts b/src/types/plan-cache.ts index a7cc2bd5d..1d8797889 100644 --- a/src/types/plan-cache.ts +++ b/src/types/plan-cache.ts @@ -5,29 +5,29 @@ * bypassing per-step agent LLM round-trips. */ -import type { BrowserTaskSignature, TaskSignatureStatus } from '../contracts/task-signature'; -import type { Assertion, Evidence } from '../contracts/types'; +import type { BrowserTaskSignature, TaskSignatureStatus } from '../contracts/task-signature'; +import type { Assertion, Evidence } from '../contracts/types'; /** A single step in a compiled plan */ -export type PlanStepRisk = 'low' | 'interactive' | 'destructive'; - +export type PlanStepRisk = 'low' | 'interactive' | 'destructive'; + export interface CompiledStep { /** Execution order (1-based) */ order: number; /** MCP tool name (e.g. "javascript_tool", "computer") */ tool: string; - /** Tool arguments — supports ${param} and ${param.path.to.value} template variables */ + /** Tool arguments — supports ${param} and ${param.path.to.value} template variables */ args: Record; /** Step-level timeout in milliseconds */ timeout: number; /** Whether to retry this step on failure */ retryOnFail?: boolean; - /** Risk classification for safe plan-as-code validation. */ - risk?: PlanStepRisk; + /** Risk classification for safe plan-as-code validation. */ + risk?: PlanStepRisk; /** How to parse and store the result for subsequent steps */ parseResult?: { format: 'json' | 'text'; - /** JSON field/path to extract from result */ + /** JSON field/path to extract from result */ extractField?: string; /** Variable name to store result for later steps */ storeAs?: string; @@ -45,33 +45,33 @@ export interface PlanErrorHandler { } /** Success criteria for plan validation */ -export interface PlanSuccessCriteria { - /** Minimum number of data items in result */ - minDataItems?: number; - /** Required fields in extracted data */ - requiredFields?: string[]; - /** Custom validation JS expression (evaluated against params) */ - customCheck?: string; -} - -export interface FinalVerificationGate { - /** Inline Outcome Contract assertions evaluated after successCriteria. */ - finalAssertions: Assertion[]; - /** Params key containing an oc_assert-compatible evidence.snapshot object. Default: finalSnapshot. */ - snapshotParam?: string; - /** Maximum age in ms for snapshot.captured_at / timestamp when present. */ - freshnessMs?: number; - /** Evidence kinds requested by caller. Currently supports bounded snapshot evidence only. */ - requiredEvidence?: Array<'dom' | 'url' | 'network' | 'screenshot' | 'phash'>; -} - -export interface PlanFinalVerificationResult { - passed: boolean; - snapshotParam: string; - assertions: Array<{ index: number; passed: boolean; evidence: Evidence }>; - failedAssertion?: { index: number; assertion: Assertion; evidence: Evidence }; - error?: string; -} +export interface PlanSuccessCriteria { + /** Minimum number of data items in result */ + minDataItems?: number; + /** Required fields in extracted data */ + requiredFields?: string[]; + /** Custom validation JS expression (evaluated against params) */ + customCheck?: string; +} + +export interface FinalVerificationGate { + /** Inline Outcome Contract assertions evaluated after successCriteria. */ + finalAssertions: Assertion[]; + /** Params key containing an oc_assert-compatible evidence.snapshot object. Default: finalSnapshot. */ + snapshotParam?: string; + /** Maximum age in ms for snapshot.captured_at / timestamp when present. */ + freshnessMs?: number; + /** Evidence kinds requested by caller. Currently supports bounded snapshot evidence only. */ + requiredEvidence?: Array<'dom' | 'url' | 'network' | 'screenshot' | 'phash'>; +} + +export interface PlanFinalVerificationResult { + passed: boolean; + snapshotParam: string; + assertions: Array<{ index: number; passed: boolean; evidence: Evidence }>; + failedAssertion?: { index: number; assertion: Assertion; evidence: Evidence }; + error?: string; +} /** Task pattern for matching incoming tasks to cached plans */ export interface TaskPattern { @@ -86,20 +86,20 @@ export interface TaskPattern { } /** A complete compiled plan — a cached sequence of tool calls */ -export interface PlanStepEvidence { - step: number; - tool: string; - source: 'plan' | 'recovery'; - outcome: 'success' | 'error' | 'empty'; - durationMs: number; - summary: string; -} - -export interface CompiledPlan { - /** Optional v2 safe contract gate. Legacy plans without this remain backward compatible. */ - contractVersion?: 2; - /** Explicit allow-list for tools that plan and recovery steps may invoke. */ - allowedTools?: string[]; +export interface PlanStepEvidence { + step: number; + tool: string; + source: 'plan' | 'recovery'; + outcome: 'success' | 'error' | 'empty'; + durationMs: number; + summary: string; +} + +export interface CompiledPlan { + /** Optional v2 safe contract gate. Legacy plans without this remain backward compatible. */ + contractVersion?: 2; + /** Explicit allow-list for tools that plan and recovery steps may invoke. */ + allowedTools?: string[]; /** Unique plan identifier */ id: string; /** Plan version for cache invalidation */ @@ -115,11 +115,11 @@ export interface CompiledPlan { steps: CompiledStep[]; /** Error handlers */ errorHandlers: PlanErrorHandler[]; - /** Success validation criteria */ - successCriteria: PlanSuccessCriteria; - /** Optional final Outcome Contract verification gate (#1031). */ - finalVerification?: FinalVerificationGate; -} + /** Success validation criteria */ + successCriteria: PlanSuccessCriteria; + /** Optional final Outcome Contract verification gate (#1031). */ + finalVerification?: FinalVerificationGate; +} /** Registry entry for a plan with usage statistics */ export interface PlanEntry { @@ -150,8 +150,27 @@ export interface PlanRegistryData { updatedAt: number; } -/** Result of plan execution */ +/** Bounded recovery telemetry for plan execution. */ +export interface PlanRecoveryAttempt { + tool: string; + status: 'success' | 'failed' | 'blocked' | 'timeout'; + reason: string; + error?: string; +} + +export interface PlanRecoverySummary { + enabled: boolean; + attempts: PlanRecoveryAttempt[]; + exhausted: boolean; +} + export interface PlanExecutionOptions { + boundedRecovery?: { + enabled: boolean; + maxCandidates?: number; + maxToolCalls?: number; + perCandidateTimeoutMs?: number; + }; /** Optional deterministic boundary for tool allow-list and loop/budget status. */ taskSignature?: BrowserTaskSignature; } @@ -171,10 +190,12 @@ export interface PlanExecutionResult { stepsExecuted: number; /** Total steps in plan */ totalSteps: number; - /** Present only when execution was bound to a BrowserTaskSignature. */ - taskSignature?: TaskSignatureStatus; - /** Present when a plan opted into final verification. */ - finalVerification?: PlanFinalVerificationResult; - /** Bounded execution evidence suitable for critic/outcome validation. */ - evidence?: PlanStepEvidence[]; -} + /** Optional bounded recovery summary when opt-in recovery is enabled. */ + recovery?: PlanRecoverySummary; + /** Present only when execution was bound to a BrowserTaskSignature. */ + taskSignature?: TaskSignatureStatus; + /** Present when a plan opted into final verification. */ + finalVerification?: PlanFinalVerificationResult; + /** Bounded execution evidence suitable for critic/outcome validation. */ + evidence?: PlanStepEvidence[]; +} diff --git a/tests/orchestration/plan-executor-bounded-recovery.test.ts b/tests/orchestration/plan-executor-bounded-recovery.test.ts new file mode 100644 index 000000000..1c350ea8f --- /dev/null +++ b/tests/orchestration/plan-executor-bounded-recovery.test.ts @@ -0,0 +1,93 @@ +import { PlanExecutor } from '../../src/orchestration/plan-executor'; +import type { CompiledPlan } from '../../src/types/plan-cache'; +import type { MCPResult, ToolHandler } from '../../src/types/mcp'; + +const text = (value: string, isError = false): MCPResult => ({ content: [{ type: 'text', text: value }], isError }); + +function plan(): CompiledPlan { + return { + id: 'test-plan', + version: '1', + description: 'test', + parameters: {}, + steps: [ + { order: 1, tool: 'interact', args: { ref: 'old', tabId: '${tabId}' }, timeout: 1000 }, + { order: 2, tool: 'read_page', args: { tabId: '${tabId}' }, timeout: 1000, parseResult: { format: 'text', storeAs: 'page' } }, + ], + errorHandlers: [], + successCriteria: { requiredFields: ['page'] }, + }; +} + +describe('PlanExecutor bounded recovery', () => { + it('is disabled by default and preserves original failure', async () => { + const handlers = new Map([ + ['interact', async () => text('ref is stale', true)], + ['read_page', async () => text('fresh page')], + ]); + const executor = new PlanExecutor((name) => handlers.get(name) ?? null); + + const result = await executor.execute(plan(), 's1', { tabId: 'tab-1' }); + + expect(result.success).toBe(false); + expect(result.recovery).toBeUndefined(); + }); + + it('tries bounded read-only recovery but does not skip retrying the failed step', async () => { + const calls: string[] = []; + const handlers = new Map([ + ['interact', async () => { calls.push('interact'); return text('ref is stale', true); }], + ['read_page', async () => { calls.push('read_page'); return text('fresh page with refs'); }], + ]); + const executor = new PlanExecutor((name) => handlers.get(name) ?? null); + + const result = await executor.execute(plan(), 's1', { tabId: 'tab-1' }, { + boundedRecovery: { enabled: true, maxCandidates: 2, maxToolCalls: 1, perCandidateTimeoutMs: 1000 }, + }); + + expect(result.success).toBe(false); + expect(calls).toEqual(['interact', 'read_page']); + expect(result.error).toContain('ref is stale'); + expect(result.recovery?.attempts[0]).toMatchObject({ tool: 'read_page', status: 'success' }); + }); + + it('blocks side-effect candidates and enforces max tool-call budget', async () => { + const handlers = new Map([ + ['click', async () => text('CAPTCHA Access Denied Login page detected', true)], + ['read_page', async () => text('', false)], + ['tabs_context', async () => text('tab state')], + ]); + const captchaPlan = plan(); + captchaPlan.steps[0] = { order: 1, tool: 'click', args: { tabId: '${tabId}' }, timeout: 1000 }; + const executor = new PlanExecutor((name) => handlers.get(name) ?? null); + + const result = await executor.execute(captchaPlan, 's1', { tabId: 'tab-1' }, { + boundedRecovery: { enabled: true, maxCandidates: 3, maxToolCalls: 1, perCandidateTimeoutMs: 1000 }, + }); + + expect(result.success).toBe(false); + expect(result.recovery?.attempts.length).toBe(1); + expect(result.recovery?.exhausted).toBe(true); + expect(result.recovery?.attempts[0].tool).toBe('read_page'); + }); + it('lets success criteria judge empty results when no empty-result recovery exists', async () => { + const handlers = new Map([ + ['read_page', async () => text('')], + ]); + const emptyPlan = plan(); + emptyPlan.steps = [ + { order: 1, tool: 'read_page', args: { tabId: '${tabId}' }, timeout: 1000, parseResult: { format: 'text', storeAs: 'page' } }, + ]; + emptyPlan.successCriteria = { requiredFields: ['page'] }; + const executor = new PlanExecutor((name) => handlers.get(name) ?? null); + + const result = await executor.execute(emptyPlan, 's1', { tabId: 'tab-1' }, { + boundedRecovery: { enabled: true, maxCandidates: 2, maxToolCalls: 1, perCandidateTimeoutMs: 1000 }, + }); + + expect(result.success).toBe(true); + expect(result.data).toMatchObject({ page: '' }); + expect(result.recovery?.attempts.every((attempt) => attempt.status !== 'success')).toBe(true); + }); + +}); diff --git a/tests/recovery/candidate-ranker.test.ts b/tests/recovery/candidate-ranker.test.ts new file mode 100644 index 000000000..e05a4359f --- /dev/null +++ b/tests/recovery/candidate-ranker.test.ts @@ -0,0 +1,86 @@ +import { rankRecoveryCandidates } from '../../src/recovery'; + +describe('rankRecoveryCandidates', () => { + it('ranks fresh read_page first for stale refs', () => { + const candidates = rankRecoveryCandidates({ + toolName: 'interact', + resultText: 'Error: ref is stale and no longer available', + isError: true, + recentCalls: [ + { toolName: 'interact', result: 'error', error: 'ref is stale' }, + { toolName: 'interact', result: 'error', error: 'ref is stale' }, + ], + }); + + expect(candidates[0].tool).toBe('read_page'); + expect(candidates[0].risk).toBe('read_only'); + expect(candidates.find((c) => c.tool === 'interact')?.score ?? -1).toBeLessThan(candidates[0].score); + }); + + it('excludes blind retries on blocking/auth pages', () => { + const candidates = rankRecoveryCandidates({ + toolName: 'click', + resultText: 'CAPTCHA Access Denied Login page detected', + isError: false, + recentCalls: [{ toolName: 'click', result: 'success', error: 'Login page detected' }], + }); + + expect(candidates[0].tool).toBe('read_page'); + const blocked = candidates.find((c) => c.tool === 'click'); + expect(blocked?.blockedReason).toBe('blocking/auth signal present'); + }); + + + it('does not block read-only retries on blocking pages', () => { + const candidates = rankRecoveryCandidates({ + toolName: 'find', + resultText: 'Login page detected', + isError: false, + recentCalls: [{ toolName: 'find', result: 'success', error: 'Login page detected' }], + }); + + expect(candidates.find((c) => c.tool === 'find')).toBeUndefined(); + expect(candidates.every((c) => !c.blockedReason)).toBe(true); + expect(candidates.map((c) => c.tool)).toEqual(expect.arrayContaining(['read_page', 'tabs_context'])); + }); + + it('penalizes repeated failures without counting prior successes', () => { + const mostlySuccessful = rankRecoveryCandidates({ + toolName: 'interact', + resultText: 'Error: ref is stale', + isError: true, + recentCalls: [ + { toolName: 'interact', result: 'success' }, + { toolName: 'interact', result: 'success' }, + { toolName: 'interact', result: 'error', error: 'ref is stale' }, + ], + }); + const repeatedlyFailing = rankRecoveryCandidates({ + toolName: 'interact', + resultText: 'Error: ref is stale', + isError: true, + recentCalls: [ + { toolName: 'interact', result: 'error', error: 'ref is stale' }, + { toolName: 'interact', result: 'error', error: 'ref is stale' }, + { toolName: 'interact', result: 'error', error: 'ref is stale' }, + ], + }); + + expect(mostlySuccessful[0].score).toBeGreaterThan(repeatedlyFailing[0].score); + }); + + it('falls back to read-only state checks for ambiguous no-progress loops', () => { + const candidates = rankRecoveryCandidates({ + toolName: 'wait_for', + resultText: 'same state', + isError: false, + recentCalls: [ + { toolName: 'wait_for', result: 'success' }, + { toolName: 'read_page', result: 'success' }, + ], + }); + + expect(candidates.every((c) => c.risk === 'read_only')).toBe(true); + expect(candidates.map((c) => c.tool)).toContain('tabs_context'); + }); +}); diff --git a/tests/recovery/policy-learner.test.ts b/tests/recovery/policy-learner.test.ts new file mode 100644 index 000000000..175387045 --- /dev/null +++ b/tests/recovery/policy-learner.test.ts @@ -0,0 +1,72 @@ +import * as fs from 'node:fs'; +import * as os from 'node:os'; +import * as path from 'node:path'; + +import { RecoveryPolicyLearner, rankRecoveryCandidates } from '../../src/recovery'; + +describe('RecoveryPolicyLearner', () => { + let dir: string; + let filePath: string; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'oc-policy-')); + filePath = path.join(dir, 'policies.json'); + }); + + afterEach(() => fs.rmSync(dir, { recursive: true, force: true })); + + it('promotes repeated evidence-backed recoveries and persists them', () => { + const learner = new RecoveryPolicyLearner({ filePath, minAttempts: 3, minConfidence: 0.67 }); + for (let i = 0; i < 3; i++) { + learner.record({ + failureFingerprint: 'stale-ref', + domain: 'https://example.com/path', + triggerTool: 'interact', + recoveryTool: 'read_page', + safetyClass: 'read_only', + evidenceBacked: true, + succeeded: true, + }); + } + + const reloaded = new RecoveryPolicyLearner({ filePath, minAttempts: 3, minConfidence: 0.67 }); + const policies = reloaded.getPolicies({ failureFingerprint: 'stale-ref', domain: 'example.com' }); + expect(policies).toHaveLength(1); + expect(policies[0]).toMatchObject({ recoveryTool: 'read_page', promoted: true, confidence: 1 }); + }); + + it('does not promote ambiguous outcomes and downgrades confidence on failures', () => { + const learner = new RecoveryPolicyLearner({ filePath, minAttempts: 2, minConfidence: 0.75 }); + expect(learner.record({ + failureFingerprint: 'stale-ref', + triggerTool: 'interact', + recoveryTool: 'read_page', + safetyClass: 'read_only', + evidenceBacked: false, + succeeded: true, + })).toBeNull(); + + learner.record({ failureFingerprint: 'stale-ref', triggerTool: 'interact', recoveryTool: 'read_page', safetyClass: 'read_only', evidenceBacked: true, succeeded: true }); + learner.record({ failureFingerprint: 'stale-ref', triggerTool: 'interact', recoveryTool: 'read_page', safetyClass: 'read_only', evidenceBacked: true, succeeded: false }); + + expect(learner.getPolicies({ failureFingerprint: 'stale-ref' })).toHaveLength(0); + }); + + it('biases ranking without bypassing safety gates', () => { + const learner = new RecoveryPolicyLearner({ filePath, minAttempts: 1, minConfidence: 0.5 }); + learner.record({ failureFingerprint: 'timeout', triggerTool: 'navigate', recoveryTool: 'tabs_context', safetyClass: 'read_only', evidenceBacked: true, succeeded: true }); + const policies = learner.getPolicies({ failureFingerprint: 'timeout', triggerTool: 'navigate' }); + + const candidates = rankRecoveryCandidates({ + toolName: 'navigate', + resultText: 'Navigation timeout', + isError: true, + recentCalls: [{ toolName: 'navigate', result: 'error', error: 'Navigation timeout' }], + policies, + }); + + expect(candidates[0].tool).toBe('tabs_context'); + expect(candidates[0].risk).toBe('read_only'); + expect(candidates.every((candidate) => !candidate.blockedReason || candidate.risk !== 'read_only')).toBe(true); + }); +});