-
Notifications
You must be signed in to change notification settings - Fork 34
feat(harness): add structured failure taxonomy (#1024) #1069
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f8f710d
e027e22
71aa172
e37641d
1969efe
b58bb8b
5483edb
147233a
05cf239
87a5abd
6bd45a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
/**
 * Canonical, ordered list of structured failure categories for OpenChrome
 * runtime and tool failures.
 *
 * The list is a plain `as const` tuple with no dependencies, so the same
 * category names can be attached to tool responses, run events, evidence
 * bundles, and future recovery policies without changing existing tool
 * behavior. Element order is part of the exported value; keep it stable.
 */
export const FAILURE_CATEGORIES = [
  // Reference / element lookup
  'STALE_REF',
  'ELEMENT_NOT_FOUND',

  // Navigation, tab, browser process, and transport
  'NAVIGATION_TIMEOUT',
  'TAB_UNHEALTHY',
  'BROWSER_CRASH',
  'CONNECTION_LOST',

  // Access barriers presented by the page
  'AUTH_REQUIRED',
  'CAPTCHA_OR_WAF',

  // Run-level outcomes
  'NO_PROGRESS',
  'MAX_STEPS_EXCEEDED',
  'POSTCONDITION_FAILED',
  'LLM_WANDERING',

  // Catch-all
  'UNKNOWN',
] as const;
|
|
||
/** Union of every string literal listed in `FAILURE_CATEGORIES`. */
export type FailureCategory = (typeof FAILURE_CATEGORIES)[number];
|
|
||
/** One classifier verdict: the category, how sure the classifier is, and why. */
export interface FailureClassification {
  /** Category assigned to the failure. */
  category: FailureCategory;

  /** Deterministic confidence score in the range 0..1. */
  confidence: number;

  /** Short, human-readable explanation suitable for logs and metadata. */
  reason: string;
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,187 @@ | ||
| import type { FailureCategory, FailureClassification } from './categories.js'; | ||
|
|
||
/** Everything a caller may know about a failure; every field is optional. */
export interface ClassifyFailureInput {
  /** The thrown value: an Error object, a string, or anything else. */
  error?: unknown;

  /** Explicit message or result text, for failures with no Error object. */
  message?: string;

  /** Name of the tool that produced the failure, when known. */
  toolName?: string;

  /** HintEngine rule name, when the classification is driven by a hint. */
  hintRule?: string;

  /** Current page URL; extra context for the auth vs. WAF ambiguity. */
  currentUrl?: string;

  /** Current page title; extra context for the auth vs. WAF ambiguity. */
  pageTitle?: string;

  /** When true, return UNKNOWN if no pattern matches. Defaults to true. */
  fallbackToUnknown?: boolean;
}
|
|
||
/** A single pattern rule: the verdict it yields plus the predicate that triggers it. */
interface Rule {
  /** Category reported when `test` returns true. */
  category: FailureCategory;

  /** Confidence reported together with `category`. */
  confidence: number;

  /** Explanation reported together with `category`. */
  reason: string;

  /** Returns true when the normalized failure matches this rule. */
  test(input: NormalizedFailureInput): boolean;
}
|
|
||
/**
 * Classifier input after normalization: every field is a string, with an
 * empty string standing in for anything the caller did not supply.
 */
interface NormalizedFailureInput {
  /** Error text and explicit message combined into one string. */
  text: string;

  /** Name of the error's type, or '' when there is none. */
  errorName: string;

  /** Name of the tool that failed, or ''. */
  toolName: string;

  /** HintEngine rule name, or ''. */
  hintRule: string;

  /** Current page URL, or ''. */
  currentUrl: string;

  /** Current page title, or ''. */
  pageTitle: string;
}
|
|
||
/** Words and phrases indicating a login, credential, permission, or MFA context. */
const AUTH_CONTEXT = /\b(log in|login|signin|sign in|auth|authentication|password|credential|permissions?|mfa|2fa|totp|session expired)\b/i;

/** Signals that on their own indicate missing or expired authentication. */
const AUTH_DIRECT = /\b(401|unauthorized|please sign in|session expired)\b/i;

/** HTTP 403 / "forbidden" wording, which is ambiguous without further context. */
const FORBIDDEN_SIGNAL = /\b(403|forbidden)\b/i;

/** CAPTCHA, WAF vendor, bot-check, and block-page wording. */
const WAF_CONTEXT = /\b(captcha|cloudflare|akamai|imperva|datadome|human verification|verify you are human|bot[- ]?check|anti[- ]?bot|ip block|request block|access denied|just a moment)\b/i;
|
|
||
/**
 * Pattern rules evaluated against a normalized failure.
 *
 * Every rule whose `test` returns true contributes its category, confidence,
 * and reason to the classification result. Patterns are case-insensitive and
 * anchored on word boundaries.
 */
const RULES: Rule[] = [
  {
    category: 'STALE_REF',
    confidence: 0.95,
    reason: 'Reference is stale or invalid after page changes',
    test: ({ text }) => /\b(stale ref|invalid ref|ref\b.+not found|backendnodeid.+not found|node is detached|no node with given id)\b/i.test(text),
  },
  {
    category: 'CONNECTION_LOST',
    confidence: 0.95,
    reason: 'CDP/browser transport connection was lost',
    // A bare "session closed" is deliberately NOT matched here. Puppeteer emits
    // "Protocol error (...): Session closed. Most likely the page has been
    // closed" when only the target tab dies; classifying that as a transport
    // outage would outrank TAB_UNHEALTHY and could trigger reconnect/restart
    // logic instead of tab-level recovery. TAB_UNHEALTHY matches it instead.
    test: ({ text }) => /\b(not connected to chrome|call connect\(\) first|websocket.*closed|websocket is not open|browser has disconnected|browser disconnected|cdpsession connection closed|connection closed|protocol error.*(?:connection|disconnected)|puppeteer\.connect\(\) timed out|session initialization timed out)\b/i.test(text),
  },
  {
    category: 'BROWSER_CRASH',
    confidence: 0.92,
    reason: 'Browser process or renderer appears to have crashed',
    // The error name takes part in the first pattern; a TargetClosedError only
    // counts as a crash when the text also mentions a crash or the browser.
    test: ({ text, errorName }) => /\b(browser crash|browser process.*dead|chrome process.*dead|renderer process.*gone|crashed)\b/i.test(`${errorName} ${text}`) || (/targetclosederror/i.test(errorName) && /\b(crash|crashed|browser)\b/i.test(text)),
  },
  {
    category: 'TAB_UNHEALTHY',
    confidence: 0.9,
    reason: 'Target tab is closed, missing, frozen, or unhealthy',
    // "session closed" / "page has been closed" cover Puppeteer's per-target
    // session teardown message (see the note on CONNECTION_LOST).
    test: ({ text }) => /\b(tab.+not found|target.+not found|invalid tab|no such tab|page closed|page has been closed|target closed|session closed|tab health probe timeout|tab.+unhealthy|eviction threshold)\b/i.test(text),
  },
  {
    category: 'NAVIGATION_TIMEOUT',
    confidence: 0.9,
    reason: 'Navigation or page-load wait timed out',
    // Any timeout reported by the `navigate` tool counts, even without
    // navigation wording in the message.
    test: ({ text, toolName }) => /\b(navigation timeout|page load timeout|waiting for navigation failed|net::err_timed_out|timeout.*navigation|timed out.*navigate|navigate.*timed out)\b/i.test(text) || (toolName === 'navigate' && /\b(timeout|timed out)\b/i.test(text)),
  },
  {
    category: 'ELEMENT_NOT_FOUND',
    confidence: 0.88,
    reason: 'Requested selector/ref/semantic element could not be found',
    // "could not find" only counts when followed by an element-like noun, so
    // unrelated messages such as a missing browser binary are not matched.
    test: ({ text }) => /\b(element not found|no elements? found|no matching element|selector.+not found|selector.+failed|queryselectorall.*(?:0|zero)|could not find (?:an? )?(?:element|selector|ref|button|link|input|field|node)|no good match found|no clickable elements found)\b/i.test(text),
  },
  {
    category: 'CAPTCHA_OR_WAF',
    confidence: 0.86,
    reason: 'Page indicates CAPTCHA, WAF, bot detection, or access-denied block',
    test: (input) => {
      const combined = `${input.text} ${input.currentUrl} ${input.pageTitle}`;
      if (!WAF_CONTEXT.test(combined)) return false;
      // Access denied is ambiguous. Treat it as auth only when auth context is present.
      if (/access denied/i.test(combined) && AUTH_CONTEXT.test(combined)) return false;
      return true;
    },
  },
  {
    category: 'AUTH_REQUIRED',
    confidence: 0.84,
    reason: 'Page or failure indicates missing/expired authentication or credentials',
    test: (input) => {
      const combined = `${input.text} ${input.currentUrl} ${input.pageTitle}`;
      // NOTE(review): the FORBIDDEN_SIGNAL clause is already implied by the
      // first AUTH_CONTEXT check; it is kept to document that a bare
      // 403/forbidden is never enough on its own.
      return AUTH_CONTEXT.test(combined) || AUTH_DIRECT.test(combined) || (FORBIDDEN_SIGNAL.test(combined) && AUTH_CONTEXT.test(combined));
    },
  },
  {
    category: 'NO_PROGRESS',
    confidence: 0.82,
    reason: 'Recent actions are stalling or made no meaningful progress',
    test: ({ text, hintRule }) => /\b(progress-tracker-stuck|progress-tracker-stalling|no meaningful progress|stalling|stuck|same-tool-same-result|tool-oscillation|coordinate-click-stall)\b/i.test(`${hintRule} ${text}`),
  },
  {
    category: 'LLM_WANDERING',
    confidence: 0.78,
    reason: 'Repeated low-value actions suggest agent wandering',
    test: ({ text, hintRule }) => /\b(wandering|oscillation|coordinate-click-stall|screenshot-verification-loop|same-tool-same-result|multiple coordinate clicks|multiple screenshots|escalation ladder)\b/i.test(`${hintRule} ${text}`),
  },
  {
    category: 'MAX_STEPS_EXCEEDED',
    confidence: 0.9,
    reason: 'Execution exceeded configured step or tool-call budget',
    test: ({ text }) => /\b(max steps|max number of|maximum steps|step limit|max iterations|max tool calls|budget exceeded)\b/i.test(text),
  },
  {
    category: 'POSTCONDITION_FAILED',
    confidence: 0.9,
    reason: 'Outcome contract or postcondition did not pass',
    test: ({ text }) => /\b(postcondition(?:_| )violation|postcondition failed|success criteria not met|contract.+failed|assertion failed|oc_assert.+failed)\b/i.test(text),
  },
];
|
|
||
/**
 * Classifies a failure into zero or more structured categories.
 *
 * Every rule in `RULES` is evaluated against the normalized input. Each
 * matching category appears once, carrying its highest matching confidence.
 *
 * @param input Failure details; all fields are optional.
 * @returns Classifications sorted by descending confidence, ties broken by
 *   category name. When nothing matches, a single `UNKNOWN` entry is
 *   returned unless `fallbackToUnknown` is `false`, in which case the array
 *   is empty.
 */
export function classifyFailure(input: ClassifyFailureInput = {}): FailureClassification[] {
  const normalized = normalize(input);
  const bestByCategory = new Map<FailureCategory, FailureClassification>();

  RULES.filter((rule) => rule.test(normalized)).forEach(({ category, confidence, reason }) => {
    const current = bestByCategory.get(category);
    if (current === undefined || confidence > current.confidence) {
      bestByCategory.set(category, { category, confidence, reason });
    }
  });

  const ranked = Array.from(bestByCategory.values());
  ranked.sort((left, right) => {
    const byConfidence = right.confidence - left.confidence;
    return byConfidence || left.category.localeCompare(right.category);
  });

  if (ranked.length > 0 || input.fallbackToUnknown === false) {
    return ranked;
  }
  return [{ category: 'UNKNOWN', confidence: 0.5, reason: 'No failure classifier rule matched' }];
}
|
|
||
/**
 * Returns the single highest-ranked classification for a failure, i.e. the
 * first element of `classifyFailure(input)`.
 *
 * Overloads, most to least specific:
 * 1. `fallbackToUnknown: false` (literal) — may return `undefined` when no
 *    rule matches.
 * 2. `fallbackToUnknown` omitted or literally `true` — always returns a
 *    classification (`UNKNOWN` when no rule matches).
 * 3. `fallbackToUnknown` only known as `boolean` — the flag may be `false` at
 *    runtime, so the result type honestly includes `undefined`.
 *
 * @param input Failure details; all fields are optional.
 * @returns The top classification, or `undefined` when nothing matched and
 *   the UNKNOWN fallback was disabled.
 */
export function primaryFailureCategory(input: ClassifyFailureInput & { fallbackToUnknown: false }): FailureClassification | undefined;
export function primaryFailureCategory(input?: ClassifyFailureInput & { fallbackToUnknown?: true }): FailureClassification;
export function primaryFailureCategory(input: ClassifyFailureInput): FailureClassification | undefined;
export function primaryFailureCategory(input: ClassifyFailureInput = {}): FailureClassification | undefined {
  const [classification] = classifyFailure(input);
  if (classification || input.fallbackToUnknown === false) return classification;
  // Defensive: reached only if classifyFailure returned an empty list even
  // though the fallback was not disabled.
  return { category: 'UNKNOWN', confidence: 0.5, reason: 'No failure classifier rule matched' };
}
|
|
||
/**
 * Converts caller-supplied failure details into the all-strings shape that
 * the rules operate on.
 *
 * The error text and explicit message are joined with a space (empty parts
 * are skipped). `text`, `errorName`, and `toolName` are lower-cased; the hint
 * rule, URL, and title are passed through as given, defaulting to ''.
 */
function normalize(input: ClassifyFailureInput): NormalizedFailureInput {
  const typeName = errorTypeName(input.error);

  const pieces: Array<string | undefined> = [];
  const errorText = stringifyError(input.error);
  if (errorText) pieces.push(errorText);
  if (input.message) pieces.push(input.message);

  const toolName = input.toolName ?? '';
  const hintRule = input.hintRule ?? '';
  const currentUrl = input.currentUrl ?? '';
  const pageTitle = input.pageTitle ?? '';

  return {
    text: pieces.join(' ').toLowerCase(),
    errorName: typeName.toLowerCase(),
    toolName: toolName.toLowerCase(),
    hintRule,
    currentUrl,
    pageTitle,
  };
}
|
|
||
/**
 * Best-effort type name for a thrown value.
 *
 * @param error Any thrown value.
 * @returns The value's string `name` property if it has one, otherwise its
 *   constructor's name, otherwise ''. Non-objects and `null` yield ''.
 */
function errorTypeName(error: unknown): string {
  if (typeof error !== 'object' || error === null) {
    return '';
  }

  const candidate = error as { name?: unknown; constructor?: { name?: string } };
  const constructorName = candidate.constructor?.name;
  const ownName = candidate.name;

  if (typeof ownName === 'string') {
    return ownName;
  }
  return constructorName ?? '';
}
|
|
||
/**
 * Renders an arbitrary thrown value as text for pattern matching.
 *
 * - `undefined` / `null` → ''
 * - strings → returned unchanged
 * - `Error` instances → `"<name>: <message>"`
 * - anything else → its JSON form, falling back to `String(value)` when JSON
 *   serialization throws (circular structures, BigInt) or yields no string
 *   (functions, symbols).
 *
 * Error-like objects that are not `instanceof Error` (for example errors from
 * another realm) usually keep `message` non-enumerable, so JSON drops it. When
 * a string `message` is missing from the serialized text it is added back so
 * the classifier can still see it.
 *
 * @param error Any thrown value.
 * @returns A string; never `undefined`.
 */
function stringifyError(error: unknown): string {
  if (error === undefined || error === null) return '';
  if (typeof error === 'string') return error;
  if (error instanceof Error) return `${error.name}: ${error.message}`;

  let serialized: string;
  try {
    // JSON.stringify returns undefined, not a string, for functions and symbols.
    const json: string | undefined = JSON.stringify(error);
    serialized = json ?? String(error);
  } catch {
    serialized = String(error);
  }

  if (typeof error !== 'object') return serialized;

  const { name, message } = error as { name?: unknown; message?: unknown };
  if (typeof message !== 'string' || message === '' || serialized.includes(message)) {
    return serialized;
  }

  const label = typeof name === 'string' && name !== '' ? `${name}: ${message}` : message;
  // Drop serializations that carry no information of their own.
  const isEmptySerialization = serialized === '{}' || serialized === '[object Object]';
  return isEmptySerialization ? label : `${label} ${serialized}`;
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| export * from './categories.js'; | ||
| export * from './classifier.js'; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| import { classifyFailure, primaryFailureCategory } from '../../src/failure'; | ||
|
|
||
/** Classifies `message` (plus any extra input fields) and returns only the category names. */
function categories(message: string, extra: Parameters<typeof classifyFailure>[0] = {}) {
  const classifications = classifyFailure({ message, ...extra });
  return classifications.map(({ category }) => category);
}
|
|
||
// Behavioural spec for the failure classifier: one case per category plus the
// known ambiguities (access denied, bare 403, protocol errors, "could not find").
describe('failure classifier', () => {
  it('classifies stale refs', () => {
    const primary = primaryFailureCategory({ message: 'Error: stale ref abc is no longer available' });
    expect(primary.category).toBe('STALE_REF');
  });

  it('classifies missing elements', () => {
    const found = categories('selector failed: no matching element found');
    expect(found).toContain('ELEMENT_NOT_FOUND');
  });

  it('classifies navigation timeouts', () => {
    const primary = primaryFailureCategory({
      toolName: 'navigate',
      message: 'Navigation timeout of 30000 ms exceeded',
    });
    expect(primary.category).toBe('NAVIGATION_TIMEOUT');
  });

  it('classifies tab and target failures', () => {
    const tabFailure = categories('invalid tab: no such tab');
    expect(tabFailure).toContain('TAB_UNHEALTHY');

    const transportFailure = categories('CDPSession connection closed');
    expect(transportFailure).toContain('CONNECTION_LOST');
  });

  it('classifies browser crashes', () => {
    const error = new Error('Target closed because the browser crash closed the renderer');
    error.name = 'TargetClosedError';

    const found = categories('', { error });
    expect(found).toContain('BROWSER_CRASH');
  });

  it('classifies auth-required access denied separately from WAF access denied', () => {
    const authDenied = primaryFailureCategory({ message: 'Access denied: login session expired, please sign in' });
    expect(authDenied.category).toBe('AUTH_REQUIRED');

    const wafDenied = primaryFailureCategory({ message: 'Access Denied reference from Akamai bot block' });
    expect(wafDenied.category).toBe('CAPTCHA_OR_WAF');
  });

  it('does not treat bare forbidden responses as auth-required', () => {
    expect(categories('403 Forbidden')).not.toContain('AUTH_REQUIRED');

    const withoutFallback = primaryFailureCategory({ message: '403 Forbidden', fallbackToUnknown: false });
    expect(withoutFallback).toBeUndefined();

    expect(categories('Forbidden: login session expired')).toContain('AUTH_REQUIRED');
  });

  it('does not classify generic 403 forbidden server errors as auth-required', () => {
    const found = categories('HTTP 403 Forbidden server error');
    expect(found).not.toContain('AUTH_REQUIRED');
  });

  it('classifies forbidden permission and auth contexts as auth-required', () => {
    const permissionDenied = categories('Forbidden: missing permission to access this tool');
    expect(permissionDenied).toContain('AUTH_REQUIRED');

    const credentialsRequired = categories('403 Forbidden: authentication credentials are required');
    expect(credentialsRequired).toContain('AUTH_REQUIRED');
  });

  it('classifies CAPTCHA and WAF blockers', () => {
    const found = categories('Cloudflare says verify you are human captcha detected');
    expect(found).toContain('CAPTCHA_OR_WAF');
  });

  it('maps progress tracker stuck hints to no progress and wandering', () => {
    const result = classifyFailure({
      hintRule: 'progress-tracker-stuck',
      message: 'STOP — no meaningful progress, screenshot-verification-loop',
    });
    const names = result.map((r) => r.category);
    expect(names).toEqual(expect.arrayContaining(['NO_PROGRESS', 'LLM_WANDERING']));
  });

  it('classifies step budget and postcondition failures', () => {
    const budget = categories('Reached the max number of 10 steps');
    expect(budget).toContain('MAX_STEPS_EXCEEDED');

    const postcondition = categories('postcondition_violation: oc_assert failed');
    expect(postcondition).toContain('POSTCONDITION_FAILED');
  });

  it('falls back to UNKNOWN by default and can suppress fallback', () => {
    const withFallback = classifyFailure({ message: 'some unrecognized failure' });
    expect(withFallback).toEqual([
      { category: 'UNKNOWN', confidence: 0.5, reason: 'No failure classifier rule matched' },
    ]);

    const withoutFallback = classifyFailure({ message: 'some unrecognized failure', fallbackToUnknown: false });
    expect(withoutFallback).toEqual([]);
  });

  it('classifies protocol errors for missing DOM nodes as stale references, not connection loss', () => {
    const error = new Error('Protocol error (DOM.resolveNode): No node with given id found');
    const result = primaryFailureCategory({ error, toolName: 'click' });

    expect(result.category).toBe('STALE_REF');
  });

  it('does not classify navigation context churn as connection loss', () => {
    const churnMessages = [
      'Execution context was destroyed, most likely because of a navigation',
      'Cannot find context with specified id',
      'Inspected target navigated or closed',
      'Protocol error (Runtime.callFunctionOn): Inspected target navigated or closed',
    ];
    for (const churn of churnMessages) {
      expect(categories(churn)).not.toContain('CONNECTION_LOST');
    }
  });

  it('keeps generic could-not-find runtime failures out of element-not-found', () => {
    const missingBrowser = categories('Could not find expected browser (chrome) locally');
    expect(missingBrowser).not.toContain('ELEMENT_NOT_FOUND');

    const missingElement = categories('Could not find element for selector .submit');
    expect(missingElement).toContain('ELEMENT_NOT_FOUND');
  });
});
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[Remove] `session closed` from connection-loss matcher

The `CONNECTION_LOST` regex treats any `session closed` text as a transport disconnect, but Puppeteer frequently emits messages like `Protocol error (...): Session closed. Most likely the page has been closed` when only the target tab dies. In that case this rule wins with higher confidence than `TAB_UNHEALTHY`, so `primaryFailureCategory` reports a browser connection outage and can trigger reconnect/restart logic instead of tab-level recovery.

Useful? React with 👍 / 👎.