diff --git a/src/tools/act.ts b/src/tools/act.ts index a9c1874e..1f2ef9e3 100644 --- a/src/tools/act.ts +++ b/src/tools/act.ts @@ -329,11 +329,12 @@ async function resolveElement( page: Parameters[0], cdpClient: Parameters[1], query: string, - context?: ToolContext + context?: ToolContext, + contextHint?: string ): Promise { try { const matches = await withTimeout( - resolveElementsByAXTree(page, cdpClient, normalizeQuery(query), { useCenter: true, maxResults: 3 }), + resolveElementsByAXTree(page, cdpClient, normalizeQuery(query), { useCenter: true, maxResults: 3, contextHint }), 8000, 'ax-resolution', context @@ -377,14 +378,15 @@ async function executeClick( parsedAction: ParsedAction, stepIndex: number, isStealth: boolean, - context?: ToolContext + context?: ToolContext, + contextHint?: string ): Promise { const target = parsedAction.target; if (!target) { return { step: stepIndex, action: 'click', outcome: 'ELEMENT_NOT_FOUND', error: 'No target specified for click' }; } - const el = await resolveElement(page, cdpClient, target, context); + const el = await resolveElement(page, cdpClient, target, context, contextHint); if (!el) { return { step: stepIndex, action: 'click', target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${target}"` }; } @@ -415,7 +417,8 @@ async function executeType( parsedAction: ParsedAction, stepIndex: number, isStealth: boolean, - context?: ToolContext + context?: ToolContext, + contextHint?: string ): Promise { const value = parsedAction.value; if (!value) { @@ -424,7 +427,7 @@ async function executeType( // If a target is specified, find and focus it if (parsedAction.target) { - const el = await resolveElement(page, cdpClient, parsedAction.target, context); + const el = await resolveElement(page, cdpClient, parsedAction.target, context, contextHint); if (!el) { return { step: stepIndex, action: 'type', target: parsedAction.target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${parsedAction.target}"` }; } @@ -463,14 
+466,15 @@ async function executeSelect( tabId: string, parsedAction: ParsedAction, stepIndex: number, - context?: ToolContext + context?: ToolContext, + contextHint?: string ): Promise { const query = parsedAction.target || parsedAction.value; if (!query) { return { step: stepIndex, action: 'select', outcome: 'EXCEPTION', error: 'No target specified for select' }; } - const el = await resolveElement(page, cdpClient, query, context); + const el = await resolveElement(page, cdpClient, query, context, contextHint); if (!el) { return { step: stepIndex, action: 'select', target: query, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${query}"` }; } @@ -514,14 +518,15 @@ async function executeHover( cdpClient: any, parsedAction: ParsedAction, stepIndex: number, - context?: ToolContext + context?: ToolContext, + contextHint?: string ): Promise { const target = parsedAction.target; if (!target) { return { step: stepIndex, action: 'hover', outcome: 'EXCEPTION', error: 'No target specified for hover' }; } - const el = await resolveElement(page, cdpClient, target, context); + const el = await resolveElement(page, cdpClient, target, context, contextHint); if (!el) { return { step: stepIndex, action: 'hover', target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${target}"` }; } @@ -538,10 +543,11 @@ async function executeScroll( cdpClient: any, parsedAction: ParsedAction, stepIndex: number, - context?: ToolContext + context?: ToolContext, + contextHint?: string ): Promise { if (parsedAction.target) { - const el = await resolveElement(page, cdpClient, parsedAction.target, context); + const el = await resolveElement(page, cdpClient, parsedAction.target, context, contextHint); if (!el) { return { step: stepIndex, action: 'scroll', target: parsedAction.target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${parsedAction.target}"` }; } @@ -617,14 +623,15 @@ async function executeCheckUncheck( parsedAction: ParsedAction, stepIndex: number, isStealth: boolean, 
- context?: ToolContext + context?: ToolContext, + contextHint?: string ): Promise { const target = parsedAction.target; if (!target) { return { step: stepIndex, action: parsedAction.action, outcome: 'EXCEPTION', error: `No target specified for ${parsedAction.action}` }; } - const el = await resolveElement(page, cdpClient, target, context); + const el = await resolveElement(page, cdpClient, target, context, contextHint); if (!el) { return { step: stepIndex, action: parsedAction.action, target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${target}"` }; } @@ -673,6 +680,7 @@ const handler: ToolHandler = async ( const verifyMode = coerceVerifyMode(args.verify); const verifyTextSummary = args.verify === undefined ? true : verifyMode !== 'none'; + const actionContext = typeof args.context === 'string' ? args.context.slice(0, 240) : undefined; const timeoutMs = Math.min(Math.max((args.timeout as number) || 30000, 1000), 120000); const variables = normalizeVariables(args.variables); const missingVariables = findMissingVariables(instruction || '', variables); @@ -775,19 +783,19 @@ const handler: ToolHandler = async ( try { switch (parsedAction.action) { case 'click': - result = await executeClick(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context); + result = await executeClick(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context, actionContext); break; case 'type': - result = await executeType(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context); + result = await executeType(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context, actionContext); break; case 'select': - result = await executeSelect(page, cdpClient, sessionId, tabId, parsedAction, i + 1, context); + result = await executeSelect(page, cdpClient, sessionId, tabId, parsedAction, i + 1, context, actionContext); break; case 'hover': - result = await executeHover(page, cdpClient, parsedAction, i + 1, context); + 
result = await executeHover(page, cdpClient, parsedAction, i + 1, context, actionContext); break; case 'scroll': - result = await executeScroll(page, cdpClient, parsedAction, i + 1, context); + result = await executeScroll(page, cdpClient, parsedAction, i + 1, context, actionContext); break; case 'wait': result = await executeWait(page, parsedAction, i + 1, context); @@ -797,7 +805,7 @@ const handler: ToolHandler = async ( break; case 'check': case 'uncheck': - result = await executeCheckUncheck(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context); + result = await executeCheckUncheck(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context, actionContext); break; default: result = { step: i + 1, action: parsedAction.action, outcome: 'EXCEPTION', error: `Unknown action: ${parsedAction.action}` }; diff --git a/src/utils/ax-element-resolver.ts b/src/utils/ax-element-resolver.ts index e9ff5744..8f74f719 100644 --- a/src/utils/ax-element-resolver.ts +++ b/src/utils/ax-element-resolver.ts @@ -35,6 +35,10 @@ export interface AXNodeFlat { name: string; value?: string; properties: Record; + /** Bounded surrounding text/attributes used only for optional context tie-breaks. */ + contextText?: string; + /** True when the element intersects the viewport, when known. */ + inViewport?: boolean; } /** @@ -66,6 +70,8 @@ export interface AXResolveOptions { useCenter?: boolean; maxResults?: number; depth?: number; + /** Optional caller-supplied context, e.g. act.context. Used only within the best cascade level. 
*/ + contextHint?: string; } /** Parsed query with optional role hint */ @@ -196,7 +202,8 @@ export function cascadeFilter( roleHint: string | null, nameHint: string, maxResults: number = 5, -): Array<{ node: AXNodeFlat; matchLevel: MatchLevel }> { + contextHint?: string, +): Array<{ node: AXNodeFlat; matchLevel: MatchLevel; contextScore: number }> { // Pre-filter: remove disabled and non-interactive nodes const candidates = nodes.filter(n => n.properties['disabled'] !== true && @@ -208,12 +215,16 @@ export function cascadeFilter( const eq = (nodeName: string) => nodeName.normalize('NFC').toLowerCase().trim() === nameLower; const includes = (nodeName: string) => nodeName.normalize('NFC').toLowerCase().trim().includes(nameLower); + const rank = (levelNodes: AXNodeFlat[], matchLevel: MatchLevel) => + rankByContext(levelNodes, contextHint) + .slice(0, maxResults) + .map(({ node, contextScore }) => ({ node, matchLevel, contextScore })); // Level 1: exact role + exact name if (roleHint) { const level1 = candidates.filter(n => n.role.toLowerCase() === roleHint && eq(n.name)); if (level1.length > 0) { - return level1.slice(0, maxResults).map(node => ({ node, matchLevel: 1 as MatchLevel })); + return rank(level1, 1); } } @@ -221,25 +232,72 @@ export function cascadeFilter( if (roleHint) { const level2 = candidates.filter(n => n.role.toLowerCase() === roleHint && includes(n.name)); if (level2.length > 0) { - return level2.slice(0, maxResults).map(node => ({ node, matchLevel: 2 as MatchLevel })); + return rank(level2, 2); } } // Level 3: exact name (any interactive role) const level3 = candidates.filter(n => eq(n.name)); if (level3.length > 0) { - return level3.slice(0, maxResults).map(node => ({ node, matchLevel: 3 as MatchLevel })); + return rank(level3, 3); } // Level 4: name contains (any interactive role) const level4 = candidates.filter(n => includes(n.name)); if (level4.length > 0) { - return level4.slice(0, maxResults).map(node => ({ node, matchLevel: 4 as MatchLevel 
// NOTE(review): this physical line of the whitespace-collapsed patch also carries the tail of
// the preceding cascadeFilter hunk. It is kept verbatim (commented out) so that replacing the
// line drops no patch content:
//   })); + return rank(level4, 4); } return []; }

// ─── Context-hint ranking ───

/** Upper bound on distinct hint tokens that take part in scoring. */
const CONTEXT_MAX_TOKENS = 12;
/** Hint tokens shorter than this (in UTF-16 code units) are ignored. */
const CONTEXT_MIN_TOKEN_LENGTH = 2;
/** Sub-integer bonus so viewport visibility can only break ties between equal hit counts. */
const CONTEXT_VIEWPORT_BONUS = 0.1;
/** `properties` keys whose string values are searched for hint tokens. */
const CONTEXT_PROPERTY_KEYS = [
  'description',
  'container',
  'formLabel',
  'data-testid',
  'aria-label',
  'title',
  'placeholder',
  'name',
  'id',
] as const;

/**
 * The subset of an AX node that context scoring reads. Declared structurally so any richer
 * node type with these fields can be ranked without a conversion step.
 */
interface ContextScorable {
  properties: Readonly<Record<string, unknown>>;
  contextText?: string;
  inViewport?: boolean;
}

/**
 * Splits a free-form context hint into lower-cased, NFC-normalized, de-duplicated tokens.
 *
 * Runs of letters, digits, `_` and `-` form a token; everything else separates tokens.
 * Leading/trailing `_`/`-` are stripped so pure punctuation such as `--` never becomes a
 * token, and repeated words are kept once so they neither score twice nor use up the cap.
 *
 * @param text Caller-supplied hint; `undefined` or empty yields no tokens.
 * @returns At most {@link CONTEXT_MAX_TOKENS} distinct tokens, in first-seen order.
 */
function tokenizeContext(text?: string): string[] {
  const unique = new Set<string>();
  const pieces = (text ?? '').normalize('NFC').toLowerCase().split(/[^\p{L}\p{N}_-]+/u);
  for (const piece of pieces) {
    const token = piece.replace(/^[_-]+|[_-]+$/g, '');
    if (token.length >= CONTEXT_MIN_TOKEN_LENGTH) unique.add(token);
    if (unique.size === CONTEXT_MAX_TOKENS) break;
  }
  return [...unique];
}

/** Joins the node's searchable string fields into one lower-cased, NFC-normalized string. */
function buildContextHaystack(node: ContextScorable): string {
  const parts: unknown[] = [
    node.contextText,
    ...CONTEXT_PROPERTY_KEYS.map(key => node.properties[key]),
  ];
  return parts
    .filter((part): part is string => typeof part === 'string')
    .join(' ')
    .normalize('NFC')
    .toLowerCase();
}

/**
 * Scores one node against tokens that were already produced by {@link tokenizeContext}.
 *
 * Each token found as a substring of the node's haystack adds 1. The viewport bonus is added
 * only when at least one token matched: a score above zero therefore always means "the hint
 * matched this node", regardless of whether the node happens to carry descriptive text.
 */
function scoreContextTokens(node: ContextScorable, tokens: readonly string[]): number {
  if (tokens.length === 0) return 0;

  const haystack = buildContextHaystack(node);
  if (!haystack) return 0;

  let hits = 0;
  for (const token of tokens) {
    if (haystack.includes(token)) hits += 1;
  }
  if (hits === 0) return 0;
  return node.inViewport === true ? hits + CONTEXT_VIEWPORT_BONUS : hits;
}

/**
 * Scores a single node against a raw context hint.
 *
 * @param node Node whose `contextText` and selected `properties` are searched.
 * @param contextHint Free-form hint; missing or token-less hints score 0.
 * @returns Number of distinct hint tokens found, plus a 0.1 viewport tie-break when > 0.
 */
function scoreNodeContext(node: ContextScorable, contextHint?: string): number {
  return scoreContextTokens(node, tokenizeContext(contextHint));
}

/**
 * Orders nodes by descending context score; equal scores keep their input order.
 *
 * The hint is tokenized once for the whole list rather than once per node. The input array
 * is not mutated.
 *
 * @param nodes Candidates that already share one cascade match level.
 * @param contextHint Optional free-form hint; without usable tokens the order is unchanged
 *   and every score is 0.
 * @returns One `{ node, contextScore }` entry per input node.
 */
function rankByContext<TNode extends ContextScorable>(
  nodes: TNode[],
  contextHint?: string,
): Array<{ node: TNode; contextScore: number }> {
  const tokens = tokenizeContext(contextHint);
  if (tokens.length === 0) {
    return nodes.map(node => ({ node, contextScore: 0 }));
  }
  return nodes
    .map((node, index) => ({ node, index, contextScore: scoreContextTokens(node, tokens) }))
    // Explicit index tie-break keeps the ordering independent of the engine's sort stability.
    .sort((a, b) => (b.contextScore - a.contextScore) || (a.index - b.index))
    .map(({ node, contextScore }) => ({ node, contextScore }));
}

// ─── AX Tree Cache ───

const AX_CACHE_TTL_MS = 2000;

// NOTE(review): the hunks below also share this physical line of the collapsed patch; they
// belong to definitions that are only partly visible here and are kept verbatim:
//   @@ -288,6 +346,7 @@ export async function getCachedAXTree(
//      name: node.name?.value || '', value: node.value?.value, properties: props,
//   +  contextText: typeof props['description'] === 'string' ? String(props['description']).slice(0, 240) : undefined,
//      }); }
//   @@ -322,7 +381,7 @@ export async function resolveElementsByAXTree(
//      query: string, options?: AXResolveOptions, ): Promise {
//   -  const { useCenter = true, maxResults = 5, depth = -1 } = options || {};
//   +  const { useCenter = true, maxResults = 5, depth = -1, contextHint } = options || {};
//      // 1.
Parse query const parsed = parseQueryForAX(query); @@ -332,12 +391,12 @@ export async function resolveElementsByAXTree( if (nodes.length === 0) return []; // 3. Cascading filter - const matches = cascadeFilter(nodes, parsed.roleHint, parsed.nameHint, maxResults); + const matches = cascadeFilter(nodes, parsed.roleHint, parsed.nameHint, maxResults, contextHint); if (matches.length === 0) return []; // 4. Resolve coordinates for matches const resolved: AXResolvedElement[] = []; - for (const { node, matchLevel } of matches) { + for (const { node, matchLevel, contextScore } of matches) { if (resolved.length >= maxResults) break; try { @@ -367,7 +426,7 @@ export async function resolveElementsByAXTree( width, height, }, - properties: node.properties, + properties: { ...node.properties, ...(contextScore > 0 ? { contextScore } : {}) }, source: 'ax', }); } catch { diff --git a/tests/utils/ax-element-resolver.test.ts b/tests/utils/ax-element-resolver.test.ts index 18d526e0..e780f21f 100644 --- a/tests/utils/ax-element-resolver.test.ts +++ b/tests/utils/ax-element-resolver.test.ts @@ -161,6 +161,47 @@ describe('AX Element Resolver', () => { expect(results.every(r => r.matchLevel === 1)).toBe(true); }); + + test('context hint reorders repeated labels within the same match level', () => { + const repeated = [ + makeNode('button', 'Add to cart', { description: 'Budget keyboard card $29' }), + makeNode('button', 'Add to cart', { description: 'Noise Cancelling Headphones card $199' }), + makeNode('button', 'Add to cart', { description: 'USB-C hub card $49' }), + ]; + + const results = cascadeFilter(repeated, 'button', 'Add to cart', 3, 'Noise Cancelling Headphones'); + + expect(results).toHaveLength(3); + expect(results[0].node.properties.description).toContain('Headphones'); + expect(results[0].contextScore).toBeGreaterThan(0); + expect(results.every(r => r.matchLevel === 1)).toBe(true); + }); + + test('context hint does not cross cascade levels or overpower a stricter role/name 
match', () => { + const repeated = [ + makeNode('button', 'Submit'), + makeNode('link', 'Submit shipping details', { description: 'Checkout payment form' }), + ]; + + const results = cascadeFilter(repeated, 'button', 'Submit', 3, 'Checkout payment form'); + + expect(results).toHaveLength(1); + expect(results[0].node.role).toBe('button'); + expect(results[0].matchLevel).toBe(1); + }); + + test('without context hint repeated labels preserve current candidate order', () => { + const repeated = [ + makeNode('button', 'Login', { description: 'Header navigation' }), + makeNode('button', 'Login', { description: 'Footer links' }), + ]; + + const results = cascadeFilter(repeated, 'button', 'Login'); + + expect(results[0].node.properties.description).toBe('Header navigation'); + expect(results[0].contextScore).toBe(0); + }); + describe('real-world Angular Material radio button scenario', () => { test('should pick radio "외부" over button "외부 사용자 유형 도움말"', () => { const results = cascadeFilter(nodes, 'radio', '외부');