Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 28 additions & 20 deletions src/tools/act.ts
Original file line number Diff line number Diff line change
Expand Up @@ -329,11 +329,12 @@ async function resolveElement(
page: Parameters<typeof resolveElementsByAXTree>[0],
cdpClient: Parameters<typeof resolveElementsByAXTree>[1],
query: string,
context?: ToolContext
context?: ToolContext,
contextHint?: string
): Promise<AXResolvedElement | null> {
try {
const matches = await withTimeout(
resolveElementsByAXTree(page, cdpClient, normalizeQuery(query), { useCenter: true, maxResults: 3 }),
resolveElementsByAXTree(page, cdpClient, normalizeQuery(query), { useCenter: true, maxResults: 3, contextHint }),
8000,
'ax-resolution',
context
Expand Down Expand Up @@ -377,14 +378,15 @@ async function executeClick(
parsedAction: ParsedAction,
stepIndex: number,
isStealth: boolean,
context?: ToolContext
context?: ToolContext,
contextHint?: string
): Promise<StepResult> {
const target = parsedAction.target;
if (!target) {
return { step: stepIndex, action: 'click', outcome: 'ELEMENT_NOT_FOUND', error: 'No target specified for click' };
}

const el = await resolveElement(page, cdpClient, target, context);
const el = await resolveElement(page, cdpClient, target, context, contextHint);
if (!el) {
return { step: stepIndex, action: 'click', target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${target}"` };
}
Expand Down Expand Up @@ -415,7 +417,8 @@ async function executeType(
parsedAction: ParsedAction,
stepIndex: number,
isStealth: boolean,
context?: ToolContext
context?: ToolContext,
contextHint?: string
): Promise<StepResult> {
const value = parsedAction.value;
if (!value) {
Expand All @@ -424,7 +427,7 @@ async function executeType(

// If a target is specified, find and focus it
if (parsedAction.target) {
const el = await resolveElement(page, cdpClient, parsedAction.target, context);
const el = await resolveElement(page, cdpClient, parsedAction.target, context, contextHint);
if (!el) {
return { step: stepIndex, action: 'type', target: parsedAction.target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${parsedAction.target}"` };
}
Expand Down Expand Up @@ -463,14 +466,15 @@ async function executeSelect(
tabId: string,
parsedAction: ParsedAction,
stepIndex: number,
context?: ToolContext
context?: ToolContext,
contextHint?: string
): Promise<StepResult> {
const query = parsedAction.target || parsedAction.value;
if (!query) {
return { step: stepIndex, action: 'select', outcome: 'EXCEPTION', error: 'No target specified for select' };
}

const el = await resolveElement(page, cdpClient, query, context);
const el = await resolveElement(page, cdpClient, query, context, contextHint);
if (!el) {
return { step: stepIndex, action: 'select', target: query, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${query}"` };
}
Expand Down Expand Up @@ -514,14 +518,15 @@ async function executeHover(
cdpClient: any,
parsedAction: ParsedAction,
stepIndex: number,
context?: ToolContext
context?: ToolContext,
contextHint?: string
): Promise<StepResult> {
const target = parsedAction.target;
if (!target) {
return { step: stepIndex, action: 'hover', outcome: 'EXCEPTION', error: 'No target specified for hover' };
}

const el = await resolveElement(page, cdpClient, target, context);
const el = await resolveElement(page, cdpClient, target, context, contextHint);
if (!el) {
return { step: stepIndex, action: 'hover', target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${target}"` };
}
Expand All @@ -538,10 +543,11 @@ async function executeScroll(
cdpClient: any,
parsedAction: ParsedAction,
stepIndex: number,
context?: ToolContext
context?: ToolContext,
contextHint?: string
): Promise<StepResult> {
if (parsedAction.target) {
const el = await resolveElement(page, cdpClient, parsedAction.target, context);
const el = await resolveElement(page, cdpClient, parsedAction.target, context, contextHint);
if (!el) {
return { step: stepIndex, action: 'scroll', target: parsedAction.target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${parsedAction.target}"` };
}
Expand Down Expand Up @@ -617,14 +623,15 @@ async function executeCheckUncheck(
parsedAction: ParsedAction,
stepIndex: number,
isStealth: boolean,
context?: ToolContext
context?: ToolContext,
contextHint?: string
): Promise<StepResult> {
const target = parsedAction.target;
if (!target) {
return { step: stepIndex, action: parsedAction.action, outcome: 'EXCEPTION', error: `No target specified for ${parsedAction.action}` };
}

const el = await resolveElement(page, cdpClient, target, context);
const el = await resolveElement(page, cdpClient, target, context, contextHint);
if (!el) {
return { step: stepIndex, action: parsedAction.action, target, outcome: 'ELEMENT_NOT_FOUND', error: `Could not find "${target}"` };
}
Expand Down Expand Up @@ -673,6 +680,7 @@ const handler: ToolHandler = async (
const verifyMode = coerceVerifyMode(args.verify);
const verifyTextSummary =
args.verify === undefined ? true : verifyMode !== 'none';
const actionContext = typeof args.context === 'string' ? args.context.slice(0, 240) : undefined;
const timeoutMs = Math.min(Math.max((args.timeout as number) || 30000, 1000), 120000);
const variables = normalizeVariables(args.variables);
const missingVariables = findMissingVariables(instruction || '', variables);
Expand Down Expand Up @@ -775,19 +783,19 @@ const handler: ToolHandler = async (
try {
switch (parsedAction.action) {
case 'click':
result = await executeClick(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context);
result = await executeClick(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context, actionContext);
break;
case 'type':
result = await executeType(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context);
result = await executeType(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context, actionContext);
break;
case 'select':
result = await executeSelect(page, cdpClient, sessionId, tabId, parsedAction, i + 1, context);
result = await executeSelect(page, cdpClient, sessionId, tabId, parsedAction, i + 1, context, actionContext);
break;
case 'hover':
result = await executeHover(page, cdpClient, parsedAction, i + 1, context);
result = await executeHover(page, cdpClient, parsedAction, i + 1, context, actionContext);
break;
case 'scroll':
result = await executeScroll(page, cdpClient, parsedAction, i + 1, context);
result = await executeScroll(page, cdpClient, parsedAction, i + 1, context, actionContext);
break;
case 'wait':
result = await executeWait(page, parsedAction, i + 1, context);
Expand All @@ -797,7 +805,7 @@ const handler: ToolHandler = async (
break;
case 'check':
case 'uncheck':
result = await executeCheckUncheck(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context);
result = await executeCheckUncheck(page, cdpClient, sessionId, tabId, parsedAction, i + 1, isStealth, context, actionContext);
break;
default:
result = { step: i + 1, action: parsedAction.action, outcome: 'EXCEPTION', error: `Unknown action: ${parsedAction.action}` };
Expand Down
77 changes: 68 additions & 9 deletions src/utils/ax-element-resolver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ export interface AXNodeFlat {
name: string;
value?: string;
properties: Record<string, unknown>;
/** Bounded surrounding text/attributes used only for optional context tie-breaks. */
contextText?: string;
/** True when the element intersects the viewport, when known. */
inViewport?: boolean;
}

/**
Expand Down Expand Up @@ -66,6 +70,8 @@ export interface AXResolveOptions {
useCenter?: boolean;
maxResults?: number;
depth?: number;
/** Optional caller-supplied context, e.g. act.context. Used only within the best cascade level. */
contextHint?: string;
}

/** Parsed query with optional role hint */
Expand Down Expand Up @@ -196,7 +202,8 @@ export function cascadeFilter(
roleHint: string | null,
nameHint: string,
maxResults: number = 5,
): Array<{ node: AXNodeFlat; matchLevel: MatchLevel }> {
contextHint?: string,
): Array<{ node: AXNodeFlat; matchLevel: MatchLevel; contextScore: number }> {
// Pre-filter: remove disabled and non-interactive nodes
const candidates = nodes.filter(n =>
n.properties['disabled'] !== true &&
Expand All @@ -208,38 +215,89 @@ export function cascadeFilter(

const eq = (nodeName: string) => nodeName.normalize('NFC').toLowerCase().trim() === nameLower;
const includes = (nodeName: string) => nodeName.normalize('NFC').toLowerCase().trim().includes(nameLower);
const rank = (levelNodes: AXNodeFlat[], matchLevel: MatchLevel) =>
rankByContext(levelNodes, contextHint)
.slice(0, maxResults)
.map(({ node, contextScore }) => ({ node, matchLevel, contextScore }));

// Level 1: exact role + exact name
if (roleHint) {
const level1 = candidates.filter(n => n.role.toLowerCase() === roleHint && eq(n.name));
if (level1.length > 0) {
return level1.slice(0, maxResults).map(node => ({ node, matchLevel: 1 as MatchLevel }));
return rank(level1, 1);
}
}

// Level 2: exact role + name contains
if (roleHint) {
const level2 = candidates.filter(n => n.role.toLowerCase() === roleHint && includes(n.name));
if (level2.length > 0) {
return level2.slice(0, maxResults).map(node => ({ node, matchLevel: 2 as MatchLevel }));
return rank(level2, 2);
}
}

// Level 3: exact name (any interactive role)
const level3 = candidates.filter(n => eq(n.name));
if (level3.length > 0) {
return level3.slice(0, maxResults).map(node => ({ node, matchLevel: 3 as MatchLevel }));
return rank(level3, 3);
}

// Level 4: name contains (any interactive role)
const level4 = candidates.filter(n => includes(n.name));
if (level4.length > 0) {
return level4.slice(0, maxResults).map(node => ({ node, matchLevel: 4 as MatchLevel }));
return rank(level4, 4);
}

return [];
}


/**
 * Splits a caller-supplied context hint into lowercase search tokens.
 *
 * The text is NFC-normalized, lowercased, and split on every run of characters
 * that is not a Unicode letter, a Unicode number, `_`, or `-`. A piece is kept
 * only when it:
 *  - is at least two UTF-16 code units long,
 *  - contains at least one letter or number (so separator residue such as
 *    `--` or `__` never becomes a token), and
 *  - has not been seen before (a repeated word must not be counted twice by
 *    callers that score one point per token, nor use up the token budget).
 *
 * @param text Free-form hint; `undefined` or an empty string yields `[]`.
 * @returns At most 12 distinct tokens, in order of first appearance.
 */
function tokenizeContext(text?: string): string[] {
  const maxTokens = 12;
  // Hoisted so the regex is built once per call rather than once per piece.
  const hasLetterOrNumber = /[\p{L}\p{N}]/u;

  const pieces = (text ?? '')
    .normalize('NFC')
    .toLowerCase()
    .split(/[^\p{L}\p{N}_-]+/u);

  // A Set keeps insertion order, which gives "first appearance" ordering for free.
  const unique = new Set<string>();
  for (const piece of pieces) {
    if (unique.size >= maxTokens) break;
    if (piece.length < 2) continue;
    if (!hasLetterOrNumber.test(piece)) continue;
    unique.add(piece);
  }
  return [...unique];
}

/**
 * Scores how well the text surrounding a node matches a context hint.
 *
 * The hint is tokenized with `tokenizeContext`; each token found as a
 * substring of the node's lowercase, NFC-normalized context text adds 1.
 * The context text is `node.contextText` plus the string-valued entries of
 * `node.properties` under `description`, `container`, `formLabel`,
 * `data-testid`, `aria-label`, `title`, `placeholder`, `name` and `id`.
 *
 * A node with `inViewport === true` gets an extra 0.1. That is smaller than a
 * single token match, so it can only break ties between equal text scores.
 * The bonus is applied whether or not the node has any context text, so
 * two on-screen nodes are treated alike when neither matches the hint.
 *
 * @param node Flattened accessibility node to score.
 * @param contextHint Optional caller-supplied hint.
 * @returns 0 when the hint yields no tokens; otherwise the number of matched
 *   tokens plus the viewport bonus.
 */
function scoreNodeContext(node: AXNodeFlat, contextHint?: string): number {
  const tokens = tokenizeContext(contextHint);
  // Without a usable hint every node must score 0 so callers keep input order.
  if (tokens.length === 0) return 0;

  const haystack = [
    node.contextText,
    node.properties['description'],
    node.properties['container'],
    node.properties['formLabel'],
    node.properties['data-testid'],
    node.properties['aria-label'],
    node.properties['title'],
    node.properties['placeholder'],
    node.properties['name'],
    node.properties['id'],
  ]
    .filter((v): v is string => typeof v === 'string')
    .join(' ')
    .normalize('NFC')
    .toLowerCase();

  // An empty haystack simply matches no token; no early return, so the
  // viewport bonus below is applied uniformly.
  let score = 0;
  for (const token of tokens) {
    if (haystack.includes(token)) score += 1;
  }
  if (node.inViewport === true) score += 0.1;
  return score;
}

/**
 * Orders nodes by how well they match the context hint.
 *
 * Every node is scored with `scoreNodeContext`; higher scores come first and
 * nodes with equal scores keep their relative input order. The input array is
 * not mutated.
 *
 * @param nodes Candidate nodes, in their current order.
 * @param contextHint Optional caller-supplied hint forwarded to the scorer.
 * @returns One entry per input node with its score (each entry also carries
 *   the node's original `index` at runtime).
 */
function rankByContext(nodes: AXNodeFlat[], contextHint?: string): Array<{ node: AXNodeFlat; contextScore: number }> {
  const scored = nodes.map((node, index) => {
    const contextScore = scoreNodeContext(node, contextHint);
    return { node, index, contextScore };
  });

  scored.sort((left, right) => {
    const byScore = right.contextScore - left.contextScore;
    // Fall back to original position when the scores are equal.
    return byScore || (left.index - right.index);
  });

  return scored;
}

// ─── AX Tree Cache ───

const AX_CACHE_TTL_MS = 2000;
Expand Down Expand Up @@ -288,6 +346,7 @@ export async function getCachedAXTree(
name: node.name?.value || '',
value: node.value?.value,
properties: props,
contextText: typeof props['description'] === 'string' ? String(props['description']).slice(0, 240) : undefined,
});
Comment on lines 348 to 350
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Set inViewport before applying context tie-breaks

scoreNodeContext adds a viewport bonus, but flattened AX nodes never set inViewport, so every candidate gets the same viewport signal. In context-qualified queries with repeated labels, this lets hidden/offscreen matches outrank visible ones purely by text; because cascadeFilter trims to maxResults before DOM.getBoxModel drops zero-size nodes, visible elements can be excluded and the action can fail with ELEMENT_NOT_FOUND even when a valid on-screen target exists.

Useful? React with 👍 / 👎.

}

Expand Down Expand Up @@ -322,7 +381,7 @@ export async function resolveElementsByAXTree(
query: string,
options?: AXResolveOptions,
): Promise<AXResolvedElement[]> {
const { useCenter = true, maxResults = 5, depth = -1 } = options || {};
const { useCenter = true, maxResults = 5, depth = -1, contextHint } = options || {};

// 1. Parse query
const parsed = parseQueryForAX(query);
Expand All @@ -332,12 +391,12 @@ export async function resolveElementsByAXTree(
if (nodes.length === 0) return [];

// 3. Cascading filter
const matches = cascadeFilter(nodes, parsed.roleHint, parsed.nameHint, maxResults);
const matches = cascadeFilter(nodes, parsed.roleHint, parsed.nameHint, maxResults, contextHint);
if (matches.length === 0) return [];

// 4. Resolve coordinates for matches
const resolved: AXResolvedElement[] = [];
for (const { node, matchLevel } of matches) {
for (const { node, matchLevel, contextScore } of matches) {
if (resolved.length >= maxResults) break;

try {
Expand Down Expand Up @@ -367,7 +426,7 @@ export async function resolveElementsByAXTree(
width,
height,
},
properties: node.properties,
properties: { ...node.properties, ...(contextScore > 0 ? { contextScore } : {}) },
source: 'ax',
});
} catch {
Expand Down
41 changes: 41 additions & 0 deletions tests/utils/ax-element-resolver.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,47 @@ describe('AX Element Resolver', () => {
expect(results.every(r => r.matchLevel === 1)).toBe(true);
});


test('context hint reorders repeated labels within the same match level', () => {
  // Three identically named buttons that differ only in their description.
  const descriptions = [
    'Budget keyboard card $29',
    'Noise Cancelling Headphones card $199',
    'USB-C hub card $49',
  ];
  const candidates = descriptions.map(description => makeNode('button', 'Add to cart', { description }));

  const ranked = cascadeFilter(candidates, 'button', 'Add to cart', 3, 'Noise Cancelling Headphones');
  const [top] = ranked;

  // All three stay in the result; only their order changes.
  expect(ranked).toHaveLength(3);
  expect(top.node.properties.description).toContain('Headphones');
  expect(top.contextScore).toBeGreaterThan(0);
  for (const match of ranked) {
    expect(match.matchLevel).toBe(1);
  }
});

test('context hint does not cross cascade levels or overpower a stricter role/name match', () => {
  // The button is an exact role + name match; the link only contains the name
  // but is the one whose description matches the hint.
  const exactButton = makeNode('button', 'Submit');
  const looserLink = makeNode('link', 'Submit shipping details', { description: 'Checkout payment form' });

  const ranked = cascadeFilter([exactButton, looserLink], 'button', 'Submit', 3, 'Checkout payment form');

  expect(ranked).toHaveLength(1);
  const [only] = ranked;
  expect(only.node.role).toBe('button');
  expect(only.matchLevel).toBe(1);
});

test('without context hint repeated labels preserve current candidate order', () => {
  const headerLogin = makeNode('button', 'Login', { description: 'Header navigation' });
  const footerLogin = makeNode('button', 'Login', { description: 'Footer links' });

  // No hint is passed, so the first candidate in input order must stay first.
  const [first] = cascadeFilter([headerLogin, footerLogin], 'button', 'Login');

  expect(first.node.properties.description).toBe('Header navigation');
  expect(first.contextScore).toBe(0);
});

describe('real-world Angular Material radio button scenario', () => {
test('should pick radio "외부" over button "외부 사용자 유형 도움말"', () => {
const results = cascadeFilter(nodes, 'radio', '외부');
Expand Down
Loading