Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,13 +401,13 @@ read_page tabId="tab1" mode="dom"

[page_stats] url: https://example.com | title: Example | scroll: 0,0 | viewport: 1920x1080

[142]<input type="search" placeholder="Search..." aria-label="Search"/> ★
[156]<button type="submit"/>Search ★
[289]<a href="/home"/>Home ★
# [142]<input type="search" placeholder="Search..." aria-label="Search"/> ★
$ [156]<button type="submit"/>Search ★
@ [289]<a href="/home"/>Home ★
[352]<h1/>Welcome to Example
```

DOM mode outputs `[backendNodeId]` as stable identifiers — they persist for the lifetime of the DOM node, unlike `ref_N` IDs which are cleared on each AX-mode `read_page` call.
DOM mode outputs `[backendNodeId]` as stable identifiers — they persist for the lifetime of the DOM node, unlike `ref_N` IDs which are cleared on each AX-mode `read_page` call. A compact marker before an identifier describes the action affordance: `#` text input, `@` link, `$` button/control, `%` visual target. The marker is display metadata only; pass the identifier itself (`142`, `node_142`, or `ref_N`) to action tools.

### JavaScript and Shadow DOM

Expand Down
10 changes: 9 additions & 1 deletion src/dom/dom-serializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import type { Page } from 'puppeteer-core';
import { MAX_OUTPUT_CHARS, DEFAULT_MAX_SERIALIZER_NODES } from '../config/defaults';
import { withTimeout } from '../utils/with-timeout';
import { formatAffordancePrefix } from '../utils/element-affordance';

export interface DOMSerializerOptions {
maxDepth?: number; // default: -1 (unlimited)
Expand Down Expand Up @@ -169,7 +170,14 @@ function formatElement(
const attrStr = attrParts.length > 0 ? ' ' + attrParts.join(' ') : '';

const interactiveMarker = interactive ? ' ★' : '';
const line = `${indent}[${node.backendNodeId}]<${tagName}${attrStr}/>${textContent}${interactiveMarker}`;
const affordancePrefix = formatAffordancePrefix({
tagName,
role: attrMap.get('role'),
type: attrMap.get('type'),
href: attrMap.get('href'),
contentEditable: attrMap.get('contenteditable'),
});
const line = `${indent}${affordancePrefix}[${node.backendNodeId}]<${tagName}${attrStr}/>${textContent}${interactiveMarker}`;
return line;
}

Expand Down
5 changes: 3 additions & 2 deletions src/tools/find.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { getCircuitBreaker } from '../utils/ralph/circuit-breaker';
import { analyzeScreenshot, formatElementMapAsText } from '../vision/screenshot-analyzer';
import { getVisionMode, trackVisionUsage } from '../vision/config';
import { detectVisionHints, formatVisionHints } from '../vision/auto-detect';
import { formatAffordancePrefix } from '../utils/element-affordance';

const definition: MCPToolDefinition = {
name: 'find',
Expand Down Expand Up @@ -114,7 +115,7 @@ const handler: ToolHandler = async (
);
const scoreLabel = el.matchLevel === 1 ? '\u2605\u2605\u2605' : el.matchLevel === 2 ? '\u2605\u2605' : '\u2605';
axOutput.push(
`[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel} [AX]`
`${formatAffordancePrefix({ role: el.role })}[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel} [AX]`
);
}

Expand Down Expand Up @@ -187,7 +188,7 @@ const handler: ToolHandler = async (
// Include score in output for transparency
const scoreLabel = el.score >= 100 ? '★★★' : el.score >= 50 ? '★★' : el.score >= 20 ? '★' : '';
output.push(
`[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel}`.trim()
`${formatAffordancePrefix({ role: el.role, tagName: el.tagName, type: el.type })}[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel}`.trim()
);
}
}
Expand Down
126 changes: 126 additions & 0 deletions src/utils/element-affordance.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/**
* Compact action-affordance classification for perception output.
*
* The returned marker is display metadata only. It must be rendered outside
* canonical refs/backendNodeIds so existing ref parsers keep working.
*/
export type ElementAffordance =
| 'text-input' // accepts typed text; rendered with the '#' marker
| 'link' // navigational target; rendered with the '@' marker
| 'control' // button-like or stateful widget; rendered with the '$' marker
| 'visual' // image/canvas/video-style target; rendered with the '%' marker
| 'text'; // fallback for everything else; rendered with no marker

/** Single-character marker emitted for an affordance; `''` means "no marker". */
export type AffordanceMarker = '#' | '@' | '$' | '%' | '';

/**
 * Loose description of an element as seen by the classifier. Every field is
 * optional so callers can pass whatever subset they have (for example only
 * `role`).
 */
export interface ElementAffordanceInput {
// Element tag name; trimmed and lower-cased before comparison.
tagName?: string | null;
// Role string; trimmed and lower-cased before comparison.
role?: string | null;
// `type` attribute; only consulted when the tag is `input`.
type?: string | null;
// Accepted for caller convenience; the classifier in this file does not read it.
href?: string | null;
// Either a boolean flag or the raw `contenteditable` attribute string.
contentEditable?: boolean | string | null;
}

// `<input type>` values that are classified as 'text-input'. Any other
// non-empty type (except 'hidden') makes an <input> a 'control'.
const TEXT_INPUT_TYPES = new Set([
'text',
'password',
'email',
'search',
'url',
'tel',
'number',
]);

// Roles that are classified as 'text-input' regardless of tag name.
const TEXT_INPUT_ROLES = new Set([
'textbox',
'searchbox',
]);

// Roles that are classified as 'link'.
const LINK_ROLES = new Set([
'link',
]);

// Roles that are classified as 'control'.
const CONTROL_ROLES = new Set([
'button',
'checkbox',
'radio',
'combobox',
'listbox',
'menu',
'menuitem',
'menuitemcheckbox',
'menuitemradio',
'option',
'tab',
'switch',
'slider',
'spinbutton',
'treeitem',
]);

// Roles that are classified as 'visual'. Both the 'image' and 'img'
// spellings are accepted.
const VISUAL_ROLES = new Set([
'image',
'img',
'graphics-symbol',
]);

/**
 * Canonicalises an optional attribute or role string for set lookups.
 *
 * @param value - Raw text; `null` and `undefined` are treated as empty.
 * @returns The text with surrounding whitespace stripped and lower-cased,
 *          or `''` when no value was supplied.
 */
function normalize(value: string | null | undefined): string {
  if (value == null) {
    return '';
  }
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}

/**
 * Reports whether a `contentEditable` input value marks the element as editable.
 *
 * Accepted forms:
 * - boolean `true`
 * - the attribute strings `"true"` and `"plaintext-only"` (case-insensitive,
 *   surrounding whitespace ignored)
 * - the empty string, which is how a bare `<div contenteditable>` attribute
 *   is represented and which HTML treats the same as `"true"`
 *
 * `null` / `undefined` mean the attribute is absent and are never editable,
 * so the empty-string case must be distinguished from a missing value.
 *
 * @param value - Boolean flag or raw attribute string, if any.
 * @returns `true` when the value denotes an editable element.
 */
function isContentEditable(value: ElementAffordanceInput['contentEditable']): boolean {
  if (typeof value === 'boolean') {
    return value;
  }
  if (value == null) {
    // Attribute not present at all: not editable.
    return false;
  }
  const state = value.trim().toLowerCase();
  return state === '' || state === 'true' || state === 'plaintext-only';
}
Comment on lines +71 to +73
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Treat empty contenteditable as editable

isContentEditable currently returns true only for boolean true, 'true', or 'plaintext-only', so elements rendered as <div contenteditable> (empty attribute value) are classified as non-editable text. In HTML, an empty contenteditable value is treated as editable, so DOM/find outputs will miss the # marker for valid text-entry targets in that common markup pattern.

Useful? React with 👍 / 👎.


/**
 * Classifies an element into a single action affordance.
 *
 * Rules are evaluated in order and the first match wins:
 * 1. editable content, or a `textbox` / `searchbox` role -> `'text-input'`
 * 2. `<textarea>` -> `'text-input'`
 * 3. `<input>`: missing or text-like `type` -> `'text-input'`;
 *    `type="hidden"` -> `'text'`; any other type -> `'control'`
 * 4. an explicit control role (button, checkbox, tab, ...) -> `'control'`
 * 5. `<a>` or a `link` role -> `'link'`
 * 6. `<button>`, `<select>`, `<details>` -> `'control'`
 * 7. `<img>`, `<canvas>`, `<video>`, `<svg>` or a visual role -> `'visual'`
 * 8. anything else -> `'text'`
 *
 * Rule 4 is deliberately placed before the anchor fallback so that markup
 * such as `<a role="button">` is reported by its declared role (control)
 * rather than by its tag (link).
 *
 * @param input - Whatever subset of tag/role/type/contentEditable is known.
 * @returns The affordance kind for the element.
 */
export function classifyElementAffordance(input: ElementAffordanceInput): ElementAffordance {
  const tagName = normalize(input.tagName);
  const role = normalize(input.role);
  const type = normalize(input.type);

  if (isContentEditable(input.contentEditable) || TEXT_INPUT_ROLES.has(role)) {
    return 'text-input';
  }

  if (tagName === 'textarea') {
    return 'text-input';
  }

  if (tagName === 'input') {
    if (!type || TEXT_INPUT_TYPES.has(type)) return 'text-input';
    if (type === 'hidden') return 'text';
    return 'control';
  }

  // An explicit control role overrides the tag-based anchor fallback below.
  if (CONTROL_ROLES.has(role)) {
    return 'control';
  }

  if (tagName === 'a' || LINK_ROLES.has(role)) {
    return 'link';
  }

  if (tagName === 'button' || tagName === 'select' || tagName === 'details') {
    return 'control';
  }

  if (tagName === 'img' || tagName === 'canvas' || tagName === 'video' || tagName === 'svg' || VISUAL_ROLES.has(role)) {
    return 'visual';
  }

  return 'text';
}

/** Display marker for each affordance kind; `'text'` maps to no marker. */
const MARKER_BY_AFFORDANCE: Record<ElementAffordance, AffordanceMarker> = {
  'text-input': '#',
  link: '@',
  control: '$',
  visual: '%',
  text: '',
};

/**
 * Maps an affordance kind to its one-character display marker.
 *
 * @param kind - Affordance classification.
 * @returns `'#'`, `'@'`, `'$'`, `'%'`, or `''` for plain text.
 */
export function affordanceMarkerFor(kind: ElementAffordance): AffordanceMarker {
  return MARKER_BY_AFFORDANCE[kind];
}

/**
 * Classifies an element and returns the marker for the resulting affordance.
 *
 * @param input - Element description to classify.
 * @returns The marker character, or `''` when the element is plain text.
 */
export function getAffordanceMarker(input: ElementAffordanceInput): AffordanceMarker {
  const kind = classifyElementAffordance(input);
  return affordanceMarkerFor(kind);
}

/**
 * Builds the text to place in front of an element identifier.
 *
 * @param input - Element description to classify.
 * @returns The marker followed by a single space, or `''` when the element
 *          has no marker, so callers can concatenate unconditionally.
 */
export function formatAffordancePrefix(input: ElementAffordanceInput): string {
  const marker = getAffordanceMarker(input);
  if (!marker) {
    return '';
  }
  return `${marker} `;
}
27 changes: 27 additions & 0 deletions tests/dom/dom-serializer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,33 @@ describe('DOM Serializer', () => {
expect(result.content).toContain('[801]'); // button
});

test('renders affordance markers outside backendNodeId tokens', async () => {
const affordanceDoc = {
nodeId: 1, backendNodeId: 1, nodeType: 9, nodeName: '#document', localName: '',
children: [{
nodeId: 2, backendNodeId: 2, nodeType: 1, nodeName: 'BODY', localName: 'body',
attributes: [],
children: [
{ nodeId: 3, backendNodeId: 810, nodeType: 1, nodeName: 'INPUT', localName: 'input', attributes: ['type', 'search'], children: [] },
{ nodeId: 4, backendNodeId: 811, nodeType: 1, nodeName: 'A', localName: 'a', attributes: ['href', '/docs'], children: [] },
{ nodeId: 5, backendNodeId: 812, nodeType: 1, nodeName: 'BUTTON', localName: 'button', attributes: [], children: [] },
{ nodeId: 6, backendNodeId: 813, nodeType: 1, nodeName: 'IMG', localName: 'img', attributes: ['alt', 'Logo'], children: [] },
],
}],
};

const page = createMockPageForDOM();
const cdpClient = createMockCDPClientForDOM(affordanceDoc);

const result = await serializeDOM(page as never, cdpClient as never, { includePageStats: false });

expect(result.content).toContain('# [810]<input');
expect(result.content).toContain('@ [811]<a');
expect(result.content).toContain('$ [812]<button');
expect(result.content).toContain('% [813]<img');
expect(result.content).not.toContain('[#810]');
});

test('includes role-based interactive elements', async () => {
const roleDoc = {
nodeId: 1, backendNodeId: 1, nodeType: 9, nodeName: '#document', localName: '',
Expand Down
2 changes: 1 addition & 1 deletion tests/dom/shadow-dom-serializer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ describe('DOM Serializer - Shadow DOM', () => {
// shadow children at depth+2 = depth 4 = 8 spaces
const buttonLine = lines.find(l => l.includes('[2100]<button'));
expect(buttonLine).toBeDefined();
expect(buttonLine!.startsWith(' [2100]<button')).toBe(true);
expect(buttonLine!).toMatch(/^ (?:\$ )?\[2100\]<button/);
});

// 2. Closed shadow root rendering
Expand Down
37 changes: 37 additions & 0 deletions tests/tools/find.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ describe('FindTool', () => {
let mockRefIdManager: ReturnType<typeof createMockRefIdManager>;
let testSessionId: string;
let testTargetId: string;
let mockAXMatches: unknown[];

const getFindHandler = async () => {
jest.resetModules();
Expand All @@ -36,6 +37,11 @@ describe('FindTool', () => {
jest.doMock('../../src/utils/ref-id-manager', () => ({
getRefIdManager: () => mockRefIdManager,
}));
jest.doMock('../../src/utils/ax-element-resolver', () => ({
resolveElementsByAXTree: jest.fn().mockResolvedValue(mockAXMatches),
invalidateAXCache: jest.fn(),
clearAXCache: jest.fn(),
}));

const { registerFindTool } = await import('../../src/tools/find');

Expand All @@ -56,6 +62,8 @@ describe('FindTool', () => {
(getSessionManager as jest.Mock).mockReturnValue(mockSessionManager);
(getRefIdManager as jest.Mock).mockReturnValue(mockRefIdManager);

mockAXMatches = [];

testSessionId = 'test-session-123';
const { targetId, page } = await mockSessionManager.createTarget(testSessionId, 'about:blank');
testTargetId = targetId;
Expand Down Expand Up @@ -98,6 +106,7 @@ describe('FindTool', () => {

expect(page.evaluate).toHaveBeenCalled();
expect(result.content[0].text).toContain('Found');
expect(result.content[0].text).toContain('$ [ref_');
});

test('finds link by keyword', async () => {
Expand Down Expand Up @@ -127,6 +136,7 @@ describe('FindTool', () => {
}) as { content: Array<{ type: string; text: string }> };

expect(result.content[0].text).toContain('link');
expect(result.content[0].text).toContain('@ [ref_');
});

test('finds input by keyword', async () => {
Expand Down Expand Up @@ -157,6 +167,7 @@ describe('FindTool', () => {
}) as { content: Array<{ type: string; text: string }> };

expect(result.content[0].text).toContain('textbox');
expect(result.content[0].text).toContain('# [ref_');
});

test('finds checkbox by keyword', async () => {
Expand Down Expand Up @@ -187,6 +198,7 @@ describe('FindTool', () => {
}) as { content: Array<{ type: string; text: string }> };

expect(result.content[0].text).toContain('checkbox');
expect(result.content[0].text).toContain('$ [ref_');
});

test('finds element by text content', async () => {
Expand Down Expand Up @@ -239,6 +251,31 @@ describe('FindTool', () => {
});
});


describe('AX affordance markers', () => {
test('places marker outside canonical ref for AX-first results', async () => {
mockAXMatches = [{
backendDOMNodeId: 88001,
role: 'link',
name: 'Docs',
matchLevel: 1,
rect: { x: 12, y: 34, width: 100, height: 20 },
properties: {},
source: 'ax',
}];

const handler = await getFindHandler();
const result = await handler(testSessionId, {
tabId: testTargetId,
query: 'Docs link',
}) as { content: Array<{ type: string; text: string }> };

expect(result.content[0].text).toContain('[via AX tree]');
expect(result.content[0].text).toContain('@ [ref_');
expect(result.content[0].text).not.toContain('[@ref_');
});
});

describe('Result Limiting', () => {
test('returns max 20 elements', async () => {
const handler = await getFindHandler();
Expand Down
32 changes: 32 additions & 0 deletions tests/utils/element-affordance.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/// <reference types="jest" />

import { classifyElementAffordance, formatAffordancePrefix, getAffordanceMarker } from '../../src/utils/element-affordance';

// Unit tests for the affordance classifier and its marker/prefix helpers.
describe('element affordance classifier', () => {
// Each row: [classifier input, expected affordance kind, expected rendered prefix].
test.each([
[{ tagName: 'input', type: 'text' }, 'text-input', '# '],
[{ tagName: 'input', type: 'search' }, 'text-input', '# '],
[{ tagName: 'textarea' }, 'text-input', '# '],
[{ role: 'textbox' }, 'text-input', '# '],
[{ tagName: 'div', contentEditable: 'true' }, 'text-input', '# '],
[{ tagName: 'a', href: '/home' }, 'link', '@ '],
[{ role: 'link' }, 'link', '@ '],
[{ tagName: 'button' }, 'control', '$ '],
[{ tagName: 'input', type: 'checkbox' }, 'control', '$ '],
[{ role: 'combobox' }, 'control', '$ '],
[{ tagName: 'img' }, 'visual', '% '],
[{ role: 'image' }, 'visual', '% '],
[{ tagName: 'p' }, 'text', ''],
])('classifies %o as %s', (input, expectedKind, expectedPrefix) => {
// Both the classification and the formatted prefix must agree with the row.
expect(classifyElementAffordance(input)).toBe(expectedKind);
expect(formatAffordancePrefix(input)).toBe(expectedPrefix);
});

// Hidden inputs fall through to 'text', which has an empty marker.
test('does not mark hidden inputs as actionable', () => {
expect(getAffordanceMarker({ tagName: 'input', type: 'hidden' })).toBe('');
});

// Password inputs are classified from the type alone; no field value is passed in.
test('treats password fields as text-insertable markers without exposing values', () => {
expect(getAffordanceMarker({ tagName: 'input', type: 'password' })).toBe('#');
});
});
Loading