diff --git a/README.md b/README.md index 3cb01c20..dcf0cf9e 100644 --- a/README.md +++ b/README.md @@ -401,13 +401,13 @@ read_page tabId="tab1" mode="dom" [page_stats] url: https://example.com | title: Example | scroll: 0,0 | viewport: 1920x1080 -[142] ★ -[156]Search ★ -[289]Home ★ +# [142] ★ +$ [156]Search ★ +@ [289]Home ★ [352]
Welcome to Example ``` -DOM mode outputs `[backendNodeId]` as stable identifiers — they persist for the lifetime of the DOM node, unlike `ref_N` IDs which are cleared on each AX-mode `read_page` call. +DOM mode outputs `[backendNodeId]` as stable identifiers — they persist for the lifetime of the DOM node, unlike `ref_N` IDs which are cleared on each AX-mode `read_page` call. A compact marker before an identifier describes the action affordance: `#` text input, `@` link, `$` button/control, `%` visual target. The marker is display metadata only; pass the identifier itself (`142`, `node_142`, or `ref_N`) to action tools. ### JavaScript and Shadow DOM diff --git a/src/dom/dom-serializer.ts b/src/dom/dom-serializer.ts index 861cda4a..6a8ae7f3 100644 --- a/src/dom/dom-serializer.ts +++ b/src/dom/dom-serializer.ts @@ -5,6 +5,7 @@ import type { Page } from 'puppeteer-core'; import { MAX_OUTPUT_CHARS, DEFAULT_MAX_SERIALIZER_NODES } from '../config/defaults'; import { withTimeout } from '../utils/with-timeout'; +import { formatAffordancePrefix } from '../utils/element-affordance'; export interface DOMSerializerOptions { maxDepth?: number; // default: -1 (unlimited) @@ -169,7 +170,14 @@ function formatElement( const attrStr = attrParts.length > 0 ? ' ' + attrParts.join(' ') : ''; const interactiveMarker = interactive ? 
' ★' : ''; - const line = `${indent}[${node.backendNodeId}]<${tagName}${attrStr}/>${textContent}${interactiveMarker}`; + const affordancePrefix = formatAffordancePrefix({ + tagName, + role: attrMap.get('role'), + type: attrMap.get('type'), + href: attrMap.get('href'), + contentEditable: attrMap.get('contenteditable'), + }); + const line = `${indent}${affordancePrefix}[${node.backendNodeId}]<${tagName}${attrStr}/>${textContent}${interactiveMarker}`; return line; } diff --git a/src/tools/find.ts b/src/tools/find.ts index bc31aa87..7f1fe73b 100644 --- a/src/tools/find.ts +++ b/src/tools/find.ts @@ -14,6 +14,7 @@ import { getCircuitBreaker } from '../utils/ralph/circuit-breaker'; import { analyzeScreenshot, formatElementMapAsText } from '../vision/screenshot-analyzer'; import { getVisionMode, trackVisionUsage } from '../vision/config'; import { detectVisionHints, formatVisionHints } from '../vision/auto-detect'; +import { formatAffordancePrefix } from '../utils/element-affordance'; const definition: MCPToolDefinition = { name: 'find', @@ -114,7 +115,7 @@ const handler: ToolHandler = async ( ); const scoreLabel = el.matchLevel === 1 ? '\u2605\u2605\u2605' : el.matchLevel === 2 ? '\u2605\u2605' : '\u2605'; axOutput.push( - `[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel} [AX]` + `${formatAffordancePrefix({ role: el.role })}[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel} [AX]` ); } @@ -187,7 +188,7 @@ const handler: ToolHandler = async ( // Include score in output for transparency const scoreLabel = el.score >= 100 ? '★★★' : el.score >= 50 ? '★★' : el.score >= 20 ? 
'★' : ''; output.push( - `[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel}`.trim() + `${formatAffordancePrefix({ role: el.role, tagName: el.tagName, type: el.type })}[${refId}] ${el.role}: "${el.name}" at (${Math.round(el.rect.x)}, ${Math.round(el.rect.y)}) ${scoreLabel}`.trim() ); } }
diff --git a/src/utils/element-affordance.ts b/src/utils/element-affordance.ts
new file mode 100644
index 00000000..8c06d720
--- /dev/null
+++ b/src/utils/element-affordance.ts
@@ -0,0 +1,161 @@
+/**
+ * Compact action-affordance classification for perception output.
+ *
+ * The returned marker is display metadata only. It must be rendered outside
+ * canonical refs/backendNodeIds so existing ref parsers keep working.
+ */
+export type ElementAffordance =
+  | 'text-input'
+  | 'link'
+  | 'control'
+  | 'visual'
+  | 'text';
+
+/** Single-character marker shown before an identifier; '' means no marker. */
+export type AffordanceMarker = '#' | '@' | '$' | '%' | '';
+
+/**
+ * Element facts consulted by the classifier. Every field is optional, and
+ * string fields are trimmed and lower-cased before they are compared.
+ */
+export interface ElementAffordanceInput {
+  tagName?: string | null;
+  role?: string | null;
+  type?: string | null;
+  /** Accepted from callers, but the classifier does not read it. */
+  href?: string | null;
+  /** A boolean, or the raw value of the `contenteditable` attribute. */
+  contentEditable?: boolean | string | null;
+}
+
+const TEXT_INPUT_TYPES = new Set([
+  'text',
+  'password',
+  'email',
+  'search',
+  'url',
+  'tel',
+  'number',
+]);
+
+const TEXT_INPUT_ROLES = new Set([
+  'textbox',
+  'searchbox',
+]);
+
+const LINK_ROLES = new Set([
+  'link',
+]);
+
+const CONTROL_ROLES = new Set([
+  'button',
+  'checkbox',
+  'radio',
+  'combobox',
+  'listbox',
+  'menu',
+  'menuitem',
+  'menuitemcheckbox',
+  'menuitemradio',
+  'option',
+  'tab',
+  'switch',
+  'slider',
+  'spinbutton',
+  'treeitem',
+]);
+
+const VISUAL_ROLES = new Set([
+  'image',
+  'img',
+  'graphics-symbol',
+]);
+
+/** Trims and lower-cases a value; null and undefined become ''. */
+function normalize(value: string | null | undefined): string {
+  return (value ?? '').trim().toLowerCase();
+}
+
+/**
+ * Reports whether a `contentEditable` value marks the element as editable.
+ *
+ * A missing value (null/undefined) is not editable. A bare attribute such as
+ * `<div contenteditable>` arrives as the empty string and is editable, the
+ * same as 'true'; 'plaintext-only' is editable too. Anything else is not.
+ */
+function isContentEditable(value: ElementAffordanceInput['contentEditable']): boolean {
+  if (value == null) return false;
+  if (typeof value === 'boolean') return value;
+  // Checked before normalizing so only an exactly empty value counts as bare.
+  if (value === '') return true;
+  const state = normalize(String(value));
+  return state === 'true' || state === 'plaintext-only';
+}
+
+/**
+ * Classifies an element by the kind of action it affords.
+ *
+ * Checks run in a fixed order and the first match wins: editable content or a
+ * text-entry role, `<textarea>`, `<input>` (by type), links, controls, then
+ * visual targets. Everything else is plain 'text'.
+ */
+export function classifyElementAffordance(input: ElementAffordanceInput): ElementAffordance {
+  const tagName = normalize(input.tagName);
+  const role = normalize(input.role);
+  const type = normalize(input.type);
+
+  if (isContentEditable(input.contentEditable) || TEXT_INPUT_ROLES.has(role)) {
+    return 'text-input';
+  }
+
+  if (tagName === 'textarea') {
+    return 'text-input';
+  }
+
+  if (tagName === 'input') {
+    // A missing or empty type is treated as a text field.
+    if (!type || TEXT_INPUT_TYPES.has(type)) return 'text-input';
+    // Hidden inputs classify as plain text, which carries no marker.
+    if (type === 'hidden') return 'text';
+    return 'control';
+  }
+
+  if (tagName === 'a' || LINK_ROLES.has(role)) {
+    return 'link';
+  }
+
+  if (tagName === 'button' || tagName === 'select' || tagName === 'details' || CONTROL_ROLES.has(role)) {
+    return 'control';
+  }
+
+  if (tagName === 'img' || tagName === 'canvas' || tagName === 'video' || tagName === 'svg' || VISUAL_ROLES.has(role)) {
+    return 'visual';
+  }
+
+  return 'text';
+}
+
+/** Maps an affordance kind to its display marker; 'text' maps to ''. */
+export function affordanceMarkerFor(kind: ElementAffordance): AffordanceMarker {
+  switch (kind) {
+    case 'text-input': return '#';
+    case 'link': return '@';
+    case 'control': return '$';
+    case 'visual': return '%';
+    case 'text': return '';
+  }
+}
+
+/** Classifies the element and returns its marker in one step. */
+export function getAffordanceMarker(input: ElementAffordanceInput): AffordanceMarker {
+  return affordanceMarkerFor(classifyElementAffordance(input));
+}
+
+/**
+ * Returns the marker followed by one space, ready to prepend to an
+ * identifier, or '' when the element has no marker.
+ */
+export function formatAffordancePrefix(input: ElementAffordanceInput): string {
+  const marker = getAffordanceMarker(input);
+  return marker ? `${marker} ` : '';
+}
diff --git a/tests/dom/dom-serializer.test.ts b/tests/dom/dom-serializer.test.ts index 963a4f21..3b1d3bf8 100644 --- a/tests/dom/dom-serializer.test.ts +++ b/tests/dom/dom-serializer.test.ts @@ -374,6 +374,33 @@ describe('DOM Serializer', () => { expect(result.content).toContain('[801]'); // button }); + test('renders affordance markers outside backendNodeId tokens', async () => { + const affordanceDoc = { + nodeId: 1, backendNodeId: 1, nodeType: 9, nodeName: '#document', localName: '', + children: [{ + nodeId: 2, backendNodeId: 2, nodeType: 1, nodeName: 'BODY', localName: 'body', + attributes: [], + children: [ + { nodeId: 3, backendNodeId: 810, nodeType: 1, nodeName: 'INPUT', localName: 'input', attributes: ['type', 'search'], children: [] }, + { nodeId: 4, backendNodeId: 811, nodeType: 1, nodeName: 'A', localName: 'a', attributes: ['href', '/docs'], children: [] }, + { nodeId: 5, backendNodeId: 812, nodeType: 1, nodeName: 'BUTTON', localName: 'button', attributes: [], children: [] }, + { nodeId: 6, backendNodeId: 813, nodeType: 1, nodeName: 'IMG', localName: 'img', attributes: ['alt', 'Logo'], children: [] }, + ], + }], + }; + + const page = createMockPageForDOM(); + const cdpClient = createMockCDPClientForDOM(affordanceDoc); + + const result = await serializeDOM(page as never, cdpClient as never, { includePageStats: false }); + + expect(result.content).toContain('# [810] { const roleDoc = { nodeId: 1, backendNodeId: 1, nodeType: 9, nodeName: '#document', localName: '', diff --git a/tests/dom/shadow-dom-serializer.test.ts b/tests/dom/shadow-dom-serializer.test.ts index 1b228dc8..94900b2a 100644 --- a/tests/dom/shadow-dom-serializer.test.ts +++ b/tests/dom/shadow-dom-serializer.test.ts @@ -163,7 +163,7 @@ describe('DOM Serializer - Shadow DOM', () => { // shadow children at depth+2 = depth 4 = 8 spaces const buttonLine = lines.find(l => l.includes('[2100]