diff --git a/src/dom/dom-serializer.ts b/src/dom/dom-serializer.ts index d58bd4835..74c1c39ce 100644 --- a/src/dom/dom-serializer.ts +++ b/src/dom/dom-serializer.ts @@ -92,6 +92,20 @@ const CONTAINER_TAGS = new Set([ 'div', 'section', 'article', 'main', 'aside', 'header', 'footer', 'nav', 'span', ]); +const INTERACTIVE_HINT_SCAN_MAX_MS = 100; +const INTERACTIVE_HINT_SCAN_MAX_ELEMENTS = 2500; + +interface CustomInteractiveHint { + path: string; + hints: string; +} + +interface CursorInteractiveScanResult { + completed: boolean; + inspected: number; + hints: CustomInteractiveHint[]; +} + /** * Parse flat attributes array into a map */ @@ -115,25 +129,30 @@ function escapeAttributeValue(value: string): string { /** * Check if a node is interactive */ -function isInteractive(tagName: string, attrMap: Map): boolean { +function isNativeInteractive(tagName: string, attrMap: Map): boolean { if (INTERACTIVE_TAGS.has(tagName)) return true; const role = attrMap.get('role'); if (role && INTERACTIVE_ROLES.has(role)) return true; return false; } +function isInteractive(tagName: string, attrMap: Map, customHints?: string): boolean { + return Boolean(customHints) || isNativeInteractive(tagName, attrMap); +} + /** * Check if a DOM node or any descendant contains interactive elements. * Prevents sibling dedup from collapsing groups with clickable elements. */ -function containsInteractive(node: DOMNode): boolean { +function containsInteractive(node: DOMNode, path: string, ctx: SerializeContext): boolean { if (node.nodeType !== NODE_TYPE_ELEMENT) return false; const tag = (node.localName || node.nodeName).toLowerCase(); const attrMap = parseAttributes(node.attributes); - if (isInteractive(tag, attrMap)) return true; + if (isInteractive(tag, attrMap, ctx.customInteractiveHints.get(path))) return true; if (node.children) { + const childPaths = createChildPathMap(node.children, path); for (const child of node.children) { - if (containsInteractive(child)) return true; + if (containsInteractive(child, childPaths.get(child) ?? path, ctx)) return true; } } return false; @@ -164,6 +183,7 @@ function formatElement( indent: string, textContent: string, interactive: boolean, + hints?: string, ): string { const tagName = node.localName || node.nodeName.toLowerCase(); @@ -176,7 +196,9 @@ function formatElement( } const attrStr = attrParts.length > 0 ? ' ' + attrParts.join(' ') : ''; - const interactiveMarker = interactive ? ' ★' : ''; + const interactiveMarker = interactive + ? ` ★${hints ? ` [${hints}]` : ''}` + : ''; const line = `${indent}[${node.backendNodeId}]<${tagName}${attrStr}/>${textContent}${interactiveMarker}`; return line; } @@ -188,13 +210,13 @@ function formatElement( * - Has no meaningful text content * - Is NOT interactive */ -function isCollapsibleContainer(node: DOMNode): boolean { +function isCollapsibleContainer(node: DOMNode, path: string, ctx: SerializeContext): boolean { if (!node.children) return false; const tagName = (node.localName || node.nodeName).toLowerCase(); if (!CONTAINER_TAGS.has(tagName)) return false; const attrMap = parseAttributes(node.attributes); - if (isInteractive(tagName, attrMap)) return false; + if (isInteractive(tagName, attrMap, ctx.customInteractiveHints.get(path))) return false; const text = getDirectTextContent(node); if (text.length > 0) return false; @@ -207,9 +229,10 @@ function isCollapsibleContainer(node: DOMNode): boolean { * Collect a chain of single-child containers starting from node. * Returns the chain nodes and the leaf (first non-container or multi-child node). */ -function collectContainerChain(node: DOMNode): { chain: DOMNode[], leaf: DOMNode } { +function collectContainerChain(node: DOMNode, path: string, ctx: SerializeContext): { chain: DOMNode[], leaf: DOMNode, leafPath: string } { const chain: DOMNode[] = [node]; let current = node; + let currentPath = path; while (chain.length < MAX_CONTAINER_CHAIN) { const elementChildren = (current.children || []).filter( @@ -218,15 +241,18 @@ function collectContainerChain(node: DOMNode): { chain: DOMNode[], leaf: DOMNode if (elementChildren.length !== 1) break; const child = elementChildren[0]; + const childPaths = createChildPathMap(current.children || [], currentPath); + const childPath = childPaths.get(child) ?? currentPath; const childTag = (child.localName || child.nodeName).toLowerCase(); if (!CONTAINER_TAGS.has(childTag)) break; const childAttrMap = parseAttributes(child.attributes); - if (isInteractive(childTag, childAttrMap)) break; + if (isInteractive(childTag, childAttrMap, ctx.customInteractiveHints.get(childPath))) break; if (getDirectTextContent(child).length > 0) break; chain.push(child); current = child; + currentPath = childPath; } // The leaf is the deepest container's single element child, or the last container itself @@ -234,13 +260,15 @@ function collectContainerChain(node: DOMNode): { chain: DOMNode[], leaf: DOMNode c => c.nodeType === NODE_TYPE_ELEMENT && !SKIP_TAGS.has(c.nodeName.toUpperCase()) ); const leaf = lastChildren.length === 1 ? lastChildren[0] : current; + const lastChildPaths = createChildPathMap(current.children || [], currentPath); + const leafPath = lastChildPaths.get(leaf) ?? currentPath; // If leaf is same as last chain entry, the chain didn't find a true leaf if (leaf === current) { - return { chain: [], leaf: node }; // no collapse + return { chain: [], leaf: node, leafPath: path }; // no collapse } - return { chain, leaf }; + return { chain, leaf, leafPath }; } interface SiblingGroup { @@ -286,6 +314,7 @@ interface SerializeContext { includeUserAgentShadowDOM: boolean; nodesVisited: number; maxNodes: number; + customInteractiveHints: Map; /** * Tracks every backendNodeId emitted in the output so the caller can mint * a `[node_refs]` block for the #844 backend-node uid contract. Insertion @@ -294,6 +323,17 @@ interface SerializeContext { emittedBackendNodeIds: Set; } +function createChildPathMap(children: DOMNode[], parentPath: string): Map { + const paths = new Map(); + let elementIndex = 0; + for (const child of children) { + if (child.nodeType !== NODE_TYPE_ELEMENT) continue; + paths.set(child, `${parentPath}/c:${elementIndex}`); + elementIndex += 1; + } + return paths; +} + function appendTruncationMarker(ctx: SerializeContext): void { const truncationMsg = `\n\n[Output truncated at ${ctx.maxOutputChars} chars. Use depth parameter to limit scope.]`; ctx.lines.push(truncationMsg); @@ -318,6 +358,7 @@ function serializeNode( node: DOMNode, depth: number, ctx: SerializeContext, + path = 'd', ): void { if (ctx.truncated) return; @@ -334,8 +375,9 @@ function serializeNode( // Handle document node - just recurse into children if (node.nodeType === NODE_TYPE_DOCUMENT) { if (node.children) { + const childPaths = createChildPathMap(node.children, path); for (const child of node.children) { - serializeNode(child, depth, ctx); + serializeNode(child, depth, ctx, childPaths.get(child) ?? path); if (ctx.truncated) return; } } @@ -355,13 +397,14 @@ function serializeNode( const tagName = node.localName || node.nodeName.toLowerCase(); const attrMap = parseAttributes(node.attributes); - const interactive = isInteractive(tagName, attrMap); + const customHints = ctx.customInteractiveHints.get(path); + const interactive = isInteractive(tagName, attrMap, customHints); const indent = ' '.repeat(depth); // Container chain collapse (only in non-'none' compression mode, non-interactive containers) - if (ctx.compression !== 'none' && !interactive && isCollapsibleContainer(node)) { - const { chain, leaf } = collectContainerChain(node); + if (ctx.compression !== 'none' && !interactive && isCollapsibleContainer(node, path, ctx)) { + const { chain, leaf, leafPath } = collectContainerChain(node, path, ctx); if (chain.length >= 2) { // Build chain prefix: [10]div>[11]section>[12]div> const chainPrefix = chain.map(n => { @@ -372,9 +415,10 @@ function serializeNode( // Serialize the leaf: format its line but with chain prefix prepended const leafTag = leaf.localName || leaf.nodeName.toLowerCase(); const leafAttrMap = parseAttributes(leaf.attributes); - const leafInteractive = isInteractive(leafTag, leafAttrMap); + const leafHints = ctx.customInteractiveHints.get(leafPath); + const leafInteractive = isInteractive(leafTag, leafAttrMap, leafHints); const leafText = getDirectTextContent(leaf); - const leafLine = formatElement(leaf, leafAttrMap, '', leafText, leafInteractive); + const leafLine = formatElement(leaf, leafAttrMap, '', leafText, leafInteractive, leafHints); const fullLine = `${indent}${chainPrefix}${leafLine}\n`; if (ctx.totalChars + fullLine.length > ctx.maxOutputChars) { @@ -392,8 +436,9 @@ function serializeNode( // Recurse into leaf's children if (leaf.children) { + const childPaths = createChildPathMap(leaf.children, leafPath); for (const child of leaf.children) { - serializeNode(child, depth + 1, ctx); + serializeNode(child, depth + 1, ctx, childPaths.get(child) ?? leafPath); if (ctx.truncated) return; } } @@ -403,7 +448,7 @@ function serializeNode( if (!ctx.interactiveOnly || interactive) { const textContent = getDirectTextContent(node); - const line = formatElement(node, attrMap, indent, textContent, interactive); + const line = formatElement(node, attrMap, indent, textContent, interactive, customHints); const lineWithNewline = line + '\n'; if (ctx.totalChars + lineWithNewline.length > ctx.maxOutputChars) { @@ -427,7 +472,7 @@ function serializeNode( ctx.lines.push(separator); ctx.totalChars += separator.length; } - serializeNode(node.contentDocument, depth + 1, ctx); + serializeNode(node.contentDocument, depth + 1, ctx, `${path}/f`); return; // children are inside contentDocument } @@ -453,8 +498,10 @@ function serializeNode( // Shadow root children at depth+2 (inside shadow root boundary) if (shadowRoot.children) { + const shadowPath = `${path}/s:${node.shadowRoots.indexOf(shadowRoot)}`; + const childPaths = createChildPathMap(shadowRoot.children, shadowPath); for (const child of shadowRoot.children) { - serializeNode(child, depth + 2, ctx); + serializeNode(child, depth + 2, ctx, childPaths.get(child) ?? shadowPath); if (ctx.truncated) return; } } @@ -463,6 +510,7 @@ function serializeNode( // Recurse into children if (node.children && ctx.compression !== 'none') { + const childPaths = createChildPathMap(node.children, path); const groups = groupConsecutiveSiblings(node.children); const threshold = ctx.compression === 'aggressive' ? SIBLING_COLLAPSE_THRESHOLD_AGGRESSIVE @@ -473,13 +521,13 @@ function serializeNode( // Skip dedup for groups containing interactive elements to avoid // hiding clickable buttons/links/inputs from the LLM - const groupHasInteractive = group.nodes.some(n => containsInteractive(n)); + const groupHasInteractive = group.nodes.some(n => containsInteractive(n, childPaths.get(n) ?? path, ctx)); if (group.nodes.length >= threshold && !groupHasInteractive) { // Emit first SIBLING_SAMPLE_COUNT with full detail const samples = group.nodes.slice(0, SIBLING_SAMPLE_COUNT); for (const sampleNode of samples) { - serializeNode(sampleNode, depth + 1, ctx); + serializeNode(sampleNode, depth + 1, ctx, childPaths.get(sampleNode) ?? path); if (ctx.truncated) return; } @@ -502,12 +550,12 @@ function serializeNode( // Emit last node if not already shown if (group.nodes.length > SIBLING_SAMPLE_COUNT) { const lastNode = group.nodes[group.nodes.length - 1]; - serializeNode(lastNode, depth + 1, ctx); + serializeNode(lastNode, depth + 1, ctx, childPaths.get(lastNode) ?? path); } } else { // Small group — emit all normally for (const groupNode of group.nodes) { - serializeNode(groupNode, depth + 1, ctx); + serializeNode(groupNode, depth + 1, ctx, childPaths.get(groupNode) ?? path); if (ctx.truncated) return; } } @@ -522,13 +570,115 @@ function serializeNode( } } else if (node.children) { // Original behavior when compression is 'none' + const childPaths = createChildPathMap(node.children, path); for (const child of node.children) { - serializeNode(child, depth + 1, ctx); + serializeNode(child, depth + 1, ctx, childPaths.get(child) ?? path); if (ctx.truncated) return; } } } +async function scanCustomInteractiveElements(page: Page, pierceIframes: boolean): Promise { + // Bound the browser-side scan from inside the evaluated function. Racing + // page.evaluate with a timeout does not abort the in-page work. + return await page.evaluate((maxMs: number, maxElements: number, includeIframes: boolean) => { + const interactiveRoles = new Set([ + 'button', 'link', 'textbox', 'checkbox', 'radio', 'combobox', 'listbox', + 'menu', 'menuitem', 'tab', 'switch', 'slider', + ]); + const interactiveTags = new Set(['a', 'button', 'input', 'select', 'textarea', 'details', 'summary']); + type RootEntry = { root: Document | ShadowRoot; path: string }; + const roots: RootEntry[] = [{ root: document, path: 'd' }]; + const now = () => (typeof performance !== 'undefined' && typeof performance.now === 'function') + ? performance.now() + : Date.now(); + const deadline = now() + maxMs; + let inspected = 0; + let budgetExceeded = false; + const hintsByPath: Array<{ path: string; hints: string }> = []; + + for (let i = 0; i < roots.length; i++) { + const { root, path: rootPath } = roots[i]; + const walker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT); + let current = walker.nextNode(); + const paths = new WeakMap(); + const childIndexes = new WeakMap(); + while (current) { + inspected += 1; + if (inspected > maxElements || now() > deadline) { + budgetExceeded = true; + break; + } + + const el = current as HTMLElement; + const parent = el.parentNode; + const siblingIndex = childIndexes.get(parent as Node) ?? 0; + childIndexes.set(parent as Node, siblingIndex + 1); + const parentPath = parent === root + ? rootPath + : (paths.get(parent as Node) ?? rootPath); + const path = `${parentPath}/c:${siblingIndex}`; + paths.set(el, path); + + if (el.shadowRoot) roots.push({ root: el.shadowRoot, path: `${path}/s:0` }); + if (includeIframes && el.tagName.toLowerCase() === 'iframe') { + try { + const frame = el as HTMLIFrameElement; + if (frame.contentDocument) roots.push({ root: frame.contentDocument, path: `${path}/f` }); + } catch { + // Cross-origin frames are represented by CDP when possible; page + // script cannot inspect them, so custom hints are best-effort. + } + } + current = walker.nextNode(); + + if (el.closest('[hidden], [aria-hidden="true"]')) continue; + + const tag = el.tagName.toLowerCase(); + if (interactiveTags.has(tag)) continue; + const role = el.getAttribute('role'); + if (role && interactiveRoles.has(role.toLowerCase())) continue; + + const style = getComputedStyle(el); + const hasCursorPointer = style.cursor === 'pointer'; + const hasOnClick = el.hasAttribute('onclick') || typeof el.onclick === 'function'; + const tabIndex = el.getAttribute('tabindex'); + const hasTabIndex = tabIndex !== null && tabIndex !== '-1'; + const editable = el.getAttribute('contenteditable'); + const isEditable = el.isContentEditable || editable === '' || editable === 'true' || editable === 'plaintext-only'; + + if (!hasCursorPointer && !hasOnClick && !hasTabIndex && !isEditable) continue; + if (hasCursorPointer && !hasOnClick && !hasTabIndex && !isEditable) { + const parent = el.parentElement; + if (parent && getComputedStyle(parent).cursor === 'pointer') continue; + } + const rect = el.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) continue; + + const hints: string[] = []; + if (hasCursorPointer) hints.push('cursor:pointer'); + if (hasOnClick) hints.push('onclick'); + if (hasTabIndex) hints.push('tabindex'); + if (isEditable) hints.push('contenteditable'); + + const hiddenInput = el.querySelector('input[type="radio"], input[type="checkbox"]') as HTMLInputElement | null; + if (hiddenInput) { + const inputStyle = getComputedStyle(hiddenInput); + const hidden = inputStyle.display === 'none' || inputStyle.visibility === 'hidden' || hiddenInput.hidden; + if (hidden) { + hints.push(`${hiddenInput.type}:${hiddenInput.indeterminate ? 'mixed' : String(hiddenInput.checked)}`); + } + } + + hintsByPath.push({ path, hints: hints.join(', ') }); + } + if (budgetExceeded) break; + } + + return { completed: !budgetExceeded, inspected, hints: hintsByPath }; + }, INTERACTIVE_HINT_SCAN_MAX_MS, INTERACTIVE_HINT_SCAN_MAX_ELEMENTS, pierceIframes); +} + /** * Serialize a page's DOM into a compact text representation */ @@ -571,6 +721,21 @@ export async function serializeDOM( 'serializeDOM:pageStats', ) as PageStats; + let customInteractiveHints = new Map(); + if (interactiveOnly) { + try { + const scanResult = await scanCustomInteractiveElements(page, pierceIframes); + if (scanResult.completed) { + customInteractiveHints = new Map(scanResult.hints.map(({ path, hints }) => [path, hints])); + } + } catch { + // Cursor/onclick hint discovery is opportunistic. Large or hostile pages + // should still serialize using native tags and ARIA roles if this pre-scan + // times out or throws. + customInteractiveHints = new Map(); + } + } + // Get DOM tree via CDP. Always pierce at the CDP layer so shadowRoots are // present; ctx.pierceIframes below controls whether iframe contentDocument // subtrees are emitted. When callers request bounded output depth, avoid @@ -601,6 +766,7 @@ export async function serializeDOM( includeUserAgentShadowDOM, nodesVisited: 0, maxNodes: DEFAULT_MAX_SERIALIZER_NODES, + customInteractiveHints, emittedBackendNodeIds: new Set(), }; diff --git a/tests/dom/dom-serializer-cursor-budget.test.ts b/tests/dom/dom-serializer-cursor-budget.test.ts new file mode 100644 index 000000000..95747f46a --- /dev/null +++ b/tests/dom/dom-serializer-cursor-budget.test.ts @@ -0,0 +1,251 @@ +/// + +import { serializeDOM } from '../../src/dom/dom-serializer'; + +function createMockElement(overrides: Record = {}) { + const initialAttributes = new Map(Object.entries( + (overrides.attributes as Record | undefined) ?? {}, + )); + const element: any = { + attributes: initialAttributes, + removedAttributes: [] as string[], + tagName: 'DIV', + shadowRoot: null, + onclick: null, + parentElement: null, + isContentEditable: false, + computedStyle: { cursor: 'default', display: 'block', visibility: 'visible' }, + closest: jest.fn(() => null), + getAttribute: jest.fn((name: string) => initialAttributes.get(name) ?? null), + hasAttribute: jest.fn((name: string) => initialAttributes.has(name)), + setAttribute: jest.fn((name: string, value: string) => { initialAttributes.set(name, value); }), + removeAttribute: jest.fn((name: string) => { + initialAttributes.delete(name); + element.removedAttributes.push(name); + }), + querySelector: jest.fn(() => null), + getBoundingClientRect: jest.fn(() => ({ width: 10, height: 10 })), + ...overrides, + }; + element.attributes = initialAttributes; + return element; +} + +async function withMockTreeWalker(elements: any[], run: () => void | Promise) { + const previousDocument = (global as any).document; + const previousNodeFilter = (global as any).NodeFilter; + const previousPerformance = (global as any).performance; + const previousGetComputedStyle = (global as any).getComputedStyle; + + const documentRoot: any = { nodes: elements }; + for (const element of elements) { + if (!element.parentNode) element.parentNode = documentRoot; + } + + (global as any).document = { + createTreeWalker: (root: { nodes?: any[] }) => { + const nodes = root.nodes ?? []; + let index = -1; + return { + nextNode: () => nodes[++index] ?? null, + }; + }, + querySelectorAll: () => elements, + nodes: elements, + }; + (global as any).NodeFilter = { SHOW_ELEMENT: 1 }; + (global as any).performance = { now: jest.fn(() => 0) }; + (global as any).getComputedStyle = jest.fn((el: any) => el.computedStyle ?? { + cursor: 'default', + display: 'block', + visibility: 'visible', + }); + + try { + return await run(); + } finally { + (global as any).document = previousDocument; + (global as any).NodeFilter = previousNodeFilter; + (global as any).performance = previousPerformance; + (global as any).getComputedStyle = previousGetComputedStyle; + } +} + +function createStats() { + return { + url: 'https://example.com', + title: 'Test Page', + scrollX: 0, + scrollY: 0, + scrollWidth: 1920, + scrollHeight: 3000, + viewportWidth: 1920, + viewportHeight: 1080, + }; +} + +function createDoc(children: any[]) { + return { + nodeId: 1, backendNodeId: 1, nodeType: 9, nodeName: '#document', localName: '', + children: [{ + nodeId: 2, backendNodeId: 2, nodeType: 1, nodeName: 'BODY', localName: 'body', + attributes: [], + children, + }], + }; +} + +describe('DOM serializer cursor hint scan budget', () => { + it('stops cursor hint discovery at the node cap and keeps native interactive fallback', async () => { + const nativeInteractiveDoc = { + nodeId: 1, backendNodeId: 1, nodeType: 9, nodeName: '#document', localName: '', + children: [{ + nodeId: 2, backendNodeId: 2, nodeType: 1, nodeName: 'BODY', localName: 'body', + attributes: [], + children: [{ + nodeId: 3, backendNodeId: 930, nodeType: 1, nodeName: 'BUTTON', localName: 'button', + attributes: [], + children: [], + }], + }], + }; + const page = { + evaluate: jest.fn() + .mockResolvedValueOnce(createStats()) + .mockImplementationOnce(async ( + fn: Function, + maxMs: number, + maxElements: number, + includeIframes: boolean, + ) => { + const elements = Array.from({ length: maxElements + 5 }, () => createMockElement({ + computedStyle: { cursor: 'pointer', display: 'block', visibility: 'visible' }, + })); + const result = await withMockTreeWalker(elements, () => fn(maxMs, maxElements, includeIframes)); + expect(elements[maxElements - 1].setAttribute).not.toHaveBeenCalled(); + expect(elements[maxElements].setAttribute).not.toHaveBeenCalled(); + return result; + }), + }; + const cdpClient = { + send: jest.fn().mockResolvedValue({ root: nativeInteractiveDoc }), + }; + + const result = await serializeDOM(page as never, cdpClient as never, { interactiveOnly: true }); + + expect(result.content).toContain('[930]