Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/extraction/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ export {
buildOpenGraphExtractor,
buildCssHeuristicExtractor,
buildMultipleItemExtractor,
buildStandardDomExtractor,
} from './strategies';
export type { StrategyResult } from './strategies';
export { buildExtractionPlan, buildFieldPlan, isSafeSelectorToken } from './plan';
export type { ExtractionPlan, ExtractionFieldPlan } from './plan';
export { EXTRACTION_MODES, EXTRACTION_MODE_BUDGETS, parseExtractionMode } from './mode';
export type { ExtractionMode, ExtractionModeBudget } from './mode';
42 changes: 42 additions & 0 deletions src/extraction/mode.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/** Names of the supported extraction modes; declared `as const` so the union type below can be derived from the values. */
export const EXTRACTION_MODES = ['fast', 'standard'] as const;
/** Union of the literal mode names listed in `EXTRACTION_MODES`. */
export type ExtractionMode = (typeof EXTRACTION_MODES)[number];

/**
 * Time and node-count limits that apply to one extraction mode.
 * All fields are read-only; timeouts are expressed in milliseconds.
 */
export interface ExtractionModeBudget {
/** The mode these limits belong to. */
readonly mode: ExtractionMode;
/** Timeout, in milliseconds, for the JSON-LD strategy. */
readonly jsonLdTimeoutMs: number;
/** Timeout, in milliseconds, for the microdata strategy. */
readonly microdataTimeoutMs: number;
/** Timeout, in milliseconds, for the OpenGraph strategy. */
readonly openGraphTimeoutMs: number;
/** Timeout, in milliseconds, for the CSS heuristic strategy. */
readonly cssTimeoutMs: number;
/** Timeout, in milliseconds, for the standard DOM pass. */
readonly standardDomTimeoutMs: number;
/** Node cap for the CSS heuristic strategy. NOTE(review): no consumer of this field is visible next to this definition — confirm where it is enforced. */
readonly maxCssNodes: number;
/** Upper bound on the number of DOM nodes the standard DOM pass may inspect. */
readonly maxStandardDomNodes: number;
}

/**
 * Timeouts for the four strategies whose limits are the same in every mode.
 * Spread directly after `mode` below so each budget keeps the interface's key order.
 */
const SHARED_STRATEGY_TIMEOUTS = {
  jsonLdTimeoutMs: 5000,
  microdataTimeoutMs: 5000,
  openGraphTimeoutMs: 5000,
  cssTimeoutMs: 10000,
} as const;

/**
 * Budget for each extraction mode, keyed by mode name.
 *
 * The modes differ only in the standard DOM limits and the CSS node cap:
 * `fast` zeroes the standard DOM timeout and node cap, while `standard`
 * gives that pass its own timeout and node allowance and doubles the CSS node cap.
 */
export const EXTRACTION_MODE_BUDGETS: Record<ExtractionMode, ExtractionModeBudget> = {
  fast: {
    mode: 'fast',
    ...SHARED_STRATEGY_TIMEOUTS,
    standardDomTimeoutMs: 0,
    maxCssNodes: 500,
    maxStandardDomNodes: 0,
  },
  standard: {
    mode: 'standard',
    ...SHARED_STRATEGY_TIMEOUTS,
    standardDomTimeoutMs: 12000,
    maxCssNodes: 1000,
    maxStandardDomNodes: 2000,
  },
};

/**
 * Resolves a raw `mode` argument to an extraction mode.
 *
 * A missing value (`undefined`, `null`, or the empty string) selects `'fast'`.
 * Only the exact strings `'fast'` and `'standard'` are accepted; matching is
 * case-sensitive and no trimming is performed.
 *
 * @param value - Unvalidated input, typically taken straight from tool arguments.
 * @returns `{ ok: true, mode }` on success, or `{ ok: false, error }` with a
 *   message naming the accepted values.
 */
export function parseExtractionMode(value: unknown): { ok: true; mode: ExtractionMode } | { ok: false; error: string } {
  const isMissing = value === undefined || value === null || value === '';
  if (isMissing) {
    return { ok: true, mode: 'fast' };
  }
  if (value === 'fast') {
    return { ok: true, mode: 'fast' };
  }
  if (value === 'standard') {
    return { ok: true, mode: 'standard' };
  }
  return { ok: false, error: 'Invalid mode. Use "fast" or "standard".' };
}
9 changes: 9 additions & 0 deletions src/extraction/strategies.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,12 @@ export function buildMultipleItemExtractor(fields: FieldInput, schemaProps: Reco
const plans = normalisePlans(fields);
return `(function(fp,sp,scope){var root=scope?document.querySelector(scope):document.body;if(!root)root=document.body;function gt(el){if(!el)return null;if(el.tagName==='IMG')return el.src||el.getAttribute('data-src')||null;return(el.textContent?.trim()||'').slice(0,500)||null}function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}var cs=['[role="list"]','[class*="list"]','[class*="grid"]','[class*="results"]','[class*="items"]','[class*="products"]','[class*="cards"]','table tbody','ul','ol'];var items=[];for(var c=0;c<cs.length;c++){try{var cd=root.querySelector(cs[c]);if(!cd)continue;var ch=cd.children;if(ch.length<2)continue;var ft=ch[0].tagName,sc=0;for(var x=0;x<ch.length;x++)if(ch[x].tagName===ft)sc++;if(sc/ch.length>0.7){for(var y=0;y<Math.min(ch.length,50);y++)items.push(ch[y]);break}}catch(e){}}if(!items.length){var rows=root.querySelectorAll('tr');if(rows.length>1)for(var r=1;r<Math.min(rows.length,51);r++)items.push(rows[r])}if(!items.length)return[];var res=[];for(var i=0;i<items.length;i++){var it=items[i],ext={},has=false;for(var f2=0;f2<fp.length;f2++){var f=fp[f2],nf=norm(f.field+' '+(f.selectorTokens||[]).join(' '));var isI=nf.includes('image')||nf.includes('img')||nf.includes('photo');var isL=nf.includes('url')||nf.includes('link')||nf.includes('href');var v=null;if(isI){var img=it.querySelector('img');if(img)v=img.src||img.getAttribute('data-src')}else if(isL){var a=it.querySelector('a[href]');if(a)v=a.href}else{var tokens=f.selectorTokens&&f.selectorTokens.length?f.selectorTokens:[];for(var t=0;t<tokens.length&&!v;t++){var raw=tokens[t],fl=raw.toLowerCase(),fk=raw.replace(/([A-Z])/g,'-$1').toLowerCase().replace(/^-/,'');var ss=['[class*="'+fl+'"]','[class*="'+fk+'"]','[itemprop="'+fl+'"]','[data-testid*="'+fl+'"]'];for(var s=0;s<ss.length;s++){try{var el=it.querySelector(ss[s]);if(el){v=gt(el);break}}catch(e){}}}}if(!v){if(nf.includes('title')||nf.includes('name')||nf.includes('headline')){var 
h=it.querySelector('h1,h2,h3,h4,a');if(h)v=(h.textContent?.trim()||'').slice(0,200)}else if(nf.includes('price')||nf.includes('amount')){var pe=it.querySelector('[class*="price"],[data-price],[itemprop="price"]');if(pe)v=gt(pe)}else if(nf.includes('description')||nf.includes('snippet')||nf.includes('summary')){var de=it.querySelector('p,[class*="desc"],[class*="snippet"]');if(de)v=gt(de)}}ext[f.field]=v;if(v!==null)has=true}if(has)res.push(ext)}return res})(${JSON.stringify(plans)},${JSON.stringify(schemaProps)},${JSON.stringify(scopeSelector||null)})`;
}

/**
 * Builds the source text of a self-invoking script that looks for field values
 * by matching field names against labels, attributes and text in the DOM.
 *
 * What the generated script does when evaluated in a page:
 * - Uses `document.querySelector(scopeSelector)` as the root, falling back to
 *   `document.body` when no scope is given or nothing matches.
 * - Collects candidate elements (labels, headings, table cells, elements with
 *   `class`/`id`/`aria-label`/`title`/`data-testid`, and generic text containers)
 *   and keeps at most `maxNodes` of them; a falsy `maxNodes` falls back to 2000.
 * - For each field, derives aliases from the lower-cased name with `_`/`-` runs
 *   turned into spaces, plus the space-free form when the name has several words.
 * - Takes the FIRST candidate whose combined aria-label, title, data-testid,
 *   class, id and text content contains an alias, then resolves a value in this
 *   order: the element's own form value, the control associated with a
 *   `<label>`, the next sibling's text, the first form control /
 *   `[data-value]` / `[content]` element under the parent, and finally the
 *   element's own text. Later candidates are not tried for that field.
 * - Stores values whitespace-collapsed, trimmed and capped at 500 characters;
 *   empty results are not stored.
 *
 * All arguments are embedded with `JSON.stringify`.
 *
 * @param fieldNames - Field names to look for; also the keys of the returned object.
 * @param schemaProps - Serialised into the script, but the script body does not read it.
 * @param scopeSelector - Optional CSS selector restricting the search root.
 * @param maxNodes - Maximum number of candidate elements to inspect.
 * @returns Script source; evaluating it yields an object mapping found field names to string values.
 */
export function buildStandardDomExtractor(
fieldNames: string[],
schemaProps: Record<string, SchemaProperty>,
scopeSelector: string | undefined,
maxNodes: number
): string {
return `(function(fn,sp,scope,maxNodes){var r={};var root=scope?document.querySelector(scope):document.body;if(!root)root=document.body;function clean(v,cap){return (v||'').replace(/\\s+/g,' ').trim().slice(0,cap||500)}function text(el){if(!el)return'';if(el.tagName==='IMG')return el.src||el.getAttribute('data-src')||'';return clean(el.getAttribute('content')||el.getAttribute('aria-label')||el.getAttribute('title')||el.textContent||'',500)}function norm(v){return clean(String(v||'').toLowerCase().replace(/[_-]+/g,' '),200)}function labelFor(f){var fl=norm(f);var words=fl.split(' ').filter(Boolean);var out=[fl];if(words.length>1)out.push(words.join(''));return out}function set(f,v){if(r[f]!=null)return;if(v!==null&&v!==undefined&&clean(String(v),500)!=='')r[f]=clean(String(v),500)}var nodes=Array.prototype.slice.call(root.querySelectorAll('label,dt,th,strong,b,h1,h2,h3,h4,h5,h6,[aria-label],[title],[data-testid],[class],[id],p,li,td,div,span'),0,Math.max(0,maxNodes||2000));for(var i=0;i<fn.length;i++){var f=fn[i];if(r[f]!=null)continue;var aliases=labelFor(f);for(var n=0;n<nodes.length;n++){var el=nodes[n];var hay=norm([el.getAttribute('aria-label'),el.getAttribute('title'),el.getAttribute('data-testid'),el.getAttribute('class'),el.id,el.textContent].filter(Boolean).join(' '));var hit=false;for(var a=0;a<aliases.length;a++){if(aliases[a]&&hay.indexOf(aliases[a])!==-1){hit=true;break}}if(!hit)continue;var v='';if(el.matches&&el.matches('input,textarea,select'))v=el.value||el.getAttribute('value')||el.getAttribute('placeholder')||'';if(!v&&el.tagName==='LABEL'){var control=el.control;if(control)v=control.value||control.getAttribute('value')||control.getAttribute('placeholder')||''}if(!v){var next=el.nextElementSibling;if(next)v=text(next)}if(!v&&el.parentElement){var 
candidate=el.parentElement.querySelector('input,textarea,select,[data-value],[content]');if(candidate)v=candidate.value||candidate.getAttribute('data-value')||candidate.getAttribute('content')||text(candidate)}if(!v)v=text(el);set(f,v);break}}return r})(${JSON.stringify(fieldNames)},${JSON.stringify(schemaProps)},${JSON.stringify(scopeSelector||null)},${JSON.stringify(maxNodes)})`;
}
61 changes: 47 additions & 14 deletions src/tools/extract-data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@ import {
buildMicrodataExtractor,
buildOpenGraphExtractor,
buildCssHeuristicExtractor,
buildStandardDomExtractor,
buildMultipleItemExtractor,
parseExtractionMode,
EXTRACTION_MODE_BUDGETS,
} from '../extraction';
import type { ExtractionMode } from '../extraction';
import type { ExtractionSchema, SchemaProperty } from '../extraction';
import {
OUTPUT_MODE_SCHEMA_PROPERTIES,
Expand Down Expand Up @@ -80,11 +84,19 @@ const handler: ToolHandler = async (
args: Record<string, unknown>,
_context?: ToolContext
): Promise<MCPResult> => {
// Mode validation (#989): validate before any browser/session interaction.
const modeCheck = parseExtractionMode(args.mode);
if (!modeCheck.ok) {
return { content: [{ type: 'text', text: `Error: ${modeCheck.error}` }], isError: true };
}
const extractionMode = modeCheck.mode;
const budget = EXTRACTION_MODE_BUDGETS[extractionMode];

const tabId = args.tabId as string;
const schema = args.schema as ExtractionSchema;
const selector = args.selector as string | undefined;
const multiple = (args.multiple as boolean) ?? false;
const { mode, inlineLimit } = parseOutputMode(args);
const { mode: outputMode, inlineLimit } = parseOutputMode(args);
const waitForReady = args.waitForReady === true;
const readyTimeoutMs = typeof args.readyTimeoutMs === 'number' ? args.readyTimeoutMs : undefined;

Expand Down Expand Up @@ -149,14 +161,17 @@ const handler: ToolHandler = async (
selector: selector || 'auto', fieldCount: fieldNames.length, itemCount: validated.length,
}));

const multiplePayload = {
const multiplePayload: Record<string, unknown> = {
action: 'extract_data', url: pageUrl, multiple: true, items: validated, count: validated.length,
modeUsed: extractionMode,
...(readiness ? { readiness } : {}),
};
const multipleTextWithoutMetrics = JSON.stringify(multiplePayload);
multiplePayload.metrics = { mode: extractionMode, outputChars: multipleTextWithoutMetrics.length };
const multipleInlineResult: MCPResult = {
content: [{ type: 'text', text: JSON.stringify(multiplePayload) }],
};
return resolveOutputMode(mode, inlineLimit, multipleInlineResult, multiplePayload, 'extract_data');
return resolveOutputMode(outputMode, inlineLimit, multipleInlineResult, multiplePayload, 'extract_data');
}

// Single item — layered strategies
Expand All @@ -165,48 +180,62 @@ const handler: ToolHandler = async (

// Strategy 1: JSON-LD
try {
const r = await withTimeout(page.evaluate(buildJsonLdExtractor(fieldNames)) as Promise<Record<string, unknown>>, 5000, 'extract_data:jsonld');
const r = await withTimeout(page.evaluate(buildJsonLdExtractor(fieldNames)) as Promise<Record<string, unknown>>, budget.jsonLdTimeoutMs, 'extract_data:jsonld');
if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('json-ld'); }
} catch { /* non-fatal */ }

if (countFields(merged) >= fieldNames.length) {
const { result, validation } = validateAndCoerce(merged, schema);
return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, mode, inlineLimit, readiness);
return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, extractionMode, outputMode, inlineLimit, readiness);
}

// Strategy 2: Microdata
try {
const r = await withTimeout(page.evaluate(buildMicrodataExtractor(fieldNames)) as Promise<Record<string, unknown>>, 5000, 'extract_data:microdata');
const r = await withTimeout(page.evaluate(buildMicrodataExtractor(fieldNames)) as Promise<Record<string, unknown>>, budget.microdataTimeoutMs, 'extract_data:microdata');
if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('microdata'); }
} catch { /* non-fatal */ }

// Strategy 3: OpenGraph
try {
const r = await withTimeout(page.evaluate(buildOpenGraphExtractor(fieldNames)) as Promise<Record<string, unknown>>, 5000, 'extract_data:opengraph');
const r = await withTimeout(page.evaluate(buildOpenGraphExtractor(fieldNames)) as Promise<Record<string, unknown>>, budget.openGraphTimeoutMs, 'extract_data:opengraph');
if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('opengraph'); }
} catch { /* non-fatal */ }

if (countFields(merged) >= fieldNames.length) {
const { result, validation } = validateAndCoerce(merged, schema);
return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, mode, inlineLimit, readiness);
return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, extractionMode, outputMode, inlineLimit, readiness);
}

// Strategy 4: CSS heuristic
try {
const r = await withTimeout(page.evaluate(buildCssHeuristicExtractor(fieldNames, schemaProps, selector)) as Promise<Record<string, unknown>>, 10000, 'extract_data:css');
const r = await withTimeout(page.evaluate(buildCssHeuristicExtractor(fieldNames, schemaProps, selector)) as Promise<Record<string, unknown>>, budget.cssTimeoutMs, 'extract_data:css');
if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('css-heuristic'); }
} catch { /* non-fatal */ }

if (extractionMode === 'standard' && countFields(merged) < fieldNames.length) {
try {
const r = await withTimeout(
page.evaluate(buildStandardDomExtractor(fieldNames, schemaProps, selector, budget.maxStandardDomNodes)) as Promise<Record<string, unknown>>,
budget.standardDomTimeoutMs,
'extract_data:standard-dom'
);
if (r && typeof r === 'object') {
merged = mergeResults(merged, r);
if (countFields(r) > 0) strategies.push('standard-dom');
}
} catch { /* non-fatal */ }
}

const { result, validation } = validateAndCoerce(merged, schema);
return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, mode, inlineLimit, readiness);
return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, extractionMode, outputMode, inlineLimit, readiness);
} catch (error) {
return { content: [{ type: 'text', text: `Extraction error: ${error instanceof Error ? error.message : String(error)}` }], isError: true };
}
};

function buildResponse(
data: Record<string, unknown>, errors: string[], url: string,
strategies: string[], domain: string, fieldNames: string[]
strategies: string[], domain: string, fieldNames: string[], extractionMode: ExtractionMode,
): { inlineResult: MCPResult; payload: Record<string, unknown> } {
const fieldsFound = Object.entries(data).filter(([, v]) => v !== null && v !== undefined && v !== '').map(([k]) => k);
const fieldsMissing = fieldNames.filter(f => !fieldsFound.includes(f));
Expand All @@ -220,30 +249,34 @@ function buildResponse(

const payload: Record<string, unknown> = {
action: 'extract_data', url, data, fieldsFound: fieldsFound.length, fieldsTotal: fieldNames.length, strategies,
modeUsed: extractionMode,
};
if (fieldsMissing.length > 0) payload.fieldsMissing = fieldsMissing;
if (errors.length > 0) payload.validationErrors = errors;
if (fieldsFound.length === 0) {
payload.message = 'No data extracted. Try: (1) read_page to verify content, (2) provide a CSS selector, (3) wait_for before extracting.';
}

const textWithoutMetrics = JSON.stringify(payload);
payload.metrics = { mode: extractionMode, outputChars: textWithoutMetrics.length };
return { inlineResult: { content: [{ type: 'text', text: JSON.stringify(payload) }] }, payload };
}


async function buildResponseWithMode(
data: Record<string, unknown>, errors: string[], url: string,
strategies: string[], domain: string, fieldNames: string[],
mode: import('./_shared/output-mode').OutputMode, inlineLimit: number,
extractionMode: ExtractionMode, outputMode: import('./_shared/output-mode').OutputMode, inlineLimit: number,
readiness?: Awaited<ReturnType<typeof waitForPageReady>>,
): Promise<MCPResult> {
const { inlineResult, payload } = buildResponse(data, errors, url, strategies, domain, fieldNames);
const { inlineResult, payload } = buildResponse(data, errors, url, strategies, domain, fieldNames, extractionMode);
if (readiness) {
payload.readiness = readiness;
if (inlineResult.content?.[0]?.type === 'text') {
inlineResult.content[0].text = JSON.stringify(payload);
}
}
return resolveOutputMode(mode, inlineLimit, inlineResult, payload, 'extract_data');
return resolveOutputMode(outputMode, inlineLimit, inlineResult, payload, 'extract_data');
}

export const extractDataHandler = handler;
Expand Down
93 changes: 93 additions & 0 deletions tests/tools/extract-data-modes.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/// <reference types="jest" />

// Replace the session manager module so each test controls what `getSessionManager` returns.
jest.mock('../../src/session-manager', () => ({
getSessionManager: jest.fn(),
}));

// Replace domain memory: the domain lookup always yields 'example.test' and `record` is a mock function.
jest.mock('../../src/memory/domain-memory', () => ({
extractDomainFromUrl: jest.fn(() => 'example.test'),
getDomainMemory: jest.fn(() => ({ record: jest.fn() })),
}));

import { getSessionManager } from '../../src/session-manager';
import { parseExtractionMode, EXTRACTION_MODE_BUDGETS } from '../../src/extraction';
import { extractDataHandler } from '../../src/tools/extract-data';

// Two-field object schema shared by the handler tests; both fields are strings and both are required.
const schema = {
type: 'object' as const,
properties: {
title: { type: 'string' },
price: { type: 'string' },
},
required: ['title', 'price'],
};

/**
 * Decodes the JSON body carried in the first content item of a handler result.
 *
 * When the result has no content, or the first item has no text, the body is
 * treated as the empty object `{}`. Text that is not valid JSON makes
 * `JSON.parse` throw.
 */
function responseJson(result: Awaited<ReturnType<typeof extractDataHandler>>): Record<string, unknown> {
  const firstItem = result.content?.[0];
  const rawText = firstItem?.text ?? '{}';
  return JSON.parse(rawText) as Record<string, unknown>;
}

// Covers mode parsing, early rejection of invalid modes, and which extraction
// passes run in `fast` versus `standard` mode.
describe('extract_data modes', () => {
  beforeEach(() => {
    jest.clearAllMocks();
  });

  it('validates and defaults extraction mode to fast', () => {
    expect(parseExtractionMode(undefined)).toEqual({ ok: true, mode: 'fast' });
    expect(parseExtractionMode('fast')).toEqual({ ok: true, mode: 'fast' });
    expect(parseExtractionMode('standard')).toEqual({ ok: true, mode: 'standard' });
    expect(parseExtractionMode('deep')).toEqual({ ok: false, error: 'Invalid mode. Use "fast" or "standard".' });
    expect(EXTRACTION_MODE_BUDGETS.fast.maxStandardDomNodes).toBe(0);
    expect(EXTRACTION_MODE_BUDGETS.standard.maxStandardDomNodes).toBeGreaterThan(EXTRACTION_MODE_BUDGETS.fast.maxStandardDomNodes);
  });

  it('rejects invalid mode with an actionable error before touching the browser', async () => {
    const result = await extractDataHandler('s1', { tabId: 't1', schema, mode: 'deep' });

    expect(result.isError).toBe(true);
    expect(result.content?.[0]?.text).toContain('Invalid mode');
    expect(getSessionManager).not.toHaveBeenCalled();
  });

  it('uses fast mode by default and does not invoke the standard-only DOM pass', async () => {
    // Only the JSON-LD script yields data; every other strategy comes back empty.
    const evaluate = jest.fn(async (script: string) => {
      if (script.includes('application/ld+json')) return { title: 'Fast title' };
      return {};
    });
    (getSessionManager as jest.Mock).mockReturnValue({
      getPage: jest.fn(async () => ({ url: () => 'https://example.test/product', evaluate })),
    });

    const result = await extractDataHandler('s1', { tabId: 't1', schema });
    const body = responseJson(result);

    // Checked after the handler returns instead of inside the mock: the handler
    // treats a failing strategy evaluation as non-fatal and swallows the error,
    // so an expect() thrown from within `evaluate` could never fail this test.
    // `maxNodes` is the marker identifying the standard DOM script.
    const evaluatedScripts = evaluate.mock.calls.map(([script]) => script);
    expect(evaluatedScripts.some((script) => script.includes('maxNodes'))).toBe(false);

    expect(body.modeUsed).toBe('fast');
    expect((body.data as Record<string, unknown>).title).toBe('Fast title');
    expect(body.fieldsMissing).toEqual(['price']);
    expect(body.strategies).toEqual(['json-ld']);
    expect((body.metrics as Record<string, unknown>).mode).toBe('fast');
    // JSON-LD, microdata, OpenGraph and CSS heuristic: four evaluations, no fifth pass.
    expect(evaluate).toHaveBeenCalledTimes(4);
  });

  it('standard mode runs the broader DOM pass and can recover fields fast missed', async () => {
    // JSON-LD supplies the title; only the standard DOM script supplies the price.
    const evaluate = jest.fn(async (script: string) => {
      if (script.includes('application/ld+json')) return { title: 'Fast title' };
      if (script.includes('maxNodes')) return { price: '$19.99' };
      return {};
    });
    (getSessionManager as jest.Mock).mockReturnValue({
      getPage: jest.fn(async () => ({ url: () => 'https://example.test/product', evaluate })),
    });

    const result = await extractDataHandler('s1', { tabId: 't1', schema, mode: 'standard' });
    const body = responseJson(result);

    expect(body.modeUsed).toBe('standard');
    expect(body.fieldsMissing).toBeUndefined();
    expect((body.data as Record<string, unknown>).title).toBe('Fast title');
    expect((body.data as Record<string, unknown>).price).toBe('$19.99');
    expect(body.strategies).toEqual(['json-ld', 'standard-dom']);
    expect((body.metrics as Record<string, unknown>).outputChars).toBeGreaterThan(0);
    // The four fast-mode strategies plus the standard DOM pass.
    expect(evaluate).toHaveBeenCalledTimes(5);
  });
});
Loading