diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 0a1921f9..61e2a449 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -6,7 +6,8 @@ export { buildOpenGraphExtractor, buildCssHeuristicExtractor, buildMultipleItemExtractor, + buildStandardDomExtractor, } from './strategies'; export type { StrategyResult } from './strategies'; -export { buildExtractionPlan, buildFieldPlan, isSafeSelectorToken } from './plan'; -export type { ExtractionPlan, ExtractionFieldPlan } from './plan'; +export { EXTRACTION_MODES, EXTRACTION_MODE_BUDGETS, parseExtractionMode } from './mode'; +export type { ExtractionMode, ExtractionModeBudget } from './mode'; diff --git a/src/extraction/mode.ts b/src/extraction/mode.ts new file mode 100644 index 00000000..5bd8a2e6 --- /dev/null +++ b/src/extraction/mode.ts @@ -0,0 +1,42 @@ +export const EXTRACTION_MODES = ['fast', 'standard'] as const; +export type ExtractionMode = (typeof EXTRACTION_MODES)[number]; + +export interface ExtractionModeBudget { + readonly mode: ExtractionMode; + readonly jsonLdTimeoutMs: number; + readonly microdataTimeoutMs: number; + readonly openGraphTimeoutMs: number; + readonly cssTimeoutMs: number; + readonly standardDomTimeoutMs: number; + readonly maxCssNodes: number; + readonly maxStandardDomNodes: number; +} + +export const EXTRACTION_MODE_BUDGETS: Record = { + fast: { + mode: 'fast', + jsonLdTimeoutMs: 5000, + microdataTimeoutMs: 5000, + openGraphTimeoutMs: 5000, + cssTimeoutMs: 10000, + standardDomTimeoutMs: 0, + maxCssNodes: 500, + maxStandardDomNodes: 0, + }, + standard: { + mode: 'standard', + jsonLdTimeoutMs: 5000, + microdataTimeoutMs: 5000, + openGraphTimeoutMs: 5000, + cssTimeoutMs: 10000, + standardDomTimeoutMs: 12000, + maxCssNodes: 1000, + maxStandardDomNodes: 2000, + }, +}; + +export function parseExtractionMode(value: unknown): { ok: true; mode: ExtractionMode } | { ok: false; error: string } { + if (value === undefined || value === null || value === '') return { ok: true, mode: 'fast' }; + if (value === 'fast' || value === 'standard') return { ok: true, mode: value }; + return { ok: false, error: 'Invalid mode. Use "fast" or "standard".' }; +} diff --git a/src/extraction/strategies.ts b/src/extraction/strategies.ts index ceda6318..aa5fe8c7 100644 --- a/src/extraction/strategies.ts +++ b/src/extraction/strategies.ts @@ -54,3 +54,12 @@ export function buildMultipleItemExtractor(fields: FieldInput, schemaProps: Reco const plans = normalisePlans(fields); return `(function(fp,sp,scope){var root=scope?document.querySelector(scope):document.body;if(!root)root=document.body;function gt(el){if(!el)return null;if(el.tagName==='IMG')return el.src||el.getAttribute('data-src')||null;return(el.textContent?.trim()||'').slice(0,500)||null}function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}var cs=['[role="list"]','[class*="list"]','[class*="grid"]','[class*="results"]','[class*="items"]','[class*="products"]','[class*="cards"]','table tbody','ul','ol'];var items=[];for(var c=0;c0.7){for(var y=0;y1)for(var r=1;r, + scopeSelector: string | undefined, + maxNodes: number +): string { + return `(function(fn,sp,scope,maxNodes){var r={};var root=scope?document.querySelector(scope):document.body;if(!root)root=document.body;function clean(v,cap){return (v||'').replace(/\\s+/g,' ').trim().slice(0,cap||500)}function text(el){if(!el)return'';if(el.tagName==='IMG')return el.src||el.getAttribute('data-src')||'';return clean(el.getAttribute('content')||el.getAttribute('aria-label')||el.getAttribute('title')||el.textContent||'',500)}function norm(v){return clean(String(v||'').toLowerCase().replace(/[_-]+/g,' '),200)}function labelFor(f){var fl=norm(f);var words=fl.split(' ').filter(Boolean);var out=[fl];if(words.length>1)out.push(words.join(''));return out}function set(f,v){if(r[f]!=null)return;if(v!==null&&v!==undefined&&clean(String(v),500)!=='')r[f]=clean(String(v),500)}var nodes=Array.prototype.slice.call(root.querySelectorAll('label,dt,th,strong,b,h1,h2,h3,h4,h5,h6,[aria-label],[title],[data-testid],[class],[id],p,li,td,div,span'),0,Math.max(0,maxNodes||2000));for(var i=0;i, _context?: ToolContext ): Promise => { + // Mode validation (#989): validate before any browser/session interaction. + const modeCheck = parseExtractionMode(args.mode); + if (!modeCheck.ok) { + return { content: [{ type: 'text', text: `Error: ${modeCheck.error}` }], isError: true }; + } + const extractionMode = modeCheck.mode; + const budget = EXTRACTION_MODE_BUDGETS[extractionMode]; + const tabId = args.tabId as string; const schema = args.schema as ExtractionSchema; const selector = args.selector as string | undefined; const multiple = (args.multiple as boolean) ?? false; - const { mode, inlineLimit } = parseOutputMode(args); + const { mode: outputMode, inlineLimit } = parseOutputMode(args); const waitForReady = args.waitForReady === true; const readyTimeoutMs = typeof args.readyTimeoutMs === 'number' ? args.readyTimeoutMs : undefined; @@ -149,14 +161,17 @@ const handler: ToolHandler = async ( selector: selector || 'auto', fieldCount: fieldNames.length, itemCount: validated.length, })); - const multiplePayload = { + const multiplePayload: Record = { action: 'extract_data', url: pageUrl, multiple: true, items: validated, count: validated.length, + modeUsed: extractionMode, ...(readiness ? { readiness } : {}), }; + const multipleTextWithoutMetrics = JSON.stringify(multiplePayload); + multiplePayload.metrics = { mode: extractionMode, outputChars: multipleTextWithoutMetrics.length }; const multipleInlineResult: MCPResult = { content: [{ type: 'text', text: JSON.stringify(multiplePayload) }], }; - return resolveOutputMode(mode, inlineLimit, multipleInlineResult, multiplePayload, 'extract_data'); + return resolveOutputMode(outputMode, inlineLimit, multipleInlineResult, multiplePayload, 'extract_data'); } // Single item — layered strategies @@ -165,40 +180,54 @@ const handler: ToolHandler = async ( // Strategy 1: JSON-LD try { - const r = await withTimeout(page.evaluate(buildJsonLdExtractor(fieldNames)) as Promise>, 5000, 'extract_data:jsonld'); + const r = await withTimeout(page.evaluate(buildJsonLdExtractor(fieldNames)) as Promise>, budget.jsonLdTimeoutMs, 'extract_data:jsonld'); if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('json-ld'); } } catch { /* non-fatal */ } if (countFields(merged) >= fieldNames.length) { const { result, validation } = validateAndCoerce(merged, schema); - return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, mode, inlineLimit, readiness); + return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, extractionMode, outputMode, inlineLimit, readiness); } // Strategy 2: Microdata try { - const r = await withTimeout(page.evaluate(buildMicrodataExtractor(fieldNames)) as Promise>, 5000, 'extract_data:microdata'); + const r = await withTimeout(page.evaluate(buildMicrodataExtractor(fieldNames)) as Promise>, budget.microdataTimeoutMs, 'extract_data:microdata'); if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('microdata'); } } catch { /* non-fatal */ } // Strategy 3: OpenGraph try { - const r = await withTimeout(page.evaluate(buildOpenGraphExtractor(fieldNames)) as Promise>, 5000, 'extract_data:opengraph'); + const r = await withTimeout(page.evaluate(buildOpenGraphExtractor(fieldNames)) as Promise>, budget.openGraphTimeoutMs, 'extract_data:opengraph'); if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('opengraph'); } } catch { /* non-fatal */ } if (countFields(merged) >= fieldNames.length) { const { result, validation } = validateAndCoerce(merged, schema); - return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, mode, inlineLimit, readiness); + return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, extractionMode, outputMode, inlineLimit, readiness); } // Strategy 4: CSS heuristic try { - const r = await withTimeout(page.evaluate(buildCssHeuristicExtractor(fieldNames, schemaProps, selector)) as Promise>, 10000, 'extract_data:css'); + const r = await withTimeout(page.evaluate(buildCssHeuristicExtractor(fieldNames, schemaProps, selector)) as Promise>, budget.cssTimeoutMs, 'extract_data:css'); if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('css-heuristic'); } } catch { /* non-fatal */ } + if (extractionMode === 'standard' && countFields(merged) < fieldNames.length) { + try { + const r = await withTimeout( + page.evaluate(buildStandardDomExtractor(fieldNames, schemaProps, selector, budget.maxStandardDomNodes)) as Promise>, + budget.standardDomTimeoutMs, + 'extract_data:standard-dom' + ); + if (r && typeof r === 'object') { + merged = mergeResults(merged, r); + if (countFields(r) > 0) strategies.push('standard-dom'); + } + } catch { /* non-fatal */ } + } + const { result, validation } = validateAndCoerce(merged, schema); - return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, mode, inlineLimit, readiness); + return buildResponseWithMode(result, validation.errors, pageUrl, strategies, domain, fieldNames, extractionMode, outputMode, inlineLimit, readiness); } catch (error) { return { content: [{ type: 'text', text: `Extraction error: ${error instanceof Error ? error.message : String(error)}` }], isError: true }; } @@ -206,7 +235,7 @@ const handler: ToolHandler = async ( function buildResponse( data: Record, errors: string[], url: string, - strategies: string[], domain: string, fieldNames: string[] + strategies: string[], domain: string, fieldNames: string[], extractionMode: ExtractionMode, ): { inlineResult: MCPResult; payload: Record } { const fieldsFound = Object.entries(data).filter(([, v]) => v !== null && v !== undefined && v !== '').map(([k]) => k); const fieldsMissing = fieldNames.filter(f => !fieldsFound.includes(f)); @@ -220,6 +249,7 @@ function buildResponse( const payload: Record = { action: 'extract_data', url, data, fieldsFound: fieldsFound.length, fieldsTotal: fieldNames.length, strategies, + modeUsed: extractionMode, }; if (fieldsMissing.length > 0) payload.fieldsMissing = fieldsMissing; if (errors.length > 0) payload.validationErrors = errors; @@ -227,23 +257,26 @@ function buildResponse( payload.message = 'No data extracted. Try: (1) read_page to verify content, (2) provide a CSS selector, (3) wait_for before extracting.'; } + const textWithoutMetrics = JSON.stringify(payload); + payload.metrics = { mode: extractionMode, outputChars: textWithoutMetrics.length }; return { inlineResult: { content: [{ type: 'text', text: JSON.stringify(payload) }] }, payload }; } + async function buildResponseWithMode( data: Record, errors: string[], url: string, strategies: string[], domain: string, fieldNames: string[], - mode: import('./_shared/output-mode').OutputMode, inlineLimit: number, + extractionMode: ExtractionMode, outputMode: import('./_shared/output-mode').OutputMode, inlineLimit: number, readiness?: Awaited>, ): Promise { - const { inlineResult, payload } = buildResponse(data, errors, url, strategies, domain, fieldNames); + const { inlineResult, payload } = buildResponse(data, errors, url, strategies, domain, fieldNames, extractionMode); if (readiness) { payload.readiness = readiness; if (inlineResult.content?.[0]?.type === 'text') { inlineResult.content[0].text = JSON.stringify(payload); } } - return resolveOutputMode(mode, inlineLimit, inlineResult, payload, 'extract_data'); + return resolveOutputMode(outputMode, inlineLimit, inlineResult, payload, 'extract_data'); } export const extractDataHandler = handler; diff --git a/tests/tools/extract-data-modes.test.ts b/tests/tools/extract-data-modes.test.ts new file mode 100644 index 00000000..c2fc595e --- /dev/null +++ b/tests/tools/extract-data-modes.test.ts @@ -0,0 +1,93 @@ +/// + +jest.mock('../../src/session-manager', () => ({ + getSessionManager: jest.fn(), +})); + +jest.mock('../../src/memory/domain-memory', () => ({ + extractDomainFromUrl: jest.fn(() => 'example.test'), + getDomainMemory: jest.fn(() => ({ record: jest.fn() })), +})); + +import { getSessionManager } from '../../src/session-manager'; +import { parseExtractionMode, EXTRACTION_MODE_BUDGETS } from '../../src/extraction'; +import { extractDataHandler } from '../../src/tools/extract-data'; + +const schema = { + type: 'object' as const, + properties: { + title: { type: 'string' }, + price: { type: 'string' }, + }, + required: ['title', 'price'], +}; + +function responseJson(result: Awaited>): Record { + return JSON.parse(result.content?.[0]?.text ?? '{}') as Record; +} + +describe('extract_data modes', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + it('validates and defaults extraction mode to fast', () => { + expect(parseExtractionMode(undefined)).toEqual({ ok: true, mode: 'fast' }); + expect(parseExtractionMode('fast')).toEqual({ ok: true, mode: 'fast' }); + expect(parseExtractionMode('standard')).toEqual({ ok: true, mode: 'standard' }); + expect(parseExtractionMode('deep')).toEqual({ ok: false, error: 'Invalid mode. Use "fast" or "standard".' }); + expect(EXTRACTION_MODE_BUDGETS.fast.maxStandardDomNodes).toBe(0); + expect(EXTRACTION_MODE_BUDGETS.standard.maxStandardDomNodes).toBeGreaterThan(EXTRACTION_MODE_BUDGETS.fast.maxStandardDomNodes); + }); + + it('rejects invalid mode with an actionable error before touching the browser', async () => { + const result = await extractDataHandler('s1', { tabId: 't1', schema, mode: 'deep' }); + + expect(result.isError).toBe(true); + expect(result.content?.[0]?.text).toContain('Invalid mode'); + expect(getSessionManager).not.toHaveBeenCalled(); + }); + + it('uses fast mode by default and does not invoke the standard-only DOM pass', async () => { + const evaluate = jest.fn(async (script: string) => { + expect(script).not.toContain('maxNodes'); + if (script.includes('application/ld+json')) return { title: 'Fast title' }; + return {}; + }); + (getSessionManager as jest.Mock).mockReturnValue({ + getPage: jest.fn(async () => ({ url: () => 'https://example.test/product', evaluate })), + }); + + const result = await extractDataHandler('s1', { tabId: 't1', schema }); + const body = responseJson(result); + + expect(body.modeUsed).toBe('fast'); + expect((body.data as Record).title).toBe('Fast title'); + expect(body.fieldsMissing).toEqual(['price']); + expect(body.strategies).toEqual(['json-ld']); + expect((body.metrics as Record).mode).toBe('fast'); + expect(evaluate).toHaveBeenCalledTimes(4); + }); + + it('standard mode runs the broader DOM pass and can recover fields fast missed', async () => { + const evaluate = jest.fn(async (script: string) => { + if (script.includes('application/ld+json')) return { title: 'Fast title' }; + if (script.includes('maxNodes')) return { price: '$19.99' }; + return {}; + }); + (getSessionManager as jest.Mock).mockReturnValue({ + getPage: jest.fn(async () => ({ url: () => 'https://example.test/product', evaluate })), + }); + + const result = await extractDataHandler('s1', { tabId: 't1', schema, mode: 'standard' }); + const body = responseJson(result); + + expect(body.modeUsed).toBe('standard'); + expect(body.fieldsMissing).toBeUndefined(); + expect((body.data as Record).title).toBe('Fast title'); + expect((body.data as Record).price).toBe('$19.99'); + expect(body.strategies).toEqual(['json-ld', 'standard-dom']); + expect((body.metrics as Record).outputChars).toBeGreaterThan(0); + expect(evaluate).toHaveBeenCalledTimes(5); + }); +});