import type { ExtractionSchema, SchemaProperty } from './schema-validator';

/** Field names must start with a letter; the rest may be letters, digits, `_` or `-`. */
const SAFE_FIELD = /^[a-zA-Z][a-zA-Z0-9_-]*$/;
/** Type hints accepted inside `field(...)`. */
const SUPPORTED_TYPES = new Set(['string', 'number', 'integer', 'boolean', 'url', 'date']);
/** Hints that are emitted as schema type `string` plus a `type hint: …` description. */
const STRING_BACKED_HINTS = new Set(['url', 'date']);

// Hoisted so the tokenizer does not build a new RegExp per character.
const WHITESPACE = /\s/;
const IDENTIFIER_START = /[a-zA-Z_]/;
const IDENTIFIER_PART = /[a-zA-Z0-9_-]/;

type TokenType = 'braceL' | 'braceR' | 'bracketL' | 'bracketR' | 'parenL' | 'parenR' | 'comma' | 'identifier' | 'string' | 'eof';

/** Single-character punctuation tokens. */
const PUNCTUATION = new Map<string, TokenType>([
  ['{', 'braceL'],
  ['}', 'braceR'],
  ['[', 'bracketL'],
  [']', 'bracketR'],
  ['(', 'parenL'],
  [')', 'parenR'],
  [',', 'comma'],
]);

interface Token {
  type: TokenType;
  value: string;
  /** Zero-based offset of the token's first character in the query string. */
  pos: number;
}

/** One parsed field: `name`, `name[] { … }`, `name { … }`, optionally with `(type, "description")`. */
export interface QueryFieldNode {
  name: string;
  /** True when the field was written with `[]`. */
  list: boolean;
  type?: string;
  description?: string;
  children?: QueryFieldNode[];
  pos: number;
}

export interface QueryAst {
  fields: QueryFieldNode[];
}

export interface ExtractionQueryPlan {
  schema: ExtractionSchema;
  /** True when the query is a single root list block. */
  multiple: boolean;
  normalizedQuery: string;
  /** Name of the root list field; only set when `multiple` is true. */
  rootListField?: string;
}

/** Error raised for any tokenizer, parser or plan-building failure; `position` is an offset into the query. */
export class ExtractionQueryParseError extends Error {
  constructor(message: string, public readonly position: number) {
    super(`${message} at position ${position}`);
    this.name = 'ExtractionQueryParseError';
  }
}

/**
 * Splits a query string into tokens and appends a trailing `eof` token.
 *
 * @throws ExtractionQueryParseError on an unterminated string literal or an unexpected character.
 */
function tokenize(input: string): Token[] {
  const tokens: Token[] = [];
  let i = 0;
  while (i < input.length) {
    const ch = input[i];
    if (WHITESPACE.test(ch)) {
      i++;
      continue;
    }

    const punctuation = PUNCTUATION.get(ch);
    if (punctuation) {
      tokens.push({ type: punctuation, value: ch, pos: i });
      i++;
      continue;
    }

    if (ch === '"') {
      const start = i;
      i++;
      let value = '';
      while (i < input.length && input[i] !== '"') {
        if (input[i] === '\\' && i + 1 < input.length) {
          // A backslash makes the next character literal (so `\"` does not end the string).
          value += input[i + 1];
          i += 2;
        } else {
          value += input[i++];
        }
      }
      if (i >= input.length) throw new ExtractionQueryParseError('Unterminated string literal', start);
      i++; // closing quote
      tokens.push({ type: 'string', value, pos: start });
      continue;
    }

    if (IDENTIFIER_START.test(ch)) {
      const start = i;
      i++;
      while (i < input.length && IDENTIFIER_PART.test(input[i])) i++;
      tokens.push({ type: 'identifier', value: input.slice(start, i), pos: start });
      continue;
    }

    throw new ExtractionQueryParseError(`Unexpected token "${ch}"`, i);
  }
  tokens.push({ type: 'eof', value: '', pos: input.length });
  return tokens;
}

/** Recursive-descent parser over the token list produced by `tokenize`. */
class Parser {
  private index = 0;
  constructor(private readonly tokens: Token[]) {}

  /** Parses `{ field … }` followed by end of input. */
  parse(): QueryAst {
    this.expect('braceL');
    const fields = this.parseFields('braceR');
    this.expect('braceR');
    this.expect('eof');
    if (fields.length === 0) throw new ExtractionQueryParseError('Query must contain at least one field', 0);
    return { fields };
  }

  /**
   * Parses fields until `end` (or `eof`) is the next token. A comma after a field is optional.
   *
   * @throws ExtractionQueryParseError when a name repeats within the same block; the schema
   *   builder keys properties by name, so a repeat would silently overwrite the earlier field.
   */
  private parseFields(end: TokenType): QueryFieldNode[] {
    const fields: QueryFieldNode[] = [];
    const seen = new Set<string>();
    while (this.peek().type !== end && this.peek().type !== 'eof') {
      const field = this.parseField();
      if (seen.has(field.name)) {
        throw new ExtractionQueryParseError(`Duplicate field "${field.name}"`, field.pos);
      }
      seen.add(field.name);
      fields.push(field);
      if (this.peek().type === 'comma') this.index++;
    }
    return fields;
  }

  /** Parses `name`, then optional `[]`, optional `( … )` arguments and optional `{ … }` children. */
  private parseField(): QueryFieldNode {
    const nameToken = this.expect('identifier');
    // The tokenizer lets identifiers start with `_`; SAFE_FIELD does not.
    if (!SAFE_FIELD.test(nameToken.value)) {
      throw new ExtractionQueryParseError(`Unsafe field name "${nameToken.value}"`, nameToken.pos);
    }

    let list = false;
    if (this.peek().type === 'bracketL') {
      this.index++;
      this.expect('bracketR');
      list = true;
    }

    let type: string | undefined;
    let description: string | undefined;
    if (this.peek().type === 'parenL') {
      this.index++;
      const args = this.parseArgs();
      type = args.type;
      description = args.description;
      this.expect('parenR');
    }

    let children: QueryFieldNode[] | undefined;
    if (this.peek().type === 'braceL') {
      this.index++;
      children = this.parseFields('braceR');
      this.expect('braceR');
      if (children.length === 0) {
        throw new ExtractionQueryParseError(`List/object field "${nameToken.value}" must contain at least one child field`, nameToken.pos);
      }
    }

    if (list && !children) {
      throw new ExtractionQueryParseError(`List field "${nameToken.value}" must have a child block`, nameToken.pos);
    }

    return { name: nameToken.value, list, type, description, children, pos: nameToken.pos };
  }

  /**
   * Parses the contents of `( … )`: at most one type identifier and any number of string
   * descriptions (joined with `; `). Commas between arguments are optional.
   *
   * @throws ExtractionQueryParseError for an unsupported type, a second type hint, or any other token.
   */
  private parseArgs(): { type?: string; description?: string } {
    let type: string | undefined;
    let description: string | undefined;
    while (this.peek().type !== 'parenR' && this.peek().type !== 'eof') {
      const token = this.peek();
      if (token.type === 'identifier') {
        this.index++;
        const normalized = token.value.toLowerCase();
        if (!SUPPORTED_TYPES.has(normalized)) {
          throw new ExtractionQueryParseError(`Unsupported type "${token.value}"`, token.pos);
        }
        if (type !== undefined) {
          // Without this, `(url, number)` would yield type `number` with a stale "type hint: url".
          throw new ExtractionQueryParseError(`Multiple type hints for one field ("${token.value}")`, token.pos);
        }
        const stringBacked = STRING_BACKED_HINTS.has(normalized);
        type = stringBacked ? 'string' : normalized;
        if (stringBacked) {
          description = description ? `${description}; type hint: ${normalized}` : `type hint: ${normalized}`;
        }
      } else if (token.type === 'string') {
        this.index++;
        description = description ? `${description}; ${token.value}` : token.value;
      } else if (token.type === 'comma') {
        this.index++;
      } else {
        throw new ExtractionQueryParseError(`Unexpected argument token "${token.value}"`, token.pos);
      }
    }
    return { type, description };
  }

  private peek(): Token {
    return this.tokens[this.index];
  }

  /** Consumes and returns the next token, or throws if it is not of the given type. */
  private expect(type: TokenType): Token {
    const token = this.peek();
    if (token.type !== type) {
      throw new ExtractionQueryParseError(`Expected ${type}, got ${token.type}${token.value ? ` "${token.value}"` : ''}`, token.pos);
    }
    this.index++;
    return token;
  }
}

/** Throws if any of `fields` is a list or has children; `context` is only used in the message. */
function assertScalarFields(fields: QueryFieldNode[], context: string): void {
  for (const field of fields) {
    if (field.list || field.children) {
      throw new ExtractionQueryParseError(
        `Nested field "${field.name}" is not supported in the local query subset (${context}). Use flat fields or one root list block.`,
        field.pos,
      );
    }
  }
}

/** Renders fields as a whitespace-insensitive canonical string (space-separated, `:description` suffix). */
function normalizeFields(fields: QueryFieldNode[]): string {
  return fields.map((field) => {
    const typePart = field.type ? `(${field.type})` : '';
    const descPart = field.description ? `:${field.description}` : '';
    if (field.list) {
      return `${field.name}[]{${normalizeFields(field.children || [])}}${descPart}`;
    }
    if (field.children) {
      return `${field.name}{${normalizeFields(field.children)}}${descPart}`;
    }
    return `${field.name}${typePart}${descPart}`;
  }).join(' ');
}

/** Object schema shape for a block of fields; every field is listed as required. */
function objectShape(fields: QueryFieldNode[]) {
  return {
    type: 'object' as const,
    properties: fieldsToProperties(fields),
    required: fields.map(field => field.name),
  };
}

/** Returns `{ description }` when one is set, otherwise `{}`, so the key is never present as `undefined`. */
function descriptionPart(description: string | undefined): { description?: string } {
  return description ? { description } : {};
}

/** Converts one field into its schema property (array of objects, object, or scalar defaulting to `string`). */
function fieldToProperty(field: QueryFieldNode): SchemaProperty {
  if (field.list) {
    return {
      type: 'array',
      ...descriptionPart(field.description),
      items: objectShape(field.children || []),
    };
  }
  if (field.children) {
    return {
      ...objectShape(field.children),
      ...descriptionPart(field.description),
    };
  }
  return {
    type: field.type || 'string',
    ...descriptionPart(field.description),
  };
}

function fieldsToProperties(fields: QueryFieldNode[]): Record<string, SchemaProperty> {
  const props: Record<string, SchemaProperty> = {};
  for (const field of fields) {
    props[field.name] = fieldToProperty(field);
  }
  return props;
}

/**
 * Parses a query such as `{ title price(number) }` into an AST.
 *
 * @throws ExtractionQueryParseError when `query` is not a non-empty string or does not parse.
 */
export function parseExtractionQuery(query: string): QueryAst {
  // The runtime type check matters because tool callers cast untyped arguments to `string`.
  if (typeof query !== 'string' || query.trim().length === 0) {
    throw new ExtractionQueryParseError('Query must be a non-empty string', 0);
  }
  return new Parser(tokenize(query)).parse();
}

/**
 * Parses `query` and builds the extraction schema for it.
 *
 * A query whose only root field is a list (`{ items[] { … } }`) produces an array schema with
 * `multiple: true`; any other query produces a flat object schema. Nested lists/objects are
 * rejected in both cases.
 *
 * @throws ExtractionQueryParseError on parse failure or on a nested shape.
 */
export function buildExtractionQueryPlan(query: string): ExtractionQueryPlan {
  const ast = parseExtractionQuery(query);
  const normalizedQuery = `{ ${normalizeFields(ast.fields)} }`;

  if (ast.fields.length === 1 && ast.fields[0].list) {
    const root = ast.fields[0];
    const children = root.children || [];
    assertScalarFields(children, `root list ${root.name}`);
    return {
      multiple: true,
      normalizedQuery,
      rootListField: root.name,
      schema: {
        type: 'array',
        ...descriptionPart(root.description),
        items: objectShape(children),
      },
    };
  }

  assertScalarFields(ast.fields, 'root object');

  return {
    multiple: false,
    normalizedQuery,
    schema: objectShape(ast.fields),
  };
}
export type QueryDebugKind = 'extract' | 'element';

/** One bounded debug entry describing a single query attempt. */
export interface QueryDebugRecord {
  kind: QueryDebugKind;
  sessionId: string;
  tabId: string;
  timestamp: string;
  normalized?: string;
  modeUsed?: string;
  schemaSummary?: { fields: string[]; multiple: boolean; queryRoot?: string };
  strategies?: string[];
  fieldsFound?: string[];
  fieldsMissing?: string[];
  durations?: Record<string, number>;
  output?: { chars: number; truncated: boolean };
  notes?: string[];
}

/** Records kept per (session, tab, kind); older ones are dropped. */
const MAX_RECORDS_PER_KEY = 5;
/** Distinct (session, tab, kind) keys kept; the least recently written key is evicted beyond this. */
const MAX_TRACKED_KEYS = 500;
const MAX_TEXT_CHARS = 240;

// Key/value patterns accept an optional quote after the key (JSON style) and treat a fully
// quoted value as one unit, so `"password":"a b"` and `password="a b"` are redacted whole.
// The bearer pattern covers the RFC 6750 b64token alphabet, including `~`, `+`, `/` and `=`.
const SECRET_PATTERNS = [
  /password["']?\s*[:=]\s*(?:"[^"]*"|'[^']*'|[^\s,;]+)/gi,
  /token["']?\s*[:=]\s*(?:"[^"]*"|'[^']*'|[^\s,;]+)/gi,
  /api[_-]?key["']?\s*[:=]\s*(?:"[^"]*"|'[^']*'|[^\s,;]+)/gi,
  /bearer\s+[a-z0-9._~+\/=-]+/gi,
];

const records = new Map<string, QueryDebugRecord[]>();

/** Builds the map key. JSON encoding keeps ids that contain separators from colliding. */
function key(sessionId: string, tabId: string, kind: QueryDebugKind): string {
  return JSON.stringify([sessionId, tabId, kind]);
}

/**
 * Truncates `value` to `MAX_TEXT_CHARS` and replaces anything matching a secret pattern with
 * `[REDACTED]`. Truncation happens first, so the regexes only ever scan a bounded string.
 */
export function sanitizeDebugText(value: string): string {
  let sanitized = value.slice(0, MAX_TEXT_CHARS);
  for (const pattern of SECRET_PATTERNS) {
    sanitized = sanitized.replace(pattern, '[REDACTED]');
  }
  return sanitized;
}

/** Returns a copy of `record` with its free-text fields sanitized and its arrays capped in length. */
function sanitizeRecord(record: QueryDebugRecord): QueryDebugRecord {
  return {
    ...record,
    normalized: record.normalized ? sanitizeDebugText(record.normalized) : undefined,
    notes: record.notes?.map(sanitizeDebugText).slice(0, 8),
    schemaSummary: record.schemaSummary
      ? { ...record.schemaSummary, fields: record.schemaSummary.fields.slice(0, 40).map(sanitizeDebugText) }
      : undefined,
    fieldsFound: record.fieldsFound?.slice(0, 40).map(sanitizeDebugText),
    fieldsMissing: record.fieldsMissing?.slice(0, 40).map(sanitizeDebugText),
    strategies: record.strategies?.slice(0, 12).map(sanitizeDebugText),
  };
}

/**
 * Sanitizes `record`, stores it as the newest entry for its (session, tab, kind) and returns
 * the sanitized copy.
 */
export function recordQueryDebug(record: QueryDebugRecord): QueryDebugRecord {
  const safe = sanitizeRecord(record);
  const k = key(safe.sessionId, safe.tabId, safe.kind);
  const history = [safe, ...(records.get(k) ?? [])].slice(0, MAX_RECORDS_PER_KEY);

  // Delete before set so the Map's insertion order reflects write recency.
  records.delete(k);
  records.set(k, history);

  // Evict least recently written keys so the store cannot grow without bound.
  while (records.size > MAX_TRACKED_KEYS) {
    const oldest = records.keys().next();
    if (oldest.done) break;
    records.delete(oldest.value);
  }
  return safe;
}

/** Returns the newest stored record for the given session, tab and kind, or `null` if none. */
export function getLatestQueryDebug(sessionId: string, tabId: string, kind: QueryDebugKind = 'extract'): QueryDebugRecord | null {
  return records.get(key(sessionId, tabId, kind))?.[0] || null;
}

/**
 * Removes stored records. With neither argument set, everything is cleared; otherwise only
 * entries matching the given session and/or tab are removed (an omitted argument matches all).
 */
export function clearQueryDebug(sessionId?: string, tabId?: string): void {
  if (!sessionId && !tabId) {
    records.clear();
    return;
  }
  for (const [k, history] of Array.from(records.entries())) {
    // Compare against the ids stored on the record rather than re-parsing the key string.
    const head = history[0];
    if (head === undefined) {
      records.delete(k);
      continue;
    }
    const sessionMatches = sessionId === undefined || head.sessionId === sessionId;
    const tabMatches = tabId === undefined || head.tabId === tabId;
    if (sessionMatches && tabMatches) {
      records.delete(k);
    }
  }
}
buildExtractionPlan, + buildExtractionQueryPlan, + ExtractionQueryParseError, } from '../extraction'; import type { ExtractionSchema, SchemaProperty } from '../extraction'; @@ -36,6 +39,17 @@ const definition: MCPToolDefinition = { 'JSON Schema defining output structure. ' + 'Example: { "type": "object", "properties": { "title": { "type": "string" }, "price": { "type": "number" } } }', }, + query: { + type: 'string', + description: + 'OpenChrome local extraction query. Example: { products[] { product_name product_price(number) product_url(url) } }. ' + + 'Mutually exclusive with schema; no external AgentQL/API calls are made.', + }, + mode: { + type: 'string', + enum: ['fast'], + description: 'Extraction mode placeholder. V1 supports only fast/local extraction; standard mode is tracked separately in #989.', + }, instruction: { type: 'string', description: 'Optional natural language hint (e.g., "product details")', @@ -52,16 +66,8 @@ const definition: MCPToolDefinition = { type: 'boolean', description: 'Include field-level extraction diagnostics. Default: false', }, - waitForReady: { - type: 'boolean', - description: 'Opt in to a bounded page-ready gate before extraction. Waits for document readiness and a short DOM mutation quiet window. Default: false', - }, - readyTimeoutMs: { - type: 'number', - description: 'Maximum wait for waitForReady in milliseconds. 
Default: 5000', - }, }, - required: ['tabId', 'schema'], + required: ['tabId'], }, }; @@ -79,25 +85,97 @@ function countFields(data: Record): number { return Object.values(data).filter(v => v !== null && v !== undefined && v !== '').length; } -export const extractDataHandler: ToolHandler = async ( +const handler: ToolHandler = async ( sessionId: string, args: Record, _context?: ToolContext ): Promise => { const tabId = args.tabId as string; - const schema = args.schema as ExtractionSchema; + let schema = args.schema as ExtractionSchema | undefined; + const query = args.query as string | undefined; const selector = args.selector as string | undefined; - const multiple = (args.multiple as boolean) ?? false; + const mode = (args.mode as string | undefined) || 'fast'; + let multiple = (args.multiple as boolean) ?? false; const debug = (args.debug as boolean) ?? false; - const waitForReady = (args.waitForReady as boolean) ?? false; - const readyTimeoutMs = args.readyTimeoutMs as number | undefined; + const startedAt = Date.now(); if (!tabId) { return { content: [{ type: 'text', text: 'Error: tabId is required' }], isError: true }; } + if (mode !== 'fast') { + return { content: [{ type: 'text', text: 'Error: Invalid mode. V1 extract_data query mode supports only mode="fast"; standard mode is tracked in #989.' }], isError: true }; + } + + if (schema && query) { + return { content: [{ type: 'text', text: 'Error: Provide either schema or query, not both.' }], isError: true }; + } + + let queryPlan: ReturnType | null = null; + if (query) { + try { + queryPlan = buildExtractionQueryPlan(query); + schema = queryPlan.schema; + if (queryPlan.multiple) multiple = true; + } catch (error) { + const detail = error instanceof ExtractionQueryParseError ? 
error.message : String(error); + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + normalized: query, + modeUsed: mode, + strategies: [], + fieldsFound: [], + fieldsMissing: [], + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: 0, truncated: false }, + notes: [`parser failure: ${detail}`], + }); + return { + content: [{ + type: 'text', + text: `Error: Invalid query — ${detail}. Example: { products[] { product_name product_price(number) product_url(url) } }`, + }], + isError: true, + }; + } + } + + if (!schema) { + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + modeUsed: mode, + strategies: [], + fieldsFound: [], + fieldsMissing: [], + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: 0, truncated: false }, + notes: ['schema/query missing'], + }); + return { content: [{ type: 'text', text: 'Error: Either schema or query is required. Example query: { title price(number) }' }], isError: true }; + } + const schemaCheck = validateSchema(schema); if (!schemaCheck.valid) { + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + normalized: queryPlan?.normalizedQuery, + modeUsed: mode, + strategies: [], + fieldsFound: [], + fieldsMissing: [], + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: 0, truncated: false }, + notes: [`schema validation failure: ${schemaCheck.error}`], + }); return { content: [{ type: 'text', text: `Error: Invalid schema — ${schemaCheck.error}` }], isError: true }; } @@ -112,35 +190,55 @@ export const extractDataHandler: ToolHandler = async ( } try { - let readiness: PageReadyResult | undefined; - if (waitForReady) { - readiness = await waitForPageReady(page, { timeoutMs: readyTimeoutMs }, _context); - } - const schemaProps: Record = multiple ? 
(schema.items?.properties || schema.properties || {}) : (schema.properties || {}); - // Sanitize field names to prevent CSS selector injection in strategy builders - const safeFieldPattern = /^[a-zA-Z0-9_-]+$/; - const fieldNames = Object.keys(schemaProps).filter(f => safeFieldPattern.test(f)); + // Keep schema keys intact in output; strategy builders sanitize only selector tokens. + const fieldNames = Object.keys(schemaProps); if (fieldNames.length === 0) { return { content: [{ type: 'text', text: 'Error: Schema must define at least one property' }], isError: true }; } + const plan = buildExtractionPlan(schemaProps); + const fieldPlans = plan.fields.filter(f => fieldNames.includes(f.field)); + const pageUrl = page.url(); const domain = extractDomainFromUrl(pageUrl); + const writeExtractDebug = (payload: { strategies: string[]; data?: Record; fieldsFound?: string[]; fieldsMissing?: string[]; notes?: string[]; outputValue?: unknown }): void => { + const fieldsFound = payload.fieldsFound || Object.entries(payload.data || {}) + .filter(([, v]) => v !== null && v !== undefined && v !== '') + .map(([k]) => k); + const fieldsMissing = payload.fieldsMissing || fieldNames.filter(f => !fieldsFound.includes(f)); + const outputChars = JSON.stringify(payload.outputValue ?? payload.data ?? {}).length; + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + normalized: queryPlan?.normalizedQuery, + modeUsed: mode, + schemaSummary: { fields: fieldNames, multiple, ...(queryPlan?.rootListField ? 
{ queryRoot: queryPlan.rootListField } : {}) }, + strategies: payload.strategies, + fieldsFound, + fieldsMissing, + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: outputChars, truncated: outputChars > 12000 }, + notes: payload.notes, + }); + }; + // Multiple items mode if (multiple) { - const multiScript = buildMultipleItemExtractor(fieldNames, schemaProps, selector); + const multiScript = buildMultipleItemExtractor(fieldPlans, schemaProps, selector); const rawItems = await withTimeout(page.evaluate(multiScript) as Promise[]>, 15000, 'extract_data'); if (!Array.isArray(rawItems) || rawItems.length === 0) { + writeExtractDebug({ strategies: ['multiple-item'], fieldsFound: [], fieldsMissing: fieldNames, notes: ['no repeating items found'], outputValue: [] }); return { content: [{ type: 'text', text: JSON.stringify({ - action: 'extract_data', url: pageUrl, multiple: true, items: [], count: 0, - ...(readiness && { readiness }), + action: 'extract_data', url: pageUrl, multiple: true, ...(queryPlan ? { queryRoot: queryPlan.rootListField } : {}), items: [], count: 0, message: 'No repeating items found. Try a more specific selector or check if the page has loaded.', }) }], }; @@ -150,14 +248,22 @@ export const extractDataHandler: ToolHandler = async ( const validated = rawItems.map(raw => validateAndCoerce(raw, itemSchema).result); const domainMemory = getDomainMemory(); - domainMemory.record(domain, `extract:multiple:${fieldNames.sort().join(',')}`, JSON.stringify({ + const memoryKey = queryPlan + ? 
`extract:multiple-query:${queryPlan.normalizedQuery}` + : `extract:multiple:${[...fieldNames].sort().join(',')}`; + domainMemory.record(domain, memoryKey, JSON.stringify({ selector: selector || 'auto', fieldCount: fieldNames.length, itemCount: validated.length, })); + writeExtractDebug({ + strategies: ['multiple-item'], + fieldsFound: fieldNames.filter(f => validated.some(item => item[f] !== null && item[f] !== undefined && item[f] !== '')), + outputValue: validated, + }); + return { content: [{ type: 'text', text: JSON.stringify({ - action: 'extract_data', url: pageUrl, multiple: true, items: validated, count: validated.length, - ...(readiness && { readiness }), + action: 'extract_data', url: pageUrl, multiple: true, ...(queryPlan ? { queryRoot: queryPlan.rootListField } : {}), items: validated, count: validated.length, }) }], }; } @@ -165,43 +271,57 @@ export const extractDataHandler: ToolHandler = async ( // Single item — layered strategies let merged: Record = {}; const strategies: string[] = []; + const fieldDiagnostics: Record = {}; + + function mergeWithDiagnostics(r: Record, strategy: string): void { + const before = new Set(Object.entries(merged).filter(([, v]) => v !== null && v !== undefined && v !== '').map(([k]) => k)); + merged = mergeResults(merged, r); + for (const [field, value] of Object.entries(r)) { + if (value === null || value === undefined || value === '' || before.has(field)) continue; + const fp = fieldPlans.find(p => p.field === field); + fieldDiagnostics[field] = { + resolvedVia: strategy, + aliasesTried: fp?.aliases || [field], + }; + } + } // Strategy 1: JSON-LD try { - const r = await withTimeout(page.evaluate(buildJsonLdExtractor(fieldNames)) as Promise>, 5000, 'extract_data:jsonld'); - if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('json-ld'); } + const r = await withTimeout(page.evaluate(buildJsonLdExtractor(fieldPlans)) as Promise>, 5000, 'extract_data:jsonld'); + if (r 
&& typeof r === 'object') { mergeWithDiagnostics(r, 'json-ld'); if (countFields(r) > 0) strategies.push('json-ld'); } } catch { /* non-fatal */ } if (countFields(merged) >= fieldNames.length) { const { result, validation } = validateAndCoerce(merged, schema); - return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, debug ? fieldDiagnostics : undefined, readiness); + return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug); } // Strategy 2: Microdata try { - const r = await withTimeout(page.evaluate(buildMicrodataExtractor(fieldNames)) as Promise>, 5000, 'extract_data:microdata'); - if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('microdata'); } + const r = await withTimeout(page.evaluate(buildMicrodataExtractor(fieldPlans)) as Promise>, 5000, 'extract_data:microdata'); + if (r && typeof r === 'object') { mergeWithDiagnostics(r, 'microdata'); if (countFields(r) > 0) strategies.push('microdata'); } } catch { /* non-fatal */ } // Strategy 3: OpenGraph try { - const r = await withTimeout(page.evaluate(buildOpenGraphExtractor(fieldNames)) as Promise>, 5000, 'extract_data:opengraph'); - if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('opengraph'); } + const r = await withTimeout(page.evaluate(buildOpenGraphExtractor(fieldPlans)) as Promise>, 5000, 'extract_data:opengraph'); + if (r && typeof r === 'object') { mergeWithDiagnostics(r, 'opengraph'); if (countFields(r) > 0) strategies.push('opengraph'); } } catch { /* non-fatal */ } if (countFields(merged) >= fieldNames.length) { const { result, validation } = validateAndCoerce(merged, schema); - return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, debug ? 
fieldDiagnostics : undefined, readiness); + return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug); } // Strategy 4: CSS heuristic try { - const r = await withTimeout(page.evaluate(buildCssHeuristicExtractor(fieldNames, schemaProps, selector)) as Promise>, 10000, 'extract_data:css'); - if (r && typeof r === 'object') { merged = mergeResults(merged, r); if (countFields(r) > 0) strategies.push('css-heuristic'); } + const r = await withTimeout(page.evaluate(buildCssHeuristicExtractor(fieldPlans, schemaProps, selector)) as Promise>, 10000, 'extract_data:css'); + if (r && typeof r === 'object') { mergeWithDiagnostics(r, 'css-heuristic'); if (countFields(r) > 0) strategies.push('css-heuristic'); } } catch { /* non-fatal */ } const { result, validation } = validateAndCoerce(merged, schema); - return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, debug ? fieldDiagnostics : undefined, readiness); + return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug); } catch (error) { return { content: [{ type: 'text', text: `Extraction error: ${error instanceof Error ? 
error.message : String(error)}` }], isError: true }; } @@ -209,25 +329,30 @@ export const extractDataHandler: ToolHandler = async ( function buildResponse( data: Record, errors: string[], url: string, - strategies: string[], domain: string, fieldNames: string[], + strategies: string[], domain: string, fieldNames: string[], normalizedQuery?: string, fieldDiagnostics?: Record, - readiness?: PageReadyResult, + writeExtractDebug?: (payload: { strategies: string[]; data?: Record; fieldsFound?: string[]; fieldsMissing?: string[]; notes?: string[]; outputValue?: unknown }) => void ): MCPResult { const fieldsFound = Object.entries(data).filter(([, v]) => v !== null && v !== undefined && v !== '').map(([k]) => k); const fieldsMissing = fieldNames.filter(f => !fieldsFound.includes(f)); if (fieldsFound.length > 0) { const dm = getDomainMemory(); - dm.record(domain, `extract:single:${fieldNames.sort().join(',')}`, JSON.stringify({ + const memoryKey = normalizedQuery + ? `extract:single-query:${normalizedQuery}` + : `extract:single:${[...fieldNames].sort().join(',')}`; + dm.record(domain, memoryKey, JSON.stringify({ strategies, fieldsFound: fieldsFound.length, fieldsTotal: fieldNames.length, })); } + writeExtractDebug?.({ strategies, data, fieldsFound, fieldsMissing }); + const response: Record = { action: 'extract_data', url, data, fieldsFound: fieldsFound.length, fieldsTotal: fieldNames.length, strategies, }; - if (readiness) response.readiness = readiness; if (fieldsMissing.length > 0) response.fieldsMissing = fieldsMissing; + if (fieldDiagnostics) response.fieldDiagnostics = fieldDiagnostics; if (errors.length > 0) response.validationErrors = errors; if (fieldsFound.length === 0) { response.message = 'No data extracted. 
Try: (1) read_page to verify content, (2) provide a CSS selector, (3) wait_for before extracting.'; @@ -237,5 +362,5 @@ function buildResponse( } export function registerExtractDataTool(server: MCPServer): void { - server.registerTool('extract_data', extractDataHandler, definition); + server.registerTool('extract_data', handler, definition); } diff --git a/src/tools/index.ts b/src/tools/index.ts index 845311904..c97869e42 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -101,6 +101,7 @@ import { registerValidatePageTool } from './validate-page'; // Structured extraction (#571) import { registerExtractDataTool } from './extract-data'; +import { registerOcQueryDebugTool } from './oc-query-debug'; // 2FA tools (#575) import { registerTotpGenerateTool } from './totp-generate'; @@ -257,6 +258,7 @@ export function registerAllTools(server: MCPServer): void { // Structured extraction (#571) registerExtractDataTool(server); + registerOcQueryDebugTool(server); // 2FA tools (#575) registerTotpGenerateTool(server); diff --git a/src/tools/oc-query-debug.ts b/src/tools/oc-query-debug.ts new file mode 100644 index 000000000..443d4f304 --- /dev/null +++ b/src/tools/oc-query-debug.ts @@ -0,0 +1,39 @@ +import { MCPServer } from '../mcp-server'; +import { MCPToolDefinition, MCPResult, ToolHandler } from '../types/mcp'; +import { getLatestQueryDebug, QueryDebugKind } from '../query-debug/store'; + +const definition: MCPToolDefinition = { + name: 'oc_query_debug', + description: 'Return the latest bounded local query debug record for extract_data or element resolution. No full DOM/HTML is stored.', + inputSchema: { + type: 'object', + properties: { + tabId: { type: 'string', description: 'Tab ID whose latest debug record should be returned.' }, + kind: { type: 'string', enum: ['extract', 'element'], description: 'Debug record kind. Default: extract.' 
}, + }, + required: ['tabId'], + }, +}; + +const handler: ToolHandler = async (sessionId: string, args: Record): Promise => { + const tabId = args.tabId as string | undefined; + const kind = (args.kind as QueryDebugKind | undefined) || 'extract'; + if (!tabId) { + return { content: [{ type: 'text', text: 'Error: tabId is required' }], isError: true }; + } + if (kind !== 'extract' && kind !== 'element') { + return { content: [{ type: 'text', text: 'Error: kind must be "extract" or "element"' }], isError: true }; + } + + const record = getLatestQueryDebug(sessionId, tabId, kind); + return { + content: [{ + type: 'text', + text: JSON.stringify(record ? { action: 'oc_query_debug', found: true, record } : { action: 'oc_query_debug', found: false, kind, tabId }), + }], + }; +}; + +export function registerOcQueryDebugTool(server: MCPServer): void { + server.registerTool('oc_query_debug', handler, definition); +} diff --git a/tests/extraction/query-parser.test.ts b/tests/extraction/query-parser.test.ts new file mode 100644 index 000000000..4c31c1125 --- /dev/null +++ b/tests/extraction/query-parser.test.ts @@ -0,0 +1,69 @@ +/// + +import { + buildExtractionQueryPlan, + ExtractionQueryParseError, + parseExtractionQuery, +} from '../../src/extraction/query-parser'; + +describe('extraction query parser', () => { + test('parses flat object fields with type hints', () => { + const plan = buildExtractionQueryPlan('{ title price(number) available(boolean) }'); + + expect(plan.multiple).toBe(false); + expect(plan.schema).toEqual({ + type: 'object', + properties: { + title: { type: 'string' }, + price: { type: 'number' }, + available: { type: 'boolean' }, + }, + required: ['title', 'price', 'available'], + }); + }); + + test('parses list object and infers multiple extraction', () => { + const plan = buildExtractionQueryPlan('{ products[] { product_name product_price(number) product_url(url) } }'); + + expect(plan.multiple).toBe(true); + expect(plan.rootListField).toBe('products'); 
+ expect(plan.schema.type).toBe('array'); + expect(plan.schema.items?.properties?.product_price).toEqual({ type: 'number' }); + expect(plan.schema.items?.properties?.product_url).toEqual({ + type: 'string', + description: 'type hint: url', + }); + }); + + test('preserves field descriptions', () => { + const plan = buildExtractionQueryPlan('{ price(number, "current sale price, not MSRP") }'); + + expect(plan.schema.properties?.price).toEqual({ + type: 'number', + description: 'current sale price, not MSRP', + }); + }); + + test('rejects unmatched braces with position', () => { + expect(() => parseExtractionQuery('{ title price(number)')).toThrow(ExtractionQueryParseError); + expect(() => parseExtractionQuery('{ title price(number)')).toThrow(/position/); + }); + + test('rejects unsupported types', () => { + expect(() => parseExtractionQuery('{ price(currency) }')).toThrow(/Unsupported type "currency"/); + }); + + test('rejects empty list blocks', () => { + expect(() => parseExtractionQuery('{ products[] { } }')).toThrow(/must contain at least one child field/); + }); + + test('rejects unsafe field names', () => { + expect(() => parseExtractionQuery('{ _private }')).toThrow(/Unsafe field name/); + expect(() => parseExtractionQuery('{ product$name }')).toThrow(/Unexpected token/); + }); + + test('query plan rejects nested object shapes outside executable v1 subset', () => { + expect(() => buildExtractionQueryPlan('{ product { name price(number) } }')).toThrow(/Nested field/); + expect(() => buildExtractionQueryPlan('{ products[] { variants[] { name } } }')).toThrow(/Nested field/); + }); +}); diff --git a/tests/extraction/strategies.test.ts b/tests/extraction/strategies.test.ts index 33506e37a..77e3a4960 100644 --- a/tests/extraction/strategies.test.ts +++ b/tests/extraction/strategies.test.ts @@ -1,5 +1,5 @@ import { buildExtractionPlan } from '../../src/extraction/plan'; -import { buildCssHeuristicExtractor, buildJsonLdExtractor, buildOpenGraphExtractor } from 
'../../src/extraction/strategies'; +import { buildCssHeuristicExtractor, buildJsonLdExtractor } from '../../src/extraction/strategies'; function runExtractor(script: string, documentMock: unknown): T { const previous = (global as any).document; @@ -51,118 +51,4 @@ describe('schema-aware extraction strategies', () => { expect(result).toEqual({ salePrice: '$19.99' }); }); - - test('OpenGraph resolves site_name after alias normalization', () => { - const plan = buildExtractionPlan({ site_name: { type: 'string' } }); - const script = buildOpenGraphExtractor(plan.fields); - const documentMock = { - querySelector: (selector: string) => selector === 'meta[property="og:site_name"]' - ? { getAttribute: (name: string) => name === 'content' ? 'OpenChrome' : null } - : null, - }; - - expect(runExtractor>(script, documentMock)).toEqual({ site_name: 'OpenChrome' }); - }); -}); - -test('JSON-LD ignores inherited enumerable properties when matching aliases', () => { - const plan = buildExtractionPlan({ headline: { type: 'string' } }); - const script = buildJsonLdExtractor(plan.fields); - const documentMock = { - querySelectorAll: (selector: string) => selector === 'script[type="application/ld+json"]' - ? [{ textContent: JSON.stringify({ description: 'own description' }) }] - : [], - }; - - Object.defineProperty(Object.prototype, 'headline', { - configurable: true, - enumerable: true, - value: 'inherited headline', - }); - try { - expect(runExtractor>(script, documentMock)).toEqual({}); - } finally { - delete (Object.prototype as { headline?: string }).headline; - } -}); - - -test('JSON-LD own alias is not suppressed by inherited result fields', () => { - const plan = buildExtractionPlan({ headline: { type: 'string' } }); - const script = buildJsonLdExtractor(plan.fields); - const documentMock = { - querySelectorAll: (selector: string) => selector === 'script[type="application/ld+json"]' - ? 
[{ textContent: JSON.stringify({ name: 'Own headline' }) }] - : [], - }; - - Object.defineProperty(Object.prototype, 'headline', { - configurable: true, - enumerable: true, - value: 'inherited headline', - }); - try { - expect(runExtractor>(script, documentMock)).toEqual({ headline: 'Own headline' }); - } finally { - delete (Object.prototype as { headline?: string }).headline; - } -}); - -test('JSON-LD scalar projection: string field extracts ratingValue from object', () => { - // When schema declares type: "string", val() should project ratingValue from the nested object - const plan = buildExtractionPlan({ rating: { type: 'string' } }); - const script = buildJsonLdExtractor(plan.fields); - const documentMock = { - querySelectorAll: (selector: string) => selector === 'script[type="application/ld+json"]' - ? [{ textContent: JSON.stringify({ '@type': 'Product', aggregateRating: { ratingValue: '4.7', reviewCount: 120 } }) }] - : [], - }; - - const result = runExtractor>(script, documentMock); - // rating alias matches aggregateRating; ratingValue projected from the nested object - expect(result.rating).toBe('4.7'); -}); - -test('JSON-LD object-typed field preserves nested object as-is', () => { - // When schema declares type: "object", val() should return the raw nested value - const plan = buildExtractionPlan({ aggregateRating: { type: 'object', properties: { ratingValue: { type: 'string' } } } }); - const script = buildJsonLdExtractor(plan.fields); - const nestedRating = { ratingValue: '4.7', reviewCount: 120 }; - const documentMock = { - querySelectorAll: (selector: string) => selector === 'script[type="application/ld+json"]' - ? 
[{ textContent: JSON.stringify({ '@type': 'Product', aggregateRating: nestedRating }) }] - : [], - }; - - const result = runExtractor>(script, documentMock); - expect(result.aggregateRating).toEqual(nestedRating); -}); - -test('JSON-LD scalar projection: number field extracts @value from typed value object', () => { - // When schema declares type: "number", val() should project @value from { "@value": "4.7" } - const plan = buildExtractionPlan({ price: { type: 'number' } }); - const script = buildJsonLdExtractor(plan.fields); - const documentMock = { - querySelectorAll: (selector: string) => selector === 'script[type="application/ld+json"]' - ? [{ textContent: JSON.stringify({ '@type': 'Offer', price: { '@value': '29.99' } }) }] - : [], - }; - - const result = runExtractor>(script, documentMock); - expect(result.price).toBe('29.99'); -}); - -test('JSON-LD untyped field preserves raw object value', () => { - // When no type declared, val() should preserve the value as-is (no projection) - const plan = buildExtractionPlan({ brand: {} }); - const script = buildJsonLdExtractor(plan.fields); - const brandObj = { '@type': 'Brand', name: 'Acme' }; - const documentMock = { - querySelectorAll: (selector: string) => selector === 'script[type="application/ld+json"]' - ? 
[{ textContent: JSON.stringify({ '@type': 'Product', brand: brandObj }) }] - : [], - }; - - const result = runExtractor>(script, documentMock); - expect(result.brand).toEqual(brandObj); }); diff --git a/tests/query-debug/store.test.ts b/tests/query-debug/store.test.ts new file mode 100644 index 000000000..b7b05e3d1 --- /dev/null +++ b/tests/query-debug/store.test.ts @@ -0,0 +1,23 @@ +/// + +import { clearQueryDebug, getLatestQueryDebug, recordQueryDebug, sanitizeDebugText } from '../../src/query-debug/store'; + +describe('query debug store', () => { + beforeEach(() => clearQueryDebug()); + + test('stores latest bounded record by session tab and kind', () => { + for (let i = 0; i < 7; i++) { + recordQueryDebug({ kind: 'extract', sessionId: 's1', tabId: 't1', timestamp: `2026-01-01T00:00:0${i}Z`, normalized: `{ title${i} }` }); + } + + const latest = getLatestQueryDebug('s1', 't1', 'extract'); + expect(latest?.normalized).toBe('{ title6 }'); + expect(getLatestQueryDebug('s1', 'missing', 'extract')).toBeNull(); + }); + + test('redacts token-like text and caps long strings', () => { + const text = sanitizeDebugText(`token=secret ${'x'.repeat(400)}`); + expect(text).toContain('[REDACTED]'); + expect(text.length).toBeLessThanOrEqual(240); + }); +}); diff --git a/tests/tools/extract-data.test.ts b/tests/tools/extract-data.test.ts new file mode 100644 index 000000000..67a6f05c5 --- /dev/null +++ b/tests/tools/extract-data.test.ts @@ -0,0 +1,205 @@ +/// + +import { createMockSessionManager } from '../utils/mock-session'; + +jest.mock('../../src/session-manager', () => ({ + getSessionManager: jest.fn(), +})); + +jest.mock('../../src/memory/domain-memory', () => ({ + extractDomainFromUrl: jest.fn(() => 'example.com'), + getDomainMemory: jest.fn(() => ({ record: jest.fn() })), +})); + +import { getSessionManager } from '../../src/session-manager'; + +describe('ExtractDataTool query mode', () => { + let mockSessionManager: ReturnType; + let testSessionId: string; + let 
testTargetId: string; + + const getExtractDataHandler = async () => { + jest.resetModules(); + jest.doMock('../../src/session-manager', () => ({ + getSessionManager: () => mockSessionManager, + })); + jest.doMock('../../src/memory/domain-memory', () => ({ + extractDomainFromUrl: () => 'example.com', + getDomainMemory: () => ({ record: jest.fn() }), + })); + + const { registerExtractDataTool } = await import('../../src/tools/extract-data'); + const tools: Map) => Promise }> = new Map(); + const mockServer = { + registerTool: (name: string, handler: unknown) => { + tools.set(name, { handler: handler as (sessionId: string, args: Record) => Promise }); + }, + }; + + registerExtractDataTool(mockServer as unknown as Parameters[0]); + return tools.get('extract_data')!.handler; + }; + + beforeEach(async () => { + mockSessionManager = createMockSessionManager(); + (getSessionManager as jest.Mock).mockReturnValue(mockSessionManager); + testSessionId = 'test-session-123'; + const { targetId, page } = await mockSessionManager.createTarget(testSessionId, 'https://example.com/products'); + testTargetId = targetId; + (page.url as jest.Mock).mockReturnValue('https://example.com/products'); + }); + + afterEach(() => { + jest.clearAllMocks(); + }); + + test('returns actionable error when neither schema nor query is provided', async () => { + const handler = await getExtractDataHandler(); + const result = await handler(testSessionId, { tabId: testTargetId }) as { isError?: boolean; content: Array<{ text: string }> }; + + expect(result.isError).toBe(true); + expect(result.content[0].text).toContain('Either schema or query is required'); + }); + + test('rejects ambiguous schema and query together', async () => { + const handler = await getExtractDataHandler(); + const result = await handler(testSessionId, { + tabId: testTargetId, + schema: { type: 'object', properties: { title: { type: 'string' } } }, + query: '{ title }', + }) as { isError?: boolean; content: Array<{ text: string }> 
}; + + expect(result.isError).toBe(true); + expect(result.content[0].text).toContain('either schema or query, not both'); + }); + + test('keeps schema-only extraction behavior working', async () => { + const handler = await getExtractDataHandler(); + const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!; + (page.evaluate as jest.Mock).mockResolvedValueOnce({ title: 'Schema Title' }); + + const result = await handler(testSessionId, { + tabId: testTargetId, + schema: { type: 'object', properties: { title: { type: 'string' } } }, + }) as { content: Array<{ text: string }> }; + + const payload = JSON.parse(result.content[0].text); + expect(payload.data.title).toBe('Schema Title'); + expect(payload.fieldsFound).toBe(1); + }); + + test('extracts query-only single object through generated schema', async () => { + const handler = await getExtractDataHandler(); + const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!; + (page.evaluate as jest.Mock).mockResolvedValueOnce({ title: 'Query Title', price: '$12.50' }); + + const result = await handler(testSessionId, { + tabId: testTargetId, + query: '{ title price(number) }', + }) as { content: Array<{ text: string }> }; + + const payload = JSON.parse(result.content[0].text); + expect(payload.data).toEqual({ title: 'Query Title', price: 12.5 }); + expect(payload.fieldsFound).toBe(2); + }); + + + test('records bounded query debug after extraction', async () => { + const handler = await getExtractDataHandler(); + const { clearQueryDebug, getLatestQueryDebug } = await import('../../src/query-debug/store'); + clearQueryDebug(); + const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!; + (page.evaluate as jest.Mock).mockResolvedValueOnce({ title: 'Debug Title' }); + + await handler(testSessionId, { + tabId: testTargetId, + query: '{ title missing_field }', + }) as { content: Array<{ text: string }> }; + + const debug = getLatestQueryDebug(testSessionId, 
testTargetId, 'extract'); + expect(debug).not.toBeNull(); + expect(debug?.normalized).toBe('{ title missing_field }'); + expect(debug?.schemaSummary?.fields).toEqual(['title', 'missing_field']); + expect(debug?.fieldsFound).toEqual(['title']); + expect(debug?.fieldsMissing).toEqual(['missing_field']); + expect(debug?.durations?.totalMs).toEqual(expect.any(Number)); + }); + + test('infers multiple extraction from list query', async () => { + const handler = await getExtractDataHandler(); + const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!; + (page.evaluate as jest.Mock).mockResolvedValueOnce([ + { product_name: 'A', product_price: '$10' }, + { product_name: 'B', product_price: '$12' }, + ]); + + const result = await handler(testSessionId, { + tabId: testTargetId, + query: '{ products[] { product_name product_price(number) } }', + }) as { content: Array<{ text: string }> }; + + const payload = JSON.parse(result.content[0].text); + expect(payload.multiple).toBe(true); + expect(payload.queryRoot).toBe('products'); + expect(payload.items).toEqual([ + { product_name: 'A', product_price: 10 }, + { product_name: 'B', product_price: 12 }, + ]); + }); + + test('accepts fast mode placeholder without changing query extraction', async () => { + const handler = await getExtractDataHandler(); + const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!; + (page.evaluate as jest.Mock).mockResolvedValueOnce({ title: 'Fast Title' }); + + const result = await handler(testSessionId, { + tabId: testTargetId, + query: '{ title }', + mode: 'fast', + }) as { content: Array<{ text: string }> }; + + const payload = JSON.parse(result.content[0].text); + expect(payload.data.title).toBe('Fast Title'); + }); + + test('rejects unsupported extraction mode', async () => { + const handler = await getExtractDataHandler(); + const result = await handler(testSessionId, { + tabId: testTargetId, + query: '{ title }', + mode: 'standard', + }) as { isError?: 
boolean; content: Array<{ text: string }> }; + + expect(result.isError).toBe(true); + expect(result.content[0].text).toContain('supports only mode="fast"'); + }); + + + + test('records parser failure in query debug', async () => { + const handler = await getExtractDataHandler(); + const { clearQueryDebug, getLatestQueryDebug } = await import('../../src/query-debug/store'); + clearQueryDebug(); + + await handler(testSessionId, { + tabId: testTargetId, + query: '{ products[] { } }', + }) as { isError?: boolean; content: Array<{ text: string }> }; + + const debug = getLatestQueryDebug(testSessionId, testTargetId, 'extract'); + expect(debug?.notes?.[0]).toContain('parser failure'); + expect(debug?.strategies).toEqual([]); + }); + + test('returns parser error with example for invalid query', async () => { + const handler = await getExtractDataHandler(); + const result = await handler(testSessionId, { + tabId: testTargetId, + query: '{ products[] { } }', + }) as { isError?: boolean; content: Array<{ text: string }> }; + + expect(result.isError).toBe(true); + expect(result.content[0].text).toContain('Invalid query'); + expect(result.content[0].text).toContain('Example:'); + }); +}); diff --git a/tests/tools/oc-query-debug.test.ts b/tests/tools/oc-query-debug.test.ts new file mode 100644 index 000000000..6f71147a1 --- /dev/null +++ b/tests/tools/oc-query-debug.test.ts @@ -0,0 +1,36 @@ +/// + +describe('oc_query_debug tool', () => { + const getHandler = async () => { + jest.resetModules(); + const { registerOcQueryDebugTool } = await import('../../src/tools/oc-query-debug'); + const tools = new Map) => Promise<{ content: Array<{ text: string }>; isError?: boolean }> }>(); + registerOcQueryDebugTool({ + registerTool: (name: string, handler: unknown) => tools.set(name, { handler: handler as never }), + } as never); + return tools.get('oc_query_debug')!.handler; + }; + + beforeEach(async () => { + jest.resetModules(); + const { clearQueryDebug } = await 
import('../../src/query-debug/store'); + clearQueryDebug(); + }); + + test('returns not found for unknown tab', async () => { + const handler = await getHandler(); + const result = await handler('s1', { tabId: 't1' }); + const payload = JSON.parse(result.content[0].text); + expect(payload.found).toBe(false); + }); + + test('returns latest extract debug record', async () => { + const handler = await getHandler(); + const { recordQueryDebug } = await import('../../src/query-debug/store'); + recordQueryDebug({ kind: 'extract', sessionId: 's1', tabId: 't1', timestamp: new Date().toISOString(), normalized: '{ title }', fieldsFound: ['title'] }); + const result = await handler('s1', { tabId: 't1', kind: 'extract' }); + const payload = JSON.parse(result.content[0].text); + expect(payload.found).toBe(true); + expect(payload.record.normalized).toBe('{ title }'); + }); +});