// src/query-debug/store.ts — added as a new file by this patch.

/**
 * In-memory store for small, sanitized debug records describing recent query
 * runs, keyed by (session, tab, kind).
 *
 * Text is redacted and length-capped before it is stored, each key keeps only
 * its most recent records, and the number of tracked keys is capped as well,
 * so the store cannot grow without bound.
 */

/** Which kind of query produced a debug record. */
export type QueryDebugKind = 'extract' | 'element';

/** One debug record. Only `kind`, `sessionId`, `tabId` and `timestamp` are required. */
export interface QueryDebugRecord {
  kind: QueryDebugKind;
  sessionId: string;
  tabId: string;
  /** Callers in this patch pass `new Date().toISOString()`. */
  timestamp: string;
  /** Query text; redacted and capped on write. */
  normalized?: string;
  /** Requested extraction mode; redacted and capped on write. */
  modeUsed?: string;
  schemaSummary?: { fields: string[]; multiple: boolean; queryRoot?: string };
  strategies?: string[];
  fieldsFound?: string[];
  fieldsMissing?: string[];
  /** Named durations, e.g. `{ totalMs }`. */
  durations?: Record<string, number>;
  output?: { chars: number; truncated: boolean };
  notes?: string[];
}

/** Records kept per (session, tab, kind) key, newest first. */
const MAX_RECORDS_PER_KEY = 5;
/** Distinct keys kept before the least recently written one is evicted. */
const MAX_TRACKED_KEYS = 200;
/** Hard cap on the length of any stored text value. */
const MAX_TEXT_CHARS = 240;
const MAX_FIELD_NAMES = 40;
const MAX_STRATEGIES = 12;
const MAX_NOTES = 8;
const REDACTED = '[REDACTED]';

/**
 * Patterns whose whole match (key name included) is replaced by `[REDACTED]`.
 *
 * The key/value patterns tolerate a closing quote after the key (JSON style)
 * and treat a quoted value as one unit, so `"password":"a b"` is fully removed.
 * A quote left unterminated by truncation is still consumed to end of text.
 * The bearer pattern accepts the `~ + / =` characters as well, so tokens
 * containing them are redacted whole rather than up to the first such char.
 */
const SECRET_PATTERNS: readonly RegExp[] = [
  /password["']?\s*[:=]\s*(?:"[^"]*"?|'[^']*'?|[^\s,;]+)/gi,
  /token["']?\s*[:=]\s*(?:"[^"]*"?|'[^']*'?|[^\s,;]+)/gi,
  /api[_-]?key["']?\s*[:=]\s*(?:"[^"]*"?|'[^']*'?|[^\s,;]+)/gi,
  /bearer\s+[a-z0-9._~+\/=-]+/gi,
];

/** Map insertion order doubles as "least recently written first" for eviction. */
const records = new Map<string, QueryDebugRecord[]>();

/**
 * Builds the map key for a (session, tab, kind) triple.
 *
 * JSON encoding keeps the three parts unambiguous even when an id itself
 * contains a would-be delimiter such as `::`.
 */
function key(sessionId: string, tabId: string, kind: QueryDebugKind): string {
  return JSON.stringify([sessionId, tabId, kind]);
}

/**
 * Redacts secret-looking fragments and caps the text length.
 *
 * @param value Raw text that may contain credentials.
 * @returns Text of at most 240 characters with every pattern match replaced
 *   by `[REDACTED]`.
 */
export function sanitizeDebugText(value: string): string {
  // Truncate first so the regex work is bounded regardless of input size.
  let sanitized = value.slice(0, MAX_TEXT_CHARS);
  for (const pattern of SECRET_PATTERNS) {
    sanitized = sanitized.replace(pattern, REDACTED);
  }
  // The marker can be longer than the text it replaces, so cap again.
  return sanitized.slice(0, MAX_TEXT_CHARS);
}

/** Caps a list to `limit` entries, then sanitizes only the entries that are kept. */
function sanitizeList(values: readonly string[] | undefined, limit: number): string[] | undefined {
  return values?.slice(0, limit).map((entry) => sanitizeDebugText(entry));
}

/** Returns a copy of `record` with every free-text field redacted and bounded. */
function sanitizeRecord(record: QueryDebugRecord): QueryDebugRecord {
  const { schemaSummary } = record;
  return {
    ...record,
    normalized: record.normalized ? sanitizeDebugText(record.normalized) : undefined,
    // The mode comes from caller arguments, so it gets the same treatment.
    modeUsed: record.modeUsed === undefined ? undefined : sanitizeDebugText(record.modeUsed),
    notes: sanitizeList(record.notes, MAX_NOTES),
    schemaSummary: schemaSummary
      ? {
          ...schemaSummary,
          fields: schemaSummary.fields.slice(0, MAX_FIELD_NAMES).map((field) => sanitizeDebugText(field)),
          ...(schemaSummary.queryRoot === undefined
            ? {}
            : { queryRoot: sanitizeDebugText(schemaSummary.queryRoot) }),
        }
      : undefined,
    fieldsFound: sanitizeList(record.fieldsFound, MAX_FIELD_NAMES),
    fieldsMissing: sanitizeList(record.fieldsMissing, MAX_FIELD_NAMES),
    strategies: sanitizeList(record.strategies, MAX_STRATEGIES),
  };
}

/**
 * Sanitizes `record` and stores it as the newest entry for its
 * (session, tab, kind) key.
 *
 * @param record Record to store; the argument itself is not modified.
 * @returns The sanitized copy that was stored.
 */
export function recordQueryDebug(record: QueryDebugRecord): QueryDebugRecord {
  const safe = sanitizeRecord(record);
  const k = key(safe.sessionId, safe.tabId, safe.kind);
  const history = [safe, ...(records.get(k) ?? [])].slice(0, MAX_RECORDS_PER_KEY);

  // Delete before set so the key moves to the end of the insertion order.
  records.delete(k);
  records.set(k, history);

  if (records.size > MAX_TRACKED_KEYS) {
    const oldest = records.keys().next();
    if (!oldest.done) {
      records.delete(oldest.value);
    }
  }
  return safe;
}

/**
 * Returns the most recent record for a (session, tab, kind) key.
 *
 * @param kind Defaults to `'extract'`.
 * @returns The newest stored record, or `null` when none exists.
 */
export function getLatestQueryDebug(
  sessionId: string,
  tabId: string,
  kind: QueryDebugKind = 'extract',
): QueryDebugRecord | null {
  return records.get(key(sessionId, tabId, kind))?.[0] ?? null;
}

/**
 * Removes stored records.
 *
 * With no arguments everything is cleared. Otherwise only keys matching every
 * provided filter are removed, and an omitted (`undefined`) filter matches
 * anything. An empty string is an ordinary id, not a wildcard, so passing `''`
 * by accident cannot wipe other sessions.
 */
export function clearQueryDebug(sessionId?: string, tabId?: string): void {
  if (sessionId === undefined && tabId === undefined) {
    records.clear();
    return;
  }
  for (const [k, history] of Array.from(records.entries())) {
    // Read the ids from the stored record rather than parsing the key.
    const latest = history[0];
    if (latest === undefined) {
      continue;
    }
    const sessionMatches = sessionId === undefined || latest.sessionId === sessionId;
    const tabMatches = tabId === undefined || latest.tabId === tabId;
    if (sessionMatches && tabMatches) {
      records.delete(k);
    }
  }
}

// ---------------------------------------------------------------------------
// The rest of this chunk is the opening of the src/tools/extract-data.ts diff,
// kept as-is with line breaks restored; its second hunk continues on the
// following source line.
//
// diff --git a/src/tools/extract-data.ts b/src/tools/extract-data.ts
// index 0899cce3..219a1973 100644
// --- a/src/tools/extract-data.ts
// +++ b/src/tools/extract-data.ts
// @@ -7,6 +7,7 @@ import { MCPToolDefinition, MCPResult, ToolHandler, ToolContext } from '../types
//  import { getSessionManager } from '../session-manager';
//  import { withTimeout } from '../utils/with-timeout';
//  import { getDomainMemory, extractDomainFromUrl } from '../memory/domain-memory';
// +import { recordQueryDebug } from '../query-debug/store';
//  import {
//    validateSchema,
//    validateAndCoerce,
// @@ -96,6 +97,7 @@ const handler: ToolHandler = async (
//    const mode = (args.mode as string | undefined) || 'fast';
//    let multiple = (args.multiple as boolean) ?? false;
//    const debug = (args.debug as boolean) ??
false; + const startedAt = Date.now(); if (!tabId) { return { content: [{ type: 'text', text: 'Error: tabId is required' }], isError: true }; @@ -117,6 +119,20 @@ const handler: ToolHandler = async ( if (queryPlan.multiple) multiple = true; } catch (error) { const detail = error instanceof ExtractionQueryParseError ? error.message : String(error); + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + normalized: query, + modeUsed: mode, + strategies: [], + fieldsFound: [], + fieldsMissing: [], + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: 0, truncated: false }, + notes: [`parser failure: ${detail}`], + }); return { content: [{ type: 'text', @@ -128,11 +144,38 @@ const handler: ToolHandler = async ( } if (!schema) { + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + modeUsed: mode, + strategies: [], + fieldsFound: [], + fieldsMissing: [], + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: 0, truncated: false }, + notes: ['schema/query missing'], + }); return { content: [{ type: 'text', text: 'Error: Either schema or query is required. 
Example query: { title price(number) }' }], isError: true }; } const schemaCheck = validateSchema(schema); if (!schemaCheck.valid) { + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + normalized: queryPlan?.normalizedQuery, + modeUsed: mode, + strategies: [], + fieldsFound: [], + fieldsMissing: [], + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: 0, truncated: false }, + notes: [`schema validation failure: ${schemaCheck.error}`], + }); return { content: [{ type: 'text', text: `Error: Invalid schema — ${schemaCheck.error}` }], isError: true }; } @@ -163,12 +206,36 @@ const handler: ToolHandler = async ( const pageUrl = page.url(); const domain = extractDomainFromUrl(pageUrl); + const writeExtractDebug = (payload: { strategies: string[]; data?: Record; fieldsFound?: string[]; fieldsMissing?: string[]; notes?: string[]; outputValue?: unknown }): void => { + const fieldsFound = payload.fieldsFound || Object.entries(payload.data || {}) + .filter(([, v]) => v !== null && v !== undefined && v !== '') + .map(([k]) => k); + const fieldsMissing = payload.fieldsMissing || fieldNames.filter(f => !fieldsFound.includes(f)); + const outputChars = JSON.stringify(payload.outputValue ?? payload.data ?? {}).length; + recordQueryDebug({ + kind: 'extract', + sessionId, + tabId, + timestamp: new Date().toISOString(), + normalized: queryPlan?.normalizedQuery, + modeUsed: mode, + schemaSummary: { fields: fieldNames, multiple, ...(queryPlan?.rootListField ? 
{ queryRoot: queryPlan.rootListField } : {}) }, + strategies: payload.strategies, + fieldsFound, + fieldsMissing, + durations: { totalMs: Math.max(0, Date.now() - startedAt) }, + output: { chars: outputChars, truncated: outputChars > 12000 }, + notes: payload.notes, + }); + }; + // Multiple items mode if (multiple) { const multiScript = buildMultipleItemExtractor(fieldPlans, schemaProps, selector); const rawItems = await withTimeout(page.evaluate(multiScript) as Promise[]>, 15000, 'extract_data'); if (!Array.isArray(rawItems) || rawItems.length === 0) { + writeExtractDebug({ strategies: ['multiple-item'], fieldsFound: [], fieldsMissing: fieldNames, notes: ['no repeating items found'], outputValue: [] }); return { content: [{ type: 'text', text: JSON.stringify({ action: 'extract_data', url: pageUrl, multiple: true, ...(queryPlan ? { queryRoot: queryPlan.rootListField } : {}), items: [], count: 0, @@ -188,6 +255,12 @@ const handler: ToolHandler = async ( selector: selector || 'auto', fieldCount: fieldNames.length, itemCount: validated.length, })); + writeExtractDebug({ + strategies: ['multiple-item'], + fieldsFound: fieldNames.filter(f => validated.some(item => item[f] !== null && item[f] !== undefined && item[f] !== '')), + outputValue: validated, + }); + return { content: [{ type: 'text', text: JSON.stringify({ action: 'extract_data', url: pageUrl, multiple: true, ...(queryPlan ? { queryRoot: queryPlan.rootListField } : {}), items: validated, count: validated.length, @@ -221,7 +294,7 @@ const handler: ToolHandler = async ( if (countFields(merged) >= fieldNames.length) { const { result, validation } = validateAndCoerce(merged, schema); - return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined); + return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? 
fieldDiagnostics : undefined, writeExtractDebug); } // Strategy 2: Microdata @@ -238,7 +311,7 @@ const handler: ToolHandler = async ( if (countFields(merged) >= fieldNames.length) { const { result, validation } = validateAndCoerce(merged, schema); - return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined); + return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug); } // Strategy 4: CSS heuristic @@ -248,7 +321,7 @@ const handler: ToolHandler = async ( } catch { /* non-fatal */ } const { result, validation } = validateAndCoerce(merged, schema); - return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined); + return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug); } catch (error) { return { content: [{ type: 'text', text: `Extraction error: ${error instanceof Error ? 
error.message : String(error)}` }], isError: true }; } @@ -257,7 +330,8 @@ const handler: ToolHandler = async ( function buildResponse( data: Record, errors: string[], url: string, strategies: string[], domain: string, fieldNames: string[], normalizedQuery?: string, - fieldDiagnostics?: Record + fieldDiagnostics?: Record, + writeExtractDebug?: (payload: { strategies: string[]; data?: Record; fieldsFound?: string[]; fieldsMissing?: string[]; notes?: string[]; outputValue?: unknown }) => void ): MCPResult { const fieldsFound = Object.entries(data).filter(([, v]) => v !== null && v !== undefined && v !== '').map(([k]) => k); const fieldsMissing = fieldNames.filter(f => !fieldsFound.includes(f)); @@ -272,6 +346,8 @@ function buildResponse( })); } + writeExtractDebug?.({ strategies, data, fieldsFound, fieldsMissing }); + const response: Record = { action: 'extract_data', url, data, fieldsFound: fieldsFound.length, fieldsTotal: fieldNames.length, strategies, }; diff --git a/src/tools/index.ts b/src/tools/index.ts index c822f389..5451488f 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -95,6 +95,7 @@ import { registerValidatePageTool } from './validate-page'; // Structured extraction (#571) import { registerExtractDataTool } from './extract-data'; +import { registerOcQueryDebugTool } from './oc-query-debug'; // 2FA tools (#575) import { registerTotpGenerateTool } from './totp-generate'; @@ -231,6 +232,7 @@ export function registerAllTools(server: MCPServer): void { // Structured extraction (#571) registerExtractDataTool(server); + registerOcQueryDebugTool(server); // 2FA tools (#575) registerTotpGenerateTool(server); diff --git a/src/tools/oc-query-debug.ts b/src/tools/oc-query-debug.ts new file mode 100644 index 00000000..443d4f30 --- /dev/null +++ b/src/tools/oc-query-debug.ts @@ -0,0 +1,39 @@ +import { MCPServer } from '../mcp-server'; +import { MCPToolDefinition, MCPResult, ToolHandler } from '../types/mcp'; +import { getLatestQueryDebug, QueryDebugKind } 
from '../query-debug/store'; + +const definition: MCPToolDefinition = { + name: 'oc_query_debug', + description: 'Return the latest bounded local query debug record for extract_data or element resolution. No full DOM/HTML is stored.', + inputSchema: { + type: 'object', + properties: { + tabId: { type: 'string', description: 'Tab ID whose latest debug record should be returned.' }, + kind: { type: 'string', enum: ['extract', 'element'], description: 'Debug record kind. Default: extract.' }, + }, + required: ['tabId'], + }, +}; + +const handler: ToolHandler = async (sessionId: string, args: Record): Promise => { + const tabId = args.tabId as string | undefined; + const kind = (args.kind as QueryDebugKind | undefined) || 'extract'; + if (!tabId) { + return { content: [{ type: 'text', text: 'Error: tabId is required' }], isError: true }; + } + if (kind !== 'extract' && kind !== 'element') { + return { content: [{ type: 'text', text: 'Error: kind must be "extract" or "element"' }], isError: true }; + } + + const record = getLatestQueryDebug(sessionId, tabId, kind); + return { + content: [{ + type: 'text', + text: JSON.stringify(record ? 
{ action: 'oc_query_debug', found: true, record } : { action: 'oc_query_debug', found: false, kind, tabId }), + }], + }; +}; + +export function registerOcQueryDebugTool(server: MCPServer): void { + server.registerTool('oc_query_debug', handler, definition); +} diff --git a/tests/query-debug/store.test.ts b/tests/query-debug/store.test.ts new file mode 100644 index 00000000..b7b05e3d --- /dev/null +++ b/tests/query-debug/store.test.ts @@ -0,0 +1,23 @@ +/// + +import { clearQueryDebug, getLatestQueryDebug, recordQueryDebug, sanitizeDebugText } from '../../src/query-debug/store'; + +describe('query debug store', () => { + beforeEach(() => clearQueryDebug()); + + test('stores latest bounded record by session tab and kind', () => { + for (let i = 0; i < 7; i++) { + recordQueryDebug({ kind: 'extract', sessionId: 's1', tabId: 't1', timestamp: `2026-01-01T00:00:0${i}Z`, normalized: `{ title${i} }` }); + } + + const latest = getLatestQueryDebug('s1', 't1', 'extract'); + expect(latest?.normalized).toBe('{ title6 }'); + expect(getLatestQueryDebug('s1', 'missing', 'extract')).toBeNull(); + }); + + test('redacts token-like text and caps long strings', () => { + const text = sanitizeDebugText(`token=secret ${'x'.repeat(400)}`); + expect(text).toContain('[REDACTED]'); + expect(text.length).toBeLessThanOrEqual(240); + }); +}); diff --git a/tests/tools/extract-data.test.ts b/tests/tools/extract-data.test.ts index 0fc3f005..67a6f05c 100644 --- a/tests/tools/extract-data.test.ts +++ b/tests/tools/extract-data.test.ts @@ -103,6 +103,28 @@ describe('ExtractDataTool query mode', () => { expect(payload.fieldsFound).toBe(2); }); + + test('records bounded query debug after extraction', async () => { + const handler = await getExtractDataHandler(); + const { clearQueryDebug, getLatestQueryDebug } = await import('../../src/query-debug/store'); + clearQueryDebug(); + const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!; + (page.evaluate as 
jest.Mock).mockResolvedValueOnce({ title: 'Debug Title' }); + + await handler(testSessionId, { + tabId: testTargetId, + query: '{ title missing_field }', + }) as { content: Array<{ text: string }> }; + + const debug = getLatestQueryDebug(testSessionId, testTargetId, 'extract'); + expect(debug).not.toBeNull(); + expect(debug?.normalized).toBe('{ title missing_field }'); + expect(debug?.schemaSummary?.fields).toEqual(['title', 'missing_field']); + expect(debug?.fieldsFound).toEqual(['title']); + expect(debug?.fieldsMissing).toEqual(['missing_field']); + expect(debug?.durations?.totalMs).toEqual(expect.any(Number)); + }); + test('infers multiple extraction from list query', async () => { const handler = await getExtractDataHandler(); const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!; @@ -152,6 +174,23 @@ describe('ExtractDataTool query mode', () => { expect(result.content[0].text).toContain('supports only mode="fast"'); }); + + + test('records parser failure in query debug', async () => { + const handler = await getExtractDataHandler(); + const { clearQueryDebug, getLatestQueryDebug } = await import('../../src/query-debug/store'); + clearQueryDebug(); + + await handler(testSessionId, { + tabId: testTargetId, + query: '{ products[] { } }', + }) as { isError?: boolean; content: Array<{ text: string }> }; + + const debug = getLatestQueryDebug(testSessionId, testTargetId, 'extract'); + expect(debug?.notes?.[0]).toContain('parser failure'); + expect(debug?.strategies).toEqual([]); + }); + test('returns parser error with example for invalid query', async () => { const handler = await getExtractDataHandler(); const result = await handler(testSessionId, { diff --git a/tests/tools/oc-query-debug.test.ts b/tests/tools/oc-query-debug.test.ts new file mode 100644 index 00000000..6f71147a --- /dev/null +++ b/tests/tools/oc-query-debug.test.ts @@ -0,0 +1,36 @@ +/// + +describe('oc_query_debug tool', () => { + const getHandler = async () => { + 
jest.resetModules(); + const { registerOcQueryDebugTool } = await import('../../src/tools/oc-query-debug'); + const tools = new Map) => Promise<{ content: Array<{ text: string }>; isError?: boolean }> }>(); + registerOcQueryDebugTool({ + registerTool: (name: string, handler: unknown) => tools.set(name, { handler: handler as never }), + } as never); + return tools.get('oc_query_debug')!.handler; + }; + + beforeEach(async () => { + jest.resetModules(); + const { clearQueryDebug } = await import('../../src/query-debug/store'); + clearQueryDebug(); + }); + + test('returns not found for unknown tab', async () => { + const handler = await getHandler(); + const result = await handler('s1', { tabId: 't1' }); + const payload = JSON.parse(result.content[0].text); + expect(payload.found).toBe(false); + }); + + test('returns latest extract debug record', async () => { + const handler = await getHandler(); + const { recordQueryDebug } = await import('../../src/query-debug/store'); + recordQueryDebug({ kind: 'extract', sessionId: 's1', tabId: 't1', timestamp: new Date().toISOString(), normalized: '{ title }', fieldsFound: ['title'] }); + const result = await handler('s1', { tabId: 't1', kind: 'extract' }); + const payload = JSON.parse(result.content[0].text); + expect(payload.found).toBe(true); + expect(payload.record.normalized).toBe('{ title }'); + }); +});