Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions src/query-debug/store.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Discriminates which pipeline produced a debug record: structured data
// extraction ('extract') or element resolution ('element').
export type QueryDebugKind = 'extract' | 'element';

// One bounded, sanitized snapshot of a query-debug event. All free-text
// fields are redacted and length/count-capped before storage; no raw
// DOM/HTML is ever kept here.
export interface QueryDebugRecord {
  kind: QueryDebugKind;
  // Session and tab the record belongs to; together with `kind` these form
  // the storage key.
  sessionId: string;
  tabId: string;
  // ISO-8601 creation time, supplied by the caller.
  timestamp: string;
  // Normalized query text, when the request used query mode.
  normalized?: string;
  // Extraction mode requested by the caller (e.g. 'fast').
  modeUsed?: string;
  // Compact schema shape: field names, whether multiple items were requested,
  // and the root list field for list-style queries.
  schemaSummary?: { fields: string[]; multiple: boolean; queryRoot?: string };
  // Extraction strategies attempted, in order.
  strategies?: string[];
  fieldsFound?: string[];
  fieldsMissing?: string[];
  // Named timings in milliseconds (e.g. totalMs).
  durations?: Record<string, number>;
  // Size of the serialized output and whether it exceeded the display cap.
  output?: { chars: number; truncated: boolean };
  // Free-form diagnostic notes (e.g. parser failure details).
  notes?: string[];
}

// At most this many records are retained per (session, tab, kind) bucket.
const MAX_RECORDS_PER_KEY = 5;
// Hard cap on the length of any stored free-text value.
const MAX_TEXT_CHARS = 240;
// Heuristic patterns for credential-like substrings that must never be stored.
const SECRET_PATTERNS = [
  /password\s*[:=]\s*[^\s,;]+/gi,
  /token\s*[:=]\s*[^\s,;]+/gi,
  /api[_-]?key\s*[:=]\s*[^\s,;]+/gi,
  /bearer\s+[a-z0-9._-]+/gi,
];

// In-memory store: key(sessionId, tabId, kind) -> newest-first record list.
const records = new Map<string, QueryDebugRecord[]>();

// Builds the composite storage key for one (session, tab, kind) bucket.
function key(sessionId: string, tabId: string, kind: QueryDebugKind): string {
  return `${sessionId}::${tabId}::${kind}`;
}

/**
 * Redacts credential-like substrings and caps the text length.
 *
 * Redaction runs BEFORE truncation so that (a) it sees the full text rather
 * than an already-cut prefix, and (b) the '[REDACTED]' replacement — which
 * can be longer than the secret it replaces — can never push the final
 * result past MAX_TEXT_CHARS.
 */
export function sanitizeDebugText(value: string): string {
  let sanitized = value;
  for (const pattern of SECRET_PATTERNS) {
    sanitized = sanitized.replace(pattern, '[REDACTED]');
  }
  return sanitized.slice(0, MAX_TEXT_CHARS);
}

/**
 * Returns a copy of `record` with every free-text field redacted,
 * length-capped and count-bounded. Numeric fields (durations, output)
 * pass through unchanged.
 */
function sanitizeRecord(record: QueryDebugRecord): QueryDebugRecord {
  return {
    ...record,
    normalized: record.normalized ? sanitizeDebugText(record.normalized) : undefined,
    // Bound the count first, then sanitize — consistent with the fields below
    // and avoids redacting entries that are immediately discarded.
    notes: record.notes?.slice(0, 8).map(sanitizeDebugText),
    schemaSummary: record.schemaSummary
      ? { ...record.schemaSummary, fields: record.schemaSummary.fields.slice(0, 40).map(sanitizeDebugText) }
      : undefined,
    fieldsFound: record.fieldsFound?.slice(0, 40).map(sanitizeDebugText),
    fieldsMissing: record.fieldsMissing?.slice(0, 40).map(sanitizeDebugText),
    strategies: record.strategies?.slice(0, 12).map(sanitizeDebugText),
  };
}

/**
 * Sanitizes `record` and stores it as the newest entry in its
 * (session, tab, kind) bucket, evicting the oldest entries beyond the
 * per-key cap. Returns the sanitized record that was stored.
 */
export function recordQueryDebug(record: QueryDebugRecord): QueryDebugRecord {
  const sanitized = sanitizeRecord(record);
  const bucket = key(sanitized.sessionId, sanitized.tabId, sanitized.kind);
  const existing = records.get(bucket) || [];
  records.set(bucket, [sanitized, ...existing].slice(0, MAX_RECORDS_PER_KEY));
  return sanitized;
}

/**
 * Returns the most recent debug record for the given session/tab/kind,
 * or null when nothing has been recorded.
 */
export function getLatestQueryDebug(sessionId: string, tabId: string, kind: QueryDebugKind = 'extract'): QueryDebugRecord | null {
  const bucket = records.get(key(sessionId, tabId, kind));
  return bucket && bucket.length > 0 ? bucket[0] : null;
}

/**
 * Removes stored debug records. With no arguments everything is cleared;
 * otherwise only buckets whose sessionId and/or tabId match the provided
 * filters (all kinds) are removed.
 */
export function clearQueryDebug(sessionId?: string, tabId?: string): void {
  if (!sessionId && !tabId) {
    records.clear();
    return;
  }
  // Snapshot the keys so deletion during iteration is safe.
  for (const bucket of Array.from(records.keys())) {
    const [recordSession, recordTab] = bucket.split('::');
    const sessionMatches = sessionId === undefined || recordSession === sessionId;
    const tabMatches = tabId === undefined || recordTab === tabId;
    if (sessionMatches && tabMatches) {
      records.delete(bucket);
    }
  }
}
84 changes: 80 additions & 4 deletions src/tools/extract-data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { MCPToolDefinition, MCPResult, ToolHandler, ToolContext } from '../types
import { getSessionManager } from '../session-manager';
import { withTimeout } from '../utils/with-timeout';
import { getDomainMemory, extractDomainFromUrl } from '../memory/domain-memory';
import { recordQueryDebug } from '../query-debug/store';
import {
validateSchema,
validateAndCoerce,
Expand Down Expand Up @@ -96,6 +97,7 @@ const handler: ToolHandler = async (
const mode = (args.mode as string | undefined) || 'fast';
let multiple = (args.multiple as boolean) ?? false;
const debug = (args.debug as boolean) ?? false;
const startedAt = Date.now();

if (!tabId) {
return { content: [{ type: 'text', text: 'Error: tabId is required' }], isError: true };
Expand All @@ -117,6 +119,20 @@ const handler: ToolHandler = async (
if (queryPlan.multiple) multiple = true;
} catch (error) {
const detail = error instanceof ExtractionQueryParseError ? error.message : String(error);
recordQueryDebug({
kind: 'extract',
sessionId,
tabId,
timestamp: new Date().toISOString(),
normalized: query,
modeUsed: mode,
strategies: [],
fieldsFound: [],
fieldsMissing: [],
durations: { totalMs: Math.max(0, Date.now() - startedAt) },
output: { chars: 0, truncated: false },
notes: [`parser failure: ${detail}`],
});
return {
content: [{
type: 'text',
Expand All @@ -128,11 +144,38 @@ const handler: ToolHandler = async (
}

if (!schema) {
recordQueryDebug({
kind: 'extract',
sessionId,
tabId,
timestamp: new Date().toISOString(),
modeUsed: mode,
strategies: [],
fieldsFound: [],
fieldsMissing: [],
durations: { totalMs: Math.max(0, Date.now() - startedAt) },
output: { chars: 0, truncated: false },
notes: ['schema/query missing'],
});
return { content: [{ type: 'text', text: 'Error: Either schema or query is required. Example query: { title price(number) }' }], isError: true };
}

const schemaCheck = validateSchema(schema);
if (!schemaCheck.valid) {
recordQueryDebug({
kind: 'extract',
sessionId,
tabId,
timestamp: new Date().toISOString(),
normalized: queryPlan?.normalizedQuery,
modeUsed: mode,
strategies: [],
fieldsFound: [],
fieldsMissing: [],
durations: { totalMs: Math.max(0, Date.now() - startedAt) },
output: { chars: 0, truncated: false },
notes: [`schema validation failure: ${schemaCheck.error}`],
});
return { content: [{ type: 'text', text: `Error: Invalid schema — ${schemaCheck.error}` }], isError: true };
}

Expand Down Expand Up @@ -163,12 +206,36 @@ const handler: ToolHandler = async (
const pageUrl = page.url();
const domain = extractDomainFromUrl(pageUrl);

// Captures a bounded debug snapshot of this extraction attempt into the
// local query-debug store. fieldsFound/fieldsMissing default to being
// derived from `data` (non-null/undefined/'' values) when not supplied.
const writeExtractDebug = (payload: { strategies: string[]; data?: Record<string, unknown>; fieldsFound?: string[]; fieldsMissing?: string[]; notes?: string[]; outputValue?: unknown }): void => {
const fieldsFound = payload.fieldsFound || Object.entries(payload.data || {})
.filter(([, v]) => v !== null && v !== undefined && v !== '')
.map(([k]) => k);
const fieldsMissing = payload.fieldsMissing || fieldNames.filter(f => !fieldsFound.includes(f));
// Output size measured on the serialized payload; truncation flag mirrors
// the 12000-char display cap.
const outputChars = JSON.stringify(payload.outputValue ?? payload.data ?? {}).length;
recordQueryDebug({
kind: 'extract',
sessionId,
tabId,
timestamp: new Date().toISOString(),
normalized: queryPlan?.normalizedQuery,
modeUsed: mode,
schemaSummary: { fields: fieldNames, multiple, ...(queryPlan?.rootListField ? { queryRoot: queryPlan.rootListField } : {}) },
strategies: payload.strategies,
fieldsFound,
fieldsMissing,
durations: { totalMs: Math.max(0, Date.now() - startedAt) },
output: { chars: outputChars, truncated: outputChars > 12000 },
notes: payload.notes,
});
};

// Multiple items mode
if (multiple) {
const multiScript = buildMultipleItemExtractor(fieldPlans, schemaProps, selector);
const rawItems = await withTimeout(page.evaluate(multiScript) as Promise<Record<string, unknown>[]>, 15000, 'extract_data');

if (!Array.isArray(rawItems) || rawItems.length === 0) {
writeExtractDebug({ strategies: ['multiple-item'], fieldsFound: [], fieldsMissing: fieldNames, notes: ['no repeating items found'], outputValue: [] });
return {
content: [{ type: 'text', text: JSON.stringify({
action: 'extract_data', url: pageUrl, multiple: true, ...(queryPlan ? { queryRoot: queryPlan.rootListField } : {}), items: [], count: 0,
Expand All @@ -188,6 +255,12 @@ const handler: ToolHandler = async (
selector: selector || 'auto', fieldCount: fieldNames.length, itemCount: validated.length,
}));

writeExtractDebug({
strategies: ['multiple-item'],
fieldsFound: fieldNames.filter(f => validated.some(item => item[f] !== null && item[f] !== undefined && item[f] !== '')),
outputValue: validated,
});

return {
content: [{ type: 'text', text: JSON.stringify({
action: 'extract_data', url: pageUrl, multiple: true, ...(queryPlan ? { queryRoot: queryPlan.rootListField } : {}), items: validated, count: validated.length,
Expand Down Expand Up @@ -221,7 +294,7 @@ const handler: ToolHandler = async (

if (countFields(merged) >= fieldNames.length) {
const { result, validation } = validateAndCoerce(merged, schema);
return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined);
return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug);
}

// Strategy 2: Microdata
Expand All @@ -238,7 +311,7 @@ const handler: ToolHandler = async (

if (countFields(merged) >= fieldNames.length) {
const { result, validation } = validateAndCoerce(merged, schema);
return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined);
return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug);
}

// Strategy 4: CSS heuristic
Expand All @@ -248,7 +321,7 @@ const handler: ToolHandler = async (
} catch { /* non-fatal */ }

const { result, validation } = validateAndCoerce(merged, schema);
return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined);
return buildResponse(result, validation.errors, pageUrl, strategies, domain, fieldNames, queryPlan?.normalizedQuery, debug ? fieldDiagnostics : undefined, writeExtractDebug);
} catch (error) {
return { content: [{ type: 'text', text: `Extraction error: ${error instanceof Error ? error.message : String(error)}` }], isError: true };
}
Expand All @@ -257,7 +330,8 @@ const handler: ToolHandler = async (
function buildResponse(
data: Record<string, unknown>, errors: string[], url: string,
strategies: string[], domain: string, fieldNames: string[], normalizedQuery?: string,
fieldDiagnostics?: Record<string, { resolvedVia?: string; aliasesTried: string[] }>
fieldDiagnostics?: Record<string, { resolvedVia?: string; aliasesTried: string[] }>,
writeExtractDebug?: (payload: { strategies: string[]; data?: Record<string, unknown>; fieldsFound?: string[]; fieldsMissing?: string[]; notes?: string[]; outputValue?: unknown }) => void
): MCPResult {
const fieldsFound = Object.entries(data).filter(([, v]) => v !== null && v !== undefined && v !== '').map(([k]) => k);
const fieldsMissing = fieldNames.filter(f => !fieldsFound.includes(f));
Expand All @@ -272,6 +346,8 @@ function buildResponse(
}));
}

writeExtractDebug?.({ strategies, data, fieldsFound, fieldsMissing });

const response: Record<string, unknown> = {
action: 'extract_data', url, data, fieldsFound: fieldsFound.length, fieldsTotal: fieldNames.length, strategies,
};
Expand Down
2 changes: 2 additions & 0 deletions src/tools/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ import { registerValidatePageTool } from './validate-page';

// Structured extraction (#571)
import { registerExtractDataTool } from './extract-data';
import { registerOcQueryDebugTool } from './oc-query-debug';

// 2FA tools (#575)
import { registerTotpGenerateTool } from './totp-generate';
Expand Down Expand Up @@ -231,6 +232,7 @@ export function registerAllTools(server: MCPServer): void {

// Structured extraction (#571)
registerExtractDataTool(server);
registerOcQueryDebugTool(server);

// 2FA tools (#575)
registerTotpGenerateTool(server);
Expand Down
39 changes: 39 additions & 0 deletions src/tools/oc-query-debug.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { MCPServer } from '../mcp-server';
import { MCPToolDefinition, MCPResult, ToolHandler } from '../types/mcp';
import { getLatestQueryDebug, QueryDebugKind } from '../query-debug/store';

// Tool contract for oc_query_debug: read-only access to the latest bounded
// debug record captured by the local query-debug store.
const definition: MCPToolDefinition = {
  name: 'oc_query_debug',
  description: 'Return the latest bounded local query debug record for extract_data or element resolution. No full DOM/HTML is stored.',
  inputSchema: {
    type: 'object',
    properties: {
      tabId: { type: 'string', description: 'Tab ID whose latest debug record should be returned.' },
      kind: { type: 'string', enum: ['extract', 'element'], description: 'Debug record kind. Default: extract.' },
    },
    required: ['tabId'],
  },
};

/**
 * Looks up the newest debug record for the caller's session/tab and returns
 * it as JSON text. Responds with found=false (not an error) when nothing
 * has been recorded yet.
 */
const handler: ToolHandler = async (sessionId: string, args: Record<string, unknown>): Promise<MCPResult> => {
  const tabId = args.tabId as string | undefined;
  const kind = (args.kind as QueryDebugKind | undefined) || 'extract';

  // Validate inputs up front so failures surface as explicit tool errors.
  if (!tabId) {
    return { content: [{ type: 'text', text: 'Error: tabId is required' }], isError: true };
  }
  if (kind !== 'extract' && kind !== 'element') {
    return { content: [{ type: 'text', text: 'Error: kind must be "extract" or "element"' }], isError: true };
  }

  const record = getLatestQueryDebug(sessionId, tabId, kind);
  const payload = record
    ? { action: 'oc_query_debug', found: true, record }
    : { action: 'oc_query_debug', found: false, kind, tabId };
  return { content: [{ type: 'text', text: JSON.stringify(payload) }] };
};

// Registers the oc_query_debug tool on the given MCP server instance.
export function registerOcQueryDebugTool(server: MCPServer): void {
  server.registerTool('oc_query_debug', handler, definition);
}
23 changes: 23 additions & 0 deletions tests/query-debug/store.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/// <reference types="jest" />

import { clearQueryDebug, getLatestQueryDebug, recordQueryDebug, sanitizeDebugText } from '../../src/query-debug/store';

describe('query debug store', () => {
  // The store is module-level state, so reset it between test cases.
  beforeEach(() => clearQueryDebug());

  test('stores latest bounded record by session tab and kind', () => {
    // Insert more records than the per-key cap (5); the newest must win.
    for (let i = 0; i < 7; i++) {
      recordQueryDebug({ kind: 'extract', sessionId: 's1', tabId: 't1', timestamp: `2026-01-01T00:00:0${i}Z`, normalized: `{ title${i} }` });
    }

    const latest = getLatestQueryDebug('s1', 't1', 'extract');
    expect(latest?.normalized).toBe('{ title6 }');
    // A bucket that was never written returns null rather than throwing.
    expect(getLatestQueryDebug('s1', 'missing', 'extract')).toBeNull();
  });

  test('redacts token-like text and caps long strings', () => {
    // 'token=secret' must be redacted and total length capped at 240 chars.
    const text = sanitizeDebugText(`token=secret ${'x'.repeat(400)}`);
    expect(text).toContain('[REDACTED]');
    expect(text.length).toBeLessThanOrEqual(240);
  });
});
39 changes: 39 additions & 0 deletions tests/tools/extract-data.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,28 @@ describe('ExtractDataTool query mode', () => {
expect(payload.fieldsFound).toBe(2);
});


test('records bounded query debug after extraction', async () => {
const handler = await getExtractDataHandler();
const { clearQueryDebug, getLatestQueryDebug } = await import('../../src/query-debug/store');
clearQueryDebug();
const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!;
(page.evaluate as jest.Mock).mockResolvedValueOnce({ title: 'Debug Title' });

await handler(testSessionId, {
tabId: testTargetId,
query: '{ title missing_field }',
}) as { content: Array<{ text: string }> };

const debug = getLatestQueryDebug(testSessionId, testTargetId, 'extract');
expect(debug).not.toBeNull();
expect(debug?.normalized).toBe('{ title missing_field }');
expect(debug?.schemaSummary?.fields).toEqual(['title', 'missing_field']);
expect(debug?.fieldsFound).toEqual(['title']);
expect(debug?.fieldsMissing).toEqual(['missing_field']);
expect(debug?.durations?.totalMs).toEqual(expect.any(Number));
});

test('infers multiple extraction from list query', async () => {
const handler = await getExtractDataHandler();
const page = (await mockSessionManager.getPage(testSessionId, testTargetId))!;
Expand Down Expand Up @@ -152,6 +174,23 @@ describe('ExtractDataTool query mode', () => {
expect(result.content[0].text).toContain('supports only mode="fast"');
});



test('records parser failure in query debug', async () => {
const handler = await getExtractDataHandler();
const { clearQueryDebug, getLatestQueryDebug } = await import('../../src/query-debug/store');
clearQueryDebug();

await handler(testSessionId, {
tabId: testTargetId,
query: '{ products[] { } }',
}) as { isError?: boolean; content: Array<{ text: string }> };

const debug = getLatestQueryDebug(testSessionId, testTargetId, 'extract');
expect(debug?.notes?.[0]).toContain('parser failure');
expect(debug?.strategies).toEqual([]);
});

test('returns parser error with example for invalid query', async () => {
const handler = await getExtractDataHandler();
const result = await handler(testSessionId, {
Expand Down
Loading