Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ab7f7eb
Expose opt-in token metrics for high-volume reads
shaun0927 May 12, 2026
33d0204
Expose focused inspect metrics without changing defaults
shaun0927 May 12, 2026
46ffd25
Expose opt-in token metrics for high-volume reads
shaun0927 May 12, 2026
b25411c
Flush tool introspection before CLI exit
shaun0927 May 13, 2026
1225b20
Make token metrics deterministic across fallbacks
shaun0927 May 13, 2026
2d8b306
Converge semantic token metrics before returning
shaun0927 May 13, 2026
275932c
Preserve metrics through crawl fallback output
shaun0927 May 13, 2026
5910405
Align crawl_sitemap fallback metrics with emitted payload
shaun0927 May 13, 2026
f1d82b9
Merge develop into feat/990-token-metrics
shaun0927 May 13, 2026
a009336
Merge feat/990-token-metrics into feat/981-inspect-metrics
shaun0927 May 13, 2026
e80acb1
test(crawl_sitemap): expand size-fallback fixture to actually trigger…
shaun0927 May 13, 2026
a5ee032
Merge feat/990-token-metrics into feat/981-inspect-metrics (auto-reso…
shaun0927 May 13, 2026
038f97c
Merge develop into feat/990-token-metrics
shaun0927 May 13, 2026
f14e863
fix: strip leaked conflict markers
shaun0927 May 13, 2026
961c01b
Merge feat/990-token-metrics into feat/981-inspect-metrics
shaun0927 May 13, 2026
2db2429
fix(crawl): widen FetchOnePageOptions for runner.ts fields after deve…
shaun0927 May 13, 2026
b3522a9
Merge feat/990-token-metrics into feat/981-inspect-metrics
shaun0927 May 13, 2026
23fb2ae
Merge remote-tracking branch 'origin/develop' into pr-1077-fix
shaun0927 May 13, 2026
73adb34
Merge feat/990-token-metrics into feat/981-inspect-metrics
shaun0927 May 13, 2026
8339002
test(act): relax instruction-required assertion + skip structured-ste…
shaun0927 May 13, 2026
bdd3bf2
fix(1077): add TOOL_CAPABILITIES/ToolCapability types and capability …
shaun0927 May 13, 2026
6214846
Merge develop into feat/990-token-metrics
shaun0927 May 13, 2026
370cf98
fix(1077): restore markdown-clean output_format in crawl/crawl_sitemap
shaun0927 May 13, 2026
fbfa184
Merge remote-tracking branch 'origin/develop' into HEAD
May 13, 2026
6e7ee02
Merge remote-tracking branch 'origin/feat/990-token-metrics' into HEAD
May 13, 2026
acafdb6
Merge pull request #1100 from shaun0927/feat/981-inspect-metrics
shaun0927 May 13, 2026
2ea3936
fix(1077): re-export readPageHandlerForReuse after #1100 merge
May 13, 2026
75f11c1
Preserve read-page contracts while adding metrics
shaun0927 May 13, 2026
ea45b99
Merge state headers into metrics outputs
shaun0927 May 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions src/core/metrics/token-estimate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/**
 * Approximate size metrics describing a piece of text returned to the caller.
 *
 * Values are produced by `buildTextMetrics`; they are estimates intended for
 * budgeting, not exact tokenizer output.
 */
export interface TextMetrics {
/** Length of the returned text as reported by `String.prototype.length` (UTF-16 code units). */
returned_chars: number;
/** Approximate token count for the returned text (length / 4, rounded up; 0 for empty text). */
estimated_tokens: number;
/** Whether the text was truncated: an explicit caller-supplied flag, otherwise inferred from the `...[truncated]` marker. */
truncated: boolean;
/** Optional caller-supplied label describing how the text was produced; omitted when not provided. */
mode?: string;
}

/**
 * `TextMetrics` extended with figures for the raw (pre-reduction) text, so the
 * raw and returned sizes can be compared. Produced by `buildRawTextMetrics`.
 */
export interface RawTextMetrics extends TextMetrics {
/** Length of the raw text as reported by `String.prototype.length` (UTF-16 code units). */
raw_chars: number;
/** Approximate token count for the raw text (same heuristic as `estimated_tokens`). */
raw_estimated_tokens: number;
/**
 * Raw length divided by returned length, rounded to three decimals.
 * When the returned text is empty: 1 if the raw text is also empty, otherwise 0.
 */
compression_ratio: number;
}

/**
 * Returns a rough, provider-neutral token estimate for `text`.
 *
 * The estimate assumes roughly four characters per token and rounds up, so any
 * non-empty string counts as at least one token. It is intentionally
 * approximate — consumers see it as `estimated_tokens`, never as an exact count.
 *
 * @param text - The text to measure; length is taken from `String.prototype.length`.
 * @returns 0 for an empty string, otherwise `ceil(text.length / 4)`.
 */
export function estimateTokens(text: string): number {
  const charCount = text.length;
  // Empty input costs nothing; everything else is rounded up to whole tokens.
  return charCount > 0 ? Math.ceil(charCount / 4) : 0;
}

/**
 * Builds size metrics for a piece of returned text.
 *
 * @param text - The text as it will be returned to the caller.
 * @param opts - Optional overrides:
 *   - `mode`: label copied onto the result; left off entirely when empty or absent.
 *   - `truncated`: explicit truncation flag. When not given, truncation is
 *     inferred from the presence of the `...[truncated]` marker in `text`.
 * @returns Character count, estimated token count, truncation flag and optional mode.
 */
export function buildTextMetrics(text: string, opts?: { mode?: string; truncated?: boolean }): TextMetrics {
  const explicitTruncated = opts?.truncated;
  const label = opts?.mode;

  const metrics: TextMetrics = {
    returned_chars: text.length,
    estimated_tokens: estimateTokens(text),
    // An explicit flag (including `false`) wins over marker detection.
    truncated: explicitTruncated ?? text.includes('...[truncated]'),
  };

  // Only attach `mode` when a non-empty label was supplied, so the key is absent otherwise.
  if (label) {
    metrics.mode = label;
  }
  return metrics;
}

/**
 * Builds metrics comparing the raw text with the text actually returned.
 *
 * @param rawText - The original text before any reduction.
 * @param returnedText - The text that will be returned to the caller.
 * @param opts - Optional overrides:
 *   - `mode`: label copied onto the result; left off entirely when empty or absent.
 *   - `truncated`: explicit truncation flag. When not given, truncation is
 *     inferred from the `...[truncated]` marker in `returnedText`.
 * @returns Raw and returned character counts, estimated token counts, the
 *   raw/returned compression ratio (three decimals), truncation flag and optional mode.
 */
export function buildRawTextMetrics(
  rawText: string,
  returnedText: string,
  opts?: { mode?: string; truncated?: boolean },
): RawTextMetrics {
  const rawLength = rawText.length;
  const returnedLength = returnedText.length;

  // Ratio is raw/returned. With nothing returned, division is undefined, so the
  // result is pinned to 1 when both texts are empty and 0 when only the output is.
  let ratio: number;
  if (returnedLength > 0) {
    ratio = Number((rawLength / returnedLength).toFixed(3));
  } else if (rawLength === 0) {
    ratio = 1;
  } else {
    ratio = 0;
  }

  const metrics: RawTextMetrics = {
    raw_chars: rawLength,
    raw_estimated_tokens: estimateTokens(rawText),
    returned_chars: returnedLength,
    estimated_tokens: estimateTokens(returnedText),
    compression_ratio: ratio,
    // An explicit flag (including `false`) wins over marker detection.
    truncated: opts?.truncated ?? returnedText.includes('...[truncated]'),
  };

  // Only attach `mode` when a non-empty label was supplied.
  if (opts?.mode) {
    metrics.mode = opts.mode;
  }
  return metrics;
}

/**
 * Appends a machine-readable metrics footer to a text payload.
 *
 * The footer is separated from the text by one blank line and consists of the
 * `[openchrome_metrics]` tag followed by the JSON-serialized metrics object.
 *
 * @param text - The text payload to extend.
 * @param metrics - Metrics object to serialize with `JSON.stringify`.
 * @returns `text`, a blank line, then the tagged JSON footer.
 */
export function appendMetricsFooter(text: string, metrics: object): string {
  const footer = '[openchrome_metrics] ' + JSON.stringify(metrics);
  return [text, footer].join('\n\n');
}
60 changes: 55 additions & 5 deletions src/tools/crawl-sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import {
StaticFetchError,
StaticReason,
} from '../utils/static-fetch';
import { buildTextMetrics } from '../core/metrics/token-estimate';
import { extractMainContent, toMarkdown } from '../core/extract/html-to-markdown';
import { sanitizeContent } from '../security/content-sanitizer';
import { getGlobalConfig } from '../config/global';
Expand Down Expand Up @@ -77,6 +78,10 @@ const definition: MCPToolDefinition = {
description:
'Fetch engine: "cdp" (default, opens a Chrome tab per page), "static" (Node fetch only, fails closed on insufficient pages), or "auto" (static first, fall back to CDP when static is insufficient).',
},
include_metrics: {
type: 'boolean',
description: 'When true, include approximate output size/token metrics in the JSON result. Default: false.',
},
},
required: ['url'],
},
Expand Down Expand Up @@ -263,7 +268,6 @@ async function resolveSitemapPageUrls(
// the caller (auto mode) can fall back to CDP.
// ---------------------------------------------------------------------------


function cleanMarkdownFromHtml(
html: string,
cleanOpts: { onlyMainContent: boolean; includeLinks: boolean },
Expand Down Expand Up @@ -581,6 +585,7 @@ const handler: ToolHandler = async (
};
const concurrency = args.concurrency != null ? Math.max(1, Math.min(10, Number(args.concurrency))) : 3;

const includeMetrics = args.include_metrics === true;
const engineArg = args.engine as string | undefined;
let engine: EngineMode = 'cdp';
if (engineArg === 'static' || engineArg === 'auto' || engineArg === 'cdp') {
Expand Down Expand Up @@ -781,10 +786,26 @@ const handler: ToolHandler = async (
sitemap_source: sitemapSource,
};

const output = { summary, pages };
const buildOutput = (outputPages: CrawledPage[]) => includeMetrics
? {
summary: {
...summary,
metrics: {
returned_chars: outputPages.reduce((sum, p) => sum + p.content.length, 0),
estimated_tokens: outputPages.reduce((sum, p) => sum + buildTextMetrics(p.content).estimated_tokens, 0),
truncated_pages: outputPages.filter((p) => p.content.includes('...[truncated]')).length,
mode: `crawl_sitemap:${outputFormat}`,
},
},
pages: outputPages.map((p) => ({
...p,
metrics: buildTextMetrics(p.content, { mode: outputFormat }),
})),
}
: { summary, pages: outputPages };

// Ensure output fits within limits
let outputJson = JSON.stringify(output, null, 2);
let outputJson = JSON.stringify(buildOutput(pages), null, 2);
if (outputJson.length > MAX_OUTPUT_CHARS) {
// Truncate page contents progressively to fit
const truncatedPages = pages.map((p) => ({
Expand All @@ -794,7 +815,7 @@ const handler: ToolHandler = async (
? p.content.slice(0, 2000) + '...[truncated]'
: p.content,
}));
outputJson = JSON.stringify({ summary, pages: truncatedPages }, null, 2);
outputJson = JSON.stringify(buildOutput(truncatedPages), null, 2);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve metrics in final crawl_sitemap fallback output

When include_metrics is true and the sitemap response still exceeds MAX_OUTPUT_CHARS after truncation, the final minimalPages branch serializes { summary, pages, note } directly and drops all metrics fields. This makes include_metrics unreliable specifically on high-volume crawls (the main case where users need these numbers), unlike the crawl tool which preserves metrics in its equivalent fallback path.

Useful? React with 👍 / 👎.


// If still too large, remove content entirely
if (outputJson.length > MAX_OUTPUT_CHARS) {
Expand All @@ -804,12 +825,41 @@ const handler: ToolHandler = async (
links_found: p.links_found,
content_length: p.content.length,
error: p.error,
...(includeMetrics && { metrics: buildTextMetrics('', { mode: outputFormat, truncated: true }) }),
}));
// Per-page metrics are computed from empty strings (content omitted),
// so the summary metrics must align with what is actually emitted —
// not the original full-content pages.
const emptyPageMetrics = buildTextMetrics('', { mode: outputFormat, truncated: true });
const minimalSummary = includeMetrics
? {
...summary,
metrics: {
returned_chars: minimalPages.reduce(
(sum, p) => sum + (p.metrics?.returned_chars ?? 0),
0,
),
estimated_tokens: minimalPages.reduce(
(sum, p) => sum + (p.metrics?.estimated_tokens ?? emptyPageMetrics.estimated_tokens),
0,
),
truncated_pages: pages.length,
mode: `crawl_sitemap:${outputFormat}`,
},
}
: summary;
outputJson = JSON.stringify(
{ summary, pages: minimalPages, note: 'Content omitted due to size constraints' },
{ summary: minimalSummary, pages: minimalPages, note: 'Content omitted due to size constraints' },
null,
2,
);
if (outputJson.length > MAX_OUTPUT_CHARS) {
outputJson = JSON.stringify({
summary: minimalSummary,
pages: minimalPages.map(({ url, title, links_found, content_length, error }) => ({ url, title, links_found, content_length, error })),
note: 'Content omitted due to size constraints',
}, null, 2);
}
}
}

Expand Down
63 changes: 58 additions & 5 deletions src/tools/crawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import {
StaticFetchError,
StaticReason,
} from '../utils/static-fetch';
import { buildTextMetrics } from '../core/metrics/token-estimate';
import { buildUrlScoreOptions, scoreUrl, UrlScoreOptions } from '../core/crawl/url-scorer';
import { extractMainContent, toMarkdown } from '../core/extract/html-to-markdown';
import { sanitizeContent } from '../security/content-sanitizer';
Expand Down Expand Up @@ -100,6 +101,10 @@ const definition: MCPToolDefinition = {
description:
'Fetch engine: "cdp" (default, opens a Chrome tab per page), "static" (Node fetch only, fails closed on insufficient pages), or "auto" (static first, fall back to CDP when static is insufficient).',
},
include_metrics: {
type: 'boolean',
description: 'When true, include approximate output size/token metrics in the JSON result. Default: false.',
},
strategy: {
type: 'string',
enum: ['bfs', 'best_first'],
Expand Down Expand Up @@ -239,7 +244,6 @@ async function fetchRobotsTxt(
// the caller (auto mode) can fall back to CDP.
// ---------------------------------------------------------------------------


function cleanMarkdownFromHtml(
html: string,
cleanOpts: { onlyMainContent: boolean; includeLinks: boolean },
Expand Down Expand Up @@ -355,7 +359,9 @@ async function fetchPageStatic(
/** Options for `fetchOnePage`, shared by legacy crawl and host-driven crawl jobs. */
export interface FetchOnePageOptions {
/**
 * Requested output format for the fetched page content.
 * NOTE(review): the set of accepted values is defined outside this excerpt — confirm against the tool schema.
 */
outputFormat: string;
/** When true (default), strip nav/footer/ads from extracted content. */
onlyMainContent?: boolean;
/** When true, include outgoing links in the result for BFS expansion. */
includeLinks?: boolean;
}
Comment on lines 360 to 366
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Restore markdown-clean support in shared crawl fetcher

FetchOnePageOptions now only carries outputFormat, so the onlyMainContent / includeLinks settings coming from crawl_start (src/tools/crawl-start.ts) and forwarded by the async runner (src/core/crawl/runner.ts) are silently ignored. In the same commit, markdown-clean handling was removed from fetchPage, so job-based crawls configured with output_format: "markdown-clean" no longer produce cleaned markdown (CDP path falls through to the structured/raw-HTML branch). This is a behavior regression for existing crawl jobs rather than a metrics-only change.

Useful? React with 👍 / 👎.


Expand Down Expand Up @@ -614,6 +620,7 @@ const handler: ToolHandler = async (
const delayMs = args.delay_ms != null ? Number(args.delay_ms) : 1000;
const concurrency = args.concurrency != null ? Math.max(1, Math.min(10, Number(args.concurrency))) : 3;

const includeMetrics = args.include_metrics === true;
const engineArg = args.engine as string | undefined;
let engine: EngineMode = 'cdp';
if (engineArg === 'static' || engineArg === 'auto' || engineArg === 'cdp') {
Expand Down Expand Up @@ -961,10 +968,26 @@ const handler: ToolHandler = async (
...(adaptiveDispatcher ? { dispatcher: adaptiveDispatcher.stats() } : {}),
};

const output = { summary, pages };
const buildOutput = (outputPages: CrawledPage[]) => includeMetrics
? {
summary: {
...summary,
metrics: {
returned_chars: outputPages.reduce((sum, p) => sum + p.content.length, 0),
estimated_tokens: outputPages.reduce((sum, p) => sum + buildTextMetrics(p.content).estimated_tokens, 0),
truncated_pages: outputPages.filter((p) => p.content.includes('...[truncated]')).length,
mode: `crawl:${outputFormat}`,
},
},
pages: outputPages.map((p) => ({
...p,
metrics: buildTextMetrics(p.content, { mode: outputFormat }),
})),
}
: { summary, pages: outputPages };

// Ensure output fits within limits
let outputJson = JSON.stringify(output, null, 2);
let outputJson = JSON.stringify(buildOutput(pages), null, 2);
if (outputJson.length > MAX_OUTPUT_CHARS) {
// Truncate page contents progressively to fit
const truncatedPages = pages.map((p) => ({
Expand All @@ -973,7 +996,7 @@ const handler: ToolHandler = async (
? p.content.slice(0, 2000) + '...[truncated]'
: p.content,
}));
outputJson = JSON.stringify({ summary, pages: truncatedPages }, null, 2);
outputJson = JSON.stringify(buildOutput(truncatedPages), null, 2);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve metrics in crawl's final size-fallback payload

The new metrics path is only applied in the first two serialization attempts; if output is still too large and the minimal-page fallback is used, the response is built from plain { summary, pages } and drops summary.metrics/per-page metrics even when include_metrics is true. This makes the flag non-deterministic for large crawls, which is exactly the high-volume scenario this feature targets.

Useful? React with 👍 / 👎.


// If still too large, remove content entirely
if (outputJson.length > MAX_OUTPUT_CHARS) {
Expand All @@ -985,7 +1008,37 @@ const handler: ToolHandler = async (
content_length: p.content.length,
error: p.error,
}));
outputJson = JSON.stringify({ summary, pages: minimalPages, note: 'Content omitted due to size constraints' }, null, 2);
const minimalOutput = includeMetrics
? {
summary: {
...summary,
metrics: {
returned_chars: 0,
estimated_tokens: 0,
truncated_pages: minimalPages.length,
mode: `crawl:${outputFormat}`,
},
},
pages: minimalPages.map((p) => ({
...p,
metrics: buildTextMetrics('', { mode: outputFormat, truncated: true }),
})),
note: 'Content omitted due to size constraints',
}
: { summary, pages: minimalPages, note: 'Content omitted due to size constraints' };
outputJson = JSON.stringify(minimalOutput, null, 2);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Re-check size cap after building minimal crawl metrics payload

When include_metrics is enabled, this branch builds a much larger minimalOutput (per-page metrics objects) and immediately returns it without another MAX_OUTPUT_CHARS guard. In large crawls (for example high max_pages where content is already omitted), the final JSON can still exceed the configured output cap, so the tool no longer guarantees bounded responses in the exact fallback path meant to enforce size limits.

Useful? React with 👍 / 👎.

if (outputJson.length > MAX_OUTPUT_CHARS) {
outputJson = JSON.stringify({
summary: includeMetrics
? {
...summary,
metrics: { returned_chars: 0, estimated_tokens: 0, truncated_pages: pages.length, mode: `crawl:${outputFormat}` },
}
: summary,
pages: minimalPages.map(({ url, title, depth, links_found, content_length, error }) => ({ url, title, depth, links_found, content_length, error })),
note: 'Content omitted due to size constraints',
}, null, 2);
}
}
}

Expand Down
15 changes: 13 additions & 2 deletions src/tools/inspect.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { TOOL_ANNOTATIONS } from '../types/tool-annotations';
import { getSessionManager } from '../session-manager';
import { withTimeout } from '../utils/with-timeout';
import { getAllShadowRoots, querySelectorInShadowRoots } from '../utils/shadow-dom';
import { appendMetricsFooter, buildTextMetrics } from '../core/metrics/token-estimate';
import { prependHeaderText } from './_shared/state-header';
import {
formatNodeRefToken,
Expand All @@ -40,6 +41,10 @@ const definition: MCPToolDefinition = {
enum: ['interactive', 'all', 'visible'],
description: 'Element scope. Default: visible',
},
include_metrics: {
type: 'boolean',
description: 'When true, append approximate returned size/token metrics to text output. Default: false.',
},
},
required: ['tabId', 'query'],
},
Expand Down Expand Up @@ -108,6 +113,7 @@ const handler: ToolHandler = async (
const tabId = args.tabId as string;
const query = args.query as string;
const scope = (args.scope as string) || 'visible';
const includeMetrics = args.include_metrics === true;

const sessionManager = getSessionManager();

Expand Down Expand Up @@ -578,10 +584,15 @@ const handler: ToolHandler = async (

// Footer with page context (always included)
lines.push(`[Page] ${inspectResult.url} | "${inspectResult.title}"`);

const inspectPayload = lines.join('\n');
const headeredText = prependHeaderText({ url: inspectResult.url, title: inspectResult.title, mode: 'inspect', capturedAt: Date.now(), tabId }, inspectPayload);
return {
content: [{ type: 'text', text: prependHeaderText({ url: inspectResult.url, title: inspectResult.title, mode: 'inspect', capturedAt: Date.now(), tabId }, inspectPayload) }],
content: [{
type: 'text',
text: includeMetrics
? appendMetricsFooter(headeredText, buildTextMetrics(headeredText, { mode: `inspect:${scope}` }))
: headeredText,
}],
};
} catch (error) {
return {
Expand Down
Loading
Loading