-
Notifications
You must be signed in to change notification settings - Fork 37
feat(core): token and compression metrics for high-volume read tools (#990) #1077
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ab7f7eb
33d0204
46ffd25
b25411c
1225b20
2d8b306
275932c
5910405
f1d82b9
a009336
e80acb1
a5ee032
038f97c
f14e863
961c01b
2db2429
b3522a9
23fb2ae
73adb34
8339002
bdd3bf2
6214846
370cf98
fbfa184
6e7ee02
acafdb6
2ea3936
75f11c1
ea45b99
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
/**
 * Approximate size metrics describing a piece of text returned by a tool.
 */
export interface TextMetrics {
  /** Length of the returned text, as reported by `String.prototype.length`. */
  returned_chars: number;

  /** Rough, provider-neutral token estimate for the returned text (not an exact count). */
  estimated_tokens: number;

  /** Whether the returned text is considered truncated. */
  truncated: boolean;

  /** Optional label identifying the output mode/format that produced the text. */
  mode?: string;
}
|
|
||
/**
 * Text metrics that additionally describe the raw (pre-compression) input,
 * so callers can see how much a tool reduced its output.
 */
export interface RawTextMetrics extends TextMetrics {
  /** Length of the raw text before any reduction. */
  raw_chars: number;

  /** Rough token estimate for the raw text (not an exact count). */
  raw_estimated_tokens: number;

  /** Raw length divided by returned length (how many times smaller the output is). */
  compression_ratio: number;
}
|
|
||
/**
 * Estimates how many tokens a string occupies.
 *
 * The estimate is intentionally coarse and not tied to any provider's
 * tokenizer: roughly one token per four characters, rounded up. Results are
 * surfaced as `estimated_tokens`, never as an exact token count.
 *
 * @param text - The text to measure; length is taken from `text.length`.
 * @returns `0` for an empty string, otherwise `ceil(text.length / 4)`.
 */
export function estimateTokens(text: string): number {
  const charCount = text.length;
  return charCount === 0 ? 0 : Math.ceil(charCount / 4);
}
|
|
||
/**
 * Builds size metrics for a piece of returned text.
 *
 * @param text - The text that is being returned to the caller.
 * @param opts - Optional overrides.
 *   - `mode`: label to attach to the metrics; omitted from the result when
 *     absent or empty.
 *   - `truncated`: explicit truncation flag. When not provided, truncation is
 *     inferred from the presence of the `...[truncated]` marker in `text`.
 * @returns Character count, estimated token count, truncation flag and,
 *   when supplied, the mode label.
 */
export function buildTextMetrics(text: string, opts?: { mode?: string; truncated?: boolean }): TextMetrics {
  const metrics: TextMetrics = {
    returned_chars: text.length,
    estimated_tokens: estimateTokens(text),
    // An explicit flag wins; otherwise fall back to marker detection.
    truncated: opts?.truncated ?? text.includes('...[truncated]'),
  };

  // Only emit `mode` when a non-empty label was given, keeping the key last.
  if (opts?.mode) {
    metrics.mode = opts.mode;
  }

  return metrics;
}
|
|
||
/**
 * Builds metrics comparing a raw text with the (possibly reduced) text that
 * is actually returned.
 *
 * @param rawText - The original text before any reduction.
 * @param returnedText - The text that is being returned to the caller.
 * @param opts - Optional overrides.
 *   - `mode`: label to attach to the metrics; omitted when absent or empty.
 *   - `truncated`: explicit truncation flag. When not provided, truncation is
 *     inferred from the `...[truncated]` marker in `returnedText`.
 * @returns Raw and returned sizes, their token estimates, the compression
 *   ratio, the truncation flag and, when supplied, the mode label.
 */
export function buildRawTextMetrics(
  rawText: string,
  returnedText: string,
  opts?: { mode?: string; truncated?: boolean },
): RawTextMetrics {
  const rawLength = rawText.length;
  const returnedLength = returnedText.length;

  // raw / returned, rounded to three decimals. With nothing returned the
  // division is undefined, so report 1 when both sides are empty and 0 when
  // only the returned text is empty.
  let compressionRatio: number;
  if (returnedLength > 0) {
    compressionRatio = Number((rawLength / returnedLength).toFixed(3));
  } else if (rawLength === 0) {
    compressionRatio = 1;
  } else {
    compressionRatio = 0;
  }

  const metrics: RawTextMetrics = {
    raw_chars: rawLength,
    raw_estimated_tokens: estimateTokens(rawText),
    returned_chars: returnedLength,
    estimated_tokens: estimateTokens(returnedText),
    compression_ratio: compressionRatio,
    // An explicit flag wins; otherwise fall back to marker detection.
    truncated: opts?.truncated ?? returnedText.includes('...[truncated]'),
  };

  // Only emit `mode` when a non-empty label was given, keeping the key last.
  if (opts?.mode) {
    metrics.mode = opts.mode;
  }

  return metrics;
}
|
|
||
/**
 * Appends a machine-readable metrics footer to a text response.
 *
 * The footer is separated from the text by a blank line and consists of the
 * `[openchrome_metrics]` tag followed by the JSON-serialized metrics.
 *
 * @param text - The response text to extend.
 * @param metrics - The metrics object to serialize with `JSON.stringify`.
 * @returns The text followed by the metrics footer.
 */
export function appendMetricsFooter(text: string, metrics: object): string {
  const footer = '[openchrome_metrics] ' + JSON.stringify(metrics);
  return [text, footer].join('\n\n');
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ import { | |
| StaticFetchError, | ||
| StaticReason, | ||
| } from '../utils/static-fetch'; | ||
| import { buildTextMetrics } from '../core/metrics/token-estimate'; | ||
| import { buildUrlScoreOptions, scoreUrl, UrlScoreOptions } from '../core/crawl/url-scorer'; | ||
| import { extractMainContent, toMarkdown } from '../core/extract/html-to-markdown'; | ||
| import { sanitizeContent } from '../security/content-sanitizer'; | ||
|
|
@@ -100,6 +101,10 @@ const definition: MCPToolDefinition = { | |
| description: | ||
| 'Fetch engine: "cdp" (default, opens a Chrome tab per page), "static" (Node fetch only, fails closed on insufficient pages), or "auto" (static first, fall back to CDP when static is insufficient).', | ||
| }, | ||
| include_metrics: { | ||
| type: 'boolean', | ||
| description: 'When true, include approximate output size/token metrics in the JSON result. Default: false.', | ||
| }, | ||
| strategy: { | ||
| type: 'string', | ||
| enum: ['bfs', 'best_first'], | ||
|
|
@@ -239,7 +244,6 @@ async function fetchRobotsTxt( | |
| // the caller (auto mode) can fall back to CDP. | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
|
|
||
| function cleanMarkdownFromHtml( | ||
| html: string, | ||
| cleanOpts: { onlyMainContent: boolean; includeLinks: boolean }, | ||
|
|
@@ -355,7 +359,9 @@ async function fetchPageStatic( | |
/**
 * Options accepted by `fetchOnePage`.
 *
 * Shared between the legacy crawl path and host-driven crawl jobs.
 */
export interface FetchOnePageOptions {
  /** Output format requested for the extracted page content. */
  outputFormat: string;

  /**
   * Strip navigation, footer and ad regions from the extracted content.
   * Treated as `true` when omitted.
   */
  onlyMainContent?: boolean;

  /** Include the page's outgoing links in the result so BFS can expand them. */
  includeLinks?: boolean;
}
|
Comment on lines
360
to
366
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||
|
|
||
|
|
@@ -614,6 +620,7 @@ const handler: ToolHandler = async ( | |
| const delayMs = args.delay_ms != null ? Number(args.delay_ms) : 1000; | ||
| const concurrency = args.concurrency != null ? Math.max(1, Math.min(10, Number(args.concurrency))) : 3; | ||
|
|
||
| const includeMetrics = args.include_metrics === true; | ||
| const engineArg = args.engine as string | undefined; | ||
| let engine: EngineMode = 'cdp'; | ||
| if (engineArg === 'static' || engineArg === 'auto' || engineArg === 'cdp') { | ||
|
|
@@ -961,10 +968,26 @@ const handler: ToolHandler = async ( | |
| ...(adaptiveDispatcher ? { dispatcher: adaptiveDispatcher.stats() } : {}), | ||
| }; | ||
|
|
||
| const output = { summary, pages }; | ||
// Assembles the response payload for a given list of pages.
// Without `include_metrics` the payload is just `{ summary, pages }`. With it,
// every page gains a `metrics` entry and the summary gains aggregate totals
// (character count, estimated tokens, number of truncated pages, and a
// `crawl:<format>` mode label).
const buildOutput = (outputPages: CrawledPage[]) => {
  if (!includeMetrics) {
    return { summary, pages: outputPages };
  }

  // Compute per-page metrics once; the aggregate token total reuses them.
  const pagesWithMetrics = outputPages.map((page) => ({
    ...page,
    metrics: buildTextMetrics(page.content, { mode: outputFormat }),
  }));

  let returnedChars = 0;
  let estimatedTokens = 0;
  let truncatedPages = 0;
  for (const page of pagesWithMetrics) {
    returnedChars += page.content.length;
    estimatedTokens += page.metrics.estimated_tokens;
    // A page counts as truncated when its content carries the truncation marker.
    if (page.content.includes('...[truncated]')) {
      truncatedPages += 1;
    }
  }

  return {
    summary: {
      ...summary,
      metrics: {
        returned_chars: returnedChars,
        estimated_tokens: estimatedTokens,
        truncated_pages: truncatedPages,
        mode: `crawl:${outputFormat}`,
      },
    },
    pages: pagesWithMetrics,
  };
};
|
|
||
| // Ensure output fits within limits | ||
| let outputJson = JSON.stringify(output, null, 2); | ||
| let outputJson = JSON.stringify(buildOutput(pages), null, 2); | ||
| if (outputJson.length > MAX_OUTPUT_CHARS) { | ||
| // Truncate page contents progressively to fit | ||
| const truncatedPages = pages.map((p) => ({ | ||
|
|
@@ -973,7 +996,7 @@ const handler: ToolHandler = async ( | |
| ? p.content.slice(0, 2000) + '...[truncated]' | ||
| : p.content, | ||
| })); | ||
| outputJson = JSON.stringify({ summary, pages: truncatedPages }, null, 2); | ||
| outputJson = JSON.stringify(buildOutput(truncatedPages), null, 2); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The new metrics path is only applied in the first two serialization attempts; if output is still too large and the minimal-page fallback is used, the response is built from plain objects that omit the metrics fields.
Useful? React with 👍 / 👎. |
||
|
|
||
| // If still too large, remove content entirely | ||
| if (outputJson.length > MAX_OUTPUT_CHARS) { | ||
|
|
@@ -985,7 +1008,37 @@ const handler: ToolHandler = async ( | |
| content_length: p.content.length, | ||
| error: p.error, | ||
| })); | ||
| outputJson = JSON.stringify({ summary, pages: minimalPages, note: 'Content omitted due to size constraints' }, null, 2); | ||
| const minimalOutput = includeMetrics | ||
| ? { | ||
| summary: { | ||
| ...summary, | ||
| metrics: { | ||
| returned_chars: 0, | ||
| estimated_tokens: 0, | ||
| truncated_pages: minimalPages.length, | ||
| mode: `crawl:${outputFormat}`, | ||
| }, | ||
| }, | ||
| pages: minimalPages.map((p) => ({ | ||
| ...p, | ||
| metrics: buildTextMetrics('', { mode: outputFormat, truncated: true }), | ||
| })), | ||
| note: 'Content omitted due to size constraints', | ||
| } | ||
| : { summary, pages: minimalPages, note: 'Content omitted due to size constraints' }; | ||
| outputJson = JSON.stringify(minimalOutput, null, 2); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When Useful? React with 👍 / 👎. |
||
| if (outputJson.length > MAX_OUTPUT_CHARS) { | ||
| outputJson = JSON.stringify({ | ||
| summary: includeMetrics | ||
| ? { | ||
| ...summary, | ||
| metrics: { returned_chars: 0, estimated_tokens: 0, truncated_pages: pages.length, mode: `crawl:${outputFormat}` }, | ||
| } | ||
| : summary, | ||
| pages: minimalPages.map(({ url, title, depth, links_found, content_length, error }) => ({ url, title, depth, links_found, content_length, error })), | ||
| note: 'Content omitted due to size constraints', | ||
| }, null, 2); | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When `include_metrics` is `true` and the sitemap response still exceeds `MAX_OUTPUT_CHARS` after truncation, the final `minimalPages` branch serializes `{ summary, pages, note }` directly and drops all metrics fields. This makes `include_metrics` unreliable specifically on high-volume crawls (the main case where users need these numbers), unlike the `crawl` tool, which preserves metrics in its equivalent fallback path.
Useful? React with 👍 / 👎.