diff --git a/src/core/crawl/url-scorer.ts b/src/core/crawl/url-scorer.ts new file mode 100644 index 000000000..4400c789f --- /dev/null +++ b/src/core/crawl/url-scorer.ts @@ -0,0 +1,171 @@ +export interface UrlScoreOptions { + query?: string; + keywords?: string[]; + preferPaths?: string[]; + excludePaths?: string[]; + sameDepthBias?: number; + startUrl?: string; +} + +export interface UrlScoreResult { + score: number; + reasons: string[]; +} + +const LOW_SIGNAL_SEGMENTS = new Set([ + 'tag', + 'tags', + 'category', + 'categories', + 'author', + 'authors', + 'feed', + 'rss', + 'login', + 'signin', + 'signup', + 'register', +]); + +function normalizeTerm(term: string): string { + return term.trim().toLowerCase().replace(/^\/+|\/+$/g, ''); +} + +function queryTerms(query?: string): string[] { + if (!query) return []; + const seen = new Set(); + for (const raw of query.split(/[^\p{L}\p{N}_-]+/u)) { + const term = normalizeTerm(raw); + if (term.length >= 2) seen.add(term); + } + return Array.from(seen); +} + +function safeDecodePathname(pathname: string): string { + try { + return decodeURIComponent(pathname); + } catch { + return pathname; + } +} + +function normalizePathPrefix(path: string): string { + const trimmed = path.trim(); + if (!trimmed) return ''; + return trimmed.startsWith('/') ? trimmed.toLowerCase() : `/${trimmed.toLowerCase()}`; +} + +function pathDistance(startPath: string, candidatePath: string): number { + const startSegments = startPath.split('/').filter(Boolean); + const candidateSegments = candidatePath.split('/').filter(Boolean); + let shared = 0; + while ( + shared < startSegments.length && + shared < candidateSegments.length && + startSegments[shared] === candidateSegments[shared] + ) { + shared++; + } + return Math.max(startSegments.length, candidateSegments.length) - shared; +} + +export function buildUrlScoreOptions(input: { + query?: unknown; + url_score?: unknown; + startUrl?: string; +}): UrlScoreOptions { + const raw = input.url_score && typeof input.url_score === 'object' + ? input.url_score as Record + : {}; + const toStringArray = (value: unknown): string[] | undefined => { + if (!Array.isArray(value)) return undefined; + return value.filter((v): v is string => typeof v === 'string' && v.trim().length > 0); + }; + return { + query: typeof input.query === 'string' ? input.query : undefined, + keywords: toStringArray(raw.keywords), + preferPaths: toStringArray(raw.prefer_paths), + excludePaths: toStringArray(raw.exclude_paths), + sameDepthBias: typeof raw.same_depth_bias === 'number' && Number.isFinite(raw.same_depth_bias) + ? raw.same_depth_bias + : undefined, + startUrl: input.startUrl, + }; +} + +export function scoreUrl(candidateUrl: string, depth: number, options: UrlScoreOptions = {}): UrlScoreResult { + const reasons: string[] = []; + let score = 0; + let parsed: URL; + try { + parsed = new URL(candidateUrl); + } catch { + return { score: -100, reasons: ['invalid-url'] }; + } + + const explicitKeywords = (options.keywords || []).map(normalizeTerm).filter(Boolean); + const terms = Array.from(new Set([...queryTerms(options.query), ...explicitKeywords])); + const decodedPathname = safeDecodePathname(parsed.pathname); + const haystack = `${decodedPathname} ${parsed.searchParams.toString()}`.toLowerCase(); + + for (const term of terms) { + if (!term) continue; + if (haystack.includes(term)) { + score += 1.0; + reasons.push(`keyword:${term}`); + } + } + + for (const prefix of options.preferPaths || []) { + const normalized = normalizePathPrefix(prefix); + if (normalized && parsed.pathname.toLowerCase().startsWith(normalized)) { + score += 1.5; + reasons.push(`path:${normalized}`); + } + } + + for (const prefix of options.excludePaths || []) { + const normalized = normalizePathPrefix(prefix); + if (normalized && parsed.pathname.toLowerCase().startsWith(normalized)) { + score -= 2.0; + reasons.push(`exclude:${normalized}`); + } + } + + if (options.startUrl) { + try { + const start = new URL(options.startUrl); + if (start.origin === parsed.origin) { + const distance = pathDistance(start.pathname.toLowerCase(), parsed.pathname.toLowerCase()); + const proximity = Math.max(0, 3 - distance) * 0.1; + if (proximity > 0) { + score += proximity; + reasons.push(`proximity:${proximity.toFixed(1)}`); + } + } + } catch { + // ignore malformed start URL + } + } + + if (options.sameDepthBias && Number.isFinite(options.sameDepthBias)) { + score += options.sameDepthBias; + reasons.push(`bias:${options.sameDepthBias}`); + } + + if (depth > 0) { + const penalty = 0.2 * depth; + score -= penalty; + reasons.push(`depth:-${penalty.toFixed(1)}`); + } + + const querySet = new Set(terms); + for (const segment of parsed.pathname.toLowerCase().split('/').filter(Boolean)) { + if (LOW_SIGNAL_SEGMENTS.has(segment) && !querySet.has(segment)) { + score -= 1.0; + reasons.push(`low-signal:${segment}`); + } + } + + return { score: Number(score.toFixed(3)), reasons }; +} diff --git a/src/tools/crawl.ts b/src/tools/crawl.ts index 5603dfad3..26fc0f3e0 100644 --- a/src/tools/crawl.ts +++ b/src/tools/crawl.ts @@ -28,6 +28,7 @@ import { StaticFetchError, StaticReason, } from '../utils/static-fetch'; +import { buildUrlScoreOptions, scoreUrl, UrlScoreOptions } from '../core/crawl/url-scorer'; import { extractMainContent, toMarkdown } from '../core/extract/html-to-markdown'; import { sanitizeContent } from '../security/content-sanitizer'; import { getGlobalConfig } from '../config/global'; @@ -98,6 +99,25 @@ const definition: MCPToolDefinition = { description: 'Fetch engine: "cdp" (default, opens a Chrome tab per page), "static" (Node fetch only, fails closed on insufficient pages), or "auto" (static first, fall back to CDP when static is insufficient).', }, + strategy: { + type: 'string', + enum: ['bfs', 'best_first'], + description: 'Crawl traversal strategy. Default: bfs. best_first scores discovered URLs by query/url_score and visits highest-scoring URLs first.', + }, + query: { + type: 'string', + description: 'Optional query terms used by strategy=best_first URL scoring.', + }, + url_score: { + type: 'object', + description: 'Optional strategy=best_first URL scoring hints: keywords, prefer_paths, exclude_paths, same_depth_bias.', + properties: { + keywords: { type: 'array', items: { type: 'string' } }, + prefer_paths: { type: 'array', items: { type: 'string' } }, + exclude_paths: { type: 'array', items: { type: 'string' } }, + same_depth_bias: { type: 'number' }, + }, + }, dispatcher: { type: 'string', enum: ['fixed', 'adaptive'], @@ -150,9 +170,20 @@ interface CrawledPage { error?: string; engine_used?: 'static' | 'cdp'; static_reason?: StaticReason; + score?: number; + score_reasons?: string[]; } type EngineMode = 'auto' | 'static' | 'cdp'; +type CrawlStrategy = 'bfs' | 'best_first'; + +interface CrawlQueueItem { + url: string; + depth: number; + order: number; + score?: number; + score_reasons?: string[]; +} interface CrawlSummary { total_pages: number; @@ -161,6 +192,9 @@ interface CrawlSummary { max_depth_reached: number; duration_ms: number; scope: string; + strategy?: CrawlStrategy; + scored_urls?: number; + skipped_below_threshold?: number; } // --------------------------------------------------------------------------- @@ -590,6 +624,78 @@ const handler: ToolHandler = async ( } const engineExplicit = engineArg !== undefined; + const strategyArg = args.strategy as string | undefined; + let strategy: CrawlStrategy = 'bfs'; + if (strategyArg === 'bfs' || strategyArg === 'best_first') { + strategy = strategyArg; + } else if (strategyArg !== undefined) { + return { + content: [{ type: 'text', text: 'Error: strategy must be one of "bfs", "best_first"' }], + isError: true, + }; + } + const scoringOptions: UrlScoreOptions = buildUrlScoreOptions({ + query: args.query, + url_score: args.url_score, + startUrl: normalizeUrl(url), + }); + let scoredUrls = 0; + const skippedBelowThreshold = 0; + let discoveryOrder = 0; + const bestFirstQueue: CrawlQueueItem[] = []; + const bestFirstQueued = new Map(); + + function makeQueueItem(entry: { url: string; depth: number }): CrawlQueueItem { + const normalized = normalizeUrl(entry.url); + const item: CrawlQueueItem = { url: normalized, depth: entry.depth, order: discoveryOrder++ }; + if (strategy === 'best_first') { + const scored = scoreUrl(normalized, entry.depth, scoringOptions); + item.score = scored.score; + item.score_reasons = scored.reasons; + scoredUrls++; + } + return item; + } + + function enqueueItems(entries: Array<{ url: string; depth: number }>): void { + if (strategy !== 'best_first') { + tracker.enqueue(entries); + return; + } + for (const entry of entries) { + const item = makeQueueItem(entry); + if (tracker.hasVisited(item.url)) continue; + const queued = bestFirstQueued.get(item.url); + if (queued) { + if (queued.depth <= item.depth) continue; + const queuedIndex = bestFirstQueue.indexOf(queued); + if (queuedIndex !== -1) bestFirstQueue.splice(queuedIndex, 1); + } + bestFirstQueued.set(item.url, item); + bestFirstQueue.push(item); + } + bestFirstQueue.sort((a, b) => { + const scoreDiff = (b.score ?? 0) - (a.score ?? 0); + if (scoreDiff !== 0) return scoreDiff; + if (a.depth !== b.depth) return a.depth - b.depth; + if (a.order !== b.order) return a.order - b.order; + return a.url.localeCompare(b.url); + }); + } + + function dequeueItem(): CrawlQueueItem | undefined { + if (strategy !== 'best_first') { + const next = tracker.dequeue(); + return next ? { ...next, order: discoveryOrder++ } : undefined; + } + while (bestFirstQueue.length > 0) { + const next = bestFirstQueue.shift()!; + bestFirstQueued.delete(next.url); + if (!tracker.hasVisited(next.url)) return next; + } + return undefined; + } + const dispatcherArg = args.dispatcher as string | undefined; let dispatcherMode: DispatcherMode = 'fixed'; if (dispatcherArg === 'fixed' || dispatcherArg === 'adaptive') { @@ -653,13 +759,13 @@ const handler: ToolHandler = async ( } try { - // Seed the BFS queue with the start URL + // Seed the crawl queue with the start URL const normalizedStart = normalizeUrl(url); - tracker.enqueue([{ url: normalizedStart, depth: 0 }]); + enqueueItems([{ url: normalizedStart, depth: 0 }]); const limiter = createLimiter(concurrency); - // BFS loop + // Crawl loop while (pages.length < maxPages) { // Check budget if (context && !hasBudget(context, 15_000)) { @@ -668,11 +774,11 @@ const handler: ToolHandler = async ( } // Collect a batch of URLs to fetch in parallel - const batch: Array<{ url: string; depth: number }> = []; + const batch: CrawlQueueItem[] = []; const batchSize = Math.min(concurrency, maxPages - pages.length); for (let i = 0; i < batchSize; i++) { - const next = tracker.dequeue(); + const next = dequeueItem(); if (!next) break; // Skip if exceeds max depth @@ -683,13 +789,15 @@ const handler: ToolHandler = async ( if (batch.length === 0) { // Check if there are still items in the queue beyond max_depth - const probe = tracker.dequeue(); + const probe = dequeueItem(); if (!probe) break; // Queue is truly empty // If it's beyond depth, we're done if (probe.depth > maxDepth) break; - // Otherwise put it back and try again — shouldn't happen but be safe - tracker.enqueue([probe]); - break; + // Otherwise put it back and retry. In best_first mode an over-depth + // item can sort ahead of an in-depth item; breaking here would stop + // the crawl even though valid work remains behind the probe. + enqueueItems([probe]); + continue; } // Fetch batch in parallel with concurrency limiter @@ -709,6 +817,7 @@ const handler: ToolHandler = async ( depth: item.depth, links_found: 0, error: 'Blocked by robots.txt', + ...(strategy === 'best_first' ? { score: item.score ?? 0, score_reasons: item.score_reasons ?? [] } : {}), } as CrawledPage, links: [] as string[], depth: item.depth, @@ -779,6 +888,10 @@ const handler: ToolHandler = async ( if (staticReason) { result.static_reason = staticReason; } + if (strategy === 'best_first') { + result.score = item.score ?? 0; + result.score_reasons = item.score_reasons ?? []; + } // Apply delay between fetches if (delayMs > 0) { @@ -825,7 +938,7 @@ const handler: ToolHandler = async ( } if (newUrls.length > 0) { - tracker.enqueue(newUrls); + enqueueItems(newUrls); } } } @@ -842,6 +955,7 @@ const handler: ToolHandler = async ( max_depth_reached: maxDepthReached, duration_ms: durationMs, scope, + ...(strategy === 'best_first' ? { strategy, scored_urls: scoredUrls, skipped_below_threshold: skippedBelowThreshold } : {}), ...(adaptiveDispatcher ? { dispatcher: adaptiveDispatcher.stats() } : {}), }; diff --git a/tests/core/crawl/url-scorer.test.ts b/tests/core/crawl/url-scorer.test.ts new file mode 100644 index 000000000..87797538d --- /dev/null +++ b/tests/core/crawl/url-scorer.test.ts @@ -0,0 +1,67 @@ +import { buildUrlScoreOptions, scoreUrl } from '../../../src/core/crawl/url-scorer'; + +describe('url scorer', () => { + test('scores query keywords found in the URL', () => { + const scored = scoreUrl('https://docs.example.com/pricing/enterprise-limits', 1, { + query: 'enterprise pricing limits', + }); + expect(scored.score).toBeGreaterThan(2); + expect(scored.reasons).toEqual(expect.arrayContaining([ + 'keyword:enterprise', + 'keyword:pricing', + 'keyword:limits', + ])); + }); + + test('applies prefer and exclude path weights', () => { + const preferred = scoreUrl('https://docs.example.com/docs/api/auth', 0, { + preferPaths: ['/docs'], + excludePaths: ['/blog'], + }); + const excluded = scoreUrl('https://docs.example.com/blog/api/auth', 0, { + preferPaths: ['/docs'], + excludePaths: ['/blog'], + }); + expect(preferred.score).toBeGreaterThan(excluded.score); + expect(preferred.reasons).toContain('path:/docs'); + expect(excluded.reasons).toContain('exclude:/blog'); + }); + + test('penalizes deeper and low-signal URLs', () => { + const high = scoreUrl('https://example.com/docs/actions', 1, { query: 'actions' }); + const low = scoreUrl('https://example.com/tag/actions', 3, { query: 'actions' }); + expect(high.score).toBeGreaterThan(low.score); + expect(low.reasons).toContain('low-signal:tag'); + }); + + + test('does not throw on malformed percent-encoded pathnames', () => { + const scored = scoreUrl('https://example.com/docs/%zz-enterprise', 1, { + query: 'enterprise', + }); + + expect(Number.isFinite(scored.score)).toBe(true); + expect(scored.reasons).toEqual(expect.arrayContaining(['keyword:enterprise'])); + }); + + test('normalizes issue url_score options', () => { + const opts = buildUrlScoreOptions({ + query: 'workflow secrets', + startUrl: 'https://docs.example.com/en', + url_score: { + keywords: ['actions'], + prefer_paths: ['/en/actions'], + exclude_paths: ['/en/billing'], + same_depth_bias: 0.1, + }, + }); + expect(opts).toMatchObject({ + query: 'workflow secrets', + keywords: ['actions'], + preferPaths: ['/en/actions'], + excludePaths: ['/en/billing'], + sameDepthBias: 0.1, + startUrl: 'https://docs.example.com/en', + }); + }); +}); diff --git a/tests/core/tools/crawl.engine.test.ts b/tests/core/tools/crawl.engine.test.ts index 853e06e89..dfa54de31 100644 --- a/tests/core/tools/crawl.engine.test.ts +++ b/tests/core/tools/crawl.engine.test.ts @@ -102,6 +102,26 @@ beforeAll(async () => { contentType: 'text/html; charset=utf-8', body: RICH_HTML('Page B', `

Page B

${PARA}

`), }, + '/best-start.html': { + status: 200, + contentType: 'text/html; charset=utf-8', + body: RICH_HTML( + 'Best Start', + `

Best Start

${PARA}

` + + `Blog first` + + `Pricing second`, + ), + }, + '/blog/company-update.html': { + status: 200, + contentType: 'text/html; charset=utf-8', + body: RICH_HTML('Company Update', `

Company Update

${PARA}

`), + }, + '/pricing/enterprise-limits.html': { + status: 200, + contentType: 'text/html; charset=utf-8', + body: RICH_HTML('Enterprise Limits', `

Enterprise Limits

${PARA}

`), + }, '/spa.html': { status: 200, contentType: 'text/html', @@ -323,6 +343,64 @@ describe('crawl default behavior (no engine arg)', () => { }); }); + +// --------------------------------------------------------------------------- +// crawl({ strategy: 'best_first' }) — URL scoring orders discovered links. +// --------------------------------------------------------------------------- + +describe('crawl strategy=best_first', () => { + test('visits higher-scoring discovered URLs before lower-scoring URLs', async () => { + const handler = await loadHandler('crawl'); + const result = await handler('s-best', { + url: `${server.origin}/best-start.html`, + max_pages: 3, + max_depth: 1, + delay_ms: 0, + concurrency: 1, + engine: 'static', + respect_robots: false, + strategy: 'best_first', + query: 'enterprise pricing limits', + url_score: { + keywords: ['pricing', 'enterprise', 'limits'], + prefer_paths: ['/pricing'], + exclude_paths: ['/blog'], + }, + }); + const parsed = parseResult(result); + expect(parsed.summary.strategy).toBe('best_first'); + expect(parsed.summary.scored_urls).toBeGreaterThanOrEqual(3); + expect(parsed.pages.map((p) => p.url)).toEqual([ + `${server.origin}/best-start.html`, + `${server.origin}/pricing/enterprise-limits.html`, + `${server.origin}/blog/company-update.html`, + ]); + expect(parsed.pages[1].score).toBeGreaterThan(parsed.pages[2].score as number); + expect(parsed.pages[1].score_reasons).toEqual(expect.arrayContaining([ + 'keyword:pricing', + 'keyword:enterprise', + 'keyword:limits', + 'path:/pricing', + ])); + }); + + test('keeps default crawl output free of strategy metadata', async () => { + const handler = await loadHandler('crawl'); + const result = await handler('s-best-default', { + url: `${server.origin}/best-start.html`, + max_pages: 2, + max_depth: 1, + delay_ms: 0, + concurrency: 1, + engine: 'static', + respect_robots: false, + }); + const parsed = parseResult(result); + expect(parsed.summary.strategy).toBeUndefined(); + expect(parsed.pages[0].score).toBeUndefined(); + }); +}); + // --------------------------------------------------------------------------- // crawl_sitemap engine plumbing // --------------------------------------------------------------------------- diff --git a/tests/e2e/scenarios/domain-memory-persistence.test.ts b/tests/e2e/scenarios/domain-memory-persistence.test.ts index 92d848f9b..f14736fab 100644 --- a/tests/e2e/scenarios/domain-memory-persistence.test.ts +++ b/tests/e2e/scenarios/domain-memory-persistence.test.ts @@ -48,6 +48,29 @@ async function readStoreFile(): Promise<{ version: number; entries: unknown[]; u return JSON.parse(data); } +type PersistedStore = { entries?: unknown[]; updatedAt?: number }; + +async function waitForPersistedStore( + filePath: string, + predicate: (store: PersistedStore, byteLength: number) => boolean +): Promise<{ store: PersistedStore; byteLength: number }> { + const deadline = Date.now() + 5_000; + let lastState = 'unread'; + while (Date.now() < deadline) { + try { + const raw = await fsPromises.readFile(filePath, 'utf-8'); + const store = JSON.parse(raw) as PersistedStore; + const byteLength = Buffer.byteLength(raw); + lastState = `count=${store.entries?.length ?? 0}, updatedAt=${store.updatedAt ?? 'missing'}, bytes=${byteLength}`; + if (byteLength > 0 && predicate(store, byteLength)) return { store, byteLength }; + } catch { + // File may not exist or may be mid-write; retry until deadline. + } + await waitForSave(50); + } + throw new Error(`Timed out waiting for persisted store; lastState=${lastState}`); +} + describe('Issue #493: Domain Memory Persistence E2E', () => { afterAll(async () => { // Cleanup temp dir @@ -382,19 +405,24 @@ describe('Issue #493: Domain Memory Persistence E2E', () => { for (let i = 0; i < 200; i++) { dm.record(`size-${i}.com`, `key-${i}`, `value-with-some-content-${i}`); } - await waitForSave(200); - const sizeFile = path.join(sizeDir, 'domain-knowledge.json'); - const size200 = await waitForNonEmptyFile(sizeFile); + const baseline = await waitForPersistedStore( + sizeFile, + (store) => (store.entries?.length ?? 0) >= 200 + ); + const size200 = baseline.byteLength; // Add 100 more (total 300) and compress for (let i = 200; i < 300; i++) { dm.record(`size-${i}.com`, `key-${i}`, `value-with-some-content-${i}`); } + const compressStartedAt = Date.now(); dm.compress(); - await waitForSave(200); - - const sizeAfterCompress = await waitForNonEmptyFile(sizeFile); + const compressed = await waitForPersistedStore( + sizeFile, + (store) => (store.updatedAt ?? 0) >= compressStartedAt && (store.entries?.length ?? 0) <= 200 + ); + const sizeAfterCompress = compressed.byteLength; // After compress, file should not be significantly larger than 200 entries // Allow 10% tolerance for JSON formatting differences diff --git a/tests/integration/health-endpoint-gating.test.ts b/tests/integration/health-endpoint-gating.test.ts index db8a7731a..8c3cc8d42 100644 --- a/tests/integration/health-endpoint-gating.test.ts +++ b/tests/integration/health-endpoint-gating.test.ts @@ -269,12 +269,9 @@ describeFn('health endpoint gating (issue #648)', () => { const shutdownTimeoutMs = 30_000; const exit = await waitForExit(child, shutdownTimeoutMs); expect(exit.timedOut).toBe(false); - if (process.platform === 'win32') { - // Windows may report a clean SIGTERM as code=null/signal=SIGTERM. - expect(exit.code === 0 || exit.signal === 'SIGTERM').toBe(true); - } else { - expect(exit.code).toBe(0); - } + // Node may report a clean SIGTERM shutdown as either code=0 or + // code=null/signal=SIGTERM depending on platform and timing. + expect(exit.code === 0 || exit.signal === 'SIGTERM').toBe(true); expect(stderr).not.toMatch(/TypeError/); expect(stderr).not.toMatch(/Cannot read properties of null/); expect(stderr).not.toMatch(/UnhandledPromiseRejection/); diff --git a/tests/tools/console-capture-regression.test.ts b/tests/tools/console-capture-regression.test.ts index 465a74160..894c8c591 100644 --- a/tests/tools/console-capture-regression.test.ts +++ b/tests/tools/console-capture-regression.test.ts @@ -151,7 +151,8 @@ describe('console_capture get response — v1.11.0 baseline regression', () => { // while JSON.stringify always emits LF. Normalize only line endings so // this shape guard remains byte-stable across POSIX checkouts and does // not fail the Windows matrix for checkout policy alone. - const baseline = fs.readFileSync(FIXTURE_PATH, 'utf8').replace(/\r\n/g, '\n'); + const normalizeLineEndings = (value: string) => value.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); + const baseline = normalizeLineEndings(fs.readFileSync(FIXTURE_PATH, 'utf8')); expect(responseJson).toBe(baseline); });