Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions src/tools/search/crawl4ai-scraper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
import axios from 'axios';
import type * as t from './types';
import { createDefaultLogger } from './utils';

/**
* Crawl4AI scraper implementation
* Uses the Crawl4AI API to scrape web pages with advanced extraction capabilities
*
* Features:
* - Purpose-built for content extraction
* - Multiple extraction strategies (cosine, LLM, etc.)
* - Chunking strategies for large content
* - Returns markdown and text content
* - Includes metadata from scraped pages
*
* @example
* ```typescript
* const scraper = createCrawl4AIScraper({
* apiKey: 'your-crawl4ai-api-key',
* extractionStrategy: 'cosine',
* chunkingStrategy: 'sliding_window',
* timeout: 10000
* });
*
* const [url, response] = await scraper.scrapeUrl('https://example.com');
* if (response.success) {
* const [content] = scraper.extractContent(response);
* console.log(content);
* }
* ```
*/
export class Crawl4AIScraper implements t.BaseScraper {
  private apiKey: string;
  private apiUrl: string;
  private timeout: number;
  private logger: t.Logger;
  // NOTE(review): extractionStrategy/chunkingStrategy are accepted and stored
  // but are never included in any request below — presumably reserved for a
  // future /crawl integration; confirm intent before removing.
  private extractionStrategy?: string;
  private chunkingStrategy?: string;
  private fitStrategy?: string;

  constructor(config: t.Crawl4AIScraperConfig = {}) {
    // Resolve credentials and endpoint: explicit config first, then env vars.
    this.apiKey = config.apiKey ?? process.env.CRAWL4AI_API_KEY ?? '';

    this.apiUrl =
      config.apiUrl ??
      process.env.CRAWL4AI_API_URL ??
      'https://api.crawl4ai.com';

    this.timeout = config.timeout ?? 10000;
    this.extractionStrategy = config.extractionStrategy;
    this.chunkingStrategy = config.chunkingStrategy;

    // Crawl4AI can filter raw markdown server-side. Default to the 'fit'
    // (pruning) strategy unless the caller explicitly asks for 'raw'.
    // (Fixed: the original assigned this.fitStrategy twice in a row; the
    // first assignment from config.fitStrategy was dead code.)
    this.fitStrategy = config.fitStrategy === 'raw' ? 'raw' : 'fit';

    // '??' rather than '||' so only a missing logger triggers the default.
    this.logger = config.logger ?? createDefaultLogger();

    if (!this.apiKey) {
      this.logger.info(
        'CRAWL4AI_API_KEY is not set. Using public/unauthenticated mode.'
      );
    }

    this.logger.debug(
      `Crawl4AI scraper initialized with API URL: ${this.apiUrl}`
    );
  }

  /**
   * Scrape a single URL via the Crawl4AI /md (markdown) endpoint.
   *
   * Never throws: HTTP/network failures are reported in-band via the
   * response object so callers can handle per-URL failures uniformly.
   *
   * @param url URL to scrape
   * @param options Per-call options; options.timeout overrides the instance timeout
   * @returns Tuple of the input URL and a success/error response
   */
  async scrapeUrl(
    url: string,
    options: t.Crawl4AIScrapeOptions = {}
  ): Promise<[string, t.Crawl4AIScrapeResponse]> {
    try {
      // Crawl4AI /md endpoint for simple markdown extraction
      const payload: Record<string, unknown> = {
        url,
        cache: '0', // Bypass cache by default
        f: this.fitStrategy, // markdown filter: 'fit' (pruned) or 'raw'
      };

      // Build headers - only include Authorization if an API key is provided
      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
      };

      if (this.apiKey) {
        headers['Authorization'] = `Bearer ${this.apiKey}`;
      }

      const response = await axios.post(`${this.apiUrl}/md`, payload, {
        headers,
        timeout: options.timeout ?? this.timeout,
      });

      return [url, { success: true, data: response.data }];
    } catch (error) {
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      this.logger.error(`Crawl4AI scrape failed for ${url}:`, errorMessage);
      return [
        url,
        {
          success: false,
          error: `Crawl4AI API request failed: ${errorMessage}`,
        },
      ];
    }
  }

  /**
   * Extract textual content from a scrape response.
   *
   * Lookup order: root-level markdown (/md endpoint), then the first /crawl
   * result's fit_markdown, raw_markdown, markdown_with_citations, or html,
   * then a root-level text field. Returns '' when nothing is available.
   *
   * @param response Scrape response
   * @returns [content, references] — references are never produced by this
   *          scraper and are always undefined
   */
  extractContent(
    response: t.Crawl4AIScrapeResponse
  ): [string, undefined | t.References] {
    if (!response.success || !response.data) {
      return ['', undefined];
    }

    // Crawl4AI /md endpoint returns markdown directly at root level
    if (response.data.markdown != null) {
      return [response.data.markdown, undefined];
    }

    // Fallback for /crawl endpoint which returns a results array
    if (
      response.data.results &&
      Array.isArray(response.data.results) &&
      response.data.results.length > 0
    ) {
      const result = response.data.results[0];

      // Prefer fit (pruned) markdown from /crawl when present
      if (result.markdown?.fit_markdown != null) {
        return [result.markdown.fit_markdown, undefined];
      }

      // Extract from markdown object (Crawl4AI /crawl structure)
      if (result.markdown?.raw_markdown != null) {
        return [result.markdown.raw_markdown, undefined];
      }

      // Fallback to markdown_with_citations if raw_markdown not available
      if (result.markdown?.markdown_with_citations != null) {
        return [result.markdown.markdown_with_citations, undefined];
      }

      // Fallback to HTML if no markdown at all
      if (result.html != null) {
        return [result.html, undefined];
      }
    }

    // Final fallback: root-level text field
    if (response.data.text != null) {
      return [response.data.text, undefined];
    }

    return ['', undefined];
  }

  /**
   * Extract metadata from a scrape response.
   *
   * Prefers metadata on the first /crawl result; falls back to root-level
   * metadata. Returns an empty object when none is present or the scrape
   * failed.
   *
   * @param response Scrape response
   * @returns Metadata object (possibly empty)
   */
  extractMetadata(
    response: t.Crawl4AIScrapeResponse
  ): Record<string, string | number | boolean | null | undefined> {
    if (!response.success || !response.data) {
      return {};
    }

    // Crawl4AI /crawl returns a results array; use the first result's metadata
    if (
      response.data.results &&
      Array.isArray(response.data.results) &&
      response.data.results.length > 0
    ) {
      const result = response.data.results[0];
      if (result.metadata) {
        return result.metadata;
      }
    }

    // Legacy format support (if data has metadata directly at the root)
    if (response.data.metadata) {
      return response.data.metadata;
    }

    return {};
  }
}

/**
 * Factory for {@link Crawl4AIScraper}.
 * @param config Optional scraper configuration (defaults to `{}`)
 * @returns A newly constructed Crawl4AI scraper
 */
export const createCrawl4AIScraper = (
  config: t.Crawl4AIScraperConfig = {}
): Crawl4AIScraper => new Crawl4AIScraper(config);
12 changes: 12 additions & 0 deletions src/tools/search/tool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
import { createSearchAPI, createSourceProcessor } from './search';
import { createSerperScraper } from './serper-scraper';
import { createFirecrawlScraper } from './firecrawl';
import { createCrawl4AIScraper } from './crawl4ai-scraper';
import { expandHighlights } from './highlights';
import { formatResultsForLLM } from './format';
import { createDefaultLogger } from './utils';
Expand Down Expand Up @@ -372,6 +373,9 @@ export const createSearchTool = (
firecrawlVersion,
firecrawlOptions,
serperScraperOptions,
crawl4aiApiKey,
crawl4aiApiUrl,
crawl4aiOptions,
scraperTimeout,
jinaApiKey,
jinaApiUrl,
Expand Down Expand Up @@ -420,6 +424,14 @@ export const createSearchTool = (
timeout: scraperTimeout ?? serperScraperOptions?.timeout,
logger,
});
} else if (scraperProvider === 'crawl4ai') {
scraperInstance = createCrawl4AIScraper({
...crawl4aiOptions,
apiKey: crawl4aiApiKey,
apiUrl: crawl4aiApiUrl,
timeout: scraperTimeout ?? crawl4aiOptions?.timeout,
logger,
});
} else {
scraperInstance = createFirecrawlScraper({
...firecrawlOptions,
Expand Down
60 changes: 56 additions & 4 deletions src/tools/search/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import type { BaseReranker } from './rerankers';
import { DATE_RANGE } from './schema';

export type SearchProvider = 'serper' | 'searxng';
export type ScraperProvider = 'firecrawl' | 'serper';
export type ScraperProvider = 'firecrawl' | 'serper' | 'crawl4ai';
export type RerankerType = 'infinity' | 'jina' | 'cohere' | 'none';

export interface Highlight {
Expand Down Expand Up @@ -107,6 +107,16 @@ export interface SerperScraperConfig {
includeMarkdown?: boolean;
}

/** Constructor options for the Crawl4AI scraper. */
export interface Crawl4AIScraperConfig {
  /** API key; falls back to the CRAWL4AI_API_KEY env var, else unauthenticated mode. */
  apiKey?: string;
  /** Base API URL; falls back to CRAWL4AI_API_URL, else https://api.crawl4ai.com. */
  apiUrl?: string;
  /** Request timeout in milliseconds (default 10000). */
  timeout?: number;
  /** Logger; a default logger is created when omitted. */
  logger?: Logger;
  /** NOTE(review): accepted and stored by the scraper but not yet sent in requests — confirm intent. */
  extractionStrategy?: string;
  /** NOTE(review): accepted and stored by the scraper but not yet sent in requests — confirm intent. */
  chunkingStrategy?: string;
  /** Markdown filter: 'raw' is passed through; any other value is normalized to 'fit' (pruning). */
  fitStrategy?: string;
}

export interface ScraperContentResult {
content: string;
}
Expand Down Expand Up @@ -164,6 +174,9 @@ export interface SearchToolConfig
scraperProvider?: ScraperProvider;
scraperTimeout?: number;
serperScraperOptions?: SerperScraperConfig;
crawl4aiApiKey?: string;
crawl4aiApiUrl?: string;
crawl4aiOptions?: Crawl4AIScraperConfig;
onSearchResults?: (
results: SearchResult,
runnableConfig?: RunnableConfig
Expand All @@ -187,12 +200,23 @@ export interface BaseScraper {
scrapeUrl(
url: string,
options?: unknown
): Promise<[string, FirecrawlScrapeResponse | SerperScrapeResponse]>;
): Promise<
[
string,
FirecrawlScrapeResponse | SerperScrapeResponse | Crawl4AIScrapeResponse,
]
>;
extractContent(
response: FirecrawlScrapeResponse | SerperScrapeResponse
response:
| FirecrawlScrapeResponse
| SerperScrapeResponse
| Crawl4AIScrapeResponse
): [string, undefined | References];
extractMetadata(
response: FirecrawlScrapeResponse | SerperScrapeResponse
response:
| FirecrawlScrapeResponse
| SerperScrapeResponse
| Crawl4AIScrapeResponse
):
| ScrapeMetadata
| Record<string, string | number | boolean | null | undefined>;
Expand All @@ -209,6 +233,11 @@ export type SerperScrapeOptions = Omit<
'apiKey' | 'apiUrl' | 'logger'
>;

/** Per-call scrape options: the scraper config minus instance-level fields. */
export type Crawl4AIScrapeOptions = Omit<
  Crawl4AIScraperConfig,
  'apiKey' | 'apiUrl' | 'logger'
>;

export interface ScrapeMetadata {
// Core source information
sourceURL?: string;
Expand Down Expand Up @@ -295,6 +324,29 @@ export interface SerperScrapeResponse {
error?: string;
}

/**
 * Response envelope produced by Crawl4AIScraper.scrapeUrl.
 * On success, `data` holds the raw Crawl4AI API response body; on failure,
 * `error` holds a human-readable message and `data` is absent.
 */
export interface Crawl4AIScrapeResponse {
  success: boolean;
  data?: {
    /** Markdown returned at the root by the /md endpoint. */
    markdown?: string;
    /** Plain-text fallback field. */
    text?: string;
    html?: string;
    metadata?: Record<string, string | number | boolean | null | undefined>;
    // /crawl endpoint returns results array
    results?: Array<{
      url?: string;
      /** Markdown variants produced by the /crawl endpoint. */
      markdown?: {
        raw_markdown?: string;
        /** Pruned ("fit") markdown — preferred by extractContent. */
        fit_markdown?: string;
        markdown_with_citations?: string;
        references_markdown?: string;
      };
      html?: string;
      metadata?: Record<string, string | number | boolean | null | undefined>;
    }>;
  };
  /** Present only when success is false. */
  error?: string;
}

export interface FirecrawlScraperConfig {
apiKey?: string;
apiUrl?: string;
Expand Down