2 changes: 1 addition & 1 deletion apps/api/package.json
@@ -113,7 +113,7 @@
"joplin-turndown-plugin-gfm": "^1.0.12",
"jsdom": "^26.0.0",
"koffi": "^2.9.0",
"lodash": "^4.17.21",
"lodash": "^4.17.23",
"marked": "^14.1.2",
"ollama-ai-provider": "^1.2.0",
"openai": "^5.20.2",
105 changes: 34 additions & 71 deletions apps/api/pnpm-lock.yaml

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions apps/api/src/__tests__/snips/v2/crawl.test.ts
@@ -13,6 +13,7 @@ import {
crawl,
crawlOngoing,
crawlStart,
map,
Identity,
idmux,
scrapeTimeout,
@@ -23,6 +24,13 @@ import { describe, it, expect } from "@jest/globals";

let identity: Identity;

const normalizeUrlForCompare = (value: string) => {
const url = new URL(value);
url.hash = "";
const href = url.href;
return href.endsWith("/") ? href.slice(0, -1) : href;
};

beforeAll(async () => {
identity = await idmux({
name: "crawl",
@@ -69,6 +77,97 @@ describe("Crawl tests", () => {
10 * scrapeTimeout,
);

concurrentIf(ALLOW_TEST_SUITE_WEBSITE)(
"works with sitemap: only",
async () => {
const results = await crawl(
{
url: base,
limit: 10,
sitemap: "only",
},
identity,
);

expect(results.completed).toBeGreaterThan(0);
},
10 * scrapeTimeout,
);

concurrentIf(ALLOW_TEST_SUITE_WEBSITE)(
"sitemap-only results are subset of map-only + start URL",
async () => {
const mapResponse = await map(
{
url: base,
sitemap: "only",
includeSubdomains: false,
ignoreQueryParameters: false,
limit: 500,
},
identity,
);

expect(mapResponse.statusCode).toBe(200);
expect(mapResponse.body.success).toBe(true);

const sitemapUrls = new Set(
mapResponse.body.links.map(link => normalizeUrlForCompare(link.url)),
);
const baseNormalized = normalizeUrlForCompare(base);

const results = await crawl(
{
url: base,
limit: 50,
sitemap: "only",
},
identity,
);

expect(results.success).toBe(true);
if (results.success) {
for (const page of results.data) {
const pageUrl =
page.metadata.url ?? page.metadata.sourceURL ?? base;
const normalized = normalizeUrlForCompare(pageUrl);
expect(
normalized === baseNormalized || sitemapUrls.has(normalized),
).toBe(true);
}
}
},
10 * scrapeTimeout,
);

concurrentIf(TEST_PRODUCTION)(
"no sitemap found -> start URL only",
async () => {
const noSitemapUrl = "https://example.com";
const results = await crawl(
{
url: noSitemapUrl,
limit: 10,
sitemap: "only",
},
identity,
);

expect(results.success).toBe(true);
if (results.success) {
expect(results.data.length).toBe(1);
const pageUrl =
results.data[0].metadata.url ??
results.data[0].metadata.sourceURL ??
noSitemapUrl;
expect(normalizeUrlForCompare(pageUrl)).toBe(
normalizeUrlForCompare(noSitemapUrl),
);
}
},
10 * scrapeTimeout,
);

concurrentIf(ALLOW_TEST_SUITE_WEBSITE)(
"filters URLs properly",
async () => {
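The new tests above exercise the sitemap: "only" crawl mode end to end: seeding from the sitemap alone, checking that results are a subset of the mapped sitemap URLs plus the start URL, and falling back to the start URL when no sitemap exists. For orientation, a minimal caller-side sketch of requesting this mode over the v2 HTTP API is below; the base URL, endpoint path, and auth header are assumptions about the API shape and are not part of this diff.

// Hypothetical usage sketch (not part of this PR): start a crawl seeded only
// from the sitemap. Endpoint path and auth header are assumed.
async function startSitemapOnlyCrawl(apiKey: string, url: string) {
  const res = await fetch("https://api.firecrawl.dev/v2/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      url,
      limit: 10,
      sitemap: "only", // new enum value: "skip" | "include" | "only"
    }),
  });
  return res.json(); // expected to contain a crawl id to poll for results
}
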
14 changes: 12 additions & 2 deletions apps/api/src/__tests__/snips/v2/types-validation.test.ts
@@ -19,6 +19,7 @@ import {
BatchScrapeRequestInput,
SearchRequest,
SearchRequestInput,
toV2CrawlerOptions,
} from "../../../controllers/v2/types";

describe("V2 Types Validation", () => {
@@ -606,11 +607,11 @@ describe("V2 Types Validation", () => {
it("should handle sitemap enum values", () => {
const input: CrawlRequestInput = {
url: "https://example.com",
sitemap: "include",
sitemap: "only",
};

const result = crawlRequestSchema.parse(input);
expect(result.sitemap).toBe("include");
expect(result.sitemap).toBe("only");
});

it("should reject invalid sitemap value", () => {
@@ -621,6 +622,15 @@

expect(() => crawlRequestSchema.parse(input)).toThrow();
});

it("should map sitemapOnly to sitemap=only", () => {
const result = toV2CrawlerOptions({
sitemapOnly: true,
ignoreSitemap: false,
});

expect(result.sitemap).toBe("only");
});
});

describe("mapRequestSchema", () => {
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v2/crawl-params-preview.ts
@@ -29,7 +29,7 @@ type CrawlParamsPreviewResponse =
crawlEntireDomain?: boolean;
allowExternalLinks?: boolean;
allowSubdomains?: boolean;
sitemap?: "skip" | "include";
sitemap?: "skip" | "include" | "only";
ignoreQueryParameters?: boolean;
deduplicateSimilarURLs?: boolean;
delay?: number;
7 changes: 4 additions & 3 deletions apps/api/src/controllers/v2/types.ts
@@ -868,7 +868,7 @@ export const crawlerOptions = z.strictObject({
allowExternalLinks: z.boolean().prefault(false),
allowSubdomains: z.boolean().prefault(false),
ignoreRobotsTxt: z.boolean().prefault(false),
sitemap: z.enum(["skip", "include"]).prefault("include"),
sitemap: z.enum(["skip", "include", "only"]).prefault("include"),
deduplicateSimilarURLs: z.boolean().prefault(true),
ignoreQueryParameters: z.boolean().prefault(false),
regexOnFullURL: z.boolean().prefault(false),
@@ -1283,6 +1283,7 @@ export function toV0CrawlerOptions(x: CrawlerOptions) {
allowSubdomains: x.allowSubdomains,
ignoreRobotsTxt: x.ignoreRobotsTxt,
ignoreSitemap: x.sitemap === "skip",
sitemapOnly: x.sitemap === "only",
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
@@ -1301,7 +1302,7 @@ export function toV2CrawlerOptions(x: any): CrawlerOptions {
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreRobotsTxt: x.ignoreRobotsTxt,
sitemap: x.ignoreSitemap ? "skip" : "include",
sitemap: x.sitemapOnly ? "only" : x.ignoreSitemap ? "skip" : "include",
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
@@ -1326,7 +1327,7 @@ function fromV0CrawlerOptions(
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreRobotsTxt: x.ignoreRobotsTxt,
sitemap: x.ignoreSitemap ? "skip" : "include",
sitemap: x.sitemapOnly ? "only" : x.ignoreSitemap ? "skip" : "include",
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
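Taken together, the conversions above give sitemapOnly precedence over ignoreSitemap when mapping legacy crawler options to the v2 sitemap enum, and sitemap === "only" maps back to sitemapOnly: true. A standalone sketch of that precedence follows, for illustration only; names are simplified and the real helpers are toV0CrawlerOptions, toV2CrawlerOptions, and fromV0CrawlerOptions above.

// Illustration of the sitemap-mode mapping introduced in this file (not the
// code that ships).
type SitemapMode = "skip" | "include" | "only";

function toSitemapMode(legacy: { sitemapOnly?: boolean; ignoreSitemap?: boolean }): SitemapMode {
  // sitemapOnly wins over ignoreSitemap, matching toV2CrawlerOptions / fromV0CrawlerOptions
  return legacy.sitemapOnly ? "only" : legacy.ignoreSitemap ? "skip" : "include";
}

function fromSitemapMode(mode: SitemapMode) {
  // matches toV0CrawlerOptions: "skip" sets ignoreSitemap, "only" sets sitemapOnly
  return { ignoreSitemap: mode === "skip", sitemapOnly: mode === "only" };
}

console.assert(toSitemapMode({ sitemapOnly: true, ignoreSitemap: true }) === "only");
console.assert(toSitemapMode({ ignoreSitemap: true }) === "skip");
console.assert(toSitemapMode({}) === "include");
console.assert(fromSitemapMode("only").sitemapOnly === true);
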
139 changes: 71 additions & 68 deletions apps/api/src/services/worker/scrape-worker.ts
@@ -356,78 +356,81 @@ async function processJob(job: NuQJob<ScrapeJobSingleUrls>) {
doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
);

const links = await crawler.filterLinks(
await crawler.extractLinksFromHTML(
rawHtml ?? "",
doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!,
),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10,
);
logger.debug("Discovered " + links.links.length + " links...", {
linksLength: links.links.length,
});
if (!sc.crawlerOptions?.sitemapOnly) {
const links = await crawler.filterLinks(
await crawler.extractLinksFromHTML(
rawHtml ?? "",
doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!,
),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10,
);
logger.debug("Discovered " + links.links.length + " links...", {
linksLength: links.links.length,
});

// Store robots blocked URLs in Redis set
for (const [url, reason] of links.denialReasons) {
if (reason === "URL blocked by robots.txt") {
await recordRobotsBlocked(job.data.crawl_id, url);
// Store robots blocked URLs in Redis set
for (const [url, reason] of links.denialReasons) {
if (reason === "URL blocked by robots.txt") {
await recordRobotsBlocked(job.data.crawl_id, url);
}
}
}

for (const link of links.links) {
if (await lockURL(job.data.crawl_id, sc, link)) {
// This seems to work really welel
const jobPriority = await getJobPriority({
team_id: sc.team_id,
basePriority: job.data.crawl_id ? 20 : 10,
});
const jobId = uuidv7();

logger.debug(
"Determined job priority " +
jobPriority +
" for URL " +
JSON.stringify(link),
{ jobPriority, url: link },
);

await addScrapeJob(
{
url: link,
mode: "single_urls",
for (const link of links.links) {
if (await lockURL(job.data.crawl_id, sc, link)) {
// This seems to work really welel
const jobPriority = await getJobPriority({
team_id: sc.team_id,
scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
internalOptions: sc.internalOptions,
crawlerOptions: {
...sc.crawlerOptions,
currentDiscoveryDepth:
(job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
basePriority: job.data.crawl_id ? 20 : 10,
});
const jobId = uuidv7();

logger.debug(
"Determined job priority " +
jobPriority +
" for URL " +
JSON.stringify(link),
{ jobPriority, url: link },
);

await addScrapeJob(
{
url: link,
mode: "single_urls",
team_id: sc.team_id,
scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
internalOptions: sc.internalOptions,
crawlerOptions: {
...sc.crawlerOptions,
currentDiscoveryDepth:
(job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) +
1,
},
origin: job.data.origin,
integration: job.data.integration,
crawl_id: job.data.crawl_id,
requestId: job.data.requestId,
webhook: job.data.webhook,
v1: job.data.v1,
zeroDataRetention: job.data.zeroDataRetention,
apiKeyId: job.data.apiKeyId,
},
origin: job.data.origin,
integration: job.data.integration,
crawl_id: job.data.crawl_id,
requestId: job.data.requestId,
webhook: job.data.webhook,
v1: job.data.v1,
zeroDataRetention: job.data.zeroDataRetention,
apiKeyId: job.data.apiKeyId,
},
jobId,
jobPriority,
);

await addCrawlJob(job.data.crawl_id, jobId, logger);
logger.debug("Added job for URL " + JSON.stringify(link), {
jobPriority,
url: link,
newJobId: jobId,
});
} else {
// TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
// logger.debug("Could not lock URL " + JSON.stringify(link), {
// url: link,
// });
jobId,
jobPriority,
);

await addCrawlJob(job.data.crawl_id, jobId, logger);
logger.debug("Added job for URL " + JSON.stringify(link), {
jobPriority,
url: link,
newJobId: jobId,
});
} else {
// TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
// logger.debug("Could not lock URL " + JSON.stringify(link), {
// url: link,
// });
}
}
}

@@ -719,7 +722,7 @@ async function kickoffGetIndexLinks(
crawler: WebCrawler,
url: string,
) {
if (sc.crawlerOptions.ignoreSitemap) {
if (sc.crawlerOptions.ignoreSitemap || sc.crawlerOptions.sitemapOnly) {
return [];
}

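Behaviorally, the worker changes above mean that when sitemapOnly is set, link discovery from scraped HTML is skipped and kickoffGetIndexLinks contributes no index links, so only sitemap-derived URLs plus the start URL are enqueued (which is what the new crawl tests assert). A condensed paraphrase of the two guards, with simplified names and not the shipped code:

// Condensed paraphrase of the two guards added in scrape-worker.ts.
interface LegacyCrawlerOptions {
  sitemapOnly?: boolean;
  ignoreSitemap?: boolean;
}

// Guard 1: processJob only extracts and enqueues links from page HTML when the
// crawl is not sitemap-only.
const shouldDiscoverLinksFromHtml = (opts: LegacyCrawlerOptions): boolean =>
  !opts.sitemapOnly;

// Guard 2: kickoffGetIndexLinks returns no links when the sitemap is ignored
// or the crawl is sitemap-only.
const shouldFetchIndexLinks = (opts: LegacyCrawlerOptions): boolean =>
  !opts.ignoreSitemap && !opts.sitemapOnly;
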
2 changes: 1 addition & 1 deletion apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "4.11.2",
"version": "4.11.3",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",