2 changes: 1 addition & 1 deletion apps/api/package.json
@@ -113,7 +113,7 @@
"joplin-turndown-plugin-gfm": "^1.0.12",
"jsdom": "^26.0.0",
"koffi": "^2.9.0",
"lodash": "^4.17.21",
"lodash": "^4.17.23",
"marked": "^14.1.2",
"ollama-ai-provider": "^1.2.0",
"openai": "^5.20.2",
105 changes: 34 additions & 71 deletions apps/api/pnpm-lock.yaml

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions apps/api/src/__tests__/snips/v2/crawl.test.ts
@@ -13,6 +13,7 @@ import {
crawl,
crawlOngoing,
crawlStart,
map,
Identity,
idmux,
scrapeTimeout,
@@ -23,6 +24,13 @@ import { describe, it, expect } from "@jest/globals";

let identity: Identity;

const normalizeUrlForCompare = (value: string) => {
const url = new URL(value);
url.hash = "";
const href = url.href;
return href.endsWith("/") ? href.slice(0, -1) : href;
};

beforeAll(async () => {
identity = await idmux({
name: "crawl",
@@ -69,6 +77,97 @@ describe("Crawl tests", () => {
10 * scrapeTimeout,
);

concurrentIf(ALLOW_TEST_SUITE_WEBSITE)(
"works with sitemap: only",
async () => {
const results = await crawl(
{
url: base,
limit: 10,
sitemap: "only",
},
identity,
);

expect(results.completed).toBeGreaterThan(0);
},
10 * scrapeTimeout,
);

concurrentIf(ALLOW_TEST_SUITE_WEBSITE)(
"sitemap-only results are subset of map-only + start URL",
async () => {
const mapResponse = await map(
{
url: base,
sitemap: "only",
includeSubdomains: false,
ignoreQueryParameters: false,
limit: 500,
},
identity,
);

expect(mapResponse.statusCode).toBe(200);
expect(mapResponse.body.success).toBe(true);

const sitemapUrls = new Set(
mapResponse.body.links.map(link => normalizeUrlForCompare(link.url)),
);
const baseNormalized = normalizeUrlForCompare(base);

const results = await crawl(
{
url: base,
limit: 50,
sitemap: "only",
},
identity,
);

expect(results.success).toBe(true);
if (results.success) {
for (const page of results.data) {
const pageUrl =
page.metadata.url ?? page.metadata.sourceURL ?? base;
const normalized = normalizeUrlForCompare(pageUrl);
expect(
normalized === baseNormalized || sitemapUrls.has(normalized),
).toBe(true);
}
}
},
10 * scrapeTimeout,
);

concurrentIf(TEST_PRODUCTION)(
"no sitemap found -> start URL only",
async () => {
const noSitemapUrl = "https://example.com";
const results = await crawl(
{
url: noSitemapUrl,
limit: 10,
sitemap: "only",
},
identity,
);

expect(results.success).toBe(true);
if (results.success) {
expect(results.data.length).toBe(1);
const pageUrl =
results.data[0].metadata.url ??
results.data[0].metadata.sourceURL ??
noSitemapUrl;
expect(normalizeUrlForCompare(pageUrl)).toBe(
normalizeUrlForCompare(noSitemapUrl),
);
}
},
10 * scrapeTimeout,
);

concurrentIf(ALLOW_TEST_SUITE_WEBSITE)(
"filters URLs properly",
async () => {
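The new tests above exercise the sitemap: "only" crawl mode end to end: seeding from the sitemap alone, checking that results are a subset of the mapped sitemap URLs plus the start URL, and falling back to the start URL when no sitemap exists. For orientation, a minimal caller-side sketch of requesting this mode over the v2 HTTP API is below; the base URL, endpoint path, and auth header are assumptions about the API shape and are not part of this diff.

// Hypothetical usage sketch (not part of this PR): start a crawl seeded only
// from the sitemap. Endpoint path and auth header are assumed.
async function startSitemapOnlyCrawl(apiKey: string, url: string) {
  const res = await fetch("https://api.firecrawl.dev/v2/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      url,
      limit: 10,
      sitemap: "only", // new enum value: "skip" | "include" | "only"
    }),
  });
  return res.json(); // expected to contain a crawl id to poll for results
}
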
14 changes: 12 additions & 2 deletions apps/api/src/__tests__/snips/v2/types-validation.test.ts
@@ -19,6 +19,7 @@ import {
BatchScrapeRequestInput,
SearchRequest,
SearchRequestInput,
toV2CrawlerOptions,
} from "../../../controllers/v2/types";

describe("V2 Types Validation", () => {
@@ -606,11 +607,11 @@ describe("V2 Types Validation", () => {
it("should handle sitemap enum values", () => {
const input: CrawlRequestInput = {
url: "https://example.com",
sitemap: "include",
sitemap: "only",
};

const result = crawlRequestSchema.parse(input);
expect(result.sitemap).toBe("include");
expect(result.sitemap).toBe("only");
});

it("should reject invalid sitemap value", () => {
@@ -621,6 +622,15 @@

expect(() => crawlRequestSchema.parse(input)).toThrow();
});

it("should map sitemapOnly to sitemap=only", () => {
const result = toV2CrawlerOptions({
sitemapOnly: true,
ignoreSitemap: false,
});

expect(result.sitemap).toBe("only");
});
});

describe("mapRequestSchema", () => {
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v2/crawl-params-preview.ts
@@ -29,7 +29,7 @@ type CrawlParamsPreviewResponse =
crawlEntireDomain?: boolean;
allowExternalLinks?: boolean;
allowSubdomains?: boolean;
sitemap?: "skip" | "include";
sitemap?: "skip" | "include" | "only";
ignoreQueryParameters?: boolean;
deduplicateSimilarURLs?: boolean;
delay?: number;
7 changes: 4 additions & 3 deletions apps/api/src/controllers/v2/types.ts
@@ -868,7 +868,7 @@ export const crawlerOptions = z.strictObject({
allowExternalLinks: z.boolean().prefault(false),
allowSubdomains: z.boolean().prefault(false),
ignoreRobotsTxt: z.boolean().prefault(false),
sitemap: z.enum(["skip", "include"]).prefault("include"),
sitemap: z.enum(["skip", "include", "only"]).prefault("include"),
deduplicateSimilarURLs: z.boolean().prefault(true),
ignoreQueryParameters: z.boolean().prefault(false),
regexOnFullURL: z.boolean().prefault(false),
@@ -1283,6 +1283,7 @@ export function toV0CrawlerOptions(x: CrawlerOptions) {
allowSubdomains: x.allowSubdomains,
ignoreRobotsTxt: x.ignoreRobotsTxt,
ignoreSitemap: x.sitemap === "skip",
sitemapOnly: x.sitemap === "only",
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
@@ -1301,7 +1302,7 @@ export function toV2CrawlerOptions(x: any): CrawlerOptions {
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreRobotsTxt: x.ignoreRobotsTxt,
sitemap: x.ignoreSitemap ? "skip" : "include",
sitemap: x.sitemapOnly ? "only" : x.ignoreSitemap ? "skip" : "include",
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
@@ -1326,7 +1327,7 @@ function fromV0CrawlerOptions(
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreRobotsTxt: x.ignoreRobotsTxt,
sitemap: x.ignoreSitemap ? "skip" : "include",
sitemap: x.sitemapOnly ? "only" : x.ignoreSitemap ? "skip" : "include",
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
regexOnFullURL: x.regexOnFullURL,
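Taken together, the conversions above give sitemapOnly precedence over ignoreSitemap when mapping legacy crawler options to the v2 sitemap enum, and sitemap === "only" maps back to sitemapOnly: true. A standalone sketch of that precedence follows, for illustration only; names are simplified and the real helpers are toV0CrawlerOptions, toV2CrawlerOptions, and fromV0CrawlerOptions above.

// Illustration of the sitemap-mode mapping introduced in this file (not the
// code that ships).
type SitemapMode = "skip" | "include" | "only";

function toSitemapMode(legacy: { sitemapOnly?: boolean; ignoreSitemap?: boolean }): SitemapMode {
  // sitemapOnly wins over ignoreSitemap, matching toV2CrawlerOptions / fromV0CrawlerOptions
  return legacy.sitemapOnly ? "only" : legacy.ignoreSitemap ? "skip" : "include";
}

function fromSitemapMode(mode: SitemapMode) {
  // matches toV0CrawlerOptions: "skip" sets ignoreSitemap, "only" sets sitemapOnly
  return { ignoreSitemap: mode === "skip", sitemapOnly: mode === "only" };
}

console.assert(toSitemapMode({ sitemapOnly: true, ignoreSitemap: true }) === "only");
console.assert(toSitemapMode({ ignoreSitemap: true }) === "skip");
console.assert(toSitemapMode({}) === "include");
console.assert(fromSitemapMode("only").sitemapOnly === true);
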
139 changes: 71 additions & 68 deletions apps/api/src/services/worker/scrape-worker.ts
@@ -356,78 +356,81 @@ async function processJob(job: NuQJob<ScrapeJobSingleUrls>) {
doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
);

const links = await crawler.filterLinks(
await crawler.extractLinksFromHTML(
rawHtml ?? "",
doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!,
),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10,
);
logger.debug("Discovered " + links.links.length + " links...", {
linksLength: links.links.length,
});
if (!sc.crawlerOptions?.sitemapOnly) {
const links = await crawler.filterLinks(
await crawler.extractLinksFromHTML(
rawHtml ?? "",
doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!,
),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10,
);
logger.debug("Discovered " + links.links.length + " links...", {
linksLength: links.links.length,
});

// Store robots blocked URLs in Redis set
for (const [url, reason] of links.denialReasons) {
if (reason === "URL blocked by robots.txt") {
await recordRobotsBlocked(job.data.crawl_id, url);
// Store robots blocked URLs in Redis set
for (const [url, reason] of links.denialReasons) {
if (reason === "URL blocked by robots.txt") {
await recordRobotsBlocked(job.data.crawl_id, url);
}
}
}

for (const link of links.links) {
if (await lockURL(job.data.crawl_id, sc, link)) {
// This seems to work really welel
const jobPriority = await getJobPriority({
team_id: sc.team_id,
basePriority: job.data.crawl_id ? 20 : 10,
});
const jobId = uuidv7();

logger.debug(
"Determined job priority " +
jobPriority +
" for URL " +
JSON.stringify(link),
{ jobPriority, url: link },
);

await addScrapeJob(
{
url: link,
mode: "single_urls",
for (const link of links.links) {
if (await lockURL(job.data.crawl_id, sc, link)) {
// This seems to work really welel
const jobPriority = await getJobPriority({
team_id: sc.team_id,
scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
internalOptions: sc.internalOptions,
crawlerOptions: {
...sc.crawlerOptions,
currentDiscoveryDepth:
(job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
basePriority: job.data.crawl_id ? 20 : 10,
});
const jobId = uuidv7();

logger.debug(
"Determined job priority " +
jobPriority +
" for URL " +
JSON.stringify(link),
{ jobPriority, url: link },
);

await addScrapeJob(
{
url: link,
mode: "single_urls",
team_id: sc.team_id,
scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
internalOptions: sc.internalOptions,
crawlerOptions: {
...sc.crawlerOptions,
currentDiscoveryDepth:
(job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) +
1,
},
origin: job.data.origin,
integration: job.data.integration,
crawl_id: job.data.crawl_id,
requestId: job.data.requestId,
webhook: job.data.webhook,
v1: job.data.v1,
zeroDataRetention: job.data.zeroDataRetention,
apiKeyId: job.data.apiKeyId,
},
origin: job.data.origin,
integration: job.data.integration,
crawl_id: job.data.crawl_id,
requestId: job.data.requestId,
webhook: job.data.webhook,
v1: job.data.v1,
zeroDataRetention: job.data.zeroDataRetention,
apiKeyId: job.data.apiKeyId,
},
jobId,
jobPriority,
);

await addCrawlJob(job.data.crawl_id, jobId, logger);
logger.debug("Added job for URL " + JSON.stringify(link), {
jobPriority,
url: link,
newJobId: jobId,
});
} else {
// TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
// logger.debug("Could not lock URL " + JSON.stringify(link), {
// url: link,
// });
jobId,
jobPriority,
);

await addCrawlJob(job.data.crawl_id, jobId, logger);
logger.debug("Added job for URL " + JSON.stringify(link), {
jobPriority,
url: link,
newJobId: jobId,
});
} else {
// TODO: removed this, ok? too many 'not useful' logs (?) Mogery!
// logger.debug("Could not lock URL " + JSON.stringify(link), {
// url: link,
// });
}
}
}

@@ -719,7 +722,7 @@ async function kickoffGetIndexLinks(
crawler: WebCrawler,
url: string,
) {
if (sc.crawlerOptions.ignoreSitemap) {
if (sc.crawlerOptions.ignoreSitemap || sc.crawlerOptions.sitemapOnly) {
return [];
}

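Behaviorally, the worker changes above mean that when sitemapOnly is set, link discovery from scraped HTML is skipped and kickoffGetIndexLinks contributes no index links, so only sitemap-derived URLs plus the start URL are enqueued (which is what the new crawl tests assert). A condensed paraphrase of the two guards, with simplified names and not the shipped code:

// Condensed paraphrase of the two guards added in scrape-worker.ts.
interface LegacyCrawlerOptions {
  sitemapOnly?: boolean;
  ignoreSitemap?: boolean;
}

// Guard 1: processJob only extracts and enqueues links from page HTML when the
// crawl is not sitemap-only.
const shouldDiscoverLinksFromHtml = (opts: LegacyCrawlerOptions): boolean =>
  !opts.sitemapOnly;

// Guard 2: kickoffGetIndexLinks returns no links when the sitemap is ignored
// or the crawl is sitemap-only.
const shouldFetchIndexLinks = (opts: LegacyCrawlerOptions): boolean =>
  !opts.ignoreSitemap && !opts.sitemapOnly;
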
2 changes: 1 addition & 1 deletion apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "4.11.2",
"version": "4.11.3",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",