
Commit fa1e7c7

feat(scrapeless): Integrate Scrapeless AI SDK and enhance API functionality

- Added @scrapeless-ai/sdk as a dependency.
- Updated API endpoints for the scraping and crawling functionality.
- Implemented error handling and job management for scraping tasks.
- Refactored existing methods to use the new SDK for improved performance and reliability.
1 parent 5d82077 commit fa1e7c7

File tree

6 files changed: +857 -153 lines changed

components/scrapeless/actions/crawler/crawler.mjs

Lines changed: 25 additions & 23 deletions

@@ -1,7 +1,7 @@
 import scrapeless from "../../scrapeless.app.mjs";
 
 export default {
-  key: "crawler-api",
+  key: "scrapeless-crawler",
   name: "Crawler",
   description: "Crawl any website at scale and say goodbye to blocks. [See the documentation](https://apidocs.scrapeless.com/api-17509010).",
   version: "0.0.1",
@@ -31,33 +31,35 @@ export default {
       scrapeless, apiServer, ...inputProps
     } = this;
 
-    if (apiServer === "crawl") {
-      const submitData = {
-        limit: inputProps.limitCrawlPages,
-        url: inputProps.url,
-      };
-      const response = await scrapeless.crawlerCrawl({
-        $,
-        submitData,
-        ...inputProps,
-      });
+    const browserOptions = {
+      "proxy_country": "ANY",
+      "session_name": "Crawl",
+      "session_recording": true,
+      "session_ttl": 900,
+    };
 
-      $.export("$summary", `Successfully retrieved crawling results for ${inputProps.url}`);
-      return response;
+    let response;
+
+    if (apiServer === "crawl") {
+      response =
+        await scrapeless._scrapelessClient().scrapingCrawl.crawl.crawlUrl(inputProps.url, {
+          limit: inputProps.limitCrawlPages,
+          browserOptions,
+        });
     }
 
     if (apiServer === "scrape") {
-      const submitData = {
-        url: inputProps.url,
-      };
-      const response = await scrapeless.crawlerScrape({
-        $,
-        submitData,
-        ...inputProps,
-      });
+      response =
+        await scrapeless._scrapelessClient().scrapingCrawl.scrape.scrapeUrl(inputProps.url, {
+          browserOptions,
+        });
+    }
 
-      $.export("$summary", `Successfully retrieved scraping results for ${inputProps.url}`);
-      return response;
+    if (response?.status === "completed" && response?.data) {
+      $.export("$summary", `Successfully retrieved crawling results for ${inputProps.url}`);
+      return response.data;
+    } else {
+      throw new Error(response?.error || "Failed to retrieve crawling results");
     }
   },
   async additionalProps() {
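For reference, a minimal standalone sketch of the new crawl path, assuming the SDK client and response shape match what the diff relies on (a "completed" status plus data/error fields); the URL and limit values are placeholders:

    // Sketch only: the crawl branch above, run outside of a Pipedream action.
    // Assumes @scrapeless-ai/sdk exposes the same scrapingCrawl.crawl.crawlUrl
    // call and response shape (status/data/error) used in the diff.
    import { Scrapeless } from "@scrapeless-ai/sdk";

    const client = new Scrapeless({ apiKey: process.env.SCRAPELESS_API_KEY });

    const browserOptions = {
      "proxy_country": "ANY",
      "session_name": "Crawl",
      "session_recording": true,
      "session_ttl": 900, // seconds
    };

    const response = await client.scrapingCrawl.crawl.crawlUrl("https://example.com", {
      limit: 5, // placeholder for the action's limitCrawlPages prop
      browserOptions,
    });

    if (response?.status === "completed" && response?.data) {
      console.log(response.data);
    } else {
      throw new Error(response?.error || "Failed to retrieve crawling results");
    }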

components/scrapeless/actions/scraping-api/scraping-api.mjs

Lines changed: 58 additions & 11 deletions

@@ -1,5 +1,5 @@
 import scrapeless from "../../scrapeless.app.mjs";
-
+import { log } from "../../common/utils.mjs";
 export default {
   key: "scrapeless-scraping-api",
   name: "Scraping API",
@@ -27,22 +27,69 @@ export default {
       scrapeless, apiServer, ...inputProps
     } = this;
 
+    const MAX_RETRIES = 3;
+    // 10 seconds
+    const DELAY = 1000 * 10;
+    const { run } = $.context;
+
+    let submitData;
+    let job;
+
+    // pre check if the job is already in the context
+    if (run?.context?.job) {
+      job = run.context.job;
+    }
+
     if (apiServer === "googleSearch") {
-      const submitData = {
+      submitData = {
         actor: "scraper.google.search",
-        q: inputProps.q,
-        hl: inputProps.hl,
-        gl: inputProps.gl,
+        input: {
+          q: inputProps.q,
+          hl: inputProps.hl,
+          gl: inputProps.gl,
+        },
       };
-      const response = await scrapeless.scrapingApi({
-        $,
-        submitData,
-        ...inputProps,
+    }
+
+    if (!submitData) {
+      throw new Error("No actor found");
+    }
+    // 1. Create a new scraping job
+    if (!job) {
+      job = await scrapeless._scrapelessClient().deepserp.createTask({
+        actor: submitData.actor,
+        input: submitData.input,
       });
 
-      $.export("$summary", "Successfully retrieved scraping results for Google Search");
-      return response;
+      if (job.status === 200) {
+        $.export("$summary", "Successfully retrieved scraping results");
+        return job.data;
+      }
+
+      log("task in progress");
     }
+
+    // 2. Wait for the job to complete
+    if (run.runs === 1) {
+      $.flow.rerun(DELAY, {
+        job,
+      }, MAX_RETRIES);
+    } else if (run.runs > MAX_RETRIES) {
+      throw new Error("Max retries reached");
+    } else if (job && job?.data?.taskId) {
+      const result = await scrapeless._scrapelessClient().deepserp.getTaskResult(job.data.taskId);
+      if (result.status === 200) {
+        $.export("$summary", "Successfully retrieved scraping results");
+        return result.data;
+      } else {
+        $.flow.rerun(DELAY, {
+          job,
+        }, MAX_RETRIES);
+      }
+    } else {
+      throw new Error("No job found");
+    }
+
   },
   async additionalProps() {
     const { apiServer } = this;
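The control flow above is Pipedream's suspend/resume polling pattern: $.flow.rerun(delayMs, context, maxRetries) suspends the step and re-invokes it later, the passed context comes back as $.context.run.context, and $.context.run.runs counts invocations. A condensed sketch of just that skeleton, inside an action's run(); the actor and query are placeholders, while the createTask/getTaskResult calls are the ones introduced in the diff:

    // Condensed sketch of the polling skeleton above (a Pipedream action method).
    async run({ $ }) {
      const { run } = $.context;
      // A job stashed by a previous $.flow.rerun comes back on run.context.
      let job = run?.context?.job;

      if (!job) {
        job = await this.scrapeless._scrapelessClient().deepserp.createTask({
          actor: "scraper.google.search", // placeholder actor
          input: { q: "example query" },  // placeholder input
        });
        if (job.status === 200) return job.data; // finished synchronously
      }

      if (run.runs > 3) throw new Error("Max retries reached");

      if (job?.data?.taskId) {
        const result = await this.scrapeless._scrapelessClient().deepserp.getTaskResult(job.data.taskId);
        if (result.status === 200) return result.data;
      }

      // Not done yet: suspend for 10s, carry the job forward, retry up to 3 times.
      return $.flow.rerun(10 * 1000, { job }, 3);
    }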

components/scrapeless/actions/universal-scraping-api/universal-scraping-api.mjs

Lines changed: 11 additions & 11 deletions

@@ -25,21 +25,21 @@ export default {
   },
   async run({ $ }) {
     const {
-      apiServer, ...rest
+      scrapeless,
+      apiServer, ...inputProps
     } = this;
 
     if (apiServer === "webUnlocker") {
-      const submitData = {
+      const response = await scrapeless._scrapelessClient().universal.scrape({
         actor: "unlocker.webunlocker",
-        country: rest.country,
-        url: rest.url,
-        jsRender: rest.jsRender,
-        headless: rest.headless,
-      };
-      const response = await this.scrapeless.universalScrapingApi({
-        $,
-        submitData,
-        ...rest,
+        input: {
+          url: inputProps.url,
+          headless: inputProps.headless,
+          js_render: inputProps.jsRender,
+        },
+        proxy: {
+          country: inputProps.country,
+        },
       });
 
       $.export("$summary", "Successfully retrieved scraping results for Web Unlocker");
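Standalone, the same Web Unlocker request looks like the sketch below; note that the payload now nests url/headless/js_render under input (with the snake_case js_render key) and moves country under proxy. The URL and country values are placeholders:

    // Sketch: the Web Unlocker call above, issued directly through the SDK.
    import { Scrapeless } from "@scrapeless-ai/sdk";

    const client = new Scrapeless({ apiKey: process.env.SCRAPELESS_API_KEY });

    const response = await client.universal.scrape({
      actor: "unlocker.webunlocker",
      input: {
        url: "https://example.com", // placeholder
        headless: false,
        js_render: true, // snake_case key expected by the API
      },
      proxy: {
        country: "US", // placeholder; the action passes the user's country prop
      },
    });

    console.log(response);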

components/scrapeless/package.json

Lines changed: 2 additions & 1 deletion

@@ -13,6 +13,7 @@
     "access": "public"
   },
   "dependencies": {
-    "@pipedream/platform": "^3.0.3"
+    "@pipedream/platform": "^3.0.3",
+    "@scrapeless-ai/sdk": "^1.4.0"
   }
 }

components/scrapeless/scrapeless.app.mjs

Lines changed: 16 additions & 56 deletions

@@ -1,11 +1,13 @@
 import { axios } from "@pipedream/platform";
+import { ConfigurationError } from "@pipedream/platform";
+import { Scrapeless } from "@scrapeless-ai/sdk";
 
 export default {
   type: "app",
   app: "scrapeless",
   methods: {
     _baseUrl() {
-      return "https://scrapeless-nodes.norains.com/api/v1";
+      return "https://api.scrapeless.com/api";
     },
     _headers() {
       return {
@@ -21,70 +23,28 @@ export default {
         ...opts,
       });
     },
+    _scrapelessClient() {
+      const { api_key } = this.$auth;
+      if (!api_key) {
+        throw new ConfigurationError("API key is required");
+      }
+
+      return new Scrapeless({
+        apiKey: api_key,
+        baseUrl: this._baseUrl(),
+      });
+    },
     submitScrapeJob(opts = {}) {
       return this._makeRequest({
         method: "POST",
-        path: "/nodes/scraper/request",
+        path: "/scraper/request",
         ...opts,
       });
     },
     getScrapeResult({ scrapeJobId }) {
       return this._makeRequest({
-        path: `/nodes/scraper/result/${scrapeJobId}`,
-      });
-    },
-    async scrapingApi({ submitData }) {
-      const path = "/nodes/deepserp";
-      const res = await this._makeRequest({
-        method: "POST",
-        path,
-        data: submitData,
-      });
-
-      return res;
-    },
-    async universalScrapingApi({ submitData }) {
-      const path = "/nodes/universal-scraping/unlocker";
-      const res = await this._makeRequest({
-        method: "POST",
-        path,
-        data: submitData,
+        path: `/scraper/result/${scrapeJobId}`,
       });
-      return res;
-    },
-    async crawlerCrawl({ submitData }) {
-      const path = "/nodes/crawler/crawl";
-
-      const data = {
-        url: submitData.url,
-        limit: submitData.limit,
-      };
-
-      const res = await this._makeRequest({
-        method: "POST",
-        path,
-        data,
-      });
-
-      return res;
-    },
-    async crawlerScrape({ submitData }) {
-      const path = "/nodes/crawler/scrape";
-
-      const data = {
-        url: submitData.url,
-      };
-
-      try {
-        const response = await this._makeRequest({
-          method: "POST",
-          path,
-          data,
-        });
-        return response;
-      } catch (error) {
-        throw new Error(error.message);
-      }
     },
   },
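_scrapelessClient() centralizes SDK construction: it validates the API key up front (throwing Pipedream's ConfigurationError so a missing key fails fast rather than mid-request) and builds a fresh client against _baseUrl(). Actions then chain namespaced calls off it, as in this hypothetical snippet from inside an action's run():

    // Sketch: how the actions above consume the helper.
    const client = this.scrapeless._scrapelessClient();
    const task = await client.deepserp.createTask({
      actor: "scraper.google.search", // placeholder actor/input
      input: { q: "example" },
    });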
