
Commit fa1e7c7

feat(scrapeless): Integrate Scrapeless AI SDK and enhance API functionality

- Added @scrapeless-ai/sdk as a dependency.
- Updated API endpoints for the scraping and crawling functionality.
- Implemented error handling and job management for scraping tasks.
- Refactored existing methods to use the new SDK for improved performance and reliability.
1 parent 5d82077 commit fa1e7c7

File tree

6 files changed: +857 -153 lines changed

components/scrapeless/actions/crawler/crawler.mjs

Lines changed: 25 additions & 23 deletions

@@ -1,7 +1,7 @@
 import scrapeless from "../../scrapeless.app.mjs";
 
 export default {
-  key: "crawler-api",
+  key: "scrapeless-crawler",
   name: "Crawler",
   description: "Crawl any website at scale and say goodbye to blocks. [See the documentation](https://apidocs.scrapeless.com/api-17509010).",
   version: "0.0.1",
@@ -31,33 +31,35 @@ export default {
       scrapeless, apiServer, ...inputProps
     } = this;
 
-    if (apiServer === "crawl") {
-      const submitData = {
-        limit: inputProps.limitCrawlPages,
-        url: inputProps.url,
-      };
-      const response = await scrapeless.crawlerCrawl({
-        $,
-        submitData,
-        ...inputProps,
-      });
+    const browserOptions = {
+      "proxy_country": "ANY",
+      "session_name": "Crawl",
+      "session_recording": true,
+      "session_ttl": 900,
+    };
 
-      $.export("$summary", `Successfully retrieved crawling results for ${inputProps.url}`);
-      return response;
+    let response;
+
+    if (apiServer === "crawl") {
+      response =
+        await scrapeless._scrapelessClient().scrapingCrawl.crawl.crawlUrl(inputProps.url, {
+          limit: inputProps.limitCrawlPages,
+          browserOptions,
+        });
     }
 
     if (apiServer === "scrape") {
-      const submitData = {
-        url: inputProps.url,
-      };
-      const response = await scrapeless.crawlerScrape({
-        $,
-        submitData,
-        ...inputProps,
-      });
+      response =
+        await scrapeless._scrapelessClient().scrapingCrawl.scrape.scrapeUrl(inputProps.url, {
+          browserOptions,
+        });
+    }
 
-      $.export("$summary", `Successfully retrieved scraping results for ${inputProps.url}`);
-      return response;
+    if (response?.status === "completed" && response?.data) {
+      $.export("$summary", `Successfully retrieved crawling results for ${inputProps.url}`);
+      return response.data;
+    } else {
+      throw new Error(response?.error || "Failed to retrieve crawling results");
     }
   },
   async additionalProps() {
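For reference, a minimal standalone sketch of the new crawl path, assuming the SDK client and response shape match what the diff relies on (a "completed" status plus data/error fields); the URL and limit values are placeholders:

    // Sketch only: the crawl branch above, run outside of a Pipedream action.
    // Assumes @scrapeless-ai/sdk exposes the same scrapingCrawl.crawl.crawlUrl
    // call and response shape (status/data/error) used in the diff.
    import { Scrapeless } from "@scrapeless-ai/sdk";

    const client = new Scrapeless({ apiKey: process.env.SCRAPELESS_API_KEY });

    const browserOptions = {
      "proxy_country": "ANY",
      "session_name": "Crawl",
      "session_recording": true,
      "session_ttl": 900, // seconds
    };

    const response = await client.scrapingCrawl.crawl.crawlUrl("https://example.com", {
      limit: 5, // placeholder for the action's limitCrawlPages prop
      browserOptions,
    });

    if (response?.status === "completed" && response?.data) {
      console.log(response.data);
    } else {
      throw new Error(response?.error || "Failed to retrieve crawling results");
    }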

components/scrapeless/actions/scraping-api/scraping-api.mjs

Lines changed: 58 additions & 11 deletions

@@ -1,5 +1,5 @@
 import scrapeless from "../../scrapeless.app.mjs";
-
+import { log } from "../../common/utils.mjs";
 export default {
   key: "scrapeless-scraping-api",
   name: "Scraping API",
@@ -27,22 +27,69 @@ export default {
       scrapeless, apiServer, ...inputProps
     } = this;
 
+    const MAX_RETRIES = 3;
+    // 10 seconds
+    const DELAY = 1000 * 10;
+    const { run } = $.context;
+
+    let submitData;
+    let job;
+
+    // pre check if the job is already in the context
+    if (run?.context?.job) {
+      job = run.context.job;
+    }
+
     if (apiServer === "googleSearch") {
-      const submitData = {
+      submitData = {
         actor: "scraper.google.search",
-        q: inputProps.q,
-        hl: inputProps.hl,
-        gl: inputProps.gl,
+        input: {
+          q: inputProps.q,
+          hl: inputProps.hl,
+          gl: inputProps.gl,
+        },
       };
-      const response = await scrapeless.scrapingApi({
-        $,
-        submitData,
-        ...inputProps,
+    }
+
+    if (!submitData) {
+      throw new Error("No actor found");
+    }
+    // 1. Create a new scraping job
+    if (!job) {
+      job = await scrapeless._scrapelessClient().deepserp.createTask({
+        actor: submitData.actor,
+        input: submitData.input,
       });
 
-      $.export("$summary", "Successfully retrieved scraping results for Google Search");
-      return response;
+      if (job.status === 200) {
+        $.export("$summary", "Successfully retrieved scraping results");
+        return job.data;
+      }
+
+      log("task in progress");
     }
+
+    // 2. Wait for the job to complete
+    if (run.runs === 1) {
+      $.flow.rerun(DELAY, {
+        job,
+      }, MAX_RETRIES);
+    } else if (run.runs > MAX_RETRIES) {
+      throw new Error("Max retries reached");
+    } else if (job && job?.data?.taskId) {
+      const result = await scrapeless._scrapelessClient().deepserp.getTaskResult(job.data.taskId);
+      if (result.status === 200) {
+        $.export("$summary", "Successfully retrieved scraping results");
+        return result.data;
+      } else {
+        $.flow.rerun(DELAY, {
+          job,
+        }, MAX_RETRIES);
+      }
+    } else {
+      throw new Error("No job found");
+    }
+
   },
   async additionalProps() {
     const { apiServer } = this;
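The control flow above is Pipedream's suspend/resume polling pattern: $.flow.rerun(delayMs, context, maxRetries) suspends the step and re-invokes it later, the passed context comes back as $.context.run.context, and $.context.run.runs counts invocations. A condensed sketch of just that skeleton, inside an action's run(); the actor and query are placeholders, while the createTask/getTaskResult calls are the ones introduced in the diff:

    // Condensed sketch of the polling skeleton above (a Pipedream action method).
    async run({ $ }) {
      const { run } = $.context;
      // A job stashed by a previous $.flow.rerun comes back on run.context.
      let job = run?.context?.job;

      if (!job) {
        job = await this.scrapeless._scrapelessClient().deepserp.createTask({
          actor: "scraper.google.search", // placeholder actor
          input: { q: "example query" },  // placeholder input
        });
        if (job.status === 200) return job.data; // finished synchronously
      }

      if (run.runs > 3) throw new Error("Max retries reached");

      if (job?.data?.taskId) {
        const result = await this.scrapeless._scrapelessClient().deepserp.getTaskResult(job.data.taskId);
        if (result.status === 200) return result.data;
      }

      // Not done yet: suspend for 10s, carry the job forward, retry up to 3 times.
      return $.flow.rerun(10 * 1000, { job }, 3);
    }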

components/scrapeless/actions/universal-scraping-api/universal-scraping-api.mjs

Lines changed: 11 additions & 11 deletions

@@ -25,21 +25,21 @@ export default {
   },
   async run({ $ }) {
     const {
-      apiServer, ...rest
+      scrapeless,
+      apiServer, ...inputProps
     } = this;
 
     if (apiServer === "webUnlocker") {
-      const submitData = {
+      const response = await scrapeless._scrapelessClient().universal.scrape({
         actor: "unlocker.webunlocker",
-        country: rest.country,
-        url: rest.url,
-        jsRender: rest.jsRender,
-        headless: rest.headless,
-      };
-      const response = await this.scrapeless.universalScrapingApi({
-        $,
-        submitData,
-        ...rest,
+        input: {
+          url: inputProps.url,
+          headless: inputProps.headless,
+          js_render: inputProps.jsRender,
+        },
+        proxy: {
+          country: inputProps.country,
+        },
       });
 
       $.export("$summary", "Successfully retrieved scraping results for Web Unlocker");
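Standalone, the same Web Unlocker request looks like the sketch below; note that the payload now nests url/headless/js_render under input (with the snake_case js_render key) and moves country under proxy. The URL and country values are placeholders:

    // Sketch: the Web Unlocker call above, issued directly through the SDK.
    import { Scrapeless } from "@scrapeless-ai/sdk";

    const client = new Scrapeless({ apiKey: process.env.SCRAPELESS_API_KEY });

    const response = await client.universal.scrape({
      actor: "unlocker.webunlocker",
      input: {
        url: "https://example.com", // placeholder
        headless: false,
        js_render: true, // snake_case key expected by the API
      },
      proxy: {
        country: "US", // placeholder; the action passes the user's country prop
      },
    });

    console.log(response);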

components/scrapeless/package.json

Lines changed: 2 additions & 1 deletion

@@ -13,6 +13,7 @@
     "access": "public"
   },
   "dependencies": {
-    "@pipedream/platform": "^3.0.3"
+    "@pipedream/platform": "^3.0.3",
+    "@scrapeless-ai/sdk": "^1.4.0"
   }
 }

components/scrapeless/scrapeless.app.mjs

Lines changed: 16 additions & 56 deletions

@@ -1,11 +1,13 @@
 import { axios } from "@pipedream/platform";
+import { ConfigurationError } from "@pipedream/platform";
+import { Scrapeless } from "@scrapeless-ai/sdk";
 
 export default {
   type: "app",
   app: "scrapeless",
   methods: {
     _baseUrl() {
-      return "https://scrapeless-nodes.norains.com/api/v1";
+      return "https://api.scrapeless.com/api";
     },
     _headers() {
       return {
@@ -21,70 +23,28 @@ export default {
         ...opts,
       });
     },
+    _scrapelessClient() {
+      const { api_key } = this.$auth;
+      if (!api_key) {
+        throw new ConfigurationError("API key is required");
+      }
+
+      return new Scrapeless({
+        apiKey: api_key,
+        baseUrl: this._baseUrl(),
+      });
+    },
     submitScrapeJob(opts = {}) {
       return this._makeRequest({
         method: "POST",
-        path: "/nodes/scraper/request",
+        path: "/scraper/request",
         ...opts,
       });
     },
     getScrapeResult({ scrapeJobId }) {
       return this._makeRequest({
-        path: `/nodes/scraper/result/${scrapeJobId}`,
-      });
-    },
-    async scrapingApi({ submitData }) {
-      const path = "/nodes/deepserp";
-      const res = await this._makeRequest({
-        method: "POST",
-        path,
-        data: submitData,
-      });
-
-      return res;
-    },
-    async universalScrapingApi({ submitData }) {
-      const path = "/nodes/universal-scraping/unlocker";
-      const res = await this._makeRequest({
-        method: "POST",
-        path,
-        data: submitData,
+        path: `/scraper/result/${scrapeJobId}`,
       });
-      return res;
-    },
-    async crawlerCrawl({ submitData }) {
-      const path = "/nodes/crawler/crawl";
-
-      const data = {
-        url: submitData.url,
-        limit: submitData.limit,
-      };
-
-      const res = await this._makeRequest({
-        method: "POST",
-        path,
-        data,
-      });
-
-      return res;
-    },
-    async crawlerScrape({ submitData }) {
-      const path = "/nodes/crawler/scrape";
-
-      const data = {
-        url: submitData.url,
-      };
-
-      try {
-        const response = await this._makeRequest({
-          method: "POST",
-          path,
-          data,
-        });
-        return response;
-      } catch (error) {
-        throw new Error(error.message);
-      }
     },
   },
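_scrapelessClient() centralizes SDK construction: it validates the API key up front (throwing Pipedream's ConfigurationError so a missing key fails fast rather than mid-request) and builds a fresh client against _baseUrl(). Actions then chain namespaced calls off it, as in this hypothetical snippet from inside an action's run():

    // Sketch: how the actions above consume the helper.
    const client = this.scrapeless._scrapelessClient();
    const task = await client.deepserp.createTask({
      actor: "scraper.google.search", // placeholder actor/input
      input: { q: "example" },
    });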
