
Commit 5930110
update to handle cf workers
1 parent 363659a

File tree: 9 files changed, +51 -69 lines changed


.github/workflows/publish.yaml
Lines changed: 18 additions & 9 deletions

@@ -1,3 +1,4 @@
+
 name: Publish to NPM
 
 on:
@@ -12,17 +13,25 @@ jobs:
 
     steps:
      - uses: actions/checkout@v2
-     - uses: volta-cli/action@v1
-     - run: npm ci --no-audit
-     - run: npm run lint --if-present
-     - run: npm test
-     - run: npm run build --if-present
-       env:
-         CI: true
+
+     - name: Install Deno
+       run: |
+         curl -fsSL https://deno.land/x/install/install.sh | sh
+         echo "DENO_INSTALL=/home/runner/.deno" >> $GITHUB_ENV
+         echo "$DENO_INSTALL/bin" >> $GITHUB_PATH
+
+     - name: Run Deno build script
+       run: deno run -A build.ts
+
+     - name: Navigate to npm directory
+       run: cd ./npm
+
      - name: Setup .npmrc file to publish to npm
        run: |
         echo "//registry.npmjs.org/:_authToken=$NPM_TOKEN" > .npmrc
       env:
-         NPM_TOKEN: ${{secrets.NPM_AUTOMATION_TOKEN}}
+         NPM_TOKEN: ${{ secrets.NPM_AUTOMATION_TOKEN }}
+
      - name: Publish to NPM
-       run: npm publish
+       run: npm publish --dry-run
+       working-directory: ./npm
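Note: each `run:` step starts a fresh shell, so the `Navigate to npm directory` step's `cd ./npm` has no effect on later steps; it is the `working-directory: ./npm` on the publish step that actually runs inside the build output. Also, `npm publish --dry-run` packs and validates the package but uploads nothing, so as committed this workflow does not publish for real.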

README.md
Lines changed: 8 additions & 2 deletions

@@ -1,8 +1,10 @@
 # Scrapfly SDK
 
-`npm install scrapfly-sdk`
+`npm install scrapfly-sdk`
+`deno add jsr:@scrapfly/scrapfly-sdk`
+`bun jsr add @scrapfly/scrapfly-sdk`
 
-Typescript/NodeJS SDK for [Scrapfly.io](https://scrapfly.io/) web scraping API which allows to:
+Typescript/Javascript SDK for [Scrapfly.io](https://scrapfly.io/) web scraping API which allows to:
 
 - Scrape the web without being blocked.
 - Use headless browsers to access Javascript-powered page data.
@@ -11,6 +13,10 @@ Typescript/NodeJS SDK for [Scrapfly.io](https://scrapfly.io/) web scraping API w
 
 For web scraping guides see [our blog](https://scrapfly.io/blog/) and [#scrapeguide](https://scrapfly.io/blog/tag/scrapeguide/) tag for how to scrape specific targets.
 
+The SDK is distributed through:
+- [npmjs.com/package/scrapfly-sdk](https://www.npmjs.com/package/scrapfly-sdk)
+- [jsr.io/@scrapfly/scrapfly-sdk](https://jsr.io/@scrapfly/scrapfly-sdk)
+
 ## Quick Intro
 
 1. Register a [Scrapfly account for free](https://scrapfly.io/register)

__tests__/config/extraction.test.ts
Lines changed: 3 additions & 2 deletions

@@ -114,7 +114,8 @@ Deno.test('url param generation: sets extraction_model', async () => {
     });
 });
 
-Deno.test({
+// XXX: could add auto compression but support is difficult
+/* Deno.test({
     name: 'url param generation: compresses body',
     async fn() {
         const config = new ExtractionConfig({
@@ -139,7 +140,7 @@ Deno.test({
     sanitizeResources: false,
     sanitizeOps: false,
 });
-
+*/
 Deno.test('url param generation: fails to missing compression state with declared compression format', async () => {
     const config = new ExtractionConfig({
         body: input_html,

__tests__/utils.test.ts
Lines changed: 4 additions & 2 deletions

@@ -81,7 +81,7 @@ Deno.test('fetchRetry: fails after max retries', async () => {
 
     await assertRejects(
         async () => {
-            await fetchRetry(request, {}, 3, 1000);
+            await fetchRetry(request, 3);
         },
         Error,
         'Fetch failed with status: 500'
@@ -92,6 +92,8 @@ Deno.test('fetchRetry: fails after max retries', async () => {
     fetchStub.restore();
 });
 
+// XXX: should we support built-in timeout?
+/*
 Deno.test('fetchRetry: fails due to timeout', async () => {
 
     const request = new Request('https://httpbin.dev/delay/3');
@@ -103,4 +105,4 @@ Deno.test('fetchRetry: fails due to timeout', async () => {
         Error,
         'The signal has been aborted'
     );
-});
+}); */

build.ts
Lines changed: 2 additions & 30 deletions

@@ -1,31 +1,4 @@
 import { build, emptyDir } from "@deno/dnt";
-import { rollup } from "npm:rollup";
-import resolve from "npm:@rollup/plugin-node-resolve";
-import commonjs from "npm:@rollup/plugin-commonjs";
-import { terser } from "npm:rollup-plugin-terser";
-
-async function bundleForBrowser() {
-    console.log("Bundling for browser...");
-    const bundle = await rollup({
-        input: "npm/esm/main.js",
-        plugins: [
-            resolve({ browser: true, preferBuiltins: false }),
-            commonjs(),
-            terser(),
-        ],
-        logLevel: 'debug',
-    });
-
-    console.log("Writing to npm/dist/bundle.js");
-    await bundle.write({
-        file: "npm/dist/bundle.js",
-        format: "esm",
-        sourcemap: true,
-    });
-    console.log("Closing Rollup");
-    await bundle.close();
-}
-
 await emptyDir("./npm");
 
 const { version, description } = JSON.parse(Deno.readTextFileSync("deno.json"));
@@ -53,13 +26,12 @@ await build({
             url: "https://github.com/scrapfly/typescript-scrapfly/issues",
         },
         homepage: "https://scrapfly.io/",
-        main: "./esm/src/main.js", // Point to the ESM output
-        types: "./esm/src/main.d.ts", // Point to the TypeScript declarations
+        main: "./esm/main.js",
+        types: "./esm/main.d.ts",
     },
     postBuild: async () => {
        Deno.copyFileSync("LICENSE", "npm/LICENSE");
        Deno.copyFileSync("README.md", "npm/README.md");
-       await bundleForBrowser();
    },
 });
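With `bundleForBrowser()` gone, the package no longer ships a Rollup-built `npm/dist/bundle.js`; the `main`/`types` fields now point straight at dnt's `esm/` output, which is what the `deno run -A build.ts` step in the workflow above produces under `./npm`.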

src/deps.ts
Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-import * as cheerio from "npm:[email protected]";
+import * as cheerio from 'https://cdn.skypack.dev/[email protected]?dts';
 import * as path from 'jsr:@std/[email protected]';
 
-export { cheerio, path};
+export { cheerio, path };
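The `?dts` suffix asks Skypack to serve the module with an `X-TypeScript-Types` header pointing at its type declarations, which Deno picks up automatically, so types survive the move away from the `npm:` specifier (presumably the part that did not resolve in the Cloudflare Workers build).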

src/extractionconfig.ts
Lines changed: 12 additions & 13 deletions

@@ -1,7 +1,5 @@
 import * as errors from './errors.ts';
 import { urlsafe_b64encode } from './utils.ts';
-import { gzipSync } from 'node:zlib';
-import { Buffer } from 'node:buffer';
 
 export enum CompressionFormat {
     /**
@@ -114,17 +112,18 @@ export class ExtractionConfig {
             );
         }
         if (this.is_document_compressed === false) {
-            if (this.document_compression_format === CompressionFormat.GZIP) {
-                // XXX: This breaks cloudflare workers as they don't support node:zlib
-                const compressed = gzipSync(Buffer.from(this.body as string, 'utf-8'));
-                this.body = new Uint8Array(compressed);
-            } else {
-                throw new errors.ExtractionConfigError(
-                    `Auto compression for ${this.document_compression_format} format isn't available. ` +
-                    `You can manually compress to ${this.document_compression_format}` +
-                    `or choose the gzip format for auto compression`,
-                );
-            }
+            // if (this.document_compression_format === CompressionFormat.GZIP) {
+            // XXX: This breaks cloudflare workers for some reason
+            // const compressed = gzip(new TextEncoder().encode(this.body as string));
+            // this.body = new Uint8Array(compressed);
+            // throw new Error("automatic gzip is not supported yet, pass gzipped ");
+            // } else {
+            throw new errors.ExtractionConfigError(
+                `Auto compression for ${this.document_compression_format} format isn't available. ` +
+                    `You can manually compress to ${this.document_compression_format}` +
+                    `or choose the gzip format for auto compression`,
+            );
+            // }
         }
     }
 
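With auto-gzip disabled, callers now have to compress the document themselves before constructing the config. As a hedged sketch (not part of this commit), the web-standard `CompressionStream` API works in Deno, modern browsers, and Cloudflare Workers alike:

    // Sketch only: gzip a document portably, then pass the result to
    // ExtractionConfig as body, with is_document_compressed: true and
    // document_compression_format: CompressionFormat.GZIP.
    async function gzipDocument(body: string): Promise<Uint8Array> {
        const gzipped = new Blob([body]).stream().pipeThrough(new CompressionStream('gzip'));
        return new Uint8Array(await new Response(gzipped).arrayBuffer());
    }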

src/result.ts
Lines changed: 1 addition & 1 deletion

@@ -312,7 +312,7 @@ export class ExtractionResult {
     content_type: string;
     data_quality?: string;
 
-    constructor(response: { data: string; content_type: string, data_quality?: string }) {
+    constructor(response: { data: string; content_type: string; data_quality?: string }) {
         this.data = response.data;
         this.content_type = response.content_type;
         this.data_quality = response.data_quality;

src/utils.ts
Lines changed: 1 addition & 8 deletions

@@ -10,21 +10,15 @@ export function urlsafe_b64encode(data: string): string {
 
 export async function fetchRetry(
     config: Request,
-    init: RequestInit = {},
     retries: number = 3,
     retryDelay: number = 1000,
-    timeout: number = 160000, // Timeout in milliseconds
 ): Promise<Response> {
     let lastError: any = null;
 
     for (let attempt = 1; attempt <= retries; attempt++) {
-        const controller = new AbortController();
-        const timeoutId = setTimeout(() => controller.abort(), timeout);
-
         try {
             // XXX: this breaks cloudflare workers as they don't support init options
-            const response = await fetch(config, { ...init, signal: controller.signal });
-            clearTimeout(timeoutId);
+            const response = await fetch(config);
             // retry 5xx status codes
             if (response.status >= 500 && response.status < 600) {
                 lastError = new Error(`Fetch failed with status: ${response.status}`);
@@ -35,7 +29,6 @@ export async function fetchRetry(
                 return response;
             }
         } catch (error) {
-            clearTimeout(timeoutId);
             lastError = error;
 
             if (attempt === retries || error.name === 'AbortError') {
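Dropping the `AbortController` logic also drops the 160-second default deadline, so a stalled request now hangs for as long as the runtime allows. A minimal caller-side sketch (the `withTimeout` helper below is hypothetical, not part of the SDK) restores one without touching fetch init options:

    // Sketch only: race the work against a timer instead of passing
    // { signal } to fetch(), which the commit notes breaks on CF Workers.
    async function withTimeout<T>(work: Promise<T>, ms: number): Promise<T> {
        let timer: number | undefined;
        const deadline = new Promise<never>((_, reject) => {
            timer = setTimeout(() => reject(new Error(`Timed out after ${ms}ms`)), ms);
        });
        try {
            return await Promise.race([work, deadline]);
        } finally {
            clearTimeout(timer); // don't leave the timer holding the event loop open
        }
    }

    // e.g. const response = await withTimeout(fetchRetry(request), 160_000);

Note that this abandons rather than aborts the losing fetch; true cancellation needs the AbortSignal support that the commented-out timeout test above still tracks.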
