
Commit 8a32f82

add header normalization for header checks in the screenshot API and the web scraping API selector generator, and format code
1 parent a09d6b9 commit 8a32f82

8 files changed: +47 -17 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ $ deno lint
 # publish JSR:
 $ deno publish
 # build NPM package:
-$ deno build-npm
+$ deno task build-npm
 # publish NPM:
 $ cd npm && npm publish
 ```

__tests__/client/screenshot.test.ts

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ import { assertEquals, assertRejects } from "https://deno.land/[email protected]/asser
 import { stub } from "https://deno.land/std/testing/mock.ts";
 import type { RequestOptions } from '../../src/utils.ts';
 import { mockedStream, responseFactory } from '../utils.ts';
+import { ScreenshotResult } from '../../src/result.ts';
 
 Deno.test('screenshot: succeeds', async () => {
     const KEY = '__API_KEY__';
@@ -23,7 +24,7 @@ Deno.test('screenshot: succeeds', async () => {
         status: 200,
         headers: {
             'content-encoding': 'gzip',
-            'content-type': 'image/png',
+            'ContEnT-TyPe': 'image/png', // ensure case insensitivity
             'x-scrapfly-upstream-http-code': '200',
             'x-scrapfly-upstream-url': url,
         },
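
The deliberately mixed-case 'ContEnT-TyPe' key exercises the new normalization: lookups on a Fetch-API Headers object are case-insensitive by spec, but lookups on a plain header record are not. A minimal standalone sketch of that difference (not SDK code):

```ts
// Lookups on a real Headers object are case-insensitive by spec...
const fetchHeaders = new Headers({ 'ContEnT-TyPe': 'image/png' });
console.log(fetchHeaders.get('content-type')); // "image/png"

// ...but lookups on a plain record are not, which is the gap the new
// normalizeHeaders() helper closes for header records in API results.
const plainHeaders: Record<string, string> = { 'ContEnT-TyPe': 'image/png' };
console.log(plainHeaders['content-type']); // undefined
```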

__tests__/result.test.ts

Lines changed: 8 additions & 0 deletions
@@ -12,6 +12,14 @@ Deno.test('cheerio selector lazy loads and caches itself', async () => {
     // cheerio.load is called exactly once - means it's cached
 });
 
+
+Deno.test('cheerio selector loads with case sensitive headers', async () => {
+    const response = JSON.parse(await Deno.readTextFile('__tests__/data/response_html_case_sensitive_headers.json'));
+    const result = new ScrapeResult(response);
+    assertEquals(result.selector('h1').text(), 'Herman Melville - Moby-Dick');
+});
+
+
 Deno.test('throws ContentTypeError when accessing .selector on JSON data', async () => {
     const responseJsonSuccess = JSON.parse(await Deno.readTextFile('__tests__/data/response_json_success.json'));
     const result = new ScrapeResult(responseJsonSuccess);

deno.json

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
     },
     "name": "@scrapfly/scrapfly-sdk",
     "exports": "./src/main.ts",
-    "version": "0.6.4",
+    "version": "0.6.5",
     "description": "SDK for Scrapfly.io API for web scraping, screenshotting and data extraction",
     "tasks": {
         "start": "deno run --allow-net --allow-read src/main.ts",

examples/bun/README.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ Bun is a modern javascript runtime that can execute both javascript and typescri
 
 These examples demonstrate Typescript SDK usage with Bun and for that install the SDK using jsr.io which distributes Typescript files:
 
-```
+```bash
 $ bunx jsr add @scrapfly/scrapfly-sdk
 ```

src/client.ts

Lines changed: 8 additions & 8 deletions
@@ -95,9 +95,9 @@ export class ScrapflyClient {
     /**
      * Handle clob and blob large objects
      */
-    async handleLargeObjects(result: any, format: "clob" | "blob"): Promise<ScrapeResult> {
+    async handleLargeObjects(result: any, format: 'clob' | 'blob'): Promise<ScrapeResult> {
         let response: Response;
-
+
         try {
             const url = new URL(result.content);
             const params = { key: this.key };
@@ -117,14 +117,14 @@ export class ScrapflyClient {
         }
 
         const content: string = await response.text();
-        result.content = content
+        result.content = content;
         if (format === 'clob') {
-            result.format = 'text'
+            result.format = 'text';
         }
         if (format === 'blob') {
-            result.format = 'binary'
+            result.format = 'binary';
         }
-        return result
+        return result;
     }
 
     /**
@@ -209,9 +209,9 @@ export class ScrapflyClient {
             throw new errors.ApiHttpClientError(JSON.stringify(data));
         }
 
-        const content_format = data.result.format
+        const content_format = data.result.format;
         if (content_format === 'clob' || content_format === 'blob') {
-            data.result = await this.handleLargeObjects(data.result, content_format)
+            data.result = await this.handleLargeObjects(data.result, content_format);
         }
 
         const result = this.handleResponse(

src/result.ts

Lines changed: 9 additions & 5 deletions
@@ -1,6 +1,7 @@
 import type { HttpMethod, Rec } from './types.ts';
 import * as errors from './errors.ts';
 import { cheerio } from './deps.ts';
+import { normalizeHeaders } from './utils.ts';
 
 export type ConfigData = {
     url: string;
@@ -208,7 +209,8 @@
 
     get selector(): cheerio.CheerioAPI {
         if (!this._selector) {
-            if (!this.result.response_headers['content-type'].includes('text/html')) {
+            const headers = normalizeHeaders(this.result.response_headers);
+            if (!headers['content-type'].includes('text/html')) {
                 throw new errors.ContentTypeError(
                     `Cannot use selector on non-html content-type, received: ${this.result.response_headers['content-type']}`,
                 );
@@ -287,20 +289,22 @@ export class ScreenshotResult {
     }
 
     private defineMetadata(response: Response): ScreenshotMetadata {
-        const contentType = response.headers.get('content-type');
+        const headers = normalizeHeaders(response.headers);
+        const contentType = headers['content-type'];
         let extension_name = '';
         if (contentType) {
             extension_name = contentType.split('/')[1].split(';')[0];
         }
         return {
             extension_name: extension_name,
-            upstream_status_code: parseInt(response.headers.get('X-Scrapfly-Upstream-Http-Code') || '200', 10),
-            upstream_url: response.headers.get('X-Scrapfly-Upstream-Url') || '',
+            upstream_status_code: parseInt(headers['x-scrapfly-upstream-http-code'] || '200', 10),
+            upstream_url: headers['x-scrapfly-upstream-url'] || '',
         };
     }
 
     private decodeResponse(response: Response, data: ArrayBuffer): object | null {
-        if (response.headers.get('content-type') === 'json') {
+        const headers = normalizeHeaders(response.headers);
+        if (headers['content-type'] === 'json') {
             return JSON.parse(new TextDecoder().decode(data));
         }
         return null;
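
For reference, a minimal standalone sketch of the lookup pattern defineMetadata now follows. This is not SDK code: sketchMetadata is a hypothetical name, and the relative import assumes the snippet sits at the repository root.

```ts
import { normalizeHeaders } from './src/utils.ts';

// hypothetical mirror of the new defineMetadata() header lookups
function sketchMetadata(response: Response) {
    const headers = normalizeHeaders(response.headers);
    const contentType = headers['content-type'];
    return {
        extension_name: contentType ? contentType.split('/')[1].split(';')[0] : '',
        upstream_status_code: parseInt(headers['x-scrapfly-upstream-http-code'] || '200', 10),
        upstream_url: headers['x-scrapfly-upstream-url'] || '',
    };
}

const resp = new Response('', {
    headers: { 'ContEnT-TyPe': 'image/png', 'X-Scrapfly-Upstream-Http-Code': '201' },
});
console.log(sketchMetadata(resp));
// { extension_name: "png", upstream_status_code: 201, upstream_url: "" }
```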

src/utils.ts

Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,5 @@
 import { log } from './logger.ts';
+import type { Rec } from './types.ts';
 
 export function urlsafe_b64encode(data: string): string {
     const encoder = new TextEncoder();
@@ -53,3 +54,19 @@ export async function fetchRetry(
 
     throw lastError;
 }
+
+export function normalizeHeaders(headers: Rec<string> | Headers): Rec<string> {
+    const normalizedHeaders: Rec<string> = {};
+
+    if (headers instanceof Headers) {
+        headers.forEach((value, key) => {
+            normalizedHeaders[key.toLowerCase()] = value;
+        });
+    } else {
+        Object.keys(headers).forEach((key) => {
+            normalizedHeaders[key.toLowerCase()] = headers[key];
+        });
+    }
+
+    return normalizedHeaders;
+}
