Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit d1883f0

Browse files
committed Nov 7, 2024
add scrape extraction params and rename extraction template options
1 parent 923097b commit d1883f0

File tree

7 files changed

+108
-23
lines changed

7 files changed

+108
-23
lines changed
 

‎__tests__/client/extraction.test.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,8 @@ Deno.test('extract: fails due to invalid config', async () => {
3939
new ExtractionConfig({
4040
body: html,
4141
content_type: 'text/html',
42-
ephemeral_template: { source: 'html' },
43-
template: 'template',
42+
extraction_ephemeral_template: { source: 'html' },
43+
extraction_template: 'template',
4444
}),
4545
);
4646
},

‎__tests__/config/extraction.test.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -58,11 +58,11 @@ Deno.test('url param generation: sets charset', async () => {
5858
});
5959
});
6060

61-
Deno.test('url param generation: sets template', async () => {
61+
Deno.test('url param generation: sets extraction_template', async () => {
6262
const config = new ExtractionConfig({
6363
body: input_html,
6464
content_type: input_content_type,
65-
template: 'my_template',
65+
extraction_template: 'my_template',
6666
});
6767
const params = config.toApiParams({ key: '1234' });
6868
assertEquals(params, {
@@ -72,11 +72,11 @@ Deno.test('url param generation: sets template', async () => {
7272
});
7373
});
7474

75-
Deno.test('url param generation: sets ephemeral_template', async () => {
75+
Deno.test('url param generation: sets extraction_ephemeral_template', async () => {
7676
const config = new ExtractionConfig({
7777
body: input_html,
7878
content_type: input_content_type,
79-
ephemeral_template: { source: 'html', selectors: [] },
79+
extraction_ephemeral_template: { source: 'html', selectors: [] },
8080
});
8181
const params = config.toApiParams({ key: '1234' });
8282
assertEquals(params, {

‎__tests__/config/scrape.test.ts

Lines changed: 53 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ import { HttpMethod } from '../../src/types.ts';
33
import { ScrapeConfigError } from '../../src/errors.ts';
44
import { assertEquals, assertThrows } from "https://deno.land/std@0.224.0/assert/mod.ts";
55

6+
const input_content_type = 'text/html';
67

78
Deno.test('scrapeconfig loads', () => {
89
const config = new ScrapeConfig({ url: 'http://httpbin.dev/get' });
@@ -15,8 +16,6 @@ Deno.test('scrapeconfig throws on unknown options', () => {
1516
}, ScrapeConfigError, "Invalid option provided: foobar");
1617
});
1718

18-
19-
2019
Deno.test('scrapeconfig allowed methods', () => {
2120
(['GET', 'POST', 'PUT', 'PATCH', 'HEAD'] as HttpMethod[]).forEach((method) => {
2221
const config = new ScrapeConfig({
@@ -360,6 +359,58 @@ Deno.test('url param generation: proxy_pool sets', () => {
360359
});
361360
});
362361

362+
Deno.test('url param generation: sets extraction_template', async () => {
363+
const config = new ScrapeConfig({
364+
url: 'http://httpbin.dev/get',
365+
extraction_template: 'my_template',
366+
});
367+
const params = config.toApiParams({ key: '1234' });
368+
assertEquals(params, {
369+
key: '1234',
370+
url: 'http://httpbin.dev/get',
371+
extraction_template: 'my_template',
372+
});
373+
});
374+
375+
Deno.test('url param generation: sets extraction_ephemeral_template', async () => {
376+
const config = new ScrapeConfig({
377+
url: 'http://httpbin.dev/get',
378+
extraction_ephemeral_template: { source: 'html', selectors: [] },
379+
});
380+
const params = config.toApiParams({ key: '1234' });
381+
assertEquals(params, {
382+
key: '1234',
383+
url: 'http://httpbin.dev/get',
384+
extraction_template: 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0',
385+
});
386+
});
387+
388+
Deno.test('url param generation: sets extraction_prompt', async () => {
389+
const config = new ScrapeConfig({
390+
url: 'http://httpbin.dev/get',
391+
extraction_prompt: 'summarize the document',
392+
});
393+
const params = config.toApiParams({ key: '1234' });
394+
assertEquals(params, {
395+
key: '1234',
396+
url: 'http://httpbin.dev/get',
397+
extraction_prompt: 'summarize the document',
398+
});
399+
});
400+
401+
Deno.test('url param generation: sets extraction_model', async () => {
402+
const config = new ScrapeConfig({
403+
url: 'http://httpbin.dev/get',
404+
extraction_model: 'review_list',
405+
});
406+
const params = config.toApiParams({ key: '1234' });
407+
assertEquals(params, {
408+
key: '1234',
409+
url: 'http://httpbin.dev/get',
410+
extraction_model: 'review_list',
411+
});
412+
});
413+
363414
Deno.test('url param generation: session sets', () => {
364415
const config = new ScrapeConfig({
365416
url: 'http://httpbin.dev/get',

‎examples/deno/deno_examples.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -190,7 +190,7 @@ export async function extractionTemplates(apiKey: string){
190190
body: html,
191191
content_type: "text/html",
192192
// provide template:
193-
ephemeral_template: template,
193+
extraction_ephemeral_template: template,
194194
})
195195
);
196196
console.log('product extract');

‎examples/node_commonjs/commonjs_examples.cjs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -190,7 +190,7 @@ async function extractionTemplates(apiKey){
190190
body: html,
191191
content_type: "text/html",
192192
// provide template:
193-
ephemeral_template: template,
193+
extraction_ephemeral_template: template,
194194
})
195195
);
196196
console.log('product extract');

‎src/extractionconfig.ts

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import * as errors from './errors.ts';
22
import { urlsafe_b64encode } from './utils.ts';
3+
import { ExtractionConfigError } from './errors.ts';
34

45
export enum CompressionFormat {
56
/**
@@ -21,8 +22,8 @@ type ExtractionConfigOptions = {
2122
content_type: string;
2223
url?: string;
2324
charset?: string;
24-
template?: string; // saved template name
25-
ephemeral_template?: object; // ephemeraly declared json template
25+
extraction_template?: string; // saved template name
26+
extraction_ephemeral_template?: object; // ephemeraly declared json template
2627
extraction_prompt?: string;
2728
extraction_model?: string;
2829
is_document_compressed?: boolean;
@@ -35,8 +36,8 @@ export class ExtractionConfig {
3536
content_type: string;
3637
url?: string;
3738
charset?: string;
38-
template?: string; // saved template name
39-
ephemeral_template?: object; // ephemeraly declared json template
39+
extraction_template?: string; // saved template name
40+
extraction_ephemeral_template?: object; // ephemeraly declared json template
4041
extraction_prompt?: string;
4142
extraction_model?: string;
4243
is_document_compressed?: boolean;
@@ -57,8 +58,8 @@ export class ExtractionConfig {
5758
this.content_type = options.content_type;
5859
this.url = options.url ?? this.url;
5960
this.charset = options.charset ?? this.charset;
60-
this.template = options.template ?? this.template;
61-
this.ephemeral_template = options.ephemeral_template ?? this.ephemeral_template;
61+
this.extraction_template = options.extraction_template ?? this.extraction_template;
62+
this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? this.extraction_ephemeral_template;
6263
this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt;
6364
this.extraction_model = options.extraction_model ?? this.extraction_model;
6465
this.is_document_compressed = options.is_document_compressed ?? this.is_document_compressed;
@@ -90,18 +91,18 @@ export class ExtractionConfig {
9091
params.charset = this.charset;
9192
}
9293

93-
if (this.template && this.ephemeral_template) {
94-
throw new errors.ExtractionConfigError(
95-
'You cannot pass both parameters template and ephemeral_template. You must choose',
94+
if (this.extraction_template && this.extraction_ephemeral_template) {
95+
throw new ExtractionConfigError(
96+
'You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose',
9697
);
9798
}
9899

99-
if (this.template) {
100-
params.extraction_template = this.template;
100+
if (this.extraction_template) {
101+
params.extraction_template = this.extraction_template;
101102
}
102103

103-
if (this.ephemeral_template) {
104-
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.ephemeral_template));
104+
if (this.extraction_ephemeral_template) {
105+
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template));
105106
}
106107

107108
if (this.extraction_prompt) {

‎src/scrapeconfig.ts

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,10 @@ type ScrapeConfigOptions = {
6161
tags?: string[];
6262
format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format;
6363
format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[];
64+
extraction_template?: string; // saved template name
65+
extraction_ephemeral_template?: object; // ephemeraly declared json template
66+
extraction_prompt?: string;
67+
extraction_model?: string;
6468
correlation_id?: string;
6569
cookies?: Rec<string>;
6670
body?: string;
@@ -104,6 +108,10 @@ export class ScrapeConfig {
104108
tags: Set<string> = new Set<string>();
105109
format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format;
106110
format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[];
111+
extraction_template?: string; // saved template name
112+
extraction_ephemeral_template?: object; // ephemeraly declared json template
113+
extraction_prompt?: string;
114+
extraction_model?: string;
107115
correlation_id?: string;
108116
cookies?: Rec<string>;
109117
body?: string;
@@ -163,6 +171,10 @@ export class ScrapeConfig {
163171
this.tags = new Set(options.tags) ?? this.tags;
164172
this.format = options.format ?? this.format;
165173
this.format_options = options.format_options ?? this.format_options;
174+
this.extraction_template = options.extraction_template ?? this.extraction_template;
175+
this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? this.extraction_ephemeral_template;
176+
this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt;
177+
this.extraction_model = options.extraction_model ?? this.extraction_model;
166178
this.correlation_id = options.correlation_id ?? this.correlation_id;
167179
this.cookies = options.cookies
168180
? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v]))
@@ -338,6 +350,27 @@ export class ScrapeConfig {
338350
params.format += ':' + this.format_options.join(',');
339351
}
340352
}
353+
if (this.extraction_template && this.extraction_ephemeral_template) {
354+
throw new ScrapeConfigError(
355+
'You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose',
356+
);
357+
}
358+
359+
if (this.extraction_template) {
360+
params.extraction_template = this.extraction_template;
361+
}
362+
363+
if (this.extraction_ephemeral_template) {
364+
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template));
365+
}
366+
367+
if (this.extraction_prompt) {
368+
params.extraction_prompt = this.extraction_prompt;
369+
}
370+
371+
if (this.extraction_model) {
372+
params.extraction_model = this.extraction_model;
373+
}
341374
if (this.correlation_id) {
342375
params.correlation_id = this.correlation_id;
343376
}

0 commit comments

Comments
 (0)