-
Notifications
You must be signed in to change notification settings - Fork 35
feat(extraction): local query DSL for extract_data (#986) #1099
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6d721df
2db67d2
b565579
6c4ff70
98d1eee
4366798
2b487c7
f16af38
2e48202
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,285 @@ | ||
| import type { ExtractionSchema, SchemaProperty } from './schema-validator'; | ||
|
|
||
// Field names must begin with a letter and may contain only letters, digits,
// underscores, and hyphens — rejects anything that could smuggle selector or
// schema syntax into generated output.
const SAFE_FIELD = /^[a-zA-Z][a-zA-Z0-9_-]*$/;
// Type keywords accepted inside a field's parenthesised argument list.
// 'url' and 'date' are hints that the parser lowers to 'string' (with a
// "type hint" note appended to the description).
const SUPPORTED_TYPES = new Set(['string', 'number', 'integer', 'boolean', 'url', 'date']);
|
|
||
/** Token categories produced by the tokenizer. */
type TokenType = 'braceL' | 'braceR' | 'bracketL' | 'bracketR' | 'parenL' | 'parenR' | 'comma' | 'identifier' | 'string' | 'eof';

/** A single lexed token; `pos` is the character offset in the raw query. */
interface Token {
  type: TokenType;
  value: string;
  pos: number;
}

/** One field in the parsed query AST. */
export interface QueryFieldNode {
  // Validated against SAFE_FIELD before being placed in the AST.
  name: string;
  // True when the field was written as `name[] { ... }`.
  list: boolean;
  // Lowered schema type ('url'/'date' become 'string'); absent means default.
  type?: string;
  // Free-text description from string-literal arguments and type hints.
  description?: string;
  // Child fields for object/list fields; absent for scalar fields.
  children?: QueryFieldNode[];
  // Character offset of the field name, for error reporting.
  pos: number;
}

/** Root of a parsed query: the top-level field list. */
export interface QueryAst {
  fields: QueryFieldNode[];
}

/** Result of planning a query: schema plus execution hints. */
export interface ExtractionQueryPlan {
  schema: ExtractionSchema;
  // True when the plan extracts an array of items (single root list field).
  multiple: boolean;
  // Canonical single-line rendering of the query, for caching/logging.
  normalizedQuery: string;
  // Name of the root list field when `multiple` is true.
  rootListField?: string;
}
|
|
||
| export class ExtractionQueryParseError extends Error { | ||
| constructor(message: string, public readonly position: number) { | ||
| super(`${message} at position ${position}`); | ||
| this.name = 'ExtractionQueryParseError'; | ||
| } | ||
| } | ||
|
|
||
| function tokenize(input: string): Token[] { | ||
| const tokens: Token[] = []; | ||
| let i = 0; | ||
| while (i < input.length) { | ||
| const ch = input[i]; | ||
| if (/\s/.test(ch)) { i++; continue; } | ||
| if (ch === '{') { tokens.push({ type: 'braceL', value: ch, pos: i++ }); continue; } | ||
| if (ch === '}') { tokens.push({ type: 'braceR', value: ch, pos: i++ }); continue; } | ||
| if (ch === '[') { tokens.push({ type: 'bracketL', value: ch, pos: i++ }); continue; } | ||
| if (ch === ']') { tokens.push({ type: 'bracketR', value: ch, pos: i++ }); continue; } | ||
| if (ch === '(') { tokens.push({ type: 'parenL', value: ch, pos: i++ }); continue; } | ||
| if (ch === ')') { tokens.push({ type: 'parenR', value: ch, pos: i++ }); continue; } | ||
| if (ch === ',') { tokens.push({ type: 'comma', value: ch, pos: i++ }); continue; } | ||
| if (ch === '"') { | ||
| const start = i; | ||
| i++; | ||
| let value = ''; | ||
| while (i < input.length && input[i] !== '"') { | ||
| if (input[i] === '\\' && i + 1 < input.length) { | ||
| value += input[i + 1]; | ||
| i += 2; | ||
| } else { | ||
| value += input[i++]; | ||
| } | ||
| } | ||
| if (i >= input.length) throw new ExtractionQueryParseError('Unterminated string literal', start); | ||
| i++; | ||
| tokens.push({ type: 'string', value, pos: start }); | ||
| continue; | ||
| } | ||
| if (/[a-zA-Z_]/.test(ch)) { | ||
| const start = i; | ||
| i++; | ||
| while (i < input.length && /[a-zA-Z0-9_-]/.test(input[i])) i++; | ||
| tokens.push({ type: 'identifier', value: input.slice(start, i), pos: start }); | ||
| continue; | ||
| } | ||
| throw new ExtractionQueryParseError(`Unexpected token "${ch}"`, i); | ||
| } | ||
| tokens.push({ type: 'eof', value: '', pos: input.length }); | ||
| return tokens; | ||
| } | ||
|
|
||
/**
 * Recursive-descent parser over the token stream produced by `tokenize`.
 * Informal grammar:
 *   query  := '{' fields '}'
 *   fields := (field ','?)*            // commas between fields are optional
 *   field  := identifier '[]'? ('(' args ')')? ('{' fields '}')?
 *   args   := (identifier | string | ',')*
 */
class Parser {
  // Index of the next unconsumed token.
  private index = 0;
  constructor(private readonly tokens: Token[]) {}

  /**
   * Parses the whole query: one brace-delimited field block followed by
   * end-of-input. Throws ExtractionQueryParseError on malformed input or an
   * empty field list.
   */
  parse(): QueryAst {
    this.expect('braceL');
    const fields = this.parseFields('braceR');
    this.expect('braceR');
    this.expect('eof');
    if (fields.length === 0) throw new ExtractionQueryParseError('Query must contain at least one field', 0);
    return { fields };
  }

  // Collects fields until the closing token (or eof, which the caller's
  // expect() will then report as an error).
  private parseFields(end: TokenType): QueryFieldNode[] {
    const fields: QueryFieldNode[] = [];
    while (this.peek().type !== end && this.peek().type !== 'eof') {
      fields.push(this.parseField());
      if (this.peek().type === 'comma') this.index++;
    }
    return fields;
  }

  /**
   * Parses one field: name, optional `[]` list marker, optional `(args)`
   * carrying a type keyword and/or description, optional `{ children }` block.
   */
  private parseField(): QueryFieldNode {
    const nameToken = this.expect('identifier');
    // Defence in depth: the tokenizer already restricts identifier characters,
    // but re-check before the name can reach generated schema output.
    if (!SAFE_FIELD.test(nameToken.value)) {
      throw new ExtractionQueryParseError(`Unsafe field name "${nameToken.value}"`, nameToken.pos);
    }

    // `name[]` marks a list field; brackets must be empty and adjacent.
    let list = false;
    if (this.peek().type === 'bracketL') {
      this.index++;
      this.expect('bracketR');
      list = true;
    }

    // Optional argument list, e.g. `price(number, "in USD")`.
    let type: string | undefined;
    let description: string | undefined;
    if (this.peek().type === 'parenL') {
      this.index++;
      const args = this.parseArgs();
      type = args.type;
      description = args.description;
      this.expect('parenR');
    }

    // Optional child block turns the field into an object (or, with `[]`,
    // defines the element shape of a list).
    let children: QueryFieldNode[] | undefined;
    if (this.peek().type === 'braceL') {
      this.index++;
      children = this.parseFields('braceR');
      this.expect('braceR');
      if (children.length === 0) {
        throw new ExtractionQueryParseError(`List/object field "${nameToken.value}" must contain at least one child field`, nameToken.pos);
      }
    }

    // A list field without a child block has no element shape — reject it.
    if (list && !children) {
      throw new ExtractionQueryParseError(`List field "${nameToken.value}" must have a child block`, nameToken.pos);
    }

    return { name: nameToken.value, list, type, description, children, pos: nameToken.pos };
  }

  /**
   * Parses the contents of `(...)`. Identifiers are type keywords validated
   * against SUPPORTED_TYPES — `url`/`date` are lowered to `string` with a
   * "type hint" appended to the description. String literals are descriptions.
   * Commas are skipped. A later type keyword overwrites an earlier one;
   * multiple descriptions are concatenated with '; '.
   */
  private parseArgs(): { type?: string; description?: string } {
    let type: string | undefined;
    let description: string | undefined;
    while (this.peek().type !== 'parenR' && this.peek().type !== 'eof') {
      const token = this.peek();
      if (token.type === 'identifier') {
        this.index++;
        const normalized = token.value.toLowerCase();
        if (!SUPPORTED_TYPES.has(normalized)) {
          throw new ExtractionQueryParseError(`Unsupported type "${token.value}"`, token.pos);
        }
        type = normalized === 'url' || normalized === 'date' ? 'string' : normalized;
        if (normalized === 'url' || normalized === 'date') {
          description = description ? `${description}; type hint: ${normalized}` : `type hint: ${normalized}`;
        }
      } else if (token.type === 'string') {
        this.index++;
        description = description ? `${description}; ${token.value}` : token.value;
      } else if (token.type === 'comma') {
        this.index++;
      } else {
        throw new ExtractionQueryParseError(`Unexpected argument token "${token.value}"`, token.pos);
      }
    }
    return { type, description };
  }

  // Returns the current token without consuming it.
  private peek(): Token { return this.tokens[this.index]; }

  // Consumes and returns the current token, or throws if its type differs.
  private expect(type: TokenType): Token {
    const token = this.peek();
    if (token.type !== type) {
      throw new ExtractionQueryParseError(`Expected ${type}, got ${token.type}${token.value ? ` "${token.value}"` : ''}`, token.pos);
    }
    this.index++;
    return token;
  }
}
|
|
||
|
|
||
| function assertScalarFields(fields: QueryFieldNode[], context: string): void { | ||
| for (const field of fields) { | ||
| if (field.list || field.children) { | ||
| throw new ExtractionQueryParseError( | ||
| `Nested field "${field.name}" is not supported in the local query subset (${context}). Use flat fields or one root list block.`, | ||
| field.pos, | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| function normalizeFields(fields: QueryFieldNode[]): string { | ||
| return fields.map((field) => { | ||
| const typePart = field.type ? `(${field.type})` : ''; | ||
| const descPart = field.description ? `:${field.description}` : ''; | ||
| if (field.list) { | ||
| return `${field.name}[]{${normalizeFields(field.children || [])}}${descPart}`; | ||
| } | ||
| if (field.children) { | ||
| return `${field.name}{${normalizeFields(field.children)}}${descPart}`; | ||
| } | ||
| return `${field.name}${typePart}${descPart}`; | ||
| }).join(' '); | ||
| } | ||
|
|
||
| function fieldToProperty(field: QueryFieldNode): SchemaProperty { | ||
| if (field.list) { | ||
| return { | ||
| type: 'array', | ||
| description: field.description, | ||
| items: { | ||
| type: 'object', | ||
| properties: fieldsToProperties(field.children || []), | ||
| required: (field.children || []).map(child => child.name), | ||
| }, | ||
| }; | ||
| } | ||
| if (field.children) { | ||
| return { | ||
| type: 'object', | ||
| description: field.description, | ||
| properties: fieldsToProperties(field.children), | ||
| required: field.children.map(child => child.name), | ||
| }; | ||
| } | ||
| return { | ||
| type: field.type || 'string', | ||
| ...(field.description ? { description: field.description } : {}), | ||
| }; | ||
| } | ||
|
|
||
| function fieldsToProperties(fields: QueryFieldNode[]): Record<string, SchemaProperty> { | ||
| const props: Record<string, SchemaProperty> = {}; | ||
| for (const field of fields) { | ||
| props[field.name] = fieldToProperty(field); | ||
| } | ||
| return props; | ||
| } | ||
|
|
||
| export function parseExtractionQuery(query: string): QueryAst { | ||
| if (!query || query.trim().length === 0) { | ||
| throw new ExtractionQueryParseError('Query must be a non-empty string', 0); | ||
| } | ||
| return new Parser(tokenize(query)).parse(); | ||
| } | ||
|
|
||
| export function buildExtractionQueryPlan(query: string): ExtractionQueryPlan { | ||
| const ast = parseExtractionQuery(query); | ||
| const normalizedQuery = `{ ${normalizeFields(ast.fields)} }`; | ||
| if (ast.fields.length === 1 && ast.fields[0].list) { | ||
| const root = ast.fields[0]; | ||
| assertScalarFields(root.children || [], `root list ${root.name}`); | ||
| return { | ||
| multiple: true, | ||
| normalizedQuery, | ||
| rootListField: root.name, | ||
| schema: { | ||
| type: 'array', | ||
| ...(root.description ? { description: root.description } : {}), | ||
| items: { | ||
| type: 'object', | ||
| properties: fieldsToProperties(root.children || []), | ||
| required: (root.children || []).map(child => child.name), | ||
| }, | ||
| }, | ||
| }; | ||
| } | ||
|
|
||
| assertScalarFields(ast.fields, 'root object'); | ||
|
|
||
| return { | ||
| multiple: false, | ||
| normalizedQuery, | ||
| schema: { | ||
| type: 'object', | ||
| properties: fieldsToProperties(ast.fields), | ||
| required: ast.fields.map(field => field.name), | ||
| }, | ||
| }; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,7 +9,6 @@ interface RuntimeFieldPlan { | |
| field: string; | ||
| aliases: string[]; | ||
| selectorTokens: string[]; | ||
| expectedType?: string | string[]; | ||
| } | ||
|
|
||
| function normalisePlans(fields: FieldInput): RuntimeFieldPlan[] { | ||
|
|
@@ -21,18 +20,13 @@ function normalisePlans(fields: FieldInput): RuntimeFieldPlan[] { | |
| field: field.field, | ||
| aliases: field.aliases, | ||
| selectorTokens: field.selectorTokens, | ||
| expectedType: field.expectedType, | ||
| }; | ||
| }); | ||
| } | ||
|
|
||
| export function buildJsonLdExtractor(fields: FieldInput): string { | ||
| const plans = normalisePlans(fields); | ||
| // val(v, t): project JSON-LD value based on declared schema type. | ||
| // Scalar types attempt common JSON-LD shape projections; object/untyped preserve as-is. | ||
| // IIFE scalar projection keys tried in order: @value, value, ratingValue, name. | ||
| const valFn = `function val(v,t){if(v===null||v===undefined)return v;var st={'string':1,'number':1,'integer':1,'boolean':1};var ts=Array.isArray(t)?t:[t];var isScalar=ts.some(function(x){return st[x]});if(!isScalar)return v;if(typeof v!=='object')return v;var proj=['@value','value','ratingValue','name'];for(var pi=0;pi<proj.length;pi++){if(has(v,proj[pi]))return v[proj[pi]]}return v}`; | ||
| return `(function(fp){var r=Object.create(null);var sc=document.querySelectorAll('script[type="application/ld+json"]');function has(o,k){return !!o&&Object.prototype.hasOwnProperty.call(o,k)}function get(o,k){return has(o,k)?o[k]:undefined}function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}function read(item,keys){for(var a=0;a<keys.length;a++){var key=keys[a];if(has(item,key))return item[key];var nk=norm(key);if(nk){for(var p in item){if(!has(item,p))continue;if(norm(p)===nk)return item[p]}}}return undefined}${valFn}for(var i=0;i<sc.length;i++){try{var d=JSON.parse(sc[i].textContent||'');var g=get(d,'@graph');var it=Array.isArray(d)?d:(g?g:[d]);for(var j=0;j<it.length;j++){var item=it[j];if(!item||typeof item!=='object')continue;for(var k=0;k<fp.length;k++){var f=fp[k];if(has(r,f.field)&&r[f.field]!=null)continue;var keys=f.aliases&&f.aliases.length?f.aliases:[f.field];var v=read(item,keys);var offers=get(item,'offers');if(v===undefined&&offers){var of=Array.isArray(offers)?offers:[offers];for(var o=0;o<of.length;o++){v=read(of[o],keys);if(v!==undefined)break}}if(v!==undefined)r[f.field]=val(v,f.expectedType)}}}catch(e){}}return r})(${JSON.stringify(plans)})`; | ||
| return `(function(fp){var r={};var sc=document.querySelectorAll('script[type="application/ld+json"]');function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}function read(item,keys){for(var a=0;a<keys.length;a++){var key=keys[a];if(item[key]!==undefined)return item[key];var nk=norm(key);for(var p in item){if(norm(p)===nk)return item[p]}}return undefined}function val(v){if(typeof v==='object'&&v&&!Array.isArray(v)){return v.name||v['@value']||v.value||v.ratingValue||JSON.stringify(v)}return v}for(var i=0;i<sc.length;i++){try{var d=JSON.parse(sc[i].textContent||'');var it=Array.isArray(d)?d:(d['@graph']?d['@graph']:[d]);for(var j=0;j<it.length;j++){var item=it[j];if(!item||typeof item!=='object')continue;for(var k=0;k<fp.length;k++){var f=fp[k];if(r[f.field]!=null)continue;var keys=f.aliases&&f.aliases.length?f.aliases:[f.field];var v=read(item,keys);if(v===undefined&&item.offers){var of=Array.isArray(item.offers)?item.offers:[item.offers];for(var o=0;o<of.length;o++){v=read(of[o],keys);if(v!==undefined)break}}if(v!==undefined)r[f.field]=val(v)}}}catch(e){}}return r})(${JSON.stringify(plans)})`; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The new JSON-LD script now reads values via direct bracket access (`item[key] !== undefined`) and bare `for..in` loops instead of the previous `hasOwnProperty`-guarded `has`/`get` helpers, and the result object is a plain `{}` rather than `Object.create(null)` — so inherited/prototype properties can leak into lookups and results. Please confirm this loosening is intentional. Useful? React with 👍 / 👎. |
||
| } | ||
|
|
||
| export function buildMicrodataExtractor(fields: FieldInput): string { | ||
|
|
@@ -42,7 +36,7 @@ export function buildMicrodataExtractor(fields: FieldInput): string { | |
|
|
||
| export function buildOpenGraphExtractor(fields: FieldInput): string { | ||
| const plans = normalisePlans(fields); | ||
| return `(function(fp){var r={};var mp={'title':['og:title','twitter:title'],'name':['og:title','twitter:title','og:site_name'],'headline':['og:title','twitter:title'],'description':['og:description','twitter:description','description'],'summary':['og:description','twitter:description','description'],'image':['og:image','twitter:image'],'imageurl':['og:image','twitter:image'],'url':['og:url'],'link':['og:url'],'author':['author','article:author'],'publisheddate':['article:published_time','date'],'publishedat':['article:published_time','date'],'date':['article:published_time','date'],'sitename':['og:site_name']};function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}for(var i=0;i<fp.length;i++){var f=fp[i];if(r[f.field]!=null)continue;var cd=[];for(var a=0;a<(f.aliases||[]).length;a++){var alias=f.aliases[a];var key=norm(alias);var mapped=mp[key]||[];cd=cd.concat(mapped);if(cd.indexOf(alias)===-1)cd.push(alias)}for(var c=0;c<cd.length;c++){try{var m=document.querySelector('meta[property="'+cd[c]+'"]')||document.querySelector('meta[name="'+cd[c]+'"]');if(m){var ct=m.getAttribute('content');if(ct){r[f.field]=ct;break}}}catch(e){}}if(!r[f.field]&&(norm(f.field)==='url'||norm(f.field)==='canonical')){var lk=document.querySelector('link[rel="canonical"]');if(lk)r[f.field]=lk.getAttribute('href')}}return r})(${JSON.stringify(plans)})`; | ||
| return `(function(fp){var r={};var mp={'title':['og:title','twitter:title'],'name':['og:title','twitter:title','og:site_name'],'headline':['og:title','twitter:title'],'description':['og:description','twitter:description','description'],'summary':['og:description','twitter:description','description'],'image':['og:image','twitter:image'],'imageurl':['og:image','twitter:image'],'url':['og:url'],'link':['og:url'],'author':['author','article:author'],'publisheddate':['article:published_time','date'],'publishedat':['article:published_time','date'],'date':['article:published_time','date'],'site_name':['og:site_name']};function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}for(var i=0;i<fp.length;i++){var f=fp[i];if(r[f.field]!=null)continue;var cd=[];for(var a=0;a<(f.aliases||[]).length;a++){var key=norm(f.aliases[a]);cd=cd.concat(mp[key]||[f.aliases[a]])}for(var c=0;c<cd.length;c++){try{var m=document.querySelector('meta[property="'+cd[c]+'"]')||document.querySelector('meta[name="'+cd[c]+'"]');if(m){var ct=m.getAttribute('content');if(ct){r[f.field]=ct;break}}}catch(e){}}if(!r[f.field]&&(norm(f.field)==='url'||norm(f.field)==='canonical')){var lk=document.querySelector('link[rel="canonical"]');if(lk)r[f.field]=lk.getAttribute('href')}}return r})(${JSON.stringify(plans)})`; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Use the normalized map key (`'sitename'`, not `'site_name'`) in `mp`: the lookup goes through `norm()`, which strips non-alphanumeric characters, so a `'site_name'` entry can never match a normalized alias and the `og:site_name` mapping becomes unreachable. Useful? React with 👍 / 👎. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When an alias has a mapping entry (for example `title`), the new `mp[key]||[f.aliases[a]]` expression no longer appends the raw alias itself as a fallback candidate, so a plain `<meta name="title">` tag that previously matched will now be missed. Useful? React with 👍 / 👎. |
||
| } | ||
|
|
||
| export function buildCssHeuristicExtractor(fields: FieldInput, schemaProps: Record<string, SchemaProperty>, scopeSelector?: string): string { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stop projecting every JSON-LD object through
`val()` without consulting the field schema type. In this commit `expectedType` was removed from runtime plans, so `val()` now converts objects to `name`/`@value`/`value`/`ratingValue`/JSON string for all fields; when callers request an object field (e.g. `brand` or `aggregateRating`), `validateAndCoerce` later sees a string and coerces it to `null`, causing previously extractable structured fields to disappear. Useful? React with 👍 / 👎.