Skip to content
Merged
2 changes: 2 additions & 0 deletions src/extraction/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ export {
export type { StrategyResult } from './strategies';
export { buildExtractionPlan, buildFieldPlan, isSafeSelectorToken } from './plan';
export type { ExtractionPlan, ExtractionFieldPlan } from './plan';
export { buildExtractionQueryPlan, parseExtractionQuery, ExtractionQueryParseError } from './query-parser';
export type { ExtractionQueryPlan } from './query-parser';
285 changes: 285 additions & 0 deletions src/extraction/query-parser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
import type { ExtractionSchema, SchemaProperty } from './schema-validator';

// Field names must start with a letter; subsequent characters may be
// letters, digits, underscores, or hyphens. Enforced at parse time.
const SAFE_FIELD = /^[a-zA-Z][a-zA-Z0-9_-]*$/;
// Type names accepted inside a field's `(...)` argument list. Note that
// 'url' and 'date' are hints only — they are mapped to 'string' downstream.
const SUPPORTED_TYPES = new Set(['string', 'number', 'integer', 'boolean', 'url', 'date']);

// Token categories emitted by tokenize(): one per punctuation character,
// plus identifiers, double-quoted strings, and a terminating 'eof' marker.
type TokenType = 'braceL' | 'braceR' | 'bracketL' | 'bracketR' | 'parenL' | 'parenR' | 'comma' | 'identifier' | 'string' | 'eof';

// A single lexed token; `pos` is the zero-based character offset in the
// source query, kept for error reporting.
interface Token {
  type: TokenType;
  value: string;
  pos: number;
}

/** One field parsed from a query, possibly carrying nested child fields. */
export interface QueryFieldNode {
  name: string;              // identifier, validated against SAFE_FIELD
  list: boolean;             // true when written as `name[]`
  type?: string;             // scalar type from `(type)` args, if given
  description?: string;      // accumulated string args / type hints
  children?: QueryFieldNode[]; // present for `{ ... }` child blocks
  pos: number;               // source offset of the field name
}

/** Root of a parsed query: the top-level field list. */
export interface QueryAst {
  fields: QueryFieldNode[];
}

/** Result of compiling a query into an executable extraction plan. */
export interface ExtractionQueryPlan {
  schema: ExtractionSchema;     // JSON-schema-shaped description of the output
  multiple: boolean;            // true when the query is a single root list
  normalizedQuery: string;      // canonical re-rendering of the parsed query
  rootListField?: string;       // name of the root list field, when multiple
}

/**
 * Error raised when an extraction query cannot be tokenized or parsed.
 * Carries the zero-based character offset of the offending input so
 * callers can point at the exact location in the query string.
 */
export class ExtractionQueryParseError extends Error {
  public readonly position: number;

  constructor(message: string, position: number) {
    super(`${message} at position ${position}`);
    this.position = position;
    this.name = 'ExtractionQueryParseError';
  }
}

function tokenize(input: string): Token[] {
const tokens: Token[] = [];
let i = 0;
while (i < input.length) {
const ch = input[i];
if (/\s/.test(ch)) { i++; continue; }
if (ch === '{') { tokens.push({ type: 'braceL', value: ch, pos: i++ }); continue; }
if (ch === '}') { tokens.push({ type: 'braceR', value: ch, pos: i++ }); continue; }
if (ch === '[') { tokens.push({ type: 'bracketL', value: ch, pos: i++ }); continue; }
if (ch === ']') { tokens.push({ type: 'bracketR', value: ch, pos: i++ }); continue; }
if (ch === '(') { tokens.push({ type: 'parenL', value: ch, pos: i++ }); continue; }
if (ch === ')') { tokens.push({ type: 'parenR', value: ch, pos: i++ }); continue; }
if (ch === ',') { tokens.push({ type: 'comma', value: ch, pos: i++ }); continue; }
if (ch === '"') {
const start = i;
i++;
let value = '';
while (i < input.length && input[i] !== '"') {
if (input[i] === '\\' && i + 1 < input.length) {
value += input[i + 1];
i += 2;
} else {
value += input[i++];
}
}
if (i >= input.length) throw new ExtractionQueryParseError('Unterminated string literal', start);
i++;
tokens.push({ type: 'string', value, pos: start });
continue;
}
if (/[a-zA-Z_]/.test(ch)) {
const start = i;
i++;
while (i < input.length && /[a-zA-Z0-9_-]/.test(input[i])) i++;
tokens.push({ type: 'identifier', value: input.slice(start, i), pos: start });
continue;
}
throw new ExtractionQueryParseError(`Unexpected token "${ch}"`, i);
}
tokens.push({ type: 'eof', value: '', pos: input.length });
return tokens;
}

/**
 * Recursive-descent parser over the token stream produced by tokenize().
 *
 * Informal grammar:
 *   query  := '{' fields '}'
 *   fields := field (','? field)*          // commas between fields are optional
 *   field  := identifier '[]'? ('(' args ')')? ('{' fields '}')?
 *   args   := (identifier | string | ',')* // identifier = type, string = description
 */
class Parser {
  // Index of the next unconsumed token in `tokens`.
  private index = 0;
  constructor(private readonly tokens: Token[]) {}

  /**
   * Parses the full query. The input must be exactly one brace-delimited
   * block followed by end-of-input, with at least one field inside.
   * @throws ExtractionQueryParseError on any structural violation.
   */
  parse(): QueryAst {
    this.expect('braceL');
    const fields = this.parseFields('braceR');
    this.expect('braceR');
    this.expect('eof');
    if (fields.length === 0) throw new ExtractionQueryParseError('Query must contain at least one field', 0);
    return { fields };
  }

  /**
   * Parses fields until the given closing token (or eof) is seen.
   * Does not consume the closing token itself; the caller expects it.
   */
  private parseFields(end: TokenType): QueryFieldNode[] {
    const fields: QueryFieldNode[] = [];
    while (this.peek().type !== end && this.peek().type !== 'eof') {
      fields.push(this.parseField());
      // Commas between fields are optional separators.
      if (this.peek().type === 'comma') this.index++;
    }
    return fields;
  }

  /**
   * Parses one field. Order of optional suffixes is fixed:
   * `[]` list marker, then `(...)` args, then `{...}` child block.
   */
  private parseField(): QueryFieldNode {
    const nameToken = this.expect('identifier');
    // Tokenizer allows a leading underscore; reject it here per SAFE_FIELD.
    if (!SAFE_FIELD.test(nameToken.value)) {
      throw new ExtractionQueryParseError(`Unsafe field name "${nameToken.value}"`, nameToken.pos);
    }

    // Optional `[]` marks the field as a list.
    let list = false;
    if (this.peek().type === 'bracketL') {
      this.index++;
      this.expect('bracketR');
      list = true;
    }

    // Optional `(type, "description", ...)` argument list.
    let type: string | undefined;
    let description: string | undefined;
    if (this.peek().type === 'parenL') {
      this.index++;
      const args = this.parseArgs();
      type = args.type;
      description = args.description;
      this.expect('parenR');
    }

    // Optional `{ ... }` child block; must be non-empty when present.
    let children: QueryFieldNode[] | undefined;
    if (this.peek().type === 'braceL') {
      this.index++;
      children = this.parseFields('braceR');
      this.expect('braceR');
      if (children.length === 0) {
        throw new ExtractionQueryParseError(`List/object field "${nameToken.value}" must contain at least one child field`, nameToken.pos);
      }
    }

    // A list field without a child block has no item shape — reject it.
    if (list && !children) {
      throw new ExtractionQueryParseError(`List field "${nameToken.value}" must have a child block`, nameToken.pos);
    }

    return { name: nameToken.value, list, type, description, children, pos: nameToken.pos };
  }

  /**
   * Parses the contents of a `(...)` argument list. Identifiers are type
   * names (must be in SUPPORTED_TYPES); strings are appended to the
   * description. 'url' and 'date' map to 'string' and become description
   * hints. A later identifier overwrites an earlier type.
   */
  private parseArgs(): { type?: string; description?: string } {
    let type: string | undefined;
    let description: string | undefined;
    while (this.peek().type !== 'parenR' && this.peek().type !== 'eof') {
      const token = this.peek();
      if (token.type === 'identifier') {
        this.index++;
        const normalized = token.value.toLowerCase();
        if (!SUPPORTED_TYPES.has(normalized)) {
          throw new ExtractionQueryParseError(`Unsupported type "${token.value}"`, token.pos);
        }
        // Schema types only support string/number/integer/boolean; url and
        // date degrade to string plus a description hint for the extractor.
        type = normalized === 'url' || normalized === 'date' ? 'string' : normalized;
        if (normalized === 'url' || normalized === 'date') {
          description = description ? `${description}; type hint: ${normalized}` : `type hint: ${normalized}`;
        }
      } else if (token.type === 'string') {
        this.index++;
        // Multiple string args are joined with '; '.
        description = description ? `${description}; ${token.value}` : token.value;
      } else if (token.type === 'comma') {
        this.index++;
      } else {
        throw new ExtractionQueryParseError(`Unexpected argument token "${token.value}"`, token.pos);
      }
    }
    return { type, description };
  }

  /** Returns the current token without consuming it. */
  private peek(): Token { return this.tokens[this.index]; }

  /** Consumes and returns the current token, or throws if the type differs. */
  private expect(type: TokenType): Token {
    const token = this.peek();
    if (token.type !== type) {
      throw new ExtractionQueryParseError(`Expected ${type}, got ${token.type}${token.value ? ` "${token.value}"` : ''}`, token.pos);
    }
    this.index++;
    return token;
  }
}


/**
 * Guards that every field in `fields` is a plain scalar — no `[]` list
 * marker and no nested child block. Used to enforce the flat query subset
 * supported by the local extractor.
 * @throws ExtractionQueryParseError for the first non-scalar field found.
 */
function assertScalarFields(fields: QueryFieldNode[], context: string): void {
  const offender = fields.find((entry) => entry.list || Boolean(entry.children));
  if (offender) {
    throw new ExtractionQueryParseError(
      `Nested field "${offender.name}" is not supported in the local query subset (${context}). Use flat fields or one root list block.`,
      offender.pos,
    );
  }
}

/**
 * Renders an AST field list back into a canonical space-separated string.
 * Scalars render as `name(type):description`, list fields as
 * `name[]{children}:description`, object fields as `name{children}:description`
 * (type is omitted on list/object fields). Children render recursively.
 */
function normalizeFields(fields: QueryFieldNode[]): string {
  const rendered: string[] = [];
  for (const field of fields) {
    const desc = field.description ? `:${field.description}` : '';
    if (field.list) {
      rendered.push(`${field.name}[]{${normalizeFields(field.children || [])}}${desc}`);
    } else if (field.children) {
      rendered.push(`${field.name}{${normalizeFields(field.children)}}${desc}`);
    } else {
      const type = field.type ? `(${field.type})` : '';
      rendered.push(`${field.name}${type}${desc}`);
    }
  }
  return rendered.join(' ');
}

/**
 * Converts one AST field into a SchemaProperty:
 * - list fields   -> array whose items are an object built from the children
 * - object fields -> object with all children present and required
 * - scalars       -> { type, description? }, type defaulting to 'string'
 * List/object results always carry a `description` key (possibly undefined);
 * scalars include it only when non-empty.
 */
function fieldToProperty(field: QueryFieldNode): SchemaProperty {
  if (field.list) {
    const children = field.children || [];
    return {
      type: 'array',
      description: field.description,
      items: {
        type: 'object',
        properties: fieldsToProperties(children),
        required: children.map((child) => child.name),
      },
    };
  }
  if (field.children) {
    return {
      type: 'object',
      description: field.description,
      properties: fieldsToProperties(field.children),
      required: field.children.map((child) => child.name),
    };
  }
  const scalar: SchemaProperty = { type: field.type || 'string' };
  if (field.description) {
    scalar.description = field.description;
  }
  return scalar;
}

/** Builds the name -> SchemaProperty map for a list of sibling fields. */
function fieldsToProperties(fields: QueryFieldNode[]): Record<string, SchemaProperty> {
  const properties: Record<string, SchemaProperty> = {};
  for (const field of fields) {
    properties[field.name] = fieldToProperty(field);
  }
  return properties;
}

/**
 * Parses a GraphQL-like extraction query string into an AST.
 * @throws ExtractionQueryParseError when the query is empty, whitespace-only,
 *         or structurally malformed.
 */
export function parseExtractionQuery(query: string): QueryAst {
  const trimmed = query ? query.trim() : '';
  if (trimmed.length === 0) {
    throw new ExtractionQueryParseError('Query must be a non-empty string', 0);
  }
  // Tokenize the original (untrimmed) input so token positions match it.
  return new Parser(tokenize(query)).parse();
}

export function buildExtractionQueryPlan(query: string): ExtractionQueryPlan {
const ast = parseExtractionQuery(query);
const normalizedQuery = `{ ${normalizeFields(ast.fields)} }`;
if (ast.fields.length === 1 && ast.fields[0].list) {
const root = ast.fields[0];
assertScalarFields(root.children || [], `root list ${root.name}`);
return {
multiple: true,
normalizedQuery,
rootListField: root.name,
schema: {
type: 'array',
...(root.description ? { description: root.description } : {}),
items: {
type: 'object',
properties: fieldsToProperties(root.children || []),
required: (root.children || []).map(child => child.name),
},
},
};
}

assertScalarFields(ast.fields, 'root object');

return {
multiple: false,
normalizedQuery,
schema: {
type: 'object',
properties: fieldsToProperties(ast.fields),
required: ast.fields.map(field => field.name),
},
};
}
10 changes: 2 additions & 8 deletions src/extraction/strategies.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ interface RuntimeFieldPlan {
field: string;
aliases: string[];
selectorTokens: string[];
expectedType?: string | string[];
}

function normalisePlans(fields: FieldInput): RuntimeFieldPlan[] {
Expand All @@ -21,18 +20,13 @@ function normalisePlans(fields: FieldInput): RuntimeFieldPlan[] {
field: field.field,
aliases: field.aliases,
selectorTokens: field.selectorTokens,
expectedType: field.expectedType,
};
});
}

export function buildJsonLdExtractor(fields: FieldInput): string {
const plans = normalisePlans(fields);
// val(v, t): project JSON-LD value based on declared schema type.
// Scalar types attempt common JSON-LD shape projections; object/untyped preserve as-is.
// IIFE scalar projection keys tried in order: @value, value, ratingValue, name.
const valFn = `function val(v,t){if(v===null||v===undefined)return v;var st={'string':1,'number':1,'integer':1,'boolean':1};var ts=Array.isArray(t)?t:[t];var isScalar=ts.some(function(x){return st[x]});if(!isScalar)return v;if(typeof v!=='object')return v;var proj=['@value','value','ratingValue','name'];for(var pi=0;pi<proj.length;pi++){if(has(v,proj[pi]))return v[proj[pi]]}return v}`;
return `(function(fp){var r=Object.create(null);var sc=document.querySelectorAll('script[type="application/ld+json"]');function has(o,k){return !!o&&Object.prototype.hasOwnProperty.call(o,k)}function get(o,k){return has(o,k)?o[k]:undefined}function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}function read(item,keys){for(var a=0;a<keys.length;a++){var key=keys[a];if(has(item,key))return item[key];var nk=norm(key);if(nk){for(var p in item){if(!has(item,p))continue;if(norm(p)===nk)return item[p]}}}return undefined}${valFn}for(var i=0;i<sc.length;i++){try{var d=JSON.parse(sc[i].textContent||'');var g=get(d,'@graph');var it=Array.isArray(d)?d:(g?g:[d]);for(var j=0;j<it.length;j++){var item=it[j];if(!item||typeof item!=='object')continue;for(var k=0;k<fp.length;k++){var f=fp[k];if(has(r,f.field)&&r[f.field]!=null)continue;var keys=f.aliases&&f.aliases.length?f.aliases:[f.field];var v=read(item,keys);var offers=get(item,'offers');if(v===undefined&&offers){var of=Array.isArray(offers)?offers:[offers];for(var o=0;o<of.length;o++){v=read(of[o],keys);if(v!==undefined)break}}if(v!==undefined)r[f.field]=val(v,f.expectedType)}}}catch(e){}}return r})(${JSON.stringify(plans)})`;
return `(function(fp){var r={};var sc=document.querySelectorAll('script[type="application/ld+json"]');function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}function read(item,keys){for(var a=0;a<keys.length;a++){var key=keys[a];if(item[key]!==undefined)return item[key];var nk=norm(key);for(var p in item){if(norm(p)===nk)return item[p]}}return undefined}function val(v){if(typeof v==='object'&&v&&!Array.isArray(v)){return v.name||v['@value']||v.value||v.ratingValue||JSON.stringify(v)}return v}for(var i=0;i<sc.length;i++){try{var d=JSON.parse(sc[i].textContent||'');var it=Array.isArray(d)?d:(d['@graph']?d['@graph']:[d]);for(var j=0;j<it.length;j++){var item=it[j];if(!item||typeof item!=='object')continue;for(var k=0;k<fp.length;k++){var f=fp[k];if(r[f.field]!=null)continue;var keys=f.aliases&&f.aliases.length?f.aliases:[f.field];var v=read(item,keys);if(v===undefined&&item.offers){var of=Array.isArray(item.offers)?item.offers:[item.offers];for(var o=0;o<of.length;o++){v=read(of[o],keys);if(v!==undefined)break}}if(v!==undefined)r[f.field]=val(v)}}}catch(e){}}return r})(${JSON.stringify(plans)})`;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve object-typed JSON-LD values during projection

Stop projecting every JSON-LD object through val() without consulting the field schema type. In this commit expectedType was removed from runtime plans, so val() now converts objects to name/@value/value/ratingValue/JSON string for all fields; when callers request an object field (e.g. brand or aggregateRating), validateAndCoerce later sees a string and coerces it to null, causing previously extractable structured fields to disappear.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Restore own-property checks in JSON-LD extraction

The new JSON-LD script now reads values via item[key] and iterates for (var p in item) without hasOwnProperty guards, and it stores results in a plain {}. On pages where Object.prototype is extended (by site code or third-party scripts), inherited properties can be mistaken for extracted fields, or inherited r[field] values can block real assignments, producing incorrect output. The previous implementation explicitly guarded against inherited keys, so this is a regression in extraction correctness.

Useful? React with 👍 / 👎.

}

export function buildMicrodataExtractor(fields: FieldInput): string {
Expand All @@ -42,7 +36,7 @@ export function buildMicrodataExtractor(fields: FieldInput): string {

export function buildOpenGraphExtractor(fields: FieldInput): string {
const plans = normalisePlans(fields);
return `(function(fp){var r={};var mp={'title':['og:title','twitter:title'],'name':['og:title','twitter:title','og:site_name'],'headline':['og:title','twitter:title'],'description':['og:description','twitter:description','description'],'summary':['og:description','twitter:description','description'],'image':['og:image','twitter:image'],'imageurl':['og:image','twitter:image'],'url':['og:url'],'link':['og:url'],'author':['author','article:author'],'publisheddate':['article:published_time','date'],'publishedat':['article:published_time','date'],'date':['article:published_time','date'],'sitename':['og:site_name']};function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}for(var i=0;i<fp.length;i++){var f=fp[i];if(r[f.field]!=null)continue;var cd=[];for(var a=0;a<(f.aliases||[]).length;a++){var alias=f.aliases[a];var key=norm(alias);var mapped=mp[key]||[];cd=cd.concat(mapped);if(cd.indexOf(alias)===-1)cd.push(alias)}for(var c=0;c<cd.length;c++){try{var m=document.querySelector('meta[property="'+cd[c]+'"]')||document.querySelector('meta[name="'+cd[c]+'"]');if(m){var ct=m.getAttribute('content');if(ct){r[f.field]=ct;break}}}catch(e){}}if(!r[f.field]&&(norm(f.field)==='url'||norm(f.field)==='canonical')){var lk=document.querySelector('link[rel="canonical"]');if(lk)r[f.field]=lk.getAttribute('href')}}return r})(${JSON.stringify(plans)})`;
return `(function(fp){var r={};var mp={'title':['og:title','twitter:title'],'name':['og:title','twitter:title','og:site_name'],'headline':['og:title','twitter:title'],'description':['og:description','twitter:description','description'],'summary':['og:description','twitter:description','description'],'image':['og:image','twitter:image'],'imageurl':['og:image','twitter:image'],'url':['og:url'],'link':['og:url'],'author':['author','article:author'],'publisheddate':['article:published_time','date'],'publishedat':['article:published_time','date'],'date':['article:published_time','date'],'site_name':['og:site_name']};function norm(v){return String(v||'').toLowerCase().replace(/[^a-z0-9]/g,'')}for(var i=0;i<fp.length;i++){var f=fp[i];if(r[f.field]!=null)continue;var cd=[];for(var a=0;a<(f.aliases||[]).length;a++){var key=norm(f.aliases[a]);cd=cd.concat(mp[key]||[f.aliases[a]])}for(var c=0;c<cd.length;c++){try{var m=document.querySelector('meta[property="'+cd[c]+'"]')||document.querySelector('meta[name="'+cd[c]+'"]');if(m){var ct=m.getAttribute('content');if(ct){r[f.field]=ct;break}}}catch(e){}}if(!r[f.field]&&(norm(f.field)==='url'||norm(f.field)==='canonical')){var lk=document.querySelector('link[rel="canonical"]');if(lk)r[f.field]=lk.getAttribute('href')}}return r})(${JSON.stringify(plans)})`;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Restore normalized key for OpenGraph site_name mapping

Use the normalized map key (sitename) for the og:site_name alias mapping. The alias lookup normalizes keys with norm(...), which strips underscores, so the new 'site_name' map entry is never hit; requests for a site_name field now skip og:site_name and can miss values that were previously resolved.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve fallback alias lookup for mapped OpenGraph keys

When an alias has a mapping entry (for example title), the new candidate-building logic only appends mapped OG/Twitter keys and no longer appends the original alias itself. That drops previous fallback behavior where meta[name="title"] (and similar non-OG tags) could still satisfy extraction, so pages that only expose generic meta names will now miss values that were previously found.

Useful? React with 👍 / 👎.

}

export function buildCssHeuristicExtractor(fields: FieldInput, schemaProps: Record<string, SchemaProperty>, scopeSelector?: string): string {
Expand Down
Loading