Skip to content

Implement artworks parser #354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,6 @@ build-iPhoneSimulator/
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
.rvmrc
.DS_Store

.idea
node_modules
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,13 @@ Add also to your array the painting thumbnails present in the result page file (
Test against 2 other similar result pages to make sure it works against different layouts. (Use pages that contain the same kind of carousel; they don't necessarily have to be paintings.)

The suggested time for this challenge is 4 hours. But, you can take your time and work more on it if you want.

## How to run the parser

1. `yarn install`

2. `node run.js -i files/van-gogh-paintings.html -o result.json`

## How to run the tests

1. `yarn test`
19 changes: 19 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"type": "module",
"name": "code-challenge",
"version": "1.0.0",
"description": "Goal is to extract a list of Van Gogh paintings from the attached Google search results page.",
"main": "parsers/Parser.js",
"scripts": {
"test": "mocha tests/test.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"cheerio": "^1.1.2",
"lodash": "^4.17.21",
"mocha": "^11.7.1",
"protocol-buffers": "^5.0.0",
"yargs": "^18.0.0"
}
}
118 changes: 118 additions & 0 deletions parsers/Parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import _ from 'lodash';
import * as cheerio from 'cheerio';
import Ved from '../utils/Ved.js';
import Cache from '../utils/Cache.js';

/**
 * Base class for result-page parsers.
 *
 * Loads the HTML body with cheerio and offers shared helpers: link
 * absolutization, script extraction, lookup of elements by decoded `ved` id,
 * and text-node collection. The base `parse()` returns an empty object;
 * subclasses override it to produce their own result shape.
 */
export class Parser {
  /**
   * @param {string} body - HTML markup passed to `cheerio.load`.
   * @param {{url?: string}} [opt] - Optional settings. When `url` is given, its
   *   hostname is used to build absolute links; otherwise the default domain is used.
   */
  constructor(body, opt = {}) {
    this.$ = cheerio.load(body);
    const url = opt.url ? new URL(opt.url) : {};
    this.domain = 'https://' + (url.hostname || this.getDefaultDomain());
    this.ved = new Ved();
    this.cache = new Cache();
  }

  /**
   * @returns {string} Hostname used when no `opt.url` was supplied.
   */
  getDefaultDomain() {
    return 'www.google.com';
  }

  /**
   * Produces the raw parse result. The base implementation returns `{}`.
   * @returns {object}
   */
  parse() {
    return {};
  }

  /**
   * Runs `parse()` and strips empty top-level values from its result.
   * @returns {object}
   */
  parseAll() {
    const parsed_object = this.parse(this);
    return this.filterEmpty(parsed_object);
  }

  /**
   * Returns a copy of `obj` keeping only finite numbers, booleans, and values
   * that lodash does not consider empty.
   * @param {object} [obj]
   * @returns {object} `{}` when `obj` is falsy.
   */
  filterEmpty(obj) {
    if (!obj) return {};
    return _.pickBy(obj, v => _.isNumber(v) ? _.isFinite(v) : _.isBoolean(v) || !_.isEmpty(v));
  }

  /**
   * Converts a link to an absolute URL string.
   * - `//host/path` gets an `https:` scheme.
   * - `/path` is prefixed with `this.domain`, with spaces replaced by `+`.
   * - Anything else is returned unchanged; falsy input yields `undefined`.
   * @param {string} [link]
   * @returns {string|undefined}
   */
  toAbsoluteLink(link) {
    if (/^\/\//.test(link)) {
      return 'https:' + link;
    }
    if (link && link[0] === '/') {
      return this.domain + link.replace(/ /g, '+');
    }
    return link || undefined;
  }

  /**
   * @returns {Array} Text nodes found directly inside `<script>` elements (memoized).
   */
  getScripts() {
    return this.cache.memoize('getScripts', () => this.$('script').contents().toArray().filter(c => c.type === 'text'));
  }

  /**
   * Collects every `ved` token on the page (from `data-ved` attributes and from
   * the `ved` query parameter of anchors) and groups them by decoded id.
   * Tokens whose decoded id is null/undefined are dropped.
   * @returns {Object<string, Set<string>>} Map of decoded id to the raw tokens carrying it (memoized).
   */
  parseVeds() {
    return this.cache.memoize('parseVeds', () => {
      const $ = this.$;
      const fromAttributes = $('[data-ved]').toArray().map(el => $(el).attr('data-ved'));
      const fromLinks = $('a[href*="&ved="]').toArray().map(el => {
        try {
          const url = new URL(this.toAbsoluteLink($(el).attr('href')));
          return url.searchParams.get('ved');
        } catch (e) {
          // Unparseable hrefs simply contribute no ved token.
          return null;
        }
      });
      const veds = [...fromAttributes, ...fromLinks].filter(Boolean);

      // Decode before grouping so undecodable tokens can be discarded by id.
      // (Filtering the grouped object with `_.isNil` tests the group arrays,
      // never the keys, and would leave an "undefined" bucket behind.)
      return _.chain(veds)
        .map(ved => ({ved, id: this.ved.decode(ved)?.id}))
        .filter(entry => !_.isNil(entry.id))
        .groupBy('id')
        .mapValues(entries => new Set(entries.map(entry => entry.ved)))
        .value();
    });
  }

  /**
   * Finds the elements whose `ved` token decodes to one of the given ids.
   * @param {number|string|Array<number|string>} ids - One decoded id or a list of them.
   * @param {*} [container=this.$] - Scope to search; defaults to the whole document.
   * @returns {*} Cheerio selection (empty when no token matches).
   */
  getElementsByVedId(ids, container = this.$) {
    const $ = this.$;
    const idArray = Array.isArray(ids) ? ids : [ids];

    const lookup = () => {
      const parsedVeds = this.parseVeds();
      const veds = idArray.flatMap(id => [...(parsedVeds[id] || [])]);

      if (!veds.length) {
        return $([]);
      }

      return this.getAllVedElements(veds, container);
    };

    // Only whole-document lookups are cached. Parsed nodes do not carry a
    // distinguishing `id` property, so a per-container cache key cannot tell
    // two containers apart and would hand back another container's result.
    if (container !== $) {
      return lookup();
    }

    return this.cache.memoize(`vedId:root:${idArray.join(',')}`, lookup);
  }

  /**
   * Finds the elements inside `container` that carry one of the raw `ved`
   * tokens, either as a `data-ved` attribute or inside an anchor href.
   * @param {string[]} veds - Raw ved tokens.
   * @param {*} [container=this.$] - Scope to search; defaults to the whole document.
   * @returns {*} Cheerio selection of matching elements.
   */
  getAllVedElements(veds, container = this.$) {
    const $ = this.$;

    const collect = () => {
      const scope = container === $ ? $.root() : $(container);
      return scope.find('[data-ved], a[href*="&ved="]').filter((index, el) => {
        const node = $(el);
        return veds.includes(node.attr('data-ved'))
          || veds.some(ved => node.attr('href')?.includes(`&ved=${ved}`));
      });
    };

    // Same reasoning as in getElementsByVedId: cache whole-document scans only.
    if (container !== $) {
      return collect();
    }

    const cacheKey = `allVedEls:root:${Array.isArray(veds) ? veds.join(',') : veds}`;
    return this.cache.memoize(cacheKey, collect);
  }

  /**
   * Decodes literal `\xHH` escape sequences in `text`. Hex digits may be upper
   * or lower case. A sequence preceded by an odd number of backslashes is
   * treated as an escaped backslash and left untouched.
   * @param {string} [text]
   * @returns {string|undefined} Decoded text; falsy input is returned as-is.
   */
  decodeLiteralEscapes(text) {
    return text && text.replace(
      /(?<!\\)((\\\\)*)\\x([0-9a-fA-F]{2})/g,
      (match, backslashes, lastPair, code) => backslashes + String.fromCharCode(Number.parseInt(code, 16)),
    );
  }

  /**
   * Recursively gathers the non-empty text of every text node under
   * `container`, in document order, with whitespace runs collapsed.
   * @param {*} container - Element or selection to walk.
   * @returns {string[]}
   */
  _collectTextNodes(container) {
    const $ = this.$;
    let results = [];

    // The index parameter is named explicitly so it does not shadow lodash's `_`.
    $(container).contents().each((index, el) => {
      if (el.type === 'text') {
        const text = $(el).text().replace(/\s+/g, ' ').trim();
        if (text) {
          results.push(text);
        }
      } else if (el.type === 'tag') {
        results = results.concat(this._collectTextNodes(el));
      }
    });

    return results;
  }

  /**
   * Memoized wrapper around `_collectTextNodes`.
   * @param {*} container - Element or selection to walk.
   * @returns {string[]}
   */
  getAllTextNodes(container) {
    return this.cache.memoizeDeep('getAllTextNodes', container, () => this._collectTextNodes(container));
  }
}
39 changes: 39 additions & 0 deletions parsers/TextSearchParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import {Parser} from './Parser.js';

/**
 * Parser for text-search result pages; extracts the artworks carousel.
 */
class TextSearchParser extends Parser {
  /**
   * @returns {{artworks: object[]}} Raw result containing the parsed artworks.
   */
  parse() {
    const artworks = this.parseArtworks();
    return {artworks};
  }

  /**
   * Builds one entry per element matched by the ved ids 55222 / 29428.
   * Each entry may carry `name` (first text node), `extensions` (remaining
   * text nodes), `link` (absolutized href) and `image`; empty fields are removed.
   * @returns {object[]}
   */
  parseArtworks() {
    const $ = this.$;

    // Fallback for thumbnails without `data-src`: look up the script text that
    // mentions the image id and take the first quoted base64 data URI in it.
    const inlineImageFor = imgEl => {
      const imageId = imgEl.attr('id');
      if (!imageId) {
        return undefined;
      }
      const script = this.getScripts()
        .find(s => s.data.includes(imageId))?.data;
      if (!script) {
        return undefined;
      }
      const imageMatch = script.match(/'(data:image\/[^;]+;base64,[^']+)'/);
      return imageMatch ? this.decodeLiteralEscapes(imageMatch[1]) : undefined;
    };

    const nodes = this.getElementsByVedId([55222, 29428]).toArray();

    return nodes.map(node => {
      const card = $(node);
      const [name, ...extensions] = this.getAllTextNodes(card);
      const link = this.toAbsoluteLink(card.attr('href'));
      const imageCnt = card.find('a>img');
      const image = imageCnt.attr('data-src') || inlineImageFor(imageCnt);

      return this.filterEmpty({
        name,
        extensions,
        link,
        image,
      });
    });
  }
}

export default TextSearchParser;
38 changes: 38 additions & 0 deletions run.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import {promises as fs} from 'fs';
import path from 'path';
import yargs from 'yargs/yargs';
import {fileURLToPath} from 'url';
import TextSearchParser from './parsers/TextSearchParser.js';

/**
 * Command-line driver: reads the HTML file given by `--input`, parses it with
 * TextSearchParser, and writes the result as pretty-printed JSON to `--output`
 * (default `parsed.json`), creating the output directory when needed.
 * @returns {Promise<void>}
 */
const run_command_line = async () => {
  // Relative paths are resolved against the current working directory.
  const toAbsolute = target => (path.isAbsolute(target) ? target : path.resolve(process.cwd(), target));

  const cli = yargs(process.argv.slice(2))
    .usage('Usage: $0 -i <input> [options]')
    .option('input', {
      alias: 'i',
      type: 'string',
      description: 'Input file path',
      requiresArg: true,
      demandOption: true,
    })
    .option('output', {
      alias: 'o',
      type: 'string',
      default: 'parsed.json',
      description: 'Output file path',
    });
  const {input, output} = cli.parse();

  const html = await fs.readFile(toAbsolute(input), 'utf8');
  const result = new TextSearchParser(html).parseAll();

  const outputPath = toAbsolute(output);
  await fs.mkdir(path.dirname(outputPath), {recursive: true});
  await fs.writeFile(outputPath, JSON.stringify(result, null, 2));

  const shownPath = path.relative(process.cwd(), outputPath);
  console.log(`Successfully parsed and saved results to ${shownPath}`);
};

// Run the CLI only when this module is the script Node was started with,
// not when it is imported from another module.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  // Handle failures explicitly instead of leaving the promise floating:
  // report the error and signal failure through the exit code.
  run_command_line().catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
}
Loading