serpapi · Lannister34 · Aug 19, 2025
diff --git a/.gitignore b/.gitignore
@@ -49,3 +49,6 @@ build-iPhoneSimulator/
 # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
 .rvmrc
 .DS_Store
+
+.idea
+node_modules
diff --git a/README.md b/README.md
@@ -26,3 +26,13 @@ Add also to your array the painting thumbnails present in the result page file (
 Test against 2 other similar result pages to make sure it works against different layouts. (Pages that contain the same kind of carrousel. Don't necessarily have to be paintings.)
 
 The suggested time for this challenge is 4 hours. But, you can take your time and work more on it if you want.
+
+## How to run the parser
+
+1. `yarn install`
+
+2. `node run.js -i files/van-gogh-paintings.html -o result.json`
+
+## How to run tests
+
+1. `yarn test`
diff --git a/package.json b/package.json
@@ -0,0 +1,19 @@
+{
+  "type": "module",
+  "name": "code-challenge",
+  "version": "1.0.0",
+  "description": "Goal is to extract a list of Van Gogh paintings from the attached Google search results page.",
+  "main": "parsers/Parser.js",
+  "scripts": {
+    "test": "mocha tests/test.js"
+  },
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "cheerio": "^1.1.2",
+    "lodash": "^4.17.21",
+    "mocha": "^11.7.1",
+    "protocol-buffers": "^5.0.0",
+    "yargs": "^18.0.0"
+  }
+}
diff --git a/parsers/Parser.js b/parsers/Parser.js
@@ -0,0 +1,118 @@
+import _ from 'lodash';
+import * as cheerio from 'cheerio';
+import Ved from '../utils/Ved.js';
+import Cache from '../utils/Cache.js';
+
+export class Parser {
+  constructor(body, opt = {}) {
+    this.$ = cheerio.load(body);
+    const url = opt.url ? new URL(opt.url) : {};
+    this.domain = 'https://' + (url.hostname || this.getDefaultDomain());
+    this.ved = new Ved();
+    this.cache = new Cache();
+  }
+
+  getDefaultDomain() {
+    return 'www.google.com';
+  }
+
+  parse() {
+    return {};
+  }
+
+  parseAll() {
+    const parsed_object = this.parse(this);
+    return this.filterEmpty(parsed_object);
+  }
+
+  filterEmpty(obj) {
+    if (!obj) return {};
+    return _.pickBy(obj, v => _.isNumber(v) ? _.isFinite(v) : _.isBoolean(v) || !_.isEmpty(v));
+  }
+
+  toAbsoluteLink(link) {
+    return /^\/\//.test(link) ? 'https:' + link : link && link[0] === '/' ? this.domain + link.replace(/ /g, '+') : link || undefined;
+  }
+
+  getScripts() {
+    return this.cache.memoize('getScripts', () => this.$('script').contents().toArray().filter(c => c.type === 'text'));
+  }
+
+  parseVeds() {
+    return this.cache.memoize('parseVeds', () => {
+      const $ = this.$;
+      const veds = [...$('[data-ved]').toArray().map(el => $(el).attr('data-ved')), ...$('a[href*="&ved="]').toArray().map(el => {
+        try {
+          const url = new URL(this.toAbsoluteLink($(el).attr('href')));
+          return url.searchParams.get('ved');
+        } catch (e) {
+          return null;
+        }
+      })].filter(Boolean);
+
+      return _.chain(veds)
+        .groupBy(ved => this.ved.decode(ved)?.id)
+        .omitBy(_.isNil)
+        .mapValues(v => new Set(v))
+        .value();
+    });
+  }
+
+  getElementsByVedId(ids, container = this.$) {
+    const $ = this.$;
+    const containerEl = container === $ ? $.root()[0] : container[0];
+    const cacheKey = `vedId:${containerEl.id || 'root'}:${Array.isArray(ids) ? ids.join(',') : ids}`;
+
+    return this.cache.memoize(cacheKey, () => {
+      const parsedVeds = this.parseVeds();
+      const idArray = Array.isArray(ids) ? ids : [ids];
+      const veds = idArray.reduce((acc, id) => [...acc, ...(parsedVeds[id] || [])], []);
+
+      if (!veds.length) {
+        return $([]);
+      }
+
+      return this.getAllVedElements(Array.from(veds.values()), container);
+    });
+  }
+
+  getAllVedElements(veds, container = this.$) {
+    const $ = this.$;
+    const containerEl = container === $ ? $.root()[0] : container[0];
+    const cacheKey = `allVedEls:${containerEl.id || 'root'}:${Array.isArray(veds) ? veds.join(',') : veds}`;
+
+    return this.cache.memoize(cacheKey, () => {
+      container = container === $ ? $.root() : $(container);
+      return container.find('[data-ved], a[href*="&ved="]')
+        .filter((index,
+          el) => veds.includes($(el).attr('data-ved')) || veds.some(ved => $(el).attr('href')?.includes(`&ved=${ved}`)));
+    });
+  }
+
+  decodeLiteralEscapes(text) {
+    return text && text.replace(/(?<!\\)((\\\\)*)\\x([0-9a-f]{2})/g, (m, backslashes, backslashesInner,
+      code) => backslashes + String.fromCharCode(parseInt(code, 16)));
+  }
+
+  _collectTextNodes(container) {
+    const $ = this.$;
+    let results = [];
+
+    $(container).contents().each((_, el) => {
+      if (el.type === 'text') {
+        const text = $(el).text().replace(/\s+/g, ' ').trim();
+        if (text) {
+          results.push(text);
+        }
+      } else if (el.type === 'tag') {
+        results = results.concat(this._collectTextNodes(el));
+      }
+    });
+
+    return results;
+  }
+
+  getAllTextNodes(container) {
+    return this.cache.memoizeDeep('getAllTextNodes', container, () => this._collectTextNodes(container));
+  }
+}
diff --git a/parsers/TextSearchParser.js b/parsers/TextSearchParser.js
@@ -0,0 +1,39 @@
+import {Parser} from './Parser.js';
+
+class TextSearchParser extends Parser {
+  parse() {
+    return {
+      artworks: this.parseArtworks(),
+    };
+  }
+
+  parseArtworks() {
+    const $ = this.$;
+    return this.getElementsByVedId([55222, 29428]).toArray().map(item => {
+      item = $(item);
+      const [name, ...extensions] = this.getAllTextNodes(item);
+      const link = this.toAbsoluteLink(item.attr('href'));
+      const imageCnt = item.find('a>img');
+      let image = imageCnt.attr('data-src');
+      if (!image) {
+        const imageId = imageCnt.attr('id');
+        if (imageId) {
+          const script = this.getScripts()
+            .find(s => s.data.includes(imageId))?.data;
+          if (script) {
+            const imageMatch = script.match(/'(data:image\/[^;]+;base64,[^']+)'/);
+            if (imageMatch) {
+              image = this.decodeLiteralEscapes(imageMatch[1]);
+            }
+          }
+        }
+      }
+
+      return this.filterEmpty({
+        name, extensions, link, image,
+      });
+    });
+  }
+}
+
+export default TextSearchParser;
diff --git a/run.js b/run.js
@@ -0,0 +1,38 @@
+import {promises as fs} from 'fs';
+import path from 'path';
+import yargs from 'yargs/yargs';
+import {fileURLToPath} from 'url';
+import TextSearchParser from './parsers/TextSearchParser.js';
+
+const run_command_line = async () => {
+  const argv = yargs(process.argv.slice(2))
+    .usage('Usage: $0 -i <input> [options]')
+    .option('input', {
+      alias: 'i', type: 'string', description: 'Input file path', requiresArg: true, demandOption: true,
+    })
+    .option('output', {
+      alias: 'o', type: 'string', default: 'parsed.json', description: 'Output file path',
+    })
+    .parse();
+
+  const options = {
+    input: argv.input, output: argv.output,
+  };
+
+  const inputPath = path.isAbsolute(options.input) ? options.input : path.resolve(process.cwd(), options.input);
+
+  const fileContent = await fs.readFile(inputPath, 'utf8');
+  const parser = new TextSearchParser(fileContent);
+  const result = parser.parseAll();
+
+  const outputPath = path.isAbsolute(options.output) ? options.output : path.resolve(process.cwd(), options.output);
+  const outputDir = path.dirname(outputPath);
+  await fs.mkdir(outputDir, {recursive: true});
+  await fs.writeFile(outputPath, JSON.stringify(result, null, 2));
+
+  console.log(`Successfully parsed and saved results to ${path.relative(process.cwd(), outputPath)}`);
+};
+
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  run_command_line();
+}