Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUGFIX-8] Improve extraction of resource titles from Markdown links #11

Merged
merged 10 commits into from
Sep 24, 2022
128 changes: 112 additions & 16 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
const fs = require("fs");
const path = require("path");
const remark = require("remark");
const { Objects, Strings } = require("./lib/functions");
const languages = require("./languages");
const commandLineArgs = require("command-line-args");

Expand Down Expand Up @@ -31,24 +32,25 @@ const excludes = [
* @returns {string} a string with the name of the section related to the input heading
*/
function getSectionNameFromHeadingContent(children) {
// visit nodes in depth
const walk = (children, depth) =>
children.reduce((text, node, index) => {
if (!node || !node.type) return text;
if (!node || !node.type) return text; // not AST, maybe plain text
switch (node.type) {
//
// meaningfull nodes
//
case "emphasis":
text += "_" + walk(node.children, depth + 1) + "_";
break;
case "inlineCode":
text += "`" + node.value + "`";
break;
case "strong":
text += "**" + walk(node.children, depth + 1) + "**";
text += Strings.templater(remarkTokenAST(node), {
text: walk(node.children, depth + 1),
});
break;
case "inlineCode":
case "text":
text += node.value;
text += Strings.templater(remarkTokenAST(node), {
text: node.value,
});
break;
//
// skipped nodes
Expand All @@ -67,6 +69,100 @@ function getSectionNameFromHeadingContent(children) {
return walk(children, 0);
}

/**
 * Rebuilds the readable (Markdown) text of a link from its remark-parse
 * child nodes.
 *
 * Images, inline code, plain text, emphasis and strong nodes are rendered
 * through the template returned by `remarkTokenAST`; any other node type is
 * logged to the console and contributes nothing to the result.
 *
 * @param {Array<Object>} children - the AST items defined by remark-parse for
 * the content of a link (A); a non-array value is converted with
 * `Objects.toString` instead of being walked
 *
 * @returns {string} a string with the text of the related input link
 */
function getLinkTextFromLinkNodes(children) {
  // depth-first rendering of a list of sibling nodes
  const render = (nodes, level) => {
    // not an AST children array, maybe plain text
    if (!Array.isArray(nodes)) {
      return Objects.toString(nodes);
    }

    let output = "";
    for (const current of nodes) {
      // entries without a type are not AST nodes: ignore them
      if (!current || !current.type) {
        continue;
      }

      const kind = current.type;
      if (kind === "image") {
        // images are rebuilt from their alternative text (or title) and url
        output += Strings.templater(remarkTokenAST(current), {
          text: current.alt || current.title,
          url: current.url,
        });
      } else if (kind === "inlineCode" || kind === "text") {
        // leaf nodes carry their content in `value`
        output += Strings.templater(remarkTokenAST(current), {
          text: current.value,
        });
      } else if (kind === "emphasis" || kind === "strong") {
        // container nodes: render their children one level deeper
        output += Strings.templater(remarkTokenAST(current), {
          text: render(current.children, level + 1),
        });
      } else {
        // skipped nodes
        console.log(
          "getLinkTextFromLinkNodes::skipped",
          level,
          current.type,
          current
        );
      }
    }
    return output;
  };

  return render(children, 0);
}

/**
 * Gets the template string related with an AST remark-parse node.
 *
 * Templates use `{{text}}` (plus `{{url}}` for images and links) as
 * placeholders; callers in this file fill them in with `Strings.templater`.
 *
 * @param {Object} node - AST node defined by remark-parse
 * @returns {string} the template string for the node type
 * @throws {Error} when the node is missing, has no type, or its type has no
 * template (this includes `list` and `listItem`, which are not supported yet)
 */
function remarkTokenAST(node) {
  if (node && node.type) {
    switch (node.type) {
      case "break": // {type: 'break', position: {...}}
        return "<br/>";
      case "emphasis": // {type: 'emphasis', children: [...], position: {...}}
        return Strings.wrap("{{text}}", "_");
      case "heading": // {type: 'heading', depth: 1, children: [...], position: {...}}
        // one "#" per heading level; a missing depth yields no prefix.
        // (the depth must be read from `node`: there is no `item` in scope)
        return ["#".repeat(node.depth || 0), "{{text}}"].join("");
      case "image": // {type: 'image', title: '...', url: '...', alt: '...', position: {...}}
        return "![{{text}}]({{url}})";
      case "inlineCode": // {type: 'inlineCode', value: '...', position: {...}}
        return Strings.wrap("{{text}}", "`");
      case "link": // {type: 'link', title: '...', url: '...', children: [...], position: {...}}
        return "[{{text}}]({{url}})";
      case "list": // {type: 'list', ordered: false, start: null, spread: false, children: [...], position: {...}}
      case "listItem": // {type: 'listItem', spread: false, checked: null, children: [...], position: {...}}
        // TODO: generate token for list/listItem
        // until then these fall through to the error below
        break;
      case "strong": // {type: 'strong', children: [...], position: {...}}
        return Strings.wrap("{{text}}", "**");
      case "html": // {type: 'html', value: '...', position: {...}}
      case "paragraph": // {type: 'paragraph', children: [...], position: {...}}
      case "text": // {type: 'text', value: '...', position: {...}}
        return Strings.wrap("{{text}}"); // identity
      default:
        break;
    }
  }
  throw new Error("Unrecognized remark node type: " + (node && node.type));
}

/**
* Parses a list item generated from remark-parse into a readable format.
*
Expand All @@ -81,17 +177,15 @@ function getSectionNameFromHeadingContent(children) {
* @return {Object} Returns an Object containing details about the piece of media.
*/
function parseListItem(listItem) {
let stripParens = function (s) {
if (s.slice(0, 1) === "(" && s.slice(-1) === ")") return s.slice(1, -1);
return s;
};
let entry = {};
let s = ""; // If we need to build up a string over multiple listItem elements
let leftParen,
rightParen = -1; // If we need to parse parenthesized text
const [link, ...otherStuff] = listItem; // head of listItem = url, the rest is "other stuff"
// head of listItem = url, the rest is "other stuff"
const [link, ...otherStuff] = listItem;
entry.url = link.url;
entry.title = link.children[0].value;
// link.children || link.value => weak way to check if link.type === "link"
entry.title = getLinkTextFromLinkNodes(link.children || link.value);
// remember to get OTHER STUFF!! remember there may be multiple links!
for (let i of otherStuff) {
if (s === "") {
Expand All @@ -117,7 +211,7 @@ function parseListItem(listItem) {
// other links found
if (entry.otherLinks === undefined) entry.otherLinks = [];
entry.otherLinks.push({
title: stripParens(i.children[0].value),
title: Strings.stripParens(getLinkTextFromLinkNodes(i.children)),
url: i.url,
});
// entry.otherLinks = [...entry.otherLinks, {title: i.children[0].value, url: i.url}]; // <-- i wish i could get this syntax to work with arrays
Expand Down Expand Up @@ -154,7 +248,9 @@ function parseListItem(listItem) {
s += i.value;
} else {
// finally, we have reached the end of the note
entry.notes.push(stripParens(s + i.value.slice(0, rightParen + 1)));
entry.notes.push(
Strings.stripParens(s + i.value.slice(0, rightParen + 1))
);
s = "";
// this is a copypaste of another block of code. probably not a good thing tbh.
leftParen = i.value.indexOf("(");
Expand Down
19 changes: 19 additions & 0 deletions lib/functions/Objects.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
 * Gets the text representation of a value.
 *
 * Strings, `null` and `undefined` are handed back untouched; everything else
 * is converted with its own `toString` method when it has one, or with
 * `String(o)` otherwise.
 *
 * @param {any} o - the value to get the text representation of
 * @returns {string|null|undefined} `o` as a string, or `o` itself when it is
 * `null` or `undefined`
 */
function toString(o) {
  // nothing to convert: already a string, or no value at all
  if (typeof o === "string" || o === null || o === undefined) {
    return o;
  }
  // prefer the value's own toString, fall back to String() as a last resort
  return typeof o.toString === "function" ? o.toString() : String(o);
}

module.exports = {
toString,
};
45 changes: 45 additions & 0 deletions lib/functions/Strings.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**
 * Removes one pair of enclosing parentheses from a string.
 *
 * @param {string} s - the string to process
 * @returns {string} `s` without its first and last characters when it starts
 * with "(" and ends with ")"; otherwise `s` unchanged (`null` and `undefined`
 * are returned as they are)
 */
function stripParens(s) {
  // `== null` matches both null and undefined
  if (s == null) {
    return s;
  }
  const isWrapped = s.slice(0, 1) === "(" && s.slice(-1) === ")";
  return isWrapped ? s.slice(1, -1) : s;
}

/**
 * Replaces the data tokens of a template string.
 *
 * A token has the form `{{key}}`; each one is replaced with the text
 * representation of `context[key]`. Tokens whose value is missing (`null` or
 * `undefined`) are replaced with an empty string.
 *
 * @param {string} template - the template string
 * @param {object} context - the data used to replace the tokens with
 * @returns {string} the template with every token replaced
 */
function templater(template, context = {}) {
  // replaceAll using a replacer function
  return template.replace(
    /{{([^{}]+)}}/g, // {{key}}
    (matchedText, key) => {
      const value = context[key];
      // only missing values collapse to ""; a plain `||` fallback would also
      // erase legitimate falsy data such as the number 0
      return value === null || value === undefined ? "" : String(value);
    }
  );
}

/**
 * Surrounds a string with a token on both sides.
 *
 * The result is built with `Array.prototype.join`, using `s` as the
 * separator between two copies of `token`, so no `+` operator is applied to
 * the inputs (numbers are concatenated as text, never summed).
 * Note: because `s` acts as the join separator, an `undefined` `s` produces
 * a comma between the two tokens.
 *
 * @param {string} s - the text to wrap
 * @param {string} token - the text placed before and after `s`
 * @returns {string} a string in the form `${token}${s}${token}`
 */
function wrap(s, token = "") {
  const bothEnds = new Array(2).fill(token);
  return bothEnds.join(s);
}

module.exports = {
stripParens,
templater,
wrap,
};
7 changes: 7 additions & 0 deletions lib/functions/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
const Objects = require("./Objects");
const Strings = require("./Strings");

module.exports = {
Objects,
Strings,
};