From 98fd1540f3840be85e2ed0b4bf0fa29426db1afd Mon Sep 17 00:00:00 2001 From: abaevbog Date: Sun, 4 Aug 2024 21:27:33 -0700 Subject: [PATCH] decode uri components in doi when possible (#28) Also, attempt to drop last bracket or paren if there is an opened one before DOI to not skip and pass existing tests. Fixes: zotero/zotero#3218 --- test/tests/utilitiesTest.js | 18 ++++++++++++++---- utilities.js | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/test/tests/utilitiesTest.js b/test/tests/utilitiesTest.js index 17ba91f..60b1bc4 100644 --- a/test/tests/utilitiesTest.js +++ b/test/tests/utilitiesTest.js @@ -93,13 +93,23 @@ describe("Zotero.Utilities", function() { assert.equal(cleanDOI(`Foo bar ${doi}. Foo bar`), doi); }); - // FIXME - it.skip("should parse a DOI in parentheses", function () { + it("should parse a DOI with URL-encoded < and >", function () { + const encodedUri = "10.1002/1096-9128(200005)12:6%3C375::AID-CPE480%3E3.0.CO;2-M"; + const expected = "10.1002/1096-9128(200005)12:6<375::AID-CPE480>3.0.CO;2-M"; + assert.equal(cleanDOI(`Foo bar ${encodedUri}. Foo bar`), expected); + }); + + it("should parse a DOI URL with encoded characters", function () { + const encodedUri = "https://doi.org/10.1002/1096-9128(200005)12:6%3C375::AID-CPE480%3E3.0.CO;2-M"; + const expected = "10.1002/1096-9128(200005)12:6<375::AID-CPE480>3.0.CO;2-M"; + assert.equal(cleanDOI(`Foo bar ${encodedUri}. Foo bar`), expected); + }); + + it("should parse a DOI in parentheses", function () { assert.equal(cleanDOI(`Foo bar (${doi}) foo bar`), doi); }); - // FIXME - it.skip("should parse a DOI in brackets", function () { + it("should parse a DOI in brackets", function () { assert.equal(cleanDOI(`Foo bar [${doi}] foo bar`), doi); }); }); diff --git a/utilities.js b/utilities.js index 6808118..7c3581f 100644 --- a/utilities.js +++ b/utilities.js @@ -482,9 +482,38 @@ var Utilities = { if(typeof(x) != "string") { throw new Error("cleanDOI: argument must be a string"); } - + // If it's a URL, decode it + if (x.match(/^https?:/)) { + x = decodeURIComponent(x); + } + // Even if it's not a URL, decode %3C followed by %3E as < > + var openingPos = x.indexOf("%3C"); + if (openingPos != -1 && openingPos < x.indexOf("%3E")) { + x = x.replace(/%3C/g, "<"); + x = x.replace(/%3E/g, ">"); + } var doi = x.match(/10(?:\.[0-9]{4,})?\/[^\s]*[^\s\.,]/); - return doi ? doi[0] : null; + if (!doi) { + return null; + } + var result = doi[0]; + + // Check if the DOI ends with a bracket + var trailingBracket = result.slice(-1); + if ([']', ')', '}'].includes(trailingBracket)) { + // Check the portion of the string before the matched DOI for an unclosed bracket + let beforeDOI = x.slice(0, doi.index); + let openingBracket = { + ']': '[', + ')': '(', + '}': '{' + }[trailingBracket]; + if (beforeDOI.lastIndexOf(openingBracket) > beforeDOI.lastIndexOf(trailingBracket)) { + // Remove the trailing bracket from the DOI + result = result.slice(0, -1); + } + } + return result; }, /**