diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java index 17cb4de7a37..fbf2f529e63 100644 --- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java +++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java @@ -1,6 +1,6 @@ package org.jabref.logic.importer.fetcher; -import java.io.IOException; +import java.io.InputStream; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; @@ -32,6 +32,7 @@ import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; import org.jabref.model.entry.types.StandardEntryType; +import org.jabref.model.util.DummyFileUpdateMonitor; import org.jabref.model.util.OptionalUtil; import com.google.common.util.concurrent.RateLimiter; @@ -64,6 +65,10 @@ public class DoiFetcher implements IdBasedFetcher, EntryBasedFetcher { */ private static final RateLimiter CROSSREF_DCN_RATE_LIMITER = RateLimiter.create(50.0); + private static final FieldFormatterCleanup NORMALIZE_PAGES = new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter()); + private static final FieldFormatterCleanup CLEAR_URL = new FieldFormatterCleanup(StandardField.URL, new ClearFormatter()); + private static final FieldFormatterCleanup HTML_TO_LATEX_TITLE = new FieldFormatterCleanup(StandardField.TITLE, new HtmlToLatexFormatter()); + private final ImportFormatPreferences preferences; public DoiFetcher(ImportFormatPreferences preferences) { @@ -116,75 +121,73 @@ protected CompletableFuture> asyncPerformSearchById(String id @Override public Optional performSearchById(String identifier) throws FetcherException { - Optional doi = DOI.parse(identifier); - - if (doi.isEmpty()) { - throw new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier)); - } + DOI doi = DOI.parse(identifier) + .orElseThrow(() -> new 
FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier))); URL doiURL; try { - doiURL = URLUtil.create(doi.get().getURIAsASCIIString()); + doiURL = URLUtil.create(doi.getURIAsASCIIString()); } catch (MalformedURLException e) { throw new FetcherException("Malformed URL", e); } - try { - Optional fetchedEntry; + Optional fetchedEntry; - // mEDRA does not return a parsable bibtex string - Optional agency = getAgency(doi.get()); - if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) { - return new Medra().performSearchById(identifier); - } - - // BibTeX data - URLDownload download = getUrlDownload(doiURL); - download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX); + // mEDRA does not return a parsable bibtex string + Optional agency; + try { + agency = getAgency(doi); + } catch (MalformedURLException e) { + throw new FetcherException("Invalid URL", e); + } + if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) { + return new Medra().performSearchById(identifier); + } - String bibtexString; - URLConnection openConnection; + URLDownload download = getUrlDownload(doiURL); + download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX); + HttpURLConnection connection = (HttpURLConnection) download.openConnection(); + InputStream inputStream = download.asInputStream(connection); - openConnection = download.openConnection(); - bibtexString = URLDownload.asString(openConnection).trim(); + BibtexParser bibtexParser = new BibtexParser(preferences, new DummyFileUpdateMonitor()); + try { + fetchedEntry = bibtexParser.parseEntries(inputStream).stream().findFirst(); + } catch (ParseException e) { + throw new FetcherException(doiURL, "Could not parse BibTeX entry", e); + } + // Crossref has a dynamic API rate limit + if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) { + updateCrossrefAPIRate(connection); + } + connection.disconnect(); - // BibTeX entry - fetchedEntry = BibtexParser.singleFromString(bibtexString, 
preferences); - fetchedEntry.ifPresent(this::doPostCleanup); + fetchedEntry.ifPresent(entry -> { + doPostCleanup(entry); - // Crossref has a dynamic API rate limit - if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) { - updateCrossrefAPIRate(openConnection); + // Output warnings in case of inconsistencies + entry.getField(StandardField.DOI) + .filter(entryDoi -> !entryDoi.equals(doi.asString())) + .ifPresent(entryDoi -> LOGGER.warn("Fetched entry's DOI {} is different from requested DOI {}", entryDoi, identifier)); + if (entry.getField(StandardField.DOI).isEmpty()) { + LOGGER.warn("Fetched entry does not contain doi field {}", identifier); } - // Check if the entry is an APS journal and add the article id as the page count if page field is missing - if (fetchedEntry.isPresent() && fetchedEntry.get().hasField(StandardField.DOI)) { - BibEntry entry = fetchedEntry.get(); - if (isAPSJournal(entry, entry.getField(StandardField.DOI).get()) && !entry.hasField(StandardField.PAGES)) { - setPageCountToArticleId(entry, entry.getField(StandardField.DOI).get()); - } + if (isAPSJournal(entry, doi) && !entry.hasField(StandardField.PAGES)) { + setPageNumbersBasedOnDoi(entry, doi); } + }); - if (openConnection instanceof HttpURLConnection connection) { - connection.disconnect(); - } - return fetchedEntry; - } catch (IOException e) { - throw new FetcherException(doiURL, Localization.lang("Connection error"), e); - } catch (ParseException e) { - throw new FetcherException(doiURL, "Could not parse BibTeX entry", e); - } catch (JSONException e) { - throw new FetcherException(doiURL, "Could not retrieve Registration Agency", e); - } + return fetchedEntry; } private void doPostCleanup(BibEntry entry) { - new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter()).cleanup(entry); - new FieldFormatterCleanup(StandardField.URL, new ClearFormatter()).cleanup(entry); - new FieldFormatterCleanup(StandardField.TITLE, new 
HtmlToLatexFormatter()).cleanup(entry); + NORMALIZE_PAGES.cleanup(entry); + CLEAR_URL.cleanup(entry); + HTML_TO_LATEX_TITLE.cleanup(entry); + entry.trimLeft(); } - private void updateCrossrefAPIRate(URLConnection existingConnection) { + private synchronized void updateCrossrefAPIRate(URLConnection existingConnection) { try { // Assuming this field is given in seconds String xRateLimitInterval = existingConnection.getHeaderField("X-Rate-Limit-Interval").replaceAll("[^\\.0123456789]", ""); @@ -221,8 +224,9 @@ public List performSearch(@NonNull BibEntry entry) throws FetcherExcep public Optional getAgency(DOI doi) throws FetcherException, MalformedURLException { Optional agency = Optional.empty(); try { - URLDownload download = getUrlDownload(URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(), - StandardCharsets.UTF_8))); + URLDownload download = getUrlDownload( + URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(), + StandardCharsets.UTF_8))); JSONObject response = new JSONArray(download.asString()).getJSONObject(0); if (response != null) { agency = Optional.ofNullable(response.optString("RA")); @@ -235,18 +239,20 @@ public Optional getAgency(DOI doi) throws FetcherException, MalformedURL return agency; } - private void setPageCountToArticleId(BibEntry entry, String doiAsString) { + private void setPageNumbersBasedOnDoi(BibEntry entry, DOI doi) { + String doiAsString = doi.asString(); String articleId = doiAsString.substring(doiAsString.lastIndexOf('.') + 1); entry.setField(StandardField.PAGES, articleId); } // checks if the entry is an APS journal by comparing the organization id and the suffix format - private boolean isAPSJournal(BibEntry entry, String doiAsString) { + private boolean isAPSJournal(BibEntry entry, DOI doi) { if (!entry.getType().equals(StandardEntryType.Article)) { return false; } - String suffix = doiAsString.substring(doiAsString.lastIndexOf('/') + 1); - String organizationId = 
doiAsString.substring(doiAsString.indexOf('.') + 1, doiAsString.indexOf('/')); + String doiString = doi.asString(); + String suffix = doiString.substring(doiString.lastIndexOf('/') + 1); + String organizationId = doiString.substring(doiString.indexOf('.') + 1, doiString.indexOf('/')); return APS_JOURNAL_ORG_DOI_ID.equals(organizationId) && APS_SUFFIX_PATTERN.matcher(suffix).matches(); } } diff --git a/jablib/src/main/java/org/jabref/logic/net/URLDownload.java b/jablib/src/main/java/org/jabref/logic/net/URLDownload.java index 6ee23092945..10a0e5b3f89 100644 --- a/jablib/src/main/java/org/jabref/logic/net/URLDownload.java +++ b/jablib/src/main/java/org/jabref/logic/net/URLDownload.java @@ -54,20 +54,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * URL download to a string. - *

- * Example: - * - * URLDownload dl = new URLDownload(URL); - * String content = dl.asString(ENCODING); - * dl.toFile(Path); // available in FILE - * String contentType = dl.getMimeType(); - * - *

- Almost each call to a public method creates a new HTTP connection (except for {@link #asString(Charset, URLConnection) asString}, - which uses an already opened connection). Nothing is cached. - */ +/// ## Example +/// +/// ```java +/// URLDownload dl = new URLDownload(URL); +/// String content = dl.asString(ENCODING); +/// dl.toFile(Path); // available in FILE +/// String contentType = dl.getMimeType(); +/// ``` +/// +/// Almost every call to a public method creates a new HTTP connection +/// (except for {@link #asString(Charset, URLConnection) asString}, +/// which uses an already opened connection). +/// +/// Nothing is cached. public class URLDownload { public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0"; @@ -139,7 +139,7 @@ public Optional getMimeType() { // @formatter:on retries++; HttpResponse response = Unirest.head(urlToCheck).asString(); - // Check if we have redirects, e.g. arxiv will give otherwise content type html for the original url + // Check if we have redirects, e.g. arxiv will give otherwise content type HTML for the original url // We need to do it "manually", because ".followRedirects(true)" only works for GET not for HEAD locationHeader = response.getHeaders().getFirst("location"); if (!StringUtil.isNullOrEmpty(locationHeader)) { @@ -218,26 +218,6 @@ public String asString() throws FetcherException { return asString(StandardCharsets.UTF_8, this.openConnection()); } - /** - * Downloads the web resource to a String. - * - * @param encoding the desired String encoding - * @return the downloaded string - */ - public String asString(Charset encoding) throws FetcherException { - return asString(encoding, this.openConnection()); - } - - /** - * Downloads the web resource to a String from an existing connection. Uses UTF-8 as encoding. 
- * - * @param existingConnection an existing connection - * @return the downloaded string - */ - public static String asString(URLConnection existingConnection) throws FetcherException { - return asString(StandardCharsets.UTF_8, existingConnection); - } - /** * Downloads the web resource to a String. * @@ -245,7 +225,7 @@ public static String asString(URLConnection existingConnection) throws FetcherEx * @param connection an existing connection * @return the downloaded string */ - public static String asString(Charset encoding, URLConnection connection) throws FetcherException { + private static String asString(Charset encoding, URLConnection connection) throws FetcherException { try (InputStream input = new BufferedInputStream(connection.getInputStream()); Writer output = new StringWriter()) { copy(input, output, encoding); @@ -285,12 +265,16 @@ public void toFile(Path destination) throws FetcherException { } } - /** - * Takes the web resource as the source for a monitored input stream. - */ + /// Uses the web resource as source and creates a monitored input stream. public ProgressInputStream asInputStream() throws FetcherException { HttpURLConnection urlConnection = (HttpURLConnection) this.openConnection(); + return asInputStream(urlConnection); + } + /// Uses the web resource as source and creates a monitored input stream. 
+ /// + /// Exposing the urlConnection is required for dynamic API limiting of CrossRef + public ProgressInputStream asInputStream(HttpURLConnection urlConnection) throws FetcherException { int responseCode; try { responseCode = urlConnection.getResponseCode(); diff --git a/jablib/src/main/java/org/jabref/model/entry/BibEntry.java b/jablib/src/main/java/org/jabref/model/entry/BibEntry.java index dae25f2c70a..f5b15b4eea7 100644 --- a/jablib/src/main/java/org/jabref/model/entry/BibEntry.java +++ b/jablib/src/main/java/org/jabref/model/entry/BibEntry.java @@ -1268,4 +1268,10 @@ public boolean isEmpty() { } return StandardField.AUTOMATIC_FIELDS.containsAll(this.getFields()); } + + /// Trims whitespaces at the beginning of the BibEntry + public void trimLeft() { + this.parsedSerialization = parsedSerialization.trim(); // we should do "trimLeft", but currently, it is OK as is. + this.commentsBeforeEntry = commentsBeforeEntry.trim(); // we should do "trimLeft", but currently, it is OK as is. + } } diff --git a/jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.http b/jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.http new file mode 100644 index 00000000000..048dfb14f71 --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.http @@ -0,0 +1,2 @@ +GET https://doi.org/10.1109/ICWS.2007.59 +Accept: application/x-bibtex diff --git a/jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.java b/jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.java index 7ef2c9d3131..0b17caf0948 100644 --- a/jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.java +++ b/jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.java @@ -57,6 +57,8 @@ class DoiFetcherTest { .withField(StandardField.JOURNAL, "Chemical Engineering Transactions") .withField(StandardField.PAGES, "871–876") .withField(StandardField.VOLUME, "77"); + + // APS Journal private final 
BibEntry bibEntryStenzel2020 = new BibEntry(StandardEntryType.Article) .withCitationKey("Stenzel_2020") .withField(StandardField.AUTHOR, "Stenzel, L. and Hayward, A. L. C. and Schollwöck, U. and Heidrich-Meisner, F.") @@ -68,8 +70,9 @@ class DoiFetcherTest { .withField(StandardField.DOI, "10.1103/physreva.102.023315") .withField(StandardField.ISSN, "2469-9934") .withField(StandardField.PUBLISHER, "American Physical Society (APS)") - .withField(StandardField.PAGES, "023315") + .withField(StandardField.PAGES, "023315") // This is the last part of the DOI .withField(StandardField.NUMBER, "2"); + private final BibEntry bibBenedetto2000 = new BibEntry(StandardEntryType.Article) .withCitationKey("Benedetto_2000") .withField(StandardField.AUTHOR, "Benedetto, D. and Caglioti, E. and Marchioro, C.") diff --git a/jablib/src/test/java/org/jabref/logic/net/URLDownloadTest.java b/jablib/src/test/java/org/jabref/logic/net/URLDownloadTest.java index 1a35c4f1c7b..8f6f07ceb9d 100644 --- a/jablib/src/test/java/org/jabref/logic/net/URLDownloadTest.java +++ b/jablib/src/test/java/org/jabref/logic/net/URLDownloadTest.java @@ -3,7 +3,6 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.nio.charset.StandardCharsets; import java.nio.file.Path; import org.jabref.logic.importer.FetcherClientException; @@ -34,13 +33,6 @@ void stringDownloadWithSetEncoding() throws MalformedURLException, FetcherExcept assertTrue(dl.asString().contains("Google"), "google.com should contain google"); } - @Test - void stringDownload() throws MalformedURLException, FetcherException { - URLDownload dl = new URLDownload(URLUtil.create("http://www.google.com")); - - assertTrue(dl.asString(StandardCharsets.UTF_8).contains("Google"), "google.com should contain google"); - } - @Test void fileDownload() throws IOException, FetcherException { File destination = File.createTempFile("jabref-test", ".html"); diff --git 
a/jablib/src/test/java/org/jabref/model/entry/BibEntryTrimLeftTest.java b/jablib/src/test/java/org/jabref/model/entry/BibEntryTrimLeftTest.java new file mode 100644 index 00000000000..905313a6f86 --- /dev/null +++ b/jablib/src/test/java/org/jabref/model/entry/BibEntryTrimLeftTest.java @@ -0,0 +1,55 @@ +package org.jabref.model.entry; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +class BibEntryTrimLeftTest { + + @Test + void trimLeftRemovesLeadingWhitespaceFromParsedSerializationAndComments() { + BibEntry entry = new BibEntry(); + // include leading and trailing whitespace + entry.setParsedSerialization(" \t\n Some serialization \n\t "); + entry.setCommentsBeforeEntry(" % a comment \n"); + + // sanity + assertNotNull(entry.getParsedSerialization()); + assertNotNull(entry.getUserComments()); + + entry.trimLeft(); + + // Current implementation uses String.trim(), which removes leading and trailing whitespace. + // The important bit to test here is that leading whitespace is gone. + assertEquals("Some serialization", entry.getParsedSerialization()); + assertEquals("% a comment", entry.getUserComments()); + } + + @Test + void trimLeftIsIdempotent() { + BibEntry entry = new BibEntry(); + entry.setParsedSerialization(" Text "); + entry.setCommentsBeforeEntry(" % c "); + + entry.trimLeft(); + String afterFirst = entry.getParsedSerialization(); + String commentsAfterFirst = entry.getUserComments(); + + entry.trimLeft(); + assertEquals(afterFirst, entry.getParsedSerialization()); + assertEquals(commentsAfterFirst, entry.getUserComments()); + } + + @Test + void trimLeftWithEmptyStringsKeepsEmpty() { + BibEntry entry = new BibEntry(); + entry.setParsedSerialization(""); + entry.setCommentsBeforeEntry(""); + + entry.trimLeft(); + + assertEquals("", entry.getParsedSerialization()); + assertEquals("", entry.getUserComments()); + } +}