Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 61 additions & 55 deletions jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
Expand Down Expand Up @@ -32,6 +32,7 @@
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.identifier.DOI;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.model.util.DummyFileUpdateMonitor;
import org.jabref.model.util.OptionalUtil;

import com.google.common.util.concurrent.RateLimiter;
Expand Down Expand Up @@ -64,6 +65,10 @@ public class DoiFetcher implements IdBasedFetcher, EntryBasedFetcher {
*/
private static final RateLimiter CROSSREF_DCN_RATE_LIMITER = RateLimiter.create(50.0);

private static final FieldFormatterCleanup NORMALIZE_PAGES = new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter());
private static final FieldFormatterCleanup CLEAR_URL = new FieldFormatterCleanup(StandardField.URL, new ClearFormatter());
private static final FieldFormatterCleanup HTML_TO_LATEX_TITLE = new FieldFormatterCleanup(StandardField.TITLE, new HtmlToLatexFormatter());

private final ImportFormatPreferences preferences;

public DoiFetcher(ImportFormatPreferences preferences) {
Expand Down Expand Up @@ -116,75 +121,73 @@ protected CompletableFuture<Optional<BibEntry>> asyncPerformSearchById(String id

@Override
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
Optional<DOI> doi = DOI.parse(identifier);

if (doi.isEmpty()) {
throw new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier));
}
DOI doi = DOI.parse(identifier)
.orElseThrow(() -> new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier)));

URL doiURL;
try {
doiURL = URLUtil.create(doi.get().getURIAsASCIIString());
doiURL = URLUtil.create(doi.getURIAsASCIIString());
} catch (MalformedURLException e) {
throw new FetcherException("Malformed URL", e);
}

try {
Optional<BibEntry> fetchedEntry;
Optional<BibEntry> fetchedEntry;

// mEDRA does not return a parsable bibtex string
Optional<String> agency = getAgency(doi.get());
if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) {
return new Medra().performSearchById(identifier);
}

// BibTeX data
URLDownload download = getUrlDownload(doiURL);
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
// mEDRA does not return a parsable bibtex string
Optional<String> agency;
try {
agency = getAgency(doi);
} catch (MalformedURLException e) {
throw new FetcherException("Invalid URL", e);
}
if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) {
return new Medra().performSearchById(identifier);
}

String bibtexString;
URLConnection openConnection;
URLDownload download = getUrlDownload(doiURL);
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
HttpURLConnection connection = (HttpURLConnection) download.openConnection();
InputStream inputStream = download.asInputStream(connection);

openConnection = download.openConnection();
bibtexString = URLDownload.asString(openConnection).trim();
BibtexParser bibtexParser = new BibtexParser(preferences, new DummyFileUpdateMonitor());
try {
fetchedEntry = bibtexParser.parseEntries(inputStream).stream().findFirst();
} catch (ParseException e) {
throw new FetcherException(doiURL, "Could not parse BibTeX entry", e);
}
// Crossref has a dynamic API rate limit
if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) {
updateCrossrefAPIRate(connection);
}
connection.disconnect();

// BibTeX entry
fetchedEntry = BibtexParser.singleFromString(bibtexString, preferences);
fetchedEntry.ifPresent(this::doPostCleanup);
fetchedEntry.ifPresent(entry -> {
doPostCleanup(entry);

// Crossref has a dynamic API rate limit
if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) {
updateCrossrefAPIRate(openConnection);
// Output warnings in case of inconsistencies
entry.getField(StandardField.DOI)
.filter(entryDoi -> entryDoi.equals(doi.asString()))
.ifPresent(entryDoi -> LOGGER.warn("Fetched entry's DOI {} is different from requested DOI {}", entryDoi, identifier));
if (entry.getField(StandardField.DOI).isEmpty()) {
LOGGER.warn("Fetched entry does not contain doi field {}", identifier);
}

// Check if the entry is an APS journal and add the article id as the page count if page field is missing
if (fetchedEntry.isPresent() && fetchedEntry.get().hasField(StandardField.DOI)) {
BibEntry entry = fetchedEntry.get();
if (isAPSJournal(entry, entry.getField(StandardField.DOI).get()) && !entry.hasField(StandardField.PAGES)) {
setPageCountToArticleId(entry, entry.getField(StandardField.DOI).get());
}
if (isAPSJournal(entry, doi) && !entry.hasField(StandardField.PAGES)) {
setPageNumbersBasedOnDoi(entry, doi);
}
});

if (openConnection instanceof HttpURLConnection connection) {
connection.disconnect();
}
return fetchedEntry;
} catch (IOException e) {
throw new FetcherException(doiURL, Localization.lang("Connection error"), e);
} catch (ParseException e) {
throw new FetcherException(doiURL, "Could not parse BibTeX entry", e);
} catch (JSONException e) {
throw new FetcherException(doiURL, "Could not retrieve Registration Agency", e);
}
return fetchedEntry;
}

/**
 * Applies the standard cleanups to a freshly fetched entry: normalizes the pages field,
 * clears the URL field, converts HTML in the title to LaTeX, and trims the entry.
 * <p>
 * Each cleanup is applied exactly once, using the shared cleanup constants of this class
 * instead of allocating new cleanup objects on every call.
 *
 * @param entry the entry to clean up in place
 */
private void doPostCleanup(BibEntry entry) {
    NORMALIZE_PAGES.cleanup(entry);
    CLEAR_URL.cleanup(entry);
    HTML_TO_LATEX_TITLE.cleanup(entry);
    entry.trimLeft();
}

private void updateCrossrefAPIRate(URLConnection existingConnection) {
private synchronized void updateCrossrefAPIRate(URLConnection existingConnection) {
try {
// Assuming this field is given in seconds
String xRateLimitInterval = existingConnection.getHeaderField("X-Rate-Limit-Interval").replaceAll("[^\\.0123456789]", "");
Expand Down Expand Up @@ -221,8 +224,9 @@ public List<BibEntry> performSearch(@NonNull BibEntry entry) throws FetcherExcep
public Optional<String> getAgency(DOI doi) throws FetcherException, MalformedURLException {
Optional<String> agency = Optional.empty();
try {
URLDownload download = getUrlDownload(URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(),
StandardCharsets.UTF_8)));
URLDownload download = getUrlDownload(
URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(),
StandardCharsets.UTF_8)));
JSONObject response = new JSONArray(download.asString()).getJSONObject(0);
if (response != null) {
agency = Optional.ofNullable(response.optString("RA"));
Expand All @@ -235,18 +239,20 @@ public Optional<String> getAgency(DOI doi) throws FetcherException, MalformedURL
return agency;
}

private void setPageCountToArticleId(BibEntry entry, String doiAsString) {
private void setPageNumbersBasedOnDoi(BibEntry entry, DOI doi) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Count and page numbers are two different things,
e.g. the total number of pages.

String doiAsString = doi.asString();
String articleId = doiAsString.substring(doiAsString.lastIndexOf('.') + 1);
entry.setField(StandardField.PAGES, articleId);
}

// checks if the entry is an APS journal by comparing the organization id and the suffix format
private boolean isAPSJournal(BibEntry entry, String doiAsString) {
private boolean isAPSJournal(BibEntry entry, DOI doi) {
if (!entry.getType().equals(StandardEntryType.Article)) {
return false;
}
String suffix = doiAsString.substring(doiAsString.lastIndexOf('/') + 1);
String organizationId = doiAsString.substring(doiAsString.indexOf('.') + 1, doiAsString.indexOf('/'));
String doiString = doi.asString();
String suffix = doiString.substring(doiString.lastIndexOf('/') + 1);
String organizationId = doiString.substring(doiString.indexOf('.') + 1, doiString.indexOf('/'));
return APS_JOURNAL_ORG_DOI_ID.equals(organizationId) && APS_SUFFIX_PATTERN.matcher(suffix).matches();
}
}
62 changes: 23 additions & 39 deletions jablib/src/main/java/org/jabref/logic/net/URLDownload.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,20 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* URL download to a string.
* <p>
* Example:
* <code>
* URLDownload dl = new URLDownload(URL);
* String content = dl.asString(ENCODING);
* dl.toFile(Path); // available in FILE
* String contentType = dl.getMimeType();
* </code>
* <br/><br/>
* Almost each call to a public method creates a new HTTP connection (except for {@link #asString(Charset, URLConnection) asString},
* which uses an already opened connection). Nothing is cached.
*/
/// ## Example
///
/// ```java
/// URLDownload dl = new URLDownload(URL);
/// String content = dl.asString();
/// dl.toFile(Path); // available in FILE
/// Optional<String> contentType = dl.getMimeType();
/// ```
///
/// Almost every call to a public method creates a new HTTP connection
/// (except for {@link #asString(Charset, URLConnection) asString},
/// which uses an already opened connection).
///
/// Nothing is cached.
public class URLDownload {

public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0";
Expand Down Expand Up @@ -139,7 +139,7 @@ public Optional<String> getMimeType() {
// @formatter:on
retries++;
HttpResponse<String> response = Unirest.head(urlToCheck).asString();
// Check if we have redirects, e.g. arxiv will give otherwise content type html for the original url
// Check for redirects: without following them, e.g. arXiv reports the content type HTML for the original URL
// We need to do it "manually", because ".followRedirects(true)" only works for GET not for HEAD
locationHeader = response.getHeaders().getFirst("location");
if (!StringUtil.isNullOrEmpty(locationHeader)) {
Expand Down Expand Up @@ -218,34 +218,14 @@ public String asString() throws FetcherException {
return asString(StandardCharsets.UTF_8, this.openConnection());
}

/**
* Downloads the web resource to a String.
*
* @param encoding the desired String encoding
* @return the downloaded string
*/
public String asString(Charset encoding) throws FetcherException {
return asString(encoding, this.openConnection());
}

/**
* Downloads the web resource to a String from an existing connection. Uses UTF-8 as encoding.
*
* @param existingConnection an existing connection
* @return the downloaded string
*/
public static String asString(URLConnection existingConnection) throws FetcherException {
return asString(StandardCharsets.UTF_8, existingConnection);
}

/**
* Downloads the web resource to a String.
*
* @param encoding the desired String encoding
* @param connection an existing connection
* @return the downloaded string
*/
public static String asString(Charset encoding, URLConnection connection) throws FetcherException {
private static String asString(Charset encoding, URLConnection connection) throws FetcherException {
try (InputStream input = new BufferedInputStream(connection.getInputStream());
Writer output = new StringWriter()) {
copy(input, output, encoding);
Expand Down Expand Up @@ -285,12 +265,16 @@ public void toFile(Path destination) throws FetcherException {
}
}

/**
* Takes the web resource as the source for a monitored input stream.
*/
/// Opens a connection to the web resource and exposes its content as a monitored input stream.
///
/// Delegates to the overload that takes an already opened connection.
public ProgressInputStream asInputStream() throws FetcherException {
    return asInputStream((HttpURLConnection) this.openConnection());
}

/// Uses the web resource as source and creates a monitored input stream.
///
/// Exposing the urlConnection is required for dynamic API limiting of CrossRef
public ProgressInputStream asInputStream(HttpURLConnection urlConnection) throws FetcherException {
int responseCode;
try {
responseCode = urlConnection.getResponseCode();
Expand Down
6 changes: 6 additions & 0 deletions jablib/src/main/java/org/jabref/model/entry/BibEntry.java
Original file line number Diff line number Diff line change
Expand Up @@ -1268,4 +1268,10 @@ public boolean isEmpty() {
}
return StandardField.AUTOMATIC_FIELDS.containsAll(this.getFields());
}

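/// Trims whitespace from the parsed serialization and from the comments before the entry.
///
/// Despite the name, trailing whitespace is currently removed as well, because `String.trim()` is used.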
public void trimLeft() {
this.parsedSerialization = parsedSerialization.trim(); // we should do "trimLeft", but currently, it is OK as is.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a test

this.commentsBeforeEntry = commentsBeforeEntry.trim(); // we should do "trimLeft", but currently, it is OK as is.
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GET https://doi.org/10.1109/ICWS.2007.59
Accept: application/x-bibtex
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ class DoiFetcherTest {
.withField(StandardField.JOURNAL, "Chemical Engineering Transactions")
.withField(StandardField.PAGES, "871–876")
.withField(StandardField.VOLUME, "77");

// APS Journal
private final BibEntry bibEntryStenzel2020 = new BibEntry(StandardEntryType.Article)
.withCitationKey("Stenzel_2020")
.withField(StandardField.AUTHOR, "Stenzel, L. and Hayward, A. L. C. and Schollwöck, U. and Heidrich-Meisner, F.")
Expand All @@ -68,8 +70,9 @@ class DoiFetcherTest {
.withField(StandardField.DOI, "10.1103/physreva.102.023315")
.withField(StandardField.ISSN, "2469-9934")
.withField(StandardField.PUBLISHER, "American Physical Society (APS)")
.withField(StandardField.PAGES, "023315")
.withField(StandardField.PAGES, "023315") // This is the last part of the DOI
.withField(StandardField.NUMBER, "2");

private final BibEntry bibBenedetto2000 = new BibEntry(StandardEntryType.Article)
.withCitationKey("Benedetto_2000")
.withField(StandardField.AUTHOR, "Benedetto, D. and Caglioti, E. and Marchioro, C.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;

import org.jabref.logic.importer.FetcherClientException;
Expand Down Expand Up @@ -34,13 +33,6 @@ void stringDownloadWithSetEncoding() throws MalformedURLException, FetcherExcept
assertTrue(dl.asString().contains("Google"), "google.com should contain google");
}

@Test
void stringDownload() throws MalformedURLException, FetcherException {
URLDownload dl = new URLDownload(URLUtil.create("http://www.google.com"));

assertTrue(dl.asString(StandardCharsets.UTF_8).contains("Google"), "google.com should contain google");
}

@Test
void fileDownload() throws IOException, FetcherException {
File destination = File.createTempFile("jabref-test", ".html");
Expand Down