Skip to content

Commit

Permalink
Improve parsing of short DOIs (#6920)
Browse files Browse the repository at this point in the history
  • Loading branch information
PremKolar committed Sep 20, 2020
1 parent 96c2fc8 commit d944eae
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 16 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- JabRef no longer opens the entry editor with the first entry on startup [#6855](https://github.com/JabRef/jabref/issues/6855)
- Fetch by ID: (long) "SAO/NASA Astrophysics Data System" replaced by (short) "SAO/NASA ADS" [#6876](https://github.com/JabRef/jabref/pull/6876)
- We changed the title of the window "Manage field names and content" to have the same title as the corresponding menu item [#6895](https://github.com/JabRef/jabref/pull/6895)
- Improved detection of "short" DOIs [#6880](https://github.com/JabRef/jabref/issues/6880)

### Fixed

Expand Down
4 changes: 1 addition & 3 deletions src/main/java/org/jabref/logic/cleanup/DoiCleanup.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public List<FieldChange> cleanup(BibEntry entry) {
// Doi field seems to contain Doi -> cleanup note, url, ee field
for (Field field : FIELDS) {
entry.getField(field).flatMap(DOI::parse)
.ifPresent(unused -> removeFieldValue(entry, field, changes));
.ifPresent(unused -> removeFieldValue(entry, field, changes));
}
}
} else {
Expand All @@ -58,12 +58,10 @@ public List<FieldChange> cleanup(BibEntry entry) {
// Update Doi
Optional<FieldChange> change = entry.setField(StandardField.DOI, doi.get().getDOI());
change.ifPresent(changes::add);

removeFieldValue(entry, field, changes);
}
}
}

return changes;
}

Expand Down
50 changes: 38 additions & 12 deletions src/main/java/org/jabref/model/entry/identifier/DOI.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,39 @@ public class DOI implements Identifier {
+ ")"; // end group \1

// Regex (Short DOI)
private static final String SHORT_DOI_SHORTCUT = ""
+ "^\\s*(?:https?://)?(?:www\\.)?(?:doi\\.org/)([a-z0-9]{4,10})\\s*$"; // eg https://doi.org/bfrhmx
private static final String SHORT_DOI_EXP_PREFIX = ""
+ "^(?:" // can begin with...
+ "\\s*(?:https?://)?(?:www\\.)?" // optional url parts "http(s)://"+"www."
+ "[a-zA-Z\\.]*doi[a-zA-Z\\.]*" // eg "dx.doi." or "doi.acm." or "doi." if with url, must include "doi", otherwise too ambiguous
+ "\\.[a-zA-Z]{2,10}/)?"; // ".org" or ".de" or ".academy"
private static final String SHORT_DOI_EXP = ""
+ "(?:urn:)?" // optional urn
+ "(?:doi:)?" // optional doi
+ "(?:" // begin "any one of these"
+ "(?:[\\s/]?(?:(?:urn:)|(?:doi:)|(?:urn:doi:)))" // "doi:10/12ab" or " urn:10/12ab" or "/urn:doi:/10/12ab" ...
+ "|(?:\\s?/?)" // or "/10/12ab" or " /10/12ab" or "10/12ab" or " 10/12ab"
+ ")" // end "any one of these"
+ "(" // begin group \1
+ "10" // directory indicator
+ "[/:%]" // divider
+ "[a-zA-Z0-9]+"
+ ")"; // end group \1
+ "[/%:]" // divider
+ "[a-zA-Z0-9]{4,}" // at least 4 characters
+ ")" // end group \1
+ "\\s*$"; // must be the end
private static final String FIND_SHORT_DOI_EXP = ""
+ "(?:urn:)?" // optional urn
+ "(?:doi:)?" // optional doi
+ "(?:" // begin "any one of these"
+ "(?:/urn:)" // urn:10/ab12
+ "|" // or...
+ "(?:/doi:)" // doi:10/ab12
+ "|" // or...
+ "(?:/urn:doi:)" // urn:doi:10/ab12
+ "|" // or...
+ "(?:\\s/?)" // /10/ab12 or 10/ab12 (but not eg "2020/10/ab12")
+ "|" // or...
+ "(?:doi\\.org/)" // doi.org/10/ab12
+ ")" // end "any one of these"
+ "(" // begin group \1
+ "10" // directory indicator
+ "[/:]" // divider
+ "[a-zA-Z0-9]+"
+ "[/%:]" // divider
+ "(?:[^\\s]+)" // suffix alphanumeric without space
+ ")"; // end group \1

Expand All @@ -73,12 +91,13 @@ public class DOI implements Identifier {
private static final Pattern EXACT_DOI_PATT = Pattern.compile("^(?:https?://[^\\s]+?)?" + DOI_EXP + "$", Pattern.CASE_INSENSITIVE);
private static final Pattern DOI_PATT = Pattern.compile("(?:https?://[^\\s]+?)?" + FIND_DOI_EXP, Pattern.CASE_INSENSITIVE);
// Pattern (short DOI)
private static final Pattern EXACT_SHORT_DOI_PATT = Pattern.compile("^(?:https?://[^\\s]+?)?" + SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE);
private static final Pattern EXACT_SHORT_DOI_SHORTCUT = Pattern.compile(SHORT_DOI_SHORTCUT, Pattern.CASE_INSENSITIVE); // eg doi.org/bfrhmx (no "10/")
private static final Pattern EXACT_SHORT_DOI_PATT = Pattern.compile(SHORT_DOI_EXP_PREFIX + SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE);
private static final Pattern SHORT_DOI_PATT = Pattern.compile("(?:https?://[^\\s]+?)?" + FIND_SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE);
// DOI
private final String doi;
// Short DOI
private boolean isShortDoi;
private boolean isShortDoi = false;

/**
* Creates a DOI from various schemes including URL, URN, and plain DOIs/Short DOIs.
Expand Down Expand Up @@ -116,7 +135,14 @@ public DOI(String doi) {
this.doi = shortDoiMatcher.group(1);
isShortDoi = true;
} else {
throw new IllegalArgumentException(trimmedDoi + " is not a valid DOI/Short DOI.");
// Shortcut DOI without the "10/" as in "doi.org/d8dn"
Matcher shortcutDoiMatcher = EXACT_SHORT_DOI_SHORTCUT.matcher(trimmedDoi);
if (shortcutDoiMatcher.find()) {
this.doi = "10/" + shortcutDoiMatcher.group(1);
isShortDoi = true;
} else {
throw new IllegalArgumentException(trimmedDoi + " is not a valid DOI/Short DOI.");
}
}
}
}
Expand Down
29 changes: 28 additions & 1 deletion src/test/java/org/jabref/model/entry/identifier/DOITest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ public void acceptPlainDoi() {
@Test
public void acceptPlainShortDoi() {
    // Each bare short DOI must come back from getDOI() exactly as it was passed in.
    String[] plainShortDois = {"10/gf4gqc", "10/1000", "10/aaaa"};
    for (String plainShortDoi : plainShortDois) {
        assertEquals(plainShortDoi, new DOI(plainShortDoi).getDOI());
    }
}

@Test
Expand All @@ -43,6 +45,9 @@ public void rejectEmbeddedDoi() {
@Test
public void rejectEmbeddedShortDoi() {
    // None of these inputs may be accepted by the DOI constructor;
    // each one has to be rejected with an IllegalArgumentException.
    String[] rejectedInputs = {
        "other stuff 10/gf4gqc end",
        "10/2021/01",
        "01/10/2021",
        "https://www.abc.de/10/abcd"
    };
    for (String rejectedInput : rejectedInputs) {
        assertThrows(IllegalArgumentException.class, () -> new DOI(rejectedInput));
    }
}

@Test
Expand Down Expand Up @@ -86,15 +91,24 @@ public void acceptDoiPrefixInShortDoi() {
@Test
public void acceptURNPrefix() {
assertEquals("10.123/456", new DOI("urn:10.123/456").getDOI());
assertEquals("10.123/456", new DOI("urn:doi:10.123/456").getDOI());
assertEquals("10.123/456", new DOI("http://doi.org/urn:doi:10.123/456").getDOI());
// : is also allowed as divider, will be replaced by RESOLVER
assertEquals("10.123:456ABC/zyz", new DOI("http://doi.org/urn:doi:10.123:456ABC%2Fzyz").getDOI());
}

@Test
public void acceptShortcutShortDoi() {
    // Shortcut form: the resolver URL carries only the suffix (no "10/"),
    // and getDOI() is expected to report it with the "10/" directory indicator.
    final String expectedShortDoi = "10/d8dn";
    String[] shortcutInputs = {
        "https://doi.org/d8dn",
        " https://doi.org/d8dn ",
        "doi.org/d8dn",
        "www.doi.org/d8dn",
        " doi.org/d8dn "
    };
    for (String shortcutInput : shortcutInputs) {
        assertEquals(expectedShortDoi, new DOI(shortcutInput).getDOI());
    }
}

@Test
public void acceptURNPrefixInShortDoi() {
assertEquals("10/gf4gqc", new DOI("urn:10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("doi:10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("urn:doi:10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("http://doi.org/urn:doi:10/gf4gqc").getDOI());
// : is also allowed as divider, will be replaced by RESOLVER
Expand Down Expand Up @@ -138,6 +152,12 @@ public void acceptURLShortDoi() {
assertEquals("10/gf4gqc", new DOI("https://dx.doi.org/10%2Fgf4gqc").getDOI());
// other domains
assertEquals("10/gf4gqc", new DOI("http://doi.acm.org/10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("www.doi.acm.org/10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("doi.acm.org/10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("/10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI(" /10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI(" 10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("http://doi.acm.net/10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("http://doi.acm.com/10/gf4gqc").getDOI());
assertEquals("10/gf4gqc", new DOI("http://doi.acm.de/10/gf4gqc").getDOI());
Expand All @@ -149,6 +169,13 @@ public void acceptURLShortDoi() {
assertEquals("10/gf4gqc", new DOI("http://doi.ieeecomputersociety.org/10/gf4gqc").getDOI());
}

@Test
public void rejectURLShortDoi() {
    // URLs that merely contain "10" or "/10/" somewhere in their path, on hosts
    // without "doi" in the name, must be rejected with an IllegalArgumentException.
    String[] nonDoiUrls = {
        "http://www.cs.utexas.edu/users/kaufmann/itp-trusted-extensions-aug-2010/summary/summary.pdf",
        "http://www.cs.utexas.edu/users/kaufmann/itp-trusted-extensions-aug-20/10/summary/summary.pdf",
        "http://www.boi.org/10/2010bingbong"
    };
    for (String nonDoiUrl : nonDoiUrls) {
        assertThrows(IllegalArgumentException.class, () -> new DOI(nonDoiUrl));
    }
}

@Test
public void correctlyDecodeHttpDOIs() {
// See http://www.doi.org/doi_handbook/2_Numbering.html#2.5.2.4
Expand Down

0 comments on commit d944eae

Please sign in to comment.