Skip to content

Commit

Permalink
Minor improvements for Google scholar fetcher
Browse files Browse the repository at this point in the history
Detect Google captcha div.
Follow up PRs might show or give more info to the user.
  • Loading branch information
stefan-kolb committed Mar 15, 2020
1 parent 8f68de9 commit 8c9c1bf
Showing 1 changed file with 18 additions and 6 deletions.
24 changes: 18 additions & 6 deletions src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,14 @@
import org.apache.http.client.utils.URIBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar.
*
* <p>
* Search syntax reference: https://scholar.google.com/intl/en/scholar/help.html#searching
*/
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {
Expand All @@ -58,11 +59,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
Objects.requireNonNull(entry);
Optional<URL> pdfLink = Optional.empty();

// Search in title
if (!entry.hasField(StandardField.TITLE)) {
return pdfLink;
return Optional.empty();
}

try {
Expand All @@ -74,12 +74,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc
// as_occt field to search in
uriBuilder.addParameter("as_occt", "title");

pdfLink = search(uriBuilder.toString());
return search(uriBuilder.toString());
} catch (URISyntaxException e) {
throw new FetcherException("Building URI failed.", e);
}

return pdfLink;
}

@Override
Expand All @@ -91,6 +89,11 @@ private Optional<URL> search(String url) throws IOException {
Optional<URL> pdfLink = Optional.empty();

Document doc = Jsoup.connect(url).userAgent(URLDownload.USER_AGENT).get();

if (needsCaptcha(doc.body().html())) {
LOGGER.warn("Hit Google traffic limitation. Captcha prevents automatic fetching.");
return Optional.empty();
}
// Check results for PDF link
// TODO: link always on first result or none?
for (int i = 0; i < NUM_RESULTS; i++) {
Expand All @@ -111,6 +114,10 @@ private Optional<URL> search(String url) throws IOException {
return pdfLink;
}

/**
 * Checks whether the given page markup contains the captcha marker.
 *
 * @param body the markup to inspect
 * @return {@code true} if the text {@code id="gs_captcha_ccl"} occurs anywhere in {@code body}
 */
private boolean needsCaptcha(String body) {
    final String captchaMarker = "id=\"gs_captcha_ccl\"";
    return body.indexOf(captchaMarker) >= 0;
}

@Override
public String getName() {
return "Google Scholar";
Expand Down Expand Up @@ -158,6 +165,11 @@ public List<BibEntry> performSearch(String query) throws FetcherException {
private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException {
String content = new URLDownload(queryURL).asString();

if (needsCaptcha(content)) {
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
}

Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
while (matcher.find()) {
String citationsPageURL = matcher.group().replace("&amp;", "&");
Expand Down

0 comments on commit 8c9c1bf

Please sign in to comment.