From 8c9c1bf3c73e9797049c74e6c383fedb0dbcfcc0 Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Sun, 15 Mar 2020 18:15:44 +0100 Subject: [PATCH] Minor improvements for Google Scholar fetcher Detect Google captcha div. Follow-up PRs might show or give more info to the user. --- .../logic/importer/fetcher/GoogleScholar.java | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 4e28444ff28f..73057a916091 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -29,13 +29,14 @@ import org.apache.http.client.utils.URIBuilder; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar. - * + * <p>

* Search String infos: https://scholar.google.com/intl/en/scholar/help.html#searching */ public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher { @@ -58,11 +59,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) { @Override public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException { Objects.requireNonNull(entry); - Optional<URL> pdfLink = Optional.empty(); // Search in title if (!entry.hasField(StandardField.TITLE)) { - return pdfLink; + return Optional.empty(); } try { @@ -74,12 +74,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc // as_occt field to search in uriBuilder.addParameter("as_occt", "title"); - pdfLink = search(uriBuilder.toString()); + return search(uriBuilder.toString()); } catch (URISyntaxException e) { throw new FetcherException("Building URI failed.", e); } - - return pdfLink; } @Override @@ -91,6 +89,11 @@ private Optional<URL> search(String url) throws IOException { Optional<URL> pdfLink = Optional.empty(); Document doc = Jsoup.connect(url).userAgent(URLDownload.USER_AGENT).get(); + + if (needsCaptcha(doc.body().html())) { + LOGGER.warn("Hit Google traffic limitation. Captcha prevents automatic fetching."); + return Optional.empty(); + } // Check results for PDF link // TODO: link always on first result or none? 
for (int i = 0; i < NUM_RESULTS; i++) { @@ -111,6 +114,10 @@ private Optional<URL> search(String url) throws IOException { return pdfLink; } + private boolean needsCaptcha(String body) { + return body.contains("id=\"gs_captcha_ccl\""); + } + @Override public String getName() { return "Google Scholar"; @@ -158,6 +165,11 @@ public List<BibEntry> performSearch(String query) throws FetcherException { private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException { String content = new URLDownload(queryURL).asString(); + if (needsCaptcha(content)) { + throw new FetcherException("Fetching from Google Scholar failed.", + Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null); + } + Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content); while (matcher.find()) { String citationsPageURL = matcher.group().replace("&amp;", "&");