From 8c9c1bf3c73e9797049c74e6c383fedb0dbcfcc0 Mon Sep 17 00:00:00 2001
From: Stefan Kolb <stefan-kolb@web.de>
Date: Sun, 15 Mar 2020 18:15:44 +0100
Subject: [PATCH] Minor improvements for Google scholar fetcher

Detect the Google captcha div.
Follow-up PRs might show or give more info to the user.
---
 .../logic/importer/fetcher/GoogleScholar.java | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
index 4e28444ff28f..73057a916091 100644
--- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
+++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
@@ -29,13 +29,14 @@
 import org.apache.http.client.utils.URIBuilder;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
  * FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar.
- *
+ * <p>
  * Search String infos: https://scholar.google.com/intl/en/scholar/help.html#searching
  */
 public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {
@@ -58,11 +59,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
     @Override
     public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
         Objects.requireNonNull(entry);
-        Optional<URL> pdfLink = Optional.empty();
 
         // Search in title
         if (!entry.hasField(StandardField.TITLE)) {
-            return pdfLink;
+            return Optional.empty();
         }
 
         try {
@@ -74,12 +74,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc
             // as_occt field to search in
             uriBuilder.addParameter("as_occt", "title");
 
-            pdfLink = search(uriBuilder.toString());
+            return search(uriBuilder.toString());
         } catch (URISyntaxException e) {
             throw new FetcherException("Building URI failed.", e);
         }
-
-        return pdfLink;
     }
 
     @Override
@@ -91,6 +89,11 @@ private Optional<URL> search(String url) throws IOException {
         Optional<URL> pdfLink = Optional.empty();
 
         Document doc = Jsoup.connect(url).userAgent(URLDownload.USER_AGENT).get();
+
+        if (needsCaptcha(doc.body().html())) {
+            LOGGER.warn("Hit Google traffic limitation. Captcha prevents automatic fetching.");
+            return Optional.empty();
+        }
         // Check results for PDF link
         // TODO: link always on first result or none?
         for (int i = 0; i < NUM_RESULTS; i++) {
@@ -111,6 +114,10 @@ private Optional<URL> search(String url) throws IOException {
         return pdfLink;
     }
 
+    private boolean needsCaptcha(String body) { // true when the given page markup contains Google's captcha div
+        return body.contains("id=\"gs_captcha_ccl\""); // plain substring match on the div's id attribute
+    }
+
     @Override
     public String getName() {
         return "Google Scholar";
@@ -158,6 +165,11 @@ public List<BibEntry> performSearch(String query) throws FetcherException {
     private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException {
         String content = new URLDownload(queryURL).asString();
 
+        if (needsCaptcha(content)) {
+            throw new FetcherException("Fetching from Google Scholar failed.",
+                    Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
+        }
+
         Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
         while (matcher.find()) {
             String citationsPageURL = matcher.group().replace("&amp;", "&");