Skip to content

Commit

Permalink
Finish and fix archive scholar fetcher
Browse files Browse the repository at this point in the history
  • Loading branch information
Siedlerchr committed Nov 4, 2023
1 parent c2467cd commit d036a40
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 92 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,22 @@
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.IntStream;

import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.PagedSearchBasedParserFetcher;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.fetcher.transformers.ScholarArchiveQueryTransformer;
import org.jabref.logic.importer.util.JsonReader;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.AuthorList;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;

import jakarta.ws.rs.core.MediaType;
import kong.unirest.json.JSONArray;
import kong.unirest.json.JSONException;
import kong.unirest.json.JSONObject;
Expand All @@ -28,13 +31,10 @@

public class ScholarArchiveFetcher implements PagedSearchBasedParserFetcher {

// Name under which this fetcher identifies itself; getName() returns this value.
public static final String FETCHER_NAME = "ScholarArchive";

// Class-wide logger; getURLForQuery uses it to log the generated search URL at debug level.
private static final Logger LOGGER = LoggerFactory.getLogger(ScholarArchiveFetcher.class);

// Base URL of the search endpoint; getURLForQuery appends the q/from/size/format parameters to it.
private static final String API_URL = "https://scholar.archive.org/search";

/**
Expand All @@ -47,21 +47,21 @@ public class ScholarArchiveFetcher implements PagedSearchBasedParserFetcher {
@Override
public URL getURLForQuery(QueryNode luceneQuery, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException {
    URIBuilder searchUri = new URIBuilder(API_URL);

    // A query the transformer cannot translate is sent as an empty search string.
    String transformedQuery = new ScholarArchiveQueryTransformer()
            .transformLuceneQuery(luceneQuery)
            .orElse("");
    searchUri.addParameter("q", transformedQuery);

    // Paging: "from" is the offset of the requested page (page size * page number), "size" the page size.
    String offset = String.valueOf(getPageSize() * pageNumber);
    String pageSize = String.valueOf(getPageSize());
    searchUri.addParameter("from", offset);
    searchUri.addParameter("size", pageSize);

    // Request the result list as JSON.
    searchUri.addParameter("format", "json");

    LOGGER.debug("using URL for search {}", searchUri.build());
    return searchUri.build().toURL();
}

/**
 * Creates the download for a search URL and marks it as expecting JSON.
 *
 * @param url the search URL to download from
 * @return a download for {@code url} carrying an {@code Accept} header set to {@code MediaType.APPLICATION_JSON}
 */
@Override
public URLDownload getUrlDownload(URL url) {
    URLDownload jsonDownload = new URLDownload(url);
    jsonDownload.addHeader("Accept", MediaType.APPLICATION_JSON);
    return jsonDownload;
}
/**
* Builds the list of BibEntry objects from the JSON response returned by the Scholar Archive search API
*
Expand All @@ -72,12 +72,10 @@ public Parser getParser() {
return inputStream -> {
// Read the API response.
JSONObject response = JsonReader.toJsonObject(inputStream);

// Parse the JSON response into a list of BibEntry objects.
JSONObject jsonObject = new JSONObject(response);
List<BibEntry> entries = new ArrayList<>();
if (jsonObject.has("results")) {
JSONArray results = jsonObject.getJSONArray("results");
if (response.has("results")) {
JSONArray results = response.getJSONArray("results");
for (int i = 0; i < results.length(); i++) {
JSONObject jsonEntry = results.getJSONObject(i);
BibEntry entry = parseJSONtoBibtex(jsonEntry);
Expand All @@ -99,19 +97,17 @@ public String getName() {
return FETCHER_NAME;
}

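/**
* Converts the JSON object describing a single article into a BibEntry.
*
* @param jsonEntry the JSON object of one search result
* @return the BibEntry built from that JSON object
* @throws ParseException if the JSON does not have the expected structure
*/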
private BibEntry parseJSONtoBibtex(JSONObject jsonEntry) throws ParseException {
try {
BibEntry entry = new BibEntry();
EntryType entryType = StandardEntryType.InCollection;
JSONObject biblio = jsonEntry.optJSONObject("biblio");
JSONObject abstracts = jsonEntry.optJSONObject("abstracts");

JSONArray abstracts = jsonEntry.getJSONArray("abstracts");
String foundAbstract = IntStream.range(0, abstracts.length())
.mapToObj(abstracts::getJSONObject)
.map(object -> object.optString("body"))
.findFirst().orElse("");

// publication type
String type = biblio.optString("release_type");
Expand All @@ -132,37 +128,30 @@ private BibEntry parseJSONtoBibtex(JSONObject jsonEntry) throws ParseException {

entry.setField(StandardField.YEAR, String.valueOf(biblio.optInt("release_year")));
entry.setField(StandardField.VOLUME, String.valueOf(biblio.optInt("volume_int")));
entry.setField(StandardField.ABSTRACT, abstracts.optString("body"));
entry.setField(StandardField.ABSTRACT, foundAbstract);

// Date
String dateString = biblio.optString("date");
entry.setField(StandardField.DATE, dateString);

// Authors
// Authors are in contrib_names
if (biblio.has("contrib_names")) {
JSONArray authors = biblio.getJSONArray("contrib_names");
List<String> authorList = new ArrayList<>();
for (int i = 0; i < authors.length(); i++) {
authorList.add(authors.getString(i));
}
AuthorList parsedAuthors = AuthorList.parse(String.join(" and ", authorList));
entry.setField(StandardField.AUTHOR, String.join(" and ", parsedAuthors.getAsFirstLastNamesWithAnd()));
} else {
LOGGER.debug("No author found.");
entry.setField(StandardField.AUTHOR, parsedAuthors.getAsLastFirstNamesWithAnd(false));
}

// ISSN
if (biblio.has("issns")) {
JSONArray issn = biblio.getJSONArray("issns");
List<String> issnList = new ArrayList<>();
for (int i = 0; i < issn.length(); i++) {
issnList.add(issn.getString(i));
}
entry.setField(StandardField.ISSN, String.join(" ", issnList));
} else {
LOGGER.debug("No issns found.");
}

return entry;
} catch (JSONException exception) {
throw new ParseException("ScholarArchive API JSON format has changed", exception);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,64 +6,31 @@
*/
public class ScholarArchiveQueryTransformer extends AbstractQueryTransformer {

/**
 * Supplies the token placed between two query terms that must both match.
 *
 * @return the literal {@code " AND "}, with one space on each side
 */
@Override
protected String getLogicalAndOperator() {
    return " AND ";
}

/**
 * Supplies the token placed between two query terms of which at least one must match.
 *
 * @return the literal {@code " OR "}, with one space on each side
 */
@Override
protected String getLogicalOrOperator() {
    return " OR ";
}

/**
 * Supplies the token that negates the query term following it.
 *
 * @return the literal {@code "NOT "}; unlike the AND/OR operators it has a trailing space only
 */
@Override
protected String getLogicalNotOperator() {
    return "NOT ";
}

/**
 * Maps an author search term onto the {@code contrib_names} field.
 *
 * @param author the author name taken from the query
 * @return the result of {@code createKeyValuePair("contrib_names", author)}
 */
@Override
protected String handleAuthor(String author) {
    return createKeyValuePair("contrib_names", author);
}

/**
 * Maps a title search term onto the {@code title} field.
 *
 * @param title the title text taken from the query
 * @return the result of {@code createKeyValuePair("title", title)}
 */
@Override
protected String handleTitle(String title) {
    return createKeyValuePair("title", title);
}

/**
* Transforms the journal title query segment into a 'container_name' key-value pair for the Scholar Archive query.
*
* @param journalTitle the name of the journal to be searched in the Scholar Archive.
* @return A string query segment representing the journal title search criterion.
*/
@Override
protected String handleJournal(String journalTitle) {
return createKeyValuePair("container_name", journalTitle);
Expand All @@ -90,13 +57,11 @@ protected String handleYear(String year) {
*/
@Override
protected String handleYearRange(String yearRange) {
    // parseYearRange is defined outside this view; presumably it fills startYear and endYear — TODO confirm.
    parseYearRange(yearRange);

    boolean hasUpperBound = endYear != Integer.MAX_VALUE;
    if (hasUpperBound) {
        // Bounded range: emit it as an inclusive range on publication.startDate.
        return "publication.startDate:[" + startYear + " TO " + endYear + "]";
    }
    // endYear == Integer.MAX_VALUE: the input text is passed through unchanged.
    return yearRange;
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,61 +1,49 @@
package org.jabref.logic.importer.fetcher;

import java.util.Collections;
import java.util.List;

import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.testutils.category.FetcherTest;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Answers;
import org.mockito.Mock;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.MockitoAnnotations.openMocks;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests for {@link ScholarArchiveFetcher}.
 * <p>
 * {@code performSearchReturnsExpectedResults} calls the real fetcher, so it needs network access.
 */
@FetcherTest
public class ScholarArchiveFetcherTest {
    private ScholarArchiveFetcher fetcher;
    private BibEntry bibEntry;

    /**
     * Creates the fetcher under test and the entry expected among the search results.
     */
    @BeforeEach
    public void setUp() {
        fetcher = new ScholarArchiveFetcher();
        bibEntry = new BibEntry(StandardEntryType.InCollection)
                .withField(StandardField.TITLE, "Query expansion using associated queries")
                .withField(StandardField.AUTHOR, "Billerbeck, Bodo and Scholer, Falk and Williams, Hugh E. and Zobel, Justin")
                .withField(StandardField.VOLUME, "0")
                .withField(StandardField.DOI, "10.1145/956863.956866")
                .withField(StandardField.JOURNAL, "Proceedings of the twelfth international conference on Information and knowledge management - CIKM '03")
                .withField(StandardField.PUBLISHER, "ACM Press")
                .withField(StandardField.TYPE, "paper-conference")
                .withField(StandardField.YEAR, "2003");
    }

    @Test
    public void getNameReturnsCorrectName() {
        assertEquals("ScholarArchive", fetcher.getName());
    }

    @Test
    public void getParserReturnsNonNullParser() {
        Parser parser = fetcher.getParser();
        // The parser is a lambda, so its runtime class is never Parser.class itself; only check that one is returned.
        assertTrue(parser != null, "getParser() must not return null");
    }

    @Test
    public void performSearchReturnsExpectedResults() throws FetcherException {
        List<BibEntry> fetchedEntries = fetcher.performSearch("query");

        // The expected entry carries no abstract, so drop abstracts before comparing.
        fetchedEntries.forEach(entry -> entry.clearField(StandardField.ABSTRACT));

        // Other hits may be returned as well; only require that the expected entry is among them.
        assertTrue(fetchedEntries.contains(bibEntry));
    }
}

Expand Down

0 comments on commit d036a40

Please sign in to comment.