Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zbMATH: Updates and multiple fixes. #3052

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
258 changes: 175 additions & 83 deletions zbMATH.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
{
"translatorID": "1d84c107-9dbb-4b87-8208-e3632b87889f",
"label": "zbMATH",
"creator": "Philipp Zumstein",
"creator": "Philipp Zumstein and contributors",
"target": "^https?://(www\\.)?zbmath\\.org/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2021-09-10 18:47:46"
"lastUpdated": "2023-06-15 09:57:22"
}

/*
***** BEGIN LICENSE BLOCK *****

zbMATH Translator, Copyright © 2014 Philipp Zumstein
zbMATH Translator, Copyright © 2014 Philipp Zumstein and contributors.
This file is part of Zotero.

Zotero is free software: you can redistribute it and/or modify
Expand All @@ -35,105 +35,163 @@
*/


function detectWeb(doc, _url) {
if (ZU.xpath(doc, '//div[@class="list"]/article').length > 0) {
function detectWeb(doc) {
if (doc.querySelector(".content-result")) { // search results present
return "multiple";
}
else if (ZU.xpath(doc, '//a[contains(@class, "bib")]').length > 0) { // contains
else if (doc.querySelector(".bib")) {
// it is a single entry --> generic fallback = journalArticle
return "journalArticle";
}
return false;
}

function scrape(doc, _url) {
var bibArray = doc.getElementsByClassName("bib");
var bibUrl = bibArray[0].getAttribute('href');// e.g. "bibtex/06115874.bib"
/**
 * Collect the entries of a search-result listing.
 *
 * Each `article` directly under `.content-result .list` is inspected; rows
 * lacking either a link or a non-empty title are ignored.
 *
 * NOTE(review): `attr` and `text` are not defined in this part of the file;
 * they are presumably the usual translator helper shortcuts for reading an
 * attribute / text content of the first match of a selector -- confirm.
 *
 * @param {Document|Element} doc - Node to search for result rows.
 * @param {boolean} checkOnly - When truthy, stop at the first usable row and
 *   return `true` instead of building the full map.
 * @returns {Object<string, string>|boolean} `true` in check-only mode when a
 *   usable row exists; otherwise a map of href -> title, or `false` when no
 *   usable row was found.
 */
function getSearchResults(doc, checkOnly) {
	const items = {};
	let found = false;
	const rows = doc.querySelectorAll(".content-result .list > article");
	for (const row of rows) {
		const href = attr(row, ".title a", "href");
		const title = ZU.trimInternal(text(row, ".title"));
		// A row is only selectable if it has both a target and a label.
		if (!href || !title) continue;
		if (checkOnly) return true;
		found = true;
		items[href] = title;
	}
	return found ? items : false;
}

ZU.doGet(bibUrl, function (text) {
// Z.debug(text);

var trans = Zotero.loadTranslator('import');
trans.setTranslator('9cb70025-a888-4a29-a210-93ec52da40d4');// https://github.com/zotero/translators/blob/master/BibTeX.js
trans.setString(text);
/**
 * Translator entry point: save the current page, or let the user pick
 * entries from a search-result listing and save each of them.
 *
 * @param {Document} doc - The page being translated.
 * @param {string} url - URL of `doc`.
 * @returns {Promise<void>} Resolves once every selected item has been
 *   handed to `scrape`.
 */
async function doWeb(doc, url) {
	if (detectWeb(doc, url) === 'multiple') {
		const items = await Zotero.selectItems(getSearchResults(doc, false));
		// Nothing selected (or selection cancelled): nothing to do.
		if (!items) return;
		// Entries are fetched one after another, in selection order.
		for (const itemURL of Object.keys(items)) {
			// Pass the entry's own URL along: scrape() interpolates it into
			// its debug message, which would otherwise read "undefined".
			await scrape(await requestDocument(itemURL), itemURL);
		}
	}
	else {
		await scrape(doc, url);
	}
}

trans.setHandler('itemDone', function (obj, item) {
item.title = item.title.replace(/\.$/, '');

if (item.publisher) {
var publisherSeparation = item.publisher.indexOf(":");
if (publisherSeparation != -1) {
item.place = item.publisher.substr(0, publisherSeparation);
item.publisher = item.publisher.substr(publisherSeparation + 1);
}
async function scrape(doc, url) {
let bibURL = attr(doc, ".bib", "href");
if (!bibURL) {
Z.debug(`Error: document at ${url} does not contain bibTeX link`);
return;
}
let bibTeXDoc = await ZU.requestText(bibURL);
zoe-translates marked this conversation as resolved.
Show resolved Hide resolved
if (!bibTeXDoc) {
Z.debug(`Error: failed to request BibTeX content at ${bibURL}`);
return;
}

let trans = Zotero.loadTranslator('import');
trans.setTranslator('9cb70025-a888-4a29-a210-93ec52da40d4'); // BibTeX
trans.setString(bibTeXDoc);

trans.setHandler('itemDone', function (obj, item) {
item.title = item.title.replace(/\.$/, '');

if (item.publisher) {
let splitPublisher = item.publisher.split(":");
if (splitPublisher[1]) {
item.place = splitPublisher[0];
item.publisher = splitPublisher[1].trim();
}
}

// The BibTeX contains MSC as keywords. Scrape the textual labels to
// MSC as well as the words under "Keywords".
function cleanText(element) {
return ZU.trimInternal(element.textContent.trim());
}

function pushWord(element) {
let word = cleanText(element);
if (word) {
item.tags.push(word);
}

// keywords are normally not in the bib file, so we take them from the page
// moreover, the meaning of the MSC classification is also only given on the page
if (item.tags.length == 0) {
var keywords = ZU.xpath(doc, '//div[@class="keywords"]/a');
for (var i = 0; i < keywords.length; i++) {
item.tags.push(keywords[i].textContent);
}

// Keywords
doc.querySelectorAll(".keywords a").forEach(pushWord);
// Labels to MSC identifiers. Don't put the replaced MathJax/MathML
// in $ $, because this isn't very useful in tags.
doc.querySelectorAll(".classification td.space")
.forEach(node => pushWord(cleanupMath(node, false/* laTeXify */)));

// add abstract but not review
let abstractOrReview = doc.querySelector(".abstract");
if (abstractOrReview.innerText.trim().indexOf('Summary') == 0) {
// Strip the MathJax/MathML, put in the LaTeX math text, and
// surround them with $ $.
item.abstractNote = cleanupMath(abstractOrReview)
.innerText.trim().replace(/^Summary:\s*/i, "");
}

item.attachments = [{
title: "Snapshot",
document: doc
}];

let zbIDElem = doc.querySelector(".label");
if (zbIDElem) {
let zbID = cleanText(zbIDElem); // Zbl number (see also URL).

if (zbID) {
zbID = zbID.replace(/^Zbl\s*/i, "Zbl: ");
if (!item.extra) {
item.extra = zbID;
}
var classifications = ZU.xpath(doc, '//div[@class="classification"]//tr');
for (let classification of classifications) {
item.extra = (item.extra ? item.extra + "\n" : '') + 'MSC2010: ' + ZU.trimInternal(ZU.xpathText(classification, './td', null, " = "));
else {
item.extra += `\n${zbID}`;
}
}

// add abstract but not review
var abstractOrReview = ZU.xpathText(doc, '//div[@class="abstract"]');
if (abstractOrReview.indexOf('Summary') == 0) {
item.abstractNote = abstractOrReview.replace(/^Summary:?\s*/, '');
}

item.attachments = [{
title: "Snapshot",
document: doc
}];

var id = ZU.xpath(doc, '//div[@class="title"]/a[@class="label"]')[0];
if (id) {
if (!item.extra) item.extra = '';
else item.extra += "\n";

item.extra += 'Zbl: ' + ZU.trimInternal(id.textContent)
.replace(/^\s*Zbl\s+/i, ''); // e.g. Zbl 1255.05045
item.url = id.href;
// zb permalink, cleaner than document URL
item.url = zbIDElem.href;
}

item.complete();
// Z.debug(item);
});

trans.translate();
}

item.complete();
});

trans.translate();
}

// Clean up the MathJaX-rendered text in elements. Returns a clone of the node
// with the duplicate-causing elements removed and the LaTeX math text
// converted to text nodes (surrounded with $ $ if laTeXify = true).
function cleanupMath(element, laTeXify = true) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd want to hear from @dstillman or @AbeJellinek what Zotero's view is on handling LaTeX/MathJaX in fields. It's currently not supported, so adding things like $$ doesn't do any good, but given the nature of the translator it might still make sense?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think generally we just save the LaTeX as is, even though nothing in Zotero will render it. I hadn't thought about it very much but I think I agree with the approach here - try to use the rendered Unicode version of the LaTeX for short fields, keep the LaTeX in the abstract and similar.

let dup = element.cloneNode(true/* deep */);
let doc = dup.ownerDocument;

function doWeb(doc, url) {
if (detectWeb(doc, url) == "multiple") {
var items = {};
var rows = ZU.xpath(doc, '//div[@class="list"]/article');
for (let row of rows) {
var title = ZU.xpathText(row, './div[@class="title"]/a[1]');
var link = ZU.xpathText(row, './div[@class="title"]/a[1]/@href');
items[link] = title;
}
Zotero.selectItems(items, function (items) {
if (items) ZU.processDocuments(Object.keys(items), scrape);
// Delete the rendered MathML whose inner text tends to cause dupes
dup.querySelectorAll(":scope span[id^='MathJax-Element']")
.forEach(node => node.remove());

// Keep "math/tex" "script" tags and convert them to text.
dup.querySelectorAll(":scope script[type='math/tex']")
.forEach((node) => {
let content = node.textContent.trim();
if (laTeXify) {
content = `$${content}$`;
}

let textNode = doc.createTextNode(content);
let parentNode = node.parentNode;

parentNode.replaceChild(textNode, node);
});
}
else {
scrape(doc, url);
}
return dup;
}

/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "https://www.zbmath.org/?q=an:06115874",
"url": "https://zbmath.org/?q=an:06115874",
"detectedItemType": "journalArticle",
"items": [
{
"itemType": "journalArticle",
Expand All @@ -158,16 +216,16 @@ var testCases = [
"date": "2012",
"DOI": "10.1002/rsa.20472",
"ISSN": "1042-9832",
"abstractNote": "We prove that a given tree TT on n vertices with bounded maximum degree is contained asymptotically almost surely in the binomial random graph G(n,(1+ε)lognn)G\\left(n,\\frac {(1+\\varepsilon)\\log n}{n}\\right) provided that TT belongs to one of the following two classes: \n\n(1)TT has linearly many leaves; (2)TT has a path of linear length all of whose vertices have degree two in TT.",
"extra": "MSC2010: 05C05 = Trees\nMSC2010: 05C80 = Random graphs (graph-theoretic aspects)\nZbl: 1255.05045",
"abstractNote": "We prove that a given tree $T$ on n vertices with bounded maximum degree is contained asymptotically almost surely in the binomial random graph $G\\left(n,\\frac {(1+\\varepsilon)\\log n}{n}\\right)$ provided that $T$ belongs to one of the following two classes: \n\n(1)$T$ has linearly many leaves; (2)$T$ has a path of linear length all of whose vertices have degree two in $T$.",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite know what to do about this -- we don't actually support TeX in Zotero fields (other than the new notes), so this is a bit messy, but I'm also not sure what else we could do.

Copy link
Collaborator Author

@zoe-translates zoe-translates Jun 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, we don't, and yes this is a bit messy here. I'd like to hear more thoughts about this too.

Before this change, you can see that the MathJax-rendered elements became "TT" for a one-letter math symbol. It's even worse now: without the change it would become "TTT" under newer MathJax. In addition, more complicated MathML text loses meaning when converted to text in the usual way. For instance, the fraction line was lost, so "log n over n" became "lognn" in the text.

In other words, without further processing, meaning could be easily destroyed, and silently. It's difficult to spot the change from "T" to "TTT" in the wall of text.

So I chose to preserve the LaTeX-y annotation as a substitute, and mark it as such using $ .. $. This at least signals to the reader that there used to be some rendered math here, and the LaTeX source is in principle a lossless substitute.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think this is a reasonable approach. And, honestly, we could probably support math in abstract fields pretty easily (just showing as $…$ in edit mode).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, this is roughly consistent with the arXiv translator's output (e.g. see https://arxiv.org/abs/2306.07357). There, the abstract is handed to us by the OAI API, which is a verbatim copy of what the preprint author puts into that field.

"extra": "Zbl: 1255.05045",
"issue": "4",
"itemID": "zbMATH06115874",
"journalAbbreviation": "Random Struct. Algorithms",
"language": "English",
"libraryCatalog": "zbMATH",
"pages": "391–412",
"publicationTitle": "Random Structures & Algorithms",
"url": "https://www.zbmath.org/?q=an%3A1255.05045",
"url": "https://zbmath.org/1255.05045",
"volume": "41",
"attachments": [
{
Expand All @@ -176,6 +234,18 @@ var testCases = [
}
],
"tags": [
{
"tag": "05C05"
},
{
"tag": "05C80"
},
{
"tag": "Random graphs (graph-theoretic aspects)"
},
{
"tag": "Trees"
},
{
"tag": "random graphs"
},
Expand All @@ -196,12 +266,14 @@ var testCases = [
},
{
"type": "web",
"url": "http://www.zbmath.org/?q=se:00001331+ai:bollobas.bela",
"url": "https://zbmath.org/?q=se:00001331+ai:bollobas.bela",
"detectedItemType": "multiple",
"items": "multiple"
},
{
"type": "web",
"url": "https://zbmath.org/?q=an:06212000",
"detectedItemType": "journalArticle",
"items": [
{
"itemType": "journalArticle",
Expand Down Expand Up @@ -231,15 +303,16 @@ var testCases = [
"date": "2013",
"DOI": "10.1137/090771478",
"ISSN": "0895-4801",
"extra": "MSC2010: 90C27 = Combinatorial optimization\nMSC2010: 05C85 = Graph algorithms (graph-theoretic aspects)\nMSC2010: 91A06 = nn-person games, n>2n>2\nZbl: 1273.90167",
"abstractNote": "We study a natural network creation game, in which each node locally tries to minimize its local diameter or its local average distance to other nodes by swapping one incident edge at a time. The central question is what structure the resulting equilibrium graphs have, in particular, how well they globally minimize diameter. For the local-average-distance version, we prove an upper bound of $2^{O(\\sqrt{\\lg n})}$, a lower bound of 3, and a tight bound of exactly 2 for trees, and give evidence of a general polylogarithmic upper bound. For the local-diameter version, we prove a lower bound of $\\Omega(\\sqrt{n})$ and a tight upper bound of 3 for trees. The same bounds apply, up to constant factors, to the price of anarchy. Our network creation games are closely related to the previously studied unilateral network creation game. The main difference is that our model has no parameter $\\alpha$ for the link creation cost, so our results effectively apply for all values of $\\alpha$ without additional effort; furthermore, equilibrium can be checked in polynomial time in our model, unlike in previous models. Our perspective enables simpler proofs that get at the heart of network creation games.",
"extra": "Zbl: 1273.90167",
"issue": "2",
"itemID": "zbMATH06212000",
"journalAbbreviation": "SIAM J. Discrete Math.",
"language": "English",
"libraryCatalog": "zbMATH",
"pages": "656–668",
"publicationTitle": "SIAM Journal on Discrete Mathematics",
"url": "https://zbmath.org/?q=an%3A1273.90167",
"url": "https://zbmath.org/1273.90167",
"volume": "27",
"attachments": [
{
Expand All @@ -248,12 +321,30 @@ var testCases = [
}
],
"tags": [
{
"tag": "05C85"
},
{
"tag": "90C27"
},
{
"tag": "91A06"
},
{
"tag": "Combinatorial optimization"
},
{
"tag": "Graph algorithms (graph-theoretic aspects)"
},
{
"tag": "equilibrium"
},
{
"tag": "low diameter"
},
{
"tag": "n-person games, n>2"
},
{
"tag": "network creation"
},
Expand All @@ -271,7 +362,8 @@ var testCases = [
},
{
"type": "web",
"url": "http://zbmath.org/?q=cc:35",
"url": "https://zbmath.org/?q=cc:35",
"detectedItemType": "multiple",
"items": "multiple"
}
]
Expand Down