diff --git a/zbMATH.js b/zbMATH.js index 5db03ff5a3..00dbdb9ef4 100644 --- a/zbMATH.js +++ b/zbMATH.js @@ -1,7 +1,7 @@ { "translatorID": "1d84c107-9dbb-4b87-8208-e3632b87889f", "label": "zbMATH", - "creator": "Philipp Zumstein", + "creator": "Philipp Zumstein and contributors", "target": "^https?://(www\\.)?zbmath\\.org/", "minVersion": "3.0", "maxVersion": "", @@ -9,13 +9,13 @@ "inRepository": true, "translatorType": 4, "browserSupport": "gcsibv", - "lastUpdated": "2021-09-10 18:47:46" + "lastUpdated": "2023-06-16 06:21:57" } /* ***** BEGIN LICENSE BLOCK ***** - zbMATH Translator, Copyright © 2014 Philipp Zumstein + zbMATH Translator, Copyright © 2014 Philipp Zumstein and contributors. This file is part of Zotero. Zotero is free software: you can redistribute it and/or modify @@ -35,105 +35,235 @@ */ -function detectWeb(doc, _url) { - if (ZU.xpath(doc, '//div[@class="list"]/article').length > 0) { +function detectWeb(doc) { // returns "multiple", "journalArticle", or false + if (doc.querySelector(".content-result")) { // search results present return "multiple"; } - else if (ZU.xpath(doc, '//a[contains(@class, "bib")]').length > 0) { // contains + else if (doc.querySelector(".bib")) { // it is a single entry --> generic fallback = journalArticle return "journalArticle"; } return false; } -function scrape(doc, _url) { - var bibArray = doc.getElementsByClassName("bib"); - var bibUrl = bibArray[0].getAttribute('href');// e.g. "bibtex/06115874.bib" +function getSearchResults(doc) { // maps each result's href to its cleaned title; false if no usable row + let items = {}; + let found = false; + let rows = doc.querySelectorAll(".content-result .list > article"); + for (let row of rows) { + let href = attr(row, ".title a", "href"); + let titleElem = row.querySelector(".title"); + if (!titleElem) continue; + let title = cleanText(cleanupMath(titleElem, false/* laTeXify*/)); + if (!href || !title) continue; + found = true; + items[href] = title; + } + return found ?
items : false; +} + +async function doWeb(doc, url) { + if (detectWeb(doc, url) == 'multiple') { + let items = await Zotero.selectItems(getSearchResults(doc)); + if (!items) return; + for (let url of Object.keys(items)) { + await scrape(await requestDocument(url), url); // pass the URL so scrape's error message can name the page + } + } + else { + await scrape(doc, url); + } +} - ZU.doGet(bibUrl, function (text) { - // Z.debug(text); - - var trans = Zotero.loadTranslator('import'); - trans.setTranslator('9cb70025-a888-4a29-a210-93ec52da40d4');// https://github.com/zotero/translators/blob/master/BibTeX.js - trans.setString(text); +async function scrape(doc, url) { // fetch the page's BibTeX, import it, then enrich the item from the page DOM + let bibURL = attr(doc, ".bib", "href"); + if (!bibURL) { + Z.debug(`Error: document at ${url} does not contain bibTeX link`); + return; + } + let bibTeXDoc = await requestText(bibURL); + if (!bibTeXDoc) { + Z.debug(`Error: failed to request BibTeX content at ${bibURL}`); + return; + } - trans.setHandler('itemDone', function (obj, item) { - item.title = item.title.replace(/\.$/, ''); - - if (item.publisher) { - var publisherSeparation = item.publisher.indexOf(":"); - if (publisherSeparation != -1) { - item.place = item.publisher.substr(0, publisherSeparation); - item.publisher = item.publisher.substr(publisherSeparation + 1); - } + let trans = Zotero.loadTranslator('import'); + trans.setTranslator('9cb70025-a888-4a29-a210-93ec52da40d4'); // BibTeX + trans.setString(bibTeXDoc); + + trans.setHandler('itemDone', function (obj, item) { + item.title = item.title.replace(/\.$/, ''); + + if (item.publisher) { + let splitPublisher = item.publisher.split(":"); + if (splitPublisher[1]) { + item.place = splitPublisher[0]; + item.publisher = splitPublisher.slice(1).join(":").trim(); // keep everything after the first colon, not just the next segment } - - // keywords are normally not in the bib file, so we take them from the page - // moreover, the meaning of the MSC classification is also only given on the page - if (item.tags.length == 0) { - var keywords = ZU.xpath(doc, '//div[@class="keywords"]/a'); - for (var i = 0; i < keywords.length; i++) { -
item.tags.push(keywords[i].textContent); + } + + // The BibTeX contains MSC as keywords. Scrape the textual labels to + // MSC as well as the words under "Keywords". + function pushWord(element) { + let word = cleanText(element); + if (word) { + item.tags.push(word); + } + } + + // Keywords + doc.querySelectorAll(".keywords a").forEach(pushWord); + // Labels to MSC identifiers. Don't put the replaced MathJax/MathML + // in $ $, because this isn't very useful in tags (brief inline text). + doc.querySelectorAll(".classification td.space") + .forEach(node => pushWord(cleanupMath(node, false/* laTeXify */))); + + // add abstract but not review + let abstractOrReview = doc.querySelector(".abstract"); + if (abstractOrReview && abstractOrReview.innerText.trim().indexOf('Summary') == 0) { // .abstract may be absent; skip instead of throwing before item.complete() + // Strip the MathJax/MathML, put in the LaTeX math text, and + // surround them with $ $. + item.abstractNote = cleanupMath(abstractOrReview) + .innerText.trim().replace(/^Summary:\s*/i, ""); + } + + // Math in the title from the BibTeX data is not well-parsed by the + // import translator; falling back to page-scraping. + let titleNode = doc.querySelector("article .title strong"); + if (titleNode && titleNode.innerText) { + item.title = cleanupMath(titleNode, false) + .innerText.replace(/\.$/, ""); + } + + item.attachments = [{ + title: "Snapshot", + document: doc + }]; + + let zbIDElem = doc.querySelector(".label"); + if (zbIDElem) { + let zbID = cleanText(zbIDElem); // Zbl number (see also URL). + + if (zbID) { + zbID = zbID.replace(/^Zbl\s*/i, "Zbl: "); + if (!item.extra) { + item.extra = zbID; } - var classifications = ZU.xpath(doc, '//div[@class="classification"]//tr'); - for (let classification of classifications) { - item.extra = (item.extra ?
item.extra + "\n" : '') + 'MSC2010: ' + ZU.trimInternal(ZU.xpathText(classification, './td', null, " = ")); + else { + item.extra += `\n${zbID}`; } - } - - // add abstract but not review - var abstractOrReview = ZU.xpathText(doc, '//div[@class="abstract"]'); - if (abstractOrReview.indexOf('Summary') == 0) { - item.abstractNote = abstractOrReview.replace(/^Summary:?\s*/, ''); - } - - item.attachments = [{ - title: "Snapshot", - document: doc - }]; - var id = ZU.xpath(doc, '//div[@class="title"]/a[@class="label"]')[0]; - if (id) { - if (!item.extra) item.extra = ''; - else item.extra += "\n"; - - item.extra += 'Zbl: ' + ZU.trimInternal(id.textContent) - .replace(/^\s*Zbl\s+/i, ''); // e.g. Zbl 1255.05045 - item.url = id.href; + // zb permalink, cleaner than document URL + item.url = zbIDElem.href; } - - item.complete(); - // Z.debug(item); - }); - - trans.translate(); + } + + item.complete(); }); + + trans.translate(); } +// Utility functions + +// Convenience function to clean up the displayed text of a node, suitable for +// inline elements +function cleanText(element) { + return ZU.trimInternal(element.innerText.trim()); +} + +/** + * Clean up the MathJax-rendered text in elements. Returns a cloned node with + * the duplicate-causing elements removed and the LaTeX math text converted to + * text nodes (surrounded with $ $ if laTeXify = true). + * + * NOTE: If the node contains unrendered MathJax source, the surrounding + * delimiters \( and \) will be removed if laTeXify is false, or replaced with + * $ $ if laTeXify is true. + * + * @param {HTMLElement} element - The DOM element node to operate on + * @param {boolean} [laTeXify=true] - Whether to replace the MathML nodes with + * LaTeX source surrounded by $ $. If set to false, the replacement will be the + * inner text of the rendered MathML, which is better suited for brief inline + * math text, such as in titles and tags.
+ * @returns {Node} Cloned node with MathML replaced + */ +function cleanupMath(element, laTeXify = true) { + let dup = element.cloneNode(true/* deep */); + let doc = dup.ownerDocument; + + let deleteNode = node => node.remove(); + let textify = (node, content) => { + let textNode = doc.createTextNode(content); + node.parentNode.replaceChild(textNode, node); + }; -function doWeb(doc, url) { - if (detectWeb(doc, url) == "multiple") { - var items = {}; - var rows = ZU.xpath(doc, '//div[@class="list"]/article'); - for (let row of rows) { - var title = ZU.xpathText(row, './div[@class="title"]/a[1]'); - var link = ZU.xpathText(row, './div[@class="title"]/a[1]/@href'); - items[link] = title; + // Top-level contentful MathML elements. + let baseMathList = dup.querySelectorAll("span[id^='MathJax-Element']"); + if (!baseMathList.length) { + // When we're saving one of the multiples from getSearchResults, even + // if there's math, MathJax code will not be run. In this case, we do a + // text replacement + dup.normalize(); + let walker = doc.createTreeWalker(dup, 4/* SHOW_TEXT */); + let currentNode; + while ((currentNode = walker.nextNode())/* assignment */) { + currentNode.textContent + = currentNode.textContent.replace( + /\\\(|\\\)/g, // \( or \); MathJax source text + laTeXify ? "$" : "", + ); } - Zotero.selectItems(items, function (items) { - if (items) ZU.processDocuments(Object.keys(items), scrape); - }); + return dup; + } + + if (laTeXify) { + // Delete the rendered MathML whose inner text tends to cause dupes + baseMathList.forEach(deleteNode); + + // Keep "math/tex" "script" tags and convert them to text. 
+ dup.querySelectorAll("script[type='math/tex']") + .forEach((node) => { + let content = node.textContent.trim(); + content = `$${content}$`; + textify(node, content); + }); } else { - scrape(doc, url); + // Operation mode is to un-LaTeXify by reusing rendered MathML, + // especially the text nodes + let nodeList = dup.querySelectorAll("span.MJX_Assistive_MathML"); + if (nodeList.length) { + // An accurate and uniform way to extract the MathML content, from + // assistive MathML. The top-level math node is replaced by + // the text so extracted. This works even with SVG renderer. + nodeList.forEach(node => textify(node.parentNode, cleanText(node))); + } + else { + Z.debug("Warning: Assistive MathML turned off; text-conversion results may be inaccurate."); // bad + if (dup.querySelector("span[id^='MathJax-Element-'] svg")) { + Z.debug("Warning: using SVG MathJaX renderer; no text available!"); // worse + } + + // Strip any annotations + dup.querySelectorAll("annotation").forEach(deleteNode); + + baseMathList.forEach(node => textify(node, cleanText(node))); + } + + // Then delete the LaTeX source in