Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

strip out navigational and other superfluous elements #862

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Readability-readerable.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
var REGEXPS = {
// NOTE: These two regular expressions are duplicated in
// Readability.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
unlikelyCandidates: /-ad-|ai2html|banner|combx|comment|community|cover-wrap|credentials|date|hide|hidden|disqus|extra|footer|gdpr|legends|nav|paywall|meta|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|share|sharing|yom-remote|byline|topbar|article-meta|brand|tooltip/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow|header|summary/i,
};

function isNodeVisible(node) {
Expand Down
82 changes: 58 additions & 24 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Readability.prototype = {
DEFAULT_N_TOP_CANDIDATES: 5,

// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre,summary,article,header,main".toUpperCase().split(","),

// The default number of chars an article must have in order to return a result
DEFAULT_CHAR_THRESHOLD: 500,
Expand All @@ -122,17 +122,17 @@ Readability.prototype = {
REGEXPS: {
// NOTE: These two regular expressions are duplicated in
// Readability-readerable.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
unlikelyCandidates: /-ad-|ai2html|banner|combx|comment|community|cover-wrap|credentials|date|hide|hidden|disqus|extra|footer|gdpr|legends|nav|paywall|meta|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|share|sharing|yom-remote|byline|topbar|article-meta|brand|tooltip/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow|header|summary/i,

positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
positive: /article|body|content|entry|header|hentry|h-entry|intro|intro|intro|intro|main|main-article|main-content|page|lead|leading|pagination|primary|post|text|blog|story|summary|strapline/i,
negative: /-ad-|affiliate|credentials|controls|date|desktop|hidden|nav|^hid$| hid$| hid |^hid |hide|banner|login|gate|combx|comment|com-|contact|foot|footer|footnote|gdpr|icon|^icon|icons$|icons|masthead|media|meta|paywall|nav|outbrain|promo|related|scroll|share|sharing|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|tooltip|widget|video-player|video|jw-player|jw-aspect|modal|carousel|overlay|byline|brand|disclosure|nav|logo|account|cart|dock/i,
extraneous: /print|affiliate|archive|button|comment|controls|discuss|e[\-]?mail|meta|icons|share|reply|all|login|sign|single|utility|icons|nav|video-player|jw-player|modal|video|paidcontent|carousel|overlay|social|topbar|article-meta|onetrust-consent-sdk|logo|account|cart|hamburger|traffic|weather|search/i,
byline: /byline|author|dateline|credentials|writtenby|p-author|article-author/i,
Comment on lines +125 to +131
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are some really significant changes, and we'd need some test cases to help explain why we're making them.

Some of the changes also seem like mistakes, e.g. `nav` is in there several times, and `icon` already matches anything that `^icon`, `icons$` or `icons` would match, so those extra alternatives are redundant.

replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
shareElements: /(\b|_)(share|sharedaddy|social|sharebar)(\b|_)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
tokenize: /\W+/g,
Expand All @@ -148,7 +148,10 @@ Readability.prototype = {
jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
},

UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog", "nav" ],

NODES_TO_CLEAN_FIRST: ["object", "embed", "footer", "link", "aside", "nav", ".icons", ".byline", ".sub-nav", ".identity", ".logo", ".video-player", ".jw-player", ".jw-wrapper", ".video", ".byline", ".author", ".dateline", ".credentials", ".writtenby", ".p-author", ".article-author", ".navigation", ".hidden-xs", ".hidden-sm", ".brand", ".modalContent", ".noPrint", ".noprint", ".screenonly", ".breadcrumb", ".breadcrumbs", "amp-iframe", "amp-img", "amp-ad", ".advert", ".ads", ".brand", ".search", ".nav", ".user", ".users", "#onetrust-consent-sdk", "#branding", "#branding-content" ],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of these aren't node names (they are class or id selectors such as `.icons` or `#branding`), so `_clean` won't actually remove them, I think?

It would also be really helpful to better understand what the source of this list of elements is.

NODES_TO_CLEAN_SECOND: [ "iframe", "input", "textarea", "select", "button", "svg"],

DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]),

Expand Down Expand Up @@ -679,11 +682,9 @@ Readability.prototype = {
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
this._clean(articleContent, "aside");
this.NODES_TO_CLEAN_FIRST.forEach((el) => {
this._clean(articleContent, el);
});

// Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even if they have "share".
Expand All @@ -696,11 +697,9 @@ Readability.prototype = {
});
});

this._clean(articleContent, "iframe");
this._clean(articleContent, "input");
this._clean(articleContent, "textarea");
this._clean(articleContent, "select");
this._clean(articleContent, "button");
this.NODES_TO_CLEAN_SECOND.forEach((el) => {
this._clean(articleContent, el);
});
this._cleanHeaders(articleContent);

// Do these last as the previous stuff may have removed junk
Expand All @@ -709,6 +708,13 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");

// Scale down h2-h5 because they are too large most of the time (intros in h2, etc.)
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h5"]), "h6");
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h4"]), "h5");
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h3"]), "h4");
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h2"]), "h3");
Comment on lines +711 to +715
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment doesn't seem like a great reason to change the semantics of the headers - the "right" solution would probably be to revert intro paragraphs into, well, paragraphs, instead of headers, if they meet some threshold / algorithm.



// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");

Expand Down Expand Up @@ -756,6 +762,9 @@ Readability.prototype = {

switch (node.tagName) {
case "DIV":
case "MAIN":
case "HEADER":
case "ARTICLE":
node.readability.contentScore += 5;
break;

Expand Down Expand Up @@ -826,6 +835,8 @@ Readability.prototype = {
// works the way that it splits both texts into words and then finds words that are unique in second text
// the result is given by the lower length of unique parts
_textSimilarity: function(textA, textB) {
if (!textA || !textB) return 0;
if (Math.abs(textA.length - textB.length) > 25) return 0;
var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
if (!tokensA.length || !tokensB.length) {
Expand Down Expand Up @@ -885,6 +896,11 @@ Readability.prototype = {
return null;
}

var fullArticleText = document.body.innerText;
if(fullArticleText.length) {
fullArticleText = fullArticleText.split(/[\r\n]+/).filter((el) => el.length > 50);
}

var pageCacheHtml = page.innerHTML;

while (true) {
Expand All @@ -896,15 +912,15 @@ Readability.prototype = {
// used inappropriately (as in, where they contain no other block level elements.)
var elementsToScore = [];
var node = this._doc.documentElement;

let shouldRemoveTitleHeader = true;

while (node) {

if (node.tagName === "HTML") {
this._articleLang = node.getAttribute("lang");
}

var matchString = node.className + " " + node.id;

if (!this._isProbablyVisible(node)) {
Expand Down Expand Up @@ -1013,6 +1029,8 @@ Readability.prototype = {
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
var elementsCounter = 0;

this._forEachNode(elementsToScore, function(elementToScore) {
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
return;
Expand All @@ -1026,9 +1044,11 @@ Readability.prototype = {
var ancestors = this._getNodeAncestors(elementToScore, 5);
if (ancestors.length === 0)
return;

elementsCounter++;

var contentScore = 0;

// Add a point for the paragraph itself as a base.
contentScore += 1;

Expand All @@ -1037,6 +1057,20 @@ Readability.prototype = {

// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);

if(innerText.length > 100 && elementsCounter < 10)
fullArticleText.forEach((el) => {
if (el.length > 5 && innerText.indexOf(el) != -1) {
var extra = Math.max(Math.max(0, 10 * (10 - elementsCounter)), 10);
// console.log('add ', extra, innerText);
contentScore += extra;
}
});

// extra score for headers
if(elementToScore.tagName && elementToScore.tagName.length == 2 && elementToScore.tagName.toLowerCase().startsWith('h')) {
contentScore += 100;
}

// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
Expand Down Expand Up @@ -1546,7 +1580,7 @@ Readability.prototype = {

// get article published time
metadata.publishedTime = jsonld.datePublished ||
values["article:published_time"] || null;
values["article:published_time"] || null;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a mistake, as this line is a continuation of the line before.


// in many sites the meta value is escaped with HTML entities,
// so here we need to unescape it
Expand Down Expand Up @@ -2304,7 +2338,7 @@ Readability.prototype = {
excerpt: metadata.excerpt,
siteName: metadata.siteName || this._articleSiteName,
publishedTime: metadata.publishedTime
};
};
}
};

Expand Down