Spaces:
Running
Running
/* | |
* Copyright (c) 2010 Arc90 Inc | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
/* | |
* This code is heavily based on Arc90's readability.js (1.7.1) script | |
* available at: http://code.google.com/p/arc90labs-readability | |
*/ | |
/** | |
* Public constructor. | |
* @param {HTMLDocument} doc The document to parse. | |
* @param {Object} options The options object. | |
*/ | |
function Readability(doc, options) { | |
// In some older versions, people passed a URI as the first argument. Cope: | |
if (options && options.documentElement) { | |
doc = options; | |
options = arguments[2]; | |
} else if (!doc || !doc.documentElement) { | |
throw new Error("First argument to Readability constructor should be a document object."); | |
} | |
options = options || {}; | |
this._doc = doc; | |
this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; | |
this._articleTitle = null; | |
this._articleByline = null; | |
this._articleDir = null; | |
this._articleSiteName = null; | |
this._attempts = []; | |
// Configurable options | |
this._debug = !!options.debug; | |
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; | |
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; | |
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; | |
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); | |
this._keepClasses = !!options.keepClasses; | |
this._serializer = options.serializer || function(el) { | |
return el.innerHTML; | |
}; | |
this._disableJSONLD = !!options.disableJSONLD; | |
this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; | |
// Start with all flags set | |
this._flags = this.FLAG_STRIP_UNLIKELYS | | |
this.FLAG_WEIGHT_CLASSES | | |
this.FLAG_CLEAN_CONDITIONALLY; | |
// Control whether log messages are sent to the console | |
if (this._debug) { | |
let logNode = function(node) { | |
if (node.nodeType == node.TEXT_NODE) { | |
return `${node.nodeName} ("${node.textContent}")`; | |
} | |
let attrPairs = Array.from(node.attributes || [], function(attr) { | |
return `${attr.name}="${attr.value}"`; | |
}).join(" "); | |
return `<${node.localName} ${attrPairs}>`; | |
}; | |
this.log = function () { | |
if (typeof console !== "undefined") { | |
let args = Array.from(arguments, arg => { | |
if (arg && arg.nodeType == this.ELEMENT_NODE) { | |
return logNode(arg); | |
} | |
return arg; | |
}); | |
args.unshift("Reader: (Readability)"); | |
console.log.apply(console, args); | |
} else if (typeof dump !== "undefined") { | |
/* global dump */ | |
var msg = Array.prototype.map.call(arguments, function(x) { | |
return (x && x.nodeName) ? logNode(x) : x; | |
}).join(" "); | |
dump("Reader: (Readability) " + msg + "\n"); | |
} | |
}; | |
} else { | |
this.log = function () {}; | |
} | |
} | |
Readability.prototype = { | |
FLAG_STRIP_UNLIKELYS: 0x1, | |
FLAG_WEIGHT_CLASSES: 0x2, | |
FLAG_CLEAN_CONDITIONALLY: 0x4, | |
// https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType | |
ELEMENT_NODE: 1, | |
TEXT_NODE: 3, | |
// Max number of nodes supported by this parser. Default: 0 (no limit) | |
DEFAULT_MAX_ELEMS_TO_PARSE: 0, | |
// The number of top candidates to consider when analysing how | |
// tight the competition is among candidates. | |
DEFAULT_N_TOP_CANDIDATES: 5, | |
// Element tags to score by default. | |
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), | |
// The default number of chars an article must have in order to return a result | |
DEFAULT_CHAR_THRESHOLD: 500, | |
// All of the regular expressions in use within readability. | |
// Defined up here so we don't instantiate them repeatedly in loops. | |
REGEXPS: { | |
// NOTE: These two regular expressions are duplicated in | |
// Readability-readerable.js. Please keep both copies in sync. | |
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, | |
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, | |
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, | |
negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, | |
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, | |
byline: /byline|author|dateline|writtenby|p-author/i, | |
replaceFonts: /<(\/?)font[^>]*>/gi, | |
normalize: /\s{2,}/g, | |
videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, | |
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, | |
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, | |
prevLink: /(prev|earl|old|new|<|«)/i, | |
tokenize: /\W+/g, | |
whitespace: /^\s*$/, | |
hasContent: /\S$/, | |
hashUrl: /^#.+/, | |
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, | |
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, | |
// Commas as used in Latin, Sindhi, Chinese and various other scripts. | |
// see: https://en.wikipedia.org/wiki/Comma#Comma_variants | |
commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g, | |
// See: https://schema.org/Article | |
jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ | |
}, | |
UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], | |
DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), | |
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], | |
PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], | |
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], | |
// The commented out elements qualify as phrasing content but tend to be | |
// removed by readability when put into paragraphs, so we ignore them here. | |
PHRASING_ELEMS: [ | |
// "CANVAS", "IFRAME", "SVG", "VIDEO", | |
"ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", | |
"DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", | |
"MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", | |
"RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", | |
"SUP", "TEXTAREA", "TIME", "VAR", "WBR" | |
], | |
// These are the classes that readability sets itself. | |
CLASSES_TO_PRESERVE: [ "page" ], | |
// These are the list of HTML entities that need to be escaped. | |
HTML_ESCAPE_MAP: { | |
"lt": "<", | |
"gt": ">", | |
"amp": "&", | |
"quot": '"', | |
"apos": "'", | |
}, | |
/** | |
* Run any post-process modifications to article content as necessary. | |
* | |
* @param Element | |
* @return void | |
**/ | |
_postProcessContent: function(articleContent) { | |
// Readability cannot open relative uris so we convert them to absolute uris. | |
this._fixRelativeUris(articleContent); | |
this._simplifyNestedElements(articleContent); | |
if (!this._keepClasses) { | |
// Remove classes. | |
this._cleanClasses(articleContent); | |
} | |
}, | |
/** | |
* Iterates over a NodeList, calls `filterFn` for each node and removes node | |
* if function returned `true`. | |
* | |
* If function is not passed, removes all the nodes in node list. | |
* | |
* @param NodeList nodeList The nodes to operate on | |
* @param Function filterFn the function to use as a filter | |
* @return void | |
*/ | |
_removeNodes: function(nodeList, filterFn) { | |
// Avoid ever operating on live node lists. | |
if (this._docJSDOMParser && nodeList._isLiveNodeList) { | |
throw new Error("Do not pass live node lists to _removeNodes"); | |
} | |
for (var i = nodeList.length - 1; i >= 0; i--) { | |
var node = nodeList[i]; | |
var parentNode = node.parentNode; | |
if (parentNode) { | |
if (!filterFn || filterFn.call(this, node, i, nodeList)) { | |
parentNode.removeChild(node); | |
} | |
} | |
} | |
}, | |
/** | |
* Iterates over a NodeList, and calls _setNodeTag for each node. | |
* | |
* @param NodeList nodeList The nodes to operate on | |
* @param String newTagName the new tag name to use | |
* @return void | |
*/ | |
_replaceNodeTags: function(nodeList, newTagName) { | |
// Avoid ever operating on live node lists. | |
if (this._docJSDOMParser && nodeList._isLiveNodeList) { | |
throw new Error("Do not pass live node lists to _replaceNodeTags"); | |
} | |
for (const node of nodeList) { | |
this._setNodeTag(node, newTagName); | |
} | |
}, | |
/** | |
* Iterate over a NodeList, which doesn't natively fully implement the Array | |
* interface. | |
* | |
* For convenience, the current object context is applied to the provided | |
* iterate function. | |
* | |
* @param NodeList nodeList The NodeList. | |
* @param Function fn The iterate function. | |
* @return void | |
*/ | |
_forEachNode: function(nodeList, fn) { | |
Array.prototype.forEach.call(nodeList, fn, this); | |
}, | |
/** | |
* Iterate over a NodeList, and return the first node that passes | |
* the supplied test function | |
* | |
* For convenience, the current object context is applied to the provided | |
* test function. | |
* | |
* @param NodeList nodeList The NodeList. | |
* @param Function fn The test function. | |
* @return void | |
*/ | |
_findNode: function(nodeList, fn) { | |
return Array.prototype.find.call(nodeList, fn, this); | |
}, | |
/** | |
* Iterate over a NodeList, return true if any of the provided iterate | |
* function calls returns true, false otherwise. | |
* | |
* For convenience, the current object context is applied to the | |
* provided iterate function. | |
* | |
* @param NodeList nodeList The NodeList. | |
* @param Function fn The iterate function. | |
* @return Boolean | |
*/ | |
_someNode: function(nodeList, fn) { | |
return Array.prototype.some.call(nodeList, fn, this); | |
}, | |
/** | |
* Iterate over a NodeList, return true if all of the provided iterate | |
* function calls return true, false otherwise. | |
* | |
* For convenience, the current object context is applied to the | |
* provided iterate function. | |
* | |
* @param NodeList nodeList The NodeList. | |
* @param Function fn The iterate function. | |
* @return Boolean | |
*/ | |
_everyNode: function(nodeList, fn) { | |
return Array.prototype.every.call(nodeList, fn, this); | |
}, | |
/** | |
* Concat all nodelists passed as arguments. | |
* | |
* @return ...NodeList | |
* @return Array | |
*/ | |
_concatNodeLists: function() { | |
var slice = Array.prototype.slice; | |
var args = slice.call(arguments); | |
var nodeLists = args.map(function(list) { | |
return slice.call(list); | |
}); | |
return Array.prototype.concat.apply([], nodeLists); | |
}, | |
_getAllNodesWithTag: function(node, tagNames) { | |
if (node.querySelectorAll) { | |
return node.querySelectorAll(tagNames.join(",")); | |
} | |
return [].concat.apply([], tagNames.map(function(tag) { | |
var collection = node.getElementsByTagName(tag); | |
return Array.isArray(collection) ? collection : Array.from(collection); | |
})); | |
}, | |
/** | |
* Removes the class="" attribute from every element in the given | |
* subtree, except those that match CLASSES_TO_PRESERVE and | |
* the classesToPreserve array from the options object. | |
* | |
* @param Element | |
* @return void | |
*/ | |
_cleanClasses: function(node) { | |
var classesToPreserve = this._classesToPreserve; | |
var className = (node.getAttribute("class") || "") | |
.split(/\s+/) | |
.filter(function(cls) { | |
return classesToPreserve.indexOf(cls) != -1; | |
}) | |
.join(" "); | |
if (className) { | |
node.setAttribute("class", className); | |
} else { | |
node.removeAttribute("class"); | |
} | |
for (node = node.firstElementChild; node; node = node.nextElementSibling) { | |
this._cleanClasses(node); | |
} | |
}, | |
/** | |
* Converts each <a> and <img> uri in the given element to an absolute URI, | |
* ignoring #ref URIs. | |
* | |
* @param Element | |
* @return void | |
*/ | |
_fixRelativeUris: function(articleContent) { | |
var baseURI = this._doc.baseURI; | |
var documentURI = this._doc.documentURI; | |
function toAbsoluteURI(uri) { | |
// Leave hash links alone if the base URI matches the document URI: | |
if (baseURI == documentURI && uri.charAt(0) == "#") { | |
return uri; | |
} | |
// Otherwise, resolve against base URI: | |
try { | |
return new URL(uri, baseURI).href; | |
} catch (ex) { | |
// Something went wrong, just return the original: | |
} | |
return uri; | |
} | |
var links = this._getAllNodesWithTag(articleContent, ["a"]); | |
this._forEachNode(links, function(link) { | |
var href = link.getAttribute("href"); | |
if (href) { | |
// Remove links with javascript: URIs, since | |
// they won't work after scripts have been removed from the page. | |
if (href.indexOf("javascript:") === 0) { | |
// if the link only contains simple text content, it can be converted to a text node | |
if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { | |
var text = this._doc.createTextNode(link.textContent); | |
link.parentNode.replaceChild(text, link); | |
} else { | |
// if the link has multiple children, they should all be preserved | |
var container = this._doc.createElement("span"); | |
while (link.firstChild) { | |
container.appendChild(link.firstChild); | |
} | |
link.parentNode.replaceChild(container, link); | |
} | |
} else { | |
link.setAttribute("href", toAbsoluteURI(href)); | |
} | |
} | |
}); | |
var medias = this._getAllNodesWithTag(articleContent, [ | |
"img", "picture", "figure", "video", "audio", "source" | |
]); | |
this._forEachNode(medias, function(media) { | |
var src = media.getAttribute("src"); | |
var poster = media.getAttribute("poster"); | |
var srcset = media.getAttribute("srcset"); | |
if (src) { | |
media.setAttribute("src", toAbsoluteURI(src)); | |
} | |
if (poster) { | |
media.setAttribute("poster", toAbsoluteURI(poster)); | |
} | |
if (srcset) { | |
var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { | |
return toAbsoluteURI(p1) + (p2 || "") + p3; | |
}); | |
media.setAttribute("srcset", newSrcset); | |
} | |
}); | |
}, | |
_simplifyNestedElements: function(articleContent) { | |
var node = articleContent; | |
while (node) { | |
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { | |
if (this._isElementWithoutContent(node)) { | |
node = this._removeAndGetNext(node); | |
continue; | |
} else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { | |
var child = node.children[0]; | |
for (var i = 0; i < node.attributes.length; i++) { | |
child.setAttribute(node.attributes[i].name, node.attributes[i].value); | |
} | |
node.parentNode.replaceChild(child, node); | |
node = child; | |
continue; | |
} | |
} | |
node = this._getNextNode(node); | |
} | |
}, | |
/** | |
* Get the article title as an H1. | |
* | |
* @return string | |
**/ | |
_getArticleTitle: function() { | |
var doc = this._doc; | |
var curTitle = ""; | |
var origTitle = ""; | |
try { | |
curTitle = origTitle = doc.title.trim(); | |
// If they had an element with id "title" in their HTML | |
if (typeof curTitle !== "string") | |
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); | |
} catch (e) {/* ignore exceptions setting the title. */} | |
var titleHadHierarchicalSeparators = false; | |
function wordCount(str) { | |
return str.split(/\s+/).length; | |
} | |
// If there's a separator in the title, first remove the final part | |
if ((/ [\|\-\\\/>»] /).test(curTitle)) { | |
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); | |
curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); | |
// If the resulting title is too short (3 words or fewer), remove | |
// the first part instead: | |
if (wordCount(curTitle) < 3) | |
curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); | |
} else if (curTitle.indexOf(": ") !== -1) { | |
// Check if we have an heading containing this exact string, so we | |
// could assume it's the full title. | |
var headings = this._concatNodeLists( | |
doc.getElementsByTagName("h1"), | |
doc.getElementsByTagName("h2") | |
); | |
var trimmedTitle = curTitle.trim(); | |
var match = this._someNode(headings, function(heading) { | |
return heading.textContent.trim() === trimmedTitle; | |
}); | |
// If we don't, let's extract the title out of the original title string. | |
if (!match) { | |
curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); | |
// If the title is now too short, try the first colon instead: | |
if (wordCount(curTitle) < 3) { | |
curTitle = origTitle.substring(origTitle.indexOf(":") + 1); | |
// But if we have too many words before the colon there's something weird | |
// with the titles and the H tags so let's just use the original title instead | |
} else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { | |
curTitle = origTitle; | |
} | |
} | |
} else if (curTitle.length > 150 || curTitle.length < 15) { | |
var hOnes = doc.getElementsByTagName("h1"); | |
if (hOnes.length === 1) | |
curTitle = this._getInnerText(hOnes[0]); | |
} | |
curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); | |
// If we now have 4 words or fewer as our title, and either no | |
// 'hierarchical' separators (\, /, > or ») were found in the original | |
// title or we decreased the number of words by more than 1 word, use | |
// the original title. | |
var curTitleWordCount = wordCount(curTitle); | |
if (curTitleWordCount <= 4 && | |
(!titleHadHierarchicalSeparators || | |
curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { | |
curTitle = origTitle; | |
} | |
return curTitle; | |
}, | |
/** | |
* Prepare the HTML document for readability to scrape it. | |
* This includes things like stripping javascript, CSS, and handling terrible markup. | |
* | |
* @return void | |
**/ | |
_prepDocument: function() { | |
var doc = this._doc; | |
// Remove all style tags in head | |
this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); | |
if (doc.body) { | |
this._replaceBrs(doc.body); | |
} | |
this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); | |
}, | |
/** | |
* Finds the next node, starting from the given node, and ignoring | |
* whitespace in between. If the given node is an element, the same node is | |
* returned. | |
*/ | |
_nextNode: function (node) { | |
var next = node; | |
while (next | |
&& (next.nodeType != this.ELEMENT_NODE) | |
&& this.REGEXPS.whitespace.test(next.textContent)) { | |
next = next.nextSibling; | |
} | |
return next; | |
}, | |
/** | |
* Replaces 2 or more successive <br> elements with a single <p>. | |
* Whitespace between <br> elements are ignored. For example: | |
* <div>foo<br>bar<br> <br><br>abc</div> | |
* will become: | |
* <div>foo<br>bar<p>abc</p></div> | |
*/ | |
_replaceBrs: function (elem) { | |
this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) { | |
var next = br.nextSibling; | |
// Whether 2 or more <br> elements have been found and replaced with a | |
// <p> block. | |
var replaced = false; | |
// If we find a <br> chain, remove the <br>s until we hit another node | |
// or non-whitespace. This leaves behind the first <br> in the chain | |
// (which will be replaced with a <p> later). | |
while ((next = this._nextNode(next)) && (next.tagName == "BR")) { | |
replaced = true; | |
var brSibling = next.nextSibling; | |
next.parentNode.removeChild(next); | |
next = brSibling; | |
} | |
// If we removed a <br> chain, replace the remaining <br> with a <p>. Add | |
// all sibling nodes as children of the <p> until we hit another <br> | |
// chain. | |
if (replaced) { | |
var p = this._doc.createElement("p"); | |
br.parentNode.replaceChild(p, br); | |
next = p.nextSibling; | |
while (next) { | |
// If we've hit another <br><br>, we're done adding children to this <p>. | |
if (next.tagName == "BR") { | |
var nextElem = this._nextNode(next.nextSibling); | |
if (nextElem && nextElem.tagName == "BR") | |
break; | |
} | |
if (!this._isPhrasingContent(next)) | |
break; | |
// Otherwise, make this node a child of the new <p>. | |
var sibling = next.nextSibling; | |
p.appendChild(next); | |
next = sibling; | |
} | |
while (p.lastChild && this._isWhitespace(p.lastChild)) { | |
p.removeChild(p.lastChild); | |
} | |
if (p.parentNode.tagName === "P") | |
this._setNodeTag(p.parentNode, "DIV"); | |
} | |
}); | |
}, | |
_setNodeTag: function (node, tag) { | |
this.log("_setNodeTag", node, tag); | |
if (this._docJSDOMParser) { | |
node.localName = tag.toLowerCase(); | |
node.tagName = tag.toUpperCase(); | |
return node; | |
} | |
var replacement = node.ownerDocument.createElement(tag); | |
while (node.firstChild) { | |
replacement.appendChild(node.firstChild); | |
} | |
node.parentNode.replaceChild(replacement, node); | |
if (node.readability) | |
replacement.readability = node.readability; | |
for (var i = 0; i < node.attributes.length; i++) { | |
try { | |
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); | |
} catch (ex) { | |
/* it's possible for setAttribute() to throw if the attribute name | |
* isn't a valid XML Name. Such attributes can however be parsed from | |
* source in HTML docs, see https://github.com/whatwg/html/issues/4275, | |
* so we can hit them here and then throw. We don't care about such | |
* attributes so we ignore them. | |
*/ | |
} | |
} | |
return replacement; | |
}, | |
/** | |
* Prepare the article node for display. Clean out any inline styles, | |
* iframes, forms, strip extraneous <p> tags, etc. | |
* | |
* @param Element | |
* @return void | |
**/ | |
_prepArticle: function(articleContent) { | |
this._cleanStyles(articleContent); | |
// Check for data tables before we continue, to avoid removing items in | |
// those tables, which will often be isolated even though they're | |
// visually linked to other content-ful elements (text, images, etc.). | |
this._markDataTables(articleContent); | |
this._fixLazyImages(articleContent); | |
// Clean out junk from the article content | |
this._cleanConditionally(articleContent, "form"); | |
this._cleanConditionally(articleContent, "fieldset"); | |
this._clean(articleContent, "object"); | |
this._clean(articleContent, "embed"); | |
this._clean(articleContent, "footer"); | |
this._clean(articleContent, "link"); | |
this._clean(articleContent, "aside"); | |
// Clean out elements with little content that have "share" in their id/class combinations from final top candidates, | |
// which means we don't remove the top candidates even they have "share". | |
var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; | |
this._forEachNode(articleContent.children, function (topCandidate) { | |
this._cleanMatchedNodes(topCandidate, function (node, matchString) { | |
return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; | |
}); | |
}); | |
this._clean(articleContent, "iframe"); | |
this._clean(articleContent, "input"); | |
this._clean(articleContent, "textarea"); | |
this._clean(articleContent, "select"); | |
this._clean(articleContent, "button"); | |
this._cleanHeaders(articleContent); | |
// Do these last as the previous stuff may have removed junk | |
// that will affect these | |
this._cleanConditionally(articleContent, "table"); | |
this._cleanConditionally(articleContent, "ul"); | |
this._cleanConditionally(articleContent, "div"); | |
// replace H1 with H2 as H1 should be only title that is displayed separately | |
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); | |
// Remove extra paragraphs | |
this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { | |
var imgCount = paragraph.getElementsByTagName("img").length; | |
var embedCount = paragraph.getElementsByTagName("embed").length; | |
var objectCount = paragraph.getElementsByTagName("object").length; | |
// At this point, nasty iframes have been removed, only remain embedded video ones. | |
var iframeCount = paragraph.getElementsByTagName("iframe").length; | |
var totalCount = imgCount + embedCount + objectCount + iframeCount; | |
return totalCount === 0 && !this._getInnerText(paragraph, false); | |
}); | |
this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { | |
var next = this._nextNode(br.nextSibling); | |
if (next && next.tagName == "P") | |
br.parentNode.removeChild(br); | |
}); | |
// Remove single-cell tables | |
this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { | |
var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; | |
if (this._hasSingleTagInsideElement(tbody, "TR")) { | |
var row = tbody.firstElementChild; | |
if (this._hasSingleTagInsideElement(row, "TD")) { | |
var cell = row.firstElementChild; | |
cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); | |
table.parentNode.replaceChild(cell, table); | |
} | |
} | |
}); | |
}, | |
/** | |
* Initialize a node with the readability object. Also checks the | |
* className/id for special names to add to its score. | |
* | |
* @param Element | |
* @return void | |
**/ | |
_initializeNode: function(node) { | |
node.readability = {"contentScore": 0}; | |
switch (node.tagName) { | |
case "DIV": | |
node.readability.contentScore += 5; | |
break; | |
case "PRE": | |
case "TD": | |
case "BLOCKQUOTE": | |
node.readability.contentScore += 3; | |
break; | |
case "ADDRESS": | |
case "OL": | |
case "UL": | |
case "DL": | |
case "DD": | |
case "DT": | |
case "LI": | |
case "FORM": | |
node.readability.contentScore -= 3; | |
break; | |
case "H1": | |
case "H2": | |
case "H3": | |
case "H4": | |
case "H5": | |
case "H6": | |
case "TH": | |
node.readability.contentScore -= 5; | |
break; | |
} | |
node.readability.contentScore += this._getClassWeight(node); | |
}, | |
_removeAndGetNext: function(node) { | |
var nextNode = this._getNextNode(node, true); | |
node.parentNode.removeChild(node); | |
return nextNode; | |
}, | |
/** | |
* Traverse the DOM from node to node, starting at the node passed in. | |
* Pass true for the second parameter to indicate this node itself | |
* (and its kids) are going away, and we want the next node over. | |
* | |
* Calling this in a loop will traverse the DOM depth-first. | |
*/ | |
_getNextNode: function(node, ignoreSelfAndKids) { | |
// First check for kids if those aren't being ignored | |
if (!ignoreSelfAndKids && node.firstElementChild) { | |
return node.firstElementChild; | |
} | |
// Then for siblings... | |
if (node.nextElementSibling) { | |
return node.nextElementSibling; | |
} | |
// And finally, move up the parent chain *and* find a sibling | |
// (because this is depth-first traversal, we will have already | |
// seen the parent nodes themselves). | |
do { | |
node = node.parentNode; | |
} while (node && !node.nextElementSibling); | |
return node && node.nextElementSibling; | |
}, | |
// compares second text to first one | |
// 1 = same text, 0 = completely different text | |
// works the way that it splits both texts into words and then finds words that are unique in second text | |
// the result is given by the lower length of unique parts | |
_textSimilarity: function(textA, textB) { | |
var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); | |
var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); | |
if (!tokensA.length || !tokensB.length) { | |
return 0; | |
} | |
var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); | |
var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; | |
return 1 - distanceB; | |
}, | |
_checkByline: function(node, matchString) { | |
if (this._articleByline) { | |
return false; | |
} | |
if (node.getAttribute !== undefined) { | |
var rel = node.getAttribute("rel"); | |
var itemprop = node.getAttribute("itemprop"); | |
} | |
if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { | |
this._articleByline = node.textContent.trim(); | |
return true; | |
} | |
return false; | |
}, | |
_getNodeAncestors: function(node, maxDepth) { | |
maxDepth = maxDepth || 0; | |
var i = 0, ancestors = []; | |
while (node.parentNode) { | |
ancestors.push(node.parentNode); | |
if (maxDepth && ++i === maxDepth) | |
break; | |
node = node.parentNode; | |
} | |
return ancestors; | |
}, | |
/*** | |
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | |
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | |
* | |
* @param page a document to run upon. Needs to be a full document, complete with body. | |
* @return Element | |
**/ | |
_grabArticle: function (page) { | |
this.log("**** grabArticle ****"); | |
var doc = this._doc; | |
var isPaging = page !== null; | |
page = page ? page : this._doc.body; | |
// We can't grab an article if we don't have a page! | |
if (!page) { | |
this.log("No body found in document. Abort."); | |
return null; | |
} | |
var pageCacheHtml = page.innerHTML; | |
while (true) { | |
this.log("Starting grabArticle loop"); | |
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); | |
// First, node prepping. Trash nodes that look cruddy (like ones with the | |
// class name "comment", etc), and turn divs into P tags where they have been | |
// used inappropriately (as in, where they contain no other block level elements.) | |
var elementsToScore = []; | |
var node = this._doc.documentElement; | |
let shouldRemoveTitleHeader = true; | |
while (node) { | |
if (node.tagName === "HTML") { | |
this._articleLang = node.getAttribute("lang"); | |
} | |
var matchString = node.className + " " + node.id; | |
if (!this._isProbablyVisible(node)) { | |
this.log("Removing hidden node - " + matchString); | |
node = this._removeAndGetNext(node); | |
continue; | |
} | |
// User is not able to see elements applied with both "aria-modal = true" and "role = dialog" | |
if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") { | |
node = this._removeAndGetNext(node); | |
continue; | |
} | |
// Check to see if this node is a byline, and remove it if it is. | |
if (this._checkByline(node, matchString)) { | |
node = this._removeAndGetNext(node); | |
continue; | |
} | |
if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { | |
this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); | |
shouldRemoveTitleHeader = false; | |
node = this._removeAndGetNext(node); | |
continue; | |
} | |
// Remove unlikely candidates | |
if (stripUnlikelyCandidates) { | |
if (this.REGEXPS.unlikelyCandidates.test(matchString) && | |
!this.REGEXPS.okMaybeItsACandidate.test(matchString) && | |
!this._hasAncestorTag(node, "table") && | |
!this._hasAncestorTag(node, "code") && | |
node.tagName !== "BODY" && | |
node.tagName !== "A") { | |
this.log("Removing unlikely candidate - " + matchString); | |
node = this._removeAndGetNext(node); | |
continue; | |
} | |
if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { | |
this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); | |
node = this._removeAndGetNext(node); | |
continue; | |
} | |
} | |
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). | |
if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || | |
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || | |
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && | |
this._isElementWithoutContent(node)) { | |
node = this._removeAndGetNext(node); | |
continue; | |
} | |
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { | |
elementsToScore.push(node); | |
} | |
// Turn all divs that don't have children block level elements into p's | |
if (node.tagName === "DIV") { | |
// Put phrasing content into paragraphs. | |
var p = null; | |
var childNode = node.firstChild; | |
while (childNode) { | |
var nextSibling = childNode.nextSibling; | |
if (this._isPhrasingContent(childNode)) { | |
if (p !== null) { | |
p.appendChild(childNode); | |
} else if (!this._isWhitespace(childNode)) { | |
p = doc.createElement("p"); | |
node.replaceChild(p, childNode); | |
p.appendChild(childNode); | |
} | |
} else if (p !== null) { | |
while (p.lastChild && this._isWhitespace(p.lastChild)) { | |
p.removeChild(p.lastChild); | |
} | |
p = null; | |
} | |
childNode = nextSibling; | |
} | |
// Sites like http://mobile.slate.com encloses each paragraph with a DIV | |
// element. DIVs with only a P element inside and no text content can be | |
// safely converted into plain P elements to avoid confusing the scoring | |
// algorithm with DIVs with are, in practice, paragraphs. | |
if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { | |
var newNode = node.children[0]; | |
node.parentNode.replaceChild(newNode, node); | |
node = newNode; | |
elementsToScore.push(node); | |
} else if (!this._hasChildBlockElement(node)) { | |
node = this._setNodeTag(node, "P"); | |
elementsToScore.push(node); | |
} | |
} | |
node = this._getNextNode(node); | |
} | |
/** | |
* Loop through all paragraphs, and assign a score to them based on how content-y they look. | |
* Then add their score to their parent node. | |
* | |
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density. | |
**/ | |
var candidates = []; | |
this._forEachNode(elementsToScore, function(elementToScore) { | |
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") | |
return; | |
// If this paragraph is less than 25 characters, don't even count it. | |
var innerText = this._getInnerText(elementToScore); | |
if (innerText.length < 25) | |
return; | |
// Exclude nodes with no ancestor. | |
var ancestors = this._getNodeAncestors(elementToScore, 5); | |
if (ancestors.length === 0) | |
return; | |
var contentScore = 0; | |
// Add a point for the paragraph itself as a base. | |
contentScore += 1; | |
// Add points for any commas within this paragraph. | |
contentScore += innerText.split(this.REGEXPS.commas).length; | |
// For every 100 characters in this paragraph, add another point. Up to 3 points. | |
contentScore += Math.min(Math.floor(innerText.length / 100), 3); | |
// Initialize and score ancestors. | |
this._forEachNode(ancestors, function(ancestor, level) { | |
if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") | |
return; | |
if (typeof(ancestor.readability) === "undefined") { | |
this._initializeNode(ancestor); | |
candidates.push(ancestor); | |
} | |
// Node score divider: | |
// - parent: 1 (no division) | |
// - grandparent: 2 | |
// - great grandparent+: ancestor level * 3 | |
if (level === 0) | |
var scoreDivider = 1; | |
else if (level === 1) | |
scoreDivider = 2; | |
else | |
scoreDivider = level * 3; | |
ancestor.readability.contentScore += contentScore / scoreDivider; | |
}); | |
}); | |
// After we've calculated scores, loop through all of the possible | |
// candidate nodes we found and find the one with the highest score. | |
var topCandidates = []; | |
for (var c = 0, cl = candidates.length; c < cl; c += 1) { | |
var candidate = candidates[c]; | |
// Scale the final candidates score based on link density. Good content | |
// should have a relatively small link density (5% or less) and be mostly | |
// unaffected by this operation. | |
var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); | |
candidate.readability.contentScore = candidateScore; | |
this.log("Candidate:", candidate, "with score " + candidateScore); | |
for (var t = 0; t < this._nbTopCandidates; t++) { | |
var aTopCandidate = topCandidates[t]; | |
if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { | |
topCandidates.splice(t, 0, candidate); | |
if (topCandidates.length > this._nbTopCandidates) | |
topCandidates.pop(); | |
break; | |
} | |
} | |
} | |
var topCandidate = topCandidates[0] || null; | |
var neededToCreateTopCandidate = false; | |
var parentOfTopCandidate; | |
// If we still have no top candidate, just use the body as a last resort. | |
// We also have to copy the body node so it is something we can modify. | |
if (topCandidate === null || topCandidate.tagName === "BODY") { | |
// Move all of the page's children into topCandidate | |
topCandidate = doc.createElement("DIV"); | |
neededToCreateTopCandidate = true; | |
// Move everything (not just elements, also text nodes etc.) into the container | |
// so we even include text directly in the body: | |
while (page.firstChild) { | |
this.log("Moving child out:", page.firstChild); | |
topCandidate.appendChild(page.firstChild); | |
} | |
page.appendChild(topCandidate); | |
this._initializeNode(topCandidate); | |
} else if (topCandidate) { | |
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array | |
// and whose scores are quite closed with current `topCandidate` node. | |
var alternativeCandidateAncestors = []; | |
for (var i = 1; i < topCandidates.length; i++) { | |
if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { | |
alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); | |
} | |
} | |
var MINIMUM_TOPCANDIDATES = 3; | |
if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { | |
parentOfTopCandidate = topCandidate.parentNode; | |
while (parentOfTopCandidate.tagName !== "BODY") { | |
var listsContainingThisAncestor = 0; | |
for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { | |
listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); | |
} | |
if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { | |
topCandidate = parentOfTopCandidate; | |
break; | |
} | |
parentOfTopCandidate = parentOfTopCandidate.parentNode; | |
} | |
} | |
if (!topCandidate.readability) { | |
this._initializeNode(topCandidate); | |
} | |
// Because of our bonus system, parents of candidates might have scores | |
// themselves. They get half of the node. There won't be nodes with higher | |
// scores than our topCandidate, but if we see the score going *up* in the first | |
// few steps up the tree, that's a decent sign that there might be more content | |
// lurking in other places that we want to unify in. The sibling stuff | |
// below does some of that - but only if we've looked high enough up the DOM | |
// tree. | |
parentOfTopCandidate = topCandidate.parentNode; | |
var lastScore = topCandidate.readability.contentScore; | |
// The scores shouldn't get too low. | |
var scoreThreshold = lastScore / 3; | |
while (parentOfTopCandidate.tagName !== "BODY") { | |
if (!parentOfTopCandidate.readability) { | |
parentOfTopCandidate = parentOfTopCandidate.parentNode; | |
continue; | |
} | |
var parentScore = parentOfTopCandidate.readability.contentScore; | |
if (parentScore < scoreThreshold) | |
break; | |
if (parentScore > lastScore) { | |
// Alright! We found a better parent to use. | |
topCandidate = parentOfTopCandidate; | |
break; | |
} | |
lastScore = parentOfTopCandidate.readability.contentScore; | |
parentOfTopCandidate = parentOfTopCandidate.parentNode; | |
} | |
// If the top candidate is the only child, use parent instead. This will help sibling | |
// joining logic when adjacent content is actually located in parent's sibling node. | |
parentOfTopCandidate = topCandidate.parentNode; | |
while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { | |
topCandidate = parentOfTopCandidate; | |
parentOfTopCandidate = topCandidate.parentNode; | |
} | |
if (!topCandidate.readability) { | |
this._initializeNode(topCandidate); | |
} | |
} | |
// Now that we have the top candidate, look through its siblings for content | |
// that might also be related. Things like preambles, content split by ads | |
// that we removed, etc. | |
var articleContent = doc.createElement("DIV"); | |
if (isPaging) | |
articleContent.id = "readability-content"; | |
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); | |
// Keep potential top candidate's parent node to try to get text direction of it later. | |
parentOfTopCandidate = topCandidate.parentNode; | |
var siblings = parentOfTopCandidate.children; | |
for (var s = 0, sl = siblings.length; s < sl; s++) { | |
var sibling = siblings[s]; | |
var append = false; | |
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); | |
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); | |
if (sibling === topCandidate) { | |
append = true; | |
} else { | |
var contentBonus = 0; | |
// Give a bonus if sibling nodes and top candidates have the example same classname | |
if (sibling.className === topCandidate.className && topCandidate.className !== "") | |
contentBonus += topCandidate.readability.contentScore * 0.2; | |
if (sibling.readability && | |
((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { | |
append = true; | |
} else if (sibling.nodeName === "P") { | |
var linkDensity = this._getLinkDensity(sibling); | |
var nodeContent = this._getInnerText(sibling); | |
var nodeLength = nodeContent.length; | |
if (nodeLength > 80 && linkDensity < 0.25) { | |
append = true; | |
} else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && | |
nodeContent.search(/\.( |$)/) !== -1) { | |
append = true; | |
} | |
} | |
} | |
if (append) { | |
this.log("Appending node:", sibling); | |
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { | |
// We have a node that isn't a common block level element, like a form or td tag. | |
// Turn it into a div so it doesn't get filtered out later by accident. | |
this.log("Altering sibling:", sibling, "to div."); | |
sibling = this._setNodeTag(sibling, "DIV"); | |
} | |
articleContent.appendChild(sibling); | |
// Fetch children again to make it compatible | |
// with DOM parsers without live collection support. | |
siblings = parentOfTopCandidate.children; | |
// siblings is a reference to the children array, and | |
// sibling is removed from the array when we call appendChild(). | |
// As a result, we must revisit this index since the nodes | |
// have been shifted. | |
s -= 1; | |
sl -= 1; | |
} | |
} | |
if (this._debug) | |
this.log("Article content pre-prep: " + articleContent.innerHTML); | |
// So we have all of the content that we need. Now we clean it up for presentation. | |
this._prepArticle(articleContent); | |
if (this._debug) | |
this.log("Article content post-prep: " + articleContent.innerHTML); | |
if (neededToCreateTopCandidate) { | |
// We already created a fake div thing, and there wouldn't have been any siblings left | |
// for the previous loop, so there's no point trying to create a new div, and then | |
// move all the children over. Just assign IDs and class names here. No need to append | |
// because that already happened anyway. | |
topCandidate.id = "readability-page-1"; | |
topCandidate.className = "page"; | |
} else { | |
var div = doc.createElement("DIV"); | |
div.id = "readability-page-1"; | |
div.className = "page"; | |
while (articleContent.firstChild) { | |
div.appendChild(articleContent.firstChild); | |
} | |
articleContent.appendChild(div); | |
} | |
if (this._debug) | |
this.log("Article content after paging: " + articleContent.innerHTML); | |
var parseSuccessful = true; | |
// Now that we've gone through the full algorithm, check to see if | |
// we got any meaningful content. If we didn't, we may need to re-run | |
// grabArticle with different flags set. This gives us a higher likelihood of | |
// finding the content, and the sieve approach gives us a higher likelihood of | |
// finding the -right- content. | |
var textLength = this._getInnerText(articleContent, true).length; | |
if (textLength < this._charThreshold) { | |
parseSuccessful = false; | |
page.innerHTML = pageCacheHtml; | |
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { | |
this._removeFlag(this.FLAG_STRIP_UNLIKELYS); | |
this._attempts.push({articleContent: articleContent, textLength: textLength}); | |
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { | |
this._removeFlag(this.FLAG_WEIGHT_CLASSES); | |
this._attempts.push({articleContent: articleContent, textLength: textLength}); | |
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { | |
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); | |
this._attempts.push({articleContent: articleContent, textLength: textLength}); | |
} else { | |
this._attempts.push({articleContent: articleContent, textLength: textLength}); | |
// No luck after removing flags, just return the longest text we found during the different loops | |
this._attempts.sort(function (a, b) { | |
return b.textLength - a.textLength; | |
}); | |
// But first check if we actually have something | |
if (!this._attempts[0].textLength) { | |
return null; | |
} | |
articleContent = this._attempts[0].articleContent; | |
parseSuccessful = true; | |
} | |
} | |
if (parseSuccessful) { | |
// Find out text direction from ancestors of final top candidate. | |
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); | |
this._someNode(ancestors, function(ancestor) { | |
if (!ancestor.tagName) | |
return false; | |
var articleDir = ancestor.getAttribute("dir"); | |
if (articleDir) { | |
this._articleDir = articleDir; | |
return true; | |
} | |
return false; | |
}); | |
return articleContent; | |
} | |
} | |
}, | |
/** | |
* Check whether the input string could be a byline. | |
* This verifies that the input is a string, and that the length | |
* is less than 100 chars. | |
* | |
* @param possibleByline {string} - a string to check whether its a byline. | |
* @return Boolean - whether the input string is a byline. | |
*/ | |
_isValidByline: function(byline) { | |
if (typeof byline == "string" || byline instanceof String) { | |
byline = byline.trim(); | |
return (byline.length > 0) && (byline.length < 100); | |
} | |
return false; | |
}, | |
/** | |
* Converts some of the common HTML entities in string to their corresponding characters. | |
* | |
* @param str {string} - a string to unescape. | |
* @return string without HTML entity. | |
*/ | |
_unescapeHtmlEntities: function(str) { | |
if (!str) { | |
return str; | |
} | |
var htmlEscapeMap = this.HTML_ESCAPE_MAP; | |
return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { | |
return htmlEscapeMap[tag]; | |
}).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { | |
var num = parseInt(hex || numStr, hex ? 16 : 10); | |
return String.fromCharCode(num); | |
}); | |
}, | |
/** | |
* Try to extract metadata from JSON-LD object. | |
* For now, only Schema.org objects of type Article or its subtypes are supported. | |
* @return Object with any metadata that could be extracted (possibly none) | |
*/ | |
_getJSONLD: function (doc) { | |
var scripts = this._getAllNodesWithTag(doc, ["script"]); | |
var metadata; | |
this._forEachNode(scripts, function(jsonLdElement) { | |
if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { | |
try { | |
// Strip CDATA markers if present | |
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, ""); | |
var parsed = JSON.parse(content); | |
if ( | |
!parsed["@context"] || | |
!parsed["@context"].match(/^https?\:\/\/schema\.org$/) | |
) { | |
return; | |
} | |
if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { | |
parsed = parsed["@graph"].find(function(it) { | |
return (it["@type"] || "").match( | |
this.REGEXPS.jsonLdArticleTypes | |
); | |
}); | |
} | |
if ( | |
!parsed || | |
!parsed["@type"] || | |
!parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) | |
) { | |
return; | |
} | |
metadata = {}; | |
if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { | |
// we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz | |
// put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either | |
// "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. | |
var title = this._getArticleTitle(); | |
var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; | |
var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; | |
if (headlineMatches && !nameMatches) { | |
metadata.title = parsed.headline; | |
} else { | |
metadata.title = parsed.name; | |
} | |
} else if (typeof parsed.name === "string") { | |
metadata.title = parsed.name.trim(); | |
} else if (typeof parsed.headline === "string") { | |
metadata.title = parsed.headline.trim(); | |
} | |
if (parsed.author) { | |
if (typeof parsed.author.name === "string") { | |
metadata.byline = parsed.author.name.trim(); | |
} else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { | |
metadata.byline = parsed.author | |
.filter(function(author) { | |
return author && typeof author.name === "string"; | |
}) | |
.map(function(author) { | |
return author.name.trim(); | |
}) | |
.join(", "); | |
} | |
} | |
if (typeof parsed.description === "string") { | |
metadata.excerpt = parsed.description.trim(); | |
} | |
if ( | |
parsed.publisher && | |
typeof parsed.publisher.name === "string" | |
) { | |
metadata.siteName = parsed.publisher.name.trim(); | |
} | |
if (typeof parsed.datePublished === "string") { | |
metadata.datePublished = parsed.datePublished.trim(); | |
} | |
return; | |
} catch (err) { | |
this.log(err.message); | |
} | |
} | |
}); | |
return metadata ? metadata : {}; | |
}, | |
/** | |
* Attempts to get excerpt and byline metadata for the article. | |
* | |
* @param {Object} jsonld — object containing any metadata that | |
* could be extracted from JSON-LD object. | |
* | |
* @return Object with optional "excerpt" and "byline" properties | |
*/ | |
_getArticleMetadata: function(jsonld) { | |
var metadata = {}; | |
var values = {}; | |
var metaElements = this._doc.getElementsByTagName("meta"); | |
// property is a space-separated list of values | |
var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi; | |
// name is a single value | |
var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; | |
// Find description tags. | |
this._forEachNode(metaElements, function(element) { | |
var elementName = element.getAttribute("name"); | |
var elementProperty = element.getAttribute("property"); | |
var content = element.getAttribute("content"); | |
if (!content) { | |
return; | |
} | |
var matches = null; | |
var name = null; | |
if (elementProperty) { | |
matches = elementProperty.match(propertyPattern); | |
if (matches) { | |
// Convert to lowercase, and remove any whitespace | |
// so we can match below. | |
name = matches[0].toLowerCase().replace(/\s/g, ""); | |
// multiple authors | |
values[name] = content.trim(); | |
} | |
} | |
if (!matches && elementName && namePattern.test(elementName)) { | |
name = elementName; | |
if (content) { | |
// Convert to lowercase, remove any whitespace, and convert dots | |
// to colons so we can match below. | |
name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); | |
values[name] = content.trim(); | |
} | |
} | |
}); | |
// get title | |
metadata.title = jsonld.title || | |
values["dc:title"] || | |
values["dcterm:title"] || | |
values["og:title"] || | |
values["weibo:article:title"] || | |
values["weibo:webpage:title"] || | |
values["title"] || | |
values["twitter:title"]; | |
if (!metadata.title) { | |
metadata.title = this._getArticleTitle(); | |
} | |
// get author | |
metadata.byline = jsonld.byline || | |
values["dc:creator"] || | |
values["dcterm:creator"] || | |
values["author"]; | |
// get description | |
metadata.excerpt = jsonld.excerpt || | |
values["dc:description"] || | |
values["dcterm:description"] || | |
values["og:description"] || | |
values["weibo:article:description"] || | |
values["weibo:webpage:description"] || | |
values["description"] || | |
values["twitter:description"]; | |
// get site name | |
metadata.siteName = jsonld.siteName || | |
values["og:site_name"]; | |
// get article published time | |
metadata.publishedTime = jsonld.datePublished || | |
values["article:published_time"] || null; | |
// in many sites the meta value is escaped with HTML entities, | |
// so here we need to unescape it | |
metadata.title = this._unescapeHtmlEntities(metadata.title); | |
metadata.byline = this._unescapeHtmlEntities(metadata.byline); | |
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); | |
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); | |
metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime); | |
return metadata; | |
}, | |
/** | |
* Check if node is image, or if node contains exactly only one image | |
* whether as a direct child or as its descendants. | |
* | |
* @param Element | |
**/ | |
_isSingleImage: function(node) { | |
if (node.tagName === "IMG") { | |
return true; | |
} | |
if (node.children.length !== 1 || node.textContent.trim() !== "") { | |
return false; | |
} | |
return this._isSingleImage(node.children[0]); | |
}, | |
/** | |
* Find all <noscript> that are located after <img> nodes, and which contain only one | |
* <img> element. Replace the first image with the image from inside the <noscript> tag, | |
* and remove the <noscript> tag. This improves the quality of the images we use on | |
* some sites (e.g. Medium). | |
* | |
* @param Element | |
**/ | |
_unwrapNoscriptImages: function(doc) { | |
// Find img without source or attributes that might contains image, and remove it. | |
// This is done to prevent a placeholder img is replaced by img from noscript in next step. | |
var imgs = Array.from(doc.getElementsByTagName("img")); | |
this._forEachNode(imgs, function(img) { | |
for (var i = 0; i < img.attributes.length; i++) { | |
var attr = img.attributes[i]; | |
switch (attr.name) { | |
case "src": | |
case "srcset": | |
case "data-src": | |
case "data-srcset": | |
return; | |
} | |
if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { | |
return; | |
} | |
} | |
img.parentNode.removeChild(img); | |
}); | |
// Next find noscript and try to extract its image | |
var noscripts = Array.from(doc.getElementsByTagName("noscript")); | |
this._forEachNode(noscripts, function(noscript) { | |
// Parse content of noscript and make sure it only contains image | |
var tmp = doc.createElement("div"); | |
tmp.innerHTML = noscript.innerHTML; | |
if (!this._isSingleImage(tmp)) { | |
return; | |
} | |
// If noscript has previous sibling and it only contains image, | |
// replace it with noscript content. However we also keep old | |
// attributes that might contains image. | |
var prevElement = noscript.previousElementSibling; | |
if (prevElement && this._isSingleImage(prevElement)) { | |
var prevImg = prevElement; | |
if (prevImg.tagName !== "IMG") { | |
prevImg = prevElement.getElementsByTagName("img")[0]; | |
} | |
var newImg = tmp.getElementsByTagName("img")[0]; | |
for (var i = 0; i < prevImg.attributes.length; i++) { | |
var attr = prevImg.attributes[i]; | |
if (attr.value === "") { | |
continue; | |
} | |
if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) { | |
if (newImg.getAttribute(attr.name) === attr.value) { | |
continue; | |
} | |
var attrName = attr.name; | |
if (newImg.hasAttribute(attrName)) { | |
attrName = "data-old-" + attrName; | |
} | |
newImg.setAttribute(attrName, attr.value); | |
} | |
} | |
noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement); | |
} | |
}); | |
}, | |
/** | |
* Removes script tags from the document. | |
* | |
* @param Element | |
**/ | |
_removeScripts: function(doc) { | |
this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"])); | |
}, | |
/** | |
* Check if this node has only whitespace and a single element with given tag | |
* Returns false if the DIV node contains non-empty text nodes | |
* or if it contains no element with given tag or more than 1 element. | |
* | |
* @param Element | |
* @param string tag of child element | |
**/ | |
_hasSingleTagInsideElement: function(element, tag) { | |
// There should be exactly 1 element child with given tag | |
if (element.children.length != 1 || element.children[0].tagName !== tag) { | |
return false; | |
} | |
// And there should be no text nodes with real content | |
return !this._someNode(element.childNodes, function(node) { | |
return node.nodeType === this.TEXT_NODE && | |
this.REGEXPS.hasContent.test(node.textContent); | |
}); | |
}, | |
_isElementWithoutContent: function(node) { | |
return node.nodeType === this.ELEMENT_NODE && | |
node.textContent.trim().length == 0 && | |
(node.children.length == 0 || | |
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); | |
}, | |
/** | |
* Determine whether element has any children block level elements. | |
* | |
* @param Element | |
*/ | |
_hasChildBlockElement: function (element) { | |
return this._someNode(element.childNodes, function(node) { | |
return this.DIV_TO_P_ELEMS.has(node.tagName) || | |
this._hasChildBlockElement(node); | |
}); | |
}, | |
/*** | |
* Determine if a node qualifies as phrasing content. | |
* https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content | |
**/ | |
_isPhrasingContent: function(node) { | |
return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || | |
((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") && | |
this._everyNode(node.childNodes, this._isPhrasingContent)); | |
}, | |
_isWhitespace: function(node) { | |
return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) || | |
(node.nodeType === this.ELEMENT_NODE && node.tagName === "BR"); | |
}, | |
/** | |
* Get the inner text of a node - cross browser compatibly. | |
* This also strips out any excess whitespace to be found. | |
* | |
* @param Element | |
* @param Boolean normalizeSpaces (default: true) | |
* @return string | |
**/ | |
_getInnerText: function(e, normalizeSpaces) { | |
normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces; | |
var textContent = e.textContent.trim(); | |
if (normalizeSpaces) { | |
return textContent.replace(this.REGEXPS.normalize, " "); | |
} | |
return textContent; | |
}, | |
/** | |
* Get the number of times a string s appears in the node e. | |
* | |
* @param Element | |
* @param string - what to split on. Default is "," | |
* @return number (integer) | |
**/ | |
_getCharCount: function(e, s) { | |
s = s || ","; | |
return this._getInnerText(e).split(s).length - 1; | |
}, | |
/** | |
* Remove the style attribute on every e and under. | |
* TODO: Test if getElementsByTagName(*) is faster. | |
* | |
* @param Element | |
* @return void | |
**/ | |
_cleanStyles: function(e) { | |
if (!e || e.tagName.toLowerCase() === "svg") | |
return; | |
// Remove `style` and deprecated presentational attributes | |
for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { | |
e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); | |
} | |
if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { | |
e.removeAttribute("width"); | |
e.removeAttribute("height"); | |
} | |
var cur = e.firstElementChild; | |
while (cur !== null) { | |
this._cleanStyles(cur); | |
cur = cur.nextElementSibling; | |
} | |
}, | |
/** | |
* Get the density of links as a percentage of the content | |
* This is the amount of text that is inside a link divided by the total text in the node. | |
* | |
* @param Element | |
* @return number (float) | |
**/ | |
_getLinkDensity: function(element) { | |
var textLength = this._getInnerText(element).length; | |
if (textLength === 0) | |
return 0; | |
var linkLength = 0; | |
// XXX implement _reduceNodeList? | |
this._forEachNode(element.getElementsByTagName("a"), function(linkNode) { | |
var href = linkNode.getAttribute("href"); | |
var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1; | |
linkLength += this._getInnerText(linkNode).length * coefficient; | |
}); | |
return linkLength / textLength; | |
}, | |
/** | |
* Get an elements class/id weight. Uses regular expressions to tell if this | |
* element looks good or bad. | |
* | |
* @param Element | |
* @return number (Integer) | |
**/ | |
_getClassWeight: function(e) { | |
if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) | |
return 0; | |
var weight = 0; | |
// Look for a special classname | |
if (typeof(e.className) === "string" && e.className !== "") { | |
if (this.REGEXPS.negative.test(e.className)) | |
weight -= 25; | |
if (this.REGEXPS.positive.test(e.className)) | |
weight += 25; | |
} | |
// Look for a special ID | |
if (typeof(e.id) === "string" && e.id !== "") { | |
if (this.REGEXPS.negative.test(e.id)) | |
weight -= 25; | |
if (this.REGEXPS.positive.test(e.id)) | |
weight += 25; | |
} | |
return weight; | |
}, | |
/** | |
* Clean a node of all elements of type "tag". | |
* (Unless it's a youtube/vimeo video. People love movies.) | |
* | |
* @param Element | |
* @param string tag to clean | |
* @return void | |
**/ | |
_clean: function(e, tag) { | |
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; | |
this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) { | |
// Allow youtube and vimeo videos through as people usually want to see those. | |
if (isEmbed) { | |
// First, check the elements attributes to see if any of them contain youtube or vimeo | |
for (var i = 0; i < element.attributes.length; i++) { | |
if (this._allowedVideoRegex.test(element.attributes[i].value)) { | |
return false; | |
} | |
} | |
// For embed with <object> tag, check inner HTML as well. | |
if (element.tagName === "object" && this._allowedVideoRegex.test(element.innerHTML)) { | |
return false; | |
} | |
} | |
return true; | |
}); | |
}, | |
/** | |
* Check if a given node has one of its ancestor tag name matching the | |
* provided one. | |
* @param HTMLElement node | |
* @param String tagName | |
* @param Number maxDepth | |
* @param Function filterFn a filter to invoke to determine whether this node 'counts' | |
* @return Boolean | |
*/ | |
_hasAncestorTag: function(node, tagName, maxDepth, filterFn) { | |
maxDepth = maxDepth || 3; | |
tagName = tagName.toUpperCase(); | |
var depth = 0; | |
while (node.parentNode) { | |
if (maxDepth > 0 && depth > maxDepth) | |
return false; | |
if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode))) | |
return true; | |
node = node.parentNode; | |
depth++; | |
} | |
return false; | |
}, | |
/** | |
* Return an object indicating how many rows and columns this table has. | |
*/ | |
_getRowAndColumnCount: function(table) { | |
var rows = 0; | |
var columns = 0; | |
var trs = table.getElementsByTagName("tr"); | |
for (var i = 0; i < trs.length; i++) { | |
var rowspan = trs[i].getAttribute("rowspan") || 0; | |
if (rowspan) { | |
rowspan = parseInt(rowspan, 10); | |
} | |
rows += (rowspan || 1); | |
// Now look for column-related info | |
var columnsInThisRow = 0; | |
var cells = trs[i].getElementsByTagName("td"); | |
for (var j = 0; j < cells.length; j++) { | |
var colspan = cells[j].getAttribute("colspan") || 0; | |
if (colspan) { | |
colspan = parseInt(colspan, 10); | |
} | |
columnsInThisRow += (colspan || 1); | |
} | |
columns = Math.max(columns, columnsInThisRow); | |
} | |
return {rows: rows, columns: columns}; | |
}, | |
/** | |
* Look for 'data' (as opposed to 'layout') tables, for which we use | |
* similar checks as | |
* https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19 | |
*/ | |
_markDataTables: function(root) { | |
var tables = root.getElementsByTagName("table"); | |
for (var i = 0; i < tables.length; i++) { | |
var table = tables[i]; | |
var role = table.getAttribute("role"); | |
if (role == "presentation") { | |
table._readabilityDataTable = false; | |
continue; | |
} | |
var datatable = table.getAttribute("datatable"); | |
if (datatable == "0") { | |
table._readabilityDataTable = false; | |
continue; | |
} | |
var summary = table.getAttribute("summary"); | |
if (summary) { | |
table._readabilityDataTable = true; | |
continue; | |
} | |
var caption = table.getElementsByTagName("caption")[0]; | |
if (caption && caption.childNodes.length > 0) { | |
table._readabilityDataTable = true; | |
continue; | |
} | |
// If the table has a descendant with any of these tags, consider a data table: | |
var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"]; | |
var descendantExists = function(tag) { | |
return !!table.getElementsByTagName(tag)[0]; | |
}; | |
if (dataTableDescendants.some(descendantExists)) { | |
this.log("Data table because found data-y descendant"); | |
table._readabilityDataTable = true; | |
continue; | |
} | |
// Nested tables indicate a layout table: | |
if (table.getElementsByTagName("table")[0]) { | |
table._readabilityDataTable = false; | |
continue; | |
} | |
var sizeInfo = this._getRowAndColumnCount(table); | |
if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) { | |
table._readabilityDataTable = true; | |
continue; | |
} | |
// Now just go by size entirely: | |
table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10; | |
} | |
}, | |
/* convert images and figures that have properties like data-src into images that can be loaded without JS */ | |
_fixLazyImages: function (root) { | |
this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) { | |
// In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. | |
// So, here we check if the data uri is too short, just might as well remove it. | |
if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) { | |
// Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. | |
var parts = this.REGEXPS.b64DataUrl.exec(elem.src); | |
if (parts[1] === "image/svg+xml") { | |
return; | |
} | |
// Make sure this element has other attributes which contains image. | |
// If it doesn't, then this src is important and shouldn't be removed. | |
var srcCouldBeRemoved = false; | |
for (var i = 0; i < elem.attributes.length; i++) { | |
var attr = elem.attributes[i]; | |
if (attr.name === "src") { | |
continue; | |
} | |
if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { | |
srcCouldBeRemoved = true; | |
break; | |
} | |
} | |
// Here we assume if image is less than 100 bytes (or 133B after encoded to base64) | |
// it will be too small, therefore it might be placeholder image. | |
if (srcCouldBeRemoved) { | |
var b64starts = elem.src.search(/base64\s*/i) + 7; | |
var b64length = elem.src.length - b64starts; | |
if (b64length < 133) { | |
elem.removeAttribute("src"); | |
} | |
} | |
} | |
// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 | |
if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) { | |
return; | |
} | |
for (var j = 0; j < elem.attributes.length; j++) { | |
attr = elem.attributes[j]; | |
if (attr.name === "src" || attr.name === "srcset" || attr.name === "alt") { | |
continue; | |
} | |
var copyTo = null; | |
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { | |
copyTo = "srcset"; | |
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { | |
copyTo = "src"; | |
} | |
if (copyTo) { | |
//if this is an img or picture, set the attribute directly | |
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { | |
elem.setAttribute(copyTo, attr.value); | |
} else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) { | |
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure | |
//see the nytimes-3 testcase for an example | |
var img = this._doc.createElement("img"); | |
img.setAttribute(copyTo, attr.value); | |
elem.appendChild(img); | |
} | |
} | |
} | |
}); | |
}, | |
_getTextDensity: function(e, tags) { | |
var textLength = this._getInnerText(e, true).length; | |
if (textLength === 0) { | |
return 0; | |
} | |
var childrenLength = 0; | |
var children = this._getAllNodesWithTag(e, tags); | |
this._forEachNode(children, (child) => childrenLength += this._getInnerText(child, true).length); | |
return childrenLength / textLength; | |
}, | |
/** | |
* Clean an element of all tags of type "tag" if they look fishy. | |
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. | |
* | |
* @return void | |
**/ | |
_cleanConditionally: function(e, tag) { | |
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) | |
return; | |
// Gather counts for other typical elements embedded within. | |
// Traverse backwards so we can remove nodes at the same time | |
// without effecting the traversal. | |
// | |
// TODO: Consider taking into account original contentScore here. | |
this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) { | |
// First check if this node IS data table, in which case don't remove it. | |
var isDataTable = function(t) { | |
return t._readabilityDataTable; | |
}; | |
var isList = tag === "ul" || tag === "ol"; | |
if (!isList) { | |
var listLength = 0; | |
var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]); | |
this._forEachNode(listNodes, (list) => listLength += this._getInnerText(list).length); | |
isList = listLength / this._getInnerText(node).length > 0.9; | |
} | |
if (tag === "table" && isDataTable(node)) { | |
return false; | |
} | |
// Next check if we're inside a data table, in which case don't remove it as well. | |
if (this._hasAncestorTag(node, "table", -1, isDataTable)) { | |
return false; | |
} | |
if (this._hasAncestorTag(node, "code")) { | |
return false; | |
} | |
var weight = this._getClassWeight(node); | |
this.log("Cleaning Conditionally", node); | |
var contentScore = 0; | |
if (weight + contentScore < 0) { | |
return true; | |
} | |
if (this._getCharCount(node, ",") < 10) { | |
// If there are not very many commas, and the number of | |
// non-paragraph elements is more than paragraphs or other | |
// ominous signs, remove the element. | |
var p = node.getElementsByTagName("p").length; | |
var img = node.getElementsByTagName("img").length; | |
var li = node.getElementsByTagName("li").length - 100; | |
var input = node.getElementsByTagName("input").length; | |
var headingDensity = this._getTextDensity(node, ["h1", "h2", "h3", "h4", "h5", "h6"]); | |
var embedCount = 0; | |
var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]); | |
for (var i = 0; i < embeds.length; i++) { | |
// If this embed has attribute that matches video regex, don't delete it. | |
for (var j = 0; j < embeds[i].attributes.length; j++) { | |
if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) { | |
return false; | |
} | |
} | |
// For embed with <object> tag, check inner HTML as well. | |
if (embeds[i].tagName === "object" && this._allowedVideoRegex.test(embeds[i].innerHTML)) { | |
return false; | |
} | |
embedCount++; | |
} | |
var linkDensity = this._getLinkDensity(node); | |
var contentLength = this._getInnerText(node).length; | |
var haveToRemove = | |
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || | |
(!isList && li > p) || | |
(input > Math.floor(p/3)) || | |
(!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || | |
(!isList && weight < 25 && linkDensity > 0.2) || | |
(weight >= 25 && linkDensity > 0.5) || | |
((embedCount === 1 && contentLength < 75) || embedCount > 1); | |
// Allow simple lists of images to remain in pages | |
if (isList && haveToRemove) { | |
for (var x = 0; x < node.children.length; x++) { | |
let child = node.children[x]; | |
// Don't filter in lists with li's that contain more than one child | |
if (child.children.length > 1) { | |
return haveToRemove; | |
} | |
} | |
let li_count = node.getElementsByTagName("li").length; | |
// Only allow the list to remain if every li contains an image | |
if (img == li_count) { | |
return false; | |
} | |
} | |
return haveToRemove; | |
} | |
return false; | |
}); | |
}, | |
/** | |
* Clean out elements that match the specified conditions | |
* | |
* @param Element | |
* @param Function determines whether a node should be removed | |
* @return void | |
**/ | |
_cleanMatchedNodes: function(e, filter) { | |
var endOfSearchMarkerNode = this._getNextNode(e, true); | |
var next = this._getNextNode(e); | |
while (next && next != endOfSearchMarkerNode) { | |
if (filter.call(this, next, next.className + " " + next.id)) { | |
next = this._removeAndGetNext(next); | |
} else { | |
next = this._getNextNode(next); | |
} | |
} | |
}, | |
/** | |
* Clean out spurious headers from an Element. | |
* | |
* @param Element | |
* @return void | |
**/ | |
_cleanHeaders: function(e) { | |
let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]); | |
this._removeNodes(headingNodes, function(node) { | |
let shouldRemove = this._getClassWeight(node) < 0; | |
if (shouldRemove) { | |
this.log("Removing header with low class weight:", node); | |
} | |
return shouldRemove; | |
}); | |
}, | |
/** | |
* Check if this node is an H1 or H2 element whose content is mostly | |
* the same as the article title. | |
* | |
* @param Element the node to check. | |
* @return boolean indicating whether this is a title-like header. | |
*/ | |
_headerDuplicatesTitle: function(node) { | |
if (node.tagName != "H1" && node.tagName != "H2") { | |
return false; | |
} | |
var heading = this._getInnerText(node, false); | |
this.log("Evaluating similarity of header:", heading, this._articleTitle); | |
return this._textSimilarity(this._articleTitle, heading) > 0.75; | |
}, | |
_flagIsActive: function(flag) { | |
return (this._flags & flag) > 0; | |
}, | |
_removeFlag: function(flag) { | |
this._flags = this._flags & ~flag; | |
}, | |
_isProbablyVisible: function(node) { | |
// Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. | |
return (!node.style || node.style.display != "none") | |
&& (!node.style || node.style.visibility != "hidden") | |
&& !node.hasAttribute("hidden") | |
//check for "fallback-image" so that wikimedia math images are displayed | |
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); | |
}, | |
/** | |
* Runs readability. | |
* | |
* Workflow: | |
* 1. Prep the document by removing script tags, css, etc. | |
* 2. Build readability's DOM tree. | |
* 3. Grab the article content from the current dom tree. | |
* 4. Replace the current DOM tree with the new one. | |
* 5. Read peacefully. | |
* | |
* @return void | |
**/ | |
parse: function () { | |
// Avoid parsing too large documents, as per configuration option | |
if (this._maxElemsToParse > 0) { | |
var numTags = this._doc.getElementsByTagName("*").length; | |
if (numTags > this._maxElemsToParse) { | |
throw new Error("Aborting parsing document; " + numTags + " elements found"); | |
} | |
} | |
// Unwrap image from noscript | |
this._unwrapNoscriptImages(this._doc); | |
// Extract JSON-LD metadata before removing scripts | |
var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc); | |
// Remove script tags from the document. | |
this._removeScripts(this._doc); | |
this._prepDocument(); | |
var metadata = this._getArticleMetadata(jsonLd); | |
this._articleTitle = metadata.title; | |
var articleContent = this._grabArticle(); | |
if (!articleContent) | |
return null; | |
this.log("Grabbed: " + articleContent.innerHTML); | |
this._postProcessContent(articleContent); | |
// If we haven't found an excerpt in the article's metadata, use the article's | |
// first paragraph as the excerpt. This is used for displaying a preview of | |
// the article's content. | |
if (!metadata.excerpt) { | |
var paragraphs = articleContent.getElementsByTagName("p"); | |
if (paragraphs.length > 0) { | |
metadata.excerpt = paragraphs[0].textContent.trim(); | |
} | |
} | |
var textContent = articleContent.textContent; | |
return { | |
title: this._articleTitle, | |
byline: metadata.byline || this._articleByline, | |
dir: this._articleDir, | |
lang: this._articleLang, | |
content: this._serializer(articleContent), | |
textContent: textContent, | |
length: textContent.length, | |
excerpt: metadata.excerpt, | |
siteName: metadata.siteName || this._articleSiteName, | |
publishedTime: metadata.publishedTime | |
}; | |
} | |
}; | |
if (typeof module === "object") { | |
/* global module */ | |
module.exports = Readability; | |
} | |