diff --git a/docs/changes.rst b/docs/changes.rst index 03c260a0c..a5cedbcab 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -4,6 +4,10 @@ Changelog 16.1.3 (unreleased) ------------------- +- Add ``euphorie.htmllaundry`` module. + The original ``htmllaundry`` package fails with ``lxml`` 5.2. + [ale-rt, maurits] + - CSV download of similar title details. Ref: scrum-2198 diff --git a/setup.py b/setup.py index 1c8fbc701..d6336af50 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,6 @@ "decorator", "py-bcrypt", "ftw.upgrade", - "htmllaundry", "lxml", "markdownify", "nltk", diff --git a/src/euphorie/client/browser/risk.py b/src/euphorie/client/browser/risk.py index d6e079665..b576e8f1c 100644 --- a/src/euphorie/client/browser/risk.py +++ b/src/euphorie/client/browser/risk.py @@ -19,7 +19,7 @@ from euphorie.content.survey import ISurvey from euphorie.content.utils import IToolTypesInfo from euphorie.content.utils import parse_scaled_answers -from htmllaundry import StripMarkup +from euphorie.htmllaundry.utils import strip_markup from io import BytesIO from plone import api from plone.memoize.instance import memoize @@ -154,7 +154,7 @@ def solutions_available_for_action_plan(self): if solution_id not in existing_measure_ids: solutions.append( { - "description": StripMarkup(solution.description), + "description": strip_markup(solution.description), "action": getattr(solution, "action", "") or "", "requirements": solution.requirements, "id": solution_id, diff --git a/src/euphorie/client/docx/html.py b/src/euphorie/client/docx/html.py index e89fab82e..039e0254a 100644 --- a/src/euphorie/client/docx/html.py +++ b/src/euphorie/client/docx/html.py @@ -1,7 +1,7 @@ +from euphorie.htmllaundry.utils import strip_markup from lxml import etree import docx -import htmllaundry import lxml.html @@ -120,7 +120,7 @@ def __call__(self, markup, doc, style=None, next_style=None): try: markup_doc = lxml.html.document_fromstring(markup) except etree.XMLSyntaxError: - text = htmllaundry.StripMarkup(markup) + text = strip_markup(markup) text = text.replace(" ", "\n") doc.add_paragraph(text) return doc diff --git a/src/euphorie/content/configure.zcml b/src/euphorie/content/configure.zcml index 55244630c..5dd1970bb 100644 --- a/src/euphorie/content/configure.zcml +++ b/src/euphorie/content/configure.zcml @@ -14,7 +14,7 @@ - + diff --git a/src/euphorie/content/dependency.py b/src/euphorie/content/dependency.py index 2569d2bb7..f3cf96990 100644 --- a/src/euphorie/content/dependency.py +++ b/src/euphorie/content/dependency.py @@ -9,7 +9,7 @@ """ from euphorie.content.user import BaseValidator -from htmllaundry.z3cform import HtmlText +from euphorie.htmllaundry.z3cform import HtmlText from plonetheme.nuplone.z3cform.directives import Dependency from z3c.form.interfaces import IForm from z3c.form.interfaces import IValidator diff --git a/src/euphorie/content/help.py b/src/euphorie/content/help.py index 82c46b1d8..75c22c530 100644 --- a/src/euphorie/content/help.py +++ b/src/euphorie/content/help.py @@ -11,7 +11,7 @@ from .. import MessageFactory as _ from euphorie.content.utils import StripMarkup -from htmllaundry.z3cform import HtmlText +from euphorie.htmllaundry.z3cform import HtmlText from plone.app.z3cform.wysiwyg import WysiwygFieldWidget from plone.autoform import directives from plone.indexer import indexer diff --git a/src/euphorie/content/module.py b/src/euphorie/content/module.py index 1ec5a3936..05291cbd6 100644 --- a/src/euphorie/content/module.py +++ b/src/euphorie/content/module.py @@ -20,7 +20,7 @@ from Acquisition import aq_chain from euphorie.content.dependency import ConditionalTextLine from euphorie.content.utils import ensure_image_size -from htmllaundry.z3cform import HtmlText +from euphorie.htmllaundry.z3cform import HtmlText from plone.app.dexterity.behaviors.metadata import IBasic from plone.app.z3cform.wysiwyg import WysiwygFieldWidget from plone.autoform import directives diff --git a/src/euphorie/content/page.py b/src/euphorie/content/page.py index fdd3acafc..33eb7d949 100644 --- a/src/euphorie/content/page.py +++ b/src/euphorie/content/page.py @@ -1,7 +1,7 @@ from .. import MessageFactory as _ from euphorie.content.behaviour.richdescription import IRichDescription from euphorie.content.utils import StripMarkup -from htmllaundry.z3cform import HtmlText +from euphorie.htmllaundry.z3cform import HtmlText from plone.app.dexterity.behaviors.metadata import IBasic from plone.app.z3cform.wysiwyg import WysiwygFieldWidget from plone.autoform import directives diff --git a/src/euphorie/content/risk.py b/src/euphorie/content/risk.py index 437e33abc..9b2f09409 100644 --- a/src/euphorie/content/risk.py +++ b/src/euphorie/content/risk.py @@ -18,7 +18,7 @@ from Acquisition import aq_chain from Acquisition import aq_inner from euphorie.content.utils import ensure_image_size -from htmllaundry.z3cform import HtmlText +from euphorie.htmllaundry.z3cform import HtmlText from plone import api from plone.app.dexterity.behaviors.metadata import IBasic from plone.autoform import directives diff --git a/src/euphorie/content/survey.py b/src/euphorie/content/survey.py index 3b24fd28a..d384d77c8 100644 --- a/src/euphorie/content/survey.py +++ b/src/euphorie/content/survey.py @@ -23,7 +23,7 @@ from euphorie.content.dependency import ConditionalTextLine from euphorie.content.utils import get_tool_type_default from euphorie.content.utils import IToolTypesInfo -from htmllaundry.z3cform import HtmlText +from euphorie.htmllaundry.z3cform import HtmlText from plone.app.dexterity.behaviors.metadata import IBasic from plone.app.z3cform.wysiwyg import WysiwygFieldWidget from plone.autoform import directives diff --git a/src/euphorie/htmllaundry/__init__.py b/src/euphorie/htmllaundry/__init__.py new file mode 100644 index 000000000..a9eca9439 --- /dev/null +++ b/src/euphorie/htmllaundry/__init__.py @@ -0,0 +1,9 @@ +# htmllaundry was originally a library developed by Wichert Akkerman +# +# See: https://github.com/syslabcom/htmllaundry/tree/master +# +# A subset of the now unmaintained package code was copied and pasted +# into euphorie.htmllaundry +# +# The original htmllaundry package was licensed under the BSD license +# diff --git a/src/euphorie/htmllaundry/cleaners.py b/src/euphorie/htmllaundry/cleaners.py new file mode 100644 index 000000000..57eb7e061 --- /dev/null +++ b/src/euphorie/htmllaundry/cleaners.py @@ -0,0 +1,113 @@ +try: + from lxml_html_clean.clean import _find_external_links + from lxml_html_clean.clean import Cleaner +except ImportError: + # BBB for lxml 5.1 or earlier, Plone 6.0.10 or earlier + from lxml.html.clean import _find_external_links + from lxml.html.clean import Cleaner + + +marker = [] + + +class LaundryCleaner(Cleaner): + link_target = marker + + def __call__(self, doc): + super().__call__(doc) + if self.link_target is not marker: + self.force_link_target(doc, self.link_target) + + def force_link_target(self, doc, target): + for el in _find_external_links(doc): + if target is None: + if "target" in el.attrib: + del el.attrib["target"] + elif isinstance(target, (list, tuple)): + el.set("target", target[0]) + else: + el.set("target", target) + + +DocumentCleaner = LaundryCleaner( + page_structure=False, + remove_unknown_tags=False, + allow_tags=[ + "blockquote", + "a", + "img", + "em", + "p", + "strong", + "h3", + "h4", + "h5", + "ul", + "ol", + "li", + "sub", + "sup", + "abbr", + "acronym", + "dl", + "dt", + "dd", + "cite", + "dft", + "br", + "table", + "tr", + "td", + "th", + "thead", + "tbody", + "tfoot", + ], + safe_attrs_only=True, + add_nofollow=True, + scripts=True, + javascript=True, + comments=False, + style=True, + links=False, + meta=False, + processing_instructions=False, + frames=False, + annoying_tags=False, +) + + +# Useful for line fields such as titles +LineCleaner = LaundryCleaner( + page_structure=False, + safe_attrs_only=True, + remove_unknown_tags=False, # Weird API.. + allow_tags=["em", "strong"], + add_nofollow=True, + scripts=True, + javascript=True, + comments=False, + style=True, + processing_instructions=False, + frames=False, + annoying_tags=False, +) + +CommentCleaner = LaundryCleaner( + page_structure=False, + safe_attrs_only=True, + remove_unknown_tags=False, # Weird API.. + allow_tags=["blockquote", "a", "em", "p", "strong"], + add_nofollow=True, + scripts=False, + javascript=True, + comments=False, + style=True, + processing_instructions=False, + frames=False, + annoying_tags=False, + link_target=["_blank"], +) + + +__all__ = ["DocumentCleaner", "LineCleaner", "CommentCleaner"] diff --git a/src/euphorie/htmllaundry/configure.zcml b/src/euphorie/htmllaundry/configure.zcml new file mode 100644 index 000000000..6efd411cb --- /dev/null +++ b/src/euphorie/htmllaundry/configure.zcml @@ -0,0 +1,5 @@ + + + + + diff --git a/src/euphorie/htmllaundry/tests/__init__.py b/src/euphorie/htmllaundry/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/euphorie/htmllaundry/tests/test_htmllaundry.py b/src/euphorie/htmllaundry/tests/test_htmllaundry.py new file mode 100644 index 000000000..c45cacd4c --- /dev/null +++ b/src/euphorie/htmllaundry/tests/test_htmllaundry.py @@ -0,0 +1,287 @@ +import unittest + + +class Mock: + pass + + +class strip_markup_tests(unittest.TestCase): + def strip_markup(self, *a, **kw): + from euphorie.htmllaundry.utils import strip_markup + + return strip_markup(*a, **kw) + + def testEmpty(self): + obj = Mock() + obj.description = "" + self.assertEqual(self.strip_markup(""), "") + + def testNoMarkup(self): + self.assertEqual(self.strip_markup("Test"), "Test") + + def testSingleTag(self): + self.assertEqual(self.strip_markup("Test me"), "Test me") + + def testMultipleTags(self): + self.assertEqual( + self.strip_markup("Test me now"), "Test me now" + ) + + def testStrayBracket(self): + self.assertEqual(self.strip_markup("Test me >"), "Test me >") + + +class remove_empty_tags_tests(unittest.TestCase): + def _remove(self, str, extra_tags=[]): + from euphorie.htmllaundry.utils import remove_empty_tags + + import lxml.etree + + fragment = lxml.etree.fromstring(str) + fragment = remove_empty_tags(fragment, extra_tags) + return lxml.etree.tostring(fragment, encoding="utf8").decode() + + def testRemoveEmptyParagraphElement(self): + self.assertEqual(self._remove("

"), "
") + + def testRemoveEmptyParagraph(self): + self.assertEqual(self._remove("

"), "
") + + def testRemoveParagraphWithWhitespace(self): + self.assertEqual(self._remove("

"), "
") + + def testRemoveParagraphWithUnicodeWhitespace(self): + self.assertEqual(self._remove("

\xa0

"), "
") + + def testKeepEmptyImageElement(self): + self.assertEqual( + self._remove('
'), + '
', + ) + + def testCollapseBreaks(self): + self.assertEqual( + self._remove("

one

two

"), + "

one
two

", + ) + + def testNestedData(self): + self.assertEqual( + self._remove("

Test

"), + "

Test

", + ) + + def testKeepElementsWithTail(self): + self.assertEqual( + self._remove("One
two
three"), + "One
two
three", + ) + + def testTrailingBreak(self): + self.assertEqual(self._remove("
Test
"), "
Test
") + + def testLeadingBreak(self): + self.assertEqual(self._remove("

Test
"), "
Test
") + + def testDoNotRemoveEmptyAnchorElement(self): + # Should not remove empty tag because it's used as an anchor: + self.assertEqual( + self._remove('

'), '

' + ) + self.assertEqual( + self._remove('

'), '

' + ) + self.assertEqual( + self._remove('

'), + '

', + ) + self.assertEqual( + self._remove('

'), + '

', + ) + + # Should not remove tag because it's non-empty: + self.assertEqual( + self._remove('

Link

'), + '

Link

', + ) + self.assertEqual( + self._remove('

Link

'), + '

Link

', + ) + self.assertEqual( + self._remove('

Link

'), + '

Link

', + ) + + # Should remove because it's an useless empty tag. + self.assertEqual( + self._remove('

Content

'), "

Content

" + ) + self.assertEqual( + self._remove('

Content

'), "

Content

" + ) + + def testExtraAllowedEmptyTags(self): + self.assertEqual( + self._remove("
Test
", ["td"]), + "
Test
", + ) + + +class ForceLinkTargetTests(unittest.TestCase): + def force_link_target(self, str, target="_blank"): + from euphorie.htmllaundry.cleaners import LaundryCleaner + + import lxml.etree + + fragment = lxml.etree.fromstring(str) + cleaner = LaundryCleaner() + cleaner.force_link_target(fragment, target) + return lxml.etree.tostring(fragment, encoding="utf8").decode() + + def testNoAnchor(self): + self.assertEqual(self.force_link_target("

"), "

") + + def testAddTarget(self): + self.assertEqual( + self.force_link_target( + '
', "_blank" + ), + '
', + ) + + def testRemoveTarget(self): + self.assertEqual( + self.force_link_target( + '
', None + ), + '
', + ) + + +class strip_outer_breaks_tests(unittest.TestCase): + def _strip(self, str): + from euphorie.htmllaundry.utils import strip_outer_breaks + + import lxml.etree + + fragment = lxml.etree.fromstring(str) + strip_outer_breaks(fragment) + return lxml.etree.tostring(fragment, encoding="utf8").decode() + + def testNoBreak(self): + self.assertEqual( + self._strip("Dummy text"), "Dummy text" + ) + + def testTrailingBreak(self): + self.assertEqual( + self._strip("Dummy text
"), "Dummy text" + ) + + def testLeadingBreak(self): + self.assertEqual( + self._strip("
Dummy text"), "Dummy text" + ) + + def testBreakAfterElement(self): + self.assertEqual( + self._strip("

Dummy


text"), + "

Dummy

text", + ) + + +class SanizeTests(unittest.TestCase): + def sanitize(self, *a, **kw): + from euphorie.htmllaundry.utils import sanitize + + return sanitize(*a, **kw) + + def testEmpty(self): + self.assertEqual(self.sanitize(""), "") + + def testParagraph(self): + self.assertEqual(self.sanitize("

Test

"), "

Test

") + + def test_link_in_unwrapped_text(self): + self.assertEqual( + self.sanitize('There is a
link in here.'), + '

There is a link in here.

', + ) + + def testParagraphCustomWrapperNotUsedIfAlreadyWrapped(self): + self.assertEqual(self.sanitize("

Test

", wrap="span"), "

Test

") + + def testParagraphWithWhitespace(self): + self.assertEqual(self.sanitize("

Test

\n

\xa0

\n"), "

Test

\n\n") + + def testLeadingBreak(self): + self.assertEqual(self.sanitize("

Test

"), "

Test

") + + def testHeaderAndText(self): + self.assertEqual( + self.sanitize("

Title

Test

"), "

Title

Test

" + ) + + def testUnwrappedText(self): + self.assertEqual(self.sanitize("Hello, World"), "

Hello, World

") + + def testUnwrappedTextWithCustomWrapper(self): + self.assertEqual( + self.sanitize("Hello, World", wrap="strong"), + "Hello, World", + ) + + def testTrailingUnwrappedText(self): + self.assertEqual( + self.sanitize("

Hello,

World"), "

Hello,

World

" + ) + + def testTrailingUnwrappedTextWithCustomWrapper(self): + self.assertEqual( + self.sanitize("

Hello,

World", wrap="b"), "

Hello,

World" + ) + + def testUnwrappedTextEverywhere(self): + self.assertEqual( + self.sanitize( + "Hello,

you

nice and decent person." + ), + "

Hello,

you

nice and decent " + + "person.

", + ) + + def testUnwrappedTextEverywhereWithCustomWrapper(self): + self.assertEqual( + self.sanitize("Hello,

you

nice person.", wrap="div"), + "
Hello,

you

nice " + "person.
", + ) + + def testStripStyleAttributes(self): + self.assertEqual( + self.sanitize('

Hello

'), "

Hello

" + ) + + def testJavascriptLink(self): + self.assertEqual( + self.sanitize( + "

" + "click me

" + ), + '

click me

', + ) + + def testSkipWrapping(self): + self.assertEqual( + self.sanitize("Hello, you nice person.", wrap=None), + "Hello, you nice person.", + ) + + def testRejectBadWrapElement(self): + self.assertRaises(ValueError, self.sanitize, "

Hello,

World", wrap="xxx") + self.assertRaises( + ValueError, + self.sanitize, + "Hello, you nice person.", + wrap="", + ) diff --git a/src/euphorie/htmllaundry/utils.py b/src/euphorie/htmllaundry/utils.py new file mode 100644 index 000000000..f8bd88222 --- /dev/null +++ b/src/euphorie/htmllaundry/utils.py @@ -0,0 +1,188 @@ +from euphorie.htmllaundry.cleaners import DocumentCleaner +from lxml import etree +from lxml import html +from lxml.html import defs + +import re + + +INLINE_TAGS = defs.special_inline_tags | defs.phrase_tags | defs.font_style_tags +TAG = re.compile("<.*?>") +ANCHORS = etree.XPath( + "descendant-or-self::a | descendant-or-self::x:a", + namespaces={"x": html.XHTML_NAMESPACE}, +) +ALL_WHITESPACE = re.compile(r"^\s*$", re.UNICODE) + + +def is_whitespace(txt): + """Utility method to test if txt is all whitespace or None.""" + return txt is None or bool(ALL_WHITESPACE.match(txt)) + + +def strip_markup(markup): + """Strip all markup from a HTML fragment.""" + return TAG.sub("", markup) + + +StripMarkup = strip_markup # BBB for htmllaundry <2.0 + + +def remove_element(el): + parent = el.getparent() + if el.tail: + previous = el.getprevious() + if previous is not None: + if previous.tail: + previous.tail += el.tail + else: + previous.tail = el.tail + else: + if parent.text: + parent.text += el.tail + else: + parent.text = el.tail + + parent.remove(el) + + +def remove_empty_tags(doc, extra_empty_tags=[]): + """Removes all empty tags from a HTML document. Javascript editors + and browsers have a nasty habit of leaving stray tags around after + their contents have been removed. This function removes all such + empty tags, leaving only valid empty tags. + + In addition consecutive
tags are folded into a single tag. + This forces whitespace styling to be done using CSS instead of via an + editor, which almost always produces better and more consistent results. + """ + empty_tags = {"br", "hr", "img", "input"} + empty_tags.update(set(extra_empty_tags)) + legal_empty_tags = frozenset(empty_tags) + + if hasattr(doc, "getroot"): + doc = doc.getroot() + + def clean(doc): + victims = [] + for el in doc.iter(): + if el.tag == "br": + preceding = el.getprevious() + parent = el.getparent() + + if ( + (preceding is None and not parent.text) + or ( + preceding is not None + and preceding.tag == el.tag + and not preceding.tail + ) + or (not el.tail and el.getnext() is None) + ): + victims.append(el) + continue + + if el.tag in legal_empty_tags: + continue + + # Empty can be used as anchor. + if (el.tag == "a") and (("name" in el.attrib) or ("id" in el.attrib)): + continue + + if len(el) == 0 and is_whitespace(el.text): + victims.append(el) + continue + + if victims and victims[0] == doc: + doc.clear() + return 0 + else: + for victim in victims: + remove_element(victim) + + return len(victims) + + while clean(doc): + pass + + return doc + + +def strip_outer_breaks(doc): + """Remove any toplevel break elements.""" + victims = [] + + for i in range(len(doc)): + el = doc[i] + if el.tag == "br": + victims.append(el) + + for victim in victims: + remove_element(victim) + + +MARKER = "LAUNDRY-INSERT" + + +def wrap_text(doc, element="p"): + """Make sure there is no unwrapped text at the top level. Any bare text + found is wrapped in a `

` element. + """ + + def par(text): + el = etree.Element(element, {MARKER: ""}) + el.text = text + return el + + if doc.text: + doc.insert(0, par(doc.text)) + doc.text = None + + while True: + for i, el in enumerate(doc): + if html._nons(el.tag) in INLINE_TAGS and i and MARKER in doc[i - 1].attrib: + doc[i - 1].append(el) + break + if not is_whitespace(el.tail): + doc.insert(i + 1, par(el.tail)) + el.tail = None + break + else: + break + + for el in doc: + if MARKER in el.attrib: + del el.attrib[MARKER] + + +def sanitize(input, cleaner=DocumentCleaner, wrap="p"): + """Cleanup markup using a given cleanup configuration. + Unwrapped text will be wrapped with wrap parameter. + """ + if "body" not in cleaner.allow_tags: + cleaner.allow_tags.append("body") + + input = "%s" % input + document = html.document_fromstring(input) + bodies = [e for e in document if html._nons(e.tag) == "body"] + body = bodies[0] + + cleaned = cleaner.clean_html(body) + remove_empty_tags(cleaned) + strip_outer_breaks(cleaned) + + if wrap is not None: + if wrap in html.defs.tags: + wrap_text(cleaned, wrap) + else: + raise ValueError( + "Invalid html tag provided for wrapping the sanitized text" + ) + + output = "".join( + [etree.tostring(fragment, encoding=str) for fragment in cleaned.iterchildren()] + ) + if wrap is None and cleaned.text: + output = cleaned.text + output + + return output diff --git a/src/euphorie/htmllaundry/z3cform.py b/src/euphorie/htmllaundry/z3cform.py new file mode 100644 index 000000000..b785b9591 --- /dev/null +++ b/src/euphorie/htmllaundry/z3cform.py @@ -0,0 +1,34 @@ +from euphorie.htmllaundry.utils import sanitize +from z3c.form.converter import FieldDataConverter +from z3c.form.interfaces import IWidget +from zope.component import adapter +from zope.interface import implementer +from zope.schema import Text +from zope.schema.interfaces import IText + + +class IHtmlText(IText): + pass + + +@implementer(IHtmlText) +class HtmlText(Text): + """A HTML field. This is similar to a standard Text field, but will + sanitize all markup passed into it. + """ + + pass + + +@adapter(IHtmlText, IWidget) +class HtmlDataConverter(FieldDataConverter): + """z3c.form data convertor for HTML forms. This convertor + sanitizes all input, guaranteeing simple and valid markup + as a result. + """ + + def toFieldValue(self, value): + data = super().toFieldValue(value) + if data: + data = sanitize(data) + return data