\xa0
one
two
one
two
Test
Test
Content
" + ) + self.assertEqual( + self._remove(''), "Content
" + ) + + def testExtraAllowedEmptyTags(self): + self.assertEqual( + self._remove("Test |
Test |
Dummy
Dummy
text", + ) + + +class SanizeTests(unittest.TestCase): + def sanitize(self, *a, **kw): + from euphorie.htmllaundry.utils import sanitize + + return sanitize(*a, **kw) + + def testEmpty(self): + self.assertEqual(self.sanitize(""), "") + + def testParagraph(self): + self.assertEqual(self.sanitize("Test
"), "Test
") + + def test_link_in_unwrapped_text(self): + self.assertEqual( + self.sanitize('There is a link in here.'), + 'There is a link in here.
', + ) + + def testParagraphCustomWrapperNotUsedIfAlreadyWrapped(self): + self.assertEqual(self.sanitize("Test
", wrap="span"), "Test
") + + def testParagraphWithWhitespace(self): + self.assertEqual(self.sanitize("Test
\n\xa0
\n"), "Test
\n\n") + + def testLeadingBreak(self): + self.assertEqual(self.sanitize("Test
"), "Test
") + + def testHeaderAndText(self): + self.assertEqual( + self.sanitize("Test
"), "Test
" + ) + + def testUnwrappedText(self): + self.assertEqual(self.sanitize("Hello, World"), "Hello, World
") + + def testUnwrappedTextWithCustomWrapper(self): + self.assertEqual( + self.sanitize("Hello, World", wrap="strong"), + "Hello, World", + ) + + def testTrailingUnwrappedText(self): + self.assertEqual( + self.sanitize("Hello,
World"), "Hello,
World
" + ) + + def testTrailingUnwrappedTextWithCustomWrapper(self): + self.assertEqual( + self.sanitize("Hello,
World", wrap="b"), "Hello,
World" + ) + + def testUnwrappedTextEverywhere(self): + self.assertEqual( + self.sanitize( + "Hello,you
nice and decent person." + ), + "Hello,
you
nice and decent " + + "person.
", + ) + + def testUnwrappedTextEverywhereWithCustomWrapper(self): + self.assertEqual( + self.sanitize("Hello,you
nice person.", wrap="div"), + "you
Hello
'), "Hello
" + ) + + def testJavascriptLink(self): + self.assertEqual( + self.sanitize( + "" + ), + '', + ) + + def testSkipWrapping(self): + self.assertEqual( + self.sanitize("Hello, you nice person.", wrap=None), + "Hello, you nice person.", + ) + + def testRejectBadWrapElement(self): + self.assertRaises(ValueError, self.sanitize, "Hello,
World", wrap="xxx") + self.assertRaises( + ValueError, + self.sanitize, + "Hello, you nice person.", + wrap="", + ) diff --git a/src/euphorie/htmllaundry/utils.py b/src/euphorie/htmllaundry/utils.py new file mode 100644 index 000000000..f8bd88222 --- /dev/null +++ b/src/euphorie/htmllaundry/utils.py @@ -0,0 +1,188 @@ +from euphorie.htmllaundry.cleaners import DocumentCleaner +from lxml import etree +from lxml import html +from lxml.html import defs + +import re + + +INLINE_TAGS = defs.special_inline_tags | defs.phrase_tags | defs.font_style_tags +TAG = re.compile("<.*?>") +ANCHORS = etree.XPath( + "descendant-or-self::a | descendant-or-self::x:a", + namespaces={"x": html.XHTML_NAMESPACE}, +) +ALL_WHITESPACE = re.compile(r"^\s*$", re.UNICODE) + + +def is_whitespace(txt): + """Utility method to test if txt is all whitespace or None.""" + return txt is None or bool(ALL_WHITESPACE.match(txt)) + + +def strip_markup(markup): + """Strip all markup from a HTML fragment.""" + return TAG.sub("", markup) + + +StripMarkup = strip_markup # BBB for htmllaundry <2.0 + + +def remove_element(el): + parent = el.getparent() + if el.tail: + previous = el.getprevious() + if previous is not None: + if previous.tail: + previous.tail += el.tail + else: + previous.tail = el.tail + else: + if parent.text: + parent.text += el.tail + else: + parent.text = el.tail + + parent.remove(el) + + +def remove_empty_tags(doc, extra_empty_tags=[]): + """Removes all empty tags from a HTML document. Javascript editors + and browsers have a nasty habit of leaving stray tags around after + their contents have been removed. This function removes all such + empty tags, leaving only valid empty tags. + + In addition consecutive` element. + """ + + def par(text): + el = etree.Element(element, {MARKER: ""}) + el.text = text + return el + + if doc.text: + doc.insert(0, par(doc.text)) + doc.text = None + + while True: + for i, el in enumerate(doc): + if html._nons(el.tag) in INLINE_TAGS and i and MARKER in doc[i - 1].attrib: + doc[i - 1].append(el) + break + if not is_whitespace(el.tail): + doc.insert(i + 1, par(el.tail)) + el.tail = None + break + else: + break + + for el in doc: + if MARKER in el.attrib: + del el.attrib[MARKER] + + +def sanitize(input, cleaner=DocumentCleaner, wrap="p"): + """Cleanup markup using a given cleanup configuration. + Unwrapped text will be wrapped with wrap parameter. + """ + if "body" not in cleaner.allow_tags: + cleaner.allow_tags.append("body") + + input = "
%s" % input + document = html.document_fromstring(input) + bodies = [e for e in document if html._nons(e.tag) == "body"] + body = bodies[0] + + cleaned = cleaner.clean_html(body) + remove_empty_tags(cleaned) + strip_outer_breaks(cleaned) + + if wrap is not None: + if wrap in html.defs.tags: + wrap_text(cleaned, wrap) + else: + raise ValueError( + "Invalid html tag provided for wrapping the sanitized text" + ) + + output = "".join( + [etree.tostring(fragment, encoding=str) for fragment in cleaned.iterchildren()] + ) + if wrap is None and cleaned.text: + output = cleaned.text + output + + return output diff --git a/src/euphorie/htmllaundry/z3cform.py b/src/euphorie/htmllaundry/z3cform.py new file mode 100644 index 000000000..b785b9591 --- /dev/null +++ b/src/euphorie/htmllaundry/z3cform.py @@ -0,0 +1,34 @@ +from euphorie.htmllaundry.utils import sanitize +from z3c.form.converter import FieldDataConverter +from z3c.form.interfaces import IWidget +from zope.component import adapter +from zope.interface import implementer +from zope.schema import Text +from zope.schema.interfaces import IText + + +class IHtmlText(IText): + pass + + +@implementer(IHtmlText) +class HtmlText(Text): + """A HTML field. This is similar to a standard Text field, but will + sanitize all markup passed into it. + """ + + pass + + +@adapter(IHtmlText, IWidget) +class HtmlDataConverter(FieldDataConverter): + """z3c.form data convertor for HTML forms. This convertor + sanitizes all input, guaranteeing simple and valid markup + as a result. + """ + + def toFieldValue(self, value): + data = super().toFieldValue(value) + if data: + data = sanitize(data) + return data