Skip to content
This repository has been archived by the owner on Sep 29, 2023. It is now read-only.

Feature/summernote #149

Merged
merged 7 commits into from
Apr 24, 2019
205 changes: 147 additions & 58 deletions bones/textBone.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,38 @@
from server.bones import baseBone
from server.bones.stringBone import LanguageWrapper
from server.config import conf
import logging, string


_attrsMargins = ["margin","margin-left","margin-right","margin-top","margin-bottom"]
_attrsSpacing = ["spacing","spacing-left","spacing-right","spacing-top","spacing-bottom"]
_attrsDescr = ["title","alt"]
_defaultTags = {
"validTags": [ 'font','b', 'a', 'i', 'u', 'span', 'div','p', 'img', 'ol', 'ul','li','acronym', #List of HTML-Tags which are valid
'h1','h2','h3','h4','h5','h6', 'table', 'tr', 'td', 'th', 'br', 'hr', 'strong'],
"validAttrs": { "font": ["color"], #Mapping of valid parameters for each tag (if a tag is not listed here: no parameters allowed)
"a": ["href","target"]+_attrsDescr,
"acronym": ["title"],
"div": ["align","width","height"]+_attrsMargins+_attrsSpacing,
"p":["align","width","height"]+_attrsMargins+_attrsSpacing,
"span":["align","width","height"]+_attrsMargins+_attrsSpacing,
"img":[ "src","target", "width","height", "align" ]+_attrsDescr+_attrsMargins+_attrsSpacing,
"table": [ "width","align", "border", "cellspacing", "cellpadding" ]+_attrsDescr,
"td" : [ "colspan", "rowspan", "width", "height" ]+_attrsMargins+_attrsSpacing
},
"validStyles": ["font-weight","font-style","text-decoration","color", "display"], #List of CSS-Directives we allow
"singleTags": ["br","img", "hr"] # List of tags, which dont have a corresponding end tag
"validTags": [ # List of HTML-Tags which are valid
'b', 'a', 'i', 'u', 'span', 'div', 'p', 'img', 'ol', 'ul', 'li', 'abbr', 'sub', 'sup',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th', 'br',
'hr', 'strong', 'blockquote', 'em'],
"validAttrs": { # Mapping of valid parameters for each tag (if a tag is not listed here: no parameters allowed)
"a": ["href", "target", "title"],
"abbr": ["title"],
"span": ["title"],
"img": ["src", "srcset", "alt", "title"],
"td": ["colspan", "rowspan"],
"p": ["data-indent"],
"blockquote": ["cite"]
},
"validStyles": [
"color"
], # List of CSS-Directives we allow
"validClasses": ["vitxt-*", "viur-txt-*"], # List of valid class-names that are valid
"singleTags": ["br", "img", "hr"] # List of tags, which don't have a corresponding end tag
}
del _attrsDescr, _attrsSpacing, _attrsMargins

class HtmlSerializer( HTMLParser.HTMLParser ): #html.parser.HTMLParser
def __init__(self, validHtml=None ):

class HtmlSerializer(HTMLParser.HTMLParser): # html.parser.HTMLParser
def __init__(self, validHtml=None):
global _defaultTags
HTMLParser.HTMLParser.__init__(self)
self.result = ""
self.openTagsList = []
self.result = "" # The final result that will be returned
self.openTagsList = [] # List of tags that still need to be closed
self.tagCache = [] # Tuple of tags that have been processed but not written yet
self.validHtml = validHtml

def handle_data(self, data):
Expand All @@ -46,71 +50,156 @@ def handle_data(self, data):
.replace("'", "'") \
.replace("\n", "") \
.replace("\0", "")
if data:
if data.strip():
self.flushCache()
self.result += data

def handle_charref(self, name):
self.result += "&#%s;" % ( name )
self.flushCache()
self.result += "&#%s;" % (name)

def handle_entityref(self, name): #FIXME
def handle_entityref(self, name): # FIXME
if name in htmlentitydefs.entitydefs.keys():
self.result += "&%s;" % ( name )
self.flushCache()
self.result += "&%s;" % (name)

def flushCache(self):
"""
Flush pending tags into the result and push their corresponding end-tags onto the stack
"""
for start, end in self.tagCache:
self.result += start
self.openTagsList.insert(0, end)
self.tagCache = []

def handle_starttag(self, tag, attrs):
""" Delete all tags except for legal ones """
filterChars = "\"'\\\0\r\n@()"
if self.validHtml and tag in self.validHtml["validTags"]:
self.result = self.result + '<' + tag
cacheTagStart = '<' + tag
isBlankTarget = False
styles = None
classes = None
for k, v in attrs:
if not tag in self.validHtml["validAttrs"] or not k in self.validHtml["validAttrs"][ tag ]:
k = k.strip()
v = v.strip()
if any([c in k for c in filterChars]) or any([c in v for c in filterChars]):
if k in {"title", "href", "alt"} and not any([c in v for c in "\"'\\\0\r\n"]):
# If we have a title or href attribute, ignore @ and ()
pass
else:
# Either the key or the value contains a character that's not supposed to be there
continue
elif k == "class":
# Classes are handled below
classes = v.split(" ")
continue
elif k == "style":
# Styles are handled below
styles = v.split(";")
continue
elif k == "src":
# We ensure that any src tag starts with an actual url
checker = v.lower()
if not (checker.startswith("http://") or checker.startswith("https://") or \
checker.startswith("/")):
continue
if not tag in self.validHtml["validAttrs"].keys() or not k in \
self.validHtml["validAttrs"][tag]:
# That attribute is not valid on this tag
continue
if k.lower()[0:2] != 'on' and v.lower()[0:10] != 'javascript':
self.result = '%s %s="%s"' % (self.result, k, v)
if tag=="a" and k=="target" and v.lower()=="_blank":
cacheTagStart += ' %s="%s"' % (k, v)
if tag == "a" and k == "target" and v.lower() == "_blank":
isBlankTarget = True
if "style" in [ k for (k,v) in attrs ]:
if styles:
syleRes = {}
styles = [ v for (k,v) in attrs if k=="style"][0].split(";")
for s in styles:
style = s[ : s.find(":") ].strip()
value = s[ s.find(":")+1 : ].strip()
if style in self.validHtml["validStyles"] and not any( [(x in value) for x in ["\"",":",";"]] ):
syleRes[ style ] = value
if len( syleRes.keys() ):
self.result += " style=\"%s\"" % "; ".join( [("%s: %s" % (k,v)) for (k,v) in syleRes.items()] )
style = s[: s.find(":")].strip()
value = s[s.find(":") + 1:].strip()
if any([c in style for c in filterChars]) or any(
[c in value for c in filterChars]):
# Either the key or the value contains a character that's not supposed to be there
continue
if value.lower().startswith("expression") or value.lower().startswith("import"):
# IE evaluates JS inside styles if the keyword expression is present
continue
if style in self.validHtml["validStyles"] and not any(
[(x in value) for x in ["\"", ":", ";"]]):
syleRes[style] = value
if len(syleRes.keys()):
cacheTagStart += " style=\"%s\"" % "; ".join(
[("%s: %s" % (k, v)) for (k, v) in syleRes.items()])
if classes:
validClasses = []
for currentClass in classes:
validClassChars = string.ascii_lowercase + string.ascii_uppercase + string.digits + "-"
if not all([x in validClassChars for x in currentClass]):
# The class contains invalid characters
continue
isOkay = False
for validClass in self.validHtml["validClasses"]:
# Check if the classname matches or is white-listed by a prefix
if validClass == currentClass:
isOkay = True
break
if validClass.endswith("*"):
validClass = validClass[:-1]
if currentClass.startswith(validClass):
isOkay = True
break
if isOkay:
validClasses.append(currentClass)
if validClasses:
cacheTagStart += " class=\"%s\"" % " ".join(validClasses)
if isBlankTarget:
self.result += " rel=\"noopener noreferrer\""
# Add rel tag to prevent the browser to pass window.opener around
cacheTagStart += " rel=\"noopener noreferrer\""
if tag in self.validHtml["singleTags"]:
self.result = self.result + ' />'
# Single-Tags do have a visual representation; ensure it makes it into the result
self.flushCache()
self.result += cacheTagStart + '>' # dont need slash in void elements in html5
else:
self.result = self.result + '>'
self.openTagsList.insert( 0, tag)
# We opened a 'normal' tag; push it on the cache so it can be discarded later if
# we detect it has no content
cacheTagStart += '>'
self.tagCache.append((cacheTagStart, tag))
else:
self.result += " "

def handle_endtag(self, tag):
if self.validHtml and tag in self.openTagsList:
for endTag in self.openTagsList[ : ]: #Close all currently open Tags until we reach the current one
self.result = "%s</%s>" % (self.result, endTag)
self.openTagsList.remove( endTag)
if endTag==tag:
break
else:
self.result += " "
if self.validHtml:
if self.tagCache:
# Check if that element is still on the cache
# and just silently drop the cache up to that point
if tag in [x[1] for x in self.tagCache] + self.openTagsList:
for tagCache in self.tagCache[::-1]:
self.tagCache.remove(tagCache)
if tagCache[1] == tag:
return
if tag in self.openTagsList:
# Close all currently open Tags until we reach the current one. If no one is found,
# we just close everything and ignore the tag that should have been closed
for endTag in self.openTagsList[:]:
self.result += "</%s>" % endTag
self.openTagsList.remove(endTag)
if endTag == tag:
break

def cleanup(self): #FIXME: vertauschte tags
def cleanup(self): # FIXME: vertauschte tags
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's meant with this fixme?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tsteinruecken What about this point?

""" Append missing closing tags """
self.flushCache()
for tag in self.openTagsList:
endTag = '</%s>' % tag
self.result += endTag

def santinize( self, instr ):
def sanitize(self, instr):
self.result = ""
self.openTagsList = []
self.feed(instr.replace("\n", " "))
self.close()
self.cleanup()
return( self.result )
return self.result



Expand Down Expand Up @@ -160,7 +249,7 @@ def serialize( self, valuesCache, name, entity ):
for lang in self.languages:
if isinstance(valuesCache[name], dict) and lang in valuesCache[name]:
val = valuesCache[name][ lang ]
if not val or (not HtmlSerializer().santinize(val).strip() and not "<img " in val):
if not val or (not HtmlSerializer().sanitize(val).strip() and not "<img " in val):
#This text is empty (ie. it might contain only an empty <p> tag
continue
entity.set("%s.%s" % (name, lang), val, self.indexed)
Expand Down Expand Up @@ -219,7 +308,7 @@ def fromClient( self, valuesCache, name, data ):
val = data["%s.%s" % (name,lang)]
err = self.isInvalid(val) #Returns None on success, error-str otherwise
if not err:
valuesCache[name][lang] = HtmlSerializer(self.validHtml).santinize(val)
valuesCache[name][lang] = HtmlSerializer(self.validHtml).sanitize(val)
else:
lastError = err
if not any(valuesCache[name].values()) and not lastError:
Expand All @@ -237,7 +326,7 @@ def fromClient( self, valuesCache, name, data ):
value = unicode(value)
err = self.isInvalid(value)
if not err:
valuesCache[name] = HtmlSerializer(self.validHtml).santinize(value)
valuesCache[name] = HtmlSerializer(self.validHtml).sanitize(value)
return err

def isInvalid( self, value ):
Expand Down Expand Up @@ -290,14 +379,14 @@ def getSearchTags(self, valuesCache, name):
return( res )
if self.languages:
for v in valuesCache.get(name).values():
value = HtmlSerializer( None ).santinize( v.lower() )
value = HtmlSerializer( None ).sanitize(v.lower())
for line in unicode(value).splitlines():
for key in line.split(" "):
key = "".join( [ c for c in key if c.lower() in conf["viur.searchValidChars"] ] )
if key and key not in res and len(key)>3:
res.append( key.lower() )
else:
value = HtmlSerializer( None ).santinize( valuesCache.get(name).lower() )
value = HtmlSerializer( None ).sanitize(valuesCache.get(name).lower())
for line in unicode(value).splitlines():
for key in line.split(" "):
key = "".join( [ c for c in key if c.lower() in conf["viur.searchValidChars"] ] )
Expand Down