Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
killwords and end are currently ignored.
ONLY USE FOR KNOWN-SAFE HTML.
"""
tree = html5lib.parseFragment(html)
if text_length(tree) <= length:
return jinja2.Markup(html)
else:
# Get a truncated version of the tree.
short, _ = trim(tree, length, killwords, end)
# Serialize the parsed tree back to html.
walker = html5lib.treewalkers.getTreeWalker('etree')
stream = walker(short)
serializer = html5lib.serializer.htmlserializer.HTMLSerializer(
quote_attr_values=True, omit_optional_tags=False)
return jinja2.Markup(force_unicode(serializer.render(stream)))
for search,replace in SMILEY_REPLACEMENTS:
text = text.replace(search, replace)
for regex,replace in BBCODE_REGEXES:
text = regex.sub(replace, text)
for search,replace in BBCODE_REPLACEMENTS:
text = text.replace(search, replace)
t4 = time.clock()
doc = parser.parse(text)
t5 = time.clock()
walker = treewalkers.getTreeWalker('etree')
stream = walker(doc)
s = serializer.htmlserializer.HTMLSerializer()
output_generator = s.serialize(stream)
t6 = time.clock()
done = Markup(''.join(list(output_generator)))
t7 = time.clock()
print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f' % (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
return done
Ensure that the text does not contain any malicious HTML code which might
break the page.
"""
try:
import html5lib
from html5lib import sanitizer, serializer, treewalkers
except ImportError:
raise Exception("html5lib not available")
p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
tree = p.parseFragment(string)
walker = treewalkers.getTreeWalker("etree")
stream = walker(tree)
s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
quote_attr_values=True)
return s.render(stream)
def clean(self, value):
    """Validate ``value`` with the parent field, then sanitize it as HTML.

    The text returned by the parent class's ``clean`` is parsed as an HTML
    fragment through html5lib's ``HTMLSanitizer`` tokenizer into a DOM tree,
    and that tree is serialized back to markup.

    Args:
        value: Raw field input; passed unchanged to the parent ``clean``.

    Returns:
        The serialized markup as one string. Optional tags are kept and
        attribute values are always quoted.
    """
    chars = super(HTMLField, self).clean(value)
    # NOTE: the text is handed to the parser without re-encoding to UTF-8;
    # the original author observed that unicode input works fine here.
    # TODO: find where the input is decoded to unicode and handle encoding
    # there instead.
    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"))  # Beautiful Soup would be an alternative
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    dom_tree = parser.parseFragment(chars)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    # Join the serializer's chunks in one pass instead of growing a string
    # with ``+=`` on every chunk.
    return "".join(html_serializer.serialize(stream))
"""
tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
parser = html5lib.html5parser.HTMLParser(tree = tree_builder)
dom = parser.parse(src)
for tag, attr in url_attributes:
for e in dom.getElementsByTagName(tag):
u = e.getAttribute(attr)
if u and not url_filter(urljoin(src_base, u)):
rewritten = urljoin(rewrite_base, u)
if u != rewritten:
e.setAttribute(attr, rewritten)
tree_walker = html5lib.treewalkers.getTreeWalker('dom')
html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
return u''.join(html_serializer.serialize(tree_walker(dom)))
def to_unicode(self):
    """Serialize this object's tree to text, minus the wrapping container.

    The tree rooted at ``self._root`` is rendered with optional tags kept
    and attribute values quoted; the leading and trailing container markup
    is then sliced off the rendered string.
    """
    # Presumably the rendered text is wrapped in <CONTAINER_TAG>...</CONTAINER_TAG>:
    # the opening tag is the name plus "<" and ">", and the closing tag is
    # one character longer because of its "/".
    opening_len = len(self.CONTAINER_TAG) + 2
    closing_len = opening_len + 1

    tree_walker = getTreeWalker(self.TREEBUILDER)
    token_stream = tree_walker(self._root)
    html_serializer = HTMLSerializer(omit_optional_tags=False,
                                     quote_attr_values=True)
    rendered = html_serializer.render(token_stream)
    return rendered[opening_len:-closing_len]
def sanitize_html(value):
    """Custom filter that sanitizes HTML output so it carries no unsafe markup.

    ``value`` is parsed as an HTML fragment through html5lib's
    ``HTMLSanitizer`` tokenizer into a DOM tree, which is then serialized
    back to a string.

    Args:
        value: The HTML text to sanitize.

    Returns:
        The serialized markup as one string, with optional tags kept and
        every attribute value quoted.
    """
    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(value)
    stream = treewalkers.getTreeWalker("dom")(dom_tree)
    # Always quote attribute values. With this boolean-option serializer API,
    # unquoted values can let crafted attribute text escape the attribute in
    # some browsers, which would undo the sanitizing done above.
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    return "".join(html_serializer.serialize(stream))
def _serialize_stream(document_tree):
    """Render ``document_tree`` to a unicode HTML string.

    The tree is walked with html5lib's ``'lxml'`` tree walker and serialized
    with optional tags kept and attribute values quoted.
    """
    lxml_walker = html5lib.treewalkers.getTreeWalker('lxml')
    html_serializer = htmlserializer.HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False)
    rendered = html_serializer.render(lxml_walker(document_tree))
    return unicode(rendered)