    # that consist only of a fragment identifier, because Google Reader
    # changes href=#foo to href=http://site/#foo
    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u:
                e.setAttribute(attr, urlparse.urljoin(base_url, u))
    # Return the HTML5 serialization of the <body> of the result (we don't
    # want the rest of the document: this breaks feed readers).
    body = dom.getElementsByTagName('body')[0]
    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    try:
        # older html5lib exposes HTMLSerializer via the htmlserializer submodule
        html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
    except AttributeError:
        html_serializer = html5lib.serializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(body)))
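The snippet above rewrites every URL-bearing attribute against base_url and then serializes only the <body>. A self-contained sketch of that pattern, assuming a small illustrative URL_ATTRIBUTES list and an absolutize() helper name that are not part of the original code:

import html5lib
from html5lib import treewalkers, serializer
try:
    from urlparse import urljoin          # Python 2
except ImportError:
    from urllib.parse import urljoin      # Python 3

URL_ATTRIBUTES = [('a', 'href'), ('img', 'src')]  # illustrative subset


def absolutize(html, base_url):
    # Parse to a minidom tree, rewrite relative URLs against base_url,
    # then serialize only the contents of <body>.
    dom = html5lib.parse(html, treebuilder='dom')
    for tag, attr in URL_ATTRIBUTES:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u:
                e.setAttribute(attr, urljoin(base_url, u))
    body = dom.getElementsByTagName('body')[0]
    walker = treewalkers.getTreeWalker('dom')
    return u''.join(serializer.HTMLSerializer().serialize(walker(body)))


# absolutize('<a href="p2">next</a>', 'http://site/post/') rewrites the link
# to point at http://site/post/p2 before serialization.
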
    csstext = _embed_css(_resolve_path(css.getAttribute("href"), rootdirs), rootdirs)
    ncss = dom.createElement("style")
    ncss.setAttribute(u"type", u"text/css")
    ncss.appendChild(dom.createTextNode(csstext))
    css.parentNode.insertBefore(ncss, css)
    css.parentNode.removeChild(css)
    _embed_images(dom, rootdirs)

    # Save out the new html file
    with open(outfile, "w") as htmlfile:
        # Fix for the API change in newer versions of html5lib (> 0.9999...):
        # the htmlserializer submodule is gone there, so fall back to its new location.
        try:
            serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
        except AttributeError:
            serializer = html5lib.serializer.HTMLSerializer()
        walker = html5lib.treewalkers.getTreeWalker("dom")
        for line in serializer.serialize(walker(dom)):
            htmlfile.write(line)
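The try/except above works around the htmlserializer module going away in newer html5lib. A minimal sketch of the same shim done once at import time instead of per call (write_html is an illustrative name, not part of the project above):

import html5lib

try:
    # older html5lib keeps the class in a submodule
    from html5lib.serializer.htmlserializer import HTMLSerializer
except ImportError:
    # newer releases expose it directly on html5lib.serializer
    from html5lib.serializer import HTMLSerializer


def write_html(dom, outfile):
    # Serialize a DOM tree back to disk with whichever class was found.
    walker = html5lib.treewalkers.getTreeWalker("dom")
    with open(outfile, "w") as htmlfile:
        for line in HTMLSerializer().serialize(walker(dom)):
            htmlfile.write(line)
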
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Cleans HTML of XSS vulnerabilities using html5lib.

    If full is False, only the contents inside <body> will be returned
    (without the <body> tags).
    """
    if full:
        dom_tree = parser.parse(data)
    else:
        dom_tree = parser.parseFragment(data)
    walker = treewalkers.getTreeWalker('dom')
    kwargs = _filter_kwargs()
    stream = TextSanitizer(walker(dom_tree), **kwargs)
    s = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values='always',
    )
    return u''.join(s.serialize(stream))
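DEFAULT_PARSER, TextSanitizer and _filter_kwargs above are project-specific and not shown. A rough standalone sketch of the same sanitize-then-serialize idea using html5lib's bundled sanitizer filter instead (sanitize_fragment is an illustrative name; assumes html5lib >= 1.0 for the string-valued quote_attr_values):

import html5lib
from html5lib import serializer, treewalkers
from html5lib.filters import sanitizer


def sanitize_fragment(data):
    # Parse the untrusted fragment, strip or escape unsafe markup with
    # the stock sanitizer filter, then re-serialize it.
    dom_tree = html5lib.parseFragment(data, treebuilder='dom')
    walker = treewalkers.getTreeWalker('dom')
    stream = sanitizer.Filter(walker(dom_tree))
    s = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values='always',
    )
    return u''.join(s.serialize(stream))


# e.g. sanitize_fragment('<p onclick="evil()">hi</p>') drops the onclick handler.
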
def trim_html(html):
    if not isinstance(html, Markup):
        raise TypeError("trim_html: expected Markup, got {!r}".format(type(html)))

    # TODO i think this could be combined with the bleach.clean call to avoid a
    # double parse? filters apply during serialization, bleach applies during
    # tokenization
    # TODO alternatively, could this apply during tokenization to avoid
    # bothering with any markup we're not even going to show?
    tree = html5lib.parse(html)
    walker = html5lib.getTreeWalker('etree')
    stream = walker(tree)
    stream = TrimFilter(stream)
    serializer = html5lib.serializer.HTMLSerializer()
    return Markup(u''.join(serializer.serialize(stream)).strip())
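TrimFilter's implementation isn't shown above; for reference, tree-walker filters of this kind are just iterables wrapping the token stream. A rough sketch of a comment-stripping filter built on html5lib's base Filter class (assumes html5lib >= 1.0, where the base class lives in html5lib.filters.base; DropCommentsFilter is illustrative, not the project's TrimFilter):

from html5lib.filters import base


class DropCommentsFilter(base.Filter):
    # Illustrative stand-in for something like TrimFilter: wrap the
    # tree-walker token stream and filter it during serialization.
    def __iter__(self):
        for token in base.Filter.__iter__(self):
            if token['type'] != 'Comment':
                yield token


# Used the same way as TrimFilter above:
#     stream = DropCommentsFilter(walker(tree))
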
        for child in node.getchildren():
            new_child = transform(child)
            if new_child != child:
                if new_child is not None:
                    child.addnext(new_child)
                node.remove(child)
        return node

    prefix, root_node = html5lib.parseFragment(text, treebuilder='lxml')
    node = transform(root_node)
    node.attrib.clear()
    node.tail = None
    walker = html5lib.treewalkers.getTreeWalker('lxml')
    stream = walker(node)
    serializer = html5lib.serializer.htmlserializer.HTMLSerializer(omit_optional_tags=True)
    output_generator = serializer.serialize(stream)
    return u''.join(output_generator)
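The transform() and the lxml-specific plumbing above are project code; the omit_optional_tags flag it passes is plain html5lib. A small sketch (illustrative input, default etree tree) of what that flag changes:

import html5lib
from html5lib import serializer, treewalkers

fragment = html5lib.parseFragment('<table><tr><td>x</td></tr></table>')
walker = treewalkers.getTreeWalker('etree')

keep_all = serializer.HTMLSerializer(omit_optional_tags=False)
drop_optional = serializer.HTMLSerializer(omit_optional_tags=True)

print(u''.join(keep_all.serialize(walker(fragment))))
# every start/end tag is emitted, e.g. <table><tbody><tr><td>x</td></tr></tbody></table>
print(u''.join(drop_optional.serialize(walker(fragment))))
# optional tags such as </td>, </tr> and <tbody> are omitted
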
    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            sys.stdout.write(document.toxml("utf-8"))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite("utf-8"))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except AttributeError:
                    # opts may not define every serializer option
                    pass
            if not kwargs.get('quote_char'):
                # let the serializer pick its default quote character
                kwargs.pop('quote_char', None)
            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
                if not text.endswith('\n'):
                    sys.stdout.write('\n')

    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
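The Python-version check above exists because the serializer yields text when encoding is None and bytes when an encoding is given. A short sketch of that behaviour on an illustrative document:

import sys
import html5lib
from html5lib import serializer, treewalkers

doc = html5lib.parse(u'<p>caf\xe9</p>')
walker = treewalkers.getTreeWalker('etree')
ser = serializer.HTMLSerializer()

text_chunks = list(ser.serialize(walker(doc)))                    # unicode str
byte_chunks = list(ser.serialize(walker(doc), encoding='utf-8'))  # bytes

# On Python 3, sys.stdout.write() wants str, hence encoding=None above;
# on Python 2 it accepts the UTF-8 encoded bytes directly.
sys.stdout.write(u''.join(text_chunks))
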