Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def sanitize_html(value):
    """Sanitize an HTML fragment so it contains no unsafe markup.

    The fragment is parsed with html5lib's sanitizing tokenizer (which
    drops non-whitelisted tags/attributes), walked as a DOM tree, and
    re-serialized with optional tags kept explicit.
    """
    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    )
    fragment = parser.parseFragment(value)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_out = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    return "".join(html_out.serialize(token_stream))
def sanitize(string):
    """Ensure *string* contains no malicious HTML that could break the page.

    Parses the text through html5lib's sanitizing tokenizer and renders
    the cleaned fragment back to HTML with quoted attribute values.

    Raises:
        Exception: if html5lib is not installed.
    """
    try:
        import html5lib
        from html5lib import sanitizer, serializer, treewalkers
    except ImportError:
        raise Exception("html5lib not available")

    parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = parser.parseFragment(string)
    token_stream = treewalkers.getTreeWalker("etree")(fragment)
    html_out = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return html_out.render(token_stream)
def get_toc(self, path):
    # Build (or fetch from memcache) a table of contents for a rendered page
    # by scanning its <h2 id=...> headings.
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path) or re.search('style-guide', path)):
        return ''
    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        # Cache miss (or the request opted out of caching): render the
        # template with an empty context and walk its DOM token stream.
        template_text = render_to_string(path, {})
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        innerTagCount = 0
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2']:
                    # An <h2> carrying an id starts a new TOC entry; the
                    # level is the digit in the tag name minus one (h2 -> 1).
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level' : int(element['name'][-1:]) - 1,
                                'id' : attr[1],
                                'text': ''
                            }
                elif current is not None:
                    # A tag opened inside the heading currently being
                    # collected; count nesting depth so the close can be
                    # matched later.
                    # NOTE(review): this chunk is truncated here -- the code
                    # that accumulates heading text, closes entries, and
                    # returns/caches ``toc`` is outside the visible source.
                    innerTagCount += 1
def strip_tags(html):
    """Strip all markup from *html* via the StripTags tokenizer and return
    the re-serialized result.

    Falsy input (empty string / None) falls through and yields ``None``,
    matching the original contract.
    """
    if not html:
        return None
    dom_builder = treebuilders.getTreeBuilder("dom")
    stripping_parser = html5lib.HTMLParser(tree=dom_builder, tokenizer=StripTags)
    fragment = stripping_parser.parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    return HTMLSerializer().render(token_stream)
# NOTE(review): tail fragment of a larger rendering function -- ``t1``,
# ``t2``, ``text``, ``parser``, ``Markup`` and the replacement tables are
# defined upstream, outside this chunk.
t3 = time.clock()  # NOTE(review): time.clock() was removed in Python 3.8;
                   # t1/t2 upstream use the same timer, so any migration to
                   # time.perf_counter() must cover the whole function.
# Literal smiley substitutions, then regex-based BBCode, then literal
# BBCode replacements -- order matters (regexes run before the literals).
for search,replace in SMILEY_REPLACEMENTS:
    text = text.replace(search, replace)
for regex,replace in BBCODE_REGEXES:
    text = regex.sub(replace, text)
for search,replace in BBCODE_REPLACEMENTS:
    text = text.replace(search, replace)
t4 = time.clock()
doc = parser.parse(text)
t5 = time.clock()
# Walk the parsed etree and re-serialize it back to an HTML string.
walker = treewalkers.getTreeWalker('etree')
stream = walker(doc)
s = serializer.htmlserializer.HTMLSerializer()
output_generator = s.serialize(stream)
t6 = time.clock()
done = Markup(''.join(list(output_generator)))
t7 = time.clock()
# Per-phase timing breakdown, printed for profiling.
print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f' % (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
return done
# NOTE(review): command-line output fragment (Python 2 -- print statements).
# ``opts``, ``parser`` and ``document`` are produced earlier, outside this
# chunk.  Each branch emits the parse result in a different format.
if opts.encoding:
    # Report the character encoding the tokenizer detected or assumed.
    print "Encoding:", parser.tokenizer.stream.charEncoding
if opts.xml:
    sys.stdout.write(document.toxml("utf-8"))
elif opts.tree:
    # Dump the parse tree in html5lib's test-serializer format; wrap a
    # single document in a list so the loop also handles fragment lists.
    if not hasattr(document,'__getitem__'): document = [document]
    for fragment in document:
        print parser.tree.testSerializer(fragment).encode("utf-8")
elif opts.hilite:
    sys.stdout.write(document.hilite("utf-8"))
elif opts.html:
    # Re-serialize as HTML, forwarding every recognized serializer option
    # straight from the parsed command-line options.
    kwargs = {}
    for opt in serializer.HTMLSerializer.options:
        kwargs[opt] = getattr(opts,opt)
    # An unset quote_char means "let the serializer pick its default".
    if not kwargs['quote_char']: del kwargs['quote_char']
    tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
    for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding='utf-8'):
        sys.stdout.write(text)
    # Ensure the output ends with a newline (``text`` is the last chunk).
    if not text.endswith('\n'): sys.stdout.write('\n')
if opts.error:
    # Print every accumulated parse error with its line/column position.
    errList=[]
    for pos, errorcode, datavars in parser.errors:
        errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
    sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
# Inject an inline MathJax configuration <script>: menus/messages disabled,
# [math]...[/math] inline delimiters, container-width line breaking, common
# \C/\N/\Q/\R/\Z macro shorthands, and a fully locked-down Safe profile.
script_node = document.createElement('script')
script_text = document.createTextNode('window.MathJax = {"showMathMenu":false,"messageStyle":"none","errorSettings":{"style":{"color":"#000000","font-style":"normal"}},"HTML-CSS":{"linebreaks":{"automatic":true,"width":"container"},"EqnChunk":150,"EqnChunkDelay":20},"tex2jax":{"inlineMath":[["[math]","[/math]"]],"displayMath":[],"ignoreClass":"edit_latex|qtext_editor_content|ignore_latex","processClass":"render_latex","processEnvironments":false,"preview":"none"},"TeX":{"noUndefined":{"attributes":{"mathcolor":"red"}},"noErrors":{"multiLine":true,"style":{"max-width":"100%","overflow":"hidden"}},"Macros":{"C":"{\\mathbb{C}}","N":"{\\mathbb{N}}","O":"{\\emptyset}","Q":"{\\mathbb{Q}}","R":"{\\mathbb{R}}","Z":"{\\mathbb{Z}}"}},"fast-preview":{"disabled":true},"Safe":{"allow":{"URLs":"none","classes":"none","cssIDs":"none","styles":"none","fontsize":"none","require":"none"}}};')
script_node.appendChild(script_text)
head_node.appendChild(script_node)
# and then load MathJax:
script_node = document.createElement('script')
script_node.setAttribute('type', 'text/javascript')
script_node.setAttribute('src', 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js?config=TeX-AMS-MML_HTMLorMML,Safe')
head_node.appendChild(script_node)
new_page.appendChild(head_node)
body_node = document.createElement('body')
# This step processes Quora's HTML into a more lightweight and portable form.
cleanup_tree(document, answer_node, body_node)
new_page.appendChild(body_node)
# Okay! Finally, save the HTML.
# (The previous code also built an unused tree walker here --
# serializer.serialize() walks the tree itself -- and wrote an empty
# bytestring before the real payload; both no-ops were removed.)
import os  # local import: the file's import block is outside this chunk
try:
    # BUG FIX: open()'s third positional argument is *buffering*, not file
    # permissions -- the original passed 0o600 there, silently configuring a
    # 384-byte buffer and leaving default permissions.  Create the file with
    # mode 0o600 explicitly via os.open instead.
    fd = os.open(args.output_dir + '/' + filename,
                 os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'wb') as saved_page:
        saved_page.write(serializer.serialize(new_page, 'dom', 'utf-8', omit_optional_tags=False))
except IOError as error:
    print('[ERROR] Failed to save to file %s (%s)' % (filename, error.strerror), file=sys.stderr)
print('Done', file=sys.stderr)
# NOTE(review): spec post-processing fragment (Python 2 -- see the print
# statement below); ``filtered``, ``pre_anolis_buffer`` and ``etree`` come
# from earlier, outside this chunk.
# Parse
parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
tree = parser.parse(filtered, encoding='utf-8')
# Move introduction above conformance requirements
introduction = tree.findall("//*[@id='introduction']")[0]
# The elements following the introduction heading move along with it.
intro_ps = introduction.xpath("following-sibling::*")
target = tree.findall("//*[@id='conformance-requirements']")[0]
target.addprevious(introduction)
target = introduction
# lxml's addnext() inserts immediately *after* ``target``, so the three
# paragraphs are inserted in reverse to preserve their original order.
target.addnext(intro_ps[2])
target.addnext(intro_ps[1])
target.addnext(intro_ps[0])
# Serialize
tokens = html5lib.treewalkers.getTreeWalker('lxml')(tree)
serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
for text in serializer.serialize(tokens, encoding='utf-8'):
    pre_anolis_buffer.write(text)
# Reuse the buffer as the input for the next processing pass.
filtered = pre_anolis_buffer
# replace data-x with data-anolis-xref
print "fixing xrefs"
filtered.seek(0)
# Parse
builder = treebuilders.getTreeBuilder("lxml", etree)
try:
    # Older html5lib versions do not accept namespaceHTMLElements; fall
    # back to the plain constructor when the keyword is rejected.
    parser = html5lib.HTMLParser(tree=builder, namespaceHTMLElements=False)
except TypeError:
    parser = html5lib.HTMLParser(tree=builder)
# Sanitize a feed content node: parse its value (strict XML first for
# XHTML-typed nodes, HTML otherwise), run it through html5lib's sanitizer
# filter, and store the serialized XHTML back into node['value'].
doc = None
if 'xhtml' in node['type']:
    try:
        from xml.dom import minidom
        doc = minidom.parseString(node['value'])
    except Exception:
        # BUG FIX: was a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit.  ``Exception`` keeps the same
        # best-effort fallback (malformed XHTML is downgraded and
        # reparsed as HTML below) while letting those through.
        node['type']='text/html'
if not doc:
    from html5lib import html5parser, treebuilders
    p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = p.parseFragment(node['value'], encoding='utf-8')
from html5lib import treewalkers, serializer
from html5lib.filters import sanitizer
walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
tree = xhtml.serialize(walker, encoding='utf-8')
# NOTE(review): with encoding='utf-8' the serializer yields *bytes* tokens;
# under Python 3, str(bytes) produces "b'...'" text -- confirm this module
# runs on Python 2 or drop the encoding argument.
node['value'] = ''.join([str(token) for token in tree])