How to use the html5lib.treewalkers module in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github qgriffith / OpenEats / templatetags / sanitize.py View on Github external
def sanitize_html(value):
    """Custom template filter that sanitizes HTML output.

    Parses *value* as an HTML fragment with html5lib's sanitizing
    tokenizer (which drops unsafe tags/attributes), then re-serializes
    the cleaned DOM back to a string.
    """
    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    )
    fragment = parser.parseFragment(value)

    # Walk the sanitized DOM as a token stream and re-emit it, keeping
    # optional tags so the output mirrors the input structure.
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    return "".join(html_serializer.serialize(token_stream))
github sebix / python-textile / textile / tools / sanitizer.py View on Github external
def sanitize(string):
    """
    Ensure that the text does not contain any malicious HTML code which
    might break the page.

    Raises a plain Exception when html5lib is not installed.
    """
    try:
        import html5lib
        from html5lib import sanitizer, serializer, treewalkers
    except ImportError:
        raise Exception("html5lib not available")

    # Parse with the sanitizing tokenizer so disallowed markup is dropped.
    fragment = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).parseFragment(string)

    # Walk the cleaned etree and render it back to an HTML string.
    token_stream = treewalkers.getTreeWalker("etree")(fragment)
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return html_serializer.render(token_stream)
github html5rocks / www.html5rocks.com / main.py View on Github external
def get_toc(self, path):
    """Build a table-of-contents list for a rendered template page.

    Renders the template at *path*, parses the result with html5lib, and
    walks the token stream collecting ``h2`` headings that carry an
    ``id`` attribute.  Results are looked up in memcache per path first.

    NOTE(review): this excerpt is truncated — the end of the walker loop
    and the final return of ``toc`` are not visible here.
    """

    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path) or re.search('style-guide', path)):
      return ''

    # Try the cache first; rebuild when absent or a fresh copy was requested.
    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
      template_text = render_to_string(path, {})

      # Parse the rendered page into a DOM and walk it as a flat stream
      # of StartTag/EndTag/character tokens.
      parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
      dom_tree = parser.parse(template_text)
      walker = treewalkers.getTreeWalker("dom")
      stream = walker(dom_tree)
      toc = []
      current = None        # TOC entry currently being accumulated, if any
      innerTagCount = 0     # depth of tags opened inside the current heading
      for element in stream:
        if element['type'] == 'StartTag':
          if element['name'] in ['h2']:
            # Start a new TOC entry only for headings that have an id.
            for attr in element['data']:
              if attr[0] == 'id':
                current = {
                  # 'h2'[-1:] is '2'; level is therefore 1 for h2.
                  'level' : int(element['name'][-1:]) - 1,
                  'id' : attr[1],
                  'text': ''
                }
          elif current is not None:
            # A nested tag opened inside the heading; its matching EndTag
            # is presumably handled further down (not shown in excerpt).
            innerTagCount += 1
github tantalor / emend / app / emend / html.py View on Github external
def strip_tags(html):
  """Strip markup from an HTML fragment via the StripTags tokenizer.

  Returns the re-serialized text, or None when *html* is falsy.
  """
  if not html:
    return None
  dom_builder = treebuilders.getTreeBuilder("dom")
  parsed = html5lib.HTMLParser(tree=dom_builder, tokenizer=StripTags).parseFragment(html)
  token_stream = treewalkers.getTreeWalker("dom")(parsed)
  return HTMLSerializer().render(token_stream)
github Treeki / bitBoard / bitBoard / parser.py View on Github external
t3 = time.clock()

	# Plain-text smiley codes are replaced before BBCode handling.
	for search,replace in SMILEY_REPLACEMENTS:
		text = text.replace(search, replace)

	# Regex-based BBCode tags (patterns with arguments).
	for regex,replace in BBCODE_REGEXES:
		text = regex.sub(replace, text)

	# Simple fixed-string BBCode substitutions.
	for search,replace in BBCODE_REPLACEMENTS:
		text = text.replace(search, replace)

	t4 = time.clock()
	# NOTE(review): time.clock() was removed in Python 3.8; this snippet
	# presumably targets Python 2 / early Python 3 — confirm before reuse.
	doc = parser.parse(text)
	t5 = time.clock()

	# Re-serialize the parsed document back to HTML via a treewalker.
	walker = treewalkers.getTreeWalker('etree')
	stream = walker(doc)
	s = serializer.htmlserializer.HTMLSerializer()
	output_generator = s.serialize(stream)
	t6 = time.clock()

	# Join the serialized tokens and mark the result as safe markup.
	done = Markup(''.join(list(output_generator)))
	t7 = time.clock()
	# Timing breakdown for each phase of the conversion pipeline.
	print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f' % (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
	return done
github Komodo / KomodoEdit / contrib / html5lib / examples / parse.py View on Github external
# NOTE(review): Python 2 syntax (print statements); this excerpt starts
# mid-function, so the surrounding indentation is inconsistent as scraped.
if opts.encoding:
        # Report the character encoding html5lib sniffed from the stream.
        print "Encoding:", parser.tokenizer.stream.charEncoding
    if opts.xml:
        # Emit the parsed document as raw XML.
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        # Dump html5lib's internal test-format tree; wrap a lone document
        # in a list so fragments and documents share one code path.
        if not hasattr(document,'__getitem__'): document = [document]
        for fragment in document:
            print parser.tree.testSerializer(fragment).encode("utf-8")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        # Re-serialize to HTML, forwarding every serializer option that
        # was collected from the command line.
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            kwargs[opt] = getattr(opts,opt)
        # An unset quote_char must not be passed through as None.
        if not kwargs['quote_char']: del kwargs['quote_char']
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding='utf-8'):
            sys.stdout.write(text)
        # Ensure the output ends with a newline.
        if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error:
        # Print every parse error with its line/column position.
        errList=[]
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
github t3nsor / quora-backup / converter.py View on Github external
# NOTE(review): excerpt starts mid-function; `document`, `head_node`,
# `answer_node`, `new_page`, `args`, `filename` and `serializer` are
# defined earlier and not visible here.
script_node = document.createElement('script')
    # Inline MathJax configuration: restrict it to [math]...[/math] spans
    # and lock down the Safe extension so answer content cannot inject
    # scripts or styles through TeX.
    script_text = document.createTextNode('window.MathJax = {"showMathMenu":false,"messageStyle":"none","errorSettings":{"style":{"color":"#000000","font-style":"normal"}},"HTML-CSS":{"linebreaks":{"automatic":true,"width":"container"},"EqnChunk":150,"EqnChunkDelay":20},"tex2jax":{"inlineMath":[["[math]","[/math]"]],"displayMath":[],"ignoreClass":"edit_latex|qtext_editor_content|ignore_latex","processClass":"render_latex","processEnvironments":false,"preview":"none"},"TeX":{"noUndefined":{"attributes":{"mathcolor":"red"}},"noErrors":{"multiLine":true,"style":{"max-width":"100%","overflow":"hidden"}},"Macros":{"C":"{\\mathbb{C}}","N":"{\\mathbb{N}}","O":"{\\emptyset}","Q":"{\\mathbb{Q}}","R":"{\\mathbb{R}}","Z":"{\\mathbb{Z}}"}},"fast-preview":{"disabled":true},"Safe":{"allow":{"URLs":"none","classes":"none","cssIDs":"none","styles":"none","fontsize":"none","require":"none"}}};')
    script_node.appendChild(script_text)
    head_node.appendChild(script_node)
    # and then load MathJax:
    script_node = document.createElement('script')
    script_node.setAttribute('type', 'text/javascript')
    script_node.setAttribute('src', 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js?config=TeX-AMS-MML_HTMLorMML,Safe')
    head_node.appendChild(script_node)
    new_page.appendChild(head_node)
    body_node = document.createElement('body')
    # This step processes Quora's HTML into a more lightweight and portable form.
    cleanup_tree(document, answer_node, body_node)
    new_page.appendChild(body_node)
    # Okay! Finally, save the HTML.
    # NOTE(review): `walker` is built but not used below — `serializer.serialize`
    # is called on the DOM node directly; possibly leftover code.
    walker = treewalkers.getTreeWalker('dom')(new_page)
    try:
        with open(args.output_dir + '/' + filename, 'wb', 0o600) as saved_page:
            saved_page.write(b'')
            saved_page.write(serializer.serialize(new_page, 'dom', 'utf-8', omit_optional_tags=False))
    except IOError as error:
        print('[ERROR] Failed to save to file %s (%s)' % (filename, error.strerror), file=sys.stderr)

print('Done', file=sys.stderr)
github w3c / html-tools / publish.py View on Github external
# NOTE(review): excerpt starts mid-function; `filtered`, `pre_anolis_buffer`
# and `etree` are defined earlier and not visible here.
# Parse
        parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
        tree = parser.parse(filtered, encoding='utf-8')

        # Move introduction above conformance requirements
        introduction = tree.findall("//*[@id='introduction']")[0]
        # Grab the paragraphs that follow the introduction heading so they
        # can be moved along with it.
        intro_ps = introduction.xpath("following-sibling::*")
        target = tree.findall("//*[@id='conformance-requirements']")[0]
        target.addprevious(introduction)
        target = introduction
        # addnext inserts directly after `target`, so add in reverse order
        # to preserve the original paragraph sequence.
        target.addnext(intro_ps[2])
        target.addnext(intro_ps[1])
        target.addnext(intro_ps[0])

        # Serialize
        tokens = html5lib.treewalkers.getTreeWalker('lxml')(tree)
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding='utf-8'):
            pre_anolis_buffer.write(text)

        filtered = pre_anolis_buffer

    # replace data-x with data-anolis-xref
    # NOTE(review): Python 2 print statement below.
    print "fixing xrefs"
    filtered.seek(0)

    # Parse
    builder = treebuilders.getTreeBuilder("lxml", etree)
    # Older html5lib versions do not accept namespaceHTMLElements, so fall
    # back to the plain constructor when the keyword is rejected.
    try:
        parser = html5lib.HTMLParser(tree=builder, namespaceHTMLElements=False)
    except TypeError:
        parser = html5lib.HTMLParser(tree=builder)
github rubys / venus / planet / scrub.py View on Github external
# NOTE(review): excerpt starts mid-function inside a loop; `node` is a feed
# content entry (a dict with 'type' and 'value' keys) defined earlier.
doc = None
            if 'xhtml' in node['type']:
              # XHTML content should be well-formed XML; try minidom first.
              try:
                from xml.dom import minidom
                doc = minidom.parseString(node['value'])
              except:
                # Not well-formed after all — downgrade to tag-soup HTML.
                node['type']='text/html'

            if not doc:
              # Fall back to html5lib's forgiving parser for HTML content.
              from html5lib import html5parser, treebuilders
              p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
              doc = p.parseFragment(node['value'], encoding='utf-8')

            from html5lib import treewalkers, serializer
            from html5lib.filters import sanitizer
            # Sanitize the token stream while walking, then serialize as XHTML.
            walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
            xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
            tree = xhtml.serialize(walker, encoding='utf-8')

            node['value'] = ''.join([str(token) for token in tree])