How to use the html5lib.treewalkers.getTreeWalker function in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ravenbrook / mps / manual / source / make-mmref.py View on Github external
attribute) pairs that contain URLs to be rewritten.

    """
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree = tree_builder)
    dom = parser.parse(src)

    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u and not url_filter(urljoin(src_base, u)):
                rewritten = urljoin(rewrite_base, u)
                if u != rewritten:
                    e.setAttribute(attr, rewritten)

    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(dom)))
github chagel / CNPROG / utils / html.py View on Github external
def sanitize_html(html):
    """Return a sanitized serialization of the HTML fragment ``html``.

    The fragment is parsed with ``HTMLSanitizer`` as the tokenizer into a
    "dom" tree, which is then walked and serialized back to a unicode
    string with optional tags kept and attribute values quoted.
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                          tree=dom_builder)
    fragment = fragment_parser.parseFragment(html)

    # Walk the parsed fragment and feed the token stream to the serializer.
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(token_stream))
github modernmt / DataCollection / baseline / html2text.py View on Github external
def html2text(html, sanitize=False, ignore_br=False):
    """ Takes utf-8 encoded page and returns unicode text """
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html.decode("utf-8"))
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    space_introducing_tags = set(['th', 'td'])
    # Add space around spans
    # This technically violates the standard as spans
    # don't introduce whitespace. In practice whitespace
    # is often added via CSS and spans rarely end in the
    # middle of a word.
    space_introducing_tags.add('span')

    line_break_tags = block_level_elements
    line_break_tags.add('tr')  # <tr> introduces line-break
    line_break_tags.add('li')  # <li> introduces line-break
    line_break_tags.add('option')  # <option> introduces line-break

    if ignore_br:
github mozilla / kitsune / apps / wiki / parser.py View on Github external
def to_unicode(self):
        """Render the tree under ``self._root`` as HTML, minus its wrapper.

        The tree is walked with the tree walker selected by
        ``self.TREEBUILDER`` and rendered with attribute values quoted and
        optional tags kept.  The rendered text is then sliced to drop
        ``len(self.CONTAINER_TAG) + 2`` characters from the front and one
        character more than that from the back.
        """
        # Tag name plus the surrounding "<" and ">".
        wrapper_len = len(self.CONTAINER_TAG) + 2
        tokens = getTreeWalker(self.TREEBUILDER)(self._root)
        html_serializer = HTMLSerializer(quote_attr_values=True,
                                         omit_optional_tags=False)
        rendered = html_serializer.render(tokens)
        # NOTE(review): the extra trailing character is presumably the "/"
        # of the closing container tag -- confirm against the caller.
        return rendered[wrapper_len:-(wrapper_len + 1)]
github mozilla / addons-server / src / olympia / translations / utils.py View on Github external
"""
    Return a slice of ``html`` <= length chars.

    killwords and end are currently ignored.

    ONLY USE FOR KNOWN-SAFE HTML.
    """
    tree = html5lib.parseFragment(html)
    if text_length(tree) <= length:
        return jinja2.Markup(html)
    else:
        # Get a truncated version of the tree.
        short, _ = trim(tree, length, killwords, end)

        # Serialize the parsed tree back to html.
        walker = html5lib.treewalkers.getTreeWalker('etree')
        stream = walker(short)
        serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always', omit_optional_tags=False)
        return jinja2.Markup(force_text(serializer.render(stream)))
github gallantlab / pycortex / cortex / webgl / htmlembed.py View on Github external
ncss = dom.createElement("style")
            ncss.setAttribute(u"type", u"text/css")
            ncss.appendChild(dom.createTextNode(csstext))
            css.parentNode.insertBefore(ncss, css)
            css.parentNode.removeChild(css)

    _embed_images(dom, rootdirs)

    #Save out the new html file
    with open(outfile, "w") as htmlfile:
        # Fix an error caused by changes in newer versions of html5lib (> 0.9999...)
        try:
            serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
        except AttributeError:
            serializer = html5lib.serializer.HTMLSerializer()
        walker = html5lib.treewalkers.getTreeWalker("dom")

        for line in serializer.serialize(walker(dom)):
            htmlfile.write(line)
github html5lib / html5lib-python / parse.py View on Github external
document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite("utf-8"))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts,opt)
                except:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error:
        errList=[]
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
github pypa / warehouse / warehouse / filters.py View on Github external
def camoify(ctx, value):
    """Return ``value`` re-serialized with every image source camouflaged.

    ``value`` is parsed into a DOM document; each ``<img>`` element that has
    a non-empty ``src`` attribute gets that attribute replaced by
    ``request.camo_url(src)``.  The request comes from ``ctx["request"]``
    when that is truthy, otherwise from ``get_current_request()``.
    """
    request = ctx.get("request") or get_current_request()

    # Parse the rendered output so the <img> elements can be edited in place.
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.html5parser.HTMLParser(tree=dom_builder).parse(value)

    for img in document.getElementsByTagName("img"):
        original_src = img.getAttribute("src")
        if not original_src:
            # Nothing to rewrite for images without a source.
            continue
        img.setAttribute("src", request.camo_url(original_src))

    # Serialize the modified document back to a single string.
    walk = html5lib.treewalkers.getTreeWalker("dom")
    dom_serializer = html5lib.serializer.HTMLSerializer()
    return "".join(dom_serializer.serialize(walk(document)))