How to use the html5lib.serializer module in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github neurodebian / neurodebian / sphinx / sphinxext / feed / absolutify_urls.py View on Github external
# that consist only of a fragment identifier, because Google Reader
    # changes href=#foo to href=http://site/#foo
    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u:
                e.setAttribute(attr, urlparse.urljoin(base_url, u))

    # Return the HTML5 serialization of the <body> of the result (we don't
    # want the <head>: this breaks feed readers).
    body = dom.getElementsByTagName('body')[0]
    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    try:
        html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
    except AttributeError:
        html_serializer = html5lib.serializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(body)))
github html5lib / html5lib-python / parse.py View on Github external
elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts,opt)
                except:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error:
        errList=[]
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
github gallantlab / pycortex / cortex / webgl / htmlembed.py View on Github external
csstext = _embed_css(_resolve_path(css.getAttribute("href"), rootdirs), rootdirs)
            ncss = dom.createElement("style")
            ncss.setAttribute(u"type", u"text/css")
            ncss.appendChild(dom.createTextNode(csstext))
            css.parentNode.insertBefore(ncss, css)
            css.parentNode.removeChild(css)

    _embed_images(dom, rootdirs)

    #Save out the new html file
    with open(outfile, "w") as htmlfile:
        # Work around the serializer module layout change in newer versions of html5lib (> 0.9999...)
        try:
            serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
        except AttributeError:
            serializer = html5lib.serializer.HTMLSerializer()
        walker = html5lib.treewalkers.getTreeWalker("dom")

        for line in serializer.serialize(walker(dom)):
            htmlfile.write(line)
github divio / djangocms-text-ckeditor / djangocms_text_ckeditor / html.py View on Github external
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Clean HTML of XSS vulnerabilities using html5lib.

    The parsed tree is walked as a DOM, passed through ``TextSanitizer`` and
    serialized back to a single unicode string.

    If ``full`` is False, ``data`` is parsed as a fragment, so only the
    contents inside <body> will be returned (without the <body> tags).
    """
    # Choose the parsing entry point up front; only the selected attribute
    # is looked up on the parser.
    parse = parser.parse if full else parser.parseFragment
    document = parse(data)

    dom_walker = treewalkers.getTreeWalker('dom')
    sanitizer_options = _filter_kwargs()
    sanitized_tokens = TextSanitizer(dom_walker(document), **sanitizer_options)

    html_serializer = serializer.HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
    )
    chunks = html_serializer.serialize(sanitized_tokens)
    return u''.join(chunks)
github eevee / spline / spline / display / rendering.py View on Github external
def trim_html(html):
    """Run ``html`` through ``TrimFilter`` and return the result as ``Markup``.

    The input is parsed with html5lib, walked as an etree, filtered, and
    serialized again; surrounding whitespace is stripped from the output.

    Raises ``TypeError`` if ``html`` is not a ``Markup`` instance.
    """
    if not isinstance(html, Markup):
        message = "trim_html: expected Markup, got {!r}".format(type(html))
        raise TypeError(message)

    # TODO this might be mergeable with the bleach.clean call so the markup
    # is only parsed once?  filters run at serialization time, whereas bleach
    # works at tokenization time
    # TODO alternatively, could the trimming happen during tokenization, so
    # markup that will never be shown is not processed at all?
    document = html5lib.parse(html)
    make_stream = html5lib.getTreeWalker('etree')
    trimmed_tokens = TrimFilter(make_stream(document))

    html_serializer = html5lib.serializer.HTMLSerializer()
    rendered = u''.join(html_serializer.serialize(trimmed_tokens))
    return Markup(rendered.strip())
github mitsuhiko / bf3-aggregator / bf3.py View on Github external
for child in node.getchildren():
                new_child = transform(child)
                if new_child != child:
                    if new_child is not None:
                        child.addnext(new_child)
                    node.remove(child)
            return node

        prefix, root_node = html5lib.parseFragment(text, treebuilder='lxml')
        node = transform(root_node)
        node.attrib.clear()
        node.tail = None

        walker = html5lib.treewalkers.getTreeWalker('lxml')
        stream = walker(node)
        serializer = html5lib.serializer.htmlserializer.HTMLSerializer(omit_optional_tags=True)
        output_generator = serializer.serialize(stream)
        return u''.join(output_generator)
github html5lib / html5lib-python / parse.py View on Github external
for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            sys.stdout.write(document.toxml("utf-8"))
        elif opts.tree:
            if not hasattr(document,'__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite("utf-8"))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts,opt)
                except:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'): sys.stdout.write('\n')
    if opts.error: