How to use the html5lib.serializer.htmlserializer.HTMLSerializer function in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github mozilla / addons-server / apps / translations / utils.py View on Github external
killwords and end are currently ignored.

    ONLY USE FOR KNOWN-SAFE HTML.
    """
    tree = html5lib.parseFragment(html)
    if text_length(tree) <= length:
        return jinja2.Markup(html)
    else:
        # Get a truncated version of the tree.
        short, _ = trim(tree, length, killwords, end)

        # Serialize the parsed tree back to html.
        walker = html5lib.treewalkers.getTreeWalker('etree')
        stream = walker(short)
        serializer = html5lib.serializer.htmlserializer.HTMLSerializer(
            quote_attr_values=True, omit_optional_tags=False)
        return jinja2.Markup(force_unicode(serializer.render(stream)))
github Treeki / bitBoard / bitBoard / parser.py View on Github external
for search,replace in SMILEY_REPLACEMENTS:
		text = text.replace(search, replace)

	for regex,replace in BBCODE_REGEXES:
		text = regex.sub(replace, text)

	for search,replace in BBCODE_REPLACEMENTS:
		text = text.replace(search, replace)

	t4 = time.clock()
	doc = parser.parse(text)
	t5 = time.clock()

	walker = treewalkers.getTreeWalker('etree')
	stream = walker(doc)
	s = serializer.htmlserializer.HTMLSerializer()
	output_generator = s.serialize(stream)
	t6 = time.clock()

	done = Markup(''.join(list(output_generator)))
	t7 = time.clock()
	print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f' % (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
	return done
github sebix / python-textile / textile / tools / sanitizer.py View on Github external
Ensure that the text does not contain any malicious HTML code which might
    break the page.
    """
    try:
        import html5lib
        from html5lib import sanitizer, serializer, treewalkers
    except ImportError:
        raise Exception("html5lib not available")

    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    tree = p.parseFragment(string)

    walker = treewalkers.getTreeWalker("etree")
    stream = walker(tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
            quote_attr_values=True)
    return s.render(stream)
github thehub / hubplus / apps / plus_lib / utils.py View on Github external
def clean(self, value):
        """Run the parent class's ``clean`` on ``value`` and return sanitized HTML.

        The cleaned text is parsed as an HTML fragment with html5lib's
        ``HTMLSanitizer`` tokenizer into a DOM tree, and that tree is then
        serialized back to a single string (optional tags kept, attribute
        values quoted).
        """
        chars = super(HTMLField, self).clean(value)
        # NOTE: the input is not encoded to UTF-8 before parsing; unicode
        # input seems to work fine.
        # TODO: find out where the input gets decoded to unicode and handle
        # the encoding there instead.
        p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer, tree=treebuilders.getTreeBuilder("dom"))  # could use Beautiful Soup here instead
        s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
        dom_tree = p.parseFragment(chars)

        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        # serialize() yields the output piece by piece; join the pieces in one
        # pass instead of growing a string with repeated ``+=``.
        return "".join(s.serialize(stream))
github Ravenbrook / mps / manual / source / make-mmref.py View on Github external
"""
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree = tree_builder)
    dom = parser.parse(src)

    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u and not url_filter(urljoin(src_base, u)):
                rewritten = urljoin(rewrite_base, u)
                if u != rewritten:
                    e.setAttribute(attr, rewritten)

    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(dom)))
github mozilla / kitsune / kitsune / wiki / parser.py View on Github external
def to_unicode(self):
        """Return the unicode serialization of this object's tree.

        The tree under ``self._root`` is rendered to HTML and a fixed number
        of characters is cut from each end of the result -- presumably the
        opening and closing ``CONTAINER_TAG`` that wrap the content.
        """
        # "<TAG>" is the tag name plus the two angle brackets.
        open_len = len(self.CONTAINER_TAG) + 2
        # One more character is dropped from the end than from the start
        # (NOTE(review): looks like this accounts for the "/" in "</TAG>").
        close_len = open_len + 1

        tree_walker = getTreeWalker(self.TREEBUILDER)
        token_stream = tree_walker(self._root)
        html_serializer = HTMLSerializer(omit_optional_tags=False,
                                         quote_attr_values=True)
        rendered = html_serializer.render(token_stream)
        return rendered[open_len:-close_len]
github qgriffith / OpenEats / templatetags / sanitize.py View on Github external
def sanitize_html(value):
    """Custom filter that sanitizes HTML output so there is no bad stuff in it.

    ``value`` is parsed as an HTML fragment with html5lib's ``HTMLSanitizer``
    tokenizer into a DOM tree; the tree is then serialized back to a string
    with optional tags kept.
    """
    html_parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"))
    fragment = html_parser.parseFragment(value)

    dom_walker = treewalkers.getTreeWalker("dom")
    token_stream = dom_walker(fragment)

    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False)
    pieces = html_serializer.serialize(token_stream)
    return "".join(pieces)
github mozilla / popcorn_maker / popcorn_gallery / popcorn / templates.py View on Github external
def _serialize_stream(document_tree):
    """Render ``document_tree`` to a unicode HTML string.

    The tree is walked with html5lib's ``'lxml'`` tree walker and serialized
    with optional tags kept and attribute values quoted.
    """
    lxml_walker = html5lib.treewalkers.getTreeWalker('lxml')
    token_stream = lxml_walker(document_tree)
    html_serializer = htmlserializer.HTMLSerializer(
        quote_attr_values=True, omit_optional_tags=False)
    rendered = html_serializer.render(token_stream)
    return unicode(rendered)