How to use html5lib - 10 common examples

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

github html5lib / html5lib-python / tests / View on Github external
if innerHTML:
            innerHTML = str(innerHTML, "utf8")

        if errors:
            errors = str(errors, "utf8")
            errors = errors.split("\n")

        expected = str(expected, "utf8")

            if innerHTML:
                document = p.parseFragment(io.BytesIO(input), innerHTML)
                    document = p.parse(io.BytesIO(input))
                except constants.DataLossWarning:
                    sys.stderr.write("Test input causes known dataloss, skipping")
            errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"), 
                                  "\nExpected:", expected,
                                  "\nTraceback:", traceback.format_exc()])
            self.assertTrue(False, errorMsg)
        output = convertTreeDump(p.tree.testSerializer(document))
        output = attrlist.sub(sortattrs, output)
        expected = convertExpected(expected)
        expected = attrlist.sub(sortattrs, expected)
        errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"), 
                              "\nExpected:", expected,
                              "\nReceived:", output])
github tiddlyweb / tiddlyweb / test / View on Github external
def import_wiki(filename='wiki', hostname='localhost', port=8080):
    """Read a wiki file and hand each stored ``div`` to the upload helpers.

    The file is decoded as UTF-8, parsed with html5lib into a BeautifulSoup
    tree, and every ``div`` found under the element with ``id="storeArea"``
    is passed to ``_do_tiddler``. ``_do_recipe`` and ``_do_bag`` are called
    once beforehand with the same host and port.

    :param filename: path of the wiki file to read.
    :param hostname: host name forwarded to the ``_do_*`` helpers.
    :param port: port forwarded to the ``_do_*`` helpers.
    """
    # io.open accepts ``encoding`` on both Python 2 and Python 3.
    import io

    # The context manager guarantees the file handle is closed after reading.
    with io.open(filename, encoding='utf-8') as f:
        wikitext = f.read()

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup'))
    soup = parser.parse(wikitext)
    store_area = soup.find('div', id='storeArea')
    divs = store_area.findAll('div')

    # Recipe and bag are set up before any tiddler is sent.
    _do_recipe(hostname, port)
    _do_bag(hostname, port)

    for tiddler in divs:
        _do_tiddler(hostname, port, tiddler)
github sphinx-doc / sphinx / tests / View on Github external
"""
    :copyright: Copyright 2007-2019 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import xml.etree.cElementTree as ElementTree
from hashlib import md5

import pytest
from html5lib import getTreeBuilder, HTMLParser
from test_build_html import flat_dict, tail_check, check_xpath

from sphinx.util.docutils import is_html5_writer_available

# html5lib tree builder backed by the ElementTree implementation imported above.
TREE_BUILDER = getTreeBuilder('etree', implementation=ElementTree)
# Shared parser instance; elements are created without the HTML namespace.
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)

# Parsed trees keyed by the file object/path handed to ``parse``.
etree_cache = {}

@pytest.mark.skipif(not is_html5_writer_available(), reason='HTML5 writer is not available')
def cached_etree_parse():
    """Define a ``parse`` helper that memoizes parsed trees in ``etree_cache``.

    NOTE(review): the visible body never returns or yields ``parse``; this
    excerpt looks truncated -- confirm against the full source.
    """
    def parse(fname):
        # Reuse the tree if this file was already parsed.
        if fname in etree_cache:
            return etree_cache[fname]
        # ``fname`` must provide ``.open('rb')`` (presumably a path object -- verify).
        with (fname).open('rb') as fp:
            etree = HTML_PARSER.parse(fp)
            etree_cache[fname] = etree
            return etree
github tiddlyweb / tiddlywebwiki / test / View on Github external
def _parse(content):
    """Parse ``content`` as a fragment and return the result of ``find('div')``.

    The fragment is built with html5lib's liberal XML parser on top of the
    ``beautifulsoup`` tree builder.
    """
    parser_cls = html5lib.liberalxmlparser.XMLParser
    builder = treebuilders.getTreeBuilder('beautifulsoup')
    fragment = parser_cls(tree=builder).parseFragment(content)
    return fragment.find('div')
github web-platform-tests / wpt / tools / manifest / View on Github external
    parsers = {"html":lambda x:html5lib.parse(x, treebuilder="etree"),
               "xhtml":lambda x:ElementTree.parse(x, XMLParser.XMLParser()),
github guillemhs / ScraperBot / examples_and_tests / scrapy / Bot4u / build / lxml / build / lib.linux-i686-2.7 / lxml / html / View on Github external
def insertComment(self, data, parent=None):
        """Delegate to ``_base.TreeBuilder.insertComment`` while ``self.rootInserted`` is false.

        Does nothing once ``self.rootInserted`` is truthy.
        """
        if self.rootInserted:
            return
        _base.TreeBuilder.insertComment(self, data, parent)
github web-platform-tests / wpt / tools / third_party / html5lib / View on Github external
                    kwargs[opt] = getattr(opts, opt)
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            if opts.sanitize:
                kwargs["sanitize"] = True

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
            if not text.endswith('\n'):
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
github html5lib / html5lib-python / tests / View on Github external
def test_all_tokens(self):
        """Check that every registered tree walker emits the expected token stream.

        For each entry in ``treeTypes`` the sample markup is parsed with that
        entry's builder, passed through its optional ``"adapter"``, walked
        with its ``"walker"``, and the tokens are compared pairwise against
        ``expected``.
        """
        expected = [
            {'data': [], 'type': 'StartTag', 'name': 'html'},
            {'data': [], 'type': 'StartTag', 'name': 'head'},
            {'data': [], 'type': 'EndTag', 'name': 'head'},
            {'data': [], 'type': 'StartTag', 'name': 'body'},
            {'data': 'a', 'type': 'Characters'},
            {'data': [], 'type': 'StartTag', 'name': 'div'},
            {'data': 'b', 'type': 'Characters'},
            {'data': [], 'type': 'EndTag', 'name': 'div'},
            {'data': 'c', 'type': 'Characters'},
            {'data': [], 'type': 'EndTag', 'name': 'body'},
            {'data': [], 'type': 'EndTag', 'name': 'html'},
        ]
        # Only the per-tree configuration is needed; the tree name is unused.
        for treeCls in treeTypes.values():
            p = html5parser.HTMLParser(tree=treeCls["builder"])
            document = p.parse("a<div>b</div>c")
            # Entries without an "adapter" use the parsed document unchanged.
            document = treeCls.get("adapter", lambda x: x)(document)
            output = treeCls["walker"](document)
            # zip stops at the shorter sequence, so only paired tokens are compared.
            for expectedToken, outputToken in zip(expected, output):
                self.assertEqual(expectedToken, outputToken)
github html5lib / html5lib-python / tests / View on Github external
def sanitize_html(stream):
  return ''.join([token.toxml() for token in
github html5lib / html5lib-python / tests / View on Github external
def sanitize_html(self,stream):
    return ''.join([token.toxml() for token in