How to use the html5lib.HTMLParser function in html5lib

To help you get started, we’ve selected a few html5lib examples based on popular ways it is used in public projects.

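Every example below goes through the same core API: html5lib.HTMLParser wraps a tree builder and exposes parse() for whole documents and parseFragment() for snippets of markup. A minimal sketch using the default etree tree builder:

import html5lib

# With namespaceHTMLElements=False, tags come back as plain names like
# "p" instead of "{http://www.w3.org/1999/xhtml}p".
parser = html5lib.HTMLParser(namespaceHTMLElements=False)

# html5lib always builds a well-formed tree, even from broken markup.
doc = parser.parse("<p>Hello <b>world")
print([el.tag for el in doc.iter()])
# ['html', 'head', 'body', 'p', 'b']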

github tiddlyweb / tiddlyweb / test / importer.py
import codecs

import html5lib
from html5lib import treebuilders


def import_wiki(filename='wiki', hostname='localhost', port=8080):
    f = codecs.open(filename, encoding='utf-8')
    wikitext = f.read()
    f.close()

    # Parse the TiddlyWiki file into a BeautifulSoup tree.
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup'))
    soup = parser.parse(wikitext)
    store_area = soup.find('div', id='storeArea')
    divs = store_area.findAll('div')

    # _do_recipe, _do_bag and _do_tiddler are helpers defined elsewhere
    # in importer.py.
    _do_recipe(hostname, port)
    _do_bag(hostname, port)

    for tiddler in divs:
        _do_tiddler(hostname, port, tiddler)
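Note that the 'beautifulsoup' tree builder used above targeted BeautifulSoup 3 and has been removed from current html5lib releases (only 'dom', 'etree' and 'lxml' remain). A sketch of a modern equivalent, assuming the same TiddlyWiki file layout, is to let bs4 drive html5lib instead:

from bs4 import BeautifulSoup

with open('wiki', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html5lib')  # bs4 calls into html5lib

store_area = soup.find('div', id='storeArea')
tiddler_divs = store_area.find_all('div')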
github sphinx-doc / sphinx / tests / test_build_html5.py
"""
    :copyright: Copyright 2007-2019 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import xml.etree.ElementTree as ElementTree  # cElementTree was removed in Python 3.9
from hashlib import md5

import pytest
from html5lib import getTreeBuilder, HTMLParser
from test_build_html import flat_dict, tail_check, check_xpath

from sphinx.util.docutils import is_html5_writer_available

TREE_BUILDER = getTreeBuilder('etree', implementation=ElementTree)
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)


etree_cache = {}


@pytest.mark.skipif(not is_html5_writer_available(), reason='HTML5 writer is not available')
@pytest.fixture(scope='module')
def cached_etree_parse():
    def parse(fname):
        if fname in etree_cache:
            return etree_cache[fname]
        # fname is a pathlib.Path; keep at most one parsed tree in the cache.
        with fname.open('rb') as fp:
            etree = HTML_PARSER.parse(fp)
            etree_cache.clear()
            etree_cache[fname] = etree
            return etree
    yield parse
    etree_cache.clear()
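Because HTML_PARSER is built with namespaceHTMLElements=False, the tests can query the tree with bare tag names. A small usage sketch with the parser defined above:

etree = HTML_PARSER.parse("<title>Demo</title><h1>Heading</h1>")
assert etree.findall('.//h1')[0].text == 'Heading'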
github GovReady / govready-q / guidedmodules / module_logic.py
                def rewrite_url(url, allow_dataurl=False):
                    # Hypothetical wrapper: the helper's name and signature are
                    # reconstructed from the two call sites below.
                    url = answers.task.get_static_asset_url(url, use_data_urls=use_data_urls)

                    # Check final URL.
                    import urllib.parse
                    u = urllib.parse.urlparse(url)
                    
                    # Allow data URLs in some cases.
                    if use_data_urls and allow_dataurl and u.scheme == "data":
                        return url

                    if u.scheme not in ("", "http", "https", "mailto"):
                        return "javascript:alert('Invalid link.');"
                    return url

                import html5lib, xml.etree
                dom = html5lib.HTMLParser().parseFragment(output)
                for node in dom.iter():
                    if node.get("href"):
                        node.set("href", rewrite_url(node.get("href")))
                    if node.get("src"):
                        node.set("src", rewrite_url(node.get("src"), allow_dataurl=(node.tag == "{http://www.w3.org/1999/xhtml}img")))
                output = html5lib.serialize(dom, quote_attr_values="always", omit_optional_tags=False, alphabetical_attributes=True)

                # But the p's within p's fix gives us a lot of empty p's.
                output = output.replace("<p></p>", "")

                return output

        raise ValueError("Cannot render %s to %s." % (template_format, output_format))

    else:
        raise ValueError("Invalid template format encountered: %s." % template_format)
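Stripped of the sanitizer logic, the parseFragment()/serialize() round trip above reduces to the following sketch (the URL prefix is a placeholder):

import html5lib

fragment = html5lib.parseFragment('<a href="/docs">docs</a>')
for node in fragment.iter():
    if node.get("href"):
        node.set("href", "https://example.com" + node.get("href"))

print(html5lib.serialize(fragment, quote_attr_values="always",
                         omit_optional_tags=False))
# <a href="https://example.com/docs">docs</a>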
github openembedded / bitbake / lib / bs4 / diagnose.py
import time
import traceback

from bs4 import BeautifulSoup

# 'data' is a large generated HTML document (see rdoc() earlier in diagnose.py).
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
    success = False
    try:
        a = time.time()
        soup = BeautifulSoup(data, parser)
        b = time.time()
        success = True
    except Exception as e:
        print("%s could not parse the markup." % parser)
        traceback.print_exc()
    if success:
        print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print("Raw lxml parsed the markup in %.2fs." % (b-a))

import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
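Outside bs4's diagnose module, the same measurement can be done with timeit; a minimal sketch (document size and repeat count are arbitrary):

import timeit

setup = "import html5lib; data = '<p>x</p>' * 10000"
seconds = timeit.timeit("html5lib.parse(data)", setup=setup, number=3)
print("html5lib parsed the document 3 times in %.2fs" % seconds)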
github scrapinghub / extruct / extruct / rdflibxml / __init__.py
                self.http_status = 500
                # Something nasty happened:-(
                if not rdfOutput : raise e
                err = self.options.add_error(str(e), context = name)
                self.options.processor_graph.add_http_context(err, 500)
                return copyErrors(graph, self.options)

            dom = None
            try :
                msg = ""
                parser = None
                if self.options.host_language == HostLanguage.html5 :
                    import warnings
                    warnings.filterwarnings("ignore", category=DeprecationWarning)
                    import html5lib
                    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
                    if self.charset :
                        # This means the HTTP header has provided a charset, or the
                        # file is a local file when we suppose it to be a utf-8
                        dom = parser.parse(input, encoding=self.charset)
                    else :
                        # No charset set. The html5lib parser tries to sniff the file
                        # for a meta header with the charset; if that works, fine,
                        # otherwise it falls back on window-...
                        dom = parser.parse(input)

                    try :
                        if isstring :
                            input.close()
                            input = self._get_input(name)
                        else :
                            input.seek(0)
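A self-contained sketch of the DOM tree builder used above. Note that this snippet predates html5lib 1.0, which renamed parse()'s encoding= argument to transport_encoding=; the filename here is a placeholder:

import html5lib

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))

with open("page.html", "rb") as fp:  # hypothetical local file
    dom = parser.parse(fp, transport_encoding="utf-8")

# The 'dom' builder yields an xml.dom.minidom document.
print(dom.getElementsByTagName("title"))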
github google / coursebuilder-core / coursebuilder / common / tags.py
import html5lib
from xml.etree import cElementTree


def get_components_using_html5lib(html):
    """Find lesson components using the pure python html5lib library."""

    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder('etree', cElementTree),
        namespaceHTMLElements=False)
    content = parser.parseFragment('<div>%s</div>' % html)[0]
    components = []
    for component in content.findall('.//*[@instanceid]'):
        component_dict = {'cpt_name': component.tag}
        component_dict.update(component.attrib)
        components.append(component_dict)
    return components
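For example, with one hypothetical component tag the function returns a dict per [@instanceid] match:

html = '<gcb-youtube instanceid="abc123" videoid="xyz"></gcb-youtube>'
print(get_components_using_html5lib(html))
# [{'cpt_name': 'gcb-youtube', 'instanceid': 'abc123', 'videoid': 'xyz'}]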
github w3c / html-tools / publish.py
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding='utf-8'):
            pre_anolis_buffer.write(text)

        filtered = pre_anolis_buffer

    # replace data-x with data-anolis-xref
    print "fixing xrefs"
    filtered.seek(0)

    # Parse
    builder = treebuilders.getTreeBuilder("lxml", etree)
    try:
        parser = html5lib.HTMLParser(tree=builder, namespaceHTMLElements=False)
    except TypeError:
        parser = html5lib.HTMLParser(tree=builder)
    tree = parser.parse(filtered, encoding='utf-8')

    # Move introduction above conformance requirements
    data_x = tree.findall("//*[@data-x]")
    non_alphanumeric_spaces = re.compile(r"[^a-zA-Z0-9 \-\_\/\|]+")
    for refel in data_x:
        refel.attrib["data-anolis-xref"] = refel.get("data-x")
        if refel.tag == "dfn" and not refel.get("id", False) and refel.attrib["data-anolis-xref"]:
            refel.attrib["id"] = generateID(refel.attrib["data-anolis-xref"], refel)
        del refel.attrib["data-x"]
    # utils.ids = {}

    print 'indexing'
    # filtered.seek(0)
    # tree = generator.fromFile(filtered, **opts)
    generator.process(tree, **opts)
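The try/except TypeError above papers over an html5lib API change, and the serializer has a matching one: quote_attr_values was a boolean in old releases but a string ('always', 'spec' or 'legacy') from html5lib 1.0 on. A minimal sketch of the serializer pipeline with the modern API:

import html5lib
from html5lib import treewalkers
from html5lib.serializer import HTMLSerializer

tree = html5lib.parse("<p class=x>hi")
walker = treewalkers.getTreeWalker("etree")
serializer = HTMLSerializer(quote_attr_values="always",
                            inject_meta_charset=False)
for chunk in serializer.serialize(walker(tree), encoding="utf-8"):
    print(chunk)  # the document re-serialized as utf-8 byte chunks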
github h3llrais3r / Auto-Subliminal / libpy2 / bs4 / diagnose.py
# Python 2 variant of the same bs4 benchmark; 'data' is a large
# generated HTML document (see rdoc() earlier in diagnose.py).
import time
import traceback

from bs4 import BeautifulSoup

for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
    success = False
    try:
        a = time.time()
        soup = BeautifulSoup(data, parser)
        b = time.time()
        success = True
    except Exception, e:
        print "%s could not parse the markup." % parser
        traceback.print_exc()
    if success:
        print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)

from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)

import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a)