How to use the html5lib.treebuilders module in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github tiddlyweb / tiddlywebwiki / test / test_tiddlywiki.py View on Github external
def _parse(content):
    """Parse ``content`` as a fragment and return the result of ``find('div')``.

    The fragment is parsed with html5lib's liberal XML parser, using the
    ``'beautifulsoup'`` tree builder, and the tree is then searched for a
    ``div`` element.
    """
    soup_builder = treebuilders.getTreeBuilder('beautifulsoup')
    xml_parser = html5lib.liberalxmlparser.XMLParser(tree=soup_builder)
    return xml_parser.parseFragment(content).find('div')
github operasoftware / oex2nex / oex2nex / convertor.py View on Github external
def _shim_wrap(self, html, file_type="index", prefs=None):
        """
        Applies certain corrections to the HTML source passed to this method.
        Specifically adds the relevant shim script, wraps all script text
        within opera.isReady() methods etc. """

        htmlparser = html5lib.HTMLParser(
                tree=html5lib.treebuilders.getTreeBuilder("dom"))
        domwalker = html5lib.treewalkers.getTreeWalker("dom")
        serializer = html5lib.serializer.HTMLSerializer(
                omit_optional_tags=False, quote_attr_values=True,
                strip_whitespace=True, use_trailing_solidus=True)
        doc = htmlparser.parse(html, "utf-8")
        inlinescrdata = ""
        nex = self._nex
        # FIXME: use the correct base for the @src (mostly this is the root
        # [''])
        # Remove scripts only if we are merging all of them

        def add_dom_prefs(doc, prefs):
            """ Add an external script with the data taken from preference
            elements in config.xml. Returns a tuple of doc, prefs script and
            script src"""
            if isinstance(prefs, dict):
github chagel / CNPROG / utils / html.py View on Github external
def sanitize_html(html):
    """Sanitizes an HTML fragment.

    The fragment is parsed into a DOM tree with ``HTMLSanitizer`` as the
    tokenizer, walked with the DOM tree walker, and serialized again with
    optional tags kept and attribute values quoted. The serialized pieces
    are joined into a single unicode string, which is returned.
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    sanitizing_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                            tree=dom_builder)
    fragment = sanitizing_parser.parseFragment(html)

    # Turn the parsed DOM fragment back into a stream for the serializer.
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(token_stream))
github CloudBotIRC / CloudBot / lib / bs4 / builder / _html5lib.py View on Github external
def __init__(self, element, soup, namespace):
        """Wrap ``element`` as an html5lib tree-builder node.

        The html5lib ``Node`` base is initialised with ``element.name``;
        the element, the owning ``soup`` and the ``namespace`` are then
        stored on the instance.
        """
        node_base = html5lib.treebuilders._base.Node
        node_base.__init__(self, element.name)
        self.element, self.soup, self.namespace = element, soup, namespace
github marreta27 / jsm / jsm / util.py View on Github external
def html_parser(html):
    """Parse ``html`` into a soup, falling back to html5lib on failure.

    ``BeautifulSoup(html)`` is tried first. If it raises an ordinary
    exception, the input is parsed again with html5lib's ``HTMLParser``
    using the ``"beautifulsoup"`` tree builder.

    :param html: the markup to parse.
    :returns: the parsed soup from whichever parser succeeded.
    :raises: whatever the html5lib fallback raises if it also fails.
    """
    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Deliberately not a bare ``except:``: KeyboardInterrupt and
        # SystemExit must propagate instead of triggering a silent re-parse.
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
github mdn / kuma / kuma / wiki / content.py View on Github external
def __init__(self, src=None, is_full_document=False):
        """Set up the html5lib parser, tree walker and serializer defaults.

        NOTE(review): ``src`` and ``is_full_document`` are not read by the
        lines of this initialiser shown here -- presumably they are consumed
        elsewhere; confirm against the full source.
        """
        # Parsing side: an "etree" tree builder feeding a parser that does
        # not namespace HTML elements.
        etree_builder = html5lib.treebuilders.getTreeBuilder("etree")
        self.tree = etree_builder
        self.parser = html5lib.HTMLParser(tree=etree_builder,
                                          namespaceHTMLElements=False)

        # Serializing side: no serializer is built yet, only the default
        # options are recorded.
        default_options = {
            'omit_optional_tags': False,
            'quote_attr_values': 'always',
            'escape_lt_in_attrs': True,
            'alphabetical_attributes': True,
        }
        self._serializer = None
        self._default_serializer_options = default_options
        self._serializer_options = None
        self.walker = html5lib.treewalkers.getTreeWalker("etree")

        # Start with empty source text and no parsed document.
        self.src = ''
        self.doc = None
github tantalor / emend / app / emend / html.py View on Github external
def strip_tags(html):
  """Render ``html`` through a parser that uses ``StripTags`` as tokenizer.

  Falsy input (``None``, ``""``) is not parsed and yields ``None``.
  Otherwise the fragment is parsed into a DOM tree, walked, and rendered
  with a default ``HTMLSerializer``.
  """
  if not html:
    # Nothing to parse; mirror the implicit "no result" for empty input.
    return None

  dom_builder = treebuilders.getTreeBuilder("dom")
  stripping_parser = html5lib.HTMLParser(tree=dom_builder, tokenizer=StripTags)
  fragment = stripping_parser.parseFragment(html)
  token_stream = treewalkers.getTreeWalker("dom")(fragment)
  return HTMLSerializer().render(token_stream)
github xhtml2pdf / xhtml2pdf / xhtml2pdf / parser.py View on Github external
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    NOTE(review): the lines shown here only cover resetting the CSS
    attribute cache, choosing the html5lib parser and encoding a text
    ``src``; the remaining steps listed above are presumably implemented
    further on -- confirm against the full source.
    """

    # Reset the module-level CSS attribute cache on every call.
    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        # TODO: XHTMLParser doesn't seem to exist...
        # NOTE(review): if html5lib has no XHTMLParser attribute, this branch
        # fails whenever xhtml is true -- confirm.
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # Text input is encoded to bytes and wrapped in a pisaTempFile; any other
    # kind of ``src`` is left untouched by this step.
    if isinstance(src, six.text_type):
        # If an encoding was provided, do not change it.
        if not encoding:
            encoding = "utf-8"
        src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)
    # # Test for the restrictions of html5lib
    # if encoding:
    #     # Workaround for html5lib<0.11.1
    #     if hasattr(inputstream, "isValidEncoding"):
    #         if encoding.strip().lower() == "utf8":
    #             encoding = "utf-8"
github xhtml2pdf / xhtml2pdf / xhtml2pdf / parser.py View on Github external
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    NOTE(review): the lines shown here only cover resetting the CSS
    attribute cache, choosing the html5lib parser and encoding a text
    ``src``; the remaining steps listed above are presumably implemented
    further on -- confirm against the full source.
    """

    # Reset the module-level CSS attribute cache on every call.
    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        # TODO: XHTMLParser doesn't seem to exist...
        # NOTE(review): if html5lib has no XHTMLParser attribute, this branch
        # fails whenever xhtml is true -- confirm.
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # Text input is encoded to bytes and wrapped in a pisaTempFile; any other
    # kind of ``src`` is left untouched by this step.
    if isinstance(src, six.text_type):
        # If an encoding was provided, do not change it.
        if not encoding:
            encoding = "utf-8"
        src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # # Test for the restrictions of html5lib
    # if encoding:
    #     # Workaround for html5lib<0.11.1
    #     if hasattr(inputstream, "isValidEncoding"):
    #         if encoding.strip().lower() == "utf8":
    #             encoding = "utf-8"
    #         if not inputstream.isValidEncoding(encoding):
    #             log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)