How to use the html5lib.treebuilders.getTreeBuilder function in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Example from github.com/tiddlyweb/tiddlyweb — file tiddlyweb/fromsvn.py (view on GitHub):
def process_tiddler(content):
    """Parse tiddler content and return the div element that wraps it.

    The content is first bracket-escaped, then parsed with html5lib
    using the BeautifulSoup tree builder; the first <div> found in the
    resulting soup (the tiddler wrapper) is returned.
    """
    escaped = _escape_brackets(content)
    builder = treebuilders.getTreeBuilder('beautifulsoup')
    soup = html5lib.HTMLParser(tree=builder).parse(escaped)
    return soup.find('div')
Example from github.com/RDFLib/rdflib — file rdflib/plugins/parsers/pyMicrodata/__init__.py (view on GitHub):
self.http_status = h.http_code
				if not rdfOutput : raise h
				return self._generate_error_graph(graph, "HTTP Error: %s (%s)" % (h.http_code,h.msg), uri=name)
			except Exception :
				# Something nasty happened:-(
				e = sys.exc_info()[1]
				self.http_status = 500
				if not rdfOutput : raise e
				return self._generate_error_graph(graph, str(e), uri=name)

			dom = None
			try :
				import warnings
				warnings.filterwarnings("ignore", category=DeprecationWarning)
				import html5lib
				parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
				dom = parser.parse(input)
				return self.graph_from_DOM(dom, graph)
			except ImportError :
				msg = "HTML5 parser not available. Try installing html5lib "
				raise ImportError(msg)
			except Exception :
				# Something nasty happened:-(
				e = sys.exc_info()[1]
				self.http_status = 400
				if not rdfOutput : raise e
				return self._generate_error_graph(graph, str(e), uri=name)

		except Exception :
			# Something nasty happened:-(
			e = sys.exc_info()[1]
			if isinstance(e, ImportError) :
Example from github.com/Ravenbrook/mps — file manual/source/make-mmref.py (view on GitHub):
def rewrite_links(src, src_base, url_filter, rewrite_base,
                  url_attributes = (('a', 'href'),)):
    """Rewrite URLs found in src and return the serialized document.

    src is parsed as HTML.  Every URL held by an attribute named in
    url_attributes is resolved against src_base and handed to the
    function url_filter; when the filter returns False, the URL is
    resolved again — this time against rewrite_base — and the result
    is written back into the document.  Finally the (possibly
    modified) document is re-serialized as HTML and returned.

    The keyword argument url_attributes is a sequence of (tag,
    attribute) pairs naming where the URLs to rewrite live.

    """
    builder = html5lib.treebuilders.getTreeBuilder('dom')
    dom = html5lib.html5parser.HTMLParser(tree = builder).parse(src)

    for tag, attr in url_attributes:
        for element in dom.getElementsByTagName(tag):
            url = element.getAttribute(attr)
            # Keep URLs that are absent or accepted by the filter.
            if not url or url_filter(urljoin(src_base, url)):
                continue
            replacement = urljoin(rewrite_base, url)
            if replacement != url:
                element.setAttribute(attr, replacement)

    walker = html5lib.treewalkers.getTreeWalker('dom')
    serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
    return u''.join(serializer.serialize(walker(dom)))
Example from github.com/RDFLib/pymicrodata — file pyMicrodata/__init__.py (view on GitHub):
self.http_status = h.http_code
				if not rdfOutput : raise h
				return self._generate_error_graph(graph, "HTTP Error: %s (%s)" % (h.http_code,h.msg), uri=name)
			except Exception :
				# Something nasty happened:-(
				e = sys.exc_info()[1]
				self.http_status = 500
				if not rdfOutput : raise e
				return self._generate_error_graph(graph, str(e), uri=name)
				
			dom = None
			try :
				import warnings
				warnings.filterwarnings("ignore", category=DeprecationWarning)
				import html5lib
				parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
				dom = parser.parse(input)
				return self.graph_from_DOM(dom, graph)
			except ImportError :
				msg = "HTML5 parser not available. Try installing html5lib "
				raise ImportError(msg)
			except Exception :
				# Something nasty happened:-(
				e = sys.exc_info()[1]
				self.http_status = 400
				if not rdfOutput : raise e
				return self._generate_error_graph(graph, str(e), uri=name)	

		except Exception :
			# Something nasty happened:-(
			e = sys.exc_info()[1]
			if isinstance(e, ImportError) :
Example from github.com/mdn/kuma — file apps/wiki/parser.py (view on GitHub):
"""
            top_level_elements = parser.parseFragment(html)
            container = Element(self.CONTAINER_TAG)

            # Why lxml couldn't just have text nodes, I'll never understand.
            # Text nodes that come other than first are automatically stuffed
            # into the tail attrs of the preceding elements by html5lib.
            if top_level_elements and isinstance(top_level_elements[0],
                                                 basestring):
                container.text = top_level_elements.pop(0)

            container.extend(top_level_elements)
            return container

        p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
        self._root = really_parse_fragment(p, html)
Example from github.com/ltucker/radarpost — file radarpost/web/api/controller.py (view on GitHub):
headers = {'Connection': 'close'}
        response, content = client.request(query, headers=headers)

        if response.status != 200: 
            return None

        ct = response.get('content-type', '')
        if ';' in ct:
            ct = ct[0:ct.find(';')]
        ct = ct.strip()
        if ct not in HTML_TYPES: 
            return None

        # parse html
        links = []
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree", etree),
                                     namespaceHTMLElements=False)
        html = parser.parse(content)
        for link in html.find("head").findall("link"):
            if link.get("rel", "").lower() == "alternate":
                linktype = link.get("type", "").lower()
                if linktype in FEED_TYPES:
                    href = link.get("href", "")
                    if href:
                        feed_item = {'url': href, 'title': link.get("title", "")}
                        links.append(feed_item)
        return links
    except:
        log.error("Error finding feed links at %s: %s" % (query, traceback.format_exc()))
        return None
    finally:
        http.close_all(client)
Example from github.com/laurentb/weboob — file weboob/tools/parsers/html5libparser.py (view on GitHub):
def __init__(self, api='etree'):
        """Initialise the parser with a tree builder for the given api.

        Looks up the preferred implementation for *api* in
        self.defaults; when none is registered, None is passed so that
        getTreeBuilder() falls back to its own default implementation.
        """
        preferred = self.defaults.get(api)
        builder = treebuilders.getTreeBuilder(api, preferred)
        HTMLParser.__init__(self, tree=builder)