How to use the html5lib.parseFragment function in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Example from github.com/hasgeek/coaster — coaster/utils/text.py (view on GitHub)
def text_blocks(html_text, skip_pre=True):
    """
    Extracts a list of paragraphs from a given HTML string
    """
    doc = html5lib.parseFragment(html_text)
    blocks = []

    def subloop(parent_tag, element, lastchild=False):
        if callable(
            element.tag
        ):  # Comments have a callable tag. TODO: Find out, anything else?
            tag = ''
            text = ''
            tail = element.tail or u''
        else:
            tag = element.tag.split('}')[
                -1
            ]  # Extract tag from namespace: {http://www.w3.org/1999/xhtml}html
            text = element.text or u''
            tail = element.tail or u''
Example from github.com/apache/allura — Allura/allura/lib/markdown_extensions.py (view on GitHub)
def run(self, text):
        """Parse *text* as an HTML fragment and re-serialize it through our
        custom sanitizer filter, returning the sanitized HTML string.

        Equivalent to ``html5lib.serialize(parsed, sanitize=True)`` except
        that ForgeHTMLSanitizerFilter is used in place of html5lib's stock
        sanitizer filter.
        """
        fragment = html5lib.parseFragment(text)

        # Walk the parsed tree, pass the token stream through our custom
        # sanitizer, and render the filtered stream back to HTML.
        tree_walker = html5lib.treewalkers.getTreeWalker("etree")
        stream = ForgeHTMLSanitizerFilter(tree_walker(fragment))
        serializer = html5lib.serializer.HTMLSerializer()
        return serializer.render(stream)
Example from github.com/despawnerer/ankle — ankle/find.py (view on GitHub)
def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that
    match given skeleton.

    See `find_all` for details.
    """
    # Accept either pre-parsed trees or raw HTML strings for both arguments.
    if is_string(document):
        document = html5lib.parse(document)
    if is_string(skeleton):
        parsed = html5lib.parseFragment(skeleton)
        if len(parsed) != 1:
            raise ValueError("Skeleton must have exactly one root element.")
        skeleton = parsed[0]

    # Lazily walk every element in the document, yielding the matches.
    for candidate in document.iter():
        if node_matches_bone(candidate, skeleton):
            yield candidate
Example from github.com/norbusan/calibre-debian — src/calibre/ebooks/metadata/sources/amazon.py (view on GitHub)
def parse_comments(self, root, raw):
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote
        ans = ''
        ns = tuple(self.selector('#bookDescription_feature_div noscript'))
        if ns:
            ns = ns[0]
            if len(ns) == 0 and ns.text:
                import html5lib
                # html5lib parsed noscript as CDATA
                ns = html5lib.parseFragment(
                    '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
            else:
                ns.tag = 'div'
            ans = self._render_comments(ns)
        else:
            desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            if desc:
                ans = self._render_comments(desc[0])

        desc = root.xpath(
            '//div[@id="productDescription"]/*[@class="content"]')
        if desc:
            ans += self._render_comments(desc[0])
        else:
            # Idiot chickens from amazon strike again. This data is now stored
            # in a JS variable inside a script tag URL encoded.
Example from github.com/mozilla/addons-server — src/olympia/amo/utils.py (view on GitHub)
tree.text = tree.text.rstrip('\n')

            # Remove the first new line after a block level element.
            if tree.tail and tree.tail.startswith('\n'):
                tree.tail = tree.tail[1:]

        for child in tree:  # Recurse down the tree.
            if tree.tag in html_blocks:
                # Strip new lines directly inside block level elements: remove
                # the last new lines from the children's tails.
                if child.tail:
                    child.tail = child.tail.rstrip('\n')
            parse_html(child)
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the parsed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker('etree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    return serializer.render(stream)
Example from github.com/kovidgoyal/calibre — src/calibre/ebooks/metadata/sources/amazon.py (view on GitHub)
def parse_comments(self, root, raw):
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote
        ans = ''
        ns = tuple(self.selector('#bookDescription_feature_div noscript'))
        if ns:
            ns = ns[0]
            if len(ns) == 0 and ns.text:
                import html5lib
                # html5lib parsed noscript as CDATA
                ns = html5lib.parseFragment(
                    '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
            else:
                ns.tag = 'div'
            ans = self._render_comments(ns)
        else:
            desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            if desc:
                ans = self._render_comments(desc[0])

        desc = root.xpath(
            '//div[@id="productDescription"]/*[@class="content"]')
        if desc:
            ans += self._render_comments(desc[0])
        else:
            # Idiot chickens from amazon strike again. This data is now stored
            # in a JS variable inside a script tag URL encoded.
Example from github.com/nanaze/jsdoctor — generator.py (view on GitHub)
def _ProcessString(content):
  """Linkify web URLs in *content* and parse the result as a DOM fragment."""
  linkified = linkify.LinkifyWebUrls(content)
  fragment = html5lib.parseFragment(linkified, treebuilder='dom')
  return fragment
Example from github.com/mozilla/addons-server — apps/translations/utils.py (view on GitHub)
def truncate(html, length, killwords=False, end='...'):
    """
    Return a slice of ``html`` <= length chars.

    killwords and end are currently ignored.

    ONLY USE FOR KNOWN-SAFE HTML.
    """
    # NOTE: the scraped version of this code had the ``<=`` operator mangled
    # into the HTML entity ``&lt;=`` (in the docstring and, fatally, in the
    # comparison below, which was a SyntaxError); restored here.
    tree = html5lib.parseFragment(html)
    if text_length(tree) <= length:
        # Short enough already: return the input untouched, marked safe.
        return jinja2.Markup(html)
    else:
        # Get a truncated version of the tree.
        short, _ = trim(tree, length, killwords, end)

        # Serialize the parsed tree back to html.
        walker = html5lib.treewalkers.getTreeWalker('etree')
        stream = walker(short)
        serializer = html5lib.serializer.htmlserializer.HTMLSerializer(
            quote_attr_values=True, omit_optional_tags=False)
        return jinja2.Markup(force_unicode(serializer.render(stream)))
Example from github.com/mitsuhiko/bf3-aggregator — bf3.py (view on GitHub)
node.remove(div_children[0])

            for key in node.attrib.keys():
                if key.startswith('xmlns:') or \
                   key in ('id', 'class', 'style'):
                    del node.attrib[key]

            for child in node.getchildren():
                new_child = transform(child)
                if new_child != child:
                    if new_child is not None:
                        child.addnext(new_child)
                    node.remove(child)
            return node

        prefix, root_node = html5lib.parseFragment(text, treebuilder='lxml')
        node = transform(root_node)
        node.attrib.clear()
        node.tail = None

        walker = html5lib.treewalkers.getTreeWalker('lxml')
        stream = walker(node)
        serializer = html5lib.serializer.htmlserializer.HTMLSerializer(omit_optional_tags=True)
        output_generator = serializer.serialize(stream)
        return u''.join(output_generator)