How to use the html5lib.html5parser.HTMLParser function in html5lib

To help you get started, we’ve selected a few html5lib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github html5lib / html5lib-python / tests / test_treewalkers.py View on Github external
def test_all_tokens(self):
    """Check that every configured tree walker emits the expected tokens.

    For each entry in ``treeTypes`` the markup ``a<div>b</div>c`` is parsed
    with that entry's ``"builder"``, passed through its optional
    ``"adapter"`` (identity when absent), and walked with its ``"walker"``.
    The resulting tokens are compared pairwise against ``expected``.
    """
    expected = [
        {'data': [], 'type': 'StartTag', 'name': 'html'},
        {'data': [], 'type': 'StartTag', 'name': 'head'},
        {'data': [], 'type': 'EndTag', 'name': 'head'},
        {'data': [], 'type': 'StartTag', 'name': 'body'},
        {'data': 'a', 'type': 'Characters'},
        {'data': [], 'type': 'StartTag', 'name': 'div'},
        {'data': 'b', 'type': 'Characters'},
        {'data': [], 'type': 'EndTag', 'name': 'div'},
        {'data': 'c', 'type': 'Characters'},
        {'data': [], 'type': 'EndTag', 'name': 'body'},
        {'data': [], 'type': 'EndTag', 'name': 'html'},
    ]
    # The tree name is not needed for the comparison itself.
    for _tree_name, treeCls in treeTypes.items():
        p = html5parser.HTMLParser(tree=treeCls["builder"])
        document = p.parse("a<div>b</div>c")
        document = treeCls.get("adapter", lambda x: x)(document)
        output = treeCls["walker"](document)
        # NOTE(review): zip() stops at the shorter sequence, so a walker that
        # yields too few or too many tokens is not detected here.
        for expectedToken, outputToken in zip(expected, output):
            # assertEquals is a deprecated alias (removed in Python 3.12);
            # assertEqual is the supported spelling.
            self.assertEqual(expectedToken, outputToken)
github html5lib / html5lib-python / tests / test_sanitizer.py View on Github external
def sanitize_html(stream):
  """Parse *stream* as a fragment through the sanitizer and return its XML.

  An ``HTMLParser`` is built with ``sanitizer.HTMLSanitizer`` as its
  tokenizer, *stream* is parsed with ``parseFragment``, and the ``toxml()``
  output of each child node of the result is concatenated into one string.
  """
  parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
  fragment = parser.parseFragment(stream)
  return ''.join(node.toxml() for node in fragment.childNodes)
github html5lib / html5lib-python / tests / test_sanitizer.py View on Github external
def sanitize_html(self,stream):
    """Return the sanitized XML serialization of the fragment in *stream*.

    The fragment is parsed by an ``HTMLParser`` whose tokenizer is
    ``sanitizer.HTMLSanitizer``; each child node of the parsed fragment is
    serialized with ``toxml()`` and the pieces are joined without separator.
    """
    sanitizing_parser = html5parser.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer)
    parsed = sanitizing_parser.parseFragment(stream)
    pieces = []
    for child in parsed.childNodes:
        pieces.append(child.toxml())
    return ''.join(pieces)
github Komodo / KomodoEdit / contrib / html5lib / examples / parse.py View on Github external
# Try opening from file system
                f = open(f)
            except IOError: pass
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

    if opts.sanitize:
        tokenizer = sanitizer.HTMLSanitizer
    else:
        tokenizer = HTMLTokenizer


    p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)

    if opts.fragment:
        parseMethod = p.parseFragment
    else:
        parseMethod = p.parse

    if opts.profile:
        #XXX should import cProfile instead and use that
        import hotshot
        import hotshot.stats
        prof = hotshot.Profile('stats.prof')
        prof.runcall(parseMethod, f, encoding=encoding)
        prof.close()
        # XXX - We should use a temp file here
        stats = hotshot.stats.load('stats.prof')
        stats.strip_dirs()
github rubys / venus / favicon.py View on Github external
def favicon(page):
    """Return the favicon URL advertised by *page*, or None.

    The page is fetched and parsed into a DOM tree.  The candidate starts as
    ``/favicon.ico`` resolved against *page* and is overridden by the ``href``
    of any ``<link>`` element whose ``rel`` tokens include ``icon`` (the last
    matching link wins).  The candidate is then fetched, and it is returned
    unless the response reports a ``content-length`` of ``'0'``; in that case
    the function returns None.
    """
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))

    # Close the response once parsing is done instead of leaking the handle.
    response = urlopen(page)
    try:
        doc = parser.parse(response)
    finally:
        response.close()

    icon_url = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
            # rel is a whitespace-separated token list: split() without an
            # argument also handles tabs, newlines and repeated spaces, which
            # split(' ') did not.
            rel_tokens = link.attributes['rel'].value.lower().split()
            if 'icon' in rel_tokens:
                icon_url = urljoin(page, link.attributes['href'].value)

    # Only the headers are needed here; release the connection afterwards.
    response = urlopen(icon_url)
    try:
        content_length = response.info()['content-length']
    finally:
        response.close()

    if content_length != '0':
        return icon_url
    return None
github html5lib / html5lib-python / parse.py View on Github external
f = open(f, "rb")
            except IOError as e:                
                sys.stderr.write("Unable to open file: %s\n" % e)
                sys.exit(1)
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

    if opts.sanitize:
        tokenizer = sanitizer.HTMLSanitizer
    else:
        tokenizer = HTMLTokenizer

    p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer, debug=opts.log)

    if opts.fragment:
        parseMethod = p.parseFragment
    else:
        parseMethod = p.parse

    if opts.profile:
        import cProfile
        import pstats
        cProfile.runctx("run(parseMethod, f, encoding)", None,
                        {"run": run,
                         "parseMethod": parseMethod,
                         "f": f,
                         "encoding": encoding},
                        "stats.prof")
        # XXX - We should use a temp file here
github pypa / warehouse / warehouse / filters.py View on Github external
def camoify(ctx, value):
    """Rewrite the ``src`` of every ``<img>`` in *value* via ``camo_url``.

    The request is taken from ``ctx["request"]`` when present and truthy,
    otherwise from ``get_current_request()``.  *value* is parsed into a DOM
    tree, each ``img`` element with a non-empty ``src`` has that attribute
    replaced by ``request.camo_url(src)``, and the tree is serialized back
    to an HTML string, which is returned.
    """
    request = ctx.get("request") or get_current_request()

    # Build a DOM tree from the rendered markup.
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    dom = html5lib.html5parser.HTMLParser(tree=dom_builder).parse(value)

    for img in dom.getElementsByTagName("img"):
        original_src = img.getAttribute("src")
        if not original_src:
            # Images without a usable src are left untouched.
            continue
        img.setAttribute("src", request.camo_url(original_src))

    # Walk the modified tree and serialize it back to markup.
    walk = html5lib.treewalkers.getTreeWalker("dom")
    serializer = html5lib.serializer.HTMLSerializer()
    return "".join(serializer.serialize(walk(dom)))
github w3c / html-tools / publish.py View on Github external
'w3c_compat': True,
      'w3c_compat_xref_a_placement': False,
      'w3c_compat_xref_elements': False,
      'w3c_compat_xref_normalization': False,
    }
    if "anolis" in conf:
        opts.update(conf["anolis"])

    if spec == "srcset":
        print 'munging (before anolis)'

        filtered.seek(0)
        pre_anolis_buffer = StringIO()

        # Parse
        parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
        tree = parser.parse(filtered, encoding='utf-8')

        # Move introduction above conformance requirements
        introduction = tree.findall("//*[@id='introduction']")[0]
        intro_ps = introduction.xpath("following-sibling::*")
        target = tree.findall("//*[@id='conformance-requirements']")[0]
        target.addprevious(introduction)
        target = introduction
        target.addnext(intro_ps[2])
        target.addnext(intro_ps[1])
        target.addnext(intro_ps[0])

        # Serialize
        tokens = html5lib.treewalkers.getTreeWalker('lxml')(tree)
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding='utf-8'):
github rubys / venus / planet / reconstitute.py View on Github external
if isinstance(detail.value,unicode):
        detail.value=detail.value.encode('utf-8')

    if not detail.has_key('type') or detail.type.lower().find('html')<0:
        detail['value'] = escape(detail.value)
        detail['type'] = 'text/html'

    if detail.type.find('xhtml')>=0 and not bozo:
        try:
            data = minidom.parseString(xdiv % detail.value).documentElement
            xcontent.setAttribute('type', 'xhtml')
        except:
            bozo=1

    if detail.type.find('xhtml')<0 or bozo:
        parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
        html = parser.parse(xdiv % detail.value, encoding="utf-8")
        for body in html.documentElement.childNodes:
            if body.nodeType != Node.ELEMENT_NODE: continue
            if body.nodeName != 'body': continue
            for div in body.childNodes:
                if div.nodeType != Node.ELEMENT_NODE: continue
                if div.nodeName != 'div': continue
                try:
                    div.normalize()
                    if len(div.childNodes) == 1 and \
                        div.firstChild.nodeType == Node.TEXT_NODE:
                        data = div.firstChild
                        if illegal_xml_chars.search(data.data):
                            data = xdoc.createTextNode(
                                illegal_xml_chars.sub(invalidate, data.data))
                    else:
github divio / djangocms-text-ckeditor / djangocms_text_ckeditor / html.py View on Github external
def extract_images(data, plugin):
    """
    extracts base64 encoded images from drag and drop actions in browser and saves
    those images as plugins
    """
    if not settings.TEXT_SAVE_IMAGE_FUNCTION:
        return data
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree=tree_builder)
    dom = parser.parse(data)
    found = False
    for img in dom.getElementsByTagName('img'):
        src = img.getAttribute('src')
        if not src.startswith('data:'):
            # nothing to do
            continue
        width = img.getAttribute('width')
        height = img.getAttribute('height')
        # extract the image data
        data_re = re.compile(r'data:(?P<mime_type>[^"]*);(?P<encoding>[^"]*),(?P<data>[^"]*)')
        m = data_re.search(src)
        dr = m.groupdict()
        mime_type = dr['mime_type']
        image_data = dr['data']
        if mime_type.find(';'):