How to use the html5lib.parse function in html5lib

To help you get started, we've selected a few examples of html5lib.parse, drawn from the ways it is commonly used in public projects.
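
Before the project snippets, here is a minimal, self-contained sketch of calling html5lib.parse on a string; the markup is just a placeholder. By default the result is an xml.etree.ElementTree element whose tags carry the XHTML namespace; passing namespaceHTMLElements=False gives plain tag names, which keeps find()/findall() queries short.

import html5lib

# Default parse: an ElementTree element with namespaced tags such as
# "{http://www.w3.org/1999/xhtml}p".
document = html5lib.parse("<p>Hello <b>world</b>!</p>")

# Plain tag names make element queries easier to write.
document = html5lib.parse("<p>Hello <b>world</b>!</p>",
                          namespaceHTMLElements=False)
print(document.find(".//b").text)  # prints "world"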


github web-platform-tests/wpt/tools/manifest/sourcefile.py
    parsers = {"html":lambda x:html5lib.parse(x, treebuilder="etree"),
               "xhtml":lambda x:ElementTree.parse(x, XMLParser.XMLParser()),
github bitextor/bitextor/bitextor-deferred-reconstructor.py
                            reconstructedsentence.append(tail[int(wordstandofflimits[0]):int(wordstandofflimits[1])+1])
                        else:
                            reconstructedsentence[-1] = reconstructedsentence[-1] + tail[int(wordstandofflimits[0]):int(wordstandofflimits[1])+1]
                        break
    return " ".join(reconstructedsentence)



#Argument input: path of original Bitextor formatted crawl file
document_standoff = dict()
with open(sys.argv[1],'r') as reader:
    for line in reader:
        fields=line.split('\t')
        fields = list(map(str.strip, fields)) #Strip all elements
        #We use lxml treebuilder because of getelementpath function and iteration through elements
        document_standoff[fields[1]] = html5lib.parse(base64.b64decode(fields[0]),treebuilder="lxml",namespaceHTMLElements=False) #Store url:html5lib_tree for easy path search

#Input: Bitextor DOCALG file (deferred):
#url1 url2 deferred_clean_text1_in_base64 deferred_clean_text2_in_base64

#Output: Bitextor DOCALG file reconstructed:
#url1 url2 clean_text1_in_base64 clean_text2_in_base64
for line in sys.stdin:
    fields = line.split('\t')
    newfields = [fields[0],fields[1]]
    for annotation,url in {fields[2]:fields[0],fields[3]:fields[1]}.items(): #SL and TL annotations with URLs from input DOCALG file format: https://github.com/bitextor/bitextor/wiki/Intermediate-formats-used-in-Bitextor#docalg
        if annotation != "":
            newfields.append(get_sentence(annotation,document_standoff[url]))
        else:
            newfields.append("")
    print("\t".join(newfields))
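
The script above keeps one lxml tree per URL so the standoff annotations can be resolved back to text. Below is a reduced sketch of just the parsing step, with a made-up record standing in for a real crawl field (it assumes lxml is installed):

import base64
import html5lib

record = base64.b64encode(b"<p>Hola mundo</p>")  # stand-in for one crawl field

# The "lxml" tree builder returns an lxml.etree._ElementTree, which is why
# the script above can use XPath and lxml-only helpers such as getelementpath.
tree = html5lib.parse(base64.b64decode(record),
                      treebuilder="lxml",
                      namespaceHTMLElements=False)
print(tree.xpath("//p/text()"))  # ['Hola mundo']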
github fated/calibre_amazon_cn/worker.py
                msg = 'Amazon timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                self.log.exception(msg)
            return

        oraw = raw
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r'%self.url)
            return

        try:
            root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                    namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse amazon details page: %r'%self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse amazon details page: %r'%self.url
            msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(oraw, root)
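The worker above wraps the parse in a try/except and logs instead of raising when a details page cannot be handled. Here is a stripped-down sketch of that defensive pattern, using a placeholder URL and a standard-library logger rather than calibre's logging and clean_ascii_chars helpers:

import logging
import html5lib

log = logging.getLogger(__name__)

def parse_details_page(raw, url="https://example.com/product"):
    try:
        return html5lib.parse(raw, treebuilder="lxml",
                              namespaceHTMLElements=False)
    except Exception:
        # Mirror the snippet: log the failure and signal it to the caller
        # instead of letting the exception propagate.
        log.exception("Failed to parse details page: %r", url)
        return None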
github dstufft/pypi-show-urls/pypi_show_urls/__main__.py
    session = requests.session()
    session.verify = False

    for package in packages:
        print("")
        print("Download candidates for %s" % package)
        print("========================" + ("=" * len(package)))

        # Grab the page from PyPI
        url = "https://pypi.python.org/simple/%s/" % package
        resp = session.get(url)
        if resp.status_code == 404:
            continue
        resp.raise_for_status()

        html = html5lib.parse(resp.content, namespaceHTMLElements=False)

        spider = set()
        installable_ = set()

        for link in itertools.chain(
                            html.findall(".//a[@rel='download']"),
                            html.findall(".//a[@rel='homepage']")):
            if "href" in link.attrib:
                try:
                    absolute_link = urlparse.urljoin(url, link.attrib["href"])
                except Exception:
                    continue

                if not installable(package, absolute_link):
                    parsed = urlparse.urlparse(absolute_link)
                    if parsed.scheme.lower() in ["http", "https"]:
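
The loop above filters the PyPI simple-index links by their rel attribute and resolves them against the page URL. A condensed sketch of the same lookup, using Python 3's urllib.parse in place of the urlparse module and a hypothetical download_links helper:

import urllib.parse
import html5lib

def download_links(page_url, page_bytes):
    # Plain tag names (namespaceHTMLElements=False) keep the ElementPath
    # attribute predicates below short.
    html = html5lib.parse(page_bytes, namespaceHTMLElements=False)
    for link in (html.findall(".//a[@rel='download']")
                 + html.findall(".//a[@rel='homepage']")):
        href = link.attrib.get("href")
        if href:
            yield urllib.parse.urljoin(page_url, href)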
github mattaustin/fremantleline/fremantleline/api/__init__.py
def _get_html(self):
        url_opener = URLOpener()
        response = url_opener.open(self.url)
        if lxml:
            html = lxml.html.parse(response).getroot()
        else:
            html = html5lib.parse(response, namespaceHTMLElements=False)
        return html
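
The method above prefers lxml when it is importable and falls back to html5lib otherwise; both parsers accept the file-like response object directly. A minimal sketch of that optional-dependency pattern, without the project's URLOpener:

import html5lib

try:
    import lxml.html
except ImportError:
    lxml = None

def get_html(response):
    # Use the faster lxml parser when it is installed, otherwise fall back
    # to the pure-Python html5lib parser; both accept file-like objects.
    if lxml is not None:
        return lxml.html.parse(response).getroot()
    return html5lib.parse(response, namespaceHTMLElements=False)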
github FrancescoCeruti/linux-show-player/scripts/Flatpak/pipenv_flatpak.py
def fetch(package):
        print("   Download candidates for {}".format(package["name"]))

        # GET the page from the mirror
        url = url_template.format(package["name"])
        resp = session.get(url)

        if resp.status_code != 200:
            print(
                "   Cannot fetch candidates: error {}".format(resp.status_code)
            )

        # Parse HTML content
        html = html5lib.parse(resp.content, namespaceHTMLElements=False)

        # Iterate all the provided downloads
        for link in html.findall(".//a"):
            package["candidates"].append((link.text, link.attrib["href"]))
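
The loop above collects every anchor's text and href. A small guarded variant of the same walk that skips anchors without an href attribute (iter_candidates is a hypothetical helper name):

import html5lib

def iter_candidates(page_bytes):
    html = html5lib.parse(page_bytes, namespaceHTMLElements=False)
    for link in html.findall(".//a"):
        href = link.attrib.get("href")
        if href is not None:
            # Yield (link text, target) pairs, skipping anchors with no href.
            yield link.text, href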
github localwiki/localwiki-backend-server/sapling/versionutils/diff/daisydiff/daisydiff.py
def extract_table_row(html):
    doc = html5lib.parse(html)
    return find_element_by_tag('tr', doc)
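
extract_table_row calls html5lib.parse with its defaults, so the resulting tree uses namespaced tags; find_element_by_tag is the project's own helper and is not reproduced here. A short sketch of what searching such a tree looks like:

import html5lib

doc = html5lib.parse("<table><tr><td>cell</td></tr></table>")

# With the default namespaceHTMLElements=True every tag carries the XHTML
# namespace, so element queries have to spell it out.
XHTML = "{http://www.w3.org/1999/xhtml}"
row = doc.find(".//" + XHTML + "tr")
print(row is not None)  # True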
github despawnerer/ankle/ankle/find.py
def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that
    match given skeleton.

    See `find_all` for details.
    """
    if is_string(document):
        document = html5lib.parse(document)
    if is_string(skeleton):
        fragment = html5lib.parseFragment(skeleton)
        if len(fragment) != 1:
            raise ValueError("Skeleton must have exactly one root element.")
        skeleton = fragment[0]

    for element in document.iter():
        if node_matches_bone(element, skeleton):
            yield element
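
find_iter parses the whole document with html5lib.parse and the search skeleton with html5lib.parseFragment. A small sketch of the difference: with the default etree tree builder, parseFragment returns a container element whose children are the fragment's top-level nodes, which is why the length check above works.

import html5lib

fragment = html5lib.parseFragment("<li>one</li><li>two</li>",
                                  namespaceHTMLElements=False)
print(len(fragment))                      # 2 top-level elements
print([child.tag for child in fragment])  # ['li', 'li']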
github python/pyperformance/pyperformance/benchmarks/bm_html5lib.py
def bench_html5lib(html_file):
    html_file.seek(0)
    html5lib.parse(html_file)
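
The benchmark rewinds and re-parses the same open file on each run. A minimal sketch of feeding html5lib.parse a file object, using io.BytesIO in place of a real file:

import io
import html5lib

html_file = io.BytesIO(b"<html><body><p>benchmark input</p></body></html>")

html_file.seek(0)
tree = html5lib.parse(html_file)  # html5lib reads file objects directly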