How to use the lxml.html module in lxml

To help you get started, we’ve selected a few lxml examples based on popular ways it is used in public projects.

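Before diving into the project snippets, here is a minimal, self-contained sketch of the core lxml.html workflow (the markup is invented for illustration): parse a string with html.fromstring() and query the resulting element tree with XPath.

from lxml import html

# Parse an HTML string (bytes or text) into an element tree.
doc = html.fromstring('<html><body><p class="lead">Hello <a href="/docs">docs</a></p></body></html>')

# Query the tree with XPath and pull out attribute values or text.
print(doc.xpath('//p[@class="lead"]/a/@href'))   # ['/docs']
print(doc.text_content().strip())                # 'Hello docs'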

GitHub: infinite-Joy / stock-analysis / tests / test_get_stock_data.py
from lxml import html
import requests
import time

from get_stock_data import _get_company_primary_stats as get_company_primary_stats
from company_page import CompanyPage

stock_company = "NTPC"
page = requests.get('http://money.rediff.com/%s' % stock_company)
tree = html.fromstring(page.text)
company = CompanyPage(tree)

def test_get_company_primary_stats():
    primary_stats = get_company_primary_stats(company, tree)
    assert primary_stats.get('pe_ratio') > 0
    assert all([
        primary_stats.get('eps') > 0,
        primary_stats.get('price_of_stock') > 0,
        primary_stats.get('fifty_two_wk_high') > 0,
        primary_stats.get('fifty_two_wk_low') > 0,
    ])
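
The pattern above, fetch a page once with requests, parse it once with html.fromstring(), and hand the tree to helpers that the tests exercise, can be sketched without the network call. The helper name and markup below are hypothetical stand-ins, not the project's real code.

from lxml import html

def get_pe_ratio(tree):
    # Hypothetical helper: pull one numeric stat out of a parsed page.
    return float(tree.xpath('//td[@id="pe_ratio"]/text()')[0])

# In the real test the markup comes from requests.get(...).text.
tree = html.fromstring('<table><tr><td id="pe_ratio">12.5</td></tr></table>')

def test_get_pe_ratio():
    assert get_pe_ratio(tree) > 0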
GitHub: Netflix / sketchy / sketchy / controllers / tasks.py
        pid.kill()
        stdout, stderr = pid.communicate()
        app.logger.error('PhantomJS Static Capture timeout')
        raise Exception('PhantomJS Static Capture timeout')

    # If the subprocess has an error, raise an exception
    if stderr or stdout:
        raise Exception(stderr)

    # Strip tags and parse out all text
    ignore_tags = ('script', 'noscript', 'style')
    with open(content_to_parse, 'r') as content_file:
        content = content_file.read()
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    doc = LH.fromstring(content)
    output = ""
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        text = elt.text or ''
        tail = elt.tail or ''
        wordz = " ".join((text, tail)).strip('\t')
        if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
            output += wordz.encode('utf-8')
   
    # Since the filename format is different for static captures, update the filename
    # This will ensure the URLs are pointing to the correct resources
    if model == 'static':
        capture_name = capture_name.split('.')[0]
        
    # Write our html text that was parsed into our capture folder
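
The snippet above sanitizes markup with lxml.html.clean.Cleaner and then walks the tree with iterdescendants() to collect visible text. A minimal, runnable sketch of that idea follows; the markup is invented, and note that in recent lxml releases the cleaner lives in the separate lxml_html_clean package.

from lxml import html
from lxml.html import clean

raw = ('<html><head><style>p {color: red}</style></head>'
       '<body><p>Hello <b>world</b></p><script>alert(1)</script></body></html>')

cleaner = clean.Cleaner()            # strips scripts and other unsafe markup by default
doc = html.fromstring(cleaner.clean_html(raw))

ignore_tags = ('script', 'noscript', 'style')
parts = []
for elt in doc.iterdescendants():
    if elt.tag in ignore_tags:
        continue
    text = elt.text or ''
    tail = elt.tail or ''
    words = ' '.join((text, tail)).strip()
    if words:
        parts.append(words)

print(' '.join(parts))               # 'Hello world'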
GitHub: geekcomputers / Python / xkcd_downloader.py
def main():
    # opens xkcd.com
    try:
        page = requests.get("https://www.xkcd.com")
    except requests.exceptions.RequestException as e:
        print(e)
        exit()

    # parses xkcd.com page
    tree = html.fromstring(page.content)

    # finds image src url
    image_src = tree.xpath(".//*[@id='comic']/img/@src")[0]
    image_src = "https:" + str(image_src)

    # gets comic name from the image src url
    comic_name = image_src.split('/')[-1]

    # save location of comic
    comic_location = os.getcwd() + '/comics/'

    # checks if save location exists else creates
    if not os.path.exists(comic_location):
        os.makedirs(comic_location)

    # creates final comic location including name of the comic
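
The heart of the downloader above is a single XPath call that returns an attribute value rather than an element. A tiny sketch of that extraction, with made-up markup standing in for the live xkcd page:

from lxml import html

page = ('<html><body><div id="comic">'
        '<img src="//imgs.xkcd.com/comics/example.png" alt="Example"/>'
        '</div></body></html>')

tree = html.fromstring(page)
# The @src step makes xpath() return the attribute string itself.
image_src = "https:" + tree.xpath(".//*[@id='comic']/img/@src")[0]
comic_name = image_src.split('/')[-1]

print(image_src)    # https://imgs.xkcd.com/comics/example.png
print(comic_name)   # example.png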
GitHub: Test-BMOHB / Media-Monitoring / pyScrape_Fintrac.py
def scrapeInfo(mainContent, mainXPath, paraXPath):
    li = []
    mainLinksXPath = mainContent.xpath(mainXPath)
##  Create a set from mainLinksXPath to remove duplicates, then convert the set back to a list
    mainLinksXPath = list(set(mainLinksXPath))
##  Loop through elements in mainLinksXPath
    for mainLinksElements in mainLinksXPath:
##  Translate the element to a string and then parse it back into HTML
        link = tostring(mainLinksElements)
        link = html.fromstring(link)
##  Use xpath to get all anchor tags in HTML element
        link = link.xpath('//a')
##  Loop through each element in the xpath
##  This will loop through all anchor tags
        for i in link:
##  Get the href parameter from the anchor tags
            i = i.get('href')
            if 'http' not in i:
                i = 'http://www.fintrac-canafe.gc.ca' + i
##  Do a HTTP request on the article link
            linkRequest = requests.get(i)
            writeToLog("Gathering Names from: " + i + "\n")
##  Translate the content from the request to HTML
            linkContent = html.fromstring(linkRequest.content)
##  Find the paraXpath in the article
            linkXPath = linkContent.xpath(paraXPath)
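
The scraper above serializes each matched element with tostring(), re-parses it with html.fromstring(), and resolves relative hrefs against the site root. A condensed, self-contained sketch of that round trip, with invented markup:

from lxml import html
from lxml.html import tostring

doc = html.fromstring('<html><body><ul>'
                      '<li><a href="/eng/notice.html">Notice</a></li>'
                      '<li><a href="http://example.com/page">External</a></li>'
                      '</ul></body></html>')

links = []
for element in doc.xpath('//li'):
    # Serialize the element and parse it back, as the snippet above does.
    fragment = html.fromstring(tostring(element))
    for a in fragment.xpath('//a'):
        href = a.get('href')
        if 'http' not in href:
            href = 'http://www.fintrac-canafe.gc.ca' + href
        links.append(href)

print(links)   # ['http://www.fintrac-canafe.gc.ca/eng/notice.html', 'http://example.com/page']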
GitHub: openstates / openstates / openstates / sd / __init__.py
def get_session_list(self):
        html = (
            scrapelib.Scraper()
            .get("http://www.sdlegislature.gov/" "Legislative_Session/archive.aspx")
            .text
        )
        doc = lxml.html.fromstring(html)
        sessions = [
            x.strip() for x in doc.xpath('//table//td[@data-title="Year"]/text()')
        ]

        # Archive page lacks the latest session
        current_session_url = doc.xpath(
            '//*[@id="ctl00_divHeader_mnuMain"]/li[6]/ul/li[1]/a/@href'
        )[0]
        current_session = current_session_url.replace(
            "/Legislative_Session/Bills/Default.aspx?Session=", ""
        )
        if current_session not in sessions:
            sessions.append(current_session)

        return sessions
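
The scraper above relies on two common lxml.html moves: pulling text nodes with a .../text() XPath and pulling an attribute with .../@href. A small sketch against invented markup and hypothetical session values:

import lxml.html

page = """
<html><body>
  <table>
    <tr><td data-title="Year">2019 (94th Session)</td></tr>
    <tr><td data-title="Year">2020 (95th Session)</td></tr>
  </table>
  <ul id="menu"><li><a href="/Legislative_Session/Bills/Default.aspx?Session=2021">Bills</a></li></ul>
</body></html>
"""

doc = lxml.html.fromstring(page)
sessions = [x.strip() for x in doc.xpath('//table//td[@data-title="Year"]/text()')]
current = doc.xpath('//ul[@id="menu"]//a/@href')[0].replace(
    "/Legislative_Session/Bills/Default.aspx?Session=", "")

if current not in sessions:
    sessions.append(current)
print(sessions)   # the two table years plus '2021'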
GitHub: norbusan / calibre-debian / src / calibre / gui2 / store / stores / ebook_nl_plugin.py
def search(self, query, max_results=10, timeout=60):
        url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' + urllib2.quote(query))
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('./meta[@itemprop="url"]/@content')).strip()
                if not id:
                    continue
                cover_url = 'http://www.ebook.nl/store/' + ''.join(data.xpath('.//img[@itemprop="image"]/@src'))
                title = ''.join(data.xpath('./span[@itemprop="name"]/a/text()')).strip()
                author = ''.join(data.xpath('./span[@itemprop="author"]/a/text()')).strip()
                if author == ' ':
                    author = ''
                price = ''.join(data.xpath('.//span[@itemprop="price"]//text()'))
                counter -= 1

                s = SearchResult()
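
The store plugin above iterates over schema.org Book blocks and uses XPaths that are relative to each block (./ and .//). A reduced sketch of that per-result loop, with invented markup standing in for the live page:

from lxml import html

page = """
<html><body><div id="books">
  <div itemtype="http://schema.org/Book">
    <meta itemprop="url" content="http://www.ebook.nl/store/book-1"/>
    <span itemprop="name"><a>First Title</a></span>
    <span itemprop="author"><a>A. Author</a></span>
    <span itemprop="price">9,99</span>
  </div>
</div></body></html>
"""

doc = html.fromstring(page)
for data in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
    # Paths starting with ./ are evaluated relative to this result block.
    book_id = ''.join(data.xpath('./meta[@itemprop="url"]/@content')).strip()
    title = ''.join(data.xpath('./span[@itemprop="name"]/a/text()')).strip()
    author = ''.join(data.xpath('./span[@itemprop="author"]/a/text()')).strip()
    price = ''.join(data.xpath('.//span[@itemprop="price"]//text()'))
    print(book_id, title, author, price)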
GitHub: Donearm / PyImagedownloader / pyimagedownloader / radikal.py
def process_url(self, url):
        response = self.connector.reqhandler(url)

        try:
            self.page = lxml.html.fromstring(response)
        except lxml.etree.XMLSyntaxError as e:
            # most of the time we can simply ignore parsing errors
            self.logger.error("XMLSyntaxError at %s" % url)
            return

        return self.page
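
Worth noting from the snippet above: lxml.html.fromstring() can raise lxml.etree.XMLSyntaxError on input the parser cannot make sense of, so code that feeds it arbitrary responses tends to wrap the call. A sketch of that defensive pattern; the function name is hypothetical:

import lxml.html
import lxml.etree

def parse_or_none(markup, url, logger=None):
    """Parse markup into an element tree, returning None when parsing fails."""
    try:
        return lxml.html.fromstring(markup)
    except lxml.etree.XMLSyntaxError:
        # Most of the time parsing errors can simply be ignored, as the
        # snippet above notes; log and move on.
        if logger is not None:
            logger.error("XMLSyntaxError at %s" % url)
        return None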
GitHub: ctb / twill / twill / browser.py
def get_form_field(self, form, fieldname):
        """
        Return the control that matches 'fieldname'.  Must be
        a *unique* regexp/exact string match.
        """
        if fieldname in form.fields.keys():
            controls = [f for f in form.inputs if f.get("name") == fieldname \
                        and hasattr(f, 'type') and f.type == 'checkbox']
            if len(controls) > 1:
                return html.CheckboxGroup(controls)

        fieldname = str(fieldname)
        
        found = None
        found_multiple = False

        matches = [ c for c in form.inputs if c.get("id") == fieldname ]

        # test exact match.
        if matches:
            if unique_match(matches):
                found = matches[0]
            else:
                found_multiple = True   # record for error reporting.
        
        matches = [ c for c in form.inputs if str(c.name) == fieldname ]
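
Behind the field lookup above is lxml.html's form API: a parsed document exposes .forms, and each form exposes .fields (a dict-like view keyed by control name) and .inputs (the control elements themselves). A small sketch with an invented form:

from lxml import html

doc = html.fromstring("""
<html><body><form>
  <input type="text" name="q" id="q" value="lxml"/>
  <input type="checkbox" name="topics" value="html"/>
  <input type="checkbox" name="topics" value="xml"/>
</form></body></html>
""")

form = doc.forms[0]
print(form.fields['q'])          # 'lxml' -- value looked up by field name

# Several checkboxes can share one name; collect them as the snippet above does.
controls = [f for f in form.inputs
            if f.get('name') == 'topics'
            and hasattr(f, 'type') and f.type == 'checkbox']
print(len(controls))             # 2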
GitHub: laws-africa / indigo / indigo_api / exporters.py
def save_attachments(self, html, document, prefix, tmpdir):
        """ Place attachments needed by the html of this document into tmpdir. Only attachments
        referenced using the given prefix are saved.
        """
        html = lxml.html.fromstring(html)
        prefix_len = len(prefix)

        # gather up the attachments that occur in the html
        fnames = set(
            img.get('src')[prefix_len:]
            for img in html.iter('img')
            if img.get('src', '').startswith(prefix)
        )

        # ensure the media directory exists
        media_dir = os.path.join(tmpdir, prefix)
        os.makedirs(media_dir, exist_ok=True)

        for attachment in document.attachments.all():
            # the src attribute values in fnames are URL-quoted
            if urllib.parse.quote(attachment.filename) in fnames:
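
The exporter above collects image filenames by iterating <img> elements with .iter('img') and filtering on a src prefix. That part in isolation, with a hypothetical prefix and invented markup:

import lxml.html

prefix = 'media/'
doc = lxml.html.fromstring(
    '<div>'
    '<img src="media/logo%20dark.png"/>'
    '<img src="https://example.com/external.png"/>'
    '<img alt="no src"/>'
    '</div>'
)

fnames = set(
    img.get('src')[len(prefix):]
    for img in doc.iter('img')
    if img.get('src', '').startswith(prefix)
)
print(fnames)   # {'logo%20dark.png'} -- still URL-quoted, as the snippet notes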
GitHub: kovidgoyal / calibre / src / calibre / gui2 / store / stores / amazon_es_plugin.py
def asbytes(x):
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery)
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        try:
            results = doc.xpath('//div[@id="atfResults" and @class]')[0]
        except IndexError:
            return

        if 's-result-list-parent-container' in results.get('class', ''):
            data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
            format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION
            asin_xpath = '@data-asin'
            cover_xpath =  "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
            title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
            author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
            price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
                           ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
        else:
            return
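
The XPath expressions above use the contains(concat(' ', normalize-space(@class), ' '), ' token ') idiom, which matches a whole class token rather than a substring of the class attribute. A toy demonstration with invented markup:

from lxml import html

doc = html.fromstring("""
<html><body><ul>
  <li class="a-carousel-card s-result-item">first</li>
  <li class="s-result-item celwidget">second</li>
  <li class="s-result-items-wrapper">third</li>
</ul></body></html>
""")

data_xpath = ("descendant-or-self::li[@class and contains(concat(' ', "
              "normalize-space(@class), ' '), ' s-result-item ')]")

# The third <li> is skipped: its class only contains 's-result-item' as a
# substring, not as a whole space-separated token.
print([li.text for li in doc.xpath(data_xpath)])   # ['first', 'second']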