from lxml import html
import requests
import time
from get_stock_data import _get_company_primary_stats as get_company_primary_stats
from company_page import CompanyPage
stock_company = "NTPC"
page = requests.get('http://money.rediff.com/%s' % stock_company)
tree = html.fromstring(page.text)
company = CompanyPage(tree)
def test_get_company_primary_stats():
    primary_stats = get_company_primary_stats(company, tree)
    assert primary_stats.get('pe_ratio') > 0
    assert all([
        primary_stats.get('eps') > 0,
        primary_stats.get('price_of_stock') > 0,
        primary_stats.get('fifty_two_wk_high') > 0,
        primary_stats.get('fifty_two_wk_low') > 0,
    ])
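
As a quick way to exercise these assertions without a test runner (an illustrative addition, not part of the original module), the test function can also be called directly:

if __name__ == '__main__':
    test_get_company_primary_stats()
    print("primary stats assertions passed")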
# Fragment: timeout handling for the PhantomJS capture subprocess
# (the surrounding control flow is not shown in this excerpt).
pid.kill()
stdout, stderr = pid.communicate()
app.logger.error('PhantomJS Static Capture timeout')
raise Exception('PhantomJS Static Capture timeout')

# If the subprocess has an error, raise an exception
if stderr or stdout:
    raise Exception(stderr)

# Strip tags and parse out all text
ignore_tags = ('script', 'noscript', 'style')
with open(content_to_parse, 'r') as content_file:
    content = content_file.read()
cleaner = clean.Cleaner()
content = cleaner.clean_html(content)
doc = LH.fromstring(content)
output = ""
for elt in doc.iterdescendants():
    if elt.tag in ignore_tags:
        continue
    text = elt.text or ''
    tail = elt.tail or ''
    wordz = " ".join((text, tail)).strip('\t')
    if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
        output += wordz

# Since the filename format is different for static captures, update the filename.
# This ensures the URLs point to the correct resources.
if model == 'static':
    capture_name = capture_name.split('.')[0]
# Write the parsed HTML text into our capture folder
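
For reference, a self-contained sketch of the same strip-and-extract pattern (the sample HTML string is purely illustrative, and on recent lxml releases the clean module may require the separate lxml_html_clean package):

import re
import lxml.html as LH
from lxml.html import clean


def extract_visible_text(raw_html):
    """Return the visible text of an HTML string, skipping script/noscript/style."""
    ignore_tags = ('script', 'noscript', 'style')
    cleaned = clean.Cleaner().clean_html(raw_html)
    doc = LH.fromstring(cleaned)
    parts = []
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        words = " ".join(((elt.text or ''), (elt.tail or ''))).strip()
        if len(words) >= 2 and not re.match(r"^[ \t\n]*$", words):
            parts.append(words)
    return " ".join(parts)


print(extract_visible_text("<html><body><p>Hello, world</p><script>var x = 1;</script></body></html>"))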
import os
import requests
from lxml import html


def main():
    # opens xkcd.com
    try:
        page = requests.get("https://www.xkcd.com")
    except requests.exceptions.RequestException as e:
        print(e)
        exit()
    # parses the xkcd.com page
    tree = html.fromstring(page.content)
    # finds the image src url
    image_src = tree.xpath(".//*[@id='comic']/img/@src")[0]
    image_src = "https:" + str(image_src)
    # gets the comic name from the image src url
    comic_name = image_src.split('/')[-1]
    # save location of the comic
    comic_location = os.getcwd() + '/comics/'
    # checks whether the save location exists, and creates it if not
    if not os.path.exists(comic_location):
        os.makedirs(comic_location)
    # creates the final comic location, including the name of the comic
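    # Possible continuation (an assumption, not in the original snippet):
    # download the image and save it under the final comic location.
    final_location = os.path.join(comic_location, comic_name)
    response = requests.get(image_src, stream=True)
    with open(final_location, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)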
import requests
from lxml import html
from lxml.html import tostring


def scrapeInfo(mainContent, mainXPath, paraXPath):
    li = []
    mainLinksXPath = mainContent.xpath(mainXPath)
    ## Turn mainLinksXPath into a set to remove duplicates, then convert it back to a list
    mainLinksXPath = list(set(mainLinksXPath))
    ## Loop through the elements in mainLinksXPath
    for mainLinksElements in mainLinksXPath:
        ## Serialize the element to a string and parse it back as HTML
        link = tostring(mainLinksElements)
        link = html.fromstring(link)
        ## Use XPath to get all anchor tags in the HTML element
        link = link.xpath('//a')
        ## Loop through each element returned by the XPath,
        ## i.e. through all the anchor tags
        for i in link:
            ## Get the href attribute from the anchor tag
            i = i.get('href')
            if 'http' not in i:
                i = 'http://www.fintrac-canafe.gc.ca' + i
            ## Do an HTTP request on the article link
            linkRequest = requests.get(i)
            writeToLog("Gathering Names from: " + i + "\n")
            ## Parse the content from the request as HTML
            linkContent = html.fromstring(linkRequest.content)
            ## Find the paraXPath in the article
            linkXPath = linkContent.xpath(paraXPath)
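            ## Possible continuation (an assumption, not in the original excerpt):
            ## collect the text of the matched nodes, assuming paraXPath selects elements.
            for paragraph in linkXPath:
                li.append(paragraph.text_content())
    return li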
def get_session_list(self):
    html = (
        scrapelib.Scraper()
        .get("http://www.sdlegislature.gov/Legislative_Session/archive.aspx")
        .text
    )
    doc = lxml.html.fromstring(html)
    sessions = [
        x.strip() for x in doc.xpath('//table//td[@data-title="Year"]/text()')
    ]
    # The archive page lacks the latest session
    current_session_url = doc.xpath(
        '//*[@id="ctl00_divHeader_mnuMain"]/li[6]/ul/li[1]/a/@href'
    )[0]
    current_session = current_session_url.replace(
        "/Legislative_Session/Bills/Default.aspx?Session=", ""
    )
    if current_session not in sessions:
        sessions.append(current_session)
    return sessions
def search(self, query, max_results=10, timeout=60):
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
           urllib2.quote(query))
    br = browser()
    counter = max_results

    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('./meta[@itemprop="url"]/@content')).strip()
            if not id:
                continue
            cover_url = 'http://www.ebook.nl/store/' + ''.join(
                data.xpath('.//img[@itemprop="image"]/@src'))
            title = ''.join(data.xpath('./span[@itemprop="name"]/a/text()')).strip()
            author = ''.join(data.xpath('./span[@itemprop="author"]/a/text()')).strip()
            if author == ' ':
                author = ''
            price = ''.join(data.xpath('.//span[@itemprop="price"]//text()'))
            counter -= 1

            s = SearchResult()
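            # Possible continuation (an assumption, based on how calibre store
            # plugins typically populate and yield search results):
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id
            yield s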
def process_url(self, url):
    response = self.connector.reqhandler(url)
    try:
        self.page = lxml.html.fromstring(response)
    except lxml.etree.XMLSyntaxError as e:
        # most of the time we can simply ignore parsing errors
        self.logger.error("XMLSyntaxError at %s" % url)
        return
    return self.page
def get_form_field(self, form, fieldname):
    """
    Return the control that matches 'fieldname'. Must be
    a *unique* regexp/exact string match.
    """
    if fieldname in form.fields.keys():
        controls = [f for f in form.inputs
                    if f.get("name") == fieldname
                    and hasattr(f, 'type') and f.type == 'checkbox']
        if len(controls) > 1:
            return html.CheckboxGroup(controls)

    fieldname = str(fieldname)
    found = None
    found_multiple = False

    matches = [c for c in form.inputs if c.get("id") == fieldname]
    # test for an exact match first
    if matches:
        if unique_match(matches):
            found = matches[0]
        else:
            found_multiple = True  # record for error reporting
    matches = [c for c in form.inputs if str(c.name) == fieldname]
def save_attachments(self, html, document, prefix, tmpdir):
    """ Place attachments needed by the html of this document into tmpdir.
    Only attachments referenced using the given prefix are saved.
    """
    html = lxml.html.fromstring(html)
    prefix_len = len(prefix)

    # gather up the attachments that occur in the html
    fnames = set(
        img.get('src')[prefix_len:]
        for img in html.iter('img')
        if img.get('src', '').startswith(prefix)
    )

    # ensure the media directory exists
    media_dir = os.path.join(tmpdir, prefix)
    os.makedirs(media_dir, exist_ok=True)

    for attachment in document.attachments.all():
        # the src attribute values in fnames are URL-quoted
        if urllib.parse.quote(attachment.filename) in fnames:
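            # Possible body (an assumption, not in the original excerpt): copy the
            # attachment's contents into the media directory under its filename.
            target = os.path.join(media_dir, attachment.filename)
            with open(target, 'wb') as out:
                out.write(attachment.file.read())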
# Fragment from inside a search function (the enclosing def, which supplies
# uquery, base_url, max_results, timeout and write_html_to, is not shown).
def asbytes(x):
    if isinstance(x, type('')):
        x = x.encode('utf-8')
    return x

uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
url = base_url + '?' + urlencode(uquery)

br = browser(user_agent=get_user_agent())
counter = max_results

with closing(br.open(url, timeout=timeout)) as f:
    raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    try:
        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
    except IndexError:
        return
    if 's-result-list-parent-container' in results.get('class', ''):
        data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
        format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION
        asin_xpath = '@data-asin'
        cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
        title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
        author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
        price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
                       ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
    else:
        return
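    # Possible continuation (an assumption, sketching how the XPaths defined
    # above would typically be consumed; field handling in the real plugin
    # is more involved):
    for data in doc.xpath(data_xpath):
        if counter <= 0:
            break
        asin = ''.join(data.xpath(asin_xpath))
        title = ''.join(data.xpath(title_xpath)).strip()
        cover_url = ''.join(data.xpath(cover_xpath))
        author = ''.join(data.xpath(author_xpath)).strip()
        price = ''.join(data.xpath(price_xpath)).strip()
        counter -= 1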