def record():
    """Return results from the T2K spider."""
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    spider.domain = "file:///tests/responses/t2k/"
    parsed_node = spider.parse_node(response, nodes[0])

    splash_response = fake_response_from_file('t2k/001.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["title"] = parsed_node.meta["title"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]
    splash_response.meta["authors"] = parsed_node.meta["authors"]

    parsed_item = spider.scrape_for_pdf(splash_response).next()
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
def test_badly_encoded_body(self):
    # \xe9 on its own is not a valid UTF-8 sequence
    r1 = TextResponse('http://www.example.com',
                      body=b'<p>an Jos\xe9 de</p>',
                      encoding='utf-8')
    Selector(r1).xpath('//text()').getall()
def urls():
    spider = base_spider.BaseSpider()
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    return spider.get_urls_in_record(nodes[0])
def get_ovpn(url, save_to):
    page = Selector(text=request.urlopen(url).read()) \
        .xpath('.//ul[@class="listBigArrow"]/li/a')
    cururl = url.rsplit('/', 2)[0]
    for link in page:
        if link.xpath('./strong/text()').extract_first().find('UDP') > 0:
            download(cururl + link.xpath('./@href').extract_first(), save_to + 'UDP.ovpn')
        elif link.xpath('./strong/text()').extract_first().find('TCP') > 0:
            download(cururl + link.xpath('./@href').extract_first(), save_to + 'TCP.ovpn')
        else:
            download(cururl + link.xpath('./@href').extract_first(), save_to + '.ovpn')
def parse_content(self, response):
    self.log("=================================================")
    self.log("cat_url = %s proxy = %s" % (response.meta['cat_url'], response.meta['proxy']))
    chenshi_name = response.meta['city_id']
    sel = Selector(response)
    item = SpiderDianpingXmtItem()
    shop_type = response.meta['shop_type']
    shop_url = response.url
    http_status = response.status
    self.log("chenshi_name = %s" % chenshi_name)
    self.log("shop_url = %s" % shop_url)
    self.log("shop_type = %s" % shop_type)
    self.log("http_status = %s" % http_status)
    x = sel.xpath('//div[@id="basic-info"]')
    if len(x) > 0:
        shop_name = x[0].xpath('h1[@class="shop-name"]/text()').extract()[0].strip()
def parse_item(self, response):
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    sites_even = sel.css('table.tablelist tr.even')
    for site in sites_even:
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()[0]
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()[0]
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()[0]
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()[0]
        items.append(item)
        #print repr(item).decode("unicode-escape") + '\n'
    sites_odd = sel.css('table.tablelist tr.odd')
    for site in sites_odd:
        # odd rows carry the same columns as the even rows above
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()[0]
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()[0]
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()[0]
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()[0]
        items.append(item)
    return items
def _extract_links(self, response):
    body = response.body_as_unicode()
    # Default to HTML; switch to XML when the body starts with an XML declaration.
    _type = 'html'
    if body.lstrip().startswith('<?xml'):
        _type = 'xml'
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    for url in urls:
        yield scrapy.Request(
            response.urljoin(url),
            callback=self.parse_store
        )
def parse(self, response):
    sel = Selector(response)
    item = ScrapscrapyItem()
    item['Heading'] = str(
        sel.xpath('/html/body/div[2]/div/div[1]/div/div[1]/h1').extract())
    item['Content'] = str(
        sel.xpath('/html/body/div[2]/div/div[1]/div/div[1]/p/text()').extract())
    item['Source_Website'] = "http://scrapy.org"
    return item
def parseSourceUrl(self, response):
    '''Parse the article and send items to the pipeline.'''
    article_content = response.xpath("//div [@id='article-main']").extract()
    if article_content:
        toutiaoItem = ToutiaoItem()
        toutiaoItem['title'] = \
            Selector(text=article_content[0]).xpath('//h1 [@class="article-title"]/text()').extract()[0]
        toutiaoItem['source'] = '头条'
        toutiaoItem['tag'] = Selector(text=article_content[0]).xpath('//li [@class="label-item"]/text()').extract()
        toutiaoItem['title_hash'] = data_spider.common.get_md5_value(toutiaoItem['title'].encode("utf-8"))
        toutiaoItem['artical_url'] = str(toutiaoItem['title_hash']) + ".html"
        article_time = Selector(text=article_content[0]).xpath('//span [@class="time"]/text()').extract()
        toutiaoItem['artical_time'] = ''
        if article_time:
            toutiaoItem['artical_time'] = article_time[0]
        toutiaoItem['collect_time'] = int(time.time())
        # Strip href attributes before writing the article body to disk.
        article_content, number = re.subn(r"href=\".*\"", '', article_content[0])
        article_content = bytes(article_content, "utf-8")
        with open("./html/" + toutiaoItem['artical_url'], 'wb') as f:
            f.write(article_content)
        yield toutiaoItem