page_worth = details_per_index * items_per_page
total_items = self.settings.getint('SPEED_TOTAL_ITEMS', 1000)
# Round up: ceiling division so a partial last index page still counts
index_pages_count = (total_items + page_worth - 1) // page_worth
return index_pages_count
def my_process_request(self, r):
if self.settings.getbool('SPEED_INDEX_HIGHER_PRIORITY', False):
r.priority = 1
return r
rules = (
Rule(LinkExtractor(restrict_xpaths='//*[@class="nav"]'),
process_request="my_process_request"),
Rule(LinkExtractor(restrict_xpaths='//*[@class="item"]'),
callback='parse_item')
)
def parse_item(self, response):
if self.blocking_delay > 0.001:
# Deliberately bad: time.sleep() blocks the Twisted reactor and stalls the whole crawl
time.sleep(self.blocking_delay)
for li in response.xpath('//li'):
i = DummyItem()
id_phrase = li.xpath('.//h3/text()').extract()[0]
i['id'] = int(id_phrase.split()[1])
i['info'] = li.xpath('.//div[@class="info"]/text()').extract()
yield i
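The fragment above assumes a DummyItem class and an `import time` that are not shown, and the "round up" line is plain integer ceiling division (for example, 1000 items at 30 per index page gives (1000 + 30 - 1) // 30 == 34 pages). A minimal sketch of the missing item class, with field names taken from parse_item:
import scrapy

class DummyItem(scrapy.Item):
    id = scrapy.Field()
    info = scrapy.Field()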
class Huxiu(RedisCrawlSpider):
# Spider name
name = "huxiu"
# Crawl scope: the spider may only follow links within this domain
allowed_domains = ["huxiu.com"]
# The first batch of requests is read from this Redis queue instead of start_urls
redis_key = 'huxiu:start_urls'
# start_urls = ['https://www.huxiu.com']
rules = (
# Follow channel links matching r'/channel/\d{1,3}\.html' from the start pages
Rule(LxmlLinkExtractor(allow=(r'/channel/\d{1,3}\.html', )), follow=True),
# Extract links matching r'/article/\d+\.html' and parse the downloaded pages with parse_item; do not follow further
Rule(LxmlLinkExtractor(allow=(r'/article/\d+\.html', )), callback='parse_item'),
)
def parse_item(self, response):
item = NewsItem()
item['url'] = response.url
# Get the numeric article id from the URL
article_id = response.url.split('/')[-1][:6]
# Build the XPaths for this article's page structure
title_xpath = '//*[@id="article{}"]/div[2]/div[2]/h1/text()'.format(article_id)
pub_time_xpath = '//*[@id="article{}"]/div[2]/div[2]/div[1]/div/span[1]/text()'.format(article_id)
content_xpath = '//*[@id="article_content{}"]'.format(article_id)
item['title'] = response.xpath(title_xpath).extract()[0].strip()
item['pub_time'] = response.xpath(pub_time_xpath).extract()[0].strip()
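NewsItem is imported from the project's items module and not shown here; a minimal sketch of what it presumably declares, with field names taken from the assignments in the Huxiu and Dgtle parse_item methods:
import scrapy

class NewsItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    pub_time = scrapy.Field()
    content_code = scrapy.Field()  # only filled in by the Dgtle spider further down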
class groundupSpider(CrawlSpider):
name = 'groundup'
allowed_domains = ['groundup.org.za']
start_urls = ['https://www.groundup.org.za/']
link_extractor = LinkExtractor(
allow=('https://www.groundup.org.za/article/', ),
deny=(
'https://www.facebook.com/',
'https://twitter.com/',
)
)
rules = (
Rule(link_extractor, process_links='filter_links', callback='parse_item', follow=True),
)
publication_name = 'GroundUp'
def parse_item(self, response):
og_url = response.xpath('//meta[@property="og:url"]/@content').extract_first()
# no 'canonical' that I could find
title = response.xpath('//h1/text()').extract_first()
self.logger.info('%s %s', response.url, title)
# should we be using og_url instead of response.url for the above?
og_type = response.xpath('//meta[@property="og:type"]/@content').extract_first()
if og_type == 'article':
subtitle = response.xpath('//p[@id="article_subtitle"]').css('::text').extract_first()
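The rule above passes process_links='filter_links', but the hook itself is not included in the snippet. A hedged sketch of what such a hook could look like; the de-duplication logic is an assumption, not the spider's actual code. Scrapy calls it with the list of Link objects extracted from a page and schedules whatever it returns:
def filter_links(self, links):
    # Drop article URLs this spider instance has already seen before they are scheduled.
    seen = getattr(self, '_seen_links', set())
    fresh = [link for link in links if link.url not in seen]
    seen.update(link.url for link in fresh)
    self._seen_links = seen
    return fresh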
from misc.log import *
from misc.spider import CommonSpider
class doubanmovieSpider(CommonSpider):
name = "doubanmovie"
allowed_domains = ["douban.com"]
start_urls = [
#"https://movie.douban.com/tag/",
"https://movie.douban.com/chart"
]
rules = [
#Rule(sle(allow=("/tag/[0-9]{4}$")), follow=True),
#Rule(sle(allow=("/tag/[0-9]{4}/?start=[0-9]{2,4}&type=T$")), follow=True),
#Rule(sle(allow=("/subject/[0-9]+$")), callback='parse_1'),
Rule(sle(allow=("/subject/[0-9]+/$")), callback='parse_1', follow=True),
]
list_css_rules = {
'.linkto': {
'url': 'a::attr(href)',
'name': 'a::text',
}
}
list_css_rules_2 = {
'#listZone .Q-tpWrap': {
'url': '.linkto::attr(href)',
'name': '.linkto::text'
}
}
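list_css_rules maps a list-row CSS selector to per-field CSS selectors and is consumed by the CommonSpider base class from misc.spider, which is not shown. A minimal sketch of how such a nested rule dict could be applied to a response (an illustration only, not the actual CommonSpider implementation):
def apply_css_rules(response, css_rules):
    # For every node matching the outer selector, extract one dict per field rule.
    results = []
    for row_selector, field_rules in css_rules.items():
        for node in response.css(row_selector):
            results.append({field: node.css(rule).get() for field, rule in field_rules.items()})
    return results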
from properties.items import PropertiesItem
class EasySpider(CrawlSpider):
name = 'easy'
allowed_domains = ["scrapybook.s3.amazonaws.com"]
# Start on the first index page
start_urls = (
'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
)
# Rules for horizontal and vertical crawling
rules = (
Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
callback='parse_item')
)
def parse_item(self, response):
""" This function parses a property page.
@url http://scrapybook.s3.amazonaws.com:9312/properties/property_000000.html
@returns items 1
@scrapes title price description address image_urls
@scrapes url project spider server date
"""
# Create the loader using the response
l = ItemLoader(item=PropertiesItem(), response=response)
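The docstring above is a Scrapy contract (verified with the `scrapy check easy` command), and the snippet stops right after the loader is created. A hedged sketch of how the body typically continues to satisfy the @scrapes fields; the selectors and added values here are assumptions, not the book's exact code:
# (requires `import datetime` at module level)
l.add_xpath('title', '//*[@itemprop="name"][1]/text()')
l.add_xpath('price', '//*[@itemprop="price"][1]/text()', re='[0-9.,]+')
l.add_xpath('description', '//*[@itemprop="description"][1]/text()')
l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()')
l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src')
l.add_value('url', response.url)
l.add_value('project', self.settings.get('BOT_NAME'))
l.add_value('spider', self.name)
l.add_value('date', datetime.datetime.now())
return l.load_item()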
class Dgtle(RedisCrawlSpider):
# Spider name
name = "dgtle"
# Crawl scope: the spider may only follow links within this domain
allowed_domains = ["dgtle.com"]
# The first batch of requests is read from this Redis queue instead of start_urls
redis_key = 'dgtle:start_urls'
# start_urls = ['http://www.dgtle.com/']
rules = (
# Follow category listing links matching r'/portal.php\?mod=list&catid=\d{2}'
Rule(LxmlLinkExtractor(allow=(r'/portal.php\?mod=list&catid=\d{2}', )), follow=True),
# Extract article links matching r'/article[\d|-]+\.html' and parse the downloaded pages with parse_item; do not follow further
Rule(LxmlLinkExtractor(allow=(r'/article[\d|-]+\.html', )), callback='parse_item'),
)
def parse_item(self, response):
item = NewsItem()
item['url'] = response.url
item['title'] = response.xpath('/html/body/div[3]/h2/a/text()').extract()[0].strip()
item['pub_time'] = response.xpath('/html/body/div[3]/div/div[1]/i/text()').extract()[0].strip()
item['content_code'] = response.xpath('/html/body/div[4]/div[1]').extract()[0].strip()
# Yield each extracted item to the item pipelines; execution resumes here afterwards
yield item
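Both RedisCrawlSpider classes above (Huxiu and Dgtle) idle until start URLs appear on their redis_key queue instead of reading start_urls. A minimal sketch of seeding those queues with redis-py; the host and port are assumptions:
import redis

r = redis.Redis(host='localhost', port=6379)
r.lpush('huxiu:start_urls', 'https://www.huxiu.com')
r.lpush('dgtle:start_urls', 'http://www.dgtle.com/')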
from scrapy.linkextractors import LinkExtractor as sle
from hrtencent.items import *
from misc.log import *
class HrtencentSpider(CrawlSpider):
name = "hrtencent"
allowed_domains = ["tencent.com"]
start_urls = [
"http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10)
]
rules = [
Rule(sle(allow=("/position_detail.php\?id=\d*.*", )), callback='parse_2'),
Rule(sle(allow=("/position.php\?&start=\d{,2}#a")), follow=True, callback='parse_1')
]
def parse_2(self, response):
items = []
sel = Selector(response)
sites = sel.css('.tablelist')
for site in sites:
item = PositionDetailItem()
item['sharetitle'] = site.css('.h #sharetitle::text').extract()
item['bottomline'] = site.css('.bottomline td::text').extract()
# item['duty'] = site.css('.c .l2::text').extract()
item['link'] = response.url
items.append(item)
print(repr(item) + '\n')  # Python 3 print; repr shows non-ASCII text without the Python 2 unicode-escape decode
# info('parsed ' + str(response))
self.parse_1(response)
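PositionDetailItem comes from hrtencent.items via the star import and is not shown; a minimal sketch inferred from the fields parse_2 assigns:
import scrapy

class PositionDetailItem(scrapy.Item):
    sharetitle = scrapy.Field()
    bottomline = scrapy.Field()
    duty = scrapy.Field()  # assigned only in the commented-out line above
    link = scrapy.Field()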
from urllib.parse import unquote
from scrapy.spiders import Rule
from scrapy.selector import Selector
from baikeSpider.items import BaiduSpiderItem
from scrapy.linkextractors import LinkExtractor
from .redis_spider import RedisCrawlSpider
from ..config import baidu_task_queue, baidu_spider_name
from ..cache.html_cache import CacheTool
class BaiduSpider(RedisCrawlSpider):
task_queue = baidu_task_queue
base_url = "https://baike.baidu.com"
name = baidu_spider_name
allowed_domains = ['baike.baidu.com']
rules = (
Rule(LinkExtractor(allow=('https://baike.baidu.com/item/',)), callback='parse', follow=True),
)
# custom_settings = {
# 'ITEM_PIPELINES': {
# 'baikeSpider.pipelines.SpiderPipeline': 300,
# 'baikeSpider.pipelines.SpiderRedisPipeline': 301,
# 'baikeSpider.pipelines.WebCachePipeline': 302,
# },
# }
def parse(self, response):
items = BaiduSpiderItem()
selector = Selector(response)
# print(response.status, response)
items['url'] = unquote(response.url)
items['html'] = response.text
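BaiduSpiderItem only needs the two fields parse() fills in; a minimal sketch (the real item may declare more):
import scrapy

class BaiduSpiderItem(scrapy.Item):
    url = scrapy.Field()
    html = scrapy.Field()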
from urllib.parse import unquote
from scrapy.selector import Selector
from scrapy.spiders import Rule
from baikeSpider.cache.html_cache import CacheTool
from baikeSpider.items import WikiENSpiderItem
from .redis_spider import RedisCrawlSpider
from ..config import wiki_en_task_queue, wiki_en_spider_name
class WikiENSpider(RedisCrawlSpider):
task_queue = wiki_en_task_queue
base_url = "https://en.wikipedia.org"
name = wiki_en_spider_name
allowed_domains = ['en.wikipedia.org']
rules = (
Rule(LinkExtractor(allow=('https://en.wikipedia.org/wiki/',)), callback='parse', follow=True),
)
custom_settings = {
'DOWNLOADER_MIDDLEWARES': {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'baikeSpider.middlewares.MyUserAgentMiddleware': 400,
'baikeSpider.middlewares.MyRetryMiddleware': 501,
'baikeSpider.middlewares.MyProxyMiddleware': 100,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
}
def parse(self, response):
items = WikiENSpiderItem()
selector = Selector(response)
items['url'] = unquote(response.url)
items['html'] = response.text
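The custom middlewares referenced in custom_settings (MyUserAgentMiddleware, MyRetryMiddleware, MyProxyMiddleware) are not part of this snippet. A hedged sketch of a typical user-agent rotation downloader middleware; the USER_AGENTS setting name and the fallback UA are assumptions, not the project's actual code:
import random

class MyUserAgentMiddleware:
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Read the pool of user agents from settings; fall back to a single default.
        return cls(crawler.settings.getlist('USER_AGENTS') or ['Mozilla/5.0'])

    def process_request(self, request, spider):
        # Overwrite the User-Agent header before the request is downloaded.
        request.headers['User-Agent'] = random.choice(self.user_agents)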