How to use the scrapy.spiders.Rule class in Scrapy

To help you get started, we’ve selected a few scrapy.spiders.Rule examples, based on popular ways it is used in public projects.

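As a quick orientation before the examples, here is a minimal sketch of the usual pattern: a CrawlSpider declares a rules tuple, and each Rule pairs a LinkExtractor with an optional callback and a follow flag. The spider name, domain, and URL patterns below are illustrative placeholders, not taken from any of the projects that follow.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/']

    rules = (
        # No callback: just keep following pagination links (follow defaults to True)
        Rule(LinkExtractor(allow=(r'/page/\d+', ))),
        # With a callback: parse matching detail pages (follow defaults to False)
        Rule(LinkExtractor(allow=(r'/item/\d+\.html', )), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {'url': response.url, 'title': response.css('h1::text').get()}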

github scalingexcellence / scrapybook-2nd-edition / ch14 / speed / speed / spiders / speed.py View on GitHub
page_worth = details_per_index * items_per_page

        total_items = self.settings.getint('SPEED_TOTAL_ITEMS', 1000)

        # Round up
        index_pages_count = (total_items + page_worth - 1) // page_worth

        return index_pages_count

    def my_process_request(self, r):
        if self.settings.getbool('SPEED_INDEX_HIGHER_PRIORITY', False):
            r.priority = 1
        return r

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@class="nav"]'),
             process_request="my_process_request"),
        Rule(LinkExtractor(restrict_xpaths='//*[@class="item"]'),
             callback='parse_item')
    )

    def parse_item(self, response):
        if self.blocking_delay > 0.001:
            # This is a bad bad thing
            time.sleep(self.blocking_delay)

        for li in response.xpath('//li'):
            i = DummyItem()
            id_phrase = li.xpath('.//h3/text()').extract()[0]
            i['id'] = int(id_phrase.split()[1])
            i['info'] = li.xpath('.//div[@class="info"]/text()').extract()
            yield i
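
The first Rule above registers my_process_request by name, so every request extracted from the navigation links can have its priority bumped before it is scheduled. One caveat: in recent Scrapy releases (2.0 and later) the process_request callback also receives the response that originated the request, so a version-tolerant signature might look like the sketch below (the None default keeps it compatible with older releases).

    def my_process_request(self, request, response=None):
        # response is only supplied by Scrapy >= 2.0; older versions pass the request alone
        if self.settings.getbool('SPEED_INDEX_HIGHER_PRIORITY', False):
            request.priority = 1
        return request
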
github Johnson0722 / News_scrapy_redis / News_scrapy / spiders / huxiu.py View on GitHub
class Huxiu(RedisCrawlSpider):
    # Spider name
    name = "huxiu"
    # Crawl scope: the spider is only allowed to crawl within this domain
    allowed_domains = ["huxiu.com"]
    # The first batch of requests is read from this Redis key (queue) instead of start_urls
    redis_key = 'huxiu:start_urls'
    # start_urls = ['https://www.huxiu.com']


    rules = (
        # Extract links matching r'/channel/\d{1,3}\.html' from the index pages and keep following them (no callback)
        Rule(LxmlLinkExtractor(allow=(r'/channel/\d{1,3}\.html', )), follow=True),
        # Extract links matching r'/article/\d+\.html' and parse the downloaded pages with parse_item; do not follow further
        Rule(LxmlLinkExtractor(allow=(r'/article/\d+\.html', )), callback='parse_item'),
    )


    def parse_item(self, response):
        item = NewsItem()

        item['url'] = response.url
        # get article id
        article_id = response.url.split('/')[-1][:6]
        # generate xpath
        title_xpath = '//*[@id="article{}"]/div[2]/div[2]/h1/text()'.format(article_id)
        pub_time_xpath = '//*[@id="article{}"]/div[2]/div[2]/div[1]/div/span[1]/text()'.format(article_id)
        content_xpath = '//*[@id="article_content{}"]'.format(article_id)

        item['title'] = response.xpath(title_xpath).extract()[0].strip()
        item['pub_time'] = response.xpath(pub_time_xpath).extract()[0].strip()
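
Like the Dgtle spider further down, this is a RedisCrawlSpider: start_urls is commented out and the first requests come from the Redis key named in redis_key, so nothing is crawled until a seed URL is pushed to that key. Assuming the usual scrapy-redis setup with a local Redis instance, seeding could be done like this:

import redis

r = redis.Redis(host='localhost', port=6379, db=0)
r.lpush('huxiu:start_urls', 'https://www.huxiu.com')
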
github public-people / scrape-news / scrapenews / spiders / groundup.py View on GitHub
class groundupSpider(CrawlSpider):
    name = 'groundup'
    allowed_domains = ['groundup.org.za']

    start_urls = ['https://www.groundup.org.za/']

    link_extractor = LinkExtractor(
        allow=('https://www.groundup.org.za/article/', ),
        deny=(
            'https://www.facebook.com/',
            'https://twitter.com/',
        )
    )

    rules = (
        Rule(link_extractor, process_links='filter_links', callback='parse_item', follow=True),
    )

    publication_name = 'GroundUp'


    def parse_item(self, response):

        og_url = response.xpath('//meta[@property="og:url"]/@content').extract_first()
        # no 'canonical' that I could find
        title = response.xpath('//h1/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        # should we be using og_url instead of response.url for the above?
        og_type = response.xpath('//meta[@property="og:type"]/@content').extract_first()

        if og_type == 'article':
            subtitle = response.xpath('//p[@id="article_subtitle"]').css('::text').extract_first()
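
The Rule above wires a filter_links method in through process_links, but that method is not part of this excerpt. As a rough idea of what such a callback usually looks like (the method body below is a hypothetical illustration, not the project's actual implementation):

    def filter_links(self, links):
        # process_links receives the list of Link objects extracted by the rule
        # and returns the subset that should actually be requested.
        return [link for link in links if '/comments/' not in link.url]
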
github geekan / scrapy-examples / doubanmovie / doubanmovie / spiders / spider.py View on GitHub
from misc.log import *
from misc.spider import CommonSpider


class doubanmovieSpider(CommonSpider):
    name = "doubanmovie"
    allowed_domains = ["douban.com"]
    start_urls = [
        #"https://movie.douban.com/tag/",
        "https://movie.douban.com/chart"
    ]
    rules = [
        #Rule(sle(allow=("/tag/[0-9]{4}$")), follow=True),
        #Rule(sle(allow=("/tag/[0-9]{4}/?start=[0-9]{2,4}&type=T$")), follow=True),
        #Rule(sle(allow=("/subject/[0-9]+$")), callback='parse_1'),
        Rule(sle(allow=("/subject/[0-9]+/$")), callback='parse_1', follow=True),
    ]

    list_css_rules = { 
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }   

    list_css_rules_2 = { 
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }   
    }
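
The sle used in the rules above is not a standard Scrapy name; judging from the hrtencent spider from the same repository further down this page, it is almost certainly just an alias created with an import like:

from scrapy.linkextractors import LinkExtractor as sle
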
github scalingexcellence / scrapybook-2nd-edition / ch03 / properties / properties / spiders / easy.py View on GitHub
from properties.items import PropertiesItem


class EasySpider(CrawlSpider):
    name = 'easy'
    allowed_domains = ["scrapybook.s3.amazonaws.com"]

    # Start on the first index page
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    # Rules for horizontal and vertical crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item')
    )

    def parse_item(self, response):
        """ This function parses a property page.

        @url http://scrapybook.s3.amazonaws.com:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)
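
The excerpt stops right after the ItemLoader is created; the contract lines in the docstring (@scrapes title price ...) list the fields the full method populates, and they can be verified by running scrapy check easy. A hedged sketch of how the loader is typically filled in (the XPaths are placeholders, not the book's exact selectors):

        # Illustrative continuation; selectors are placeholders
        l.add_xpath('title', '//*[@itemprop="name"]/text()')
        l.add_xpath('price', '//*[@itemprop="price"]/text()')
        l.add_value('url', response.url)
        return l.load_item()
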
github Johnson0722 / News_scrapy_redis / News_scrapy / spiders / dgtle.py View on GitHub
class Dgtle(RedisCrawlSpider):
    # Spider name
    name = "dgtle"
    # Crawl scope: the spider is only allowed to crawl within this domain
    allowed_domains = ["dgtle.com"]
    # The first batch of requests is read from this Redis key (queue) instead of start_urls
    redis_key = 'dgtle:start_urls'
    # start_urls = ['http://www.dgtle.com/']


    rules = (
        # Extract category-listing links matching r'/portal.php\?mod=list&catid=\d{2}' from the index pages and keep following them (no callback)
        Rule(LxmlLinkExtractor(allow=(r'/portal.php\?mod=list&catid=\d{2}', )), follow=True),
        # Extract article links matching r'/article[\d|-]+\.html' and parse the downloaded pages with parse_item; do not follow further
        Rule(LxmlLinkExtractor(allow=(r'/article[\d|-]+\.html', )), callback='parse_item'),
    )


    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] =  response.xpath('/html/body/div[3]/h2/a/text()').extract()[0].strip()
        item['pub_time'] = response.xpath('/html/body/div[3]/div/div[1]/i/text()').extract()[0].strip()
        item['content_code'] = response.xpath('/html/body/div[4]/div[1]').extract()[0].strip()


        # Yield each extracted item to the item pipelines for processing; execution then resumes with any code after the yield
        yield item
github geekan / scrapy-examples / hrtencent / hrtencent / spiders / hrtencent_spider.py View on GitHub
from scrapy.linkextractors import LinkExtractor as sle


from hrtencent.items import *
from misc.log import *


class HrtencentSpider(CrawlSpider):
    name = "hrtencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10)
    ]
    rules = [
        Rule(sle(allow=(r"/position_detail.php\?id=\d*.*", )), callback='parse_2'),
        Rule(sle(allow=(r"/position.php\?&start=\d{,2}#a")), follow=True, callback='parse_1')
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.tablelist')
        for site in sites:
            item = PositionDetailItem()
            item['sharetitle'] = site.css('.h #sharetitle::text').extract()
            item['bottomline'] = site.css('.bottomline td::text').extract()
            # item['duty'] = site.css('.c .l2::text').extract()
            item['link'] = response.url
            items.append(item)
            print(repr(item) + '\n')
        # info('parsed ' + str(response))
        self.parse_1(response)
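
Note that a Rule callback only feeds the item pipelines if it returns or yields its items, so the tail of parse_2 (not shown in this excerpt) presumably ends with something like:

        return items
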
github Times125 / encyclopediaCrawler / baikeSpider / spiders / baidu_spider.py View on GitHub
from scrapy.spiders import Rule
from scrapy.selector import Selector
from baikeSpider.items import BaiduSpiderItem
from scrapy.linkextractors import LinkExtractor
from .redis_spider import RedisCrawlSpider
from ..config import baidu_task_queue, baidu_spider_name
from ..cache.html_cache import CacheTool


class BaiduSpider(RedisCrawlSpider):
    task_queue = baidu_task_queue
    base_url = "https://baike.baidu.com"
    name = baidu_spider_name
    allowed_domains = ['baike.baidu.com']
    rules = (
        Rule(LinkExtractor(allow=('https://baike.baidu.com/item/',)), callback='parse', follow=True),
    )

    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'baikeSpider.pipelines.SpiderPipeline': 300,
    #         'baikeSpider.pipelines.SpiderRedisPipeline': 301,
    #         'baikeSpider.pipelines.WebCachePipeline': 302,
    #     },
    # }

    def parse(self, response):
        items = BaiduSpiderItem()
        selector = Selector(response)
        # print(response.status, response)
        items['url'] = unquote(response.url)
        items['html'] = response.text
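
One caveat about this example and the WikiENSpider below: both name their Rule callback 'parse'. In a stock scrapy CrawlSpider that is explicitly discouraged, because CrawlSpider uses the parse method internally to drive the rules; it presumably works here only because the project ships its own RedisCrawlSpider. With the standard class, the safer pattern is a distinctly named callback, e.g.:

    rules = (
        Rule(LinkExtractor(allow=('https://baike.baidu.com/item/',)),
             callback='parse_item', follow=True),
    )
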
github Times125 / encyclopediaCrawler / baikeSpider / spiders / wiki_en_spider.py View on GitHub
from scrapy.selector import Selector
from scrapy.spiders import Rule

from baikeSpider.cache.html_cache import CacheTool
from baikeSpider.items import WikiENSpiderItem
from .redis_spider import RedisCrawlSpider
from ..config import wiki_en_task_queue, wiki_en_spider_name


class WikiENSpider(RedisCrawlSpider):
    task_queue = wiki_en_task_queue
    base_url = "https://en.wikipedia.org"
    name = wiki_en_spider_name
    allowed_domains = ['en.wikipedia.org']
    rules = (
        Rule(LinkExtractor(allow=('https://en.wikipedia.org/wiki/',)), callback='parse', follow=True),
    )
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'baikeSpider.middlewares.MyUserAgentMiddleware': 400,
            'baikeSpider.middlewares.MyRetryMiddleware': 501,
            'baikeSpider.middlewares.MyProxyMiddleware': 100,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
        }
    }

    def parse(self, response):
        items = WikiENSpiderItem()
        selector = Selector(response)
        items['url'] = unquote(response.url)
        items['html'] = response.text