How to use the scrapy.spiders.CrawlSpider class in Scrapy

To help you get started, we’ve selected a few examples based on popular ways CrawlSpider is used in public Scrapy projects.

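Before the project examples, here is a minimal sketch of the usual CrawlSpider pattern: subclass CrawlSpider, set start_urls, and declare Rule objects built on LinkExtractor. A Rule without a callback only follows the matched links; a Rule with a callback hands each matched page to that method. Everything below (spider name, domain, XPaths, field names) is illustrative and not taken from the projects that follow.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    # Illustrative placeholders; point these at your own site.
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/index.html']

    rules = (
        # Horizontal crawling: follow pagination links, no callback.
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@class, "next")]')),
        # Vertical crawling: send each detail page to parse_item.
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item'),
    )

    def parse_item(self, response):
        # Never override parse() on a CrawlSpider; it drives the rules.
        yield {
            'title': response.xpath('//h1/text()').extract_first(),
            'url': response.url,
        }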

github scalingexcellence / scrapybook-2nd-edition / ch03 / properties / properties / spiders / easy.py
import datetime
import socket

from scrapy.loader.processors import MapCompose, Join
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader

from properties.items import PropertiesItem


class EasySpider(CrawlSpider):
    name = 'easy'
    allowed_domains = ["scrapybook.s3.amazonaws.com"]

    # Start on the first index page
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    # Rules for horizontal and vertical crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item')
    )

    def parse_item(self, response):
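The snippet is cut off at the callback definition. A sketch of how such a parse_item body typically continues, using the ItemLoader, MapCompose, and Join imports shown above; the XPaths and PropertiesItem fields here are illustrative assumptions rather than the repository's exact code:

        # Illustrative continuation: populate a PropertiesItem with an ItemLoader.
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                    MapCompose(lambda p: p.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(str.strip), Join())
        # Housekeeping values that explain the datetime/socket imports above.
        l.add_value('url', response.url)
        l.add_value('date', datetime.datetime.now())
        l.add_value('server', socket.gethostname())
        return l.load_item()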
github realpython / book2-exercises / scraping / socrata / socrata / spiders / opendata_crawl.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector

from socrata.items import SocrataItem


class OpendataSpider(CrawlSpider):
    name = "opendatacrawl"
    allowed_domains = ["opendata.socrata.com"]
    start_urls = (
        'https://opendata.socrata.com/',
    )
    rules = [
        Rule(LinkExtractor(allow=r'browse\?utf8=%E2%9C%93&page=\d*'),
             callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        titles = Selector(response).xpath('//div[@itemscope="itemscope"]')
        for title in titles:
            item = SocrataItem()
            item["text"] = title.xpath('.//div[@class="browse2-result-title"]/h2/a/text()').extract()[0]
            item["url"] = title.xpath('.//div[@class="browse2-result-title"]/h2/a/@href').extract()[0]
github backto17 / SinaHouseCrawler / sinahouse / spiders / housespider.py
# coding: utf-8
'''
@date: Feb 24, 2016
@author: alex.lin
'''
import re
import datetime

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from sinahouse.items import SinaHouseItem, SinaHouseLayout


class SinaHouseSpider(CrawlSpider):
    """
    class: Sina real-estate (新浪房产) crawler: http://sh.house.sina.com.cn/
    """
    
    name = 'sinahouse'
    allowed_domains = ['house.sina.com.cn',]
    start_urls = ['http://data.house.sina.com.cn/sc/search/?keyword=&charset=utf8',]
    rules = [
            Rule(LinkExtractor(allow=(r'.*\.cn/\w+\d+/#wt_source.*?bt.*')), callback='parse_house', follow=False),  # extract links to individual property listings
            Rule(LinkExtractor(allow=(r'^http://data.house.sina.com.cn/\w+/search$'))),  # extract the link for each city
            Rule(LinkExtractor(allow=(r'^http://data.house.sina.com.cn/\w+/search/\?bcity.*'))),  # extract links for other cities under each province
            Rule(LinkExtractor(allow=(r'/\w+/search-\d*/.*'))),  # next-page links
            ]

    def parse_house(self, response):
        """
github scalingexcellence / scrapybook-2nd-edition / ch07 / properties / properties / spiders / login.py
import datetime
import socket

from scrapy.loader.processors import MapCompose, Join
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest

from properties.items import PropertiesItem


class LoginSpider(CrawlSpider):
    name = 'login'
    allowed_domains = ["scrapybook.s3.amazonaws.com"]

    # Start with a login request
    def start_requests(self):
        return [
            FormRequest(
                "http://examples.scrapybook.com/post/login.php",
                formdata={"user": "user", "pass": "pass"}
            )]

    # Rules for horizontal and vertical crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item')
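The snippet is cut off here. Because the FormRequest in start_requests has no explicit callback, the login response goes to CrawlSpider's built-in parse(), which applies the rules above to the page served after login. When the form also carries hidden fields such as CSRF tokens, the usual variant is FormRequest.from_response(); a sketch of that approach, assuming the same URL and credentials as above (the extra Request import is not part of the original snippet):

    def start_requests(self):
        # Fetch the login page first, then submit its form.
        # Requires: from scrapy import Request
        return [Request("http://examples.scrapybook.com/post/login.php",
                        callback=self.submit_form)]

    def submit_form(self, response):
        # from_response() copies hidden inputs (e.g. CSRF tokens) automatically.
        yield FormRequest.from_response(
            response, formdata={"user": "user", "pass": "pass"})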
github backto17 / SinaHouseCrawler / house / house / spiders / sinahouse.py
# coding: utf-8
'''
@date: Feb 24, 2016
@author: alex.lin
'''
import datetime
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from house.items import SinaHouseItem, SinaHouseLayout


class SinaHouseSpider(CrawlSpider):
    """
    class: Sina real-estate (新浪房产) crawler: http://sh.house.sina.com.cn/
    """
    
    name = 'sinahouse'
    allowed_domains = ['house.sina.com.cn',]
    start_urls = ['http://data.house.sina.com.cn/sc/search/?keyword=&charset=utf8',]
    rules = [
            Rule(LinkExtractor(allow=(r'.*\.cn/\w+\d+/#wt_source.*?bt.*')), callback='parse_house', follow=False),  # extract links to individual property listings
            Rule(LinkExtractor(allow=(r'^http://data.house.sina.com.cn/\w+/search$'))),  # extract the link for each city
            Rule(LinkExtractor(allow=(r'^http://data.house.sina.com.cn/\w+/search/\?bcity.*'))),  # extract links for other cities under each province
            Rule(LinkExtractor(allow=(r'/\w+/search-\d*/.*'))),  # next-page links
            ]

    def parse_house(self, response):
        """
github piaotiejun / restaurant / restaurant / spiders / dazhongdianping.py
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Host": "www.dianping.com",
    "HTTPS": "1",
    "RA-Sid": "7C4125DE-20150519-013547-91bdb7-b00401",
    "RA-Ver": "3.0.7",
    "Referer": "http://www.dianping.com",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36"
}

big_city_xx_cnt = 0
small_city_xx_cnt = 0
restaurant_cnt = 0

class DazhongdianpingSpider(CrawlSpider):
    name = 'dazhongdianping'
    allowed_domains = ['www.dianping.com']
    start_urls = ['http://www.dianping.com/citylist']
    # download_delay = 1  # download interval

    rules = (
        #Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        city_cnt = 0
        big_city_list = response.xpath('//ul[@id="divArea"]/li[1]/div/a/strong/text()').extract()
        big_city_code_list = response.xpath('//ul[@id="divArea"]/li[1]/div/a/@href').extract()
        for index, city in enumerate(big_city_list):
            item = DazhongdianpingItem()
            item['province'] = ''
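parse_start_url() is the hook CrawlSpider calls for responses to start_urls, which is why this spider can process the city-list page even though its rules tuple is empty. The snippet is cut off while the item is being filled; a guess at how the loop typically finishes, with the remaining field names assumed rather than taken from DazhongdianpingItem:

            # Assumed continuation: these fields and the yield are not in the snippet.
            item['city'] = city
            item['city_url'] = big_city_code_list[index]
            yield item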
github pskun / finance_news_analysis / crawler / crawler / spiders / EastMoneyGubaSpider.py
# -*- coding: utf-8 -*-

from scrapy.spiders import CrawlSpider

from ..items import GubaItem


class EastmoneyGubaSpider(CrawlSpider):
    ''' Crawler for Eastmoney Guba (东方财富股吧) content pages '''
    name = 'EastMoneyGubaSpider'
    allowed_domains = ['guba.eastmoney.com']

    def start_requests(self):
        # Read the URLs from a file
        pass

    # def parse(self, response):
    def parse_item(self, response):
        item = GubaItem()

        url = response.url
        title = response.xpath('//div[@id="zwconttbt"]/text()').extract()
        content_data = response.xpath('//div[@id="zwconbody"]')
        content = content_data.xpath('string(.)').extract()
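The start_requests stub above only notes that the URLs come from a file. A minimal sketch of one way to fill it in, assuming a plain-text file with one Guba post URL per line (the 'guba_urls.txt' name is a placeholder) and routing each response to parse_item, since the class defines no other callback:

    def start_requests(self):
        # Assumed: one URL per line in a local text file.
        # Requires: import scrapy (or: from scrapy import Request)
        with open('guba_urls.txt') as f:
            for line in f:
                url = line.strip()
                if url:
                    yield scrapy.Request(url, callback=self.parse_item)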