import datetime
import socket

from scrapy.loader.processors import MapCompose, Join
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader

from properties.items import PropertiesItem


class EasySpider(CrawlSpider):
    name = 'easy'
    allowed_domains = ["scrapybook.s3.amazonaws.com"]

    # Start on the first index page
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    # Rules for horizontal and vertical crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item')
    )

    def parse_item(self, response):
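        # The body of parse_item is not included in the original snippet. The
        # sketch below is an assumption based on the imports above; the
        # PropertiesItem field names ('title', 'price', 'url', 'server',
        # 'date') and the itemprop XPaths are hypothetical.
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                    MapCompose(str.strip), Join())
        l.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                    MapCompose(str.strip))
        l.add_value('url', response.url)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()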


from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule

from socrata.items import SocrataItem


class OpendataSpider(CrawlSpider):
    name = "opendatacrawl"
    allowed_domains = ["opendata.socrata.com"]
    start_urls = (
        'https://opendata.socrata.com/',
    )

    rules = [
        Rule(LinkExtractor(allow=r'browse\?utf8=%E2%9C%93&page=\d*'),
             callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        titles = Selector(response).xpath('//div[@itemscope="itemscope"]')
        for title in titles:
            item = SocrataItem()
            item["text"] = title.xpath('.//div[@class="browse2-result-title"]/h2/a/text()').extract()[0]
            item["url"] = title.xpath('.//div[@class="browse2-result-title"]/h2/a/@href').extract()[0]


# coding: utf-8
'''
@date: Feb 24, 2016
@author: alex.lin
'''
import re
import datetime

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from sinahouse.items import SinaHouseItem, SinaHouseLayout


class SinaHouseSpider(CrawlSpider):
    """
    Sina real-estate crawler: http://sh.house.sina.com.cn/
    """
    name = 'sinahouse'
    allowed_domains = ['house.sina.com.cn']
    start_urls = ['http://data.house.sina.com.cn/sc/search/?keyword=&charset=utf8']

    rules = [
        # Links to individual property listings
        Rule(LinkExtractor(allow=r'.*\.cn/\w+\d+/#wt_source.*?bt.*'),
             callback='parse_house', follow=False),
        # Links to each city
        Rule(LinkExtractor(allow=r'^http://data.house.sina.com.cn/\w+/search$')),
        # Links to the other cities within each province
        Rule(LinkExtractor(allow=r'^http://data.house.sina.com.cn/\w+/search/\?bcity.*')),
        # Next-page links
        Rule(LinkExtractor(allow=r'/\w+/search-\d*/.*')),
    ]

    def parse_house(self, response):
        """


import datetime
import socket

from scrapy.loader.processors import MapCompose, Join
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy.http import FormRequest

from properties.items import PropertiesItem


class LoginSpider(CrawlSpider):
    name = 'login'
    allowed_domains = ["scrapybook.s3.amazonaws.com"]

    # Start with a login request; the response is handled by CrawlSpider's
    # default parse(), which applies the rules below.
    def start_requests(self):
        return [
            FormRequest(
                "http://examples.scrapybook.com/post/login.php",
                formdata={"user": "user", "pass": "pass"}
            )]

    # Rules for horizontal and vertical crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item'),
    )
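    # The 'parse_item' callback referenced above is not shown in the original
    # snippet. A minimal assumed placeholder so the rule resolves at runtime
    # ('url' is a hypothetical PropertiesItem field):
    def parse_item(self, response):
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_value('url', response.url)
        return l.load_item()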
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8",
"Cache-Control": "max-age=0",
"Host": "www.dianping.com",
"HTTPS": "1",
"RA-Sid": "7C4125DE-20150519-013547-91bdb7-b00401",
"RA-Ver": "3.0.7",
"Referer": "http://www.dianping.com",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36"
}
big_city_xx_cnt = 0
small_city_xx_cnt = 0
restaurant_cnt = 0
class DazhongdianpingSpider(CrawlSpider):
name = 'dazhongdianping'
allowed_domains = ['www.dianping.com']
start_urls = ['http://www.www.dianping.com/citylist']
#download_delay = 1 # 下载间隔
rules = (
#Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
)
def parse_start_url(self, response):
city_cnt = 0
big_city_list = response.xpath('//ul[@id="divArea"]/li[1]/div/a/strong/text()').extract()
big_city_code_list = response.xpath('//ul[@id="divArea"]/li[1]/div/a/@href').extract()
for index, city in enumerate(big_city_list):
item = DazhongdianpingItem()
item['province'] = ''
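            # The original snippet breaks off here. A hedged continuation:
            # 'city' and 'city_code' are assumed DazhongdianpingItem fields,
            # filled from the two XPath result lists above.
            item['city'] = city
            item['city_code'] = big_city_code_list[index].strip('/')
            city_cnt += 1
            yield item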


# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider

from ..items import GubaItem


class EastmoneyGubaSpider(CrawlSpider):
    ''' Crawler for Eastmoney Guba (stock forum) content pages '''
    name = 'EastMoneyGubaSpider'
    allowed_domains = ['guba.eastmoney.com']

    def start_requests(self):
        # Read the start URLs from a file
        pass
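        # Left unimplemented above. A possible sketch, assuming a local
        # 'start_urls.txt' file with one content-page URL per line and
        # parse_item as the callback:
        with open('start_urls.txt') as f:
            for line in f:
                url = line.strip()
                if url:
                    yield scrapy.Request(url, callback=self.parse_item)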
    # def parse(self, response):
    def parse_item(self, response):
        item = GubaItem()
        url = response.url
        title = response.xpath('//div[@id="zwconttbt"]/text()').extract()
        content_data = response.xpath('//div[@id="zwconbody"]')
        content = content_data.xpath('string(.)').extract()
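        # Cut off in the original snippet. A hedged completion, assuming
        # GubaItem defines 'url', 'title' and 'content' fields:
        item['url'] = url
        item['title'] = title[0].strip() if title else ''
        item['content'] = content[0].strip() if content else ''
        yield item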