# Fragment of a funda.nl listings spider; the class header and imports are
# assumed, since the snippet begins mid-class.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider


class FundaSpider(CrawlSpider):
    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1, 301)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)
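
# A quick sanity check (not part of the original source) showing what the
# allow pattern above matches. Note that after interpolation the dots in
# base_url are regex wildcards and the '+' quantifies its trailing slash;
# both happen to be harmless for URLs of this shape.
import re

base_url = "http://www.funda.nl/koop/amsterdam/"
pattern = r'%s+(huis|appartement)-\d{8}' % base_url
assert re.search(pattern, "http://www.funda.nl/koop/amsterdam/huis-12345678-keizersgracht-1/")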

import re
import time

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response

from newscrawler.items import NewsItem


class NeteaseSpider(CrawlSpider):
    name = "netease"
    allowed_domains = ["163.com"]
    start_urls = ['http://tech.163.com/']

    rules = (
        Rule(LinkExtractor(allow=(r'/\d+/\d+/\d+/*', )), callback='parse_item'),
    )

    def parse_item(self, response):
        # inspect_response(response, self)
        r = response
        title = r.xpath('/html/head/title/text()').extract()[0].strip()
        source = r.xpath("//a[@id='ne_article_source']/text()").extract()[0].strip()
        content = "".join(r.xpath("//div[@id='endText']/p/text()").extract()).strip()
        raw_time = r.xpath("//div[@class='post_time_source']/text()").extract()[0]
        re_result = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", raw_time)
        if re_result:
            # Convert the matched timestamp to epoch seconds.
            ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M:%S'))
        else:
            ts = 0
        url = r.url
        # The snippet is truncated here; the item is completed with the values
        # parsed above (field names assumed).
        new_news = NewsItem(title=title, source=source, content=content,
                            time=ts, url=url)
        yield new_news
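
# A plausible sketch (assumed; newscrawler.items is not shown) of the NewsItem
# the spider fills in, with one Field per scraped value.
import scrapy

class NewsItem(scrapy.Item):
    title = scrapy.Field()
    source = scrapy.Field()
    content = scrapy.Field()
    time = scrapy.Field()
    url = scrapy.Field()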

# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapyuniversal.items import *
from scrapyuniversal.loaders import *


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # Article detail pages within the listing column.
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # Pagination: follow the "下一页" ("next page") link.
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                         re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        # "来源" means "source" in the page byline.
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
        loader.add_value('website', '中华网')  # china.com's Chinese name
        yield loader.load_item()
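
# A plausible sketch (assumed; scrapyuniversal.loaders is not shown) of the
# ChinaLoader used above: an ItemLoader that takes the first extracted value
# by default, while joining and stripping the multi-part text fields.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose

class NewsLoader(ItemLoader):
    default_output_processor = TakeFirst()

class ChinaLoader(NewsLoader):
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())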

# Fragment of imagebot's init.process_kwargs; the signature is inferred from
# the init.process_kwargs(self, kwargs) call in ImageSpider below, and the
# code before the first line shown here is truncated in the source.
import logging as log
from multiprocessing import Pipe, Process

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


def process_kwargs(bot, kwargs):
    ...  # earlier body truncated in the source
    if not kwargs['no_cdns']:
        bot.allowed_image_domains.extend(cdns)
    log.debug('allowed image domains: \n' + ', '.join(bot.allowed_image_domains))

    bot._jobname = bot.allowed_domains[0]
    jobname = kwargs.get('jobname', None)
    if jobname:
        bot._jobname = jobname

    # Restrict the crawl to URLs under the start URLs.
    stay_under = kwargs.get('stay_under', None)
    if stay_under:
        bot.rules = ()
        for start_url in kwargs['start_urls']:
            bot.rules += (Rule(LinkExtractor(allow=(start_url + '.*',)), callback='parse_item', follow=True),)
        log.debug('staying under: %s' % kwargs['start_urls'])

    if kwargs['url_regex']:
        regex_rule = (Rule(LinkExtractor(allow=kwargs['url_regex'],), callback='parse_item', follow=True),)
        if stay_under:
            bot.rules += regex_rule
        else:
            bot.rules = regex_rule

    # Optionally launch a monitor in a separate process, connected to the
    # spider by a multiprocessing Pipe.
    if kwargs['monitor']:
        try:
            bot._inpipe, outpipe = Pipe()
            mon_start_func = get_monitor()
            monitor_process = Process(target=mon_start_func, args=(outpipe,))
            monitor_process.start()
        except MonitorException:
            pass  # handler body is truncated in the source
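
# A minimal, self-contained sketch (not from imagebot) of the Pipe/Process
# wiring used above: the spider keeps one end of the pipe, and a separate
# monitor process reads reports from the other end.
from multiprocessing import Pipe, Process

def monitor(outpipe):
    # Read reports from the spider until the None sentinel arrives.
    while True:
        msg = outpipe.recv()
        if msg is None:
            break
        print('monitor got:', msg)

if __name__ == '__main__':
    inpipe, outpipe = Pipe()
    proc = Process(target=monitor, args=(outpipe,))
    proc.start()
    inpipe.send({'images_downloaded': 1})
    inpipe.send(None)
    proc.join()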

# Fragment of a CrawlSpider that logs in before crawling property pages; the
# class header, the opening of start_requests, and the imports are truncated
# in the source and reconstructed here (class name and items module assumed).
from scrapy.http import FormRequest, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule

from properties.items import PropertiesItem  # module path assumed


class LoginSpider(CrawlSpider):

    def start_requests(self):
        return [
            Request(
                "http://web:9312/dynamic/nonce",
                callback=self.parse_welcome)
        ]

    # Post welcome page's first form with the given user/pass
    def parse_welcome(self, response):
        return FormRequest.from_response(
            response,
            formdata={"user": "user", "pass": "pass"}
        )

    # Rules for horizontal (next-page links) and vertical (item pages) crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item')
    )

    def parse_item(self, response):
        """ This function parses a property page.

        The @url/@returns/@scrapes lines below are Scrapy contracts, run
        with the "scrapy check" command.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)
        # Load fields using XPath expressions
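        # A hedged completion (the source is truncated here): loader calls
        # that would satisfy the @scrapes contract above. The XPaths and the
        # socket/datetime imports are illustrative assumptions.
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()')
        l.add_xpath('price', './/*[@itemprop="price"][1]/text()', re='[0-9,.]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()')
        l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()')
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src')
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()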

import logging as log
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from mutils.web.urls import AbsUrl
from imagebot.items import ImageItem
import imagebot.spiders.init as init


class ImageSpider(CrawlSpider):
    name = 'imagebot'
    allowed_domains = []
    start_urls = []

    # By default allow all URLs.
    rules = (Rule(LinkExtractor(allow=('.*', )), callback='parse_item', follow=True),)

    def __init__(self, **kwargs):
        init.process_kwargs(self, kwargs)
        ImageSpider.allowed_domains = self.allowed_domains
        super(ImageSpider, self).__init__(**kwargs)

    # Overridden to enable crawling of just one page by setting follow=False;
    # _parse_response is CrawlSpider's internal dispatch method.
    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=(not self._start_url_only))

    def parse_start_url(self, response):
        return self.parse_item(response)
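
    # A hedged sketch (assumed; the source does not show parse_item): collect
    # <img> URLs into an ImageItem. Field names are illustrative only.
    def parse_item(self, response):
        item = ImageItem()
        item['image_urls'] = response.xpath('//img/@src').extract()
        item['referer'] = response.url
        return item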

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DmozSpider(CrawlSpider):
    """Follow categories and extract links."""

    name = 'dmoz'
    allowed_domains = ['dmoz.org']
    start_urls = ['http://www.dmoz.org/']

    rules = [
        Rule(LinkExtractor(
            restrict_css=('.top-cat', '.sub-cat', '.cat-item')
        ), callback='parse_directory', follow=True),
    ]

    def parse_directory(self, response):
        for div in response.css('.title-and-desc'):
            yield {
                'name': div.css('.site-title::text').extract_first(),
                # default='' guards against entries with no description.
                'description': div.css('.site-descr::text').extract_first(default='').strip(),
                'link': div.css('a::attr(href)').extract_first(),
            }
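
# Hedged usage sketch (not in the source): a spider like this can be run from
# a plain script with Scrapy's standard CrawlerProcess API.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(DmozSpider)
process.start()  # blocks until the crawl finishes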

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class AmazonSpider(CrawlSpider):
    name = 'amazon'
    allowed_domains = ['amazon.cn']
    # start_urls = ["https://www.amazon.cn/b/ref=sr_aj?node=658394051&ajr=0"]
    # start_urls = get_start_urls_from_txt()
    start_urls = ("https://www.amazon.cn/gp/search/other/ref=sr_sa_p_lbr_one_browse-bin?rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A658394051%2Cp_6%3AA1AJ19PSB66TGU%2Cp_n_binding_browse-bin%3A2038564051&bbn=658394051&pickerToList=lbr_one_browse-bin&ie=UTF8&qid=1496321636",
                  "https://www.amazon.cn/gp/search/other/ref=sr_sa_p_lbr_one_browse-bin?rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A658394051%2Cp_n_binding_browse-bin%3A2038565051&bbn=658394051&pickerToList=lbr_one_browse-bin&ie=UTF8&qid=1496321894")

    url_xpath = '//a[contains(@class, "s-color-twister-title-link")]'
    author_xpath = '//div[@id="refinementList"]'
    nextPage_xpath = '//a[@id="pagnNextLink"]'
    book_class_xpath = '//div[contains(@class, "browseBox")]/ul[2]'

    rules = (
        # Rule(LinkExtractor(restrict_xpaths=(author_xpath,)), follow=True),
        # Rule(LinkExtractor(restrict_xpaths=(book_class_xpath,)), follow=True),
        # Rule(LinkExtractor(allow=("b/ref.*",), restrict_xpaths=(book_class_xpath,)), follow=True),
        Rule(LinkExtractor(restrict_xpaths=(nextPage_xpath,)), follow=True, callback="parse_search_result"),
        # Rule(LinkExtractor(deny=("b/ref.*",), restrict_xpaths=(url_xpath,)), callback="parse_item"),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_starturls)

    def parse_starturls(self, response):
        # url_list = response.xpath('//div[@class="categoryRefinementsSection"]/ul/li/a/@href').extract()[1:]
        # url_list = response.xpath('//div[contains(@class, "browseBox")]/ul[2]/li/a/@href').extract()
        url_list = response.xpath('//li/span[@class="a-list-item"]/a/@href').extract()
        home_url = "https://www.amazon.cn"
        # The extracted hrefs are relative; prepend the site root.
        url_list = [home_url + u for u in url_list]
        for url in url_list:
            yield scrapy.Request(url, callback=self.parse_search_result)
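
    # A hedged sketch (the source never defines parse_search_result, which the
    # Rule and parse_starturls above use as callback); the XPath and yielded
    # fields are illustrative only.
    def parse_search_result(self, response):
        for title in response.xpath('//h2//text()').extract():
            yield {'title': title.strip(), 'page': response.url}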

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from RSpider.items import TestLoader


class BaseSpider(CrawlSpider):
    name = 'Base'
    allowed_domains = ['dmoz.org']
    start_urls = ['http://www.dmoz.org/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@id="catalogs"]')),
        Rule(LinkExtractor(restrict_xpaths='//ul[@class="directory dir-col"]'), callback='parse_directory', follow=True)
    )

    def parse_directory(self, response):
        for li in response.css('ul.directory-url > li'):
            tl = TestLoader(selector=li)
            tl.add_css('name', 'a::text')
            tl.add_css('description', '::text')
            tl.add_css('link', 'a::attr(href)')
            tl.add_value('url', response.url)
            yield tl.load_item()

    def spider_closed(self):
        pass
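
    # Hedged addition (not in the source): a method named spider_closed is not
    # called automatically; it must be connected to the spider_closed signal,
    # e.g. via the standard from_crawler pattern.
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        from scrapy import signals
        spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider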