How to use the scrapy.linkextractors.LinkExtractor class in Scrapy

To help you get started, we’ve selected a few examples based on popular ways LinkExtractor is used in public Scrapy projects.
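
As a quick orientation before the project snippets, here is a minimal sketch of LinkExtractor used on its own; the URL and HTML body below are placeholders invented for illustration. You construct the extractor with filtering arguments such as allow, then call extract_links() on a response to get Link objects.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Placeholder response; in a spider you would receive this in a callback.
response = HtmlResponse(
    url="http://example.com/catalog/",
    body=b'<html><body>'
         b'<a href="/item/1.html">Item 1</a>'
         b'<a href="/about">About</a>'
         b'</body></html>',
    encoding="utf-8",
)

# Keep only links whose URL matches the allow pattern.
link_extractor = LinkExtractor(allow=r"/item/\d+\.html")
for link in link_extractor.extract_links(response):
    print(link.url, link.text)  # each Link carries url, text, fragment, nofollow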

github khpeek/funda-scraper: funda/spiders/funda_spider.py
def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1,301)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)

github BillBillBillBill/NewsCrawler: newscrawler/newscrawler/spiders/netease.py
import re
import time
from datetime import datetime
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from newscrawler.items import NewsItem


class NeteaseSpider(CrawlSpider):
    name = "netease"
    allowed_domains = ["163.com"]
    start_urls = ['http://tech.163.com/']

    rules = (
        Rule(LinkExtractor(allow=(r'/\d+/\d+/\d+/*', )), callback='parse_item'),
    )

    def parse_item(self, response):
        # inspect_response(response, self)
        r = response
        title = r.xpath('/html/head/title/text()').extract()[0].strip()
        source = r.xpath("//a[@id='ne_article_source']/text()").extract()[0].strip()
        content = "".join(r.xpath("//div[@id='endText']/p/text()").extract()).strip()
        raw_time = r.xpath("//div[@class='post_time_source']/text()").extract()[0]
        re_result = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", raw_time)
        if re_result:
            ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M:%S'))
        else:
            ts = 0
        url = r.url
        new_news = NewsItem(

github Python3WebSpider/ScrapyUniversal: scrapyuniversal/spiders/china.py
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapyuniversal.items import *
from scrapyuniversal.loaders import *


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']
    
    rules = (
        Rule(LinkExtractor(allow=r'article\/.*\.html', restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))  # "下一页" means "next page"
    )
    
    def parse_item(self, response):
        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')  # "来源" means "source"
        loader.add_value('website', '中华网')  # site name: 中华网 (china.com)
        yield loader.load_item()

github amol9/imagebot: imagebot/spiders/init.py
	if not kwargs['no_cdns']:
		bot.allowed_image_domains.extend(cdns)

	log.debug('allowed image domains: \n' + ', '.join(bot.allowed_image_domains))

	bot._jobname = bot.allowed_domains[0]

	jobname = kwargs.get('jobname', None)
	if jobname:
		bot._jobname = jobname

	stay_under = kwargs.get('stay_under', None)
	if stay_under:
		bot.rules = ()
		for start_url in kwargs['start_urls']:
			bot.rules += (Rule(LinkExtractor(allow=(start_url + '.*',)), callback='parse_item', follow=True),)
		log.debug('staying under: %s'%start_urls)

	if kwargs['url_regex']:
		regex_rule = (Rule(LinkExtractor(allow=kwargs['url_regex'],), callback='parse_item', follow=True),)
		if stay_under:
			bot.rules += regex_rule
		else:
			bot.rules = regex_rule

	if kwargs['monitor']:
		try:
			bot._inpipe, outpipe = Pipe()
			mon_start_func = get_monitor()
			monitor_process = Process(target=mon_start_func, args=(outpipe,))
			monitor_process.start()
		except MonitorException:

github scalingexcellence/scrapybook: ch05/properties/properties/spiders/noncelogin.py
Request(
                "http://web:9312/dynamic/nonce",
                callback=self.parse_welcome)
        ]

    # Post welcome page's first form with the given user/pass
    def parse_welcome(self, response):
        return FormRequest.from_response(
            response,
            formdata={"user": "user", "pass": "pass"}
        )

    # Rules for horizontal and vertical crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')),
        Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'),
             callback='parse_item')
    )

    def parse_item(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions

github amol9/imagebot: imagebot/spiders/bot.py
import logging as log
import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from mutils.web.urls import AbsUrl

from imagebot.items import ImageItem
import imagebot.spiders.init as init


class ImageSpider(CrawlSpider):
	name = 'imagebot'
	allowed_domains = []
	start_urls = []

	#by default allow all urls
	rules = (Rule(LinkExtractor(allow=('.*', )), callback='parse_item', follow=True),)


	def __init__(self, **kwargs):
		init.process_kwargs(self, kwargs)
		ImageSpider.allowed_domains = self.allowed_domains

		super(ImageSpider, self).__init__(**kwargs)


	#overridden to enable crawling of just one page by setting follow=False
	def parse(self, response):
		return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=(not self._start_url_only))
	
	
	def parse_start_url(self, response):
		return self.parse_item(response)

github 1Tian-zhang/easyspider: next/slave/spider/easyspider/spiders/dmoz.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DmozSpider(CrawlSpider):
    """Follow categories and extract links."""
    name = 'dmoz'
    allowed_domains = ['dmoz.org']
    start_urls = ['http://www.dmoz.org/']

    rules = [
        Rule(LinkExtractor(
            restrict_css=('.top-cat', '.sub-cat', '.cat-item')
        ), callback='parse_directory', follow=True),
    ]

    def parse_directory(self, response):
        for div in response.css('.title-and-desc'):
            yield {
                'name': div.css('.site-title::text').extract_first(),
                'description': div.css('.site-descr::text').extract_first().strip(),
                'link': div.css('a::attr(href)').extract_first()
            }

github xchaoinfo/Book-spider: BookSpider/spiders/amazon.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AmazonSpider(CrawlSpider):
    name = 'amazon'
    allowed_domains = ['amazon.cn']
    # start_urls = ["https://www.amazon.cn/b/ref=sr_aj?node=658394051&ajr=0"]
    # start_urls = get_start_urls_from_txt()
    start_urls = ("https://www.amazon.cn/gp/search/other/ref=sr_sa_p_lbr_one_browse-bin?rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A658394051%2Cp_6%3AA1AJ19PSB66TGU%2Cp_n_binding_browse-bin%3A2038564051&bbn=658394051&pickerToList=lbr_one_browse-bin&ie=UTF8&qid=1496321636",
        "https://www.amazon.cn/gp/search/other/ref=sr_sa_p_lbr_one_browse-bin?rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A658394051%2Cp_n_binding_browse-bin%3A2038565051&bbn=658394051&pickerToList=lbr_one_browse-bin&ie=UTF8&qid=1496321894")
    url_xpath = '//a[contains(@class, "s-color-twister-title-link")]'
    author_xpath = '//div[@id="refinementList"]'
    nextPage_xpath = '//a[@id="pagnNextLink"]'
    book_class_xpath = '//div[contains(@class, "browseBox")]/ul[2]'
    rules = (
        # Rule(LinkExtractor(restrict_xpaths=(author_xpath,)), follow=True),
        # Rule(LinkExtractor(restrict_xpaths=(book_class_xpath,)), follow=True),
        # Rule(LinkExtractor(allow=("b/ref.*",), restrict_xpaths=(book_class_xpath,)), follow=True),
        Rule(LinkExtractor(restrict_xpaths=(nextPage_xpath,)), follow=True, callback="parse_search_result"),
        # Rule(LinkExtractor(deny=("b/ref.*",), restrict_xpaths=(url_xpath,)), callback="parse_item"),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_starturls)

    def parse_starturls(self, response):
        # url_list = response.xpath('//div[@class="categoryRefinementsSection"]/ul/li/a/@href').extract()[1:]
        # url_list = response.xpath('//div[contains(@class, "browseBox")]/ul[2]/li/a/@href').extract()
        url_list = response.xpath('//li/span[@class="a-list-item"]/a/@href').extract()
        home_url = "https://www.amazon.cn"
        url_list = [home_url + u for u in url_list]
        for url in url_list:
            yield scrapy.Request(url, callback=self.parse_search_result)

github KDF5000/RSpider: RSpider/spiders/Base.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from RSpider.items import TestLoader


class BaseSpider(CrawlSpider):
    name = 'Base'
    allowed_domains = ['dmoz.org']
    start_urls = ['http://www.dmoz.org/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@id="catalogs"]')),
        Rule(LinkExtractor(restrict_xpaths='//ul[@class="directory dir-col"]'), callback='parse_directory', follow=True)
    )

    def parse_directory(self, response):
        for li in response.css('ul.directory-url > li'):
            tl = TestLoader(selector=li)
            tl.add_css('name', 'a::text')
            tl.add_css('description', '::text')
            tl.add_css('link', 'a::attr(href)')
            tl.add_value('url', response.url)
            yield tl.load_item()

    def spider_closed(self):
        pass
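
All of the projects above use the same CrawlSpider pattern: each Rule pairs a LinkExtractor with an optional callback and follow flag. The sketch below is not taken from any of these projects; the domain, URL patterns, and selectors are placeholders, chosen only to show the constructor arguments that appear in the snippets (allow, deny, restrict_xpaths, restrict_css).

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/articles/']

    rules = (
        # Follow pagination links found via a placeholder CSS selector;
        # with no callback, follow defaults to True.
        Rule(LinkExtractor(restrict_css='.next-page')),
        # Extract article pages by URL pattern, skip print versions, and
        # only look inside a placeholder listing container.
        Rule(LinkExtractor(allow=r'/articles/\d+\.html',
                           deny=r'/print/',
                           restrict_xpaths='//div[@id="listing"]'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Minimal item: page title and URL.
        yield {
            'title': response.css('title::text').get(),
            'url': response.url,
        }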