How to use the scrapy.Request class in Scrapy

To help you get started, we’ve selected a few scrapy.Request examples based on popular ways it is used in public projects.

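Before the project snippets below, here is a minimal, self-contained sketch of the scrapy.Request arguments these examples lean on most (url, callback, headers, cb_kwargs, dont_filter). The spider name and URL are placeholders, not taken from any of the projects listed.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "request_example"  # placeholder spider, for illustration only

    def start_requests(self):
        # url and callback appear in almost every snippet below; the rest are optional.
        yield scrapy.Request(
            url="https://example.com/listing",      # placeholder URL
            callback=self.parse_listing,
            headers={"User-Agent": "Mozilla/5.0"},  # per-request headers
            cb_kwargs={"page": 1},                  # extra kwargs handed to the callback (Scrapy 1.7+)
            dont_filter=True,                       # skip the duplicate-request filter
        )

    def parse_listing(self, response, page):
        self.logger.info("parsed page %d: %s", page, response.url)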

github aiqm / torchani / tools / generate-unit-test-expect / nist-dataset / nist.py
def start_requests(self):
        start_url = urltemplate.format(min_weight, max_weight)
        yield scrapy.Request(
            url=start_url,
            callback=lambda x: self.parse_range(x, min_weight, max_weight))
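One caveat with the lambda callback above: Scrapy can only serialize requests to disk (for example when pausing and resuming a crawl with JOBDIR) if the callback is a method of the spider, so lambda and functools.partial callbacks are best avoided when crawl persistence matters.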
github alltheplaces / alltheplaces / locations / spiders / supercuts.py
def parse_locations(self, response):
        locations = [x.xpath("@href").extract_first() for x in response.css("a[href*='locations/']")]
        salon_base = "https://info3.regiscorp.com/salonservices/siteid/1/salon/{}"
        for location in locations:
            salon_id = location.strip(".html").split("-")[-1]
            if salon_id.isnumeric():
                yield scrapy.Request(salon_base.format(salon_id),
                                     callback=partial(self.parse_result, trunk={"url": response.urljoin(location)}))
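The functools.partial wrapper above (the snippet assumes from functools import partial) is one way to hand extra data to a callback; since Scrapy 1.7 the same thing is usually written with cb_kwargs. A rough sketch of the equivalent, not the actual alltheplaces code:

import scrapy


class SalonSketchSpider(scrapy.Spider):
    name = "salon_sketch"  # hypothetical spider, for illustration only

    def parse_locations(self, response):
        for location in response.css("a[href*='locations/']::attr(href)").getall():
            yield scrapy.Request(
                response.urljoin(location),
                callback=self.parse_result,
                cb_kwargs={"trunk": {"url": response.urljoin(location)}},
            )

    def parse_result(self, response, trunk):
        # trunk arrives as a plain keyword argument, no partial() needed
        self.logger.info("parsing %s, linked from %s", response.url, trunk["url"])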
github donnki / ScrapyImage / yesky / yesky / spiders / beauty.py
            for x in xrange(1, imgCount+1):
                suffix = ".shtml"
                if x > 1:
                    suffix = "_" + str(x) + ".shtml"
                request = scrapy.Request(url + suffix, callback=self.parse_item, cookies={'title': title})
                yield request
        # read the next listing page
        print(response.url)
        selector = response.xpath('//div[@class="flym"]/*')
        last = selector[len(selector)-1].xpath("a")
        if len(last) > 0:
            nextPage = self.baseURL + last[0].xpath("@href").extract()[0]
            tmp = nextPage.split("_")[2]
            tmp = int(tmp.split(".")[0])
            if self.endPage == None or tmp <= self.endPage:
                request2 = scrapy.Request(nextPage, callback=self.parse)
                yield request2
github arthurmmm / hq-proxies / proxy_spider / spiders / proxy_spider.py
            if len(td_list) < 3:
                continue
            ipaddr = td_list[0].extract()
            port = td_list[1].extract()
            proto = td_list[5].extract()
            latency = tr.css('div.bar::attr(title)').extract_first()
            # the title text ends with the Chinese character for "seconds", e.g. "1.23秒"
            latency = re.match('(\d+\.\d+)秒', latency).group(1)
            proxy = '%s://%s:%s' % (proto, ipaddr, port)
            proxies = {proto: '%s:%s' % (ipaddr, port)}
            if float(latency) > 3:
                logger.info('Discarding slow proxy: %s, latency %ss' % (proxy, latency))
                continue
            logger.info('Validating: %s' % proxy)
            if not self.redis_db.sismember(self.PROXY_SET, proxy):
                vaurl, vastart = random.choice(list(self.validator_pool))
                yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True)
            else:
                logger.info('This proxy is already recorded..')
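The meta={'proxy': ...} key on the request above is how Scrapy's built-in HttpProxyMiddleware is told to send a single request through a specific proxy. A minimal sketch of that mechanism on its own, with a placeholder proxy address and validation URL:

import scrapy


class ProxyCheckSketch(scrapy.Spider):
    name = "proxy_check_sketch"  # hypothetical spider, for illustration only

    def start_requests(self):
        proxy = "http://203.0.113.10:8080"  # placeholder proxy address
        yield scrapy.Request(
            url="https://httpbin.org/ip",   # placeholder validation endpoint
            meta={"proxy": proxy},          # honoured by HttpProxyMiddleware
            callback=self.checkin,
            dont_filter=True,               # the same URL is re-fetched for every proxy
        )

    def checkin(self, response):
        self.logger.info("proxy responded with: %s", response.text)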
github TPF2017 / Scrapy_crawl_qichacha / QCC / spiders / qcc.py
def parse_firm_url(self, response):
        url = 'https://www.qichacha.com'
        url_lst = response.css('.panel-default').css('a').xpath('@href').extract()
        for i in range(10):
            time.sleep(2)
            yield scrapy.Request(
                url=url + url_lst[i],
                headers=self.headers_next(),
                cookies=self.cookie_next())
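A note on the time.sleep(2) above: it blocks Scrapy's event loop, pausing every in-flight request rather than just this one. The usual way to slow a spider down is the DOWNLOAD_DELAY setting, for example via custom_settings (a sketch, not part of the original spider):

import scrapy


class QccSketchSpider(scrapy.Spider):
    name = "qcc_sketch"  # hypothetical spider, for illustration only
    custom_settings = {
        "DOWNLOAD_DELAY": 2,               # wait roughly two seconds between requests
        "RANDOMIZE_DOWNLOAD_DELAY": True,  # add jitter to the delay
    }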
github MaLei666 / Spider / taobao / taobao / spiders / taobao.py
        headers3 = {  # presumably the headers3 dict used in the request below (the snippet starts mid-definition)
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
        }
        page=Selector(response)
        goods_urls=page.xpath('//div[@class="grid g-clearfix"]/div[@class="items"]/div/div[3]/div[2]/a/@href').extract()
        # goods_class=page.xpath('//div[@class="grid g-clearfix"]/div[@class="items"]/div[1]/div[3]/div[2]/a/span[@class="H"]/text()').extract()
        goods_class=response.meta['sub_nav']
        areas = page.xpath('//div[@class="row row-3 g-clearfix"]/div[@class="location"]/text()').extract()
        sell_counts=page.xpath('//div[@class="deal-cnt"]/text()').extract()

        # print(goods_urls)
        for i in range(0,len(goods_urls)):
            area=areas[i]
            sell_count=sell_counts[i]
            goods_url='http:'+goods_urls[i]
            yield scrapy.Request(goods_url,self.parse2,headers=headers3,dont_filter=True,
                                 meta={'goods_url':goods_url,
                                       'goods_class':goods_class,
                                       'area':area,
                                       'sell_count':sell_count})
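The meta dict attached here travels with the request and reappears on the response, so the receiving callback (parse2 in this spider) would read those fields roughly like this (a sketch of the receiving side only, not the original code):

def parse2(self, response):
    goods_url = response.meta['goods_url']
    goods_class = response.meta['goods_class']
    area = response.meta['area']
    sell_count = response.meta['sell_count']
    # ... extract the remaining fields from the product page and yield the item ...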
github invanalabs / invana-bot / invana_bot_old / spiders / default.py
                for url in all_urls:
                    url = get_absolute_url(url=url, origin_url=response.url)
                    if get_domain(url) == current_domain:
                        filtered_urls.append(url)
                filtered_urls = list(set(filtered_urls))
                # max_pages = traversal.get("max_pages", 100)
                #  implementing max_pages is difficult cos it keeps adding
                # new 100 pages in each thread.
                current_page_count = response.meta.get('current_page_count', 1)
                next_crawler_id = traversal['next_crawler_id']
                next_parser = get_crawler_from_list(crawler_id=next_crawler_id, crawlers=crawlers)

                for url in filtered_urls:
                    current_page_count = current_page_count + 1

                    yield scrapy.Request(
                        url, callback=self.parse,
                        meta={
                            "current_page_count": current_page_count,
                            "current_crawler": next_parser,
                            "crawlers": crawlers
                        }
                    )
        self.post_parse(response=response)
github g0v / tw-rental-house-data / scrapy-package / scrapy_twrh / spiders / rental591 / rental591_spider.py
def start_requests(self):
        # 591 require a valid session to start request, #27
        yield scrapy.Request(
            url=SESSION_ENDPOINT,
            dont_filter=True,
            callback=self.handle_session_init,
        )
github scrapinghub / scrapy-training / extras / itemloaders_example / itemloaders_example / spiders / spider_4_custom_itemloader.py
def parse(self, response):
        for quote in response.css('div.quote'):
            # have a look at the itemloaders.py file to see how we've defined
            # the input and output processors for the QuoteLoader class.
            il = QuoteLoader(item=QuoteItem(), selector=quote)
            il.add_css('text', 'span.text::text')
            il.add_css('author_name', 'small.author::text')
            il.add_css('tags', 'a.tag::text')
            il.add_value('url', response.url)
            yield il.load_item()

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
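For the pagination step at the end, newer Scrapy code often uses response.follow, which resolves relative URLs itself, so the urljoin call becomes unnecessary. A sketch of the equivalent tail of the parse method:

def parse(self, response):
    # ... item extraction as above ...
    next_page = response.css("li.next > a::attr(href)").get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)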
github alltheplaces / alltheplaces / locations / spiders / jimmy_johns.py
def parse_cities(self, response):
        cities = json.loads(response.body)
        for city in cities['d']:
            current_city = json.dumps({ 'state': response.meta['state'], 'city': city })
            request = scrapy.Request(
                STORES,
                method='POST',
                body=current_city,
                headers=HEADERS,
                callback=self.parse
            )
            yield request
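Because the body posted here is a JSON string, the same request is often written with scrapy.http.JsonRequest (available since Scrapy 1.7), which serializes the payload, sets the Content-Type header, and defaults to a POST when data is given. A sketch that keeps the STORES and HEADERS names from the snippet above:

import json

from scrapy.http import JsonRequest


def parse_cities(self, response):
    cities = json.loads(response.body)
    for city in cities['d']:
        yield JsonRequest(
            STORES,           # same endpoint constant as in the snippet above
            data={'state': response.meta['state'], 'city': city},
            headers=HEADERS,  # same shared headers as in the snippet above
            callback=self.parse,
        )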