How to use the scrapy.http.Request function in Scrapy

To help you get started, we’ve selected a few examples of scrapy.http.Request based on popular ways it is used in public projects.
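Before the project examples, here is a minimal, self-contained sketch of the basic pattern they all build on: constructing a Request with a URL and a callback inside a spider. The spider name and URL below are placeholders, not taken from any of the projects.

import scrapy
from scrapy.http import Request


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate Request construction.
    name = "example"

    def start_requests(self):
        # The callback receives the Response produced by this Request.
        yield Request("http://example.com/", callback=self.parse)

    def parse(self, response):
        # response.request is the Request object that produced this response.
        self.logger.info("Fetched %s", response.url)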


github scrapy / scrapy / tests / test_pipeline_media.py View on Github
def _assert_request_no3xx(self, pipeline_class, settings):
        pipe = pipeline_class(settings=Settings(settings))
        request = Request('http://url')
        pipe._modify_media_request(request)

        self.assertIn('handle_httpstatus_list', request.meta)
        for status, check in [
                (200, True),

                # These are the status codes we want
                # the downloader to handle itself
                (301, False),
                (302, False),
                (303, False),
                (307, False),
                (308, False),

                # we still want to get 4xx and 5xx
                (400, True),
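
The test above verifies that the media pipeline puts a handle_httpstatus_list into request.meta that excludes 3xx codes, so the downloader keeps handling redirects itself. A rough sketch of how that meta key is used on an ordinary Request (the URL and status codes here are illustrative):

from scrapy.http import Request

# Let the spider callback receive 404 and 500 responses instead of having
# HttpErrorMiddleware filter them out; 3xx codes are left out so redirects
# are still followed by the downloader.
req = Request(
    "http://example.com/maybe-missing",
    meta={"handle_httpstatus_list": [404, 500]},
)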
github scrapedia / scrapy-cookies / tests / test_downloadermiddleware_cookies.py View on Github
# embed C1 and C3 for scrapytest.org/foo
        req = Request("http://scrapytest.org/foo")
        self.mw.process_request(req, self.spider)
        assert req.headers.get("Cookie") in (
            b"C1=value1; C3=value3",
            b"C3=value3; C1=value1",
        )

        # embed C2 for scrapytest.org/bar
        req = Request("http://scrapytest.org/bar")
        self.mw.process_request(req, self.spider)
        self.assertEqual(req.headers.get("Cookie"), b"C2=value2")

        # embed nothing for scrapytest.org/baz
        req = Request("http://scrapytest.org/baz")
        self.mw.process_request(req, self.spider)
        assert "Cookie" not in req.headers
github scrapy / scrapy / tests / test_spidermiddleware_httperror.py View on Github
def start_requests(self):
        for url in self.start_urls:
            yield Request(url, self.parse, errback=self.on_error)
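
The errback above is called with a twisted Failure when the request fails. A minimal sketch of what such a handler can look like (the spider name, URL and log message are invented):

import scrapy
from scrapy.http import Request


class ErrbackSpider(scrapy.Spider):
    name = "errback_example"
    start_urls = ["http://example.com/"]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        yield {"url": response.url, "status": response.status}

    def on_error(self, failure):
        # failure.request is the Request that failed (DNS error, timeout, ...)
        self.logger.error("Request failed: %s", failure.request.url)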
github OlivierBlanvillain / crawler / bibcrawl / spiders / rsscrawl.py View on Github
      lambda url: Request(
        url=url,
        callback=self.bufferEntries,
        errback=self.bufferEntries,
        dont_filter=True,
        # meta={ "u": url } is here to keep a "safe" copy of the source url.
        # I don't trust response.url == (what was passed as Request url).
        meta={ "u": url }),
      self.contentExtractor.getRssLinks())
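
This snippet disables duplicate filtering and keeps the original URL in meta so the callback can still see it after redirects. A condensed sketch of the same idea as a stand-alone helper (the function name is made up):

from scrapy.http import Request


def rss_requests(urls, callback):
    for url in urls:
        # dont_filter=True bypasses the scheduler's duplicate filter; meta
        # carries the URL exactly as requested, which can differ from
        # response.url after redirects.
        yield Request(
            url=url,
            callback=callback,
            errback=callback,
            dont_filter=True,
            meta={"u": url},
        )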
github SylvanasSun / scrapy-picture-spider / deviant_art / deviant_art_spider / spiders / deviant_art_image_spider.py View on Github
def parse_detail_page(self, response):
        if response.url in self.filter:
            self.logger.debug('[REPETITION] already parsed url %s ' % response.url)
            return None
        soup = self._init_soup(response, '[PREPARING DETAIL PAGE]')
        if soup is None:
            return None
        yield self.packing_item(response.meta['item'], soup)
        self.filter.add(response.url)
        # continue search more detail page of current page link
        all_div_tag = soup.find_all('div', class_='tt-crop thumb')
        if all_div_tag is not None and len(all_div_tag) > 0:
            for div_tag in all_div_tag:
                detail_link = div_tag.find('a')['href']
                request = Request(
                    url=detail_link,
                    headers=self.headers,
                    callback=self.parse_detail_page
                )
                request.meta['item'] = DeviantArtSpiderItem()
                yield request
        else:
            self.logger.debug('[PARSE FAILED] get <div> tag failed')
            return None
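
The spider above forwards custom headers and hands an item to the next callback through request.meta. A stripped-down sketch of that hand-off (spider name, selector and item fields are placeholders); on Scrapy 1.7+ the cb_kwargs argument is the more explicit alternative to meta for this purpose.

import scrapy
from scrapy.http import Request


class DetailSpider(scrapy.Spider):
    # Hypothetical spider showing how to pass data to a detail-page callback.
    name = "detail_example"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        for href in response.css("div.thumb a::attr(href)").getall():
            request = Request(url=href, callback=self.parse_detail)
            # meta-based hand-off, as in the snippet above
            request.meta["item"] = {"source": response.url}
            yield request

    def parse_detail(self, response):
        item = response.meta["item"]
        item["detail_url"] = response.url
        yield item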
github scrapy / scrapy / scrapy / trunk / scrapy / contrib / pipeline / s3images.py View on Github
def s3_request(self, key, method, body=None, headers=None):
        url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket_name, self.key_prefix, key)
        req = Request(url, method=method, body=body, headers=headers)
        return req
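
Here Request is used for a non-GET call by setting method, body and headers explicitly. A minimal sketch of the same construction (bucket name and payload are placeholders):

from scrapy.http import Request

payload = b"example file contents"

# Scrapy requests are not limited to GET: method, body and headers can all
# be set directly on the Request.
req = Request(
    "http://example-bucket.s3.amazonaws.com/prefix/key.jpg",
    method="PUT",
    body=payload,
    headers={"Content-Type": "image/jpeg"},
)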
github nyov / scrapyext / scrapyext / spiders / mixin / logoutmixin.py View on Github
def _spider_logout(self, spider):
		if spider != self: return
		if self.logged_out: return
		request = self.logout()
		if not isinstance(request, Request): return
		self.crawler.engine.schedule(request, spider)
		raise DontCloseSpider('logout scheduled')
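
The mixin expects logout() to return a Request, which it then schedules before the spider is allowed to close. A sketch of what such a logout() might look like on a spider (URL and callback are invented); note that the engine API used on the last line of the mixin has changed across Scrapy versions, so the scheduling call may need adjusting on newer releases.

import scrapy
from scrapy.http import Request


class LogoutSpider(scrapy.Spider):
    # Hypothetical spider providing the logout() hook the mixin calls.
    name = "logout_example"
    logged_out = False

    def logout(self):
        # dont_filter ensures the logout request is not dropped even if the
        # URL was already seen during the crawl.
        return Request(
            "http://example.com/accounts/logout",
            callback=self.after_logout,
            dont_filter=True,
        )

    def after_logout(self, response):
        self.logged_out = True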
github parul1931 / walmart / walmart_spider / walmart_spider / spiders / target.py View on Github
def parse(self, response):
		categories = response.xpath(
			'//ul[@class="innerCol"]/li/a/@href').re('N-(.*)#')

		for category in categories:
			new_meta = response.meta.copy()
			new_meta['category'] = category
			new_meta['next_page'] = 2
			new_meta['index'] = new_meta['next_page']*60

			yield Request(url=self.JSON_SEARCH_URL.format(category=category,
														  page=1,
														  index=0),
						  meta=new_meta,
				 		  callback=self.parse_product)
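
This spider fills a URL template with str.format and copies response.meta so each category request carries its own pagination state. A stripped-down sketch of the pattern as a helper (the URL template and meta keys are placeholders):

from scrapy.http import Request

JSON_SEARCH_URL = "http://example.com/api/search?cat={category}&page={page}&offset={index}"


def category_requests(response, categories, callback):
    for category in categories:
        # copy() gives each request its own meta dict instead of sharing one
        # mutable dict across requests.
        new_meta = response.meta.copy()
        new_meta["category"] = category
        new_meta["next_page"] = 2
        yield Request(
            url=JSON_SEARCH_URL.format(category=category, page=1, index=0),
            meta=new_meta,
            callback=callback,
        )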
github cytopia / crawlpy / crawlpy / spiders / crawlpy_spider.py View on Github
# Do not start a request on error,
        # simply return nothing and quit scrapy
        if self.abort:
            return

        logging.info('All set, start crawling with depth: ' + str(self.max_depth))

        # Do a login
        if self.config['login']['enabled']:
            # Start with login first
            logging.info('Login required')
            return Request(url=self.login_url, callback=self.login)
        else:
            # Start with the parse function
            logging.info('No login required')
            return Request(url=self.base_url, callback=self.parse)
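
The snippet decides whether the crawl starts with a login request or goes straight to parsing. A compact sketch of the same branch as a generator-style start_requests, using FormRequest for a hypothetical login form (all URLs, names and credentials are placeholders):

import scrapy
from scrapy.http import FormRequest, Request


class LoginFirstSpider(scrapy.Spider):
    # Hypothetical spider; the attributes below stand in for the config checks.
    name = "login_first"
    login_url = "http://example.com/login"
    base_url = "http://example.com/"
    login_required = True

    def start_requests(self):
        if self.login_required:
            # Log in first, then continue crawling from the login callback.
            yield Request(self.login_url, callback=self.login)
        else:
            yield Request(self.base_url, callback=self.parse)

    def login(self, response):
        yield FormRequest.from_response(
            response,
            formdata={"username": "user", "password": "secret"},
            callback=self.parse,
        )

    def parse(self, response):
        self.logger.info("Crawling %s", response.url)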