def _assert_request_no3xx(self, pipeline_class, settings):
    pipe = pipeline_class(settings=Settings(settings))
    request = Request('http://url')
    pipe._modify_media_request(request)
    self.assertIn('handle_httpstatus_list', request.meta)
    for status, check in [
            (200, True),
            # These are the status codes we want
            # the downloader to handle itself
            (301, False),
            (302, False),
            (303, False),
            (307, False),
            (308, False),
            # we still want to get 4xx and 5xx
            (400, True),
            (404, True),
            (500, True)]:
        if check:
            self.assertIn(status, request.meta['handle_httpstatus_list'])
        else:
            self.assertNotIn(status, request.meta['handle_httpstatus_list'])
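The test above checks which statuses the media pipeline whitelists in request.meta['handle_httpstatus_list']: redirect codes stay out of the list so the downloader follows them itself, while 2xx/4xx/5xx are passed through. The same meta key can be set directly on an ordinary request; a minimal sketch (the URL and status list are illustrative, not taken from the test):

from scrapy import Request

# Statuses listed here are handed to the callback instead of being
# handled or filtered by the downloader middlewares (redirects, HttpError).
request = Request(
    'http://example.com/image.jpg',  # illustrative URL
    meta={'handle_httpstatus_list': [200, 400, 404, 500]},
)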
# embed C1 and C3 for scrapytest.org/foo
req = Request("http://scrapytest.org/foo")
self.mw.process_request(req, self.spider)
assert req.headers.get("Cookie") in (
    b"C1=value1; C3=value3",
    b"C3=value3; C1=value1",
)
# embed C2 for scrapytest.org/bar
req = Request("http://scrapytest.org/bar")
self.mw.process_request(req, self.spider)
self.assertEqual(req.headers.get("Cookie"), b"C2=value2")
# embed nothing for scrapytest.org/baz
req = Request("http://scrapytest.org/baz")
self.mw.process_request(req, self.spider)
assert "Cookie" not in req.headers
def start_requests(self):
    for url in self.start_urls:
        yield Request(url, self.parse, errback=self.on_error)
lambda url: Request(
    url=url,
    callback=self.bufferEntries,
    errback=self.bufferEntries,
    dont_filter=True,
    # meta={"u": url} keeps a "safe" copy of the source URL:
    # response.url is not guaranteed to equal the URL the Request
    # was made with (e.g. after redirects).
    meta={"u": url}),
self.contentExtractor.getRssLinks())
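Both fragments register an errback so failed requests are not dropped silently. A minimal sketch of such a handler, named on_error to match the first fragment (the logging detail is an assumption):

def on_error(self, failure):
    # Scrapy passes a twisted Failure; log it, and log the originating
    # URL when the failed request is attached to the failure.
    self.logger.error(repr(failure))
    request = getattr(failure, 'request', None)
    if request is not None:
        self.logger.error('Failed URL: %s', request.url)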
def parse_detail_page(self, response):
    if response.url in self.filter:
        self.logger.debug('[REPETITION] already parsed url %s' % response.url)
        return None
    soup = self._init_soup(response, '[PREPARING DETAIL PAGE]')
    if soup is None:
        return None
    yield self.packing_item(response.meta['item'], soup)
    self.filter.add(response.url)
    # continue searching for more detail pages linked from the current page
    all_div_tag = soup.find_all('div', class_='tt-crop thumb')
    if all_div_tag:
        for div_tag in all_div_tag:
            detail_link = div_tag.find('a')['href']
            request = Request(
                url=detail_link,
                headers=self.headers,
                callback=self.parse_detail_page
            )
            request.meta['item'] = DeviantArtSpiderItem()
            yield request
    else:
        self.logger.debug('[PARSE FAILED] no matching <div> tags found')
        return None
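The spider above hands a fresh item to its callback through request.meta['item']. On Scrapy 1.7+ the same hand-off can use cb_kwargs, which keeps meta free for framework keys; a minimal sketch reusing names from the snippet (detail_link, self.headers, DeviantArtSpiderItem):

request = Request(
    url=detail_link,
    headers=self.headers,
    callback=self.parse_detail_page,
    cb_kwargs={'item': DeviantArtSpiderItem()},
)
yield request

# The callback then receives the item as a keyword argument:
# def parse_detail_page(self, response, item): ...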
def s3_request(self, key, method, body=None, headers=None):
    url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket_name, self.key_prefix, key)
    req = Request(url, method=method, body=body, headers=headers)
    return req
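A hedged usage example for the helper above: build a GET request for an object and attach a callback via Request.replace (the key name and the parse_listing callback are illustrative, not part of the original code):

def start_requests(self):
    # 'data/items.json' is an illustrative key under self.key_prefix.
    req = self.s3_request('data/items.json', 'GET')
    yield req.replace(callback=self.parse_listing)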
def _spider_logout(self, spider):
    if spider != self: return
    if self.logged_out: return
    request = self.logout()
    if not isinstance(request, Request): return
    self.crawler.engine.schedule(request, spider)
    raise DontCloseSpider('logout scheduled')
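For _spider_logout to run, it has to be connected to the spider_idle signal; raising DontCloseSpider (from scrapy.exceptions) then keeps the engine alive until the scheduled logout request finishes. A minimal wiring sketch, assuming the connection is made in the spider's from_crawler:

from scrapy import signals

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super().from_crawler(crawler, *args, **kwargs)
    # Fire _spider_logout whenever the engine runs out of requests.
    crawler.signals.connect(spider._spider_logout, signal=signals.spider_idle)
    return spider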
def parse(self, response):
    categories = response.xpath(
        '//ul[@class="innerCol"]/li/a/@href').re('N-(.*)#')
    for category in categories:
        new_meta = response.meta.copy()
        new_meta['category'] = category
        new_meta['next_page'] = 2
        new_meta['index'] = new_meta['next_page'] * 60
        yield Request(url=self.JSON_SEARCH_URL.format(category=category,
                                                      page=1,
                                                      index=0),
                      meta=new_meta,
                      callback=self.parse_product)
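The copied meta dict carries pagination state ('next_page', 'index', 60 results per page) for the follow-up request. A sketch of how parse_product might consume it (the continuation and stopping logic are assumptions, not the original spider's code):

def parse_product(self, response):
    # ... extract products from the JSON payload here ...
    meta = response.meta
    next_page = meta['next_page']
    new_meta = dict(meta, next_page=next_page + 1, index=(next_page + 1) * 60)
    yield Request(url=self.JSON_SEARCH_URL.format(category=meta['category'],
                                                  page=next_page,
                                                  index=meta['index']),
                  meta=new_meta,
                  callback=self.parse_product)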
# Do not start a request on error,
# simply return nothing and quit scrapy
if self.abort:
    return
logging.info('All set, start crawling with depth: ' + str(self.max_depth))
# Do a login
if self.config['login']['enabled']:
    # Start with login first
    logging.info('Login required')
    return Request(url=self.login_url, callback=self.login)
else:
    # Start with the parse function
    logging.info('No login required')
    return Request(url=self.base_url, callback=self.parse)
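When login is enabled, crawling starts at login_url and control passes to self.login. A minimal sketch of such a callback using FormRequest.from_response (the form field names and config keys are assumptions):

from scrapy import FormRequest

def login(self, response):
    # Fill and submit the login form on the page, then continue with parse.
    return FormRequest.from_response(
        response,
        formdata={
            'username': self.config['login']['username'],  # assumed config keys
            'password': self.config['login']['password'],
        },
        callback=self.parse,
    )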