def start_requests(self):
    start_url = urltemplate.format(min_weight, max_weight)
    yield scrapy.Request(
        url=start_url,
        callback=lambda x: self.parse_range(x, min_weight, max_weight))
def parse_locations(self, response):
    locations = [x.xpath("@href").extract_first() for x in response.css("a[href*='locations/']")]
    salon_base = "https://info3.regiscorp.com/salonservices/siteid/1/salon/{}"
    for location in locations:
        # The numeric salon id is the last "-"-separated piece of the location URL.
        salon_id = location.strip(".html").split("-")[-1]
        if salon_id.isnumeric():
            # functools.partial carries the original page URL through to parse_result.
            yield scrapy.Request(salon_base.format(salon_id),
                                 callback=partial(self.parse_result, trunk={"url": response.urljoin(location)}))
for x in range(1, imgCount + 1):
    suffix = ".shtml"
    if x > 1:
        suffix = "_" + str(x) + ".shtml"
    request = scrapy.Request(url + suffix, callback=self.parse_item, cookies={'title': title})
    yield request
# Follow the link to the next page
print(response.url)
selector = response.xpath('//div[@class="flym"]/*')
last = selector[-1].xpath("a")
if len(last) > 0:
    nextPage = self.baseURL + last[0].xpath("@href").extract()[0]
    # Pull the page number out of the next-page URL.
    tmp = nextPage.split("_")[2]
    tmp = int(tmp.split(".")[0])
    if self.endPage is None or tmp <= self.endPage:
        request2 = scrapy.Request(nextPage, callback=self.parse)
        yield request2
# td_list[5] is read below, so skip rows that do not have enough cells.
if len(td_list) < 6:
    continue
ipaddr = td_list[0].extract()
port = td_list[1].extract()
proto = td_list[5].extract()
latency = tr.css('div.bar::attr(title)').extract_first()
latency = re.match(r'(\d+\.\d+)秒', latency).group(1)  # the title text reads "<seconds>秒"
proxy = '%s://%s:%s' % (proto, ipaddr, port)
proxies = {proto: '%s:%s' % (ipaddr, port)}
if float(latency) > 3:
    logger.info('Dropping slow proxy: %s, latency %ss' % (proxy, latency))
    continue
logger.info('Validating: %s' % proxy)
if not self.redis_db.sismember(self.PROXY_SET, proxy):
    vaurl, vastart = random.choice(list(self.validator_pool))
    yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True)
else:
    logger.info('This proxy has already been recorded.')
def parse_firm_url(self, response):
    url = 'https://www.qichacha.com'
    url_lst = response.css('.panel-default').css('a').xpath('@href').extract()
    for i in range(10):
        time.sleep(2)
        yield scrapy.Request(
            url=url + url_lst[i],
            headers=self.headers_next(),
            cookies=self.cookie_next())
headers3 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
}
page = Selector(response)
goods_urls = page.xpath('//div[@class="grid g-clearfix"]/div[@class="items"]/div/div[3]/div[2]/a/@href').extract()
# goods_class = page.xpath('//div[@class="grid g-clearfix"]/div[@class="items"]/div[1]/div[3]/div[2]/a/span[@class="H"]/text()').extract()
goods_class = response.meta['sub_nav']
areas = page.xpath('//div[@class="row row-3 g-clearfix"]/div[@class="location"]/text()').extract()
sell_counts = page.xpath('//div[@class="deal-cnt"]/text()').extract()
# print(goods_urls)
for i in range(len(goods_urls)):
    area = areas[i]
    sell_count = sell_counts[i]
    goods_url = 'http:' + goods_urls[i]
    yield scrapy.Request(goods_url, self.parse2, headers=headers3, dont_filter=True,
                         meta={'goods_url': goods_url,
                               'goods_class': goods_class,
                               'area': area,
                               'sell_count': sell_count})
for url in all_urls:
    url = get_absolute_url(url=url, origin_url=response.url)
    if get_domain(url) == current_domain:
        filtered_urls.append(url)
filtered_urls = list(set(filtered_urls))
# max_pages = traversal.get("max_pages", 100)
# Implementing max_pages is difficult because each thread keeps adding
# 100 new pages of its own; one possible approach is sketched after this snippet.
current_page_count = response.meta.get('current_page_count', 1)
next_crawler_id = traversal['next_crawler_id']
next_parser = get_crawler_from_list(crawler_id=next_crawler_id, crawlers=crawlers)
for url in filtered_urls:
    current_page_count += 1
    yield scrapy.Request(
        url, callback=self.parse,
        meta={
            "current_page_count": current_page_count,
            "current_crawler": next_parser,
            "crawlers": crawlers
        }
    )
self.post_parse(response=response)
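The comment above notes that a hard max_pages cap is awkward because each thread keeps scheduling more pages. As a rough illustration only, the sketch below shows one way to at least bound how many pages a single branch schedules by carrying the running count in response.meta; the max_pages traversal key and the helper name are assumptions, not part of the original spider.

# Hypothetical sketch, not the original spider's code: bound how many pages a
# single branch may schedule by reading the running count from response.meta.
# `max_pages` as a traversal key and `schedule_filtered_urls` are assumed names.
def schedule_filtered_urls(self, response, traversal, filtered_urls, next_parser, crawlers):
    max_pages = traversal.get("max_pages", 100)
    current_page_count = response.meta.get("current_page_count", 1)
    for url in filtered_urls:
        if current_page_count >= max_pages:
            break  # stop scheduling once this branch has used its budget
        current_page_count += 1
        yield scrapy.Request(
            url, callback=self.parse,
            meta={
                "current_page_count": current_page_count,
                "current_crawler": next_parser,
                "crawlers": crawlers,
            },
        )

Note that this only limits one branch of the traversal; a true global cap would still need shared state, for example a counter on the spider or an external store.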
def start_requests(self):
    # 591 requires a valid session before requests can be made (#27);
    # a hypothetical handle_session_init is sketched after this snippet.
    yield scrapy.Request(
        url=SESSION_ENDPOINT,
        dont_filter=True,
        callback=self.handle_session_init,
    )
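The snippet above only shows the session bootstrap. As a hedged illustration, handle_session_init might simply continue to the real listing request once the session is established; LIST_ENDPOINT and the use of self.parse as the next callback are assumptions here, not taken from the project.

# Hypothetical sketch of the handle_session_init callback referenced above.
# Scrapy's cookie middleware keeps the session cookie from SESSION_ENDPOINT,
# so the callback only has to issue the follow-up request. LIST_ENDPOINT and
# self.parse as the next callback are illustrative assumptions.
def handle_session_init(self, response):
    yield scrapy.Request(
        url=LIST_ENDPOINT,
        dont_filter=True,
        callback=self.parse,
    )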
def parse(self, response):
    for quote in response.css('div.quote'):
        # Have a look at the itemloaders.py file to see how the input and
        # output processors for the QuoteLoader class are defined
        # (a hypothetical sketch follows this snippet).
        il = QuoteLoader(item=QuoteItem(), selector=quote)
        il.add_css('text', 'span.text::text')
        il.add_css('author_name', 'small.author::text')
        il.add_css('tags', 'a.tag::text')
        il.add_value('url', response.url)
        yield il.load_item()
    next_page = response.css("li.next > a::attr(href)").extract_first()
    if next_page is not None:
        url = response.urljoin(next_page)
        yield scrapy.Request(url, callback=self.parse)
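The comment in parse points at itemloaders.py for the QuoteLoader processors. Below is a minimal sketch of what such a file could look like, assuming MapCompose/TakeFirst-style processors; the project's actual definitions may differ.

# Hypothetical itemloaders.py sketch; shown only to illustrate input/output
# processors on an ItemLoader, not the project's real definitions.
from itemloaders.processors import Identity, MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class QuoteLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)   # clean each extracted value
    default_output_processor = TakeFirst()            # scalar fields keep the first match
    tags_out = Identity()                             # 'tags' stays a list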
def parse_cities(self, response):
    cities = json.loads(response.body)
    for city in cities['d']:
        current_city = json.dumps({'state': response.meta['state'], 'city': city})
        request = scrapy.Request(
            STORES,
            method='POST',
            body=current_city,
            headers=HEADERS,
            callback=self.parse
        )
        yield request