def maintain_local_heartbeat(self):
    while True:
        try:
            self.r_local.set('crawler:heartbeat:%s' % self.crawler_id, time.time())
            time.sleep(settings.CRAWLER_HEARTBEAT_INTERVAL)
        except Exception:
            log.msg("heartbeat failed", level=log.ERROR)
            break
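# Minimal usage sketch (not part of the original snippet): the heartbeat loop
# above is assumed to run in a background thread, and a monitor can treat a
# crawler as dead once its stored timestamp is older than a few intervals.
# The Redis key layout and CRAWLER_HEARTBEAT_INTERVAL come from the snippet;
# start_heartbeat, is_crawler_alive and the tolerance factor are illustrative.
import threading
import time

def start_heartbeat(crawler):
    t = threading.Thread(target=crawler.maintain_local_heartbeat)
    t.daemon = True  # do not keep the process alive just for the heartbeat
    t.start()
    return t

def is_crawler_alive(r_local, crawler_id, interval, tolerance=3):
    # A crawler is considered dead if its heartbeat is older than a few intervals.
    last = r_local.get('crawler:heartbeat:%s' % crawler_id)
    return last is not None and time.time() - float(last) < interval * tolerance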
def process_spider_output(self, response, result, spider):
    context = getattr(spider, 'context', {})
    visited_ids = context.setdefault(self.CONTEXT_KEY, {})
    ret = []
    for x in result:
        visited = False
        if isinstance(x, Request):
            if self.FILTER_VISITED in x.meta:
                visit_id = self._visited_id(x)
                if visit_id in visited_ids:
                    log.msg('Ignore: %s' % x.url, level=log.INFO, spider=spider)
                    visited = True
        elif isinstance(x, BaseItem):
            visit_id = self._visited_id(response.request)
            if visit_id:
                visited_ids[visit_id] = True
                x['visit_id'] = visit_id
                x['visit_status'] = 'new'
        if visited:
            ret.append(KoubeiStoreItem(visit_id=visit_id, visit_status='old'))
        else:
            ret.append(x)
    return ret
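# Hedged usage sketch for the middleware above (assumed wiring, not from the
# original source): the middleware is enabled as a spider middleware in
# settings.py, and requests opt in to visit filtering through their meta dict.
# The module path 'myproject.middlewares.FilterVisitedMiddleware' and the
# 'filter_visited' meta key value are illustrative assumptions.
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.FilterVisitedMiddleware': 543,
}
# In a spider callback:
#     yield Request(url, meta={'filter_visited': True}, callback=self.parse_store)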
def spider_idle(self, spider):
    log.msg("HooksasyncExtension, signals.spider_idle fired")
def _on_success(response):
    assert isinstance(response, (Response, Request))
    if isinstance(response, Response):
        response.request = request  # tie request to response received
        logkws = self.logformatter.crawled(request, response, spider)
        log.msg(level=log.DEBUG, spider=spider, **logkws)
        self.signals.send_catch_log(signal=signals.response_received,
                                    response=response, request=request, spider=spider)
    return response
    newprio = priority - 1  # increase priority for rerunning asap
    self.master.reschedule([domain], spider_settings, newprio,
                           reason="no available slots at worker=%s" % self.name)
elif status['callresponse'][0] == ResponseCode.DOMAIN_ALREADY_RUNNING:
    log.msg("ClusterMaster: Already running domain=%s at worker=%s" %
            (domain, self.name), log.WARNING)
    self.master.loading.remove(domain)
    self.master.reschedule([domain], spider_settings, priority,
                           reason="domain already running at worker=%s" % self.name)

try:
    log.msg("ClusterMaster: Running domain=%s at worker=%s" % (domain, self.name), log.DEBUG)
    deferred = self._worker.callRemote("run", domain, dsettings)
except pb.DeadReferenceError:
    self._set_status(None)
    log.msg("ClusterMaster: Lost connection to worker=%s." % self.name, log.ERROR)
else:
    deferred.addCallbacks(callback=_run_callback, errback=_run_errback)
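# The Deferred callbacks referenced above (_run_callback / _run_errback) are not
# shown in this fragment; the bodies below are a purely illustrative sketch of
# what handlers for the remote "run" call might do, not the original code.
def _run_callback(status):
    # Twisted fires this with whatever the remote "run" call returned.
    log.msg("ClusterMaster: worker accepted run, status=%r" % (status,), log.DEBUG)

def _run_errback(failure):
    # Remote failures arrive here as a twisted.python.failure.Failure.
    log.msg("ClusterMaster: remote run failed: %s" % failure.getErrorMessage(), log.ERROR)
    return failure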
"""Return the metadata dictionary (possibly empty) if the entry is
cached, None otherwise.
"""
requestpath = self.requestpath(domain, key)
try:
with open(os.path.join(requestpath, 'pickled_meta'), 'r') as f:
metadata = pickle.load(f)
except IOError, e:
if e.errno != errno.ENOENT:
raise
return None
expiration_secs = settings.getint('CACHE2_EXPIRATION_SECS')
if expiration_secs >= 0:
expiration_date = metadata['timestamp'] + datetime.timedelta(seconds=expiration_secs)
if datetime.datetime.utcnow() > expiration_date:
log.msg('dropping old cached response from %s' % metadata['timestamp'], level=log.DEBUG)
return None
return metadata
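# Hedged companion sketch (not from the original source): the lookup above
# expects a 'pickled_meta' file whose 'timestamp' field is a datetime, so a
# write path along these lines would pair with it. The helper name and any
# field other than 'timestamp' are illustrative assumptions.
import datetime
import os
import pickle

def store_metadata(requestpath, metadata):
    # Record the timestamp that the expiration check compares against.
    metadata.setdefault('timestamp', datetime.datetime.utcnow())
    if not os.path.isdir(requestpath):
        os.makedirs(requestpath)
    with open(os.path.join(requestpath, 'pickled_meta'), 'wb') as f:
        pickle.dump(metadata, f, protocol=2)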
sassfilter_match = re.search(r'{(\"pid\":\"pl_common_sassfilter\".*?)}', text, re.M | re.I)
if sassfilter_match:
    raise CloseSpider('weibo search exceeded')

# check the number of search results
totalshow_match = re.search(r'{(\"pid\":\"pl_common_totalshow\".*?)}', text, re.M | re.I)
if totalshow_match:
    html = json.loads(totalshow_match.group())['html']
    if len(html) == 0:
        raise CloseSpider('not login? %s' % html)
    totalshow = pq(html)
    if totalshow('div.topcon_l').html() is None:
        log.msg('%s 0 feeds' % query, level=log.INFO)
        return
    topcon_num = int(re.search(r'\s(\d+)\s', totalshow('div.topcon_l').text().replace(',', ''), re.I).group(1))
    log.msg('%s %d feeds' % (query, topcon_num), level=log.INFO)
    max_feeds = settings.getint('FEED_LIMIT', 200000)
    if topcon_num > max_feeds:
        log.msg('too many (%d) results for %s.' % (topcon_num, query), level=log.WARNING)
    elif 1000 < topcon_num < max_feeds:
        # weibo search only shows 20 feeds per page and at most 50 pages,
        # so split the time range in half and query each half separately
        days = range.days / float(2)
        middle = start + timedelta(days)
        # first part
        url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), start, middle)
        request = Request(url=url, callback=self.parse_weibo)
        request.meta['query'] = query
        request.meta['start'] = start.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['end'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['priority'] = days / 2
        request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
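# Hedged sketch (assumed helper, not from the original source) of the
# range-splitting idea used above: when a query would exceed weibo's
# 20-results-per-page * 50-page cap, the time window is cut in half and one
# request is issued per half. QueryFactory, parse_weibo and the meta keys come
# from the snippet; split_timerange_requests itself is illustrative.
def split_timerange_requests(self, query, start, end, last_fetched):
    middle = start + (end - start) / 2
    for s, e in ((start, middle), (middle, end)):
        url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), s, e)
        request = Request(url=url, callback=self.parse_weibo)
        request.meta['query'] = query
        request.meta['start'] = s.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['end'] = e.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
        yield request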
def log(self, message, level=log.DEBUG, **kw):
    """Log the given messages at the given log level. Always use this
    method to send log messages from your spider
    """
    log.msg(message, spider=self, level=level, **kw)
def parse_hq_stock_name_list(self, response):
    json_response = json.loads(response.body_as_unicode())
    if 'success' not in json_response or json_response['success'] != 'true':
        log.msg('parse_hq_stock_name_list parse failed')
        return
    for stock in json_response['stocks']:
        item = StockItem()
        item['symbol'] = stock['symbol']
        item['name'] = stock['name']
        item['market'] = getmarket(stock['symbol'])
        item['catelog'] = getcatelog(stock['symbol'])
        yield item
        request = scrapy.Request("http://xueqiu.com/stock/industry/stockList.json?type=1&code=%s&size=0" % stock['symbol'],
                                 cookies=self.get_cookies(),
                                 callback=self.parse_hq_stock_category)
        yield request
        if item['market'] == 'PRE':
            continue
def set_spider(self, url, opts):
    if opts.spider:
        try:
            self.spider = self.crawler.spiders.create(opts.spider)
        except KeyError:
            log.msg(format='Unable to find spider: %(spider)s',
                    level=log.ERROR, spider=opts.spider)
    else:
        self.spider = create_spider_for_request(self.crawler.spiders, Request(url))
        if not self.spider:
            log.msg(format='Unable to find spider for: %(url)s',
                    level=log.ERROR, url=url)