def prepare(self):
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    self.r = _default_redis(host, port)
    uids_set = UIDS_SET.format(spider=self.name)
    log.msg(format='Loading uids from %(uids_set)s', level=log.INFO, uids_set=uids_set)
    uids = self.r.smembers(uids_set)
    # smembers() returns a set, so test for emptiness rather than comparing to []
    if not uids:
        log.msg(format='No uids loaded from %(uids_set)s', level=log.INFO, uids_set=uids_set)
    return uids
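The `_default_redis` helper used above is not shown in this snippet; a minimal sketch of what it could look like, assuming the standard redis-py client and assumed defaults for host and port:

import redis

REDIS_HOST = 'localhost'  # assumed default
REDIS_PORT = 6379         # assumed default

def _default_redis(host=REDIS_HOST, port=REDIS_PORT):
    # Plain redis-py client bound to database 0.
    return redis.StrictRedis(host=host, port=port, db=0)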
def _download(self, request, spider, deferred):
    if self.debug_mode:
        log.msg('Activating %s' % request_info(request), log.DEBUG)
    domain = spider.domain_name
    site = self.sites.get(domain)
    site.downloading.add(request)

    def _remove(result):
        if self.debug_mode:
            log.msg('Deactivating %s' % request_info(request), log.DEBUG)
        site.downloading.remove(request)
        return result

    def _finish(result):
        self.process_queue(spider)

    dwld = mustbe_deferred(self.download_function, request, spider)
    dwld.addBoth(_remove)
    chain_deferred(dwld, deferred)
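`mustbe_deferred` and `chain_deferred` are Scrapy deferred utilities; `chain_deferred` presumably forwards the result (or failure) of `dwld` into `deferred`. A rough sketch of that idea in plain Twisted, as an assumption rather than the actual Scrapy implementation:

def chain_deferred(d1, d2):
    # Fire d2 with whatever result or failure d1 eventually produces
    # (Twisted's Deferred.chainDeferred does exactly this).
    return d1.chainDeferred(d2)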
def process_request(self, request, spider):
    if self.db.shops.find_one({'link_url': request.url}):
        log.msg('Ignore: %s' % request.url, log.WARNING)
        raise IgnoreRequest
    # log.msg('Request: %s' % request.url, log.INFO)
    return None
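This `process_request` belongs to a downloader middleware: raising `IgnoreRequest` (from `scrapy.exceptions`) drops the request before it is downloaded. A hypothetical settings entry to enable such a middleware; the module path, class name, and priority are assumptions:

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeenShopsMiddleware': 543,  # hypothetical path and priority
}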
def _exporter_supported(self, format):
    if format in self.exporters:
        return True
    log.msg("Unknown feed format: %s" % format, log.ERROR)
def _dqpush(self, request):
    if self.dqs is None:
        return
    try:
        reqd = request_to_dict(request, self.spider)
        self.dqs.push(reqd, -request.priority)
    except ValueError as e:  # non-serializable request
        if self.logunser:
            log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                    level=log.ERROR, spider=self.spider,
                    request=request, reason=e)
        return
    else:
        return True
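The dicts pushed to the disk queue are turned back into requests when the queue is popped; a minimal sketch of the reverse path, assuming `request_from_dict` from `scrapy.utils.reqser` (the counterpart of `request_to_dict` used above) and the same `self.dqs` / `self.spider` attributes:

from scrapy.utils.reqser import request_from_dict

def _dqpop(self):
    # Pop one serialized request from the disk queue, if any, and rebuild it.
    if self.dqs:
        d = self.dqs.pop()
        if d:
            return request_from_dict(d, self.spider)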
def _domain_closed(self, domain, spider, status):
    dispatcher.send(signal=self.domain_closing, sender=self.__class__,
                    domain=domain, spider=spider, status=status)
    if self.debug:
        log.msg(pprint.pformat(self[domain]), domain=domain, level=log.DEBUG)
    if self.db:
        self.db.put(domain, self[domain])
    if self.cleanup:
        del self[domain]
    dispatcher.send(signal=self.domain_closed, sender=self.__class__,
                    domain=domain, spider=spider, status=status)
def __init__(self, auth, project, spider, job, level=log.INFO, url='http://localhost:8002'):
    self.level = level
    self.errors_count = 0
    self.url = url
    self.auth = auth
    self.path = "/logs/%s/%s/%s" % (project, spider, job)
if args:
    domain = args[0]
    spider = spiders.fromdomain(domain)
    scrapymanager.configure()
    if opts.child:
        def _stop():
            pass
        # monkeypatch stop() to prevent stopping prematurely in child mode
        scrapymanager.stop = _stop
    if not task.locked:
        task.lock()
        self.crawl(spider, task)
    scrapyengine.start()
else:
    log.msg('You must specify at least 1 domain', level=log.ERROR)
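All of the snippets above use the pre-1.0 `scrapy.log` API; in Scrapy 1.0 and later, `log.msg` was deprecated in favour of the standard `logging` module, so the equivalent of these calls looks roughly like this:

import logging

logger = logging.getLogger(__name__)

# old style: log.msg('You must specify at least 1 domain', level=log.ERROR)
logger.error('You must specify at least 1 domain')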