How to use the scrapy.log function in Scrapy

To help you get started, we’ve selected a few scrapy.log examples based on popular ways it is used in public projects.

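For orientation, here is a minimal sketch of the legacy scrapy.log API that these snippets use (the module has been deprecated since Scrapy 1.0 in favor of the standard library's logging module); the messages and the count value below are illustrative:

from scrapy import log

# msg() accepts either a positional level or keyword arguments;
# with format=, extra keyword values are interpolated into the message.
log.msg('Spider started', log.INFO)
log.msg(format='Crawled %(count)d pages', level=log.DEBUG, count=42)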

github MOON-CLJ / scrapy_weibo / scrapy_weibo / spiders / followers_uids_spider.py
def prepare(self):
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        self.r = _default_redis(host, port)

        uids_set = UIDS_SET.format(spider=self.name)
        log.msg(format='Loading uids from %(uids_set)s', level=log.INFO, uids_set=uids_set)
        uids = self.r.smembers(uids_set)
        if not uids:  # smembers returns a set, so test emptiness rather than comparing to []
            log.msg(format='No uids loaded from %(uids_set)s', level=log.INFO, uids_set=uids_set)

        return uids

github scrapy / scrapy / scrapy / trunk / scrapy / core / downloader / manager.py
def _download(self, request, spider, deferred):
        if self.debug_mode:
            log.msg('Activating %s' % request_info(request), log.DEBUG)
        domain = spider.domain_name
        site = self.sites.get(domain)
        site.downloading.add(request)

        def _remove(result):
            if self.debug_mode:
                log.msg('Deactivating %s' % request_info(request), log.DEBUG)
            site.downloading.remove(request)
            return result

        def _finish(result):
            self.process_queue(spider)

        dwld = mustbe_deferred(self.download_function, request, spider)
        dwld.addBoth(_remove)
        chain_deferred(dwld, deferred)

github lifepy / wolfspider / koubei / koubei / middlewares.py
def process_request(self, request, spider):
        if self.db.shops.find_one({'link_url':request.url}):
            log.msg('Ignore: %s' % request.url, log.WARNING)
            raise IgnoreRequest
        # log.msg('Request: %s' % request.url, log.INFO)
        return None

github mozilla / spade / vendor / scrapy / contrib / feedexport.py
def _exporter_supported(self, format):
        if format in self.exporters:
            return True
        log.msg("Unknown feed format: %s" % format, log.ERROR)

github mozilla / spade / vendor / scrapy / core / scheduler.py
def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non-serializable request
            if self.logunser:
                log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                        level=log.ERROR, spider=self.spider,
                        request=request, reason=e)
            return
        else:
            return True

github scrapy / scrapy / scrapy / trunk / scrapy / stats / statscollector.py
def _domain_closed(self, domain, spider, status):
        dispatcher.send(signal=self.domain_closing, sender=self.__class__, domain=domain, spider=spider, status=status)
        if self.debug:
            log.msg(pprint.pformat(self[domain]), domain=domain, level=log.DEBUG)
        if self.db:
            self.db.put(domain, self[domain])
        if self.cleanup:
            del self[domain]
        dispatcher.send(signal=self.domain_closed, sender=self.__class__, domain=domain, spider=spider, status=status)

github scrapinghub / python-scrapinghub / hubstorage / scrapylog.py
    def __init__(self, auth, project, spider, job, level=log.INFO, url='http://localhost:8002'):
        self.level = level
        self.errors_count = 0
        self.url = url
        self.auth = auth
        self.path = "/logs/%s/%s/%s" % (project, spider, job)

github taras / djangoscraper / djangoscraper / commands / run.py
        if args:
            domain = args[0]

            spider = spiders.fromdomain(domain)
            scrapymanager.configure()
            if opts.child:
                def _stop():
                    pass
                # monkeypatch stop to prevent the child process from stopping prematurely
                scrapymanager.stop = _stop
            if not task.locked:
                task.lock()
            self.crawl(spider, task)
            scrapyengine.start()

        else:
            log.msg('You must specify at least 1 domain', level=log.ERROR)
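
In modern Scrapy (1.0 and later), logging goes through the stdlib logging module instead of scrapy.log, and each spider exposes a per-spider logger. A rough sketch of the equivalent (the spider name and URL are illustrative):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com']

    def parse(self, response):
        # self.logger wraps a stdlib logger named after the spider
        self.logger.info('Parsed %s', response.url)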