How to use the scrapy.log.msg function in Scrapy

To help you get started, we've selected a few examples of scrapy.log.msg, based on popular ways it is used in public projects.

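Before the examples, a quick note on the API itself: scrapy.log.msg belongs to Scrapy's legacy logging layer, deprecated since Scrapy 1.0 in favour of Python's standard logging module (and dropped in later releases). A minimal sketch of the call pattern, assuming a pre-1.0 Scrapy where scrapy.log is still available:

from scrapy import log

# Plain message at an explicit level; log.DEBUG, log.INFO, log.WARNING,
# log.ERROR and log.CRITICAL are the available level constants.
log.msg("Custom extension started", level=log.INFO)

# Inside spider, middleware or extension code, passing spider= ties the
# message to the running spider, as most of the examples below do:
# log.msg("Dropped malformed item", level=log.WARNING, spider=spider)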

github immzz / zhihu-scrapy / zhihu / spiders / zhihu_people.py
def maintain_local_heartbeat(self):
        while True:
            try:
                self.r_local.set('crawler:heartbeat:%s' % self.crawler_id, time.time())
                time.sleep(settings.CRAWLER_HEARTBEAT_INTERVAL)
            except:
                log.msg("heartbeat failed",level=log.ERROR)
                break
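
A possible refinement of the error path above (a sketch, not the project's code): catch a concrete exception class and include it in the message, so the log records why the heartbeat failed:

            try:
                self.r_local.set('crawler:heartbeat:%s' % self.crawler_id, time.time())
                time.sleep(settings.CRAWLER_HEARTBEAT_INTERVAL)
            except Exception as e:  # the original uses a bare except
                log.msg("heartbeat failed: %s" % e, level=log.ERROR)
                break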
github lifepy / wolfspider / koubei / koubei / middlewares.py
def process_spider_output(self, response, result, spider):
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg('Ignore: %s' %x.url, level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                ret.append(KoubeiStoreItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret
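
For completeness (the module path, class name and priority below are made up, not taken from the koubei project): a spider middleware like this is enabled through the SPIDER_MIDDLEWARES setting, and the spider= keyword on log.msg is what ties each 'Ignore: ...' line to the spider that produced it:

# settings.py -- hypothetical entry enabling the middleware shown above
SPIDER_MIDDLEWARES = {
    'koubei.middlewares.IgnoreVisitedItems': 560,
}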
github scalingexcellence / scrapybook / ch07 / hooksasync / hooksasync / extensions.py
def spider_idle(self, spider):
        log.msg("HooksasyncExtension, signals.spider_idle fired")
github mozilla / spade / vendor / scrapy / core / engine.py
def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(level=log.DEBUG, spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response
github scrapy / scrapy / scrapy / contrib / cluster / master / manager.py
newprio = priority - 1 # increase priority for rerunning asap
                self.master.reschedule([domain], spider_settings, newprio,
                        reason="no available slots at worker=%s" % self.name)
            elif status['callresponse'][0] == ResponseCode.DOMAIN_ALREADY_RUNNING:
                log.msg("ClusterMaster: Already running domain=%s at worker=%s" %
                        (domain, self.name), log.WARNING)
                self.master.loading.remove(domain)
                self.master.reschedule([domain], spider_settings, priority,
                        reason="domain already running at worker=%s" % self.name)

        try:
            log.msg("ClusterMaster: Running domain=%s at worker=%s" % (domain, self.name), log.DEBUG)
            deferred = self._worker.callRemote("run", domain, dsettings)
        except pb.DeadReferenceError:
            self._set_status(None)
            log.msg("ClusterMaster: Lost connection to worker=%s." % self.name, log.ERROR)
        else:
            deferred.addCallbacks(callback=_run_callback, errback=_run_errback)
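
One detail worth noting in this (very old, scrapy.contrib-era) excerpt: the level is passed positionally rather than with level=, which works because level is the parameter that follows the message in the legacy msg signature; the WARNING call above is equivalent to:

log.msg("ClusterMaster: Already running domain=%s at worker=%s" % (domain, self.name),
        level=log.WARNING)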
github scrapy / scrapy / scrapy / trunk / scrapy / contrib / downloadermiddleware / cache.py
"""Return the metadata dictionary (possibly empty) if the entry is
        cached, None otherwise.
        """
        requestpath = self.requestpath(domain, key)
        try:
            with open(os.path.join(requestpath, 'pickled_meta'), 'r') as f:
                metadata = pickle.load(f)
        except IOError, e:
            if e.errno != errno.ENOENT:
                raise
            return None
        expiration_secs = settings.getint('CACHE2_EXPIRATION_SECS')
        if expiration_secs >= 0:
            expiration_date = metadata['timestamp'] + datetime.timedelta(seconds=expiration_secs)
            if datetime.datetime.utcnow() > expiration_date:
                log.msg('dropping old cached response from %s' % metadata['timestamp'], level=log.DEBUG)
                return None
        return metadata
github tpeng / weibosearch / weibosearch / spiders / WeiboSearchSpider.py
sassfilter_match = re.search(r'{(\"pid\":\"pl_common_sassfilter\".*?)}', text, re.M | re.I)
    if sassfilter_match:
      raise CloseSpider('weibo search exceeded')

    # check the num of search results
    totalshow_match = re.search(r'{(\"pid\":\"pl_common_totalshow\".*?)}', text, re.M | re.I)
    if totalshow_match:
      html = json.loads(totalshow_match.group())['html']
      if len(html) == 0:
        raise CloseSpider('not login? %s' % html)
      totalshow = pq(html)
      if totalshow('div.topcon_l').html() is None:
        log.msg('%s 0 feeds' % query, level=log.INFO)
        return
      topcon_num = int(re.search('\s(\d+)\s', totalshow('div.topcon_l').text().replace(',', ''), re.I).group(1))
      log.msg('%s %d feeds' % (query, topcon_num), level=log.INFO)
      max_feeds = settings.getint('FEED_LIMIT', 200000)
      if topcon_num > max_feeds:
        log.msg('too much (%d) result for %s.' % (topcon_num, query), level=log.WARNING)
      elif 1000 < topcon_num < max_feeds:
        # weibo search only allow 20 feeds on 1 page and at most 50 pages.
        days = range.days / float(2)
        middle = start + timedelta(days)

        # first part
        url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), start, middle)
        request = Request(url=url, callback=self.parse_weibo)
        request.meta['query'] = query
        request.meta['start'] = start.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['end'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['priority'] = days / 2
        request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
github scrapy / scrapy / scrapy / spider.py
def log(self, message, level=log.DEBUG, **kw):
        """Log the given messages at the given log level. Always use this
        method to send log messages from your spider
        """
        log.msg(message, spider=self, level=level, **kw)
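
Spider.log, shown above, is just a thin convenience wrapper around log.msg. From Scrapy 1.0 onwards the scrapy.log module is deprecated in favour of Python's standard logging and spiders carry a logger attribute instead; a rough modern equivalent of the same idea:

import logging

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # Spider-scoped logger (Scrapy 1.0+); replaces self.log(...) / log.msg(...)
        self.logger.info("Parsed %s", response.url)
        # Plain stdlib loggers work too and are routed through Scrapy's log settings
        logging.getLogger(__name__).debug("module-level message")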
github yegong / stock / stockspider / spiders / hq_spider.py
def parse_hq_stock_name_list(self, response):
    json_response = json.loads(response.body_as_unicode())
    if 'success' not in json_response or json_response['success'] != 'true':
      log.msg('parse_hq_stock_name_list parse failed')
      return
    for stock in json_response['stocks']:
      item = StockItem()
      item['symbol'] = stock['symbol']
      item['name'] = stock['name']
      item['market'] = getmarket(stock['symbol'])
      item['catelog'] = getcatelog(stock['symbol'])
      yield item

      request = scrapy.Request("http://xueqiu.com/stock/industry/stockList.json?type=1&code=%s&size=0" % (stock['symbol']), 
          cookies=self.get_cookies(),
          callback=self.parse_hq_stock_category)
      yield request
      
      if item['market'] == 'PRE':
        continue
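
A small observation on the failure message above, which is logged without a level: in the pre-1.0 API the level argument of log.msg defaults to INFO (per the legacy docs), so error paths usually read better with the level spelled out, for example:

log.msg('parse_hq_stock_name_list parse failed', level=log.WARNING)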
github mozilla / spade / vendor / scrapy / commands / parse.py
def set_spider(self, url, opts):
        if opts.spider:
            try:
                self.spider = self.crawler.spiders.create(opts.spider)
            except KeyError:
                log.msg(format='Unable to find spider: %(spider)s',
                        level=log.ERROR, spider=opts.spider)
        else:
            self.spider = create_spider_for_request(self.crawler.spiders, Request(url))
            if not self.spider:
                log.msg(format='Unable to find spider for: %(url)s',
                        level=log.ERROR, url=url)
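
A final note on the format= keyword used in this last example: with the legacy API the message can be supplied as a format string whose %(name)s placeholders are filled from the other keyword arguments, so the first call above behaves roughly like this standalone sketch (the spider name is made up):

log.msg(format='Unable to find spider: %(spider)s',
        level=log.ERROR, spider='books_spider')
# message rendered as: Unable to find spider: books_spider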