How to use the scrapelib.cache.FileCache class in scrapelib

To help you get started, we've selected a few scrapelib examples based on popular ways it is used in public projects.

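Before the project examples below, here is a minimal sketch of the basic pattern: attach a FileCache to a scrapelib.Scraper so responses are stored on disk and reused on repeat requests. The cache directory name and URL are placeholders, and in older scrapelib releases the cache is instead passed to the Scraper constructor via the cache_obj keyword, as the openstates snippets below do.

import scrapelib
from scrapelib.cache import FileCache

# requests_per_minute=0 turns off scrapelib's rate limiting.
scraper = scrapelib.Scraper(requests_per_minute=0)

# Store responses under ./demo-cache and serve repeat requests from it;
# a write-only cache would record responses without ever reading them back.
scraper.cache_storage = FileCache('demo-cache')
scraper.cache_write_only = False

# The first call hits the network; identical later calls are answered from the cache.
response = scraper.get('https://example.com')
print(response.status_code)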

openstates / legacy-openstates.org / experiments / legatron / models.py (view on GitHub)
    def finish_report(self):
        '''Update the feed record with the latest report.
        '''
        if not self.is_valid():
            return
        spec = dict(url=self.url)
        update = {'$set': self.report}
        self.logger.debug('feed.finish_report %r' % self.url)
        feeds_db.feeds.find_and_modify(spec, update, upsert=True, new=True)
        self.logger.info('feed.save: %r' % self.url)


class Entry(object):
    '''Wrap a parsed feed entry dictionary thingy from feedparser.
    '''
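    # Responses are cached on disk with FileCache; cache_write_only=False means
    # cached responses are also read back on later requests, and
    # requests_per_minute=0 disables scrapelib's throttling.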
    request_defaults = dict(
        cache_obj=FileCache(ENTRIES_CACHE),
        requests_per_minute=0,
        cache_write_only=False)

    session = scrapelib.Scraper(**_request_defaults(request_defaults))
    logger = logging.getLogger('billy.entry-model')

    def __init__(self, entry, feed):
        self.entry = entry
        self.feed = feed
        self.report = {
            'entities': {
                'count': 0,
                }
            }

        # Whether a fetch of the full text was tried and succeeded.
openstates / legacy-openstates.org / experiments / legatron / models.py (view on GitHub)
def _request_defaults(kwargs):
    request_defaults = {
        'user_agent': USER_AGENT,
        'follow_robots': False,
        }
    request_defaults.update(kwargs)
    return request_defaults


class Feed(object):
    '''This model handles fetching the rss feed and recording any errors
    that occur for post-mortem reporting. It also has an instance-level
    report dictionary that gets augmented each time one of the feed's
    entries is scanned for relevant entities.
    '''

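    # Same scraper defaults as Entry, with responses cached under FEEDS_CACHE.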
    request_defaults = dict(
        cache_obj=FileCache(FEEDS_CACHE),
        requests_per_minute=0,
        cache_write_only=False)

    session = scrapelib.Scraper(
        **_request_defaults(request_defaults))
    logger = logging.getLogger('billy.feed-model')

    def __init__(self, url, jurisdiction):
        self.url = url
        self.jurisdiction = jurisdiction

        self.succeeded = None
        self.default_report = {
            'entries': {
                'count': 0,
                'new': 0,