How to use the scrapelib.FileCache class in scrapelib

To help you get started, we’ve selected a few scrapelib.FileCache examples, drawn from popular ways it is used in public projects.

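FileCache is scrapelib's on-disk cache backend: you attach an instance to a Scraper via its cache_storage attribute, and responses are then persisted to (and, optionally, served from) a local directory. A minimal sketch of the basic pattern, with a placeholder URL and cache directory:

import scrapelib

scraper = scrapelib.Scraper(requests_per_minute=60, retry_attempts=2)

# persist responses to disk, one file per cached request
scraper.cache_storage = scrapelib.FileCache('.cache')

# by default scrapelib only writes to the cache; set this to False so
# matching requests are answered from the cache instead of re-fetched
scraper.cache_write_only = False

response = scraper.get('https://example.com')   # placeholder URL
print(response.status_code)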

From opencivicdata/python-legistar-scraper, scripts/guessdomains.py:
# scrapelib setup
self.timeout = self.SCRAPELIB_TIMEOUT
self.requests_per_minute = self.SCRAPELIB_RPM
self.retry_attempts = self.SCRAPELIB_RETRY_ATTEMPTS
self.retry_wait_seconds = self.SCRAPELIB_RETRY_WAIT_SECONDS
self.follow_robots = False

# if self.PROXIES:
#     self.proxies = self.PROXIES

if self.FASTMODE:
    # fastmode: serve previously cached responses instead of always re-fetching
    self.cache_write_only = False

cache_dir = '.cache'
self.cache_storage = scrapelib.FileCache(cache_dir)
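
This fragment runs inside the __init__ of a scrapelib.Scraper subclass and reads its configuration from class-level constants (SCRAPELIB_TIMEOUT, FASTMODE, and so on). A compressed, self-contained sketch of the same pattern, with illustrative constant values that are not from the original project:

import scrapelib

class CachedScraper(scrapelib.Scraper):
    SCRAPELIB_TIMEOUT = 60   # illustrative values, not from guessdomains.py
    FASTMODE = True

    def __init__(self):
        super().__init__()
        self.timeout = self.SCRAPELIB_TIMEOUT
        if self.FASTMODE:
            self.cache_write_only = False   # read back cached responses
        self.cache_storage = scrapelib.FileCache('.cache')
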
From openstates/billy, billy/scrape/__init__.py:
def __init__(self, metadata, output_dir=None, strict_validation=None,
             fastmode=False, options={}):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """
    super(Scraper, self).__init__()

    # scrapelib overrides
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        self.requests_per_minute = 0    # disable throttling
        self.cache_write_only = False   # serve responses from the cache

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()
    self.options = options

    # create output_dir if it doesn't already exist
    os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)
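
In billy, fastmode pairs two settings: requests_per_minute = 0 turns off scrapelib's throttling, and cache_write_only = False lets previously cached responses be served without a network round trip. The effect on a repeated fetch, sketched with a placeholder URL:

import scrapelib

s = scrapelib.Scraper()
s.cache_storage = scrapelib.FileCache('.cache')

s.get('https://example.com')    # fetched over the network, written to .cache
s.cache_write_only = False
s.get('https://example.com')    # answered from .cache, no network request
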
From opencivicdata/pupa, pupa/scrape/base.py:
super(Scraper, self).__init__()

# set options
self.jurisdiction = jurisdiction
self.datadir = datadir

# scrapelib setup
self.timeout = settings.SCRAPELIB_TIMEOUT
self.requests_per_minute = settings.SCRAPELIB_RPM
self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS
self.verify = settings.SCRAPELIB_VERIFY

# caching
if settings.CACHE_DIR:
    self.cache_storage = scrapelib.FileCache(settings.CACHE_DIR)

if fastmode:
    self.requests_per_minute = 0
    self.cache_write_only = False

# validation
self.strict_validation = strict_validation

# 'type' -> {set of names}
self.output_names = defaultdict(set)

# logging convenience methods
self.logger = logging.getLogger("pupa")
self.info = self.logger.info
self.debug = self.logger.debug
self.warning = self.logger.warning
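
pupa makes the on-disk cache opt-in: FileCache is only attached when settings.CACHE_DIR is truthy. The same gate works with any configuration source; a minimal sketch using an environment variable (the variable name here is made up for illustration):

import os
import scrapelib

s = scrapelib.Scraper()

cache_dir = os.environ.get('SCRAPE_CACHE_DIR')   # hypothetical variable
if cache_dir:
    s.cache_storage = scrapelib.FileCache(cache_dir)
    s.cache_write_only = False   # also serve responses from the cache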