How to use the icrawler.storage.BaseStorage class in icrawler

To help you get started, we’ve selected a few icrawler examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: hellock/icrawler on GitHub, file icrawler/storage/google_storage.py (view on GitHub)
from icrawler.storage import BaseStorage
from io import BytesIO


class GoogleStorage(BaseStorage):
    """Google Storage backend.

    The id is filename and data is stored as text files or binary files.
    The root_dir is the bucket address such as
    ``gs://<bucket_name>/<directory>``.
    """

    def __init__(self, root_dir):
        """Create a storage client and extract the bucket name from root_dir.

        Args:
            root_dir (str): bucket address. The first five characters are
                skipped as the scheme prefix (presumably ``gs://``) and the
                text up to the next ``/`` is taken as the bucket name.

        Raises:
            ImportError: if the ``google-cloud-storage`` package is not
                installed.
        """
        try:
            from google.cloud import storage
        except ImportError:
            print('GoogleStorage backend requires the package '
                  '"google-cloud-storage", execute '
                  '"pip install google-cloud-storage" to install it.')
            # Re-raise after the hint: without the module the statement
            # below would only fail later with a misleading NameError.
            raise

        self.client = storage.Client()
        bucket_str = root_dir[5:].split('/')[0]
Source: hellock/icrawler on GitHub, file icrawler/crawler.py (view on GitHub)
def set_storage(self, storage):
        """Install the storage backend used by the downloader.

        The supported backends are listed in :mod:`storage`.

        Args:
            storage (dict or BaseStorage): either a backend instance, which
                is used as is, or a configuration dict whose ``backend``
                entry names the backend and whose remaining entries are
                passed to it as keyword arguments. A dict that has
                ``root_dir`` but no ``backend`` gets ``backend`` set to
                ``'FileSystem'``; the dict is modified in place.

        """
        if isinstance(storage, BaseStorage):
            self.storage = storage
        elif isinstance(storage, dict):
            if 'root_dir' in storage and 'backend' not in storage:
                storage['backend'] = 'FileSystem'
            backend_name = storage['backend']
            try:
                # First look the name up on the storage package itself.
                factory = getattr(storage_package, backend_name)
            except AttributeError:
                # NOTE(review): if import_module is importlib.import_module,
                # this yields a module object that is then called below;
                # confirm that is the intended behaviour.
                try:
                    factory = import_module(backend_name)
                except ImportError:
                    self.logger.error('cannot find backend module %s',
                                      backend_name)
                    sys.exit()
            # Everything except the backend name is a constructor argument.
            options = {key: value for key, value in storage.items()
                       if key != 'backend'}
            self.storage = factory(**options)
Source: hellock/icrawler on GitHub, file icrawler/storage/filesystem.py (view on GitHub)
# -*- coding: utf-8 -*-

import os
import os.path as osp

import six

from icrawler.storage import BaseStorage


class FileSystem(BaseStorage):
    """Use filesystem as storage backend.

    The id is filename and data is stored as text files or binary files.
    Every id is resolved relative to ``root_dir``.
    """

    def __init__(self, root_dir):
        """Remember the base directory that item ids are joined onto.

        Args:
            root_dir (str): base directory of the storage.
        """
        self.root_dir = root_dir

    def write(self, id, data):
        """Prepare the destination of item ``id`` under ``root_dir``.

        The target path is ``root_dir`` joined with ``id``; its parent
        directory is created when it does not exist yet.

        NOTE(review): the visible part of this method never uses ``data``;
        the rest of the body appears to be cut off in this excerpt.

        Args:
            id (str): file name of the item, relative to ``root_dir``.
            data: content of the item (unused in the visible part).
        """
        filepath = osp.join(self.root_dir, id)
        folder = osp.dirname(filepath)
        if not osp.isdir(folder):
            try:
                os.makedirs(folder)
            except OSError:
                # Failure to create the directory is ignored on purpose,
                # presumably to tolerate another worker creating it first
                # (TODO confirm).
                pass