How to use the scrapy.utils.misc.load_object function in Scrapy

To help you get started, we’ve selected a few load_object examples based on popular ways the function is used in public projects.
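
load_object takes the absolute import path of an object (a class, a function, or any other module-level name) and returns that object. A minimal sketch of the behaviour, using paths that ship with Scrapy itself:

from scrapy.utils.misc import load_object

# Resolve a class from its absolute import path.
spider_cls = load_object('scrapy.spiders.Spider')
assert spider_cls.__name__ == 'Spider'

# Any importable module-level object works the same way, including functions.
same_function = load_object('scrapy.utils.misc.load_object')
assert same_function is load_object

A path without a dot raises ValueError, and a path whose final attribute does not exist raises NameError, so misconfigured settings fail loudly at startup.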

github scrapy / scrapy / scrapy / middleware.py
def from_settings(cls, settings, crawler=None):
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        enabled = []
        for clspath in mwlist:
            try:
                mwcls = load_object(clspath)
                mw = create_instance(mwcls, settings, crawler)
                middlewares.append(mw)
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s",
                                   {'clsname': clsname, 'eargs': e.args[0]},
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                    {'componentname': cls.component_name,
                     'enabledlist': pprint.pformat(enabled)},
                    extra={'crawler': crawler})
        return cls(*middlewares)
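
The clspath values that from_settings iterates over come from the middleware settings, where components are declared as dotted paths. A sketch of such an entry; the middleware path below is hypothetical:

# settings.py (sketch; the path is a hypothetical project middleware)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomUserAgentMiddleware': 543,
}

load_object turns each key into a class and create_instance builds it with the settings and crawler, so a component only has to be importable at that path; raising NotConfigured from its constructor or from_crawler disables it, as the except branch above shows.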

github scrapinghub / portia / slybot / slybot / spidermanager.py
def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
        logging.info('Slybot %s Spider', slybot.__version__)
        if is_zipfile(datadir):
            tempdir = tempfile.mkdtemp(prefix='slybot-')
            ZipFile(datadir).extractall(tempdir)
            atexit.register(shutil.rmtree, tempdir)
            datadir = tempdir

        if settings is None:
            settings = get_project_settings()
        self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
        self._specs = open_project_from_dir(datadir)
        settings = settings.copy()
        settings.frozen = False
        settings.set('LOADED_PLUGINS', load_plugins(settings))
        self.settings = settings
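
The spider_cls handling above is a common variant of the pattern: accept either nothing or a dotted path, and normalize the string with load_object. A minimal sketch, with the path as a hypothetical stand-in:

from scrapy.utils.misc import load_object
from scrapy.spiders import Spider

spider_cls_path = 'myproject.spiders.ProductSpider'  # hypothetical path
# Fall back to a default class when no path is configured, just as the excerpt
# falls back to IblSpider.
spider_cls = load_object(spider_cls_path) if spider_cls_path else Spider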

github scrapy / scrapy / scrapy / core / downloader / handlers / http11.py
def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
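
DOWNLOADER_CLIENTCONTEXTFACTORY is a standard Scrapy setting that holds a dotted path, which is why the handler can resolve and instantiate the TLS context factory in two lines. The same resolution can be sketched outside the handler with the built-in default:

from scrapy.settings import Settings
from scrapy.utils.misc import load_object

settings = Settings()  # populated with Scrapy's default settings
# Default path: 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory'
context_factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
context_factory = context_factory_cls()

Overriding the setting with another dotted path swaps the context factory without touching the handler code.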

github leffss / ScrapyRedisBloomFilterBlockCluster / scrapy_redis_bloomfilter_block_cluster / connection.py
def get_redis_cluster_from_settings(settings):
    params = defaults.REDIS_CLUSTER_PARAMS.copy()
    params.update(settings.getdict('REDIS_CLUSTER_PARAMS'))
    # XXX: Deprecate REDIS_CLUSTER* settings.
    for setting_name, name in REDIS_CLUSTER_SETTINGS_PARAMS_MAP.items():
        val = settings.get(setting_name)
        if val:
            params[name] = val

    # Allow ``redis_cluster_cls`` to be a path to a class.
    if isinstance(params.get('redis_cluster_cls'), six.string_types):
        params['redis_cluster_cls'] = load_object(params['redis_cluster_cls'])

    return get_redis_cluster(**params)
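
The redis_cluster_cls handling is the "class or dotted path" idiom: callers may pass either a class object or its import path, and load_object normalizes the string case. A generic sketch of the idiom (the dict key and the stand-in class are illustrative, not part of this library's API):

from scrapy.utils.misc import load_object

params = {'cls': 'collections.OrderedDict'}  # the value may also be a class object
if isinstance(params['cls'], str):
    params['cls'] = load_object(params['cls'])

The excerpt tests against six.string_types because it still supports Python 2; on Python 3 alone a plain isinstance(..., str) check is enough.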

github scrapy / scrapy / scrapy / crawler.py
def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
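
The STATS_CLASS line is the pattern in its most compact form: resolve the class and call it in one expression, passing the crawler to its constructor. Resolving the same setting outside the Crawler looks like this, using the shipped default:

from scrapy.settings import Settings
from scrapy.utils.misc import load_object

settings = Settings()
# Default path: 'scrapy.statscollectors.MemoryStatsCollector'
stats_cls = load_object(settings['STATS_CLASS'])

A custom stats collector therefore only needs to live at an importable path and accept a crawler argument in its constructor.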

github scrapy / scrapy / scrapy / responsetypes.py
def __init__(self):
        self.classes = {}
        self.mimetypes = MimeTypes()
        mimedata = get_data('scrapy', 'mime.types').decode('utf8')
        self.mimetypes.readfp(StringIO(mimedata))
        for mimetype, cls in six.iteritems(self.CLASSES):
            self.classes[mimetype] = load_object(cls)
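
The mapping built here is what lets Scrapy choose a Response subclass for a given MIME type; the populated instance is exposed as scrapy.responsetypes.responsetypes. A small sketch of querying it:

from scrapy.responsetypes import responsetypes

# Every class in this mapping was resolved from a dotted path by load_object.
html_cls = responsetypes.from_mimetype('text/html')
print(html_cls.__name__)  # 'HtmlResponse' in the default mapping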

github scrapy / scrapy / scrapy / trunk / scrapy / core / manager.py
def configure(self, *args, **opts):
        self._install_signals()

        extensions.load()
        log.msg("Enabled extensions: %s" % ", ".join(extensions.enabled.iterkeys()))

        scheduler = load_object(settings['SCHEDULER'])()

        scrapyengine.configure(scheduler=scheduler)

        self.prioritizer_class = load_object(settings['PRIORITIZER'])

        requests = self._parse_args(args)
        self.priorities = self.prioritizer_class(requests.keys())
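
This excerpt is from Scrapy's old trunk, and the PRIORITIZER setting does not exist in current releases, but SCHEDULER is still a live setting resolved with load_object, so the same override mechanism applies today. A sketch of pointing it at a custom class; the path is hypothetical:

# settings.py (sketch)
SCHEDULER = 'myproject.scheduling.PriorityScheduler'  # hypothetical scheduler class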

github taras / djangoscraper / djangoscraper / spiders / taskspider.py
def load(self, task):
        '''
        Gets task for the spider, loads the tasks's module code and applies code
        from configuration to the spider.
        '''
        self.task = task
        configuration = None
        if settings.get('TASKS'):
            available_tasks = settings.get('TASKS')
            if available_tasks.has_key(task.name):
                try:
                    configuration = load_object(available_tasks[task.name])
                except Exception, (ErrorMessage):
                    log.msg('Could not load configuration for task %s' % task.name, level=log.ERROR)
                    log.msg(ErrorMessage, level=log.DEBUG, domain='tripcentral.ca')
                configuration = configuration(task, self)
                if hasattr(configuration, 'start_urls'):
                    setattr(self, 'start_urls', configuration.start_urls)
                if hasattr(configuration, 'rules'):
                    setattr(self, 'rules', configuration.rules)
                if hasattr(configuration, 'parse_start_url'):
                    setattr(self, 'parse_start_url', configuration.parse_start_url)
                self.start_urls = self.get_start_urls()
                self._compile_rules()                
            else:
                log.msg('%s is not defined in settings.TASKS' % task.name, level=log.ERROR, domain=task.domain )
        else:
            log.msg('settings.TASKS is not defined', level=log.ERROR, domain=task.domain )
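
The djangoscraper code above is Python 2 (note has_key and the except Exception, (ErrorMessage) syntax), and its TASKS setting is specific to that project. A hedged Python 3 sketch of the same idea, with every name hypothetical: task names map to dotted paths of configuration classes, and load_object resolves the one that matches.

from scrapy.utils.misc import load_object

# Hypothetical project setting: task name -> dotted path of a configuration class.
TASKS = {
    'tripcentral': 'myproject.tasks.TripcentralConfiguration',
}

def load_task_configuration(task_name):
    path = TASKS.get(task_name)
    if path is None:
        raise KeyError('%s is not defined in TASKS' % task_name)
    return load_object(path)  # the caller instantiates it with the task and spider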

github scrapy / scrapy / scrapy / item / pipeline.py
def load(self):
        """
        Load pipelines stages defined in settings module
        """
        for stage in settings.getlist('ITEM_PIPELINES') or ():
            cls = load_object(stage)
            if cls:
                try:
                    stageinstance = cls()
                    self.pipeline.append(stageinstance)
                except NotConfigured:
                    pass
        log.msg("Enabled item pipelines: %s" % ", ".join([type(p).__name__ for p in self.pipeline]),
            level=log.DEBUG)
        self.loaded = True
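
This pipeline loader also predates current Scrapy: ITEM_PIPELINES is now a dict that maps dotted paths to integer priorities rather than a plain list, but each key still goes through load_object. A sketch of the modern setting, with a hypothetical pipeline path:

# settings.py (sketch)
ITEM_PIPELINES = {
    'myproject.pipelines.PriceValidationPipeline': 300,  # hypothetical path
}

As in the middleware example at the top, a pipeline can raise NotConfigured from its constructor to disable itself without aborting the crawl.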

github dequinns / ScrapydArt / build / lib / scrapydart / app.py
http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '127.0.0.1')
    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    eggstorage = FilesystemEggStorage(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)

    app.setComponent(IPoller, poller)
    app.setComponent(IEggStorage, eggstorage)
    app.setComponent(ISpiderScheduler, scheduler)
    app.setComponent(IEnvironment, environment)

    laupath = config.get('launcher', 'scrapydart.launcher.Launcher')
    laucls = load_object(laupath)
    launcher = laucls(config, app)

    webpath = config.get('webroot', 'scrapydart.website.Root')
    webcls = load_object(webpath)

    timer = TimerService(poll_interval, poller.poll)
    webservice = TCPServer(http_port, server.Site(webcls(config, app)), interface=bind_address)
    log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)

    return app
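
Scrapyd, and this ScrapydArt fork of it, reads the launcher and webroot config options as dotted paths, which is what makes both components swappable from the config file. A sketch of resolving the defaults shown above; the paths resolve only if scrapydart is installed, and a custom component would be a hypothetical path such as 'myproject.webui.Root':

from scrapy.utils.misc import load_object

# Defaults from the excerpt; point the 'launcher' or 'webroot' option at a
# different dotted path to swap the component without editing this code.
launcher_cls = load_object('scrapydart.launcher.Launcher')
web_root_cls = load_object('scrapydart.website.Root')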