@classmethod
def from_settings(cls, settings, crawler=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            mw = create_instance(mwcls, settings, crawler)
            middlewares.append(mw)
            enabled.append(clspath)
        except NotConfigured as e:
            # A middleware opts out by raising NotConfigured; log the
            # reason if it supplied one.
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s",
                               {'clsname': clsname, 'eargs': e.args[0]},
                               extra={'crawler': crawler})

    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                {'componentname': cls.component_name,
                 'enabledlist': pprint.pformat(enabled)},
                extra={'crawler': crawler})
    return cls(*middlewares)
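
Every snippet on this page funnels a dotted path from settings into load_object. For reference, a minimal sketch of such a dotted-path loader (not Scrapy's exact implementation, which lives in scrapy.utils.misc and adds richer error reporting) looks like this:

from importlib import import_module

def load_object(path):
    # Split 'pkg.module.Name' into module path and attribute name,
    # import the module, and return the named attribute.
    module_path, _, name = path.rpartition('.')
    module = import_module(module_path)
    return getattr(module, name)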
def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
    logging.info('Slybot %s Spider', slybot.__version__)
    if is_zipfile(datadir):
        tempdir = tempfile.mkdtemp(prefix='slybot-')
        ZipFile(datadir).extractall(tempdir)
        atexit.register(shutil.rmtree, tempdir)
        datadir = tempdir
    if settings is None:
        settings = get_project_settings()
    self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
    self._specs = open_project_from_dir(datadir)
    settings = settings.copy()
    settings.frozen = False
    settings.set('LOADED_PLUGINS', load_plugins(settings))
    self.settings = settings
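
The zip handling at the top of that constructor is a self-contained idiom worth isolating: unpack an archive to a temporary directory and register its removal with atexit, so callers never deal with cleanup. A standalone sketch (the function name is mine):

import atexit
import shutil
import tempfile
from zipfile import ZipFile, is_zipfile

def as_directory(path):
    # Return a directory to read from; if given a zip archive, extract
    # it to a temp dir that is removed automatically at process exit.
    if is_zipfile(path):
        tempdir = tempfile.mkdtemp(prefix='slybot-')
        ZipFile(path).extractall(tempdir)
        atexit.register(shutil.rmtree, tempdir)
        return tempdir
    return path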
def __init__(self, settings):
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    self._pool._factory.noisy = False
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._contextFactory = self._contextFactoryClass()
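
Because the TLS context factory is resolved through load_object, a project can swap in its own by pointing the setting at any importable class. A settings.py example (the custom class path below is a placeholder, not a real module):

# Scrapy's default is scrapy.core.downloader.contextfactory.ScrapyClientContextFactory;
# 'myproject.context.CustomContextFactory' is a hypothetical replacement.
DOWNLOADER_CLIENTCONTEXTFACTORY = 'myproject.context.CustomContextFactory'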
def get_redis_cluster_from_settings(settings):
    params = defaults.REDIS_CLUSTER_PARAMS.copy()
    params.update(settings.getdict('REDIS_CLUSTER_PARAMS'))
    # XXX: Deprecate REDIS_CLUSTER* settings.
    for setting_name, name in REDIS_CLUSTER_SETTINGS_PARAMS_MAP.items():
        val = settings.get(setting_name)
        if val:
            params[name] = val
    # Allow ``redis_cluster_cls`` to be a path to a class.
    if isinstance(params.get('redis_cluster_cls'), six.string_types):
        params['redis_cluster_cls'] = load_object(params['redis_cluster_cls'])
    return get_redis_cluster(**params)
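
The loop relies on a module-level map from individual Scrapy settings to client keyword arguments. Its contents are not shown here; by analogy with scrapy-redis's REDIS_SETTINGS_PARAMS_MAP, it plausibly looks something like this (hypothetical shape):

# Setting name -> keyword argument for the redis cluster client.
REDIS_CLUSTER_SETTINGS_PARAMS_MAP = {
    'REDIS_CLUSTER_URL': 'url',
    'REDIS_CLUSTER_HOST': 'host',
    'REDIS_CLUSTER_PORT': 'port',
}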
def __init__(self, spidercls, settings=None):
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {'settings': pprint.pformat(d)})

    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
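
This constructor is rarely called directly; it is normally reached through CrawlerProcess (or CrawlerRunner), which builds the Crawler for you. A minimal sketch, with a placeholder spider:

import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider(scrapy.Spider):
    # Placeholder spider; name and start_urls are illustrative.
    name = 'example'
    start_urls = ['https://example.com']

process = CrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl(MySpider)   # internally constructs Crawler(MySpider, settings)
process.start()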
def __init__(self):
    self.classes = {}
    self.mimetypes = MimeTypes()
    mimedata = get_data('scrapy', 'mime.types').decode('utf8')
    self.mimetypes.readfp(StringIO(mimedata))
    for mimetype, cls in six.iteritems(self.CLASSES):
        self.classes[mimetype] = load_object(cls)
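
The CLASSES attribute iterated here maps MIME types to dotted paths of Response subclasses, each resolved once at startup via load_object. In Scrapy it contains entries along these lines (excerpt, not the full mapping):

CLASSES = {
    'text/html': 'scrapy.http.HtmlResponse',
    'application/xml': 'scrapy.http.XmlResponse',
    'application/json': 'scrapy.http.TextResponse',
}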
def configure(self, *args, **opts):
    self._install_signals()
    extensions.load()
    log.msg("Enabled extensions: %s" % ", ".join(extensions.enabled.iterkeys()))
    scheduler = load_object(settings['SCHEDULER'])()
    scrapyengine.configure(scheduler=scheduler)
    self.prioritizer_class = load_object(settings['PRIORITIZER'])
    requests = self._parse_args(args)
    self.priorities = self.prioritizer_class(requests.keys())
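
Only the prioritizer's constructor contract is visible here: the class named by the PRIORITIZER setting is instantiated with the keys of the parsed requests. A hypothetical class satisfying that contract (name and behavior are mine, for illustration only):

class FifoPrioritizer(object):
    '''Assign priorities in submission order: lower index, higher priority.'''
    def __init__(self, keys):
        self.order = dict((key, i) for i, key in enumerate(keys))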
def load(self, task):
    '''
    Stores the task for the spider, loads the task's configuration
    object, and applies its attributes to the spider.
    '''
    self.task = task
    configuration = None
    if settings.get('TASKS'):
        available_tasks = settings.get('TASKS')
        if task.name in available_tasks:
            try:
                configuration = load_object(available_tasks[task.name])
            except Exception as e:
                log.msg('Could not load configuration for task %s' % task.name, level=log.ERROR)
                log.msg(str(e), level=log.DEBUG, domain='tripcentral.ca')
            if configuration is None:
                # Loading failed; leave the spider unconfigured.
                return
            configuration = configuration(task, self)
            if hasattr(configuration, 'start_urls'):
                self.start_urls = configuration.start_urls
            if hasattr(configuration, 'rules'):
                self.rules = configuration.rules
            if hasattr(configuration, 'parse_start_url'):
                self.parse_start_url = configuration.parse_start_url
            self.start_urls = self.get_start_urls()
            self._compile_rules()
        else:
            log.msg('%s is not defined in settings.TASKS' % task.name, level=log.ERROR, domain=task.domain)
    else:
        log.msg('settings.TASKS is not defined', level=log.ERROR, domain=task.domain)
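
The TASKS setting consulted above is evidently a mapping from task names to dotted paths of configuration classes: each path is passed to load_object and the resulting class is called with (task, spider). Its shape would be something like this (names and paths are hypothetical):

TASKS = {
    'hotel_listings': 'myproject.tasks.HotelListingsConfiguration',
    'reviews': 'myproject.tasks.ReviewsConfiguration',
}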
def load(self):
    """
    Load the pipeline stages defined in the settings module.
    """
    for stage in settings.getlist('ITEM_PIPELINES') or ():
        cls = load_object(stage)
        if cls:
            try:
                stageinstance = cls()
                self.pipeline.append(stageinstance)
            except NotConfigured:
                # The stage opted out; skip it.
                pass
    log.msg("Enabled item pipelines: %s" % ", ".join([type(p).__name__ for p in self.pipeline]),
            level=log.DEBUG)
    self.loaded = True
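
This is the older list-style API, where ITEM_PIPELINES is read with getlist; current Scrapy instead uses a dict of dotted path to order number. A settings example for the list form consumed above (paths are illustrative):

ITEM_PIPELINES = [
    'myproject.pipelines.ValidationPipeline',
    'myproject.pipelines.StoragePipeline',
]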
def application(config):
    # The enclosing function and the Application object are assumed here,
    # following the structure of scrapyd's app.py.
    app = Application("Scrapyd")

    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '127.0.0.1')
    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    eggstorage = FilesystemEggStorage(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)

    app.setComponent(IPoller, poller)
    app.setComponent(IEggStorage, eggstorage)
    app.setComponent(ISpiderScheduler, scheduler)
    app.setComponent(IEnvironment, environment)

    laupath = config.get('launcher', 'scrapydart.launcher.Launcher')
    laucls = load_object(laupath)
    launcher = laucls(config, app)

    webpath = config.get('webroot', 'scrapydart.website.Root')
    webcls = load_object(webpath)

    timer = TimerService(poll_interval, poller.poll)
    webservice = TCPServer(http_port, server.Site(webcls(config, app)), interface=bind_address)
    log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)
    return app
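
A Twisted Application built this way is meant to be launched by twistd rather than run directly. A hypothetical launcher module (the scrapydart module paths are assumptions inferred from the config defaults above):

# txapp.py -- hypothetical glue; run with: twistd -ny txapp.py
from scrapydart.config import Config
from scrapydart.app import application as build_application

application = build_application(Config())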