How to use the memorious.settings module in memorious

To help you get started, we’ve selected a few examples showing how memorious.settings is used in public projects.

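All of the excerpts below read or write attributes of the memorious.settings module, which collects the framework's configuration; judging by the log messages in the first example, its values are drawn from MEMORIOUS_*-prefixed environment variables when the module is first imported. A minimal sketch of inspecting it, using only attribute names that appear in the excerpts:

from memorious import settings

# Attribute names taken from the excerpts below; the values depend on the
# MEMORIOUS_* environment variables present when the module was imported.
print(settings.DEBUG)
print(settings.INCREMENTAL)
print(settings.ALEPH_HOST)  # None unless $MEMORIOUS_ALEPH_HOST is set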

github.com/alephdata/memorious · memorious/operations/aleph.py
from alephclient.api import AlephAPI
from memorious import settings

def aleph_emit(context, data):
    if not settings.ALEPH_HOST:
        context.log.warning("No $MEMORIOUS_ALEPH_HOST, skipping upload...")
        return
    if not settings.ALEPH_API_KEY:
        context.log.warning("No $MEMORIOUS_ALEPH_API_KEY, skipping upload...")
        return

    session_id = 'memorious:%s' % context.crawler.name
    api = AlephAPI(settings.ALEPH_HOST, settings.ALEPH_API_KEY,
                   session_id=session_id)
    collection_id = get_collection_id(context, api)  # helper defined in the same module
    if collection_id is None:
        context.log.warning("Cannot get aleph collection.")
        return

    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return

    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        # ...
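aleph_emit bails out unless both Aleph settings are present, so the usual way to enable the upload is via the environment before memorious is imported. A minimal sketch with hypothetical values:

import os

# Hypothetical values; memorious.settings reads its environment when it is
# first imported, so export these before importing anything from memorious.
os.environ["MEMORIOUS_ALEPH_HOST"] = "https://aleph.example.org"
os.environ["MEMORIOUS_ALEPH_API_KEY"] = "0123-example-key"

from memorious import settings
assert settings.ALEPH_HOST == "https://aleph.example.org"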
github.com/alephdata/memorious · memorious/cli.py
def cli(debug, cache, incremental):
    """Crawler framework for documents and structured scrapers."""
    settings.HTTP_CACHE = cache
    settings.INCREMENTAL = incremental
    settings.DEBUG = debug
    if settings.DEBUG:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    init_memorious()
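The CLI does nothing more exotic than copying its flags onto the settings module before initialising the framework, so a script can configure memorious the same way. A sketch, assuming init_memorious is importable from memorious.core as the excerpt suggests:

import logging

from memorious import settings
from memorious.core import init_memorious  # assumed import path

settings.HTTP_CACHE = True    # serve repeated requests from the local cache
settings.INCREMENTAL = True   # skip work completed by previous runs
settings.DEBUG = False
logging.basicConfig(level=logging.INFO)
init_memorious()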
github.com/alephdata/memorious · memorious/core.py
def load_manager():
    if not hasattr(settings, '_manager'):
        from memorious.logic.manager import CrawlerManager
        settings._manager = CrawlerManager()
        if settings.CONFIG_PATH:
            settings._manager.load_path(settings.CONFIG_PATH)
    return settings._manager
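Caching the manager as an attribute of the settings module makes load_manager a process-wide singleton: repeated calls are cheap and every caller shares one CrawlerManager. A sketch of that behaviour:

from memorious.core import load_manager

first = load_manager()
second = load_manager()
# The second call returns the object cached on settings._manager above.
assert first is second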
github.com/alephdata/memorious · memorious/logic/signal_handlers.py
def log_operation_start(context):
    if settings.REDIS_HOST:
        # redis_pool: a shared connection pool created elsewhere in memorious.
        r = redis.Redis(connection_pool=redis_pool)
        crawler_name = context.crawler.name
        stage_name = context.stage.name
        r.incr(crawler_name)
        r.incr(crawler_name + ":" + stage_name)
        r.incr(crawler_name + ":total_ops")
        # Store the timestamp as a string; Redis values must be str/bytes/numbers.
        r.set(crawler_name + ":last_run", datetime.datetime.now().isoformat())
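The handler maintains plain Redis counters keyed by crawler and stage name, so progress can be read back with any Redis client. A sketch, using a hypothetical crawler name example_crawler:

import redis

r = redis.Redis()  # assumes the same Redis instance the handler writes to
print(r.get("example_crawler:total_ops"))  # operations across all stages
print(r.get("example_crawler:last_run"))   # timestamp set by the handler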
github.com/alephdata/memorious · memorious/logic/http.py
def reset(self):
    self.session = Session()
    self.session.headers['User-Agent'] = settings.USER_AGENT
    if self.context.crawler.stealthy:
        self.session.headers['User-Agent'] = UserAgent().random()
    return self.session
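Each reset() discards cookies and connection state by constructing a fresh requests.Session, with settings.USER_AGENT as the default identity; crawlers configured as stealthy (see the stealthy flag in the next excerpt) substitute a randomised User-Agent instead, making repeated sessions harder to fingerprint.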
github.com/alephdata/memorious · memorious/logic/crawler.py
    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()
            self.config = yaml.safe_load(self.config_yaml)

        self.name = os.path.basename(source_file)
        self.name = self.config.get('name', self.name)
        self.description = self.config.get('description', self.name)
        self.category = self.config.get('category', 'scrape')
        self.schedule = self.config.get('schedule')
        self.disabled = self.config.get('disabled', False)
        self.init_stage = self.config.get('init', 'init')
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get('delay', 0))
        # "expire" is given in days; convert to seconds (86400 per day).
        self.expire = int(self.config.get('expire', settings.EXPIRE)) * 86400
        self.stealthy = self.config.get('stealthy', False)
        self.aggregator_config = self.config.get('aggregator', {})

        self.stages = {}
        for name, stage in self.config.get('pipeline', {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)
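The constructor effectively documents the recognised top-level keys of a crawler YAML file. A minimal sketch of a config exercising them; the init stage's method and params are illustrative assumptions, not taken from the excerpt, and the schedule value assumes "weekly" is among Crawler.SCHEDULES:

import yaml

config = yaml.safe_load("""
name: example_crawler
description: An illustrative crawler
category: scrape
schedule: weekly
delay: 2          # seconds between requests
expire: 30        # days until cached material expires
stealthy: true
init: init
pipeline:
  init:
    method: seed
    params:
      urls:
        - https://example.org
""")
print(sorted(config["pipeline"]))

memorious itself loads files like this from the directory named by settings.CONFIG_PATH, via load_manager above.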
github.com/alephdata/memorious · memorious/core.py
def load_datastore():
    if not hasattr(settings, '_datastore'):
        if not settings.DATASTORE_URI:
            raise RuntimeError("No $MEMORIOUS_DATASTORE_URI.")
        # do not pool connections for the datastore
        engine_kwargs = {'poolclass': NullPool}
        settings._datastore = dataset.connect(settings.DATASTORE_URI,
                                              engine_kwargs=engine_kwargs)
        # Use bigint to store integers by default
        settings._datastore.types.integer = settings._datastore.types.bigint
    return settings._datastore
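The datastore is an ordinary dataset database handle, so once $MEMORIOUS_DATASTORE_URI is configured, the usual dataset table operations apply. A sketch with a hypothetical SQLite URI:

import os

# Hypothetical URI; it must be in the environment before memorious.settings
# is first imported.
os.environ["MEMORIOUS_DATASTORE_URI"] = "sqlite:///datastore.sqlite3"

from memorious.core import load_datastore

db = load_datastore()
table = db["example_articles"]  # dataset creates tables on first use
table.insert({"url": "https://example.org", "title": "Example"})
print(table.count())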
github.com/alephdata/memorious · memorious/worker.py
def periodic(self):
    if self.scheduler.check() and not settings.DEBUG:
        log.info("Running scheduled crawlers ...")
        self.scheduler.update()
        manager.run_scheduled()
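On each tick the worker consults the scheduler and, outside debug mode, re-queues whichever crawlers are due according to the interval derived from Crawler.SCHEDULES in the crawler excerpt above.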