How to use the gramex.data module in gramex

To help you get started, we’ve selected a few gramex.data examples, based on popular ways it is used in public projects.
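
Most of these snippets revolve around gramex.data.filter, which accepts a pandas DataFrame, a file path, or a SQLAlchemy URL, plus a FormHandler-style args dict of lists. A minimal sketch, assuming a made-up DataFrame and column names:

import pandas as pd
import gramex.data

# Hypothetical data: any DataFrame, file path or SQLAlchemy URL works as the source
sales = pd.DataFrame({'city': ['Oslo', 'Pune', 'Oslo'], 'sales': [10, 20, 30]})

# args mirrors URL query parameters: every value is a list, as in handler.args
result = gramex.data.filter(sales, args={'city': ['Oslo'], '_sort': ['-sales']})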

From gramener/gramex: testlib/test_data.py
def test_filtercols_limit(self):
        # ?_c=District&_limit=200 returns 200 values. (_limit= defaults to 100 values)
        cols = ['District']
        limit = 200
        args = {'_c': cols, '_limit': [limit]}
        result = gramex.data.filtercols(args=args, **self.urls['census'])
        self.check_keys(result, cols)
        for key, val in result.items():
            eqframe(val, self.unique_of(self.census, key).head(limit))
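
Outside the test suite, the same call works directly on a file or database URL: filtercols returns a dict mapping each requested column to a DataFrame of its unique values. A sketch, assuming a hypothetical census.csv with a District column:

import gramex.data

uniques = gramex.data.filtercols(
    url='census.csv',                            # hypothetical file
    args={'_c': ['District'], '_limit': [200]},  # _limit defaults to 100 values
)
districts = uniques['District']                  # unique District values
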
From gramener/gramex: tests/test_formhandler.py
def test_dir(self):
        def check(expected, **params):
            actual = pd.DataFrame(self.get('/formhandler/dir', params=params).json())
            expected.index = actual.index
            afe(actual, expected, check_like=True)

        for path in ('dir/subdir', 'dir/', 'subapp'):
            df = gramex.data.dirstat(os.path.join(folder, path))
            check(df, root=path)
            check(df.sort_values('size'), root=path, _sort='size')
            check(df.sort_values('name', ascending=False), root=path, _sort='-name')
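
gramex.data.dirstat walks a directory and returns a DataFrame of its contents, with columns such as name and size (the ones the test sorts on above). A sketch, using a placeholder path:

import gramex.data

stats = gramex.data.dirstat('/var/log')                        # placeholder directory
largest = stats.sort_values('size', ascending=False).head(10)
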
From gramener/gramex: testlib/test_data.py
def check_insert_db(self, url, dbname):
        self.db.add(dbname)
        rows = self.insert_rows.copy()
        rows['index'] = [1, 2]  # create a primary key
        inserted = gramex.data.insert(url, args=rows, table='test_insert', id='index')
        eq_(inserted, 2)
        # Read the table back and verify the inserted rows
        actual = gramex.data.filter(url, table='test_insert')
        expected = pd.DataFrame(rows)
        for df in [actual, expected]:
            df['sales'] = df['sales'].astype(float)
        afe(actual, expected, check_like=True)
        # Check if it created a primary key
        engine = sa.create_engine(url)
        insp = sa.inspect(engine)
        ok_('index' in insp.get_pk_constraint('test_insert')['constrained_columns'])
        # Inserting duplicate keys raises an Exception
        with assert_raises(sa.exc.IntegrityError):
            gramex.data.insert(url, args=rows, table='test_insert', id='index')
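
The same insert-then-filter round trip works against any SQLAlchemy URL or file that gramex.data understands. A minimal sketch, assuming a throwaway SQLite database and made-up columns:

import gramex.data

url = 'sqlite:///test_insert.db'                  # hypothetical database
rows = {'index': [1, 2], 'sales': [100.0, 200.0]}

# id='index' asks gramex to create the table with 'index' as the primary key
count = gramex.data.insert(url, args=rows, table='test_insert', id='index')
assert count == 2

# Read the inserted rows back from the same table
actual = gramex.data.filter(url, table='test_insert')
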
From gramener/gramex: gramex/apps/update/gramexupdate.py
def consolidate():
    '''Consolidate log data into a database'''
    log_file = variables['LOGFILE']
    data_file = variables['LOGDB']

    # Connect to DB and initialize
    data_url = 'sqlite:///' + data_file
    engine = sa.create_engine(data_url)
    engine.execute('''CREATE TABLE IF NOT EXISTS logs (
        src TEXT, time INT, event TEXT, ip TEXT,
        system TEXT, node TEXT, release TEXT, version TEXT, machine TEXT, processor TEXT,
        pid NUM, args TEXT, cwd TEXT, dir TEXT,
        date TEXT
    )''')

    merged = set(gramex.data.filter(url=data_url, query='SELECT DISTINCT src FROM logs')['src'])

    def merge(path, force=False):
        '''Merge log file from path into database'''
        src = os.path.split(path)[-1]
        if src in merged and not force:
            return
        app_log.info('consolidating %s', src)

        result = []
        for line in io.open(path, 'r', encoding='utf-8'):
            row = json.loads(line)
            row['src'] = src

            # uname is a list. Convert into system data
            (row['system'], row['node'], row['release'], row['version'],
             row['machine'], row['processor']) = row.pop('uname')
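
The query= keyword used above runs raw SQL against the database URL and returns the result as a DataFrame, which is handy for ad-hoc aggregations that don't map to a single table= argument. A sketch reusing the same SQLite pattern, with a placeholder file name:

import gramex.data

data_url = 'sqlite:///logdata.db'                 # placeholder path
events = gramex.data.filter(
    url=data_url,
    query='SELECT event, COUNT(*) AS n FROM logs GROUP BY event',
)
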
From gramener/gramex: gramex/ml.py
        raise ValueError('cache= must be a FormHandler dict config, not %r' % cache)

    # Store data in cache with fixed columns: source, target, q, t
    result = pd.DataFrame(columns=['source', 'target', 'q', 't'])
    if not q:
        return result
    original_q = q

    # Fetch from cache, if any
    if cache:
        try:
            args = {'q': q, 'target': [target] * len(q)}
            if source:
                args['source'] = [source] * len(q)
            with _translate_cache_lock:
                result = gramex.data.filter(args=args, **cache)
        except Exception:
            app_log.exception('Cannot query %r in translate cache: %r', args, dict(cache))
        # Remove already-cached results from q
        q = [v for v in q if v not in set(result.get('q', []))]

    if len(q):
        new_data = translate_api[api](q, source, target, key)
        if new_data is not None:
            result = result.append(pd.DataFrame(new_data), sort=False)
            if cache:
                with _translate_cache_lock:
                    gramex.data.insert(id=['source', 'target', 'q'], args=new_data, **cache)

    # Sort results by q
    result['order'] = result['q'].map(original_q.index)
    result.sort_values('order', inplace=True)
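
The cache= argument above is just a FormHandler-style dict that gramex.data.insert writes to and gramex.data.filter reads from, keyed on source, target and q. A hedged sketch of how such a cache might be populated and queried, with a placeholder database and table:

import gramex.data

cache = {'url': 'sqlite:///translate.db', 'table': 'translations'}   # placeholder config

# Store a few translations, using (source, target, q) as the key
new_data = {'source': ['en', 'en'], 'target': ['de', 'de'],
            'q': ['hello', 'world'], 't': ['hallo', 'welt']}
gramex.data.insert(id=['source', 'target', 'q'], args=new_data, **cache)

# Later look-ups filter the same cache by q and target
cached = gramex.data.filter(args={'q': ['hello'], 'target': ['de']}, **cache)
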
From gramener/gramex: gramex/handlers/modelhandler.py
def put(self, *path_args, **path_kwargs):
        '''
        A request to /model/name/ with no params creates a blank model.
        A request to /model/name/ with args interprets them as model parameters.
        Set Model-Retrain: true in the headers to train a model from scratch or to extend it.
        To extend a trained model, send Model-Retrain in the headers without updating the parameters.
        A request to /model/name/data with args updates the training data;
        this currently does not work on DataFrames due to a gramex.data bug.
        '''
        try:
            model = gramex.cache.open(self.pickle_file_path, gramex.ml.load)
        except EnvironmentError: # noqa
            model = gramex.ml.Classifier(**self.request_body)
        if self.get_data_flag():
            file_kwargs = self.listify(model.input + [model.output] + ['id'])
            gramex.data.update(model.url, args=file_kwargs, id=file_kwargs['id'])
        else:
            if not self._train(model):
                model.save(self.pickle_file_path)
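
gramex.data.update follows the same calling convention as insert: args is a dict of lists, and id names the key column(s) used to match existing rows. A minimal sketch against a hypothetical CSV file:

import gramex.data

# Update rows in data.csv where id == 1, setting label to 'spam'
args = {'id': [1], 'label': ['spam']}
updated = gramex.data.update('data.csv', args=args, id=['id'])
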
From gramener/gramex: gramex/apps/logviewer/logviewer.py
def prepare_where(query, args, columns):
    '''prepare where clause'''
    wheres = []
    for key, vals in args.items():
        col, agg, op = gramex.data._filter_col(key, columns)
        if col not in columns:
            continue
        if op == '':
            wheres.append('"{}" IN ("{}")'.format(col, '", "'.join(vals)))
        elif op == '!':
            wheres.append('"{}" NOT IN ("{}")'.format(col, '", "'.join(vals)))
        elif op == '>':
            wheres.append('"{}" > "{}"'.format(col, min(vals)))
        elif op == '>~':
            wheres.append('"{}" >= "{}"'.format(col, min(vals)))
        elif op == '<':
            wheres.append('"{}" < "{}"'.format(col, max(vals)))
        elif op == '<~':
            wheres.append('"{}" <= "{}"'.format(col, max(vals)))
        elif op == '~':
            q = ' OR '.join('"{}" LIKE "%{}%"'.format(col, x) for x in vals)
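
These operator suffixes mirror the ones gramex.data.filter itself understands, so keys like sales>~ or city! filter any supported source the same way. A sketch against a small made-up DataFrame:

import pandas as pd
import gramex.data

df = pd.DataFrame({'city': ['Oslo', 'Pune', 'Lima'], 'sales': [10, 250, 300]})

# '!' excludes values, '>~' means >=, '~' matches a substring
result = gramex.data.filter(df, args={'sales>~': [250], 'city!': ['Lima']})
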
From gramener/gramex: gramex/apps/admin2/gramexadmin.py
        if not value[key]:
            error[key] = stderr.strip()

    value['python', 'version'] = '{0}.{1}.{2}'.format(*sys.version_info[:3])
    value['python', 'path'] = sys.executable
    value['gramex', 'version'] = gramex.__version__
    value['gramex', 'path'] = os.path.dirname(gramex.__file__)

    import pandas as pd
    df = pd.DataFrame({'value': value, 'error': error}).reset_index()
    df.columns = ['section', 'key'] + df.columns[2:].tolist()
    df = df[['section', 'key', 'value', 'error']].sort_values(['section', 'key'])
    df['error'] = df['error'].fillna('')
    data = gramex.data.filter(df, handler.args)
    # TODO: handle _format, _meta, _download, etc just like FormHandler
    raise Return(gramex.data.download(data))
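
gramex.data.download serializes whatever filter returns; the default format is JSON, and format='csv' or 'xlsx' return bytes ready to send as a response or write to disk. A sketch with a made-up DataFrame:

import pandas as pd
import gramex.data

df = pd.DataFrame({'section': ['python', 'gramex'], 'key': ['version', 'version'],
                   'value': ['a', 'b']})
data = gramex.data.filter(df, args={'section': ['python']})

# Returns bytes in the requested format (JSON by default)
csv_bytes = gramex.data.download(data, format='csv')
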
From gramener/gramex: gramex/transforms/twitterstream.py
        self.url = 'https://stream.twitter.com/1.1/statuses/filter.json'
        self.valid_params = {
            'follow', 'track', 'locations', 'delimited', 'stall_warnings',
            'filter_level', 'language'}
        self.enabled = True
        self.delay = 0

        # Set up writers
        if 'path' in kwargs:
            self.stream = StreamWriter(kwargs['path'], flush=kwargs.get('flush', False))
            self.process_bytes = self.stream.write
        elif 'function' in kwargs:
            self.process_json = build_transform(
                kwargs, vars={'message': {}}, filename='TwitterStream:function')
        elif kwargs.get('driver') == 'sqlalchemy':
            engine = gramex.data.create_engine(kwargs['url'], **kwargs.get('parameters', {}))
            table = gramex.data.get_table(kwargs['table'])
            fields = kwargs['fields']
            for field in list(fields.keys()):
                if field not in table.columns:
                    app_log.error('TwitterStream field %s not in table' % field)
                    fields.pop(field)
            flatten = flattener(fields=fields)
            self.process_json = lambda tweet: engine.execute(table.insert(flatten(tweet)))

        self.buf = bytearray()
        self.client = tornado.httpclient.HTTPClient()
        while True:
            # Set .enabled to False to temporarily disable streamer
            if self.enabled:
                params = {key: val.encode('utf-8') for key, val in self.params.items()
                          if key in self.valid_params}
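
gramex.data.create_engine is called above like SQLAlchemy's create_engine, with the URL and any extra parameters passed through. A hedged sketch of the same insert-per-row pattern, with a placeholder URL and table:

import sqlalchemy as sa
import gramex.data

engine = gramex.data.create_engine('sqlite:///tweets.db')    # placeholder URL

# A placeholder table standing in for the one the stream config names
metadata = sa.MetaData()
tweets = sa.Table('tweets', metadata,
                  sa.Column('id', sa.Text), sa.Column('text', sa.Text))
metadata.create_all(engine)

# Roughly what the sqlalchemy driver branch does per tweet, after flattening the fields
with engine.begin() as conn:
    conn.execute(tweets.insert().values(id='1', text='hello'))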