How to use the fastparquet.dataframe.empty function in fastparquet

To help you get started, we’ve selected a few fastparquet examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: GitHub, martindurant/uavro, file uavro/core.py
def make_empty(head):
    """Pre-allocate the output dataframe and its backing arrays.

    ``head`` is indexed as ``head['schema']['fields']`` (a sequence of field
    dicts carrying ``'name'`` and ``'type'``), ``head['dtypes']`` (a mapping
    passed to ``empty`` both as the types, via ``.values()``, and as
    ``cols``) and ``head['nrows']``.

    Fields whose ``'type'`` is ``'enum'`` contribute their ``'symbols'`` as
    categories. Fields whose ``'logicalType'`` is ``'decimal'`` get a
    temporary numpy array stored in ``arrs`` under the field name: a
    fixed-width bytes array (``'S<size>'``) when the field type is
    ``'fixed'``, otherwise an object array.

    Returns the ``(df, arrs)`` pair produced by ``empty``, with the decimal
    entries of ``arrs`` replaced as described above.
    """
    fields = head['schema']['fields']
    cats = {field['name']: field['symbols']
            for field in fields if field['type'] == 'enum'}

    dtypes = head['dtypes']
    nrows = head['nrows']
    df, arrs = empty(dtypes.values(), nrows, cols=dtypes, cats=cats)

    for field in fields:
        if field.get('logicalType') != 'decimal':
            continue
        # Decimals are staged in a scratch array rather than the frame column.
        if field['type'] == 'fixed':
            scratch_dtype = 'S%s' % field['size']
        else:
            scratch_dtype = "O"
        arrs[field['name']] = np.empty(nrows, scratch_dtype)
    return df, arrs
Source: GitHub, Valassis-Digital-Media/cyavro, file cyavro/dask_reader.py
C avro reader can interpret them.
    """
    with open_with(URL, 'rb') as f:
        f.seek(start_byte)
        if start_byte == 0:
            header = read_header(f)
            f.seek(header['header_size'])
        data = header['head_bytes'] + f.read(length)
    if nrows is None:
        b = io.BytesIO(data)
        header['blocks'] = []
        scan_blocks(b, header, len(data))
        nrows = sum(b['nrows'] for b in header['blocks'])
    f = cyavro.AvroReader()
    f.init_bytes(data)
    df, arrs = empty(header['dtypes'].values(), nrows, cols=header['dtypes'])
    f.init_reader()
    f.init_buffers(10000)
    for i in range(0, nrows, 10000):
        d = f.read_chunk()
        for c in d:
            s = [f for f in header['schema']['fields'] if f['name'] == c][0]
            if 'logicalType' in s:
                df[c].values[i:i + 10000] = time_convert(d[c], s)
            else:
                df[c].values[i:i + 10000] = d[c]
    return df