How to use the pyarrow.array function in pyarrow

To help you get started, we’ve selected a few pyarrow.array examples based on popular ways it is used in public projects.

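Before the project examples below, here is a minimal sketch of pyarrow.array on its own (variable names are illustrative): it infers an Arrow type from a plain Python sequence, accepts an explicit type= argument, and treats None as null.

import pyarrow as pa

# Type is inferred from the Python values (int64 here); None becomes a null
ints = pa.array([1, 2, None, 4])
print(ints.type)        # int64
print(ints.null_count)  # 1

# An explicit Arrow type can be requested instead of relying on inference
floats = pa.array([1.0, 2.0, 3.0], type=pa.float32())
print(floats.type)      # float (32-bit)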

github bmoscon / cryptostore / cryptostore / data / parquet.py
def aggregate(self, data):
        names = list(data[0].keys())
        cols = {name: [] for name in names}

        for entry in data:
            for key in entry:
                val = entry[key]
                cols[key].append(val)
        arrays = [pa.array(cols[col]) for col in cols]
        table = pa.Table.from_arrays(arrays, names=names)
        self.data = table
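The snippet builds one pa.array per column from a dict of Python lists and assembles a Table from them. A minimal, self-contained sketch of the same pattern, with illustrative row data:

import pyarrow as pa

data = [{'price': 1.0, 'size': 2}, {'price': 1.5, 'size': 3}]
names = list(data[0].keys())
cols = {name: [entry[name] for entry in data] for name in names}

arrays = [pa.array(cols[name]) for name in names]
table = pa.Table.from_arrays(arrays, names=names)
# pa.table(cols) would build the same Table directly from the dict
print(table)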
github andrewgross / json2parquet / json2parquet / client.py
def _convert_data_with_column_names_dict(data, schema):
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema.keys():
        _col = column_data.get(column)
        array_data.append(pa.array(_col))
        # Use custom column names given by user
        schema_names.append(schema[column])
    return pa.RecordBatch.from_arrays(array_data, schema_names)
github JDASoftwareGroup / kartothek / kartothek / core / index.py
):
            keys = np.asarray([d.to_datetime64() for d in keys_it])
        else:
            keys = np.asarray(list(keys_it))

    # TODO: Remove work-around
    # This is because of ARROW-1646:
    #   [Python] pyarrow.array cannot handle NumPy scalar types
    # Additional note: pyarrow.array is supposed to infer type automatically.
    # But the inferred type is not enough to hold np.uint64. Until this is fixed in
    # upstream Arrow, we have to retain the following line
    if not index_dct:
        # the np.array dtype will be double which arrow cannot convert to the target type, so use an empty list instead
        labeled_array = pa.array([], type=dtype)
    else:
        labeled_array = pa.array(keys, type=dtype)

    partition_array = pa.array(list(index_dct.values()))

    return pa.Table.from_arrays(
        [labeled_array, partition_array], names=[column, _PARTITION_COLUMN_NAME]
    )
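The work-around above can be reproduced in isolation. A minimal sketch, assuming a uint64 index column: an empty input needs an explicit type= so the resulting array matches the target schema, while non-empty NumPy data is passed together with the desired Arrow type.

import numpy as np
import pyarrow as pa

dtype = pa.uint64()
keys = np.array([1, 2, 3], dtype=np.uint64)

# Empty input: without type=, the inferred type would not match uint64
empty = pa.array([], type=dtype)

# Non-empty input: request the target type directly
labeled = pa.array(keys, type=dtype)
print(empty.type, labeled.type)  # uint64 uint64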
github apache / arrow / python / benchmarks / array_ops.py
def setUp(self):
        self._array = pa.array(list(range(self.n)), type=pa.int64())
        self._array_items = list(self._array)
github src-d / gemini / src / main / python / community-detector / report.py
def main(dirpath):
    connected_components = read_connected_components('%s/cc.parquet' % dirpath)
    buckets_matrix = read_buckets_matrix('%s/buckets.parquet' % dirpath)

    # The result is a list of communities. Each community is a list of element-ids
    coms = community_detector.detect_communities(connected_components,
                                                 buckets_matrix)
    com_ids = list(range(len(coms)))

    data = [pa.array(com_ids), pa.array(coms)]
    batch = pa.RecordBatch.from_arrays(data, ['community_id', 'element_ids'])

    table = pa.Table.from_batches([batch])
    pq.write_table(table, '%s/communities.parquet' % dirpath)
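Note that pa.array(coms) receives a list of lists, so pyarrow.array infers a nested list type. A small self-contained sketch of the same idea, with illustrative values and column names:

import pyarrow as pa

community_ids = pa.array([0, 1])
element_ids = pa.array([[10, 11, 12], [20, 21]])  # inferred as list<int64>

batch = pa.RecordBatch.from_arrays(
    [community_ids, element_ids], ['community_id', 'element_ids'])
print(batch.schema)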
github holoviz / spatialpandas / spatialpandas / geometry / base.py
"{idx}".format(
                        typ=self.__class__.__name__,
                        n=len(self),
                        idx=indices[invalid_mask][0]
                    )
                )

            # Build pyarrow array of indices
            indices = pa.array(indices.astype('int'), mask=indices < 0)
        else:
            # Convert negative indices to positive
            negative_mask = indices < 0
            indices[negative_mask] = indices[negative_mask] + len(self)

            # Build pyarrow array of indices
            indices = pa.array(indices.astype('int'))

        return self.__class__(self.data.take(indices), dtype=self.dtype)
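The mask= keyword used above takes a boolean sequence in which True marks positions that become null in the resulting array. A minimal sketch, independent of the spatialpandas code:

import numpy as np
import pyarrow as pa

indices = np.array([3, -1, 0, 2])

# Positions where the mask is True come out as nulls
arr = pa.array(indices, mask=indices < 0)
print(arr.to_pylist())  # [3, None, 0, 2]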
github andrewgross / json2parquet / json2parquet / client.py
array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = map(_boolean_converter, _col)
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names)
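A compact sketch of the two techniques the snippet relies on, passing an explicit type= to pa.array and casting an existing array to a narrower integer type (values are illustrative):

import pyarrow as pa

# Explicit type instead of inference; None stays a null
floats = pa.array([1.5, 2.5, None], type=pa.float64())

# Build as int64, then cast down to int32
ints64 = pa.array([1, 2, 3], type=pa.int64())
ints32 = ints64.cast(pa.int32())
print(floats.type, ints32.type)  # double int32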
github aloneguid / parquet-dotnet / 3rdparty / pyarrow / threecols.py
import pyarrow as pa
#conda install pyarrow

data = [
   pa.array([1, 2, 3, 4]),
   pa.array(['foo', 'bar', 'baz', None]),
   pa.array([True, None, False, True])
]

batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2'])

print(batch)
print(batch.num_rows)
print(batch.num_columns)

sink = pa.BufferOutputStream()
writer = pa.RecordBatchFileWriter(sink, batch.schema)

for i in range(5):
   writer.write_batch(batch)

writer.close()  # finalize the Arrow file footer
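To round the example off, the buffer written above can be read back with the file reader API; a minimal sketch that continues from the same sink:

buf = sink.getvalue()
reader = pa.RecordBatchFileReader(buf)
print(reader.num_record_batches)  # 5
table = reader.read_all()
print(table.num_rows)             # 20 (5 batches of 4 rows)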