def aggregate(self, data):
    # Collect each column's values from the list of row dicts
    names = list(data[0].keys())
    cols = {name: [] for name in names}
    for entry in data:
        for key in entry:
            val = entry[key]
            cols[key].append(val)
    # Build one Arrow array per column and assemble them into a Table
    arrays = [pa.array(cols[col]) for col in cols]
    table = pa.Table.from_arrays(arrays, names=names)
    self.data = table
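
# A minimal standalone sketch of the same list-of-dicts-to-Table pattern; the
# sample rows and variable names below are illustrative, not from the original project.
import pyarrow as pa

rows = [
    {"id": 1, "name": "alice"},
    {"id": 2, "name": "bob"},
]
names = list(rows[0].keys())
cols = {name: [row[name] for row in rows] for name in names}
table = pa.Table.from_arrays([pa.array(cols[n]) for n in names], names=names)
print(table.column_names)  # ['id', 'name']
print(table.num_rows)      # 2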
def _convert_data_with_column_names_dict(data, schema):
    # `data` is a list of row dicts; `schema` maps each source column name
    # to the custom column name supplied by the user.
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema.keys():
        _col = column_data.get(column)
        array_data.append(pa.array(_col))
        # Use custom column names given by user
        schema_names.append(schema[column])
    return pa.RecordBatch.from_arrays(array_data, schema_names)
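
# Hypothetical usage of the helper above (the row data and the mapping are made
# up): `schema` maps each source key to the user-facing column name.
import pyarrow as pa

rows = [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]
schema = {"a": "id", "b": "label"}

batch = _convert_data_with_column_names_dict(rows, schema)
print(batch.schema.names)  # ['id', 'label']
print(batch.num_rows)      # 2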
    ):
        keys = np.asarray([d.to_datetime64() for d in keys_it])
    else:
        keys = np.asarray(list(keys_it))

    # TODO: Remove work-around
    # This is because of ARROW-1646:
    # [Python] pyarrow.array cannot handle NumPy scalar types
    # Additional note: pyarrow.array is supposed to infer the type automatically,
    # but the inferred type is not wide enough to hold np.uint64. Until this is
    # fixed in upstream Arrow, we have to retain the following lines.
    if not index_dct:
        # The np.array dtype would be double, which Arrow cannot convert to the
        # target type, so use an empty list instead
        labeled_array = pa.array([], type=dtype)
    else:
        labeled_array = pa.array(keys, type=dtype)

    partition_array = pa.array(list(index_dct.values()))
    return pa.Table.from_arrays(
        [labeled_array, partition_array], names=[column, _PARTITION_COLUMN_NAME]
    )
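
# Self-contained sketch of the same workaround; the function name and the
# _PARTITION_COLUMN_NAME value here are illustrative stand-ins, not the
# original module's definitions.
import numpy as np
import pyarrow as pa

_PARTITION_COLUMN_NAME = "_partition"

def index_dict_to_table(index_dct, column, dtype):
    keys = np.asarray(list(index_dct.keys()))
    if not index_dct:
        # An empty np.asarray defaults to float64, which (per the workaround
        # above) Arrow cannot cast to the target dtype, so pass an empty list
        labeled_array = pa.array([], type=dtype)
    else:
        labeled_array = pa.array(keys, type=dtype)
    partition_array = pa.array(list(index_dct.values()))
    return pa.Table.from_arrays(
        [labeled_array, partition_array], names=[column, _PARTITION_COLUMN_NAME]
    )

print(index_dict_to_table({1: ["p0"], 2: ["p1"]}, "key", pa.int64()).column_names)
# ['key', '_partition']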
def setUp(self):
    self._array = pa.array(list(range(self.n)), type=pa.int64())
    self._array_items = list(self._array)
def main(dirpath):
    connected_components = read_connected_components('%s/cc.parquet' % dirpath)
    buckets_matrix = read_buckets_matrix('%s/buckets.parquet' % dirpath)
    # The result is a list of communities. Each community is a list of element-ids
    coms = community_detector.detect_communities(connected_components,
                                                 buckets_matrix)
    com_ids = list(range(len(coms)))
    data = [pa.array(com_ids), pa.array(coms)]
    batch = pa.RecordBatch.from_arrays(data, ['community_id', 'element_ids'])
    table = pa.Table.from_batches([batch])
    pq.write_table(table, '%s/communities.parquet' % dirpath)
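
# Possible follow-up (not part of the original script): read the Parquet file
# back to check the written columns. Assumes `dirpath` is the same directory.
import pyarrow.parquet as pq

table = pq.read_table('%s/communities.parquet' % dirpath)
print(table.column_names)  # ['community_id', 'element_ids']
print(table.num_rows)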
"{idx}".format(
typ=self.__class__.__name__,
n=len(self),
idx=indices[invalid_mask][0]
)
)
# Build pyarrow array of indices
indices = pa.array(indices.astype('int'), mask=indices < 0)
else:
# Convert negative indices to positive
negative_mask = indices < 0
indices[negative_mask] = indices[negative_mask] + len(self)
# Build pyarrow array of indices
indices = pa.array(indices.astype('int'))
return self.__class__(self.data.take(indices), dtype=self.dtype)
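
# Small illustration (independent of the class above) of the `mask=` keyword
# used in this fragment: True entries in the mask become nulls in the result.
import numpy as np
import pyarrow as pa

indices = np.array([0, -1, 2])
arr = pa.array(indices.astype('int'), mask=indices < 0)
print(arr)             # [0, null, 2]
print(arr.null_count)  # 1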
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = map(_boolean_converter, _col)
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names)
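
# Standalone sketch of the two casts called out in the comments above; the
# sample values are made up.
import pandas as pd
import pyarrow as pa

# float64 -> float32: downcast with pandas first, then build the Arrow array
floats = pd.to_numeric(pd.Series([1.5, 2.5, None]), downcast='float')
f32 = pa.Array.from_pandas(floats, type=pa.float32())
print(f32.type)  # float (i.e. float32)

# int64 -> int32: build an int64 array and cast it down
i32 = pa.array([1, 2, 3], type=pa.int64()).cast(pa.int32())
print(i32.type)  # int32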
import pyarrow as pa
# conda install pyarrow

data = [
    pa.array([1, 2, 3, 4]),
    pa.array(['foo', 'bar', 'baz', None]),
    pa.array([True, None, False, True])
]
batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2'])
print(batch)
print(batch.num_rows)
print(batch.num_columns)

sink = pa.BufferOutputStream()
writer = pa.RecordBatchFileWriter(sink, batch.schema)
for i in range(5):
    writer.write_batch(batch)
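
# Possible continuation (assumed, not shown in the original snippet): close the
# writer to finalize the stream, then read the batches back from the buffer.
writer.close()

buf = sink.getvalue()
reader = pa.ipc.open_file(buf)
print(reader.num_record_batches)     # 5
print(reader.get_batch(0).num_rows)  # 4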