import numpy as np
import pandas as pd
import pyarrow as pa

from kartothek.core.common_metadata import make_meta


def test_make_meta_column_normalization_pyarrow_schema():
    # GH228
    df = pd.DataFrame(
        [{"part": 1, "id": 1, "col1": "abc"}, {"part": 2, "id": 2, "col1": np.nan}],
        # Kartothek normalizes field order s.t. partition keys come first and the
        # remaining columns are sorted alphabetically; this input is deliberately
        # in reverse order.
        columns=["col1", "id", "part"],
    )
    schema = make_meta(
        pa.Schema.from_pandas(df), origin="gh228", partition_keys=["part"]
    )
    fields = [
        pa.field("part", pa.int64()),
        pa.field("col1", pa.string()),
        pa.field("id", pa.int64()),
    ]
    expected_schema = pa.schema(fields)
    assert schema.internal().equals(expected_schema, check_metadata=False)
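# Illustrative sketch (not kartothek's implementation): the field-order
# normalization the test above relies on - partition keys come first, the
# remaining columns follow alphabetically.
def normalized_field_order(field_names, partition_keys):
    rest = sorted(name for name in field_names if name not in partition_keys)
    return list(partition_keys) + rest


assert normalized_field_order(["col1", "id", "part"], ["part"]) == ["part", "col1", "id"]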
def get_arrow_type(self, dt, is_list):
    """Map a framework dtype to the corresponding pyarrow type."""
    if dt == dtypes.bool:
        arrow_type = pa.bool_()
    elif dt == dtypes.int8:
        arrow_type = pa.int8()
    elif dt == dtypes.int16:
        arrow_type = pa.int16()
    elif dt == dtypes.int32:
        arrow_type = pa.int32()
    elif dt == dtypes.int64:
        arrow_type = pa.int64()
    elif dt == dtypes.uint8:
        arrow_type = pa.uint8()
    elif dt == dtypes.uint16:
        arrow_type = pa.uint16()
    elif dt == dtypes.uint32:
        arrow_type = pa.uint32()
    elif dt == dtypes.uint64:
        arrow_type = pa.uint64()
    elif dt == dtypes.float16:
        arrow_type = pa.float16()
    elif dt == dtypes.float32:
        arrow_type = pa.float32()
    elif dt == dtypes.float64:
        arrow_type = pa.float64()
    else:
        raise TypeError("Unsupported dtype for Arrow: " + str(dt))
    # The tail of this snippet is truncated; wrapping list dtypes and returning
    # the mapped type is an assumed completion.
    if is_list:
        arrow_type = pa.list_(arrow_type)
    return arrow_type
new_index1 = original_index.copy().remove_values([1, 3], inplace=inplace)
expected_index1 = ExplicitSecondaryIndex(
    column="col", dtype=pa.int64(), index_dct={2: ["d"]}
)
assert new_index1 == expected_index1

new_index2 = original_index.copy().remove_values([1.0, 3.0], inplace=inplace)
expected_index2 = ExplicitSecondaryIndex(
    column="col", dtype=pa.int64(), index_dct={2: ["d"]}
)
assert new_index2 == expected_index2

new_index3 = original_index.copy().remove_values(["1", "3"], inplace=inplace)
expected_index3 = ExplicitSecondaryIndex(
    column="col", dtype=pa.int64(), index_dct={2: ["d"]}
)
assert new_index3 == expected_index3
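# Illustrative sketch (not kartothek's implementation): the behaviour the three
# asserts above depend on - removal values are coerced to the index dtype, so
# 1, 1.0 and "1" all address the same int64 index key.
import pyarrow as pa


def coerce_to_index_dtype(values, dtype):
    if pa.types.is_integer(dtype):
        return [int(v) for v in values]
    return list(values)


assert coerce_to_index_dtype([1.0, "3"], pa.int64()) == [1, 3]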
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
def test_iterate_over_timestamp_ntz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [
        {"logicalType": "TIMESTAMP_NTZ", "scale": str(scale)},
        {"logicalType": "TIMESTAMP_NTZ", "scale": str(scale)},
    ]
    data_type = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                                pyarrow.field('fraction', pyarrow.int32())]) if scale > 7 else pyarrow.int64()

    def timestamp_ntz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(9 - scale)) if scale > 7 else random.randint(0, 10**scale - 1)
        if scale > 7:
            return {'epoch': epoch, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return int(epoch + frac) if scale else int(epoch)

    def expected_data_transform_ntz(_scale):
        def expected_data_transform_ntz_impl(data, scale=_scale):
            if scale > 7:
        raise ValueError("unrecognized type %r" % (type_name,))

    if kind in ('int', 'float'):
        ty = getattr(pa, type_name)()
    elif kind == 'bool':
        ty = pa.bool_()
    elif kind == 'decimal':
        ty = pa.decimal128(9, 9)
    elif kind == 'fixed binary':
        ty = pa.binary(size)
    elif kind == 'varying binary':
        ty = pa.binary()
    elif kind in ('ascii', 'unicode'):
        ty = pa.string()
    elif kind == 'int64 list':
        ty = pa.list_(pa.int64())
    elif kind == 'struct':
        ty = pa.struct([pa.field('u', pa.int64()),
                        pa.field('v', pa.float64()),
                        pa.field('w', pa.bool_())])

    factories = {
        'int': self.generate_int_list,
        'float': self.generate_float_list,
        'bool': self.generate_bool_list,
        'decimal': self.generate_decimal_list,
        'fixed binary': partial(self.generate_fixed_binary_list,
                                size=size),
        'varying binary': partial(self.generate_varying_binary_list,
                                  min_size=3, max_size=40),
        'ascii': partial(self.generate_ascii_string_list,
                         min_size=3, max_size=40),
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type
    """
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
import pyarrow as arrow
int32 = arrow.int32()
int64 = arrow.int64()
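# Minimal usage sketch (not from the original source; the schema and field
# names are illustrative): the aliases above can be used to declare an Arrow
# schema, e.g. for a simple edge table.
example_schema = arrow.schema([
    arrow.field("src", int32),
    arrow.field("dst", int32),
    arrow.field("weight", int64),
])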
def rectify(
    edges,
    nodes,
    edge,
    node,
    edge_src,
    edge_dst,
    safe=True,
):
    return _rectify_node_ids(
        edges=_rectify_edge_ids(
            edges=edges,
            edge=edge,
            safe=safe