def expected_data_transform_int32(data):
    # The int32 encoding stores the time of day as an integer with a
    # 4-digit fractional-seconds part; scale the fraction up to microseconds
    # for datetime.time, then peel off seconds, minutes and hours.
    microsec = data % (10**4)
    microsec *= 10**2
    data //= 10**4
    second = data % 60
    data //= 60
    minute = data % 60
    hour = data // 60
    return datetime.time(hour, minute, second, microsec)
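# Worked example (not part of the original test): the scale-4 int32 encoding of
# 01:02:03.1234 is ((1 * 60 + 2) * 60 + 3) * 10**4 + 1234 == 37231234, which the
# transform above maps back to a time with microsecond precision.
assert expected_data_transform_int32(37231234) == datetime.time(1, 2, 3, 123400)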
iterate_over_test_chunk([pyarrow.int64(), pyarrow.int64()],
                        column_meta_int64, time_generator_int64, expected_data_transform_int64)
iterate_over_test_chunk([pyarrow.int32(), pyarrow.int32()],
                        column_meta_int32, time_generator_int32, expected_data_transform_int32)
def test_parse_schema(self):
    buf = pa.py_buffer(schema_data)
    result = _load_schema(buf)
    expected = pa.schema([
        pa.field("depdelay", pa.int16()),
        pa.field("arrdelay", pa.int16())
    ])
    assert result.equals(expected)
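# schema_data and _load_schema are not shown in this excerpt. A minimal sketch of
# the same round-trip using pyarrow's own IPC helpers (assuming _load_schema simply
# wraps pyarrow.ipc.read_schema):
import pyarrow as pa

expected = pa.schema([pa.field("depdelay", pa.int16()),
                      pa.field("arrdelay", pa.int16())])
schema_bytes = expected.serialize().to_pybytes()       # stand-in for schema_data
loaded = pa.ipc.read_schema(pa.py_buffer(schema_bytes))
assert loaded.equals(expected)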
def test_ingest_with_datetime():
    """
    Test ingesting datetime data with a given schema.
    """
    schema = pa.schema([
        pa.field("foo", pa.int64()),
        pa.field("bar", pa.int64()),
        pa.field("baz", pa.timestamp("ns"))
    ])
    data = [{"foo": 1, "bar": 2, "baz": "2018-01-01 01:02:03"},
            {"foo": 10, "bar": 20, "baz": "2018-01-02 01:02:03"}]
    converted_data = client.ingest_data(data, schema)
    timestamp_values = [pd.to_datetime("2018-01-01 01:02:03"),
                        pd.to_datetime("2018-01-02 01:02:03")]
    assert converted_data.to_pydict() == {'foo': [1, 10], 'bar': [2, 20], 'baz': timestamp_values}
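# A minimal sketch (not part of the original test) of the conversion the test
# relies on: pyarrow parses ISO 8601 strings when an explicit timestamp type is
# requested, so string values can be ingested against a timestamp("ns") field.
import pyarrow as pa

baz = pa.array(["2018-01-01 01:02:03", "2018-01-02 01:02:03"],
               type=pa.timestamp("ns"))
assert baz[0].as_py().year == 2018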
def generate_data(pyarrow_type, column_meta, source_data_generator, batch_count, batch_row_count):
    stream = BytesIO()
    assert len(pyarrow_type) == len(column_meta)
    column_size = len(pyarrow_type)
    fields = []
    for i in range(column_size):
        fields.append(pyarrow.field("column_{}".format(i), pyarrow_type[i], True, column_meta[i]))
    schema = pyarrow.schema(fields)
    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)
    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            # Regenerate the column until it contains at least one non-None value.
            while not_none_cnt == 0:
                column_data = []
                for _ in range(batch_row_count):
                    data = None if bool(random.getrandbits(1)) else source_data_generator()
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
def write_table(table, where, filesystem, **kwargs):  # pylint: disable=unused-argument
    path = str(filesystem.tmp_path / FILENAME)
    filesystem.files[str(where)] = path
    pq.write_table(table, path)
def test_index_metadata(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
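# A self-contained round-trip sketch (not part of the original test) showing that
# custom schema metadata, such as the b"pandas" blob above, is carried through a
# Parquet write into an in-memory buffer and back.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def metadata_roundtrip_sketch():
    table = pa.Table.from_pandas(pd.DataFrame({"a": [1]}))
    table = table.replace_schema_metadata({b"pandas": b'{"pandas_version": "0.20.3"}'})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    restored = pq.read_table(pa.BufferReader(buf.getvalue()))
    assert b"pandas" in restored.schema.metadata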
def test_leak3():
    import pyarrow.parquet as pq
    df = pd.DataFrame({'a{0}'.format(i): [1, 2, 3, 4]
                       for i in range(50)})
    table = pa.Table.from_pandas(df, preserve_index=False)
    writer = pq.ParquetWriter('leak_test_' + tm.rands(5) + '.parquet',
                              table.schema)

    def func():
        writer.write_table(table, row_group_size=len(table))

    # This does not "leak" per se but we do want to have this use as little
    # memory as possible
    assert_does_not_leak(func, iterations=500,
                         check_interval=50, tolerance=20)
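# assert_does_not_leak is a test-suite helper that is not shown here. A minimal
# sketch of that kind of check, assuming the tolerance is expressed in megabytes,
# might repeatedly invoke func and watch pyarrow's allocator:
import pyarrow as pa

def assert_does_not_leak_sketch(func, iterations=500, check_interval=50, tolerance=20):
    baseline = pa.total_allocated_bytes()
    for i in range(1, iterations + 1):
        func()
        if i % check_interval == 0:
            growth = pa.total_allocated_bytes() - baseline
            assert growth <= tolerance * 2**20, \
                "allocated memory grew by {} bytes".format(growth)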
def test_load_empty_table_arrow(self, con):
    con.execute("drop table if exists baz;")
    con.execute("create table baz (a int, b float, c text);")
    data = [(1, 1.1, 'a'),
            (2, 2.2, '2'),
            (3, 3.3, '3')]
    df = pd.DataFrame(data, columns=list('abc')).astype({
        'a': 'int32',
        'b': 'float32'
    })
    table = pa.Table.from_pandas(df, preserve_index=False)
    con.load_table("baz", table, method='arrow')
    result = sorted(con.execute("select * from baz"))
    self.check_empty_insert(result, data)
    con.execute("drop table if exists baz;")
def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()
    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)
    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)
    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)
    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)
def test_iterate_over_string_chunk():
    random.seed(datetime.datetime.now())
    column_meta = [
        {"logicalType": "TEXT"},
        {"logicalType": "TEXT"}
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.string(), True, column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.string(), True, column_meta[1])
    pyarrow.schema([field_foo, field_bar])

    def str_generator():
        return str(random.randint(-100, 100))

    iterate_over_test_chunk([pyarrow.string(), pyarrow.string()],
                            column_meta, str_generator)