import pandas as pd
import pandas.testing as pdt
import pyarrow as pa
import pyarrow.parquet as pq

# Assumed import path: DataFrameSerializer ships with the host project's serialization module.
from kartothek.serialization import DataFrameSerializer


def test_index_metadata(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    # Attach a hand-written ``pandas`` metadata blob to the Arrow schema.
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
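The test above relies on PyArrow carrying a ``pandas`` entry in the Parquet schema metadata. A minimal, self-contained sketch of that round trip using only public PyArrow APIs (no project-specific store):

import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Write a table to an in-memory buffer and read it back.
table = pa.Table.from_pandas(pd.DataFrame({"a": [1, 2]}))
sink = pa.BufferOutputStream()
pq.write_table(table, sink)
restored = pq.read_table(pa.BufferReader(sink.getvalue()))

# ``Table.from_pandas`` records pandas-specific metadata under the b"pandas" key.
pandas_meta = json.loads(restored.schema.metadata[b"pandas"])
print(pandas_meta["index_columns"])  # index description, e.g. a RangeIndex descriptor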
:param columns: Names of columns to read from the file
:param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
:param procs_cpu_bound: Number of cores used for CPU bound tasks
"""
session = session_primitives.session
is_file: bool = session.s3.does_object_exists(path=path)
if not is_file:
    path = path[:-1] if path[-1] == "/" else path
if procs_cpu_bound is None:
    procs_cpu_bound = (
        session_primitives.procs_cpu_bound if session_primitives.procs_cpu_bound is not None else 1
    )
use_threads: bool = procs_cpu_bound > 1
logger.debug(f"Reading Parquet: {path}")
if is_file:
    client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config)
    bucket, key = path.replace("s3://", "").split("/", 1)
    obj = client_s3.get_object(Bucket=bucket, Key=key)
    table = pq.ParquetFile(source=BytesIO(obj["Body"].read())).read(columns=columns, use_threads=use_threads)
else:
    fs: S3FileSystem = s3.get_fs(session_primitives=session_primitives)
    fs = pa.filesystem._ensure_filesystem(fs)
    fs.invalidate_cache()
    table = pq.read_table(source=path, columns=columns, filters=filters, filesystem=fs, use_threads=use_threads)
# Check whether any integer column was lost during the conversion (this happens when a column contains nulls).
integers = [field.name for field in table.schema if str(field.type).startswith("int")]
logger.debug(f"Converting to Pandas: {path}")
df = table.to_pandas(use_threads=use_threads, integer_object_nulls=True)
for c in integers:
    if not str(df[c].dtype).startswith("int"):
        # Nulls force Arrow to hand back object/float columns; cast back to pandas' nullable Int64.
        df[c] = df[c].astype("Int64")
logger.debug(f"Done: {path}")
return df
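The ``Int64`` cast at the end works around a known conversion detail: an integer column that contains nulls does not come back from Arrow with an integer dtype. A self-contained sketch of the same fix:

import pyarrow as pa

table = pa.table({"x": pa.array([1, None, 3], type=pa.int64())})
df = table.to_pandas(integer_object_nulls=True)
print(df["x"].dtype)   # object, because of the null

# Cast back to pandas' nullable integer dtype, as the snippet above does.
df["x"] = df["x"].astype("Int64")
print(df["x"].dtype)   # Int64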
Returns
-------
pd.DataFrame
A DataFrame with relevant parquet metadata
"""
if not isinstance(table_name, str):
    raise TypeError("Expecting a string for parameter `table_name`.")
if callable(store):
    store = store()
data = {}
if table_name in self.files:
    with store.open(self.files[table_name]) as fd:  # type: ignore
        pq_metadata = pa.parquet.ParquetFile(fd).metadata
    try:
        metadata_dict = pq_metadata.to_dict()
    except AttributeError:  # No data in file
        metadata_dict = None
    data = {
        "partition_label": self.label,
        "serialized_size": pq_metadata.serialized_size,
        "number_rows_total": pq_metadata.num_rows,
        "number_row_groups": pq_metadata.num_row_groups,
        "row_group_id": [0],
        "number_rows_per_row_group": [0],
        "row_group_compressed_size": [0],
        "row_group_uncompressed_size": [0],
    }
    if metadata_dict:
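The fields collected above map directly onto PyArrow's Parquet metadata objects. A small illustrative sketch of where they come from (the file path is a placeholder):

import pyarrow.parquet as pq

meta = pq.ParquetFile("example.parquet").metadata
print(meta.num_rows, meta.num_row_groups, meta.serialized_size)

# Per-row-group figures, like the ones gathered into the lists above.
rg = meta.row_group(0)
print(rg.num_rows, rg.total_byte_size)  # rows and uncompressed byte size of the first row group
compressed = sum(rg.column(i).total_compressed_size for i in range(rg.num_columns))
print(compressed)  # compressed size, summed over the row group's column chunks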
def _write_partition_pyarrow(
    df, path, fs, filename, write_index, partition_on, metadata_path=None, **kwargs
):
    import pyarrow as pa
    from pyarrow import parquet

    t = pa.Table.from_pandas(df, preserve_index=write_index)
    if partition_on:
        parquet.write_to_dataset(
            t,
            path,
            partition_cols=partition_on,
            preserve_index=write_index,
            filesystem=fs,
            **kwargs
        )
    else:
        with fs.open(filename, "wb") as fil:
            parquet.write_table(t, fil, **kwargs)
    if metadata_path is not None:
        with fs.open(metadata_path, "wb") as fil:
            # Keep only the keyword arguments that ``write_metadata`` accepts.
            kwargs_meta = {
                k: v for k, v in kwargs.items() if k in _pyarrow_write_metadata_kwargs
            }
            parquet.write_metadata(t.schema, fil, **kwargs_meta)

def _schema2bytes(schema):
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf, version="2.0", coerce_timestamps="us")
    return buf.getvalue().to_pybytes()
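The reverse direction is just as short: ``pq.read_schema`` can rebuild the schema from those bytes via an in-memory buffer. The helper name below is illustrative, not part of the snippet above:

import pyarrow as pa
import pyarrow.parquet as pq

def _bytes2schema(data: bytes) -> pa.Schema:
    # ``read_schema`` accepts any file-like source, including an in-memory buffer.
    return pq.read_schema(pa.BufferReader(data))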
if "_metadata" in fns and "validate_schema" not in dataset_kwargs:
dataset_kwargs["validate_schema"] = False
if "_metadata" in fns or gather_statistics is not False:
# Let arrow do its thing (use _metadata or scan files)
dataset = pq.ParquetDataset(
paths, filesystem=fs, filters=filters, **dataset_kwargs
)
else:
# Use _common_metadata file if it is available.
# Otherwise, just use 0th file
if "_common_metadata" in fns:
dataset = pq.ParquetDataset(
base + fs.sep + "_common_metadata", filesystem=fs, **dataset_kwargs
)
else:
dataset = pq.ParquetDataset(
allpaths[0], filesystem=fs, **dataset_kwargs
)
parts = [base + fs.sep + fn for fn in fns]
else:
# There is only one file to read
dataset = pq.ParquetDataset(paths, filesystem=fs, **dataset_kwargs)
return parts, dataset
def get_parquet_dataset(self, path):
    return pq.ParquetDataset(self.get_localized_path(path), filesystem=self.get_filesystem())
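Once a ``ParquetDataset`` is in hand, materializing it as a pandas DataFrame is a short call chain. The path and column names below are placeholders:

import pyarrow.parquet as pq

dataset = pq.ParquetDataset("data/events/")  # any directory of Parquet files, or a single file
df = dataset.read(columns=["user_id", "ts"]).to_pandas()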
file_name += f"{data_type}-"
elif var == "exchange":
file_name += f"{exchange}-"
elif var == "pair":
file_name += f"{pair}-"
else:
print(var)
raise ValueError("Invalid file format specified for parquet file")
file_name = file_name[:-1] + ".parquet"
else:
file_name = f'{exchange}-{data_type}-{pair}-{int(timestamp)}.parquet'
if self.path:
file_name = os.path.join(self.path, file_name)
pq.write_table(self.data, file_name)
self.data = None
if self._write:
for func, bucket, prefix, kwargs in zip(self._write, self.bucket, self.prefix, self.kwargs):
path = f'{exchange}/{data_type}/{pair}/{exchange}-{data_type}-{pair}-{int(timestamp)}.parquet'
if prefix:
path = f"{prefix}/{path}"
func(bucket, path, file_name, **kwargs)
if self.del_file:
os.remove(file_name)
def export_parquet(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False, progress=None, virtual=True, sort=None, ascending=True):
    table = _export_table(dataset, column_names, byteorder, shuffle, selection, progress, virtual, sort, ascending)
    pq.write_table(table, path)
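All of these snippets end in ``pq.write_table``; for completeness, a minimal write/read round trip (the file name and column values are placeholders):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(pd.DataFrame({"price": [1.0, 2.5], "size": [10, 3]}))
pq.write_table(table, "example.parquet", compression="snappy")

# Read it back, optionally selecting only some columns.
print(pq.read_table("example.parquet", columns=["price"]).to_pandas())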