>>> sample, blocks = read_bytes('s3://bucket/2015-*-*.csv', delimiter=b'\\n') # doctest: +SKIP
>>> sample, paths, blocks = read_bytes('2015-*-*.csv', include_path=True) # doctest: +SKIP
Returns
-------
sample : bytes
    The sample header
blocks : list of lists of ``dask.Delayed``
    Each list corresponds to a file, and each delayed object computes to a
    block of bytes from that file.
paths : list of strings, only included if include_path is True
    List of same length as blocks, where each item is the path to the file
    represented in the corresponding block.
"""
fs, fs_token, paths = get_fs_token_paths(urlpath, mode="rb", storage_options=kwargs)
if len(paths) == 0:
    raise IOError("%s resolved to no files" % urlpath)

if blocksize is not None:
    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if not is_integer(blocksize):
        raise TypeError("blocksize must be an integer")
    blocksize = int(blocksize)

if blocksize is None:
    offsets = [[0]] * len(paths)
    lengths = [[None]] * len(paths)
else:
    offsets = []
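A minimal sketch of how the sample and delayed blocks returned by read_bytes might be consumed; the glob, blocksize, and delimiter below are illustrative, not taken from the snippet above.

import dask
from dask.bytes import read_bytes

# Illustrative local glob; any fsspec-style URL (e.g. s3://...) works the same way.
sample, blocks = read_bytes("data/2015-*-*.csv", delimiter=b"\n", blocksize="16 MiB")
header = sample.split(b"\n")[0]                     # first line of the sampled bytes
flat = [blk for file_blocks in blocks for blk in file_blocks]
chunks = dask.compute(*flat)                        # each element is a bytes object
print(header, sum(len(c) for c in chunks))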
See Also
--------
read_parquet: Read parquet data to dask.dataframe
"""
partition_on = partition_on or []
if set(partition_on) - set(df.columns):
    raise ValueError("Partitioning on non-existent column")
if compression != "default":
    kwargs["compression"] = compression
elif snappy is not None:
    kwargs["compression"] = "snappy"
write = get_engine(engine)["write"]
fs, fs_token, _ = get_fs_token_paths(
    path, mode="wb", storage_options=storage_options
)
fs.mkdirs(path, exist_ok=True)
# Trim any protocol information from the path before forwarding
path = infer_storage_options(path)["path"]
out = write(
    df,
    fs,
    fs_token,
    path,
    write_index=write_index,
    append=append,
    ignore_divisions=ignore_divisions,
    partition_on=partition_on,
    **kwargs
)
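A hedged usage sketch of the write path above: when compression is left at "default", snappy is used if it is importable, and partition_on must name existing columns. It assumes a parquet engine (fastparquet or pyarrow) is installed; the output path and column names are illustrative.

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"year": [2015, 2015, 2016], "value": [1.0, 2.0, 3.0]})
ddf = dd.from_pandas(pdf, npartitions=2)

# With compression left at "default", snappy is chosen when available.
ddf.to_parquet("out_parquet", partition_on=["year"])

# Partitioning on a column that does not exist raises ValueError, as in the check above.
# ddf.to_parquet("out_parquet", partition_on=["missing"])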
# Build a deterministic name for the output collection from the call arguments
name = "read-parquet-" + tokenize(
    path,
    columns,
    filters,
    categories,
    index,
    storage_options,
    engine,
    gather_statistics,
)
if isinstance(engine, str):
    engine = get_engine(engine)

if hasattr(path, "name"):
    path = stringify_path(path)
fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options)

paths = sorted(paths, key=natural_sort_key)  # numeric rather than glob ordering

auto_index_allowed = False
if index is None:
    # User is allowing auto-detected index
    auto_index_allowed = True
if index and isinstance(index, str):
    index = [index]

meta, statistics, parts = engine.read_metadata(
    fs,
    paths,
    categories=categories,
    index=index,
    gather_statistics=gather_statistics,
)
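For context, a hedged example of the public entry point whose internals appear above; the path and column names are illustrative, and the keyword names match the dask version shown in these snippets.

import dask.dataframe as dd

# Reads the dataset written earlier; a glob or directory path is expanded by
# get_fs_token_paths and then ordered with natural_sort_key.
ddf = dd.read_parquet(
    "out_parquet",              # illustrative path
    columns=["value"],
    index=False,                # do not try to recover an index column
    gather_statistics=False,    # skip reading row-group statistics
)
print(ddf.head())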
# Special-case a fastparquet.ParquetFile object passed as `path`
is_ParquetFile = False
try:
    import fastparquet

    if isinstance(path, fastparquet.api.ParquetFile):
        assert engine in ["auto", "fastparquet"], (
            "'engine' should be set to 'auto' or 'fastparquet' "
            + "when reading from fastparquet.ParquetFile"
        )
        is_ParquetFile = True
except ImportError:
    pass

if is_ParquetFile:
    read = get_engine("fastparquet")["read"]
    if path.fn.endswith("_metadata"):
        # remove '_metadata' from path
        urlpath = path.fn[: -len("_metadata")]
    else:
        urlpath = path.fn

    fs, fs_token, paths = get_fs_token_paths(
        urlpath, mode="rb", storage_options=storage_options
    )
    paths = path
else:
    read = get_engine(engine)["read"]
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )
    if isinstance(path, string_types) and len(paths) > 1:
        # Sort paths naturally if multiple paths resulted from a single
        # specification (by '*' globbing)
        paths = sorted(paths, key=natural_sort_key)

return read(
    fs,
    fs_token,
    paths,
    columns=columns,
    filters=filters,
    categories=categories,
    index=index,
)
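The natural sort used above orders numbered files numerically rather than lexicographically. A small standalone illustration follows; the _natural_key helper is a stand-in for dask's natural_sort_key, not its actual implementation, and the file names are made up.

import re

def _natural_key(s):
    # Split digit runs so that "part2" sorts before "part10"; a stand-in for
    # the natural_sort_key helper used in the snippets above.
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r"(\d+)", s)]

paths = ["data/part10.parquet", "data/part2.parquet", "data/part1.parquet"]
print(sorted(paths))                     # lexicographic: part1, part10, part2
print(sorted(paths, key=_natural_key))   # natural: part1, part2, part10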
if isinstance(partition_on, str):
    partition_on = [partition_on]

if set(partition_on) - set(df.columns):
    raise ValueError(
        "Partitioning on non-existent column. "
        "partition_on=%s ."
        "columns=%s" % (str(partition_on), str(list(df.columns)))
    )

if isinstance(engine, str):
    engine = get_engine(engine)

if hasattr(path, "name"):
    path = stringify_path(path)
fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
# Trim any protocol information from the path before forwarding
path = fs._strip_protocol(path)

# Save divisions and corresponding index name. This is necessary,
# because we may be resetting the index to write the file
division_info = {"divisions": df.divisions, "name": df.index.name}
if division_info["name"] is None:
    # As of 0.24.2, pandas will rename an index with name=None
    # when df.reset_index() is called. The default name is "index"
    # (or "level_0" if "index" is already a column name).
    division_info["name"] = "index" if "index" not in df.columns else "level_0"
# If write_index==True (default), reset the index and record the
# name of the original index in `index_cols` (will be `index` if None,
# or `level_0` if `index` is already a column name).
# `fastparquet` will use `index_cols` to specify the index column(s)
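A quick illustration of the pandas behavior the comments above rely on: reset_index() names an unnamed index "index", or "level_0" when a column called "index" already exists. The frames below are made up for demonstration.

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})                   # unnamed RangeIndex
print(df.reset_index().columns.tolist())           # ['index', 'a']

df2 = pd.DataFrame({"index": [10, 20]})            # 'index' already a column
print(df2.reset_index().columns.tolist())          # ['level_0', 'index']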