# NOTE: should the index here be cols['cl_id'] ?
cl = dd.read_parquet(
path,
columns=cols_toload - set([cols['pl_id']]),
index=cols['pl_id'])
elif self.parquet_engine == 'pyarrow':
try:
pf = pq.ParquetFile(path)
itr = (
pf.read_row_group(
i,
columns=cols_toload).to_pandas().set_index(
cols['pl_id'])
for i in range(pf.num_row_groups))
            except pa.ArrowIOError:
                # path refers to a directory of parquet files rather than a
                # single file, so read the whole dataset in one chunk and wrap
                # it in a one-element generator to match the branch above.
                pf = pq.ParquetDataset(path)
                itr = (pf.read(columns=cols_toload).to_pandas().set_index(
                    cols['pl_id']) for i in [1])
elif self.parquet_engine == 'fastparquet':
pf = fp.ParquetFile(path)
itr = pf.iter_row_groups(columns=list(cols_toload), index=cols['pl_id'])
if dask:
cl = self._search_for_codes_df_inner(
cl=cl,
codes=codes,
cols=cols,
year=year,
keep_vars=keep_vars,
rename=rename,
collapse_codes=collapse_codes,
pl_ids_to_filter=pl_ids_to_filter)
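
# --- Illustrative sketch, not from the original source: the engine-specific
# branches above all build "an iterator of DataFrames, one per row group".
# A minimal pyarrow-only version of that pattern could look like this; the
# function name and arguments are hypothetical, and `columns` must include
# `index_col` for set_index() to work.
import pyarrow as pa
import pyarrow.parquet as pq


def iter_row_group_frames(path, columns, index_col):
    try:
        pf = pq.ParquetFile(path)
        for i in range(pf.num_row_groups):
            yield (pf.read_row_group(i, columns=list(columns))
                   .to_pandas()
                   .set_index(index_col))
    except (pa.ArrowIOError, OSError):
        # Directory of parquet files: read it in one chunk instead.
        ds = pq.ParquetDataset(path)
        yield ds.read(columns=list(columns)).to_pandas().set_index(index_col)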
def parquet_file_schema(file_name):
    import pyarrow.parquet as pq
    pq_dataset = pq.ParquetDataset(file_name)
    col_names = pq_dataset.schema.names
    pa_schema = pq_dataset.schema.to_arrow_schema()
    col_types = [_get_numba_typ_from_pa_typ(pa_schema.field_by_name(c).type)
                 for c in col_names]
    # TODO: close file?
    return col_names, col_types
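
# --- Illustrative sketch, not from the original source: parquet_file_schema()
# above is essentially a wrapper around pyarrow's schema inspection plus a
# numba type mapping. For a single file the arrow schema can be read directly;
# 'example.parquet' is a placeholder path.
import pyarrow.parquet as pq

arrow_schema = pq.read_schema('example.parquet')
for field in arrow_schema:
    print(field.name, field.type)   # e.g. "bene_id string", "amount double"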
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
"""Main worker function. Loads and returns all rows matching the predicate from a rowgroup
Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
columns needed by the predicate are loaded first. If no rows in the rowgroup matches the predicate criteria
the rest of the columns are not loaded.
:param piece_index:
:param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
of partitions.
:return:
"""
if not self._dataset:
self._dataset = pq.ParquetDataset(
self._dataset_path,
filesystem=self._filesystem,
validate_schema=False)
piece = self._split_pieces[piece_index]
        # Open the piece's parquet file through the dataset's filesystem
parquet_file = ParquetFile(self._dataset.fs.open(piece.path))
if not isinstance(self._local_cache, NullCache):
if worker_predicate:
raise RuntimeError('Local cache is not supported together with predicates, '
'unless the dataset is partitioned by the column the predicate operates on.')
if shuffle_row_drop_partition[1] != 1:
raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')
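
# --- Illustrative sketch, not from the original source: the predicate-first
# loading described in the docstring of process() above, using plain
# pyarrow/pandas. The predicate is modelled here as a (columns, function)
# pair; every name below is hypothetical, and predicate_columns and
# other_columns are assumed to be disjoint.
import pyarrow.parquet as pq


def load_matching_rows(parquet_path, row_group, predicate_columns, predicate_fn,
                       other_columns):
    pf = pq.ParquetFile(parquet_path)
    # Load only the columns the predicate needs first.
    pred_df = pf.read_row_group(row_group, columns=predicate_columns).to_pandas()
    mask = predicate_fn(pred_df)
    if not mask.any():
        # Nothing matches: skip loading the remaining (potentially wide) columns.
        return pred_df.iloc[0:0]
    # Otherwise load the rest of the columns and keep only the matching rows.
    rest_df = pf.read_row_group(row_group, columns=other_columns).to_pandas()
    return pred_df.join(rest_df)[mask]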
def _determine_dataset_parts(fs, paths, gather_statistics, filters, dataset_kwargs):
""" Determine how to access metadata and break read into ``parts``
This logic is mostly to handle `gather_statistics=False` cases,
because this also means we should avoid scanning every file in the
dataset.
"""
parts = []
if len(paths) > 1:
if gather_statistics is not False:
# This scans all the files
dataset = pq.ParquetDataset(
paths, filesystem=fs, filters=filters, **dataset_kwargs
)
else:
base, fns = _analyze_paths(paths, fs)
if "_metadata" in fns:
                # We have a _metadata file, let's use it
dataset = pq.ParquetDataset(
base + fs.sep + "_metadata",
filesystem=fs,
filters=filters,
**dataset_kwargs,
)
else:
# Rely on metadata for 0th file.
# Will need to pass a list of paths to read_partition
dataset = pq.ParquetDataset(paths[0], filesystem=fs, **dataset_kwargs)
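
# --- Illustrative sketch, not from the original source: what a "_metadata"
# footer buys you. A single read_metadata() call describes every row group in
# the dataset (including per-file paths and statistics) without listing or
# opening the individual data files. The path below is a placeholder.
import pyarrow.parquet as pq

meta = pq.read_metadata('dataset_dir/_metadata')
print(meta.num_row_groups, 'row groups described by the footer alone')
rg = meta.row_group(0)
print(rg.num_rows, rg.column(0).file_path)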
toload_regex.append(r'^(hmoind\d{2})$')
if self.year_type == 'age':
toload_regex.append(r'^(bene_dob)$')
for keep_var in keep_vars:
if isinstance(keep_var, str):
toload_regex.append(r'^({})$'.format(keep_var))
toload_regex = re.compile('|'.join(toload_regex)).search
toload_vars: Dict[int, List[str]] = {}
for year in self.years:
if self.parquet_engine == 'pyarrow':
try:
pf = pq.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
except pa.ArrowIOError:
pf = pq.ParquetDataset(self._fpath(self.percent, year, 'bsfab'))
cols = pf.schema.names
elif self.parquet_engine == 'fastparquet':
pf = fp.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
cols = pf.columns
toload_vars[year] = [x for x in cols if toload_regex(x)]
for keep_var in keep_vars:
                if isinstance(keep_var, re.Pattern):
toload_vars[year].extend([
x for x in cols if keep_var.search(x)])
# Deduplicate while preserving order
toload_vars[year] = list(dict.fromkeys(toload_vars[year]))
# Check cols against keep_vars
# Is there an item in keep_vars that wasn't matched?
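
# --- Illustrative sketch, not from the original source: one way to implement
# the check asked for in the comments above (an entry in keep_vars that matched
# no column). The helper name is hypothetical; string entries are exact column
# names and the rest are assumed to be compiled regexes, as in the snippet.
import warnings


def warn_unmatched_keep_vars(keep_vars, cols, year):
    for keep_var in keep_vars:
        if isinstance(keep_var, str):
            matched = keep_var in cols
        else:
            matched = any(keep_var.search(x) for x in cols)
        if not matched:
            warnings.warn(
                f'keep_var {keep_var!r} matched no column for year {year}')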
:param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read, or None if each row in the dataset
            corresponds to a single returned sample.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of a predicate (PredicateBase interface).
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current
            choices are libhdfs (Java through JNI) or libhdfs3 (C++).
"""
self._dataset_url_parsed = urlparse(dataset_url)
self._schema = schema
self._ngram = ngram
self._local_cache = local_cache
self._worker_predicate = worker_predicate
resolver = FilesystemResolver(self._dataset_url_parsed, hdfs_driver=hdfs_driver)
self._dataset = pq.ParquetDataset(
resolver.get_dataset_path(),
filesystem=resolver.filesystem(),
validate_schema=False)
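
# --- Illustrative sketch, not from the original source: for a local file://
# dataset URL the constructor above amounts to parsing the URL and opening the
# dataset with schema validation turned off. The URL is a placeholder.
from urllib.parse import urlparse

import pyarrow.parquet as pq

dataset_url = 'file:///tmp/my_dataset'
dataset = pq.ParquetDataset(urlparse(dataset_url).path, validate_schema=False)
print(len(dataset.pieces), 'row-group pieces discovered')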
# Get ehic-bene_id crosswalk
# If self.pl exists, then cl data frames use only those ids
# So I can merge using that
if self.pl is not None:
if 'match' in self.pl.columns:
right = self.pl.loc(axis=0)[:, year]
                right = right.loc[right['match'], ['ehic']]
else:
right = self.pl.loc[(slice(None), year), 'ehic']
right = right.to_frame()
else:
if self.parquet_engine == 'pyarrow':
try:
pf = pq.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
except pa.ArrowIOError:
pf = pq.ParquetDataset(self._fpath(self.percent, year, 'bsfab'))
right = pf.read(
columns=['ehic']).to_pandas().set_index('bene_id')
elif self.parquet_engine == 'fastparquet':
pf = fp.ParquetFile(
self._fpath(self.percent, year, 'bsfab'))
right = pf.to_pandas(columns=['ehic'], index='bene_id')
# Join bene_ids onto data using ehic
for data_type in data.keys():
data[data_type][year] = data[data_type][year].merge(
right, how='left', left_index=True, right_on='ehic')
# Concatenate ehic data
for data_type in data.keys():
            data[data_type]['ehic'] = pd.concat(
                [data[data_type].pop(year) for year in years_ehic])
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files
            dataset = pq.ParquetDataset(
                paths, filesystem=fs, filters=filters, **dataset_kwargs
            )
else:
base, fns = _analyze_paths(paths, fs)
if "_metadata" in fns:
                # We have a _metadata file, let's use it
dataset = pq.ParquetDataset(
base + fs.sep + "_metadata",
filesystem=fs,
filters=filters,
**dataset_kwargs,
)
else:
# Rely on metadata for 0th file.
# Will need to pass a list of paths to read_partition
dataset = pq.ParquetDataset(paths[0], filesystem=fs, **dataset_kwargs)
parts = [base + fs.sep + fn for fn in fns]
elif fs.isdir(paths[0]):
# This is a directory, check for _metadata, then _common_metadata
allpaths = fs.glob(paths[0] + fs.sep + "*")
base, fns = _analyze_paths(allpaths, fs)
if "_metadata" in fns and "validate_schema" not in dataset_kwargs:
dataset_kwargs["validate_schema"] = False
if "_metadata" in fns or gather_statistics is not False:
# Let arrow do its thing (use _metadata or scan files)
dataset = pq.ParquetDataset(
paths, filesystem=fs, filters=filters, **dataset_kwargs
)
else:
# Use _common_metadata file if it is available.
# Otherwise, just use 0th file
if "_common_metadata" in fns:
def _load_parquet_pandas_metadata(path, filesystem=None):
filesystem = validate_coerce_filesystem(path, filesystem)
if not filesystem.exists(path):
raise ValueError("Path not found: " + path)
if filesystem.isdir(path):
pqds = pq.ParquetDataset(
path, filesystem=filesystem, validate_schema=False
)
common_metadata = pqds.common_metadata
if common_metadata is None:
# Get metadata for first piece
piece = pqds.pieces[0]
metadata = piece.get_metadata().metadata
else:
metadata = pqds.common_metadata.metadata
else:
with filesystem.open(path) as f:
pf = pq.ParquetFile(f)
metadata = pf.metadata.metadata
    return json.loads(
        metadata.get(b'pandas', b'{}').decode('utf-8'))
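
# --- Illustrative sketch, not from the original source: what the b'pandas'
# blob decoded above typically contains for a single pandas-written file.
# The path is a placeholder; the listed keys are the ones pandas/pyarrow
# store ('index_columns', 'columns', 'pandas_version', ...).
import json

import pyarrow.parquet as pq

file_meta = pq.ParquetFile('example.parquet').metadata.metadata or {}
pandas_meta = json.loads(file_meta.get(b'pandas', b'{}').decode('utf-8'))
print(sorted(pandas_meta))   # e.g. ['column_indexes', 'columns', 'creator', 'index_columns', ...]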