# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _apply_row_group_selector(self, dataset, rowgroup_selector, filtered_row_group_indexes):
    """Filter ``filtered_row_group_indexes`` down to the row groups chosen by ``rowgroup_selector``.

    :param dataset: dataset object passed through to ``rowgroup_indexing.get_row_group_indexes``
        (presumably a ``pyarrow.parquet.ParquetDataset`` — confirm against caller).
    :param rowgroup_selector: an instance of a class derived from :class:`RowGroupSelectorBase`.
    :param filtered_row_group_indexes: iterable of row-group indexes to be filtered further.
    :return: a new list containing only the indexes accepted by the selector, preserving the
        order of ``filtered_row_group_indexes``.
    :raises ValueError: if ``rowgroup_selector`` has the wrong type, or if it requires an
        index that is not present in the dataset metadata.
    """
    if not isinstance(rowgroup_selector, RowGroupSelectorBase):
        raise ValueError('rowgroup_selector parameter is expected to be derived from RowGroupSelectorBase')

    # Load indexes from metadata
    available_row_group_indexes = rowgroup_indexing.get_row_group_indexes(dataset)

    required_indexes = rowgroup_selector.get_index_names()
    if not set(required_indexes).issubset(set(available_row_group_indexes.keys())):
        raise ValueError('Some of required indexes {} are not available in {}'.format(
            required_indexes, list(available_row_group_indexes.keys())))

    # Materialize the selection as a set so each membership test below is O(1);
    # with a list-returning selector the original comprehension was O(n*m).
    selected_indexes = set(rowgroup_selector.select_row_groups(available_row_group_indexes))

    # Include only selected indexes, but keep filtered_row_group_indexes order.
    return [idx for idx in filtered_row_group_indexes if idx in selected_indexes]
def _apply_row_group_selector(self, dataset, rowgroup_selector, filtered_row_group_indexes):
    """Restrict a list of row-group indexes to those picked by a row-group selector.

    Validates the selector type, verifies that every index it needs exists in the
    dataset metadata, and returns the subset of ``filtered_row_group_indexes``
    accepted by the selector, in their original order.
    """
    if not isinstance(rowgroup_selector, RowGroupSelectorBase):
        raise ValueError('rowgroup_selector parameter is expected to be derived from RowGroupSelectorBase')

    # Pull the per-field row-group indexes stored in the dataset metadata.
    available = rowgroup_indexing.get_row_group_indexes(dataset)

    needed = rowgroup_selector.get_index_names()
    # Any required index missing from the metadata is a hard error.
    if set(needed) - set(available.keys()):
        raise ValueError('Some of required indexes {} are not available in {}'.format(
            needed, list(available.keys())))

    chosen = rowgroup_selector.select_row_groups(available)

    # Walk the incoming list so the original ordering is preserved.
    result = []
    for row_group_idx in filtered_row_group_indexes:
        if row_group_idx in chosen:
            result.append(row_group_idx)
    return result
# NOTE(review): this appears to be the body of a CLI entry point that dumps a
# dataset's schema and row-group indexes; `args` presumably comes from argparse
# (fields: dataset_url, hdfs_driver, schema, index, skip_index, print_values) —
# confirm against the surrounding function. Indentation below is reconstructed;
# verify nesting of the print_values branch against the original.

# Normalize the URL: strip a single trailing slash if present.
if args.dataset_url and args.dataset_url[-1] == '/':
    args.dataset_url = args.dataset_url[:-1]

# Create pyarrow file system
resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                            validate_schema=False)

# With no explicit flag, print both the schema and the indexes.
print_all = not args.schema and not args.index
if args.schema or print_all:
    print('*** Schema from dataset metadata ***')
    print((dataset_metadata.get_schema(dataset)))

if args.index or print_all:
    index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
    print('*** Row group indexes from dataset metadata ***')
    for index_name in index_dict:
        print(('Index: {}'.format(index_name)))
        # Honor --skip-index: only expand indexes the user did not exclude.
        if args.skip_index is None or index_name not in args.skip_index:
            for field_value in index_dict[index_name].indexed_values:
                # For each indexed value, show the value and how many row groups it maps to.
                print(' -- {}({})'.format(field_value,
                                          len(index_dict[index_name].get_row_group_indexes(field_value))))
                if args.print_values:
                    # Optionally dump the actual row-group index list for this value.
                    print(index_dict[index_name].get_row_group_indexes(field_value))
        else:
            print(' (skipped)')