How to use the pyarrow.parquet.ParquetDataset function in pyarrow

To help you get started, we’ve selected a few pyarrow.parquet.ParquetDataset examples, based on popular ways the function is used in public projects.


github kylebarron / medicare_utils / medicare_utils / medicare_df.py (View on GitHub)
            # NOTE: should the index here be cols['cl_id'] ?
            cl = dd.read_parquet(
                path,
                columns=cols_toload - set([cols['pl_id']]),
                index=cols['pl_id'])
        elif self.parquet_engine == 'pyarrow':
            try:
                pf = pq.ParquetFile(path)
                itr = (
                    pf.read_row_group(
                        i,
                        columns=cols_toload).to_pandas().set_index(
                                         cols['pl_id'])
                    for i in range(pf.num_row_groups))
            except pa.ArrowIOError:
                pf = pq.ParquetDataset(path)
                itr = (pf.read(columns=cols_toload).to_pandas().set_index(
                                 cols['pl_id']) for i in [1])
        elif self.parquet_engine == 'fastparquet':
            pf = fp.ParquetFile(path)
            itr = pf.iter_row_groups(columns=list(cols_toload), index=cols['pl_id'])

        if dask:
            cl = self._search_for_codes_df_inner(
                cl=cl,
                codes=codes,
                cols=cols,
                year=year,
                keep_vars=keep_vars,
                rename=rename,
                collapse_codes=collapse_codes,
                pl_ids_to_filter=pl_ids_to_filter)
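
The excerpt above tries pq.ParquetFile first and falls back to pq.ParquetDataset when pa.ArrowIOError is raised, which happens when the path points at a directory of Parquet files rather than a single file. A minimal sketch of that fallback pattern, with a hypothetical path and column list (not names from the original project):

import pyarrow as pa
import pyarrow.parquet as pq

def read_columns(path, columns):
    # Hypothetical helper; `path` and `columns` are placeholders.
    try:
        # Works when `path` points at a single .parquet file.
        return pq.ParquetFile(path).read(columns=columns).to_pandas()
    except (pa.ArrowIOError, OSError):
        # Fall back to ParquetDataset when `path` is a directory of files.
        return pq.ParquetDataset(path).read(columns=columns).to_pandas()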
github IntelPython / sdc / sdc / io / parquet_pio.py (View on GitHub)
def parquet_file_schema(file_name):
    import pyarrow.parquet as pq
    col_names = []
    col_types = []

    pq_dataset = pq.ParquetDataset(file_name)
    col_names = pq_dataset.schema.names
    pa_schema = pq_dataset.schema.to_arrow_schema()

    col_types = [_get_numba_typ_from_pa_typ(pa_schema.field_by_name(c).type)
                 for c in col_names]
    # TODO: close file?
    return col_names, col_types
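
This function only uses ParquetDataset to inspect the schema; no data is read. A hedged sketch of the same idea: on older pyarrow the legacy dataset exposes a Parquet schema that needs to_arrow_schema(), while recent versions return an Arrow schema directly (the file name is a placeholder):

import pyarrow.parquet as pq

def parquet_column_types(file_name):
    dataset = pq.ParquetDataset(file_name)
    schema = dataset.schema
    if hasattr(schema, "to_arrow_schema"):
        # Legacy ParquetDataset returns a Parquet schema; convert it to Arrow.
        schema = schema.to_arrow_schema()
    return schema.names, [schema.field(name).type for name in schema.names]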
github uber / petastorm / petastorm / arrow_reader_worker.py (View on GitHub)
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index:
        :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path,
                filesystem=self._filesystem,
                validate_schema=False)

        piece = self._split_pieces[piece_index]

        # Create pyarrow file system
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')
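
petastorm builds the ParquetDataset lazily with validate_schema=False and then reads one row group ("piece") at a time through ParquetFile. A rough sketch of that piece-at-a-time pattern, assuming the legacy pyarrow API that exposes dataset.pieces (newer releases expose dataset.fragments instead); the path and column names are placeholders:

import pyarrow.parquet as pq

dataset = pq.ParquetDataset("/data/my_dataset", validate_schema=False)

for piece in dataset.pieces:          # legacy API: one piece per data file
    pf = pq.ParquetFile(piece.path)
    for i in range(pf.num_row_groups):
        table = pf.read_row_group(i, columns=["user_id", "score"])  # placeholder columns
        print(piece.path, i, table.num_rows)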
github dask / dask / dask / dataframe / io / parquet / arrow.py (View on GitHub)
def _determine_dataset_parts(fs, paths, gather_statistics, filters, dataset_kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.
    """
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files
            dataset = pq.ParquetDataset(
                paths, filesystem=fs, filters=filters, **dataset_kwargs
            )
        else:
            base, fns = _analyze_paths(paths, fs)
            if "_metadata" in fns:
                # We have a _metadata file, let's use it
                dataset = pq.ParquetDataset(
                    base + fs.sep + "_metadata",
                    filesystem=fs,
                    filters=filters,
                    **dataset_kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                dataset = pq.ParquetDataset(paths[0], filesystem=fs, **dataset_kwargs)
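
Two things are worth noting in this helper: dask prefers the `_metadata` sidecar file over scanning every part file, and it forwards `filters` so pyarrow can prune data before it is read. A hedged sketch of the `filters` keyword (the path and column are placeholders; on the legacy API filters prune hive-style partition directories, on the modern dataset API they also prune row groups via statistics):

import pyarrow.parquet as pq

dataset = pq.ParquetDataset(
    "/data/events",                  # placeholder path
    filters=[("year", ">=", 2015)],  # list of (column, op, value) predicates
)
table = dataset.read()
print(table.num_rows)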
github kylebarron / medicare_utils / medicare_utils / medicare_df.py (View on GitHub)
        toload_regex.append(r'^(hmoind\d{2})$')
        if self.year_type == 'age':
            toload_regex.append(r'^(bene_dob)$')
        for keep_var in keep_vars:
            if isinstance(keep_var, str):
                toload_regex.append(r'^({})$'.format(keep_var))

        toload_regex = re.compile('|'.join(toload_regex)).search

        toload_vars: Dict[int, List[str]] = {}
        for year in self.years:
            if self.parquet_engine == 'pyarrow':
                try:
                    pf = pq.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
                except pa.ArrowIOError:
                    pf = pq.ParquetDataset(self._fpath(self.percent, year, 'bsfab'))
                cols = pf.schema.names
            elif self.parquet_engine == 'fastparquet':
                pf = fp.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
                cols = pf.columns

            toload_vars[year] = [x for x in cols if toload_regex(x)]
            for keep_var in keep_vars:
                if isinstance(keep_var, re._pattern_type):
                    toload_vars[year].extend([
                        x for x in cols if keep_var.search(x)])

            # Deduplicate while preserving order
            toload_vars[year] = list(dict.fromkeys(toload_vars[year]))

            # Check cols against keep_vars
            # Is there an item in keep_vars that wasn't matched?
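
Here ParquetDataset again serves as the fallback for directory-shaped data, and only the schema names are needed to decide which columns to load. Both the legacy and the modern dataset expose `.schema.names`, so a column-selection sketch looks like this (the path and regex are placeholders):

import re
import pyarrow.parquet as pq

dataset = pq.ParquetDataset("/data/bsfab")    # placeholder path
col_names = dataset.schema.names

wanted = re.compile(r"^hmoind\d{2}$").search  # placeholder pattern
columns = [name for name in col_names if wanted(name)]
df = dataset.read(columns=columns).to_pandas()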
github uber / petastorm / petastorm / reader_impl / row_group_loader.py (View on GitHub)
        :param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read or None, if each row in the dataset corresponds to
          a single sample returned.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of predicate (PredicateBase interface)
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
        """
        self._dataset_url_parsed = urlparse(dataset_url)
        self._schema = schema
        self._ngram = ngram
        self._local_cache = local_cache
        self._worker_predicate = worker_predicate

        resolver = FilesystemResolver(self._dataset_url_parsed, hdfs_driver=hdfs_driver)
        self._dataset = pq.ParquetDataset(
            resolver.get_dataset_path(),
            filesystem=resolver.filesystem(),
            validate_schema=False)
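
The FilesystemResolver lets the same loader open local, HDFS, or S3 datasets by passing an explicit `filesystem` object to ParquetDataset. A minimal sketch of the same idea with s3fs, assuming that package is installed and credentials are configured in the environment (the bucket and prefix are placeholders):

import pyarrow.parquet as pq
import s3fs  # assumed installed; any fsspec-compatible filesystem works

fs = s3fs.S3FileSystem()
dataset = pq.ParquetDataset(
    "my-bucket/warehouse/events",   # placeholder bucket/prefix
    filesystem=fs,
)
table = dataset.read(columns=["event_id"])  # placeholder column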
github kylebarron / medicare_utils / medicare_utils / medicare_df.py (View on GitHub)
                # Get ehic-bene_id crosswalk
                # If self.pl exists, then cl data frames use only those ids
                # So I can merge using that
                if self.pl is not None:
                    if 'match' in self.pl.columns:
                        right = self.pl.loc(axis=0)[:, year]
                        right.loc[right['match'], 'ehic']
                    else:
                        right = self.pl.loc[(slice(None), year), 'ehic']
                    right = right.to_frame()
                else:
                    if self.parquet_engine == 'pyarrow':
                        try:
                            pf = pq.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
                        except pa.ArrowIOError:
                            pf = pq.ParquetDataset(self._fpath(self.percent, year, 'bsfab'))
                        right = pf.read(
                            columns=['ehic']).to_pandas().set_index('bene_id')
                    elif self.parquet_engine == 'fastparquet':
                        pf = fp.ParquetFile(
                            self._fpath(self.percent, year, 'bsfab'))
                        right = pf.to_pandas(columns=['ehic'], index='bene_id')

                # Join bene_ids onto data using ehic
                for data_type in data.keys():
                    data[data_type][year] = data[data_type][year].merge(
                        right, how='left', left_index=True, right_on='ehic')

            # Concatenate ehic data
            for data_type in data.keys():
                data[data_type]['ehic'] = pd.concat(
                    [data[data_type].pop(year) for year in years_ehic],
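
The pyarrow branch reads only the crosswalk columns and converts the result straight into an indexed pandas frame. A condensed sketch of that column-subset read (the path and column names are placeholders):

import pyarrow.parquet as pq

dataset = pq.ParquetDataset("/data/bsfab/2010")       # placeholder path
crosswalk = (
    dataset.read(columns=["bene_id", "ehic"])          # placeholder columns
    .to_pandas()
    .set_index("bene_id")
)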
github dask / dask / dask / dataframe / io / parquet / arrow.py (View on GitHub)
                paths, filesystem=fs, filters=filters, **dataset_kwargs
            )
        else:
            base, fns = _analyze_paths(paths, fs)
            if "_metadata" in fns:
                # We have a _metadata file, let's use it
                dataset = pq.ParquetDataset(
                    base + fs.sep + "_metadata",
                    filesystem=fs,
                    filters=filters,
                    **dataset_kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                dataset = pq.ParquetDataset(paths[0], filesystem=fs, **dataset_kwargs)
                parts = [base + fs.sep + fn for fn in fns]
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        allpaths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(allpaths, fs)
        if "_metadata" in fns and "validate_schema" not in dataset_kwargs:
            dataset_kwargs["validate_schema"] = False
        if "_metadata" in fns or gather_statistics is not False:
            # Let arrow do its thing (use _metadata or scan files)
            dataset = pq.ParquetDataset(
                paths, filesystem=fs, filters=filters, **dataset_kwargs
            )
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
github holoviz / spatialpandas / spatialpandas / io / parquet.py (View on GitHub)
def _load_parquet_pandas_metadata(path, filesystem=None):
    filesystem = validate_coerce_filesystem(path, filesystem)
    if not filesystem.exists(path):
        raise ValueError("Path not found: " + path)

    if filesystem.isdir(path):
        pqds = pq.ParquetDataset(
            path, filesystem=filesystem, validate_schema=False
        )
        common_metadata = pqds.common_metadata
        if common_metadata is None:
            # Get metadata for first piece
            piece = pqds.pieces[0]
            metadata = piece.get_metadata().metadata
        else:
            metadata = pqds.common_metadata.metadata
    else:
        with filesystem.open(path) as f:
            pf = pq.ParquetFile(f)
        metadata = pf.metadata.metadata

    return json.loads(
        metadata.get(b'pandas', b'{}').decode('utf')
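
The pandas-specific metadata lives under the b'pandas' key of the file-level key/value metadata; spatialpandas pulls it from `common_metadata` when the path is a dataset and from the file footer otherwise. A single-file sketch (the path is a placeholder):

import json
import pyarrow.parquet as pq

pf = pq.ParquetFile("/data/points.parquet")   # placeholder path
kv = pf.metadata.metadata or {}                # file-level key/value metadata
pandas_meta = json.loads(kv.get(b"pandas", b"{}").decode("utf8"))
print(pandas_meta.get("index_columns"))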
github uber / petastorm / petastorm / py_dict_reader_worker.py (View on GitHub)
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index:
        :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path,
                filesystem=self._filesystem,
                validate_schema=False)

        piece = self._split_pieces[piece_index]

        # Create pyarrow file system
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')
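
This worker is nearly identical to the arrow_reader_worker example above; the detail worth highlighting is that the piece is reopened through the dataset's own filesystem handle (`self._dataset.fs.open(...)`), so the same code works whether the dataset lives locally or on HDFS. A hedged sketch of that step, assuming the legacy ParquetDataset attributes `fs` and `pieces` (the path is a placeholder):

import pyarrow.parquet as pq
from pyarrow.parquet import ParquetFile

dataset = pq.ParquetDataset("/data/my_dataset", validate_schema=False)

piece = dataset.pieces[0]
parquet_file = ParquetFile(dataset.fs.open(piece.path))  # reopen via the dataset's filesystem
table = parquet_file.read_row_group(0)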