How to use the fastparquet.api.filter_out_cats function in fastparquet

To help you get started, we’ve selected a few fastparquet examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

Example from github.com/dask/dask — file: dask/dataframe/io/parquet/fastparquet.py (view on GitHub, external link)
if None in [cs_min, cs_max] and i == 0:
                            skip_cols.add(col)
                            continue
                        if isinstance(cs_min, np.datetime64):
                            cs_min = pd.Timestamp(cs_min)
                            cs_max = pd.Timestamp(cs_max)
                        d.update(
                            {
                                "min": cs_min,
                                "max": cs_max,
                                "null_count": pf.statistics["null_count"][col][i],
                            }
                        )
                        s["columns"].append(d)
                # Need this to filter out partitioned-on categorical columns
                s["filter"] = fastparquet.api.filter_out_cats(row_group, filters)
                s["total_byte_size"] = row_group.total_byte_size
                s["file_path_0"] = row_group.columns[0].file_path  # 0th column only
                stats.append(s)

        else:
            stats = None

        pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
        pf.fmd.row_groups = None

        # Create `parts`
        # This is a list of row-group-descriptor dicts, or file-paths
        # if we have a list of files and gather_statistics=False
        if not parts:
            partsin = pf.row_groups
            if fast_metadata:
Example from github.com/dask/dask — file: dask/dataframe/io/parquet.py (view on GitHub, external link)
)

    (
        meta,
        filters,
        index_name,
        out_type,
        all_columns,
        index_names,
        storage_name_mapping,
    ) = _pf_validation(pf, columns, index, categories, filters)
    rgs = [
        rg
        for rg in pf.row_groups
        if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
        and not (fastparquet.api.filter_out_cats(rg, filters))
    ]

    name = "read-parquet-" + tokenize(fs_token, paths, all_columns, filters, categories)
    dsk = {
        (name, i): (
            _read_parquet_row_group,
            fs,
            pf.row_group_filename(rg),
            index_names,
            all_columns,
            rg,
            out_type == Series,
            categories,
            pf.schema,
            pf.cats,
            pf.dtypes,