How to use the fastparquet.api.sorted_partitioned_columns function in fastparquet

To help you get started, we’ve selected a few fastparquet examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github.com: dask / dask — dask/dataframe/io/parquet.py (view on GitHub, external link)
"Previous: {} | New: {}".format(pf.columns, list(df.columns))
            )
        elif (pd.Series(pf.dtypes).loc[pf.columns] != df[pf.columns].dtypes).any():
            raise ValueError(
                "Appended dtypes differ.\n{}".format(
                    set(pf.dtypes.items()) ^ set(df.dtypes.iteritems())
                )
            )
        else:
            df = df[pf.columns + partition_on]

        fmd = pf.fmd
        i_offset = fastparquet.writer.find_max_part(fmd.row_groups)

        if not ignore_divisions:
            minmax = fastparquet.api.sorted_partitioned_columns(pf)
            old_end = minmax[index_cols[0]]["max"][-1]
            if divisions[0] < old_end:
                raise ValueError(
                    "Appended divisions overlapping with the previous ones.\n"
                    "Previous: {} | New: {}".format(old_end, divisions[0])
                )
    else:
        fmd = fastparquet.writer.make_metadata(
            df._meta,
            object_encoding=object_encoding,
            index_cols=index_cols,
            ignore_columns=partition_on,
            **kwargs
        )
        i_offset = 0
github.com: dask / dask — dask/dataframe/io/parquet/fastparquet.py (view on GitHub, external link)
elif (pd.Series(pf.dtypes).loc[pf.columns] != df[pf.columns].dtypes).any():
                raise ValueError(
                    "Appended dtypes differ.\n{}".format(
                        set(pf.dtypes.items()) ^ set(df.dtypes.iteritems())
                    )
                )
            else:
                df = df[pf.columns + partition_on]

            fmd = pf.fmd
            i_offset = fastparquet.writer.find_max_part(fmd.row_groups)
            if not ignore_divisions:
                if not set(index_cols).intersection([division_info["name"]]):
                    ignore_divisions = True
            if not ignore_divisions:
                minmax = fastparquet.api.sorted_partitioned_columns(pf)
                old_end = minmax[index_cols[0]]["max"][-1]
                divisions = division_info["divisions"]
                if divisions[0] < old_end:
                    raise ValueError(
                        "Appended divisions overlapping with previous ones."
                        "\n"
                        "Previous: {} | New: {}".format(old_end, divisions[0])
                    )
        else:
            fmd = fastparquet.writer.make_metadata(
                df._meta,
                object_encoding=object_encoding,
                index_cols=index_cols,
                ignore_columns=partition_on,
                **kwargs
            )
github.com: dask / dask — dask/dataframe/io/parquet.py (view on GitHub, external link)
getattr(pf, "tz", {}),
        )
        for i, rg in enumerate(rgs)
    }
    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names and infer_divisions is not False:
        index_name = meta.index.name
        try:
            # is https://github.com/dask/fastparquet/pull/371 available in
            # current fastparquet installation?
            minmax = fastparquet.api.sorted_partitioned_columns(pf, filters)
        except TypeError:
            minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_name in minmax:
            divisions = minmax[index_name]
            divisions = divisions["min"] + [divisions["max"][-1]]
        else:
            if infer_divisions is True:
                raise ValueError(
                    (
                        "Unable to infer divisions for index of '{index_name}'"
                        " because it is not known to be "
                        "sorted across partitions"
                    ).format(index_name=index_name)
                )

            divisions = (None,) * (len(rgs) + 1)
github.com: dask / dask — dask/dataframe/io/parquet.py (view on GitHub, external link)
for i, rg in enumerate(rgs)
    }
    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names and infer_divisions is not False:
        index_name = meta.index.name
        try:
            # is https://github.com/dask/fastparquet/pull/371 available in
            # current fastparquet installation?
            minmax = fastparquet.api.sorted_partitioned_columns(pf, filters)
        except TypeError:
            minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_name in minmax:
            divisions = minmax[index_name]
            divisions = divisions["min"] + [divisions["max"][-1]]
        else:
            if infer_divisions is True:
                raise ValueError(
                    (
                        "Unable to infer divisions for index of '{index_name}'"
                        " because it is not known to be "
                        "sorted across partitions"
                    ).format(index_name=index_name)
                )

            divisions = (None,) * (len(rgs) + 1)
    else:
        if infer_divisions is True: