How to use the fsspec.core.get_fs_token_paths function in fsspec

To help you get started, we’ve selected a few fsspec examples based on popular ways it is used in public projects.

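get_fs_token_paths takes a URL, glob, or list of paths and returns three things: the filesystem instance to use, a token identifying that filesystem and its options (handy as a cache or task-graph key), and the expanded list of concrete paths. A minimal sketch of a read-mode call against the local filesystem (the glob below is hypothetical):

from fsspec.core import get_fs_token_paths

# Expand a local glob; backend-specific settings would go in storage_options
# (none are needed for local files).
fs, token, paths = get_fs_token_paths("data/2015-*.csv", mode="rb", storage_options={})

print(type(fs).__name__)  # LocalFileSystem
print(token)              # hashable token for the filesystem + options
print(paths)              # list of matching absolute paths (may be empty)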

github dask / dask / dask / bytes / core.py
    >>> sample, blocks = read_bytes('s3://bucket/2015-*-*.csv', delimiter=b'\\n')  # doctest: +SKIP
    >>> sample, paths, blocks = read_bytes('2015-*-*.csv', include_path=True)  # doctest: +SKIP

    Returns
    -------
    sample : bytes
        The sample header
    blocks : list of lists of ``dask.Delayed``
        Each list corresponds to a file, and each delayed object computes to a
        block of bytes from that file.
    paths : list of strings, only included if include_path is True
        List of same length as blocks, where each item is the path to the file
        represented in the corresponding block.

    """
    fs, fs_token, paths = get_fs_token_paths(urlpath, mode="rb", storage_options=kwargs)

    if len(paths) == 0:
        raise IOError("%s resolved to no files" % urlpath)

    if blocksize is not None:
        if isinstance(blocksize, str):
            blocksize = parse_bytes(blocksize)
        if not is_integer(blocksize):
            raise TypeError("blocksize must be an integer")
        blocksize = int(blocksize)

    if blocksize is None:
        offsets = [[0]] * len(paths)
        lengths = [[None]] * len(paths)
    else:
        offsets = []
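In the read_bytes snippet above, whatever keyword arguments the caller passes end up as storage_options, so credentials and backend settings travel with the URL, and an empty expansion is treated as an error. A hedged sketch of the same pattern for S3 (assumes s3fs is installed; the bucket and glob are hypothetical):

from fsspec.core import get_fs_token_paths

urlpath = "s3://my-bucket/2015-*-*.csv"   # hypothetical bucket/glob
storage_options = {"anon": True}          # forwarded to s3fs

fs, fs_token, paths = get_fs_token_paths(
    urlpath, mode="rb", storage_options=storage_options
)

# Same guard as dask: a glob that matches nothing is almost always a mistake.
if len(paths) == 0:
    raise IOError("%s resolved to no files" % urlpath)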
github dask / dask / dask / dataframe / io / parquet.py
    See Also
    --------
    read_parquet: Read parquet data to dask.dataframe
    """
    partition_on = partition_on or []

    if set(partition_on) - set(df.columns):
        raise ValueError("Partitioning on non-existent column")

    if compression != "default":
        kwargs["compression"] = compression
    elif snappy is not None:
        kwargs["compression"] = "snappy"

    write = get_engine(engine)["write"]

    fs, fs_token, _ = get_fs_token_paths(
        path, mode="wb", storage_options=storage_options
    )
    fs.mkdirs(path, exist_ok=True)
    # Trim any protocol information from the path before forwarding
    path = infer_storage_options(path)["path"]

    out = write(
        df,
        fs,
        fs_token,
        path,
        write_index=write_index,
        append=append,
        ignore_divisions=ignore_divisions,
        partition_on=partition_on,
        **kwargs
github dask / dask / dask / dataframe / io / parquet / core.py
        path,
        columns,
        filters,
        categories,
        index,
        storage_options,
        engine,
        gather_statistics,
    )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options)

    paths = sorted(paths, key=natural_sort_key)  # numeric rather than glob ordering

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    meta, statistics, parts = engine.read_metadata(
        fs,
        paths,
        categories=categories,
        index=index,
        gather_statistics=gather_statistics,
github dask / dask / dask / dataframe / io / parquet.py
    if is_ParquetFile:
        read = get_engine("fastparquet")["read"]
        if path.fn.endswith("_metadata"):
            # remove '_metadata' from path
            urlpath = path.fn[: -len("_metadata")]
        else:
            urlpath = path.fn

        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode="rb", storage_options=storage_options
        )
        paths = path
    else:
        read = get_engine(engine)["read"]
        fs, fs_token, paths = get_fs_token_paths(
            path, mode="rb", storage_options=storage_options
        )

        if isinstance(path, string_types) and len(paths) > 1:
            # Sort paths naturally if multiple paths resulted from a single
            # specification (by '*' globbing)
            paths = sorted(paths, key=natural_sort_key)

    return read(
        fs,
        fs_token,
        paths,
        columns=columns,
        filters=filters,
        categories=categories,
        index=index,
github dask / dask / dask / dataframe / io / parquet / core.py
    if isinstance(partition_on, str):
        partition_on = [partition_on]

    if set(partition_on) - set(df.columns):
        raise ValueError(
            "Partitioning on non-existent column. "
            "partition_on=%s ."
            "columns=%s" % (str(partition_on), str(list(df.columns)))
        )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    # Save divisions and corresponding index name. This is necessary,
    # because we may be resetting the index to write the file
    division_info = {"divisions": df.divisions, "name": df.index.name}
    if division_info["name"] is None:
        # As of 0.24.2, pandas will rename an index with name=None
        # when df.reset_index() is called.  The default name is "index",
        # (or "level_0" if "index" is already a column name)
        division_info["name"] = "index" if "index" not in df.columns else "level_0"

    # If write_index==True (default), reset the index and record the
    # name of the original index in `index_cols` (will be `index` if None,
    # or `level_0` if `index` is already a column name).
    # `fastparquet` will use `index_cols` to specify the index column(s)
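The write-side snippets above all follow the same pattern: resolve the filesystem with mode="wb", make sure the output location exists, then strip the protocol so only the bare path is handed to the writer. A minimal sketch of that flow (the destination is hypothetical; _strip_protocol is a private fsspec method, used here exactly as dask uses it above):

from fsspec.core import get_fs_token_paths

path = "s3://my-bucket/output.parquet"    # hypothetical destination
fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options={"anon": False})

# Create the target directory; on object stores without real directories
# this is effectively a no-op.
fs.mkdirs(path, exist_ok=True)

# Trim protocol information before forwarding, e.g. "my-bucket/output.parquet".
bare_path = fs._strip_protocol(path)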