How to use fsspec - 10 common examples

To help you get started, we’ve selected a few fsspec examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github quantumblacklabs / kedro / tests / io / test_json_dataset.py View on Github external
            ("file:///tmp/test.json", LocalFileSystem),
            ("/tmp/test.json", LocalFileSystem),
            ("gcs://bucket/file.json", GCSFileSystem),
        ],
    )
    def test_protocol_usage(self, filepath, instance_type):
        data_set = JSONDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)
        assert str(data_set._filepath) == data_set._fs._strip_protocol(filepath)
        assert isinstance(data_set._filepath, PurePosixPath)
github dcs4cop / xcube / test / core / test_dsio.py View on Github external
def test_path_or_store_write_to_bucket(self):
        path, _, _ = _get_path_or_store('http://obs.eu-de.otc.t-systems.com/fake_bucket/fake_cube.zarr',
                                        mode='write',
                                        client_kwargs={'aws_access_key_id': 'some_fake_id',
                                                       'aws_secret_access_key': 'some_fake_key'})
        self.assertIsInstance(path, fsspec.mapping.FSMap)
github dcs4cop / xcube / test / core / test_dsio.py View on Github external
def test_path_or_store_read_from_bucket(self):
        path, _, _ = _get_path_or_store('http://obs.eu-de.otc.t-systems.com/dcs4cop-obs-02/OLCI-SNS-RAW-CUBE-2.zarr',
                                        mode='read')
        self.assertIsInstance(path, fsspec.mapping.FSMap)
github intake / intake / intake / catalog / local.py View on Github external
def _load(self):
        # initial: find cat files
        # if flattening, need to get all entries from each.
        self._entries.clear()
        options = self.storage_options or {}
        if isinstance(self.path, (list, tuple)):
            files = sum([open_files(p, mode='rb', **options)
                         for p in self.path], [])
            self.name = self.name or "%i files" % len(files)
            self.description = self.description or f'Catalog generated from {len(files)} files'
            self.path = [make_path_posix(p) for p in self.path]
        else:
            if isinstance(self.path, str) and '*' not in self.path:
                self.path = self.path + '/*'
            files = open_files(self.path, mode='rb', **options)
            self.path = make_path_posix(self.path)
            self.name = self.name or self.path
            self.description = self.description or f'Catalog generated from all files found in {self.path}'
        if not set(f.path for f in files) == set(
                f.path for f in self._cat_files):
            # glob changed, reload all
            self._cat_files = files
            self._cats.clear()
        for f in files:
            name = os.path.split(f.path)[-1].replace(
                '.yaml', '').replace('.yml', '')
            kwargs = self.kwargs.copy()
            kwargs['path'] = f.path
            d = make_path_posix(os.path.dirname(f.path))
            if f.path not in self._cats:
                entry = LocalCatalogEntry(name, "YAML file: %s" % name,
github intake / filesystem_spec / fsspec / _version.py View on Github external
if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    date = keywords.get("date")
    if date is not None:
        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = set([r.strip() for r in refnames.strip("()").split(",")])
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = set([r for r in refs if re.search(r"\d", r)])
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
github intake / filesystem_spec / fsspec / _version.py View on Github external
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords."""
    if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    date = keywords.get("date")
    if date is not None:
        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = set([r.strip() for r in refnames.strip("()").split(",")])
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
github intake / filesystem_spec / fsspec / implementations / http.py View on Github external
head["Accept-Encoding"] = "identity"
    session = session or await get_client()
    if size_policy == "head":
        r = await session.head(url, allow_redirects=ar, **kwargs)
    elif size_policy == "get":
        r = await session.get(url, allow_redirects=ar, **kwargs)
    else:
        raise TypeError('size_policy must be "head" or "get", got %s' "" % size_policy)
    async with r:
        if "Content-Length" in r.headers:
            return int(r.headers["Content-Length"])
        elif "Content-Range" in r.headers:
            return int(r.headers["Content-Range"].split("/")[1])


file_size = sync_wrapper(_file_size)
github intake / filesystem_spec / fsspec / core.py View on Github external
return out
    x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
    bits = (
        [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
        if "::" in path
        else [path]
    )
    if len(bits) < 2:
        return []
    # [[url, protocol, kwargs], ...]
    out = []
    previous_bit = None
    previous_protocol = None
    for bit in reversed(bits):
        protocol = split_protocol(bit)[0] or "file"
        cls = get_filesystem_class(protocol)
        extra_kwargs = cls._get_kwargs_from_urls(bit)
        kws = kwargs.get(split_protocol(bit)[0] or "file", {})
        kw = dict(**extra_kwargs, **kws)
        if (
            protocol in {"blockcache", "filecache", "simplecache"}
            and "target_protocol" not in kw
        ):
            bit = previous_bit.replace(previous_protocol, protocol)
        out.append((bit, protocol, kw))
        previous_bit = bit
        previous_protocol = protocol
    out = list(reversed(out))
    # We should only do the url rewrite if the cache is in the middle of the chain
    if out[0][1] in {"blockcache", "filecache", "simplecache"}:
        out[0] = (f"{out[0][1]}://", out[0][1], out[0][2])
    return out
github dask / dask / dask / bytes / core.py View on Github external
OpenFile(fs, path, compression=compression),
                o,
                l,
                delimiter,
                dask_key_name=key,
            )
            for o, key, l in zip(offset, keys, length)
        ]
        out.append(values)

    if sample:
        if sample is True:
            sample = "10 kiB"  # backwards compatibility
        if isinstance(sample, str):
            sample = parse_bytes(sample)
        with OpenFile(fs, paths[0], compression=compression) as f:
            # read block without seek (because we start at zero)
            if delimiter is None:
                sample = f.read(sample)
            else:
                sample_buff = f.read(sample)
                while True:
                    new = f.read(sample)
                    if not new:
                        break
                    if delimiter in new:
                        sample_buff = (
                            sample_buff + new.split(delimiter, 1)[0] + delimiter
                        )
                        break
                    sample_buff = sample_buff + new
                sample = sample_buff
github intake / filesystem_spec / fsspec / implementations / memory.py View on Github external
identifier
        mode: str
            normally "rb", "wb" or "ab"
        """
        if mode in ["rb", "ab", "rb+"]:
            if path in self.store:
                f = self.store[path]
                if mode == "rb":
                    f.seek(0)
                else:
                    f.seek(0, 2)
                return f
            else:
                raise FileNotFoundError(path)
        if mode == "wb":
            m = MemoryFile(self, path)
            if not self._intrans:
                m.commit()
            return m