("file:///tmp/test.json", LocalFileSystem),
("/tmp/test.json", LocalFileSystem),
("gcs://bucket/file.json", GCSFileSystem),
],
)
def test_protocol_usage(self, filepath, instance_type):
data_set = JSONDataSet(filepath=filepath)
assert isinstance(data_set._fs, instance_type)
assert str(data_set._filepath) == data_set._fs._strip_protocol(filepath)
assert isinstance(data_set._filepath, PurePosixPath)
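The test hinges on fsspec's protocol-to-filesystem resolution. A minimal sketch of the same lookup, assuming gcsfs is installed:

import fsspec

cls = fsspec.get_filesystem_class("gcs")       # resolves to gcsfs.GCSFileSystem
cls._strip_protocol("gcs://bucket/file.json")  # -> "bucket/file.json"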
def test_path_or_store_write_to_bucket(self):
    path, _, _ = _get_path_or_store(
        'http://obs.eu-de.otc.t-systems.com/fake_bucket/fake_cube.zarr',
        mode='write',
        client_kwargs={'aws_access_key_id': 'some_fake_id',
                       'aws_secret_access_key': 'some_fake_key'})
    self.assertIsInstance(path, fsspec.mapping.FSMap)

def test_path_or_store_read_from_bucket(self):
    path, _, _ = _get_path_or_store(
        'http://obs.eu-de.otc.t-systems.com/dcs4cop-obs-02/OLCI-SNS-RAW-CUBE-2.zarr',
        mode='read')
    self.assertIsInstance(path, fsspec.mapping.FSMap)
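Both tests expect an fsspec.mapping.FSMap, the dict-like store that zarr consumes. A minimal sketch of building one directly through fsspec's public API, with the memory protocol standing in for a real bucket:

import fsspec

store = fsspec.get_mapper("memory://fake_cube.zarr")
isinstance(store, fsspec.mapping.FSMap)  # True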
def _load(self):
    # initial: find cat files
    # if flattening, need to get all entries from each.
    self._entries.clear()
    options = self.storage_options or {}
    if isinstance(self.path, (list, tuple)):
        files = sum([open_files(p, mode='rb', **options)
                     for p in self.path], [])
        self.name = self.name or "%i files" % len(files)
        self.description = (self.description
                            or f'Catalog generated from {len(files)} files')
        self.path = [make_path_posix(p) for p in self.path]
    else:
        if isinstance(self.path, str) and '*' not in self.path:
            self.path = self.path + '/*'
        files = open_files(self.path, mode='rb', **options)
        self.path = make_path_posix(self.path)
        self.name = self.name or self.path
        self.description = (self.description
                            or f'Catalog generated from all files found in {self.path}')
    if not set(f.path for f in files) == set(
            f.path for f in self._cat_files):
        # glob changed, reload all
        self._cat_files = files
        self._cats.clear()
    for f in files:
        name = os.path.split(f.path)[-1].replace(
            '.yaml', '').replace('.yml', '')
        kwargs = self.kwargs.copy()
        kwargs['path'] = f.path
        d = make_path_posix(os.path.dirname(f.path))
        if f.path not in self._cats:
            # one sub-catalog entry per YAML file found by the glob
            entry = LocalCatalogEntry(name, "YAML file: %s" % name,
                                      YAMLFileCatalog, True, kwargs,
                                      [], [], self.metadata, d)
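In practice this loader is reached through intake's public entry point; a short usage sketch, with a hypothetical glob path:

import intake

cat = intake.open_catalog("catalogs/*.yaml")  # hypothetical directory of YAML files
list(cat)  # one entry per .yaml/.yml file matched by the glob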
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords."""
    if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    date = keywords.get("date")
    if date is not None:
        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = set([r.strip() for r in refnames.strip("()").split(",")])
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = set([r for r in refs if re.search(r"\d", r)])
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
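For context, a hypothetical keywords dict as git archive would expand it (the refnames, hash, and date values here are made up):

keywords = {
    "refnames": " (HEAD -> main, tag: widget-1.2.0)",
    "full": "9c0d1a2b3c4d5e6f7a8b9c0d1a2b3c4d5e6f7a8b",
    "date": "2020-11-20 10:15:30 -0500",
}
# git_versions_from_keywords(keywords, "widget-", verbose=True) would keep the
# "tag: " entries, strip the prefix, and report version "1.2.0".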
head["Accept-Encoding"] = "identity"
session = session or await get_client()
if size_policy == "head":
r = await session.head(url, allow_redirects=ar, **kwargs)
elif size_policy == "get":
r = await session.get(url, allow_redirects=ar, **kwargs)
else:
raise TypeError('size_policy must be "head" or "get", got %s' "" % size_policy)
async with r:
if "Content-Length" in r.headers:
return int(r.headers["Content-Length"])
elif "Content-Range" in r.headers:
return int(r.headers["Content-Range"].split("/")[1])
file_size = sync_wrapper(_file_size)
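A rough usage sketch of the coroutine form (example.com is a placeholder URL; get_client comes from the same module):

import asyncio

size = asyncio.run(_file_size("https://example.com/data.bin", size_policy="head"))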
def _un_chain(path, kwargs):
    # split_protocol and get_filesystem_class are fsspec helpers
    x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
    bits = (
        [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
        if "::" in path
        else [path]
    )
    if len(bits) < 2:
        return []
    # [[url, protocol, kwargs], ...]
    out = []
    previous_bit = None
    previous_protocol = None
    for bit in reversed(bits):
        protocol = split_protocol(bit)[0] or "file"
        cls = get_filesystem_class(protocol)
        extra_kwargs = cls._get_kwargs_from_urls(bit)
        kws = kwargs.get(protocol, {})
        kw = dict(**extra_kwargs, **kws)
        if (
            protocol in {"blockcache", "filecache", "simplecache"}
            and "target_protocol" not in kw
        ):
            # cache protocols wrap the previous (inner) URL rather than
            # carrying a path of their own
            bit = previous_bit.replace(previous_protocol, protocol)
        out.append((bit, protocol, kw))
        previous_bit = bit
        previous_protocol = protocol
    out = list(reversed(out))
    # We should only do the url rewrite if the cache is in the middle of the chain
    if out[0][1] in {"blockcache", "filecache", "simplecache"}:
        out[0] = (f"{out[0][1]}://", out[0][1], out[0][2])
    return out
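Tracing a chained URL through this helper makes the rewrite rule concrete. With this version of the code, a sketch (exact kwargs omitted):

_un_chain("simplecache::zip://*.csv::s3://bucket/archive.zip", {})
# -> roughly:
# [("simplecache://", "simplecache", {}),
#  ("zip://*.csv", "zip", {}),
#  ("s3://bucket/archive.zip", "s3", {})]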
# one delayed task per (offset, length) block of the current file
values = [
    delayed(read_block_from_file)(
        OpenFile(fs, path, compression=compression),
        o,
        l,
        delimiter,
        dask_key_name=key,
    )
    for o, key, l in zip(offset, keys, length)
]
out.append(values)

if sample:
    if sample is True:
        sample = "10 kiB"  # backwards compatibility
    if isinstance(sample, str):
        sample = parse_bytes(sample)
    with OpenFile(fs, paths[0], compression=compression) as f:
        # read block without seek (because we start at zero)
        if delimiter is None:
            sample = f.read(sample)
        else:
            sample_buff = f.read(sample)
            while True:
                new = f.read(sample)
                if not new:
                    break
                if delimiter in new:
                    sample_buff = (
                        sample_buff + new.split(delimiter, 1)[0] + delimiter
                    )
                    break
                sample_buff = sample_buff + new
            sample = sample_buff
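The sampling loop keeps reading until the sample ends on a delimiter, so downstream parsers never see a half record. A self-contained sketch of the same logic against an in-memory file:

import io

f = io.BytesIO(b"a,1\nb,2\nc,3\nd,4\n")
sample, delimiter = 6, b"\n"
buff = f.read(sample)
while True:
    new = f.read(sample)
    if not new:
        break
    if delimiter in new:
        buff = buff + new.split(delimiter, 1)[0] + delimiter
        break
    buff = buff + new
# buff == b"a,1\nb,2\n": extended past 6 bytes, then cut at the newline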
def _open(self, path, mode="rb", **kwargs):
    """Make a file-like object

    Parameters
    ----------
    path: str
        identifier
    mode: str
        normally "rb", "wb" or "ab"
    """
    if mode in ["rb", "ab", "rb+"]:
        if path in self.store:
            f = self.store[path]
            if mode == "rb":
                f.seek(0)       # read: start from the beginning
            else:
                f.seek(0, 2)    # append: position at end of file
            return f
        else:
            raise FileNotFoundError(path)
    if mode == "wb":
        # write: create a fresh in-memory file; commit immediately
        # unless we are inside a transaction
        m = MemoryFile(self, path)
        if not self._intrans:
            m.commit()
        return m
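A minimal round trip through the in-memory filesystem that this method backs:

import fsspec

fs = fsspec.filesystem("memory")
with fs.open("/demo.txt", "wb") as f:
    f.write(b"hello")
with fs.open("/demo.txt", "rb") as f:
    print(f.read())  # b"hello"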