("s3://bucket/file.csv", S3FileSystem),
("file:///tmp/test.csv", LocalFileSystem),
("/tmp/test.csv", LocalFileSystem),
("gcs://bucket/file.csv", GCSFileSystem),
("https://example.com/file.csv", HTTPFileSystem),
],
)
def test_protocol_usage(self, filepath, instance_type):
data_set = CSVDataSet(filepath=filepath)
assert isinstance(data_set._fs, instance_type)
# _strip_protocol() doesn't strip http(s) protocol
if data_set._protocol == "https":
path = filepath.split("://")[-1]
else:
path = data_set._fs._strip_protocol(filepath)
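The http(s) special case above exists because fsspec's HTTPFileSystem keeps the full URL when "stripping" the protocol, while other filesystems remove the scheme prefix. A minimal sketch of that behavior, assuming fsspec (plus aiohttp for the HTTP filesystem) is installed:

from fsspec.implementations.local import LocalFileSystem
from fsspec.implementations.http import HTTPFileSystem

# _strip_protocol() is a classmethod on fsspec filesystems
print(LocalFileSystem._strip_protocol("file:///tmp/test.csv"))         # /tmp/test.csv
print(HTTPFileSystem._strip_protocol("https://example.com/file.csv"))  # https://example.com/file.csv (unchanged)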
("s3://bucket/file.json", S3FileSystem),
("file:///tmp/test.json", LocalFileSystem),
("/tmp/test.json", LocalFileSystem),
("gcs://bucket/file.json", GCSFileSystem),
],
)
def test_protocol_usage(self, filepath, instance_type):
data_set = JSONDataSet(filepath=filepath)
assert isinstance(data_set._fs, instance_type)
assert str(data_set._filepath) == data_set._fs._strip_protocol(filepath)
assert isinstance(data_set._filepath, PurePosixPath)
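Both tests assert that the internal filepath is a ``PurePosixPath``: pure paths support path manipulation without ever touching a real filesystem, which is what you want for remote object-store keys. A stdlib-only sketch:

from pathlib import PurePosixPath

p = PurePosixPath("bucket/path/to/file.json")
print(p.parent)  # bucket/path/to
print(p.suffix)  # .json
# No filesystem access occurs; PurePosixPath is pure string manipulation.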
load_args: Pandas options for loading CSV files.
    Here you can find all available arguments:
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    All defaults are preserved.
save_args: Pandas options for saving CSV files.
    Here you can find all available arguments:
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
    All defaults are preserved, except for "index", which is set to False.
version: If specified, should be an instance of
    ``kedro.io.core.Version``. If its ``load`` attribute is
    None, the latest version will be loaded. If its ``save``
    attribute is None, the save version will be autogenerated.
"""
_credentials = copy.deepcopy(credentials) or {}
_s3 = S3FileSystem(client_kwargs=_credentials)
path = _s3._strip_protocol(filepath)  # pylint: disable=protected-access
path = PurePosixPath("{}/{}".format(bucket_name, path) if bucket_name else path)
super().__init__(
    path, version, exists_function=_s3.exists, glob_function=_s3.glob,
)
self._credentials = _credentials

# Handle default load and save arguments
self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
if load_args is not None:
    self._load_args.update(load_args)
self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
if save_args is not None:
    self._save_args.update(save_args)

self._s3 = _s3
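A minimal usage sketch for the dataset whose constructor appears above (the import path matches the older Kedro releases that shipped ``CSVS3DataSet``; the bucket and key names are hypothetical):

from kedro.io import CSVS3DataSet

data_set = CSVS3DataSet(
    filepath="s3://my-bucket/data/cars.csv",  # hypothetical bucket/key
    save_args={"index": False},  # matches the documented default
)
df = data_set.load()  # returns a pandas.DataFrame
data_set.save(df)     # writes it back, honoring save_args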
    including bucket and protocol, e.g. `s3://bucket-name/path/to/file.pkl`.
bucket_name: S3 bucket name. Must be specified **only** if not
    present in ``filepath``.
credentials: Credentials to access the S3 bucket, such as
    ``aws_access_key_id`` and ``aws_secret_access_key``.
load_args: Options for loading pickle files. Refer to the
    documentation of ``pickle.loads`` for available options.
save_args: Options for saving pickle files. Refer to the
    documentation of ``pickle.dumps`` for available options.
version: If specified, should be an instance of
    ``kedro.io.core.Version``. If its ``load`` attribute is
    None, the latest version will be loaded. If its ``save``
    attribute is None, the save version will be autogenerated.
"""
_credentials = deepcopy(credentials) or {}
_s3 = S3FileSystem(client_kwargs=_credentials)
path = _s3._strip_protocol(filepath)  # pylint: disable=protected-access
path = PurePosixPath("{}/{}".format(bucket_name, path) if bucket_name else path)
super().__init__(
    path, version, exists_function=_s3.exists, glob_function=_s3.glob,
)
self._credentials = _credentials

# Handle default load and save arguments
self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
if load_args is not None:
    self._load_args.update(load_args)
self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
if save_args is not None:
    self._save_args.update(save_args)

self._s3 = _s3
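Unlike the CSV dataset, pickle handles arbitrary Python objects, not just DataFrames. A hedged usage sketch, with the same caveats as above about the import path and the hypothetical names:

from kedro.io import PickleS3DataSet

data_set = PickleS3DataSet(
    filepath="s3://my-bucket/models/model.pkl",  # hypothetical bucket/key
    credentials={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
)
data_set.save({"weights": [0.1, 0.2]})  # any picklable object works
obj = data_set.load()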
load_args: Additional loading options for `pyarrow`:
    https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
    or `fastparquet`:
    https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile.to_pandas
save_args: Additional saving options for `pyarrow`:
    https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas
    or `fastparquet`:
    https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write
version: If specified, should be an instance of
    ``kedro.io.core.Version``. If its ``load`` attribute is
    None, the latest version will be loaded. If its ``save``
    attribute is None, the save version will be autogenerated.
"""
_credentials = deepcopy(credentials) or {}
_s3 = S3FileSystem(client_kwargs=_credentials)
path = _s3._strip_protocol(filepath)  # pylint: disable=protected-access
path = PurePosixPath("{}/{}".format(bucket_name, path) if bucket_name else path)
super().__init__(
    path, version, exists_function=_s3.exists, glob_function=_s3.glob,
)

default_load_args = {}  # type: Dict[str, Any]
default_save_args = {}  # type: Dict[str, Any]
self._load_args = (
    {**default_load_args, **load_args}
    if load_args is not None
    else default_load_args
)
self._save_args = (
    {**default_save_args, **save_args}
    if save_args is not None
    else default_save_args
)
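Because ``load_args``/``save_args`` are merged over empty defaults and forwarded to the underlying engine, engine-specific options such as pyarrow's ``columns`` pass straight through. A sketch with hypothetical names:

from kedro.io import ParquetS3DataSet

data_set = ParquetS3DataSet(
    filepath="s3://my-bucket/data/trips.parquet",   # hypothetical bucket/key
    load_args={"columns": ["pickup", "dropoff"]},   # forwarded to pyarrow.parquet.read_table
)
df = data_set.load()  # DataFrame with only the requested columns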
def s3fs_open(self, path, mode):
    import os
    from s3fs.core import S3FileSystem

    # Allow overriding the S3 endpoint (e.g. for MinIO or localstack)
    # via the S3_ENDPOINT_URL environment variable.
    endpoint_url = os.environ.get('S3_ENDPOINT_URL')
    client_kwargs = {}
    if endpoint_url:
        client_kwargs = {'endpoint_url': endpoint_url}
    # For reads, wait for the object to appear before opening it.
    if 'r' in mode:
        self.wait_for_path(path)
    s3 = S3FileSystem(anon=False, default_fill_cache=False,
                      client_kwargs=client_kwargs)
    return s3.open(path, mode=mode)
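A hypothetical call site, pointing the helper at a local MinIO endpoint through the same environment variable the method reads (``helper`` and the bucket name are illustrative, not part of the original code):

import os

os.environ['S3_ENDPOINT_URL'] = 'http://localhost:9000'  # e.g. a local MinIO server
with helper.s3fs_open('s3://my-bucket/data.csv', mode='rb') as f:  # 'helper' is hypothetical
    payload = f.read()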