Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_is_uri(self):
    """Test that is_uri() accepts real URIs and rejects non-URIs."""
    self.assertEqual(is_uri('notauri!'), False)
    self.assertEqual(is_uri('they://did/the/monster/mash'), True)
    # raw string: \s, \w, \p are invalid escape sequences and raise
    # SyntaxWarning on Python 3.12+ (the string's value is unchanged)
    self.assertEqual(is_uri(r'C:\some\windows\path'), False)
    # test #1455
    self.assertEqual(is_uri('2016-10-11T06:29:17'), False)
    # sorry, we only care about file URIs
    self.assertEqual(is_uri('mailto:someone@example.com'), False)
    # urlparse has to accept it
    self.assertEqual(is_uri('://'), False)
# NOTE(review): this is a fragment of a Spark-runner integration test --
# the enclosing ``def`` is not visible in this chunk, so the code is
# documented in place only.
# doesn't have a working directory
job = MRSparkOSWalk(['-r', 'spark',
'--spark-master', _LOCAL_CLUSTER_MASTER,
'--files',
# '#ghoti' renames fish_path in the working dir; fowl_path keeps its name
'%s#ghoti,%s' % (fish_path, fowl_path)])
job.sandbox()
file_sizes = {}
with job.make_runner() as runner:
runner.run()
# check working dir mirror
wd_mirror = runner._wd_mirror()
self.assertIsNotNone(wd_mirror)
# the mirror is expected to be a local path, not a URI
self.assertFalse(is_uri(wd_mirror))
self.assertTrue(exists(wd_mirror))
# only files which needed to be renamed should be in wd_mirror
self.assertTrue(exists(join(wd_mirror, 'ghoti')))
self.assertFalse(exists(join(wd_mirror, 'fish')))
self.assertFalse(exists(join(wd_mirror, 'fowl')))
# presumably each output line is a (path, size) tuple -- see safeeval()
for line in to_lines(runner.cat_output()):
path, size = safeeval(line)
file_sizes[path] = size
# check that files were uploaded to working dir
self.assertIn('fowl', file_sizes)
self.assertEqual(file_sizes['fowl'], 5)
self.assertIn('ghoti', file_sizes)
def _fully_qualify_network_uri(uri, project_id):
    """Expand a bare network name into a fully-qualified GCE network URI.

    A name with no ``/`` first becomes a project-relative resource path;
    any non-URI result then gets the compute API endpoint prepended.
    """
    is_bare_name = '/' not in uri

    if is_bare_name:
        uri = 'projects/%s/global/networks/%s' % (project_id, uri)

    if is_uri(uri):
        return uri

    return 'https://www.googleapis.com/compute/v1/' + uri
def test_spark_master_local(self):
    """With a local[*] master, no upload manager should be created and
    the runner's temp space should be a local path."""
    runner = SparkMRJobRunner(spark_master='local[*]')

    # nothing needs uploading when running locally
    self.assertIsNone(runner._upload_mgr)
    # temp dir should be a plain local path, not a URI
    self.assertFalse(is_uri(runner._spark_tmp_dir))
# NOTE(review): fragment of a directory-archiving routine. The first line
# below is the tail of a call (presumably os.path.join(...) assigned to
# tmp_download_path) whose opening is outside this view -- confirm against
# the full file.
self._get_local_tmp_dir(), 'tmp-download')
log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))
with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
for path in self.fs.ls(dir_path):
# fs.ls() only lists files
if path == dir_path:
raise OSError('%s is a file, not a directory!' % dir_path)
# TODO: do we need this?
if os.path.realpath(path) == os.path.realpath(tar_gz_path):
raise OSError(
'attempted to archive %s into itself!' % tar_gz_path)
# remote files must be downloaded before they can be tarred
if is_uri(path):
path_in_tar_gz = path[len(dir_path):].lstrip('/')
log.info('  downloading %s -> %s' % (
path, tmp_download_path))
with open(tmp_download_path, 'wb') as f:
for chunk in self.fs.cat(path):
f.write(chunk)
local_path = tmp_download_path
else:
# local files are added in place, stripping the dir prefix
path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
local_path = path
log.debug('  adding %s to %s' % (path, tar_gz_path))
tar_gz.add(local_path, path_in_tar_gz, recursive=False)
self._dir_archives_created.add(tar_gz_path)
def _dest_in_wd_mirror(self, path, name):
"""Return the URI of where to upload *path* so it can appear in the
working dir as *name*, or ``None`` if it doesn't need to be uploaded.
"""
dest_dir = self._wd_mirror()
if not dest_dir:
return None
# the only reason to re-upload a URI is if it has the wrong name
#
# similarly, the only point of a local working dir mirror is
# to rename things
if (is_uri(path) or not is_uri(dest_dir)) and (
posixpath.basename(path) == name or
not self._wd_filenames_must_match()):
return None
return posixpath.join(dest_dir, name)
def can_handle_path(self, path):
    """Only handle URIs, and only if the hadoop binary isn't disabled."""
    # _hadoop_bin of None means "not configured yet" (still usable);
    # any other falsy value (e.g. []) means hadoop is explicitly disabled
    hadoop_disabled = not (self._hadoop_bin or self._hadoop_bin is None)

    if hadoop_disabled:
        return False

    return is_uri(path)
def _dir_archive_path(self, dir_path):
"""Assign a path for the archive of *dir_path* but don't
actually create anything."""
if dir_path not in self._dir_to_archive_path:
# we can check local paths now
if not (is_uri(dir_path) or os.path.isdir(dir_path)):
raise OSError('%s is not a directory!' % dir_path)
name = name_uniquely(
dir_path, names_taken=self._dir_archive_names_taken)
self._dir_archive_names_taken.add(name)
self._dir_to_archive_path[dir_path] = os.path.join(
self._get_local_tmp_dir(), 'archives', name + '.tar.gz')
return self._dir_to_archive_path[dir_path]