How to use the mrjob.parse.is_uri function in mrjob

To help you get started, we’ve selected a few mrjob examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Yelp / mrjob / tests / test_parse.py View on Github external
def test_is_uri(self):
        """Check which kinds of strings ``is_uri()`` treats as URIs."""
        # plain text with no scheme
        self.assertEqual(is_uri('notauri!'), False)
        # an arbitrary scheme followed by :// is expected to be accepted
        self.assertEqual(is_uri('they://did/the/monster/mash'), True)
        # raw string: \s, \w and \p are not valid escape sequences, and
        # recent Python versions warn about them in ordinary literals
        self.assertEqual(is_uri(r'C:\some\windows\path'), False)
        # test #1455
        self.assertEqual(is_uri('2016-10-11T06:29:17'), False)
        # sorry, we only care about file URIs
        self.assertEqual(is_uri('mailto:someone@example.com'), False)
        # urlparse has to accept it
        self.assertEqual(is_uri('://'), False)
github Yelp / mrjob / tests / spark / test_runner.py View on Github external
# doesn't have a working directory
        job = MRSparkOSWalk(['-r', 'spark',
                             '--spark-master', _LOCAL_CLUSTER_MASTER,
                             '--files',
                             '%s#ghoti,%s' % (fish_path, fowl_path)])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror
            wd_mirror = runner._wd_mirror()
            self.assertIsNotNone(wd_mirror)
            self.assertFalse(is_uri(wd_mirror))

            self.assertTrue(exists(wd_mirror))
            # only files which needed to be renamed should be in wd_mirror
            self.assertTrue(exists(join(wd_mirror, 'ghoti')))
            self.assertFalse(exists(join(wd_mirror, 'fish')))
            self.assertFalse(exists(join(wd_mirror, 'fowl')))

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
github Yelp / mrjob / tests / mock_google / dataproc.py View on Github external
def _fully_qualify_network_uri(uri, project_id):
    """Expand a network name or partial path into a full URI.

    A value containing no ``/`` is treated as a bare network name and is
    placed under ``projects/<project_id>/global/networks/``. If the result
    is still not a URI (according to ``is_uri()``), the Compute API base
    URL is prepended. Anything that is already a URI is returned as-is.
    """
    is_bare_name = '/' not in uri
    if is_bare_name:
        uri = 'projects/%s/global/networks/%s' % (project_id, uri)

    if is_uri(uri):
        return uri

    return 'https://www.googleapis.com/compute/v1/' + uri
github Yelp / mrjob / tests / spark / test_runner.py View on Github external
def test_spark_master_local(self):
        """A runner created with a ``local[*]`` Spark master should have a
        non-URI Spark tmp dir and no upload manager."""
        local_runner = SparkMRJobRunner(spark_master='local[*]')
        spark_tmp_dir = local_runner._spark_tmp_dir

        self.assertFalse(is_uri(spark_tmp_dir))
        self.assertIsNone(local_runner._upload_mgr)
github Yelp / mrjob / mrjob / runner.py View on Github external
self._get_local_tmp_dir(), 'tmp-download')

        log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))

        with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
            for path in self.fs.ls(dir_path):
                # fs.ls() only lists files
                if path == dir_path:
                    raise OSError('%s is a file, not a directory!' % dir_path)

                # TODO: do we need this?
                if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                    raise OSError(
                        'attempted to archive %s into itself!' % tar_gz_path)

                if is_uri(path):
                    path_in_tar_gz = path[len(dir_path):].lstrip('/')

                    log.info('  downloading %s -> %s' % (
                        path, tmp_download_path))
                    with open(tmp_download_path, 'wb') as f:
                        for chunk in self.fs.cat(path):
                            f.write(chunk)
                    local_path = tmp_download_path
                else:
                    path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                    local_path = path

                log.debug('  adding %s to %s' % (path, tar_gz_path))
                tar_gz.add(local_path, path_in_tar_gz, recursive=False)

        self._dir_archives_created.add(tar_gz_path)
github Yelp / mrjob / mrjob / runner.py View on Github external
def _dest_in_wd_mirror(self, path, name):
        """Work out where *path* should be uploaded so that it shows up in
        the working directory under *name*.

        Returns the destination (the working dir mirror joined with *name*),
        or ``None`` when there is no working dir mirror or the file does
        not need to be uploaded.
        """
        mirror_dir = self._wd_mirror()
        if not mirror_dir:
            return None

        # A file that is already a URI only needs re-uploading to fix its
        # name; likewise, a local (non-URI) mirror exists only so that
        # files can be renamed.
        only_needed_for_rename = is_uri(path) or not is_uri(mirror_dir)
        if only_needed_for_rename:
            name_is_acceptable = (
                posixpath.basename(path) == name or
                not self._wd_filenames_must_match())
            if name_is_acceptable:
                return None

        return posixpath.join(mirror_dir, name)
github Yelp / mrjob / mrjob / fs / hadoop.py View on Github external
def can_handle_path(self, path):
        """Return whether this filesystem should handle *path*.

        Always ``False`` when ``self._hadoop_bin`` is falsy but not
        ``None``; otherwise the answer is whatever ``is_uri(path)`` says.
        """
        # NOTE(review): a falsy, non-None value presumably means the hadoop
        # binary was explicitly disabled, while None means "not yet
        # determined" -- confirm against where _hadoop_bin is set
        hadoop_bin_ruled_out = (
            not self._hadoop_bin and self._hadoop_bin is not None)
        if hadoop_bin_ruled_out:
            return False

        return is_uri(path)
github Yelp / mrjob / mrjob / runner.py View on Github external
def _dir_archive_path(self, dir_path):
        """Pick (and remember) the path of the ``.tar.gz`` archive for
        *dir_path*, without creating the archive itself.

        Raises ``OSError`` if *dir_path* is not a URI and is not an
        existing local directory.
        """
        assigned = self._dir_to_archive_path

        # already assigned on an earlier call
        if dir_path in assigned:
            return assigned[dir_path]

        # only local paths can be checked at this point
        if not is_uri(dir_path) and not os.path.isdir(dir_path):
            raise OSError('%s is not a directory!' % dir_path)

        archive_name = name_uniquely(
            dir_path, names_taken=self._dir_archive_names_taken)
        self._dir_archive_names_taken.add(archive_name)

        archive_file = archive_name + '.tar.gz'
        assigned[dir_path] = os.path.join(
            self._get_local_tmp_dir(), 'archives', archive_file)

        return assigned[dir_path]