How to use the mrjob.util.cmd_line function in mrjob

To help you get started, we’ve selected a few mrjob examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Yelp / mrjob / tests / test_util.py View on Github external
archive_command = [arg % variables for arg in archive_template]

        # sometimes the relevant command isn't available or doesn't work;
        # if so, skip the test
        try:
            proc = Popen(archive_command, cwd=join(self.tmp_dir, 'a'),
                         stdout=PIPE, stderr=PIPE)
        except OSError as e:
            if e.errno == 2:
                self.skipTest("No %s command" % archive_command[0])
            else:
                raise
        proc.communicate()  # discard output
        if proc.returncode != 0:
            self.skipTest("Can't run `%s` to create archive." %
                          cmd_line(archive_command))

        # unarchive it into b/
        unarchive(join(self.tmp_dir, archive_name), join(self.tmp_dir, 'b'))

        self.ensure_expected_results(added_files=added_files)
github Yelp / mrjob / tests / test_util.py View on Github external
def test_cmd_line(self):
        """Check ``cmd_line()`` on a simple command and on one needing quotes."""
        # a non-string arg (the int 2) and a lone-space arg in one command
        simple = cmd_line(['cut', '-f', 2, '-d', ' '])
        self.assertEqual(simple, "cut -f 2 -d ' '")

        # two different quoting styles are both accepted for the result
        acceptable = (
            "grep -e \"# DON'T USE\\$\"",
            'grep -e \'# DON\'"\'"\'T USE$\'',
        )
        quoted = cmd_line(['grep', '-e', "# DON'T USE$"])
        self.assertIn(quoted, acceptable)
github Yelp / mrjob / mrjob / fs / hadoop.py View on Github external
def _cat_file(self, path):
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', path]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        for chunk in decompress(cat_proc.stdout, path):
            yield chunk

        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()

        if returncode != 0:
github Yelp / mrjob / mrjob / hadoop.py View on Github external
def _run_job_in_hadoop(self):
        """Run each step returned by ``self._get_steps()``, in order.

        For every step this builds the step's arguments and environment,
        logs them, appends a fresh log-interpretation dict to
        ``self._log_interpretations``, and then launches the step through
        ``_run_spark_submit()`` or ``_run_hadoop()`` depending on the
        step's type.
        """
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_type = step['type']
            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            # register this step's (initially empty) log interpretation
            # before the step is launched
            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # both runners return (returncode, step_interpretation); they
            # differ only in the launcher and the record callback used
            if self._step_type_uses_spark(step_type):
                returncode, step_interpretation = self._run_spark_submit(
                    step_args, env, record_callback=_log_log4j_record)
            else:
                returncode, step_interpretation = self._run_hadoop(
                    step_args, env, record_callback=_log_record_from_hadoop)

            # make sure output_dir is filled (used for history log)
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))
github Yelp / mrjob / mrjob / hadoop.py View on Github external
def _manifest_download_commands(self):
        """Return a one-item list of ``(pattern, command)`` pairs.

        The single pair maps the pattern ``'*://*'`` to the command line
        for the Hadoop binary's ``fs -copyToLocal``.
        """
        # presumably the pattern is a glob matched against URIs -- confirm
        # against the code that consumes this list
        download_args = ['fs', '-copyToLocal']
        copy_command = cmd_line(self.get_hadoop_bin() + download_args)

        return [('*://*', copy_command)]
github Yelp / mrjob / mrjob / bin.py View on Github external
if wrap_python:
            # start with shebang
            sh_bin = self._sh_bin()

            if os.path.isabs(sh_bin[0]):
                shebang_bin = sh_bin
            else:
                shebang_bin = ['/usr/bin/env'] + list(sh_bin)

            if len(shebang_bin) > 2:
                # Linux limits shebang to one binary and one arg
                shebang_bin = shebang_bin[:2]
                log.warning('Limiting shebang to two arguments:'
                            '#!%s' % cmd_line(shebang_bin))

            lines.append('#!%s' % cmd_line(shebang_bin))

        # hook for 'set -e', etc.
        pre_commands = self._sh_pre_commands()
        if pre_commands:
            for cmd in pre_commands:
                lines.append(cmd)
            lines.append('')

        if setup:
            lines.extend(self._setup_cmd_content(setup))

        # handle arguments to the script
        if wrap_python:
            # pretend to be python ($@ is arguments to the python binary)
            python_bin = self._task_python_bin()
            lines.append('%s "$@"' % cmd_line(python_bin))
github Yelp / mrjob / mrjob / fs / ssh.py View on Github external
def _ssh_launch(self, address, cmd_args, stdin=None):
        """Run *cmd_args* on *address* over SSH and return the Popen.

        SSH keys are copied first (if necessary). An ``OSError`` raised
        while starting the process is re-raised as an ``IOError`` carrying
        only the original error's ``strerror``.
        """
        self._ssh_copy_key(address)

        ssh_args = self._ssh_cmd_args(address, cmd_args)
        log.debug('  > ' + cmd_line(ssh_args))

        try:
            proc = Popen(ssh_args, stdout=PIPE, stderr=PIPE, stdin=stdin)
        except OSError as ex:
            raise IOError(ex.strerror)

        return proc
github Yelp / mrjob / mrjob / bin.py View on Github external
# TODO: this is very similar to _start_of_sh_script() in cloud.py

        if wrap_python:
            # start with shebang
            sh_bin = self._sh_bin()

            if os.path.isabs(sh_bin[0]):
                shebang_bin = sh_bin
            else:
                shebang_bin = ['/usr/bin/env'] + list(sh_bin)

            if len(shebang_bin) > 2:
                # Linux limits shebang to one binary and one arg
                shebang_bin = shebang_bin[:2]
                log.warning('Limiting shebang to two arguments:'
                            '#!%s' % cmd_line(shebang_bin))

            lines.append('#!%s' % cmd_line(shebang_bin))

        # hook for 'set -e', etc.
        pre_commands = self._sh_pre_commands()
        if pre_commands:
            for cmd in pre_commands:
                lines.append(cmd)
            lines.append('')

        if setup:
            lines.extend(self._setup_cmd_content(setup))

        # handle arguments to the script
        if wrap_python:
            # pretend to be python ($@ is arguments to the python binary)
github Yelp / mrjob / mrjob / cloud.py View on Github external
def _launch_ssh_proc(self, args):
        """Start the SSH tunnel process and return it.

        Logs the command line at debug level, then creates a
        :py:class:`subprocess.Popen` for *args* with stdin, stdout and
        stderr all piped. You usually don't need to redefine this.
        """
        log.debug('> %s' % cmd_line(args))

        tunnel_proc = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        return tunnel_proc
github Yelp / mrjob / mrjob / local.py View on Github external
that *sort_bin* sees.
    """
    if input_paths:
        env = os.environ.copy()

        # ignore locale when sorting
        env['LC_ALL'] = 'C'

        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        env['TMP'] = tmp_dir
        env['TMPDIR'] = tmp_dir

        with open(output_path, 'wb') as output:
            args = sort_bin + list(input_paths)
            log.debug('> %s' % cmd_line(args))

            try:
                check_call(args, stdout=output, env=env)
                return
            except CalledProcessError:
                log.error(
                    '`%s` failed, falling back to in-memory sort' %
                    cmd_line(sort_bin))
            except OSError:
                log.error(
                    'no sort binary, falling back to in-memory sort')

    _sort_lines_in_memory(input_paths, output_path, sort_values=sort_values)