archive_command = [arg % variables for arg in archive_template]
# sometimes the relevant command isn't available or doesn't work;
# if so, skip the test
try:
proc = Popen(archive_command, cwd=join(self.tmp_dir, 'a'),
stdout=PIPE, stderr=PIPE)
except OSError as e:
if e.errno == 2:  # errno 2 == ENOENT: the archive command doesn't exist
self.skipTest("No %s command" % archive_command[0])
else:
raise
proc.communicate() # discard output
if proc.returncode != 0:
self.skipTest("Can't run `%s` to create archive." %
cmd_line(archive_command))
# unarchive it into b/
unarchive(join(self.tmp_dir, archive_name), join(self.tmp_dir, 'b'))
self.ensure_expected_results(added_files=added_files)
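# Illustrative example (hypothetical values, not from the original test): this
# is what the template/variable substitution at the top of the snippet does.
archive_template = ['tar', 'chf', '%(archive_name)s', '-C', '%(dir)s', '.']
variables = dict(archive_name='files.tar', dir='a')
archive_command = [arg % variables for arg in archive_template]
# archive_command == ['tar', 'chf', 'files.tar', '-C', 'a', '.']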
def test_cmd_line(self):
self.assertEqual(cmd_line(['cut', '-f', 2, '-d', ' ']),
"cut -f 2 -d ' '")
self.assertIn(cmd_line(['grep', '-e', "# DON'T USE$"]),
("grep -e \"# DON'T USE\\$\"",
'grep -e \'# DON\'"\'"\'T USE$\''))
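# A minimal sketch of the cmd_line() helper exercised by the test above,
# assuming it simply shell-quotes each argument and joins with spaces (the
# test accepts either pipes.quote- or shlex.quote-style escaping):
from shlex import quote

def cmd_line_sketch(args):
    """Return a copy-pasteable shell command built from a list of arguments."""
    return ' '.join(quote(str(x)) for x in args)

# cmd_line_sketch(['cut', '-f', 2, '-d', ' ']) == "cut -f 2 -d ' '"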
def _cat_file(self, path):
# stream from HDFS
cat_args = self.get_hadoop_bin() + ['fs', '-cat', path]
log.debug('> %s' % cmd_line(cat_args))
cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)
for chunk in decompress(cat_proc.stdout, path):
yield chunk
# this does sometimes happen; see #1396
for line in cat_proc.stderr:
log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))
cat_proc.stdout.close()
cat_proc.stderr.close()
returncode = cat_proc.wait()
if returncode != 0:
    # assumed error handling: fail when `hadoop fs -cat` exits nonzero
    raise IOError('Could not stream %s' % path)
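# Illustrative sketch (hypothetical; not mrjob's decompress()): the
# decompress() call above is assumed to pick a codec from the path's
# extension and stream decompressed chunks from the file object.
import bz2
import zlib

def _decompress_sketch(readable, path, bufsize=4096):
    """Yield (possibly decompressed) chunks read from *readable*."""
    if path.endswith('.gz'):
        decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)  # accept gzip header
    elif path.endswith('.bz2'):
        decomp = bz2.BZ2Decompressor()
    else:
        decomp = None
    while True:
        chunk = readable.read(bufsize)
        if not chunk:
            break
        yield decomp.decompress(chunk) if decomp else chunk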
def _run_job_in_hadoop(self):
for step_num, step in enumerate(self._get_steps()):
self._warn_about_spark_archives(step)
step_type = step['type']
step_args = self._args_for_step(step_num)
env = _fix_env(self._env_for_step(step_num))
# log this *after* _args_for_step(), which can start a search
# for the Hadoop streaming jar
log.info('Running step %d of %d...' %
(step_num + 1, self._num_steps()))
log.debug('> %s' % cmd_line(step_args))
log.debug(' with environment: %r' % sorted(env.items()))
log_interpretation = {}
self._log_interpretations.append(log_interpretation)
if self._step_type_uses_spark(step_type):
returncode, step_interpretation = self._run_spark_submit(
step_args, env, record_callback=_log_log4j_record)
else:
returncode, step_interpretation = self._run_hadoop(
step_args, env, record_callback=_log_record_from_hadoop)
# make sure output_dir is filled (used for history log)
if 'output_dir' not in step_interpretation:
step_interpretation['output_dir'] = (
self._step_output_uri(step_num))
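# Illustrative sketch (hypothetical; not mrjob's _run_hadoop() or
# _run_spark_submit()): those helpers are assumed to launch the step command
# and feed each stderr line to the record callback, roughly like this:
from subprocess import Popen, PIPE

def _run_step_sketch(step_args, env, record_callback):
    """Run *step_args*, passing each stderr line to *record_callback*;
    return the process's exit code."""
    proc = Popen(step_args, env=env, stderr=PIPE)
    for line in proc.stderr:
        record_callback(line.decode('utf-8', 'replace').rstrip('\r\n'))
    proc.stderr.close()
    return proc.wait()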
def _manifest_download_commands(self):
cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal']
return [
('*://*', cmd_line(cp_to_local)),
]
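# Illustrative consumer (hypothetical; not part of the original code): the
# (URI glob, command) pairs returned above are presumably matched against an
# input URI to pick the download command for it, e.g.:
from fnmatch import fnmatch

def _pick_download_command_sketch(uri, download_commands):
    """Return the first command whose glob matches *uri*, or None."""
    for glob, command in download_commands:
        if fnmatch(uri, glob):
            return command
    return None

# _pick_download_command_sketch('hdfs:///data/input.gz',
#                               [('*://*', 'hadoop fs -copyToLocal')])
# == 'hadoop fs -copyToLocal'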
if wrap_python:
# start with shebang
sh_bin = self._sh_bin()
if os.path.isabs(sh_bin[0]):
shebang_bin = sh_bin
else:
shebang_bin = ['/usr/bin/env'] + list(sh_bin)
if len(shebang_bin) > 2:
# Linux limits shebang to one binary and one arg
shebang_bin = shebang_bin[:2]
log.warning('Limiting shebang to two arguments: '
            '#!%s' % cmd_line(shebang_bin))
lines.append('#!%s' % cmd_line(shebang_bin))
# hook for 'set -e', etc.
pre_commands = self._sh_pre_commands()
if pre_commands:
for cmd in pre_commands:
lines.append(cmd)
lines.append('')
if setup:
lines.extend(self._setup_cmd_content(setup))
# handle arguments to the script
if wrap_python:
# pretend to be python ($@ is arguments to the python binary)
python_bin = self._task_python_bin()
lines.append('%s "$@"' % cmd_line(python_bin))
def _ssh_launch(self, address, cmd_args, stdin=None):
"""Copy SSH keys if necessary, then launch the given command
over SSH and return a Popen."""
self._ssh_copy_key(address)
args = self._ssh_cmd_args(address, cmd_args)
log.debug(' > ' + cmd_line(args))
try:
return Popen(args, stdout=PIPE, stderr=PIPE, stdin=stdin)
except OSError as ex:
raise IOError(ex.strerror)
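# Illustrative sketch (hypothetical; not mrjob's _ssh_cmd_args()): a helper
# like _ssh_cmd_args() presumably wraps the remote command in an ssh
# invocation; ssh_bin and key_file here are assumed parameters.
def _ssh_cmd_args_sketch(address, cmd_args, ssh_bin=('ssh',), key_file=None):
    """Build an ssh command line that runs *cmd_args* on *address*."""
    args = list(ssh_bin)
    if key_file:
        args.extend(['-i', key_file])
    args.append(address)
    args.extend(cmd_args)
    return args

# _ssh_cmd_args_sketch('10.0.0.1', ['cat', '/var/log/hadoop/syslog'])
# == ['ssh', '10.0.0.1', 'cat', '/var/log/hadoop/syslog']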
# TODO: this is very similar to _start_of_sh_script() in cloud.py
if wrap_python:
# start with shebang
sh_bin = self._sh_bin()
if os.path.isabs(sh_bin[0]):
shebang_bin = sh_bin
else:
shebang_bin = ['/usr/bin/env'] + list(sh_bin)
if len(shebang_bin) > 2:
# Linux limits shebang to one binary and one arg
shebang_bin = shebang_bin[:2]
log.warning('Limiting shebang to two arguments: '
            '#!%s' % cmd_line(shebang_bin))
lines.append('#!%s' % cmd_line(shebang_bin))
# hook for 'set -e', etc.
pre_commands = self._sh_pre_commands()
if pre_commands:
for cmd in pre_commands:
lines.append(cmd)
lines.append('')
if setup:
lines.extend(self._setup_cmd_content(setup))
# handle arguments to the script
if wrap_python:
# pretend to be python ($@ is arguments to the python binary)
def _launch_ssh_proc(self, args):
"""The command used to create a :py:class:`subprocess.Popen` to
run the SSH tunnel. You usually don't need to redefine this."""
log.debug('> %s' % cmd_line(args))
return Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
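# Illustrative usage (hypothetical): callers can poll the returned Popen to
# check whether the SSH tunnel process is still alive, e.g.:
#
#   ssh_proc = self._launch_ssh_proc(args)
#   if ssh_proc.poll() is not None:
#       log.warning('SSH tunnel exited with %r' % ssh_proc.returncode)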
that *sort_bin* sees.
"""
if input_paths:
env = os.environ.copy()
# ignore locale when sorting
env['LC_ALL'] = 'C'
# Make sure that the tmp dir environment variables are changed if
# the default is changed.
env['TMP'] = tmp_dir
env['TMPDIR'] = tmp_dir
with open(output_path, 'wb') as output:
args = sort_bin + list(input_paths)
log.debug('> %s' % cmd_line(args))
try:
check_call(args, stdout=output, env=env)
return
except CalledProcessError:
log.error(
'`%s` failed, falling back to in-memory sort' %
cmd_line(sort_bin))
except OSError:
log.error(
'no sort binary, falling back to in-memory sort')
_sort_lines_in_memory(input_paths, output_path, sort_values=sort_values)
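# Illustrative sketch (hypothetical; not mrjob's _sort_lines_in_memory()):
# the in-memory fallback is assumed to read every input line, sort byte-wise
# (matching LC_ALL=C above), and rewrite; the meaning of sort_values here
# (whole-line sort vs. sort on the first tab-delimited field) is an assumption.
def _sort_lines_in_memory_sketch(input_paths, output_path, sort_values=False):
    lines = []
    for path in input_paths:
        with open(path, 'rb') as f:
            lines.extend(f)
    if sort_values:
        lines.sort()
    else:
        lines.sort(key=lambda line: line.split(b'\t', 1)[0])
    with open(output_path, 'wb') as output:
        output.writelines(lines)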