def get_hadoop_version(self):
    """Invoke the hadoop executable to determine its version"""
    # mkdir() needs this
    if not self._hadoop_version:
        stdout = self.invoke_hadoop(['version'], return_stdout=True)
        if stdout:
            first_line = stdout.split(b'\n')[0]
            m = _HADOOP_VERSION_RE.match(first_line)
            if m:
                self._hadoop_version = to_unicode(m.group('version'))
                log.info("Using Hadoop version %s" % self._hadoop_version)
            else:
                raise Exception('Unable to determine Hadoop version.')
    return self._hadoop_version
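For context, here is a minimal standalone sketch of the version-line parsing the method above relies on. _HADOOP_VERSION_RE is not shown in this excerpt, so the pattern below is an assumption, not mrjob's actual regex:

import re

# hypothetical stand-in for _HADOOP_VERSION_RE; the real pattern isn't shown
_HADOOP_VERSION_RE = re.compile(rb'^Hadoop (?P<version>\S+)')

def parse_hadoop_version(stdout):
    """Pull the version out of the first line of `hadoop version` output."""
    first_line = stdout.split(b'\n')[0]
    m = _HADOOP_VERSION_RE.match(first_line)
    if m:
        return m.group('version').decode('ascii')
    raise Exception('Unable to determine Hadoop version.')

# e.g. parse_hadoop_version(b'Hadoop 2.7.3\nSubversion ...') -> '2.7.3'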
spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

env = dict(os.environ)
env.update(self._spark_cmdenv(step_num))

returncode, step_interpretation = self._run_spark_submit(
    spark_submit_args, env, record_callback=_log_log4j_record)

counters = None
if step['type'] == 'streaming':
    counter_file = self.fs.join(
        self._counter_output_dir(step_num), 'part-*')
    counter_json = b''.join(self.fs.cat(counter_file))
    if counter_json.strip():
        # json.loads() on Python 3.4/3.5 can't take bytes
        counters = json.loads(to_unicode(counter_json))

if isinstance(counters, list):
    self._counters.extend(counters)

    # desc_num is 1-indexed user-readable step num
    for desc_num, counter_dict in enumerate(
            counters, start=(step_num + 1)):
        if counter_dict:
            log.info(_format_counters(
                counter_dict,
                desc=('Counters for step %d' % desc_num)))

# for non-streaming steps, there are no counters.
# pad self._counters to match number of steps
while len(self._counters) < (last_step_num or step_num) + 1:
    self._counters.append({})
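_format_counters() is not part of this excerpt. As a rough, standalone approximation of the Hadoop-style text it produces (the exact layout below is an assumption):

def format_counters(counter_dict, desc='Counters'):
    """Render a {group: {counter: amount}} mapping as indented text."""
    lines = ['%s: %d' % (desc, sum(len(c) for c in counter_dict.values()))]
    for group in sorted(counter_dict):
        lines.append('\t%s' % group)
        for counter, amount in sorted(counter_dict[group].items()):
            lines.append('\t\t%s=%d' % (counter, amount))
    return '\n'.join(lines)

# format_counters({'File System Counters': {'FILE: Number of bytes read': 8}},
#                 desc='Counters for step 1')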
        this document is in the category
    doc_id: (hopefully) unique document ID
    doc: the encoded document. We'll fill these fields:
        ngram_counts: map from (n, ngram) to # of times ngram appears
            in the document, using (n, None) to represent the total
            number of times ANY ngram of that size appears (essentially
            number of words)
        in_test_set: boolean indicating if this doc is in the test set
        id: SHA1 hash of doc text (if not already filled)
    """
    # fill *id* and *cats*
    doc = parse_doc_filename(input_uri)

    with open(input_path) as f:
        text = to_unicode(f.read())

    # pick test/training docs
    if self.options.no_test_set:
        doc['in_test_set'] = False
    else:
        doc_hash = hashlib.sha1(text.encode('utf-8')).hexdigest()
        doc['in_test_set'] = bool(int(doc_hash[-1], 16) % 2)

    # map from (n, ngram) to number of times it appears
    ngram_counts = count_ngrams(
        text, self.options.max_ngram_size, self.stop_words)

    # yield the number of times the ngram appears in this doc
    # and the categories for this document, so we can train the classifier
    if not doc['in_test_set']:
        for (n, ngram), count in ngram_counts.items():
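count_ngrams() is also outside this excerpt. A minimal sketch matching what the docstring above describes, counts keyed by (n, ngram) with (n, None) holding the per-size total, might look like this (the tokenization here is deliberately naive):

from collections import defaultdict

def count_ngrams(text, max_ngram_size, stop_words):
    """Return a map from (n, ngram) to count for n = 1..max_ngram_size.

    ngram is a space-separated string; (n, None) holds the total number
    of ngrams of that size (essentially the number of words for n=1).
    """
    words = [w for w in text.lower().split() if w not in stop_words]
    counts = defaultdict(int)
    for n in range(1, max_ngram_size + 1):
        for i in range(len(words) - n + 1):
            counts[(n, ' '.join(words[i:i + n]))] += 1
            counts[(n, None)] += 1  # total ngrams of this size
    return dict(counts)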
def _yield_lines_from_pty_or_pipe(stderr):
    """Yield lines from a PTY or pipe, converting to unicode and gracefully
    handling errno.EIO"""
    try:
        for line in stderr:
            yield to_unicode(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
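A hedged usage sketch (Unix-only): reading a child process's output through a PTY and treating EIO as end-of-stream, the same way the generator above does. Everything except the EIO handling is illustrative scaffolding, not code from this project:

import errno
import os
import pty
import subprocess

master_fd, slave_fd = pty.openpty()
proc = subprocess.Popen(['echo', 'hello from a pty'],
                        stdout=slave_fd, stderr=slave_fd)
os.close(slave_fd)

with os.fdopen(master_fd, 'rb') as master:
    try:
        for line in master:
            print(line.decode('utf-8', 'replace').rstrip('\r\n'))
    except IOError as e:
        # EIO is just the PTY's way of saying the child hung up
        if e.errno != errno.EIO:
            raise

proc.wait()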
def _ssh_run(self, address, cmd_args, stdin=None):
    """Run the given SSH command, and raise an IOError if it fails.
    Return ``(stdout, stderr)``

    Use this for commands with a bounded amount of output.
    """
    p = self._ssh_launch(address, cmd_args, stdin=stdin)

    stdout, stderr = p.communicate()

    if p.returncode != 0:
        raise IOError(to_unicode(stderr))

    return stdout, stderr
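_ssh_launch() is not shown here. A self-contained sketch of the same run-and-check pattern using subprocess directly (the ssh binary and argument layout are assumptions) could look like:

import subprocess

def ssh_run(address, cmd_args, ssh_bin=('ssh',), stdin=None):
    """Run a command over SSH and return (stdout, stderr) as bytes.

    Raises IOError (carrying stderr) if ssh exits nonzero.
    """
    args = list(ssh_bin) + [address] + list(cmd_args)
    p = subprocess.Popen(args,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    stdout, stderr = p.communicate(input=stdin)
    if p.returncode != 0:
        raise IOError(stderr.decode('utf-8', 'replace'))
    return stdout, stderr

# e.g. ssh_run('hadoop@master-node', ['cat', '/etc/hadoop/conf/core-site.xml'])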
    Lines will be converted to unicode, and trailing \r and \n will be
    stripped from lines.

    If set, *pre_filter* will be applied to stripped lines. If it
    returns true, we'll return a fake record with message set to the line,
    num_lines and start_line set as normal, and everything else set to ''.

    Also yields fake records for leading non-log4j lines (trailing non-log4j
    lines are assumed to be part of a multiline message if not pre-filtered).
    """
    last_record = None

    for line_num, line in enumerate(lines):
        # convert from bytes to unicode, if needed, and strip trailing
        # newlines
        line = to_unicode(line).rstrip('\r\n')

        def fake_record():
            return dict(
                caller_location='',
                level='',
                logger='',
                message=line,
                num_lines=1,
                start_line=line_num,
                thread='',
                timestamp='')

        # had to patch this in here to get
        # _parse_hadoop_jar_command_stderr()'s record_callback to fire on
        # the correct line. The problem is that we don't emit records
        # until we see the next line (to handle multiline records), so
        # the callback would fire in the wrong order
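The log4j line pattern itself is not part of this excerpt. As a hedged sketch, matching the common "%d %p %c: %m" layout into the same record fields might look like the following; the regex is an assumption, not the project's actual pattern:

import re

# hypothetical pattern for the "%d %p %c: %m" log4j layout
_LOG4J_LINE_RE = re.compile(
    r'^(?P<timestamp>\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})'
    r' +(?P<level>[A-Z]+)'
    r' +(?P<logger>\S+)'
    r': (?P<message>.*)$')

def parse_log4j_line(line, line_num=0):
    """Parse one log4j line into a record dict, or return None."""
    m = _LOG4J_LINE_RE.match(line)
    if not m:
        return None
    record = m.groupdict()
    record.update(
        caller_location='', num_lines=1, start_line=line_num, thread='')
    return record

# parse_log4j_line('18/04/17 22:06:15 INFO mapreduce.Job: Running job: job_1')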
def _log_line(line):
    log.info(' %s' % to_unicode(line).strip('\r\n'))
log_func = log.debug if proc.returncode == 0 else log.error

if not return_stdout:
    for line in BytesIO(stdout):
        log_func('STDOUT: ' + to_unicode(line.rstrip(b'\r\n')))

# check if STDERR is okay
stderr_is_ok = False
if ok_stderr:
    for stderr_re in ok_stderr:
        if stderr_re.match(stderr):
            stderr_is_ok = True
            break

if not stderr_is_ok:
    for line in BytesIO(stderr):
        log_func('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

ok_returncodes = ok_returncodes or [0]

if not stderr_is_ok and proc.returncode not in ok_returncodes:
    raise CalledProcessError(proc.returncode, args)

if return_stdout:
    return stdout
else:
    return proc.returncode
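Putting the fragment above together, a self-contained sketch of the same invoke-and-tolerate pattern (the name, defaults, and logging omissions here are assumptions, not the original invoke_hadoop() signature):

from subprocess import CalledProcessError, PIPE, Popen

def invoke_cmd(args, ok_returncodes=None, ok_stderr=None, return_stdout=False):
    """Run *args*, tolerating expected failures.

    *ok_stderr* is a list of compiled bytes regexes; if any of them
    matches stderr, a nonzero return code is not treated as an error.
    """
    proc = Popen(args, stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()

    stderr_is_ok = any(r.match(stderr) for r in (ok_stderr or []))
    ok_returncodes = ok_returncodes or [0]

    if not stderr_is_ok and proc.returncode not in ok_returncodes:
        raise CalledProcessError(proc.returncode, args)

    return stdout if return_stdout else proc.returncode

# e.g. tolerate "No such file or directory" from `hadoop fs -ls` on a
# missing path:
#   invoke_cmd(['hadoop', 'fs', '-ls', 'some/missing/path'],
#              ok_stderr=[re.compile(rb'.*No such file or directory')])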