How to use the mrjob.py2.to_unicode function in mrjob

To help you get started, we’ve selected a few mrjob examples, based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github Yelp / mrjob / mrjob / fs / hadoop.py View on Github external
def get_hadoop_version(self):
        """Return the version of the ``hadoop`` binary, invoking it on
        first use and caching the result on ``self._hadoop_version``.

        :raises Exception: if the version can't be parsed from
                           ``hadoop version`` output.
        """
        if self._hadoop_version:
            return self._hadoop_version

        version_stdout = self.invoke_hadoop(['version'], return_stdout=True)
        if version_stdout:
            # the version appears on the first line of output
            head = version_stdout.split(b'\n')[0]
            match = _HADOOP_VERSION_RE.match(head)
            if not match:
                raise Exception('Unable to determine Hadoop version.')
            self._hadoop_version = to_unicode(match.group('version'))
            log.info("Using Hadoop version %s" % self._hadoop_version)

        return self._hadoop_version
github Yelp / mrjob / mrjob / spark / runner.py View on Github external
spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(
                    counters, start=(step_num + 1)):
                if counter_dict:
                    log.info(_format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})
github Yelp / mrjob / mrjob / examples / mr_text_classifier.py View on Github external
this document is in the category

        doc_id: (hopefully) unique document ID
        doc: the encoded document. We'll fill these fields:
            ngram_counts: map from (n, ngram) to  # of times ngram appears
                in the document, using (n, None) to represent the total
                number of times ANY ngram of that size appears (essentially
                number of words)
            in_test_set: boolean indicating if this doc is in the test set
            id: SHA1 hash of doc text (if not already filled)
        """
        # fill *id* and *cats*
        doc = parse_doc_filename(input_uri)

        with open(input_path) as f:
            text = to_unicode(f.read())

        # pick test/training docs
        if self.options.no_test_set:
            doc['in_test_set'] = False
        else:
            doc_hash = hashlib.sha1(text.encode('utf-8')).hexdigest()
            doc['in_test_set'] = bool(int(doc_hash[-1], 16) % 2)

        # map from (n, ngram) to number of times it appears
        ngram_counts = count_ngrams(
            text, self.options.max_ngram_size, self.stop_words)

        # yield the number of times the ngram appears in this doc
        # and the categories for this document, so we can train the classifier
        if not doc['in_test_set']:
            for (n, ngram), count in ngram_counts.items():
github Yelp / mrjob / mrjob / logs / step.py View on Github external
def _yield_lines_from_pty_or_pipe(stderr):
    """Generate unicode lines from *stderr* (a PTY or a pipe).

    A PTY signals end-of-stream by raising ``IOError`` with
    ``errno.EIO``; treat that as a normal EOF instead of an error.
    """
    try:
        for raw_line in stderr:
            yield to_unicode(raw_line)
    except IOError as ex:
        if ex.errno != errno.EIO:
            raise
        # EIO is just the PTY's way of saying goodbye
github Yelp / mrjob / mrjob / fs / ssh.py View on Github external
def _ssh_run(self, address, cmd_args, stdin=None):
        """Run the given SSH command, and raise an IOError if it fails.
        Return ``(stdout, stderr)``

        Use this for commands with a bounded amount of output (we read
        the whole output into memory via ``communicate()``).
        """
        proc = self._ssh_launch(address, cmd_args, stdin=stdin)

        out, err = proc.communicate()

        # any nonzero exit status (including signals) is a failure
        if proc.returncode != 0:
            raise IOError(to_unicode(err))

        return out, err
github Yelp / mrjob / mrjob / logs / log4j.py View on Github external
Lines will be converted to unicode, and trailing \r and \n will be stripped
    from lines.

    If set, *pre_filter* will be applied to stripped lines. If it
    returns true, we'll return a fake record with message set to the line,
    num_lines and start_line set as normal, and everything else set to ''.

    Also yields fake records for leading non-log4j lines (trailing non-log4j
    lines are assumed to be part of a multiline message if not pre-filtered).
    """
    last_record = None

    for line_num, line in enumerate(lines):
        # convert from bytes to unicode, if needed, and strip trailing newlines
        line = to_unicode(line).rstrip('\r\n')

        def fake_record():
            return dict(
                caller_location='',
                level='',
                logger='',
                message=line,
                num_lines=1,
                start_line=line_num,
                thread='',
                timestamp='')

        # had to patch this in here to get _parse_hadoop_jar_command_stderr()'s
        # record_callback to fire on the correct line. The problem is that
        # we don't emit records until we see the next line (to handle
        # multiline records), so the callback would fire in the wrong order
github Yelp / mrjob / mrjob / local.py View on Github external
def _log_line(line):
            # decode the (possibly bytes) line, drop its newline,
            # and echo it to our log, indented two spaces
            stripped = to_unicode(line).strip('\r\n')
            log.info('  %s' % stripped)
github Yelp / mrjob / mrjob / fs / hadoop.py View on Github external
log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in BytesIO(stdout):
                log_func('STDOUT: ' + to_unicode(line.rstrip(b'\r\n')))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in BytesIO(stderr):
                log_func('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode