How to use the mrjob.emr.EMRJobRunner function in mrjob

To help you get started, we’ve selected a few mrjob examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Yelp / mrjob / tests / test_emr.py View on Github external
def test_region_nobucket_nomatchexists(self):
        # aws_region specified, no bucket specified, no buckets have matching
        # region
        self.bucket1.set_location('PUPPYLAND')
        j = EMRJobRunner(aws_region='KITTYLAND',
                         s3_endpoint='KITTYLAND',
                         conf_path=False)
        self.assertNotEqual(j._opts['s3_scratch_uri'], self.bucket1_uri)
github Yelp / mrjob / tests / tools / emr / test_mrboss.py View on Github external
def make_runner(self):
        self.runner = EMRJobRunner(conf_paths=[])
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(cloud_fs_sync_secs=0,
                                   cloud_tmp_dir='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_log_dir_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')
github Yelp / mrjob / tests / test_emr.py View on Github external
def test_explicit_endpoints(self):
        runner = EMRJobRunner(conf_paths=[], aws_region='EU',
                              s3_endpoint='s3-proxy', emr_endpoint='emr-proxy')
        self.assertEqual(runner.make_emr_conn().host, 'emr-proxy')
github Yelp / mrjob / tests / test_emr_pooling.py View on Github external
def make_pooled_cluster(self, **kwargs):
        cluster_id = EMRJobRunner(**kwargs).make_persistent_cluster()

        # simulate that instances are provisioned
        mock_cluster = self.mock_emr_clusters[cluster_id]
        mock_cluster['Status']['State'] = 'WAITING'
        mock_cluster['MasterPublicDnsName'] = 'mockmaster'
        for ig in mock_cluster['_InstanceGroups']:
            ig['RunningInstanceCount'] = ig['RequestedInstanceCount']

        return cluster_id
github Yelp / mrjob / tests / test_emr.py View on Github external
def test_no_waiting_for_job_pool_success(self):
        self.mock_cluster_ids.append('j-fail-lock')
        runner = EMRJobRunner(conf_paths=[], pool_wait_minutes=0)
        cluster_id = runner._find_cluster()

        self.assertEqual(cluster_id, None)
github Yelp / mrjob / tests / test_emr_pooling.py View on Github external
def test_dont_destroy_own_pooled_cluster_on_failure(self):
        # Issue 242: job failure shouldn't kill the pooled clusters
        mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                               '--pool-clusters'])
        mr_job.sandbox()

        self.mock_emr_failures = set([('j-MOCKCLUSTER0', 0)])

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, EMRJobRunner)
            self.prepare_runner_for_ssh(runner)
            self.assertRaises(StepFailedException, runner.run)

            for _ in range(10):
                self.simulate_emr_progress(runner.get_cluster_id())

            cluster = runner._describe_cluster()
            self.assertEqual(cluster['Status']['State'], 'WAITING')

        # job shouldn't get terminated by cleanup
        for _ in range(10):
            self.simulate_emr_progress(runner.get_cluster_id())

        cluster = runner._describe_cluster()
        self.assertEqual(cluster['Status']['State'], 'WAITING')
github Yelp / mrjob / tests / tools / emr / test_mrboss.py View on Github external
def make_runner(self):
        self.runner = EMRJobRunner(conf_paths=[])
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_tmp_dir='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_log_dir_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')
        self.runner._address_of_master = MagicMock(return_value='testmaster')
github Yelp / mrjob / mrjob / tools / emr / s3_tmpwatch.py View on Github external
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete all files older than *time_old* in *path*.

    If *dry_run* is true, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path, key in runner.fs.s3._ls(glob_path):
        age = _boto3_now() - key.last_modified
        if age > time_old:
            # Delete it
            log.info('Deleting %s; is %s old' % (path, age))
            if not dry_run:
                key.delete()
github Yelp / mrjob / mrjob / tools / emr / report_long_jobs.py View on Github external
def main(args=None):
    now = _boto3_now()

    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if not options.exclude:
        filtered_cluster_summaries = cluster_summaries
    else:
        filtered_cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)

    job_info = _find_long_running_jobs(
        emr_client, filtered_cluster_summaries, min_time, now=now)

    _print_report(job_info)
github Yelp / mrjob / mrjob / tools / emr / fetch_logs.py View on Github external
def main(args=None):
    option_parser = make_option_parser()
    try:
        options = parse_args(option_parser, args)
    except OptionError:
        option_parser.error('This tool takes exactly one argument.')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    with EMRJobRunner(**runner_kwargs(options)) as runner:
        perform_actions(options, runner)