How to use the sockeye.utils.check_condition function in sockeye

To help you get started, we’ve selected a few examples of sockeye.utils.check_condition, drawn from the ways it is most often used in public projects.
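For context, check_condition is sockeye's assertion-style guard: it raises SockeyeError with the given message whenever the condition is false, letting callers validate arguments and fail fast with a readable error. The following is a minimal sketch consistent with how the function is used in the examples below, not necessarily the exact implementation in sockeye/utils.py:

class SockeyeError(Exception):
    """Custom error raised by sockeye for user-facing failures."""
    pass


def check_condition(condition: bool, error_message: str):
    """Raise SockeyeError with error_message if the condition does not hold."""
    if not condition:
        raise SockeyeError(error_message)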


github awslabs / sockeye / test / unit / test_utils.py
def test_check_condition_true():
    utils.check_condition(1 == 1, "Nice")
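A matching negative test for the failing path might look like the sketch below; it assumes pytest is available and that check_condition raises utils.SockeyeError (as in the sketch above) when the condition is false:

import pytest

from sockeye import utils


def test_check_condition_false():
    with pytest.raises(utils.SockeyeError) as err:
        utils.check_condition(1 == 2, "Wrong")
    assert "Wrong" == str(err.value)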
github awslabs / sockeye / sockeye_contrib / autopilot / autopilot.py
    arg_parser.add_argument("--custom-test", type=str, nargs=2, metavar=("SRC", "TRG"),
                            help="Custom test data (pairs of source and target).")
    arg_parser.add_argument("--custom-text-type", type=str, choices=CUSTOM_TEXT_TYPES, default=CUSTOM_UTF8_RAW,
                            help="Level of pre-processing already applied to data for custom task: none (raw), tokenization, or byte-pair encoding. Default: %(default)s.")
    arg_parser.add_argument("--custom-lang", type=str, nargs=2, metavar=("SRC", "TRG"),
                            help="Source and target language codes for custom task (en, fr, de, etc.).")
    arg_parser.add_argument("--custom-bpe-op", type=int, default=32000,
                            help="Number of byte-pair encoding operations for custom task. Default: %(default)s.")
    arg_parser.add_argument("--gpus", type=int, metavar="N", default=1,
                            help="Number of GPUs to use. 0 for CPU only. Default: %(default)s.")
    arg_parser.add_argument("--test", action="store_true", default=False,
                            help="Run in test mode (much abbreviated system build).")

    args = arg_parser.parse_args()

    # Listed task or fully specified custom task
    utils.check_condition(args.task or all((args.custom_train, args.custom_dev, args.custom_test)),
            "Please specify --task or all of: --custom-task --custom-train --custom-dev --custom-test")

    # Required args for different custom tasks
    if not args.task:
        if args.custom_text_type == CUSTOM_UTF8_RAW:
            utils.check_condition(args.custom_lang, "Please specify --custom-lang for source and target tokenization")

    # Require explicit request to not train model
    if not args.model:
        raise RuntimeError("Please specify --model.  Use --model %s to run data preparation steps only" % MODEL_NONE)

    run_steps(args)
github awslabs / sockeye / sockeye / training.py
        utils.check_condition(len(metrics) > 0, "At least one metric must be provided.")
        for metric in metrics:
            utils.check_condition(metric in C.METRICS, "Unknown metric to track during training: %s" % metric)

        if 'dist' in self.optimizer_config.kvstore:
            # In distributed training the optimizer runs remotely. For the Eve optimizer, however, we need to pass
            # information about the loss, which is no longer possible by accessing self.module._curr_module._optimizer.
            utils.check_condition(self.optimizer_config.name != C.OPTIMIZER_EVE,
                                  "Eve optimizer not supported with distributed training.")
            utils.check_condition(
                not issubclass(type(self.optimizer_config.lr_scheduler),
                               lr_scheduler.AdaptiveLearningRateScheduler),
                "Adaptive learning rate schedulers not supported with a dist kvstore. "
                "Try a fixed schedule such as %s." % C.LR_SCHEDULER_FIXED_RATE_INV_SQRT_T)
            utils.check_condition(not lr_decay_param_reset, "Parameter reset when the learning rate decays not "
                                                            "supported with distributed training.")
            utils.check_condition(lr_decay_opt_states_reset == C.LR_DECAY_OPT_STATES_RESET_OFF,
                                  "Optimizer state reset when the learning rate decays "
                                  "not supported with distributed training.")

        utils.check_condition(self.optimizer_config.gradient_clipping_type in C.GRADIENT_CLIPPING_TYPES,
                              "Unknown gradient clipping type %s" % self.optimizer_config.gradient_clipping_type)

        utils.check_condition(early_stopping_metric in C.METRICS,
                              "Unsupported early-stopping metric: %s" % early_stopping_metric)
        if early_stopping_metric in C.METRICS_REQUIRING_DECODER:
            utils.check_condition(cp_decoder is not None, "%s requires CheckpointDecoder" % early_stopping_metric)
github awslabs / sockeye / sockeye / train.py
        train_iter, validation_iter, data_config, source_vocabs, target_vocab = data_io.get_prepared_data_iters(
            prepared_data_dir=args.prepared_data,
            validation_sources=validation_sources,
            validation_target=validation_target,
            shared_vocab=shared_vocab,
            batch_size=args.batch_size,
            batch_by_words=batch_by_words,
            batch_num_devices=batch_num_devices)

        check_condition(args.source_factors_combine == C.SOURCE_FACTORS_COMBINE_SUM \
                        or len(source_vocabs) == len(args.source_factors_num_embed) + 1,
                        "Data was prepared with %d source factors, but only provided %d source factor dimensions." % (
                            len(source_vocabs), len(args.source_factors_num_embed) + 1))

        if resume_training:
            # Resuming training: make sure the vocabs in the model and in the prepared data match up.
            model_source_vocabs = vocab.load_source_vocabs(output_folder)
            for i, (v, mv) in enumerate(zip(source_vocabs, model_source_vocabs)):
                utils.check_condition(vocab.are_identical(v, mv),
                                      "Prepared data and resumed model source vocab %d do not match." % i)
            model_target_vocab = vocab.load_target_vocab(output_folder)
            utils.check_condition(vocab.are_identical(target_vocab, model_target_vocab),
                                  "Prepared data and resumed model target vocabs do not match.")

        check_condition(data_config.num_source_factors == len(validation_sources),
                        'Training and validation data must have the same number of factors, but found %d and %d.' % (
                            data_config.num_source_factors, len(validation_sources)))

        return train_iter, validation_iter, data_config, source_vocabs, target_vocab

    else:
        utils.check_condition(args.prepared_data is None and args.source is not None and args.target is not None,
                              either_raw_or_prepared_error_msg)

        if resume_training:
github awslabs / sockeye / sockeye / train.py
    num_words_target = num_words_target if num_words_target > 0 else None

    word_min_count_source, word_min_count_target = args.word_min_count
    batch_num_devices = 1 if args.use_cpu else sum(-di if di < 0 else 1 for di in args.device_ids)
    batch_by_words = args.batch_type == C.BATCH_TYPE_WORD

    validation_sources = [args.validation_source] + args.validation_source_factors
    validation_sources = [str(os.path.abspath(source)) for source in validation_sources]
    validation_target = str(os.path.abspath(args.validation_target))

    either_raw_or_prepared_error_msg = "Either specify a raw training corpus with %s and %s or a preprocessed corpus " \
                                       "with %s." % (C.TRAINING_ARG_SOURCE,
                                                     C.TRAINING_ARG_TARGET,
                                                     C.TRAINING_ARG_PREPARED_DATA)
    if args.prepared_data is not None:
        utils.check_condition(args.source is None and args.target is None, either_raw_or_prepared_error_msg)
        if not resume_training:
            utils.check_condition(args.source_vocab is None and args.target_vocab is None,
                                  "You are using a prepared data folder, which is tied to a vocabulary. "
                                  "To change it you need to rerun data preparation with a different vocabulary.")
        train_iter, validation_iter, data_config, source_vocabs, target_vocab = data_io.get_prepared_data_iters(
            prepared_data_dir=args.prepared_data,
            validation_sources=validation_sources,
            validation_target=validation_target,
            shared_vocab=shared_vocab,
            batch_size=args.batch_size,
            batch_by_words=batch_by_words,
            batch_num_devices=batch_num_devices)

        check_condition(args.source_factors_combine == C.SOURCE_FACTORS_COMBINE_SUM \
                        or len(source_vocabs) == len(args.source_factors_num_embed) + 1,
                        "Data was prepared with %d source factors, but only provided %d source factor dimensions." % (
github awslabs / sockeye / sockeye / image_captioning / train.py
    :param resume_training: Whether to resume training.
    :param output_folder: Output folder.
    :return: The data iterators (train, validation, config_data) as well as the source and target vocabularies.
    """

    _, num_words_target = args.num_words
    num_words_target = num_words_target if num_words_target > 0 else None
    _, word_min_count_target = args.word_min_count
    batch_num_devices = 1 if args.use_cpu else sum(-di if di < 0 else 1 for di in args.device_ids)
    batch_by_words = args.batch_type == C.BATCH_TYPE_WORD

    either_raw_or_prepared_error_msg = "Either specify a raw training corpus with %s or a preprocessed corpus " \
                                       "with %s." % (C.TRAINING_ARG_TARGET,
                                                     C.TRAINING_ARG_PREPARED_DATA)
    # Note: ignore args.prepared_data for the moment
    utils.check_condition(args.prepared_data is None and args.target is not None,
                          either_raw_or_prepared_error_msg)

    if resume_training:
        # Load the existing vocab created when starting the training run.
        target_vocab = vocab.vocab_from_json(os.path.join(output_folder, C.VOCAB_TRG_NAME))

        # Recover the vocabulary path from the existing config file:
        data_info = cast(data_io.DataInfo, Config.load(os.path.join(output_folder, C.DATA_INFO)))
        target_vocab_path = data_info.target_vocab
    else:
        # Load vocab:
        target_vocab_path = args.target_vocab
        # Note: We do not care about the source vocab for images, that is why some inputs are mocked
        target_vocab = vocab.load_or_create_vocab(data=args.target,
                                                  vocab_path=target_vocab_path,
                                                  num_words=num_words_target,
github awslabs / sockeye / sockeye / translate.py
    if input_file is None:
        check_condition(input_factors is None, "Translating from STDIN, not expecting any factor files.")
        for sentence_id, line in enumerate(sys.stdin, 1):
            if input_is_json:
                yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                            json_string=line,
                                                            translator=translator)
            else:
                yield inference.make_input_from_factored_string(sentence_id=sentence_id,
                                                                factored_string=line,
                                                                translator=translator)
    else:
        input_factors = [] if input_factors is None else input_factors
        inputs = [input_file] + input_factors
        if not input_is_json:
            check_condition(translator.num_source_factors == len(inputs),
                            "Model(s) require %d factors, but %d given (through --input and --input-factors)." % (
                                translator.num_source_factors, len(inputs)))
        with ExitStack() as exit_stack:
            streams = [exit_stack.enter_context(data_io.smart_open(i)) for i in inputs]  # pylint: disable=no-member
            for sentence_id, inputs in enumerate(zip(*streams), 1):
                if input_is_json:
                    yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                                json_string=inputs[0],
                                                                translator=translator)
                else:
                    yield inference.make_input_from_multiple_strings(sentence_id=sentence_id, strings=list(inputs))
github awslabs / sockeye / sockeye / data_io.py
    check_condition(len(data_target_average_len) == len(buckets),
                    "Must provide None or average target length for each bucket")
    data_target_average_len = list(data_target_average_len)
    bucket_batch_sizes = []  # type: List[BucketBatchSize]
    largest_total_num_words = 0
    for buck_idx, bucket in enumerate(buckets):
        # Target/label length with padding
        padded_seq_len = bucket[1]
        # Average target/label length excluding padding
        if data_target_average_len[buck_idx] is None:
            data_target_average_len[buck_idx] = padded_seq_len
        average_seq_len = data_target_average_len[buck_idx]

        # Word-based: num words determines num sentences
        # Sentence-based: num sentences determines num words
        if batch_by_words:
            check_condition(padded_seq_len <= batch_size, "Word batch size must cover sequence lengths for all"
                                                          " buckets: (%d > %d)" % (padded_seq_len, batch_size))
            # Multiple of number of devices (int) closest to target number of words, assuming each sentence is of
            # average length
            batch_size_seq = batch_num_devices * max(1, round((batch_size / average_seq_len) / batch_num_devices))
            batch_size_word = batch_size_seq * average_seq_len
        else:
            batch_size_seq = batch_size
            batch_size_word = batch_size_seq * average_seq_len
        bucket_batch_sizes.append(BucketBatchSize(bucket, batch_size_seq, batch_size_word))
        # Track largest number of source or target word samples in a batch
        largest_total_num_words = max(largest_total_num_words, batch_size_seq * max(*bucket))

    # Final step: guarantee that largest bucket by sequence length also has a batch size so that it covers any
    # (batch_size, len_source) and (batch_size, len_target) matrix from the data iterator to allow for memory sharing.
    # When batching by sentences, this will already be the case.
    if batch_by_words:
github awslabs / sockeye / sockeye / score.py
def score(args: argparse.Namespace):
    setup_main_logger(file_logging=False,
                      console=not args.quiet,
                      level=args.loglevel)  # pylint: disable=no-member

    utils.log_basic_info(args)

    with ExitStack() as exit_stack:
        context = utils.determine_context(device_ids=args.device_ids,
                                          use_cpu=args.use_cpu,
                                          disable_device_locking=args.disable_device_locking,
                                          lock_dir=args.lock_dir,
                                          exit_stack=exit_stack)
        if args.batch_type == C.BATCH_TYPE_SENTENCE:
            check_condition(args.batch_size % len(context) == 0, "When using multiple devices the batch size must be "
                                                                 "divisible by the number of devices. Choose a batch "
                                                                 "size that is a multiple of %d." % len(context))
        logger.info("Scoring Device(s): %s", ", ".join(str(c) for c in context))

        # This call has a number of different parameters compared to training which reflect our need to get scores
        # one-for-one and in the same order as the input data.
        # To enable code reuse, we stuff the `args` parameter with some values.
        # Bucketing and permuting need to be turned off in order to preserve the ordering of sentences.
        # Finally, 'resume_training' needs to be set to True because it causes the model to be loaded instead of initialized.
        args.no_bucketing = True
        args.bucket_width = 10
        score_iter, source_vocabs, target_vocab, model_config = get_data_iters_and_vocabs(
            args=args,
            model_folder=args.model)

        scoring_model = scoring.ScoringModel(config=model_config,