How to use the fairseq.distributed_utils.is_master function in fairseq

To help you get started, we’ve selected a few examples that show how fairseq.distributed_utils.is_master is used in popular public projects.
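
distributed_utils.is_master(args) reports whether the current process is the distributed master. In the older, argparse-based fairseq API it simply checks args.distributed_rank == 0, so the master is the rank-0 process. A minimal sketch of that behaviour, assuming that legacy API:

from argparse import Namespace

from fairseq import distributed_utils

# Rank 0 is treated as the master process: the only rank that should write
# checkpoints, verify the save directory, or print logs.
args = Namespace(distributed_rank=0)        # assumed legacy argparse-style args
print(distributed_utils.is_master(args))    # True

args.distributed_rank = 3                   # any non-zero rank is a worker
print(distributed_utils.is_master(args))    # False

The examples below use the same call to gate master-only work such as verifying the checkpoint directory and saving checkpoints.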


github ecchochan / roberta-squad / fairseq_train_cn.py
# Imports needed by this excerpt (as in fairseq's legacy train.py):
import numpy as np
import torch

from fairseq import checkpoint_utils, distributed_utils, tasks, utils


def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
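        # Only the master (rank-0) process checks that the checkpoint
        # directory is writable.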
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)

The same initialization code, including the distributed_utils.is_master guard around verify_checkpoint_directory, appears verbatim in fairseq_train_mnli_cn.py, fairseq_train_mnli.py, and fairseq_train2.py in the same repository.

github KelleyYin / Cross-lingual-Summarization / Teaching-Attention / train.py
def save_checkpoint(args, trainer, epoch_itr, val_loss, val_state):

    if args.no_save or not distributed_utils.is_master(args):
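        # Checkpoints are written by the master (rank-0) process only; every
        # other rank, or a run with --no-save, returns immediately.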
        return
    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    checkpoint_conds = collections.OrderedDict()

    checkpoint_conds['checkpoint_acc_{}_loss_{:.2f}_ppl_{}_e{}.pt'.format(val_state["acc"], val_state["valid_nll_loss"],
                                                                          val_state["valid_ppl"], epoch)] = \
        (end_of_epoch and not args.no_epoch_checkpoints and
         epoch % args.save_interval == 0
    )
    checkpoint_conds['checkpoint_{}_{}.pt'.format(epoch, updates)] = (
            not end_of_epoch and args.save_interval_updates > 0 and
            updates % args.save_interval_updates == 0
    )
github pytorch / translate / pytorch_translate / research / adversarial / adv_train.py
    if do_validate:
        val_loss, val_ppl, stop_due_to_val_loss = validate(
            args=args,
            trainer=trainer,
            task=task,
            subset=args.valid_subset,
            extra_state=extra_state,
        )
    extra_state["val_loss"] = val_loss
    extra_state["val_ppl"] = val_ppl

    lr = trainer.optimizer.get_lr()
    val_bleu = None
    stop_due_to_val_bleu = False
    translation_samples = None
    if do_save and distributed_utils.is_master(args):
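        # Only the master rank saves checkpoints and runs BLEU evaluation.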
        # save checkpoint
        save_checkpoint(trainer=trainer, args=args, extra_state=extra_state)
        if do_eval_bleu:
            (
                val_bleu,
                stop_due_to_val_bleu,
                translation_samples,
                decay_lr,
            ) = evaluate_bleu(args=args, task=task, extra_state=extra_state)
            if decay_lr:
                current_lr = lr
                trainer.optimizer.set_lr(lr * args.lr_shrink)
                lr = trainer.optimizer.get_lr()
                print(f"Decay lr from {current_lr} to {lr}.")

    return (
github freewym / espresso / fairseq / checkpoint_utils.py
def save_checkpoint(args, trainer, epoch_itr, val_loss):
    from fairseq import distributed_utils, meters

    prev_best = getattr(save_checkpoint, "best", val_loss)
    if val_loss is not None:
        best_function = max if args.maximize_best_checkpoint_metric else min
        save_checkpoint.best = best_function(val_loss, prev_best)

    if args.no_save or not distributed_utils.is_master(args):
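        # Only the master process writes checkpoints; other ranks return early.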
        return

    def is_better(a, b):
        return a >= b if args.maximize_best_checkpoint_metric else a <= b

    write_timer = meters.StopwatchMeter()
    write_timer.start()

    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds["checkpoint{}.pt".format(epoch)] = (
        end_of_epoch
        and not args.no_epoch_checkpoints
github pytorch / translate / pytorch_translate / train.py
    training_progress = extra_state["training_progress"]
    extra_state["training_progress"] = (
        ["...truncated...", training_progress[-1]] if len(training_progress) > 0 else []
    )
    print(f"| extra_state: {extra_state}")
    extra_state["training_progress"] = training_progress

    epoch = extra_state["epoch"]
    if extra_state["batch_offset"] == 0:
        epoch -= 1  # this will be incremented when we call epoch_itr.next_epoch_itr()
    epoch_itr.load_state_dict(
        {"epoch": epoch, "iterations_in_epoch": extra_state["batch_offset"]}
    )

    checkpoint_manager = None
    if distributed_utils.is_master(args):
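        # The CheckpointManager is created only on the master rank; worker
        # ranks keep checkpoint_manager as None.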
        checkpoint_manager = checkpoint.CheckpointManager(
            num_avg_checkpoints=args.num_avg_checkpoints,
            auto_clear_checkpoints=args.auto_clear_checkpoints,
            log_verbose=args.log_verbose,
            checkpoint_files=extra_state["checkpoint_files"],
        )

    return extra_state, epoch_itr, checkpoint_manager