import numpy as np
import torch

from fairseq import checkpoint_utils, distributed_utils, tasks, utils


def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
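

# For context, a minimal driver for main() in the style of fairseq's own
# train.py entry point. options.get_training_parser() and
# options.parse_args_and_arch() are real fairseq helpers; wiring them up this
# way is a sketch, not necessarily the exact upstream script.
from fairseq import options


def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    main(args)


if __name__ == '__main__':
    cli_main()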
import collections
import os
import shutil

from fairseq import distributed_utils


# A save_checkpoint variant that encodes validation accuracy, loss, and
# perplexity into the checkpoint filename; val_state is expected to carry
# the "acc", "valid_nll_loss", and "valid_ppl" keys used below.
def save_checkpoint(args, trainer, epoch_itr, val_loss, val_state):
    if args.no_save or not distributed_utils.is_master(args):
        return

    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds[
        'checkpoint_acc_{}_loss_{:.2f}_ppl_{}_e{}.pt'.format(
            val_state["acc"], val_state["valid_nll_loss"], val_state["valid_ppl"], epoch
        )
    ] = (
        end_of_epoch
        and not args.no_epoch_checkpoints
        and epoch % args.save_interval == 0
    )
    checkpoint_conds['checkpoint_{}_{}.pt'.format(epoch, updates)] = (
        not end_of_epoch
        and args.save_interval_updates > 0
        and updates % args.save_interval_updates == 0
    )
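    # Hedged completion: the original snippet ends once the conditions are
    # built. Following the upstream fairseq pattern, write the first
    # checkpoint whose condition holds and copy it to the other matching
    # names; the extra_state payload passed here is an assumption.
    checkpoints = [
        os.path.join(args.save_dir, fn)
        for fn, cond in checkpoint_conds.items() if cond
    ]
    if len(checkpoints) > 0:
        trainer.save_checkpoint(
            checkpoints[0], {"val_loss": val_loss, "val_state": val_state}
        )
        for cp in checkpoints[1:]:
            shutil.copyfile(checkpoints[0], cp)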


# The next fragment couples validation, checkpoint saving, and BLEU
# evaluation, with learning-rate decay when evaluate_bleu() asks for it.
# It began mid-function, so the signature and the final return statement are
# reconstructed from how the names are used; validate() and evaluate_bleu()
# are helpers from the surrounding codebase, and the save_checkpoint() called
# here takes (trainer, args, extra_state), i.e. it is not the variant above.
def validate_save_and_evaluate_bleu(
    args, trainer, task, extra_state, do_validate, do_save, do_eval_bleu
):
    val_loss = None
    val_ppl = None
    stop_due_to_val_loss = False
    if do_validate:
        val_loss, val_ppl, stop_due_to_val_loss = validate(
            args=args,
            trainer=trainer,
            task=task,
            subset=args.valid_subset,
            extra_state=extra_state,
        )
    extra_state["val_loss"] = val_loss
    extra_state["val_ppl"] = val_ppl

    lr = trainer.optimizer.get_lr()
    val_bleu = None
    stop_due_to_val_bleu = False
    translation_samples = None
    if do_save and distributed_utils.is_master(args):
        # Save checkpoint
        save_checkpoint(trainer=trainer, args=args, extra_state=extra_state)
        if do_eval_bleu:
            (
                val_bleu,
                stop_due_to_val_bleu,
                translation_samples,
                decay_lr,
            ) = evaluate_bleu(args=args, task=task, extra_state=extra_state)
            if decay_lr:
                current_lr = lr
                trainer.optimizer.set_lr(lr * args.lr_shrink)
                lr = trainer.optimizer.get_lr()
                print(f"Decay lr from {current_lr} to {lr}.")
    return (
        extra_state,
        stop_due_to_val_loss,
        stop_due_to_val_bleu,
        translation_samples,
    )
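

# Hedged usage sketch for the function above. The do_* flags would normally
# be derived from save/validate intervals inside the training loop; the
# variable names here are illustrative only.
extra_state, stop_val, stop_bleu, translation_samples = validate_save_and_evaluate_bleu(
    args=args,
    trainer=trainer,
    task=task,
    extra_state=extra_state,
    do_validate=True,
    do_save=True,
    do_eval_bleu=True,
)
if stop_val or stop_bleu:
    print("| validation metric stopped improving; stopping early")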


import collections
import os
import shutil


# This save_checkpoint matches upstream fairseq's implementation: it tracks
# the best validation loss across calls via an attribute on the function
# itself, and names per-epoch checkpoints "checkpoint<epoch>.pt".
def save_checkpoint(args, trainer, epoch_itr, val_loss):
    from fairseq import distributed_utils, meters

    prev_best = getattr(save_checkpoint, "best", val_loss)
    if val_loss is not None:
        best_function = max if args.maximize_best_checkpoint_metric else min
        save_checkpoint.best = best_function(val_loss, prev_best)

    if args.no_save or not distributed_utils.is_master(args):
        return

    def is_better(a, b):
        return a >= b if args.maximize_best_checkpoint_metric else a <= b

    write_timer = meters.StopwatchMeter()
    write_timer.start()

    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds["checkpoint{}.pt".format(epoch)] = (
        end_of_epoch
        and not args.no_epoch_checkpoints
        and epoch % args.save_interval == 0
    )
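    # Hedged completion of the truncated tail, following upstream fairseq:
    # add update-based, best, and last checkpoint conditions, then write the
    # first matching file and copy it to the other matching names.
    checkpoint_conds["checkpoint_{}_{}.pt".format(epoch, updates)] = (
        not end_of_epoch
        and args.save_interval_updates > 0
        and updates % args.save_interval_updates == 0
    )
    checkpoint_conds["checkpoint_best.pt"] = val_loss is not None and (
        not hasattr(save_checkpoint, "best")
        or is_better(val_loss, save_checkpoint.best)
    )
    checkpoint_conds["checkpoint_last.pt"] = not args.no_last_checkpoints

    extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss}
    if hasattr(save_checkpoint, "best"):
        extra_state.update({"best": save_checkpoint.best})

    checkpoints = [
        os.path.join(args.save_dir, fn)
        for fn, cond in checkpoint_conds.items() if cond
    ]
    if len(checkpoints) > 0:
        trainer.save_checkpoint(checkpoints[0], extra_state)
        for cp in checkpoints[1:]:
            shutil.copyfile(checkpoints[0], cp)
        write_timer.stop()
        print(
            "| saved checkpoint {} (epoch {} @ {} updates) (writing took {} seconds)".format(
                checkpoints[0], epoch, updates, write_timer.sum
            )
        )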


# The final fragment restores epoch/iterator state from a loaded checkpoint
# and builds a CheckpointManager on the master worker. It also began
# mid-function: the name and signature below are reconstructed from how the
# variables are used, and `checkpoint` is assumed to be the module that
# provides CheckpointManager.
def setup_training_state(args, epoch_itr, extra_state):
    # Print a truncated view of training_progress (it can be very long),
    # then restore the full list.
    training_progress = extra_state["training_progress"]
    extra_state["training_progress"] = (
        ["...truncated...", training_progress[-1]] if len(training_progress) > 0 else []
    )
    print(f"| extra_state: {extra_state}")
    extra_state["training_progress"] = training_progress

    epoch = extra_state["epoch"]
    if extra_state["batch_offset"] == 0:
        epoch -= 1  # this will be incremented when we call epoch_itr.next_epoch_itr()
    epoch_itr.load_state_dict(
        {"epoch": epoch, "iterations_in_epoch": extra_state["batch_offset"]}
    )

    checkpoint_manager = None
    if distributed_utils.is_master(args):
        checkpoint_manager = checkpoint.CheckpointManager(
            num_avg_checkpoints=args.num_avg_checkpoints,
            auto_clear_checkpoints=args.auto_clear_checkpoints,
            log_verbose=args.log_verbose,
            checkpoint_files=extra_state["checkpoint_files"],
        )
    return extra_state, epoch_itr, checkpoint_manager
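

# Hedged usage sketch: resuming training after a checkpoint load has
# populated extra_state. epoch_itr.next_epoch_itr() is the standard fairseq
# iterator call; everything else here is illustrative.
extra_state, epoch_itr, checkpoint_manager = setup_training_state(
    args=args, epoch_itr=epoch_itr, extra_state=extra_state
)
itr = epoch_itr.next_epoch_itr()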