restore_fns = _get_init_fn(self, model, self.dump_dir, self.ctx)
if restore_fns is not None:
    for restore_fn in restore_fns:
        restore_fn(self.sess)
# Restore from custom auxiliary init funcs
for func in self._aux_init_funcs:
    func(self.sess)
# Restore from auxiliary checkpoints
for auxilary_scope, auxilary_checkpoint in self.auxilary_checkpoints.items():
    self.restore_scopy_from(model, auxilary_scope, auxilary_checkpoint)
# Horovod: broadcast global variables from rank 0
if self.is_distribute_training:
    bgv = hvd.BroadcastGlobalVariablesHook(0)
    bgv.begin()
    bgv.after_create_session(self.sess, self.coord)

else:
    training_logging_hook = hooks.TrainingLoggingHook(
        log_file_path=os.path.join(self.run_hparams.log_dir, "training.json"),
        global_batch_size=global_batch_size,
        num_steps=num_steps,
        num_samples=num_samples,
        num_epochs=num_epochs,
        log_every=log_every_n_steps
    )
    training_hooks.append(training_logging_hook)

if hvd_utils.is_using_hvd():
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    training_hooks.append(bcast_hook)

training_hooks.append(hooks.PrefillStagingAreasHook())

estimator_params = {
    'batch_size': batch_size,
    'steps_per_epoch': steps_per_epoch,
    'num_gpus': num_gpus,
    'momentum': momentum,
    'learning_rate_init': learning_rate_init,
    'weight_decay': weight_decay,
    'loss_scale': loss_scale,
    'apply_loss_scaling': use_static_loss_scaling
}

elif train_step < 60000:
    self._lrn_rate = 0.01
elif train_step < 80000:
    self._lrn_rate = 0.001
else:
    self._lrn_rate = 0.0001

if FLAGS.job_name is None:
    # Serial version
    checkpoint_dir = FLAGS.log_root if hvd.rank() == 0 else None
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=checkpoint_dir,
            save_checkpoint_secs=60,
            hooks=[hvd.BroadcastGlobalVariablesHook(0), logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            # Since we provide a SummarySaverHook, we need to disable the default
            # SummarySaverHook. To do that we set save_summaries_steps to 0.
            save_summaries_steps=0,
            config=create_config_proto()) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
else:
    is_chief = (FLAGS.task_index == 0)
    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            checkpoint_dir=FLAGS.log_root,
            save_checkpoint_secs=60,
            hooks=[logging_hook, _LearningRateSetterHook()],

with log.verbose_level(2):
    m = get_model("resnet", config, **kwargs)
    global_step = tf.get_variable(
        "global_step", [],
        initializer=tf.constant_initializer(0),
        trainable=False,
        dtype=tf.int64)
    lr = tf.train.piecewise_constant(
        global_step, config.learn_rate_decay_steps,
        [config.learn_rate] + list(config.learn_rate_list))
    m._lr = lr
    m._global_step = global_step
    opt = tf.train.MomentumOptimizer(lr, 0.9)
    opt = hvd.DistributedOptimizer(opt)
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    m._train_op = opt.minimize(m.cost, global_step=global_step, name="train_step")
    tf.summary.scalar("train ce", m.cross_ent)
return m, hooks

def eval_input_fn():
    dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
    return dataset.shuffle(1000).batch(128)

estimator = tf.compat.v1.estimator.LinearClassifier(
    feature_columns=winequality.get_feature_columns(),
    model_dir=f"{HDFS_DIR}",
    n_classes=winequality.get_n_classes(),
    optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

return Experiment(
    estimator,
    tf.estimator.TrainSpec(
        train_input_fn,
        max_steps=10,
        hooks=[hvd.BroadcastGlobalVariablesHook(0)]
    ),
    tf.estimator.EvalSpec(
        eval_input_fn,
        steps=10,
        start_delay_secs=0,
        throttle_secs=30
    )
)
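
# A hedged sketch, not part of the original snippet: train_input_fn is referenced in the
# TrainSpec above but not shown. It would presumably mirror eval_input_fn, using the
# training split plus repeat() so the spec can run for max_steps; WINE_EQUALITY_FILE and
# the winequality helper module are assumed from the surrounding example.
def train_input_fn():
    dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
    return dataset.shuffle(1000).batch(128).repeat()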

def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
    logger = logging.getLogger(__name__)
    if is_distributed:
        exps_hook = ExamplesPerSecondHook(batch_size * hvd.size())
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.rank(), hvd.size()))
        return [bcast_hook, exps_hook]
    else:
        exps_hook = ExamplesPerSecondHook(batch_size)
        return [exps_hook]
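
# Illustrative usage only: the hooks returned by _get_hooks would typically be passed to
# an Estimator's train() call. The estimator and train_input_fn names below are
# assumptions for this sketch, not identifiers from the snippet above.
training_hooks = _get_hooks(batch_size=64, is_distributed=True)
estimator.train(input_fn=train_input_fn, steps=1000, hooks=training_hooks)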

rfig = tf.estimator.RunConfig(save_checkpoints_steps=10000, session_config=config)
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
model_dir = os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else None
params = {"learning_rate": LR}
# rfig = tf.estimator.RunConfig(save_checkpoints_steps=1000)
logger.info('Creating estimator with params: {}'.format(params))
model = tf.estimator.Estimator(model_fn=model_fn,
                               params=params,
                               model_dir=model_dir,
                               config=rfig)
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
logger.info('{} {}'.format(hvd.local_rank(), hvd.size()))
for epoch in range(EPOCHS):
    logger.info('Running epoch {}...'.format(epoch))
    model.train(input_fn=train_input_fn, steps=10000, hooks=[bcast_hook])  # data_length//batch_size
    logger.info('Validation...')
    model.evaluate(input_fn=validation_input_fn, steps=10)  # validation_length//batch_size

custom_params = {
    'data_format': FLAGS.data_format,
    'batch_size': FLAGS.batch_size,
    'fp16': FLAGS.fp16,
    'efficient': FLAGS.efficient
}
features, labels = input_fn(True, FLAGS.data_dir, FLAGS.batch_size, None)
with tf.variable_scope('model', custom_getter=float32_variable_storage_getter):
    train_op, loss, global_step = cifar10_model_fn(features, labels, custom_params)

# BroadcastGlobalVariablesHook broadcasts initial variable states from rank 0
# to all other processes. This is necessary to ensure consistent initialization
# of all workers when training is started with random weights or restored
# from a checkpoint.
hooks = [hvd.BroadcastGlobalVariablesHook(0),
         tf.train.StopAtStepHook(last_step=10000),
         tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                    every_n_iter=10)]

# Pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Save checkpoints only on worker 0 to prevent other workers from corrupting them.
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

# The MonitoredTrainingSession takes care of session initialization,
# restoring from a checkpoint, saving to a checkpoint, and closing when done
# or an error occurs.
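# A minimal sketch of how the pieces above are usually wired together (the original
# excerpt stops at the comment, so this continuation is an assumption based on the
# standard Horovod pattern): the broadcast/stop/logging hooks, the worker-0-only
# checkpoint_dir, and the GPU-pinned config all go into MonitoredTrainingSession,
# which then runs train_op until StopAtStepHook fires.
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                       hooks=hooks,
                                       config=config) as mon_sess:
    while not mon_sess.should_stop():
        mon_sess.run(train_op)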

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
master_process = True
training_hooks = []
global_batch_size = FLAGS.train_batch_size
hvd_rank = 0

config = tf.ConfigProto()
if FLAGS.horovod:
    global_batch_size = FLAGS.train_batch_size * hvd.size()
    master_process = (hvd.rank() == 0)
    hvd_rank = hvd.rank()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
if FLAGS.use_xla:
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

run_config = tf.estimator.RunConfig(
    model_dir=FLAGS.output_dir if master_process else None,
    session_config=config,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
    keep_checkpoint_max=1)

if master_process:
    tf.logging.info("***** Configuration *****")
    for key in FLAGS.__flags.keys():
        tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
    tf.logging.info("**************************")

train_examples = None
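
# Summary sketch, not code from any single snippet above: every Horovod/TF1 example on
# this page follows the same skeleton. hvd.init() runs first, one GPU is pinned per
# local rank, the optimizer is wrapped in hvd.DistributedOptimizer, and
# BroadcastGlobalVariablesHook(0) syncs initial weights from rank 0. The AdamOptimizer
# and learning-rate scaling here are placeholders.
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())
opt = hvd.DistributedOptimizer(tf.train.AdamOptimizer(0.001 * hvd.size()))
hooks = [hvd.BroadcastGlobalVariablesHook(0)]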