def main(_):
    # Horovod: initialize the library before any other Horovod call.
    hvd.init()
    # Give each non-master rank its own output subdirectory so workers
    # do not clobber the master's checkpoints.
    FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.output_dir, str(hvd.rank()))
    # Each worker takes a share of the steps, so divide by the world size.
    FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
    FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))
    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)
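# A minimal sketch (not from the source) of the usual companion change to
# dividing the step counts above: scale the learning rate by hvd.size(),
# since the effective batch grows with the number of workers, and wrap the
# optimizer so gradients are averaged across ranks. The base rate 2e-5 is
# a hypothetical value; tf and hvd are imported as in the snippet above.
optimizer = tf.train.AdamOptimizer(learning_rate=2e-5 * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer)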
# (Reconstructed context: the source excerpt starts inside an if/else that
# chooses between epoch-based and step-based iteration counts; the
# condition below is inferred, not part of the excerpt.)
if num_iter_unit == 'epoch':
    nstep = num_training_samples * num_iter // global_batch_size
    decay_steps = nstep
else:
    nstep = num_iter
    num_epochs = max(nstep * global_batch_size // num_training_samples, 1)
    decay_steps = 90 * num_training_samples // global_batch_size
nstep_per_epoch = num_training_samples // global_batch_size
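# Worked example of the step arithmetic above, with hypothetical values:
num_training_samples = 1281167        # e.g. the ImageNet train split
global_batch_size = 256 * 8           # per-GPU batch of 256 on 8 GPUs
num_iter = 90                         # 90 epochs
nstep = num_training_samples * num_iter // global_batch_size   # -> 56301
nstep_per_epoch = num_training_samples // global_batch_size    # -> 625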
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
config.gpu_options.force_gpu_compatible = True  # Force pinned memory
config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)
classifier = tf.estimator.Estimator(
    model_fn=_cnn_model_function,
    model_dir=log_dir,
    params={
        'model': infer_func,
        'format': image_format,
        'dtype': tf.float16 if precision == 'fp16' else tf.float32,
        'momentum': momentum,
        'learning_rate_init': learning_rate_init,
        'learning_rate_power': learning_rate_power,
        'decay_steps': decay_steps,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'larc_eta': larc_eta,
        'larc_mode': larc_mode,
        # (further params elided in the source excerpt)
    })
# Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
# from rank 0 to all other processes. This ensures consistent
# initialization of all workers when training is started with random weights or
# restored from a checkpoint.
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
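# For comparison, a sketch of the Keras analogue (not part of this
# snippet): horovod.tensorflow.keras ships a callback with the same
# rank-0 broadcast semantics.
import horovod.tensorflow.keras as hvd_keras

callbacks = [hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0)]
# model.fit(..., callbacks=callbacks)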
# Train the model
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": train_data},
    y=train_labels,
    batch_size=100,
    num_epochs=None,
    shuffle=True)
# Horovod: adjust number of steps based on number of GPUs.
mnist_classifier.train(
    input_fn=train_input_fn,
    steps=3000 // hvd.size(),
    hooks=[logging_hook, bcast_hook])
# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": eval_data},
    y=eval_labels,
    num_epochs=1,
    shuffle=False)
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)
# Polyaxon
if hvd.rank() == 0:
    experiment.log_metrics(**eval_results)
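# Sketch (assumed, not from the source): export the final SavedModel from
# the master process only, so workers do not race on the export directory;
# export_dir and serving_input_receiver_fn are hypothetical.
if hvd.rank() == 0:
    mnist_classifier.export_savedmodel(export_dir, serving_input_receiver_fn)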
eval_idx_filenames = None
if data_idx_dir is not None:
    filename_pattern = os.path.join(data_idx_dir, '%s-*')
    eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
else:
    raise ValueError("data_idx_dir must be specified")
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
config.gpu_options.force_gpu_compatible = True  # Force pinned memory
config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
# Clamp as in the training snippet above; the unclamped expression would
# go to zero or negative once hvd.size() exceeds 13.
config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)
classifier_eval = tf.estimator.Estimator(
    model_fn=_cnn_model_function,
    model_dir=log_dir,
    params={
        'model': infer_func,
        'format': image_format,
        'dtype': tf.float16 if precision == 'fp16' else tf.float32,
        'momentum': momentum,
        'learning_rate_init': learning_rate_init,
        'learning_rate_power': learning_rate_power,
        'decay_steps': None,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'larc_eta': larc_eta,
        'larc_mode': larc_mode,
        # (further params elided in the source excerpt)
    })
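# Sketch (assumed): run evaluation from rank 0 only, reusing the
# checkpoints written under log_dir; eval_input_fn is assumed to be built
# the same way as the other input functions on this page.
if hvd.rank() == 0:
    metrics = classifier_eval.evaluate(input_fn=eval_input_fn)
    print(metrics)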
def test_horovod_allgather_grad_cpu(self):
    """Test the correctness of the allgather gradient on CPU."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = self.tfe.Variable(
                    tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank)
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
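# Self-contained graph-mode sketch (assumed, not the verbatim continuation
# of the test above): Horovod registers a gradient for allgather, so
# tf.gradients can differentiate through the collective. Launch with
# horovodrun to exercise more than one rank.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
tensor = tf.ones([3, 17]) * hvd.rank()
gathered = hvd.allgather(tensor)          # shape [3 * hvd.size(), 17]
grad = tf.gradients(tf.reduce_sum(gathered), tensor)[0]
with tf.Session() as sess:
    print(sess.run(grad))                 # each rank receives its slice of
                                          # the allreduced upstream gradient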
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
master_process = True
training_hooks = []
global_batch_size = FLAGS.train_batch_size
hvd_rank = 0
config = tf.ConfigProto()
if FLAGS.horovod:
    tf.logging.info("Multi-GPU training with TF Horovod")
    tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
    global_batch_size = FLAGS.train_batch_size * hvd.size()
    master_process = (hvd.rank() == 0)
    hvd_rank = hvd.rank()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
if FLAGS.use_xla:
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
run_config = tf.estimator.RunConfig(
    model_dir=FLAGS.output_dir if master_process else None,
    session_config=config,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
    keep_checkpoint_max=1)
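# Sketch (assumed wiring, following the pattern of the other snippets on
# this page): build the Estimator from this RunConfig and pass the Horovod
# hooks to train(); model_fn and train_input_fn are assumed defined
# elsewhere.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config)
estimator.train(
    input_fn=train_input_fn,
    max_steps=FLAGS.num_train_steps,
    hooks=training_hooks)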
def get_data(hps, sess):
    if hps.image_size == -1:
        hps.image_size = {'edges2shoes': 32, 'mnist': 32, 'cifar10': 32,
                          'imagenet-oord': 64, 'imagenet': 256, 'celeba': 256,
                          'lsun_realnvp': 64, 'lsun': 256}[hps.problem]
    if hps.n_test == -1:
        hps.n_test = {'edges2shoes': 200, 'mnist': 10000, 'cifar10': 10000,
                      'imagenet-oord': 50000, 'imagenet': 50000, 'celeba': 3000,
                      'lsun_realnvp': 300 * hvd.size(), 'lsun': 300 * hvd.size()}[hps.problem]
    hps.n_y = {'edges2shoes': 10, 'mnist': 10, 'cifar10': 10, 'imagenet-oord': 1000,
               'imagenet': 1000, 'celeba': 1, 'lsun_realnvp': 1, 'lsun': 1}[hps.problem]
    if hps.data_dir == "":
        hps.data_dir = {'edges2shoes': None, 'mnist': None, 'cifar10': None,
                        'imagenet-oord': '/mnt/host/imagenet-oord-tfr',
                        'imagenet': '/mnt/host/imagenet-tfr',
                        'celeba': '/mnt/host/celeba-reshard-tfr',
                        'lsun_realnvp': '/mnt/host/lsun_realnvp',
                        'lsun': '/mnt/host/lsun'}[hps.problem]
    hps.rnd_crop = (hps.problem == 'lsun_realnvp')
    if hps.category:
        hps.data_dir += ('/%s' % hps.category)
    # Use anchor_size to rescale batch size based on image_size
    s = hps.anchor_size
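    # Plausible continuation (a sketch, not necessarily the source): keep
    # per-GPU memory use roughly constant by scaling the local batch size
    # with (anchor_size / image_size) ** 2; hps.n_batch_train and
    # hps.local_batch_train are assumed hyperparameter fields.
    hps.local_batch_train = hps.n_batch_train * s * s // (hps.image_size * hps.image_size)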
def _log_summary(data_length, duration):
    logger = _get_logger()
    images_per_second = data_length / duration
    logger.info('Data length: {}'.format(data_length))
    logger.info('Total duration: {:.3f}'.format(duration))
    logger.info('Total images/sec: {:.3f}'.format(images_per_second))
    logger.info('Batch size: (Per GPU {}: Total {})'.format(
        _BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
    logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
    logger.info('Num GPUs: {}'.format(hvd.size() if _DISTRIBUTED else 1))
    logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
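# Hypothetical usage of _log_summary: time a fixed number of batches and
# report aggregate throughput (run_training and train_steps are
# assumptions, not names from the source).
from timeit import default_timer as timer

start = timer()
run_training()                         # assumed training entry point
duration = timer() - start
_log_summary(train_steps * _BATCHSIZE, duration)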
# (Signature reconstructed for context; the source excerpt begins at the
# final parameter. The function name and the parameters before
# `increased_aug` are inferred from the body, not from the excerpt.)
def make_dataset(filenames, take_count, batch_size, height, width,
                 training=False, shard=False, synthetic=False,
                 increased_aug=False,
                 ):
    if synthetic and training:
        input_shape = [height, width, 3]
        input_element = nest.map_structure(
            lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape))
        label_element = nest.map_structure(
            lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1]))
        element = (input_element, label_element)
        ds = tf.data.Dataset.from_tensors(element).repeat()
    else:
        shuffle_buffer_size = 10000
        num_readers = 1
        if hvd.size() > len(filenames):
            assert (hvd.size() % len(filenames)) == 0
            # Use integer division: list repetition needs an int, and `/`
            # would produce a float (a TypeError) under Python 3.
            filenames = filenames * (hvd.size() // len(filenames))
        ds = tf.data.Dataset.from_tensor_slices(filenames)
        if shard:
            # split the dataset into parts for each GPU
            ds = ds.shard(hvd.size(), hvd.rank())
        if not training:
            # make sure all ranks have the same amount
            ds = ds.take(take_count)
        if training:
            ds = ds.shuffle(1000, seed=7 * (1 + hvd.rank()))
        ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
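        # Sketch (assumed continuation, standard tf.data wiring): decode
        # each record, then batch and prefetch; parse_record is a
        # hypothetical parser mapping a serialized example to an
        # (image, label) pair.
        ds = ds.map(parse_record, num_parallel_calls=10)
        ds = ds.batch(batch_size)
        ds = ds.prefetch(1)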