Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
@metrics.profile("torchelastic")
def init_process_group(self):
self.monitor_progress_step = 0
dist.init_process_group(
self.c10d_backend,
timeout=timedelta(milliseconds=self.process_group_timeout),
world_size=self.world_size,
rank=self.rank,
store=self.store,
)
if self.c10d_backend == dist.Backend.GLOO:
self.coordinator_process_group = dist.group.WORLD
else:
# We don't need to use NCCL process group for control plane
# collective operations, this helps us simplify our code (no need
# to make it portable with NCCL)
@metrics.profile("torchelastic")
def rendezvous_barrier(self):
    """
    Tear down the current process group and join the next rendezvous.

    On success, ``self.store``, ``self.rank`` and ``self.world_size`` are
    replaced with the values returned by
    ``self.rendezvous.next_rendezvous()``.

    Raises:
        StopException: ``next_rendezvous()`` raised
            ``RendezvousClosedException``. ``self.stop_training`` is set
            to ``True`` before the exception is raised.
        NonRetryableException: any other ``Exception`` escaped
            ``next_rendezvous()``. The original error is attached as the
            cause and its text is embedded in the message.
    """
    self._destroy_process_group()
    try:
        self.store, self.rank, self.world_size = self.rendezvous.next_rendezvous()
    except RendezvousClosedException as e:
        # Record the closure on this instance before signalling the stop.
        self.stop_training = True
        raise StopException(
            "Rank {0} received RendezvousClosedException."
            " Raising a StopException".format(self.rank)
        ) from e
    except Exception as e:
        # `Exception` already covers RuntimeError, so one clause suffices.
        # Chain explicitly so the traceback shows the root cause as the
        # direct cause rather than as an error raised during handling.
        raise NonRetryableException(
            "Rank {0} received an Exception."
            " Detailed message: {1}".format(self.rank, str(e))
        ) from e
@metrics.profile("torchelastic")
def load_checkpoint(self, state, rank: int):
"""
Loads checkpoint if the checkpoint manager has been configured and
at least one worker has already loaded the checkpoint
"""
if not self.checkpoint_manager:
# checkpoint not enabled
return state
# all gather `checkpoint_loaded` from all trainers, return true
# if any trainer have ever loaded checkpoint
any_checkpoint_loaded = (
edist.all_gather_return_max_long(1 if self.checkpoint_loaded else 0) == 1
)
if any_checkpoint_loaded:
@metrics.profile("torchelastic")
def save_checkpoint(self, state, rank: int):
"""
TODO: https://github.com/pytorch/elastic/issues/9
"""
if (
self.checkpoint_manager # checkpoint enabled
and (
self.coordinator.should_save_checkpoint()
or state.should_save_checkpoint(rank)
)
# ASSUMPTION: `state.should_save_checkpoint()` return
# consistent value for all workers.
):
# we will save checkpoint if coordinator/platform told us
# or the application told us to do.
# ASSUMPTION: PET built on DDP, which has an implicit barrier