How to use the torchelastic.metrics module in torchelastic

To help you get started, we’ve selected a few torchelastic.metrics examples based on popular ways it is used in public projects.

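All of the examples below come from the pytorch/elastic repository and use the `@metrics.profile(<group>)` decorator, which times the wrapped method and records whether it succeeded or raised, under the named metric group. Before those datapoints go anywhere, a metric handler has to be configured for the group. The sketch below shows the general pattern in user code; `configure` and `ConsoleMetricHandler` are part of torchelastic's metrics API, but treat the exact import paths and signatures as assumptions and check them against the version you have installed.

import time

import torchelastic.metrics as metrics

# A minimal sketch, assuming `configure` and `ConsoleMetricHandler` are
# exposed by your torchelastic version; verify before relying on them.
from torchelastic.metrics import ConsoleMetricHandler, configure

# Route all metrics published under the "my_app" group to the console.
configure(ConsoleMetricHandler(), group="my_app")


@metrics.profile("my_app")
def train_step():
    # The decorator records the call's duration and success/failure
    # under the "my_app" metric group.
    time.sleep(0.1)


if __name__ == "__main__":
    for _ in range(3):
        train_step()

The "my_app" group name is arbitrary; the coordinator examples below publish under the group "torchelastic".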

From pytorch/elastic: torchelastic/p2p/coordinator_p2p.py
    @metrics.profile("torchelastic")
    def init_process_group(self):
        self.monitor_progress_step = 0
        dist.init_process_group(
            self.c10d_backend,
            timeout=timedelta(milliseconds=self.process_group_timeout),
            world_size=self.world_size,
            rank=self.rank,
            store=self.store,
        )

        if self.c10d_backend == dist.Backend.GLOO:
            self.coordinator_process_group = dist.group.WORLD
        else:
            # We don't need to use the NCCL process group for control-plane
            # collective operations; this keeps the code simple (no need to
            # make it portable with NCCL).
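
Here `@metrics.profile("torchelastic")` wraps `init_process_group`, so the time spent initializing the process group, and whether it succeeded, is reported under the "torchelastic" group. The sketch below is not torchelastic's implementation, just an illustration of the shape of such a decorator: time the call, count success or failure, and re-raise on error.

import functools
import time


def record_metric(group, name, value):
    # Stand-in sink for metric datapoints; torchelastic routes these
    # through whatever MetricHandler was configured for the group.
    print(f"[{group}] {name}={value}")


def profile_sketch(group):
    # Illustrative profile-style decorator, not torchelastic's implementation.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                record_metric(group, f"{func.__name__}.success", 1)
                return result
            except Exception:
                record_metric(group, f"{func.__name__}.failure", 1)
                raise
            finally:
                elapsed_ms = (time.perf_counter() - start) * 1000
                record_metric(group, f"{func.__name__}.duration.ms", elapsed_ms)

        return wrapper

    return decorator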

From pytorch/elastic: torchelastic/p2p/coordinator_p2p.py
    @metrics.profile("torchelastic")
    def rendezvous_barrier(self):
        self._destroy_process_group()
        try:
            self.store, self.rank, self.world_size = self.rendezvous.next_rendezvous()
        except RendezvousClosedException:
            # Signal the training loop to stop before re-raising
            self.stop_training = True
            raise StopException(
                "Rank {0} received RendezvousClosedException."
                " Raising a StopException".format(self.rank)
            )
        except Exception as e:
            raise NonRetryableException(
                "Rank {0} received an Exception."
                " Detailed message: {1}".format(self.rank, str(e))
            )
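
The decorator does not swallow exceptions: `rendezvous_barrier` translates rendezvous failures into `StopException` or `NonRetryableException`, and those still propagate to the caller while the failed call is recorded in the metrics. Below is a small sketch of that caller-side pattern, using a made-up `RetryableError` and a fake rendezvous function rather than torchelastic's real rendezvous classes.

import torchelastic.metrics as metrics


class RetryableError(Exception):
    # Hypothetical exception type for this example only.
    pass


@metrics.profile("my_app")
def flaky_rendezvous(attempt):
    # Fails on early attempts to show that the profiled call re-raises;
    # each failure is still recorded under the "my_app" group.
    if attempt < 2:
        raise RetryableError(f"rendezvous not ready on attempt {attempt}")
    return "store", 0, 4  # placeholder for (store, rank, world_size)


def run_with_retries(max_attempts=5):
    for attempt in range(max_attempts):
        try:
            return flaky_rendezvous(attempt)
        except RetryableError:
            continue
    raise RuntimeError("rendezvous did not complete")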

From pytorch/elastic: torchelastic/checkpoint/api.py
    @metrics.profile("torchelastic")
    def load_checkpoint(self, state, rank: int):
        """
        Loads checkpoint if the checkpoint manager has been configured and
        at least one worker has already loaded the checkpoint
        """
        if not self.checkpoint_manager:
            # checkpoint not enabled
            return state

        # all-gather `checkpoint_loaded` from all trainers; returns True
        # if any trainer has ever loaded a checkpoint
        any_checkpoint_loaded = (
            edist.all_gather_return_max_long(1 if self.checkpoint_loaded else 0) == 1
        )

        if any_checkpoint_loaded:
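
`edist.all_gather_return_max_long` is a torchelastic helper that gathers an integer from every trainer and returns the maximum, so the check above amounts to "has any worker loaded a checkpoint". If you need the same any-worker check without that helper, a MAX all-reduce over `torch.distributed` gives the same answer; the sketch below assumes the default process group has already been initialized.

import torch
import torch.distributed as dist


def any_worker(flag: bool) -> bool:
    # Each rank contributes 0 or 1; the MAX across ranks is 1 if and only
    # if at least one rank passed True. Requires an initialized process group.
    t = torch.tensor([1 if flag else 0], dtype=torch.long)
    dist.all_reduce(t, op=dist.ReduceOp.MAX)
    return bool(t.item())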

From pytorch/elastic: torchelastic/checkpoint/api.py
    @metrics.profile("torchelastic")
    def save_checkpoint(self, state, rank: int):
        """
        TODO: https://github.com/pytorch/elastic/issues/9
        """
        if (
            self.checkpoint_manager  # checkpoint enabled
            and (
                self.coordinator.should_save_checkpoint()
                or state.should_save_checkpoint(rank)
            )
            # ASSUMPTION: `state.should_save_checkpoint()` returns a
            # consistent value for all workers.
        ):
            # Save a checkpoint if the coordinator/platform or the
            # application told us to.
            # ASSUMPTION: PET is built on DDP, which has an implicit barrier
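
Both checkpoint methods carry the same `@metrics.profile("torchelastic")` decorator, so checkpoint save and load latency is reported alongside the coordinator metrics. The same pattern works for application-level checkpointing; the file path and state layout below are made up for illustration and are not torchelastic APIs.

import torch
import torchelastic.metrics as metrics

CHECKPOINT_PATH = "/tmp/model_checkpoint.pt"  # illustrative path


@metrics.profile("my_app")
def save_checkpoint(model, optimizer, epoch):
    # Checkpoint save time shows up under the "my_app" metric group.
    torch.save(
        {
            "epoch": epoch,
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        CHECKPOINT_PATH,
    )


@metrics.profile("my_app")
def load_checkpoint(model, optimizer):
    # Checkpoint load time is recorded the same way.
    state = torch.load(CHECKPOINT_PATH)
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    return state["epoch"]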