How to use the xgboost.rabit.get_rank function in xgboost

To help you get started, we've selected a few xgboost.rabit.get_rank examples, based on popular ways it is used in public projects.

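xgboost.rabit.get_rank() reports this worker's position in the Rabit ring (rank 0 is the first worker) and is normally called between xgb.rabit.init() and xgb.rabit.finalize(). A minimal sketch of that flow follows; it assumes an xgboost release that still ships the xgboost.rabit module (newer releases expose the same calls through xgboost.collective) and a script launched under a DMLC/Rabit tracker. Without a tracker the calls fall back to a single worker, so get_rank() simply returns 0.

import xgboost as xgb

# Minimal sketch: initialize Rabit, look up this worker's rank and the
# total number of workers, then shut the ring down again.
xgb.rabit.init()
try:
    rank = xgb.rabit.get_rank()         # this worker's position in the ring
    world = xgb.rabit.get_world_size()  # total number of workers
    xgb.rabit.tracker_print("worker {} of {} is up\n".format(rank, world))
finally:
    xgb.rabit.finalize()
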
github dmlc / xgboost / tests / distributed / distributed_gpu.py (View on Github)
def run_test(name, params_fun):
    """Runs a distributed GPU test."""
    # Always call this before using the distributed module
    xgb.rabit.init()
    rank = xgb.rabit.get_rank()
    world = xgb.rabit.get_world_size()

    # Load the data files; they will be automatically sharded in distributed mode.
    dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')

    params, n_rounds = params_fun(rank)

    # Specify validation sets to watch performance
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]

    # Run training; all the features of the training API are available.
    # Currently, this script only supports calling train once, for fault-recovery purposes.
    bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2)

    # Have each worker save its model
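
The snippet is cut off here. A hypothetical continuation (an assumption for illustration, not the repository's exact code) would tag each saved model with the worker's rank so workers do not overwrite one another, then shut Rabit down:

    bst.save_model("test.model.{}.{}".format(name, rank))
    xgb.rabit.tracker_print("Finished training on rank {}\n".format(rank))
    xgb.rabit.finalize()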
github dmlc / xgboost / tests / distributed / test_issue3402.py (View on Github)
5.16,75.39,900.00,232.10,3.00,5.00,6049.88,1.00,126.00,46.00,0.59,0.15,0.00,8.00,
   7.00,0.00,577.25,0.00,0.07,2415.10,0.00,83.72,9.00,1.76,0.20,0.00,0.17,3278.65,155.26,
   4415.50,22731.62,1.00,55.00,0.00,499.94,22.00,0.58,67.00,0.21,341.72,16.00,0.00,965.07,
   17.00,138.41,0.00,0.00,1.00,0.14,1.00,0.02,0.35,1.69,369.00,1300.00,25.00,0.00,0.01,
   0.00,0.00,0.00,0.00,52.00,8.00]]
X = np.array(X)
y = [1, 0]

dtrain = xgb.DMatrix(X, label=y)

param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic' }
watchlist  = [(dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)

if xgb.rabit.get_rank() == 0:
  bst.save_model("test_issue3402.model")
  xgb.rabit.tracker_print("Finished training\n")

# Notify the tracker all training has been successful
# This is only needed in distributed training.
xgb.rabit.finalize()
github mars-project / mars / mars / learn / contrib / xgboost / train.py (View on Github)
            # non distributed
            local_history = dict()
            kwargs = dict() if op.kwargs is None else op.kwargs
            bst = train(params, dtrain, evals=evals,
                        evals_result=local_history, **kwargs)
            ctx[op.outputs[0].key] = {'booster': pickle.dumps(bst), 'history': local_history}
        else:
            # distributed
            rabit_args = ctx[op.tracker.key]
            rabit.init(rabit_args)
            try:
                local_history = dict()
                bst = train(params, dtrain, evals=evals, evals_result=local_history,
                            **op.kwargs)
                ret = {'booster': pickle.dumps(bst), 'history': local_history}
                if rabit.get_rank() != 0:
                    ret = {}
                ctx[op.outputs[0].key] = ret
            finally:
                rabit.finalize()
github aws / sagemaker-xgboost-container / src / sagemaker_xgboost_container / distributed.py (View on Github)
time.sleep(self.connect_retry_timeout)

        if not successful_connection:
            self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts)
            raise Exception("Failed to connect to Rabit Tracker")
        else:
            self.logger.info("Connected to RabitTracker.")

        rabit.init(['DMLC_NUM_WORKER={}'.format(self.n_workers).encode(),
                    'DMLC_TRACKER_URI={}'.format(self.master_host).encode(),
                    'DMLC_TRACKER_PORT={}'.format(self.port).encode()])

        # We can check that the Rabit instance has successfully connected to the
        # tracker by asking for this instance's rank (i.e. its position in the
        # ring); the rank should be unique for each instance.
        self.logger.debug("Rabit started - Rank {}".format(rabit.get_rank()))
        self.logger.debug("Executing user code")

        # We can now run user-code. Since XGBoost runs in the same process space
        # it will use the same instance of Rabit that we have configured. It has
        # a number of checks throughout the learning process to see if it is running
        # in distributed mode by calling Rabit APIs. If it is, it will do the
        # synchronization automatically.
        #
        # Hence we can now execute any XGBoost-specific training code and it
        # will be distributed automatically.
        return RabitHelper(self.is_master_host, self.current_host, self.port)
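
Because the communicator lives in the process itself, any user code the container invokes afterwards can query the same ring directly. A hedged sketch of such user code (the function name and file name are placeholders; only the xgb.rabit and xgb.train calls are the real API):

import xgboost as xgb

def user_training_code(dtrain, params, num_round=10):
    # Runs in the process where Rabit was initialized above, so xgb.train()
    # synchronizes gradient statistics across workers on its own.
    bst = xgb.train(params, dtrain, num_round)

    # Rank-specific work: only the first worker persists the model.
    if xgb.rabit.get_rank() == 0:
        bst.save_model("model.bst")
    return bst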
github aws / sagemaker-xgboost-container / src / sagemaker_xgboost_container / distributed.py (View on Github)
    def __init__(self, is_master, current_host, master_port):
        """This is returned by the Rabit context manager for useful cluster information and data synchronization.

        :param is_master: True if this host was chosen as the Rabit master host.
        :param current_host: name of the host this worker is running on.
        :param master_port: port on which the Rabit tracker is listening.
        """
        self.is_master = is_master
        self.rank = rabit.get_rank()
        self.current_host = current_host
        self.master_port = master_port
github awslabs / sagemaker-debugger / smdebug / xgboost / hook.py (View on Github)
    def _get_worker_name(self):
        return "worker_{}".format(xgb.rabit.get_rank())