import xgboost as xgb

def run_test(name, params_fun):
    """Runs a distributed GPU test."""
    # Always call this before using the distributed module.
    xgb.rabit.init()
    rank = xgb.rabit.get_rank()
    world = xgb.rabit.get_world_size()
    # Load the files; they are sharded automatically in distributed mode.
    dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')
    params, n_rounds = params_fun(rank)
    # Specify a validation set to watch performance.
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    # Run training; all features of the training API are available.
    # Currently, this script only supports calling train once, for fault-recovery purposes.
    bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2)
    # Have each worker save its model.
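# For context, the `params_fun` callable that run_test expects maps a worker's
# rank to a (params, n_rounds) pair. The sketch below is illustrative only: the
# name `gpu_params` and the specific parameter values are assumptions, not part
# of the original test.
import xgboost as xgb

def gpu_params(rank):
    # Hypothetical parameter function: pin each worker to its own GPU by rank.
    params = {
        'tree_method': 'gpu_hist',
        'gpu_id': rank,
        'objective': 'binary:logistic',
    }
    n_rounds = 20
    return params, n_rounds

# run_test('gpu_hist', gpu_params)  # run under a DMLC/Rabit tracker, one process per worker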
# [Excerpt truncated: the nested-list literal defining X (raw numeric feature
#  values, one row per label in y below) is cut off in this excerpt.]
X = np.array(X)
y = [1, 0]
dtrain = xgb.DMatrix(X, label=y)
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
watchlist = [(dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
if xgb.rabit.get_rank() == 0:
    bst.save_model("test_issue3402.model")
    xgb.rabit.tracker_print("Finished training\n")
# Notify the tracker that training completed successfully.
# This is only needed in distributed training.
xgb.rabit.finalize()
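# A minimal sketch (not part of the original snippets) of the same
# init -> train -> finalize lifecycle shown above, with finalize() guarded by
# try/finally so the tracker is always notified even if training fails. The
# function name and the output path are hypothetical.
import xgboost as xgb

def train_distributed(params, num_round, train_path):
    xgb.rabit.init()
    try:
        # In distributed mode the input file is sharded across workers automatically.
        dtrain = xgb.DMatrix(train_path)
        bst = xgb.train(params, dtrain, num_round)
        # Only rank 0 persists the model; the workers hold an identical booster.
        if xgb.rabit.get_rank() == 0:
            bst.save_model('distributed.model')
        return bst
    finally:
        xgb.rabit.finalize()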
if op.tracker is None:
    # Non-distributed training: run locally and store the pickled booster.
    local_history = dict()
    kwargs = dict() if op.kwargs is None else op.kwargs
    bst = train(params, dtrain, evals=evals,
                evals_result=local_history, **kwargs)
    ctx[op.outputs[0].key] = {'booster': pickle.dumps(bst), 'history': local_history}
else:
    # Distributed training: join the Rabit ring described by the tracker arguments.
    rabit_args = ctx[op.tracker.key]
    rabit.init(rabit_args)
    try:
        local_history = dict()
        bst = train(params, dtrain, evals=evals, evals_result=local_history,
                    **op.kwargs)
        ret = {'booster': pickle.dumps(bst), 'history': local_history}
        # Only rank 0 keeps the trained booster; the other workers return an empty dict.
        if rabit.get_rank() != 0:
            ret = {}
        ctx[op.outputs[0].key] = ret
    finally:
        # Always tear down Rabit, even if training raised.
        rabit.finalize()
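# Follow-up sketch (assumed, not taken from the source above): because only rank 0
# stores a non-empty dict, a caller can recover the trained booster by picking the
# single non-empty result and unpickling it. `worker_outputs` is a hypothetical
# list of the per-worker dicts written into ctx.
import pickle

def collect_booster(worker_outputs):
    for out in worker_outputs:
        if out:
            return pickle.loads(out['booster']), out['history']
    raise ValueError('no worker returned a trained booster')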
# (Tail of the connection-retry logic; the enclosing retry loop is not shown in
#  this excerpt.)
time.sleep(self.connect_retry_timeout)

if not successful_connection:
    self.logger.error("Failed to connect to Rabit Tracker after %s attempts",
                      self.max_connect_attempts)
    raise Exception("Failed to connect to Rabit Tracker")
else:
    self.logger.info("Connected to RabitTracker.")

rabit.init(['DMLC_NUM_WORKER={}'.format(self.n_workers).encode(),
            'DMLC_TRACKER_URI={}'.format(self.master_host).encode(),
            'DMLC_TRACKER_PORT={}'.format(self.port).encode()])

# We can check that the Rabit instance has successfully connected to the
# tracker by asking for this instance's rank (i.e. its position in the ring),
# which should be unique for each instance.
self.logger.debug("Rabit started - Rank {}".format(rabit.get_rank()))
self.logger.debug("Executing user code")

# We can now run user code. Since XGBoost runs in the same process space, it
# will use the same Rabit instance that we have configured here. XGBoost checks
# throughout the learning process whether it is running in distributed mode by
# calling Rabit APIs; if it is, it performs the synchronization automatically.
# Hence any XGBoost-specific training code executed from this point on is
# distributed automatically.
return RabitHelper(self.is_master_host, self.current_host, self.port)
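# Sketch (not the library's implementation) of the pattern the method above
# supports: a context manager that joins the Rabit ring on entry, hands back
# the worker's rank, and always finalizes Rabit on exit. The class name
# `MiniRabitContext` is hypothetical.
import xgboost as xgb

class MiniRabitContext:
    def __init__(self, rabit_args):
        # rabit_args: list of b'DMLC_*=value' strings, as passed to rabit.init above.
        self.rabit_args = rabit_args

    def __enter__(self):
        xgb.rabit.init(self.rabit_args)
        return xgb.rabit.get_rank()

    def __exit__(self, exc_type, exc_value, traceback):
        xgb.rabit.finalize()
        return False

# Usage: inside `with MiniRabitContext(args) as rank:`, any xgb.train() call is
# synchronized across the ring automatically.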
def __init__(self, is_master, current_host, master_port):
    """Returned by the Rabit context manager; exposes useful cluster information
    and helpers for data synchronization.

    :param is_master: whether this host is the master (tracker) host
    :param current_host: name or address of the host this worker runs on
    :param master_port: port on which the Rabit tracker is listening
    """
    self.is_master = is_master
    self.rank = rabit.get_rank()
    self.current_host = current_host
    self.master_port = master_port
def _get_worker_name(self):
    return "worker_{}".format(xgb.rabit.get_rank())