Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_from_dask_dataframe(client):
    """Train and predict through the dask interface with dask dataframe inputs.

    Checks that the prediction is a one-dimensional dask array with ``kRows``
    entries, and that passing ``evals_result`` to ``xgb.dask.train`` raises
    ``ValueError``.
    """
    array_X, array_y = generate_array()
    frame_X = dd.from_dask_array(array_X)
    frame_y = dd.from_dask_array(array_y)

    dtrain = DaskDMatrix(client, frame_X, frame_y)
    train_output = xgb.dask.train(client, {}, dtrain, num_boost_round=2)
    booster = train_output['booster']

    prediction = xgb.dask.predict(client, model=booster, data=dtrain)
    assert prediction.ndim == 1
    assert isinstance(prediction, da.Array)
    assert prediction.shape[0] == kRows

    # evals_result is not supported in dask interface.
    with pytest.raises(ValueError):
        xgb.dask.train(
            client, {}, dtrain, num_boost_round=2, evals_result={})

    # Force the lazy prediction to be computed.
    prediction.compute()
def _check_outputs(out, predictions):
assert isinstance(out['booster'], dxgb.Booster)
assert len(out['history']['validation']['rmse']) == 2
assert isinstance(predictions, np.ndarray)
assert predictions.shape[0] == 1
client=client, data=dask_X_train, label=dask_label_train)
# Wrap the test features as a dask array, persist it, and ask the client to
# rebalance it.
dask_X_test = da.from_array(X_test, partition_size)
dask_X_test = dask_X_test.persist()
client.rebalance(dask_X_test)
# Same treatment for the test labels.
dask_label_test = da.from_array(y_test, partition_size)
dask_label_test = dask_label_test.persist()
client.rebalance(dask_label_test)
# Build the dask-backed test matrix from the two arrays above.
dtest = DaskDMatrix(
client=client, data=dask_X_test, label=dask_label_test)
# NOTE(review): gpu_res is not passed to the training call below in this
# excerpt, so nothing visible here fills it -- confirm whether that is intended.
gpu_res = {} # Store accuracy result
tmp = time.time()
# Train model
xgb.dask.train(client, param, dtrain, num_boost_round=num_round, evals=[
(dtest, 'test')])
# Elapsed wall-clock time of the dask training call, reported as the GPU run.
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
# TODO: https://github.com/dmlc/xgboost/issues/4518
# Rebuild train and test sets as plain xgb.DMatrix objects for the
# non-dask run below; this rebinds dtrain and dtest.
dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)
# Repeat for CPU algorithm
tmp = time.time()
# Mutates the same param dict that was used for the dask run above.
param['tree_method'] = 'hist'
cpu_res = {}
# Evaluation results for the 'test' set are collected into cpu_res.
xgb.train(param, dtrain, num_round, evals=[
(dtest, 'test')], evals_result=cpu_res)
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
def fit(self, data, args):
    """Train a booster on a local CUDA dask cluster and time the training.

    The training data is sliced with ``self.get_slices`` into one piece per
    dask worker, reassembled as dask arrays, and handed to
    ``xgb.dask.train``. The resulting booster is stored on ``self.model``.

    The client and the cluster are always closed, even when slicing, matrix
    construction or training raises, so a failed run does not leave a
    cluster behind.

    Parameters
    ----------
    data
        Object exposing ``X_train`` and ``y_train``.
    args
        Object exposing ``gpus``, ``root`` and ``ntrees``.

    Returns
    -------
    The ``interval`` attribute of the ``Timer`` wrapped around the
    ``xgb.dask.train`` call.
    """
    params = self.configure(data, args)
    # A negative args.gpus is forwarded to the cluster as n_workers=None.
    n_workers = None if args.gpus < 0 else args.gpus
    cluster = LocalCUDACluster(n_workers=n_workers,
                               local_directory=args.root)
    try:
        client = Client(cluster)
        try:
            # One partition per worker registered with the scheduler.
            n_partitions = len(client.scheduler_info()['workers'])
            X_sliced, y_sliced = self.get_slices(n_partitions,
                                                 data.X_train, data.y_train)
            X = da.concatenate(
                [da.from_array(sub_array) for sub_array in X_sliced])
            # Chunk rows by the size of the first slice; keep all columns
            # in a single chunk.
            X = X.rechunk((X_sliced[0].shape[0], data.X_train.shape[1]))
            y = da.concatenate(
                [da.from_array(sub_array) for sub_array in y_sliced])
            # Align the label chunks with the row chunks of X.
            y = y.rechunk(X.chunksize[0])
            dtrain = xgb.dask.DaskDMatrix(client, X, y)
            with Timer() as t:
                output = xgb.dask.train(client, params, dtrain,
                                        num_boost_round=args.ntrees)
            self.model = output['booster']
        finally:
            client.close()
    finally:
        cluster.close()
    return t.interval
def main(client):
    """Fit a ``DaskXGBRegressor`` using the ``hist`` tree method on random data.

    Prints the evaluation history and returns the trained booster.
    """
    # generate some random data for demonstration
    n_rows, n_cols = 10000, 100
    chunk = 100
    features = da.random.random((n_rows, n_cols), chunk)
    labels = da.random.random(n_rows, chunk)

    model = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
    model.set_params(tree_method='hist')
    # assigning client here is optional
    model.client = client

    # The training data doubles as the evaluation set.
    model.fit(features, labels, eval_set=[(features, labels)])
    prediction = model.predict(features)

    booster = model.get_booster()
    print('Evaluation history:', model.evals_result())

    # returned prediction is always a dask array.
    assert isinstance(prediction, da.Array)
    return booster  # returning the trained model
def main(client):
    """Fit a ``DaskXGBRegressor`` using the ``gpu_hist`` tree method on random data.

    Prints the evaluation history and returns the trained booster.
    """
    # generate some random data for demonstration
    n_rows, n_cols = 1000000, 100
    chunk = 10000
    features = da.random.random((n_rows, n_cols), chunk)
    labels = da.random.random(n_rows, chunk)

    model = xgboost.dask.DaskXGBRegressor(verbosity=1)
    model.set_params(tree_method='gpu_hist')
    # assigning client here is optional
    model.client = client

    # The training data doubles as the evaluation set.
    model.fit(features, labels, eval_set=[(features, labels)])
    prediction = model.predict(features)

    booster = model.get_booster()
    print('Evaluation history:', model.evals_result())

    # returned prediction is always a dask array.
    assert isinstance(prediction, da.Array)
    return booster  # returning the trained model
def main(client):
    """Train with ``xgb.dask.train`` on random data and predict on the training set.

    Prints the evaluation history and returns the prediction.
    """
    # generate some random data for demonstration
    n_rows = 100000
    n_cols = 100
    features = da.random.random(size=(n_rows, n_cols), chunks=100)
    labels = da.random.random(size=(n_rows, ), chunks=100)

    # DaskDMatrix acts like normal DMatrix, works as a proxy for local
    # DMatrix scatter around workers.
    dtrain = DaskDMatrix(client, features, labels)

    # Use train method from xgboost.dask instead of xgboost. This
    # distributed version of train returns a dictionary containing the
    # resulting booster and evaluation history obtained from
    # evaluation metrics.
    params = {'verbosity': 1,
              'nthread': 1,
              'tree_method': 'hist'}
    output = xgb.dask.train(client, params, dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])
    booster, history = output['booster'], output['history']

    # you can pass output directly into `predict` too.
    prediction = xgb.dask.predict(client, booster, dtrain)
    print('Evaluation history:', history)
    return prediction
warm_start=warm_start,
presort=presort) # h2o4gpu)
if random_state is None:
random_state = 0
self.distributed = False
import xgboost as xgb
from ..util.gpu import device_count
n_gpus, _ = device_count(n_gpus)
if n_gpus > 1 and CUDA_DASK_INSTALLED:
self.distributed = True
from dask_cuda import LocalCUDACluster
cluster = LocalCUDACluster(n_workers=n_gpus, threads_per_worker=1)
self.model_h2o4gpu = xgb.dask.DaskXGBClassifier(
learning_rate=learning_rate, # h2o4gpu
n_estimators=n_estimators, # h2o4gpu
subsample=subsample, # h2o4gpu
max_depth=max_depth, # h2o4gpu
random_state=random_state, # h2o4gpu
verbose=verbose, # h2o4gpu
colsample_bytree=colsample_bytree, # h2o4gpu
colsample_bylevel=colsample_bylevel,
colsample_bynode=colsample_bynode,
num_parallel_tree=num_parallel_tree, # h2o4gpu
tree_method=tree_method, # h2o4gpu
predictor=predictor, # h2o4gpu
objective=objective,
booster=booster,
n_jobs=n_jobs,
gamma=gamma,