How to use the xgboost.dask.DaskDMatrix class in xgboost

To help you get started, we’ve selected a few xgboost.dask.DaskDMatrix examples, drawn from popular ways the class is used in public projects.
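Before the project excerpts, here is a minimal sketch of the usual pattern, assuming a local Dask cluster; the cluster setup and parameter values are illustrative rather than taken from any of the projects below:

    import dask.array as da
    import xgboost as xgb
    from dask.distributed import Client, LocalCluster
    from xgboost.dask import DaskDMatrix

    # Illustrative local cluster; any running Dask client works.
    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)

    # Random demo data as Dask arrays.
    X = da.random.random(size=(1000, 10), chunks=100)
    y = da.random.random(size=(1000,), chunks=100)

    # DaskDMatrix holds references to the distributed data; it does not
    # pull the partitions back to the client.
    dtrain = DaskDMatrix(client, X, y)

    # xgb.dask.train returns a dict with the trained booster and the
    # evaluation history.
    output = xgb.dask.train(client, {'tree_method': 'hist'}, dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])
    prediction = xgb.dask.predict(client, output['booster'], dtrain).compute()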


From dmlc/xgboost: tests/python-gpu/test_gpu_with_dask.py
# Excerpt from a test; dxgb aliases xgboost.dask, dd is dask.dataframe,
# and _check_outputs is a helper defined elsewhere in the test module.
dtrain = dxgb.DaskDMatrix(client, X, y)

out = dxgb.train(client, parameters,
                 dtrain=dtrain,
                 evals=[(dtrain, 'validation')],
                 num_boost_round=2)
predictions = dxgb.predict(client=client, model=out,
                           data=dtrain).compute()
_check_outputs(out, predictions)

# Rebuild the training data so train has more rows than evals.
valid = dtrain
kRows += 1
X = dd.from_array(np.random.randn(kRows, kCols))
y = dd.from_array(np.random.rand(kRows))
dtrain = dxgb.DaskDMatrix(client, X, y)

out = dxgb.train(client, parameters,
                 dtrain=dtrain,
                 evals=[(valid, 'validation')],
                 num_boost_round=2)
predictions = dxgb.predict(client=client, model=out,
                           data=valid).compute()
_check_outputs(out, predictions)
From dmlc/xgboost: tests/python-gpu/test_gpu_with_dask.py
def test_dask_dataframe(self):
    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            X, y = generate_array()

            X = dd.from_dask_array(X)
            y = dd.from_dask_array(y)

            # Move each pandas partition onto the GPU as a cudf frame.
            X = X.map_partitions(cudf.from_pandas)
            y = y.map_partitions(cudf.from_pandas)

            dtrain = dxgb.DaskDMatrix(client, X, y)
            out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                             dtrain=dtrain,
                             evals=[(dtrain, 'X')],
                             num_boost_round=2)

            assert isinstance(out['booster'], dxgb.Booster)
            assert len(out['history']['X']['rmse']) == 2

            predictions = dxgb.predict(client, out, dtrain).compute()
            assert isinstance(predictions, np.ndarray)
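This fragment leans on GPU-specific dependencies that the excerpt does not show. A plausible import block, inferred from the names used rather than copied from the test file (`generate_array` is a helper defined elsewhere in the test module):

    import numpy as np
    import dask.dataframe as dd
    import cudf                          # GPU dataframes
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from xgboost import dask as dxgb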
From h2oai/h2o4gpu: tests/python/open_data/gbm/test_xgboost.py
# Excerpt from a larger test; dask_X_train is prepared the same way as
# the arrays below: converted with da.from_array, persisted, then
# rebalanced across the workers.
client.rebalance(dask_X_train)
dask_label_train = da.from_array(y_train, partition_size)
dask_label_train = dask_label_train.persist()
client.rebalance(dask_label_train)

dtrain = DaskDMatrix(
    client=client, data=dask_X_train, label=dask_label_train)

dask_X_test = da.from_array(X_test, partition_size)
dask_X_test = dask_X_test.persist()
client.rebalance(dask_X_test)
dask_label_test = da.from_array(y_test, partition_size)
dask_label_test = dask_label_test.persist()
client.rebalance(dask_label_test)

dtest = DaskDMatrix(
    client=client, data=dask_X_test, label=dask_label_test)

gpu_res = {}  # Store accuracy result
tmp = time.time()
# Train model
xgb.dask.train(client, param, dtrain, num_boost_round=num_round,
               evals=[(dtest, 'test')])
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

# TODO: https://github.com/dmlc/xgboost/issues/4518
dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)
# Repeat for CPU algorithm
tmp = time.time()
param['tree_method'] = 'hist'
cpu_res = {}
From NVIDIA/gbm-bench: algorithms.py
def fit(self, data, args):
    params = self.configure(data, args)
    n_workers = None if args.gpus < 0 else args.gpus
    cluster = LocalCUDACluster(n_workers=n_workers,
                               local_directory=args.root)
    client = Client(cluster)
    n_partitions = len(client.scheduler_info()['workers'])
    X_sliced, y_sliced = self.get_slices(n_partitions,
                                         data.X_train, data.y_train)
    X = da.concatenate([da.from_array(sub_array) for sub_array in X_sliced])
    X = X.rechunk((X_sliced[0].shape[0], data.X_train.shape[1]))
    y = da.concatenate([da.from_array(sub_array) for sub_array in y_sliced])
    y = y.rechunk(X.chunksize[0])
    dtrain = xgb.dask.DaskDMatrix(client, X, y)
    with Timer() as t:
        output = xgb.dask.train(client, params, dtrain,
                                num_boost_round=args.ntrees)
    self.model = output['booster']
    client.close()
    cluster.close()
    return t.interval
From dmlc/xgboost: demo/dask/gpu_training.py
def main(client):
    # Generate some random data for demonstration.
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    # DaskDMatrix acts like a normal DMatrix; it works as a proxy for
    # local DMatrix objects scattered across the workers.
    dtrain = DaskDMatrix(client, X, y)

    # Use the train method from xgboost.dask instead of xgboost.  This
    # distributed version of train returns a dictionary containing the
    # resulting booster and the evaluation history obtained from the
    # evaluation metrics.
    output = xgb.dask.train(client,
                            {'verbosity': 2,
                             'nthread': 1,
                             # Golden line for GPU training
                             'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']

    # You can pass output directly into `predict`, too.
    prediction = xgb.dask.predict(client, bst, dtrain)
From dmlc/xgboost: demo/dask/cpu_training.py
def main(client):
    # Generate some random data for demonstration.
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    # DaskDMatrix acts like a normal DMatrix; it works as a proxy for
    # local DMatrix objects scattered across the workers.
    dtrain = DaskDMatrix(client, X, y)

    # Use the train method from xgboost.dask instead of xgboost.  This
    # distributed version of train returns a dictionary containing the
    # resulting booster and the evaluation history obtained from the
    # evaluation metrics.
    output = xgb.dask.train(client,
                            {'verbosity': 1,
                             'nthread': 1,
                             'tree_method': 'hist'},
                            dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']

    # You can pass output directly into `predict`, too.
    prediction = xgb.dask.predict(client, bst, dtrain)
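In both demos, `prediction` is a lazy Dask collection rather than an in-memory array; as the test snippets above do, call `.compute()` on it to materialize the results locally:

    local_predictions = prediction.compute()  # gathers results to the client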