Data Structures
class	CommunicatorContext

class	DaskDeviceQuantileDMatrix

class	DaskDMatrix

class	DaskPartitionIter

class	DaskQuantileDMatrix

class	DaskScikitLearnBase

class	DaskXGBClassifier

class	DaskXGBRanker

class	DaskXGBRegressor

class	DaskXGBRFClassifier

class	DaskXGBRFRegressor

Functions
Dict[str, Union[int, str]]	_try_start_tracker (int n_workers, List[Union[Optional[str], Optional[Tuple[str, int]]]] addrs)

Dict[str, Union[int, str]]	_start_tracker (int n_workers, Optional[str] addr_from_dask, Optional[Tuple[str, int]] addr_from_user)

None	_assert_dask_support ()

_T	dconcat (Sequence[_T] value)

"distributed.Client"	_xgb_get_client (Optional["distributed.Client"] client)

List[_MapRetT]	map_worker_partitions (Optional["distributed.Client"] client, Callable[..., _MapRetT] func, *Any refs, Sequence[str] workers)

Dict[str, List[Any]]	_get_worker_parts (_DataParts list_of_parts)

QuantileDMatrix	_create_quantile_dmatrix (Optional[FeatureNames] feature_names, Optional[Union[Any, List[Any]]] feature_types, Optional[Any] feature_weights, float missing, int nthread, Optional[_DataParts] parts, int max_bin, bool enable_categorical, Optional[DMatrix] ref=None)

DMatrix	_create_dmatrix (Optional[FeatureNames] feature_names, Optional[Union[Any, List[Any]]] feature_types, Optional[Any] feature_weights, float missing, int nthread, bool enable_categorical, Optional[_DataParts] parts)

DMatrix	_dmatrix_from_list_of_parts (bool is_quantile, **Any kwargs)

Dict[str, Union[str, int]]	_get_rabit_args (int n_workers, Optional[Dict[str, Any]] dconfig, "distributed.Client" client)

Optional[Dict[str, Any]]	_get_dask_config ()

List[str]	_get_workers_from_data (DaskDMatrix dtrain, Optional[Sequence[Tuple[DaskDMatrix, str]]] evals)

Optional[TrainReturnT]	_filter_empty (Booster booster, TrainingCallback.EvalsLog local_history, bool is_valid)

None	_check_workers_are_alive (List[str] workers, "distributed.Client" client)

Optional[TrainReturnT]	_train_async ("distributed.Client" client, Dict[str, Any] global_config, Optional[Dict[str, Any]] dconfig, Dict[str, Any] params, DaskDMatrix dtrain, int num_boost_round, Optional[Sequence[Tuple[DaskDMatrix, str]]] evals, Optional[Objective] obj, Optional[Metric] feval, Optional[int] early_stopping_rounds, Union[int, bool] verbose_eval, Optional[Booster] xgb_model, Optional[Sequence[TrainingCallback]] callbacks, Optional[Metric] custom_metric)

Any	train ("distributed.Client" client, Dict[str, Any] params, DaskDMatrix dtrain, int num_boost_round=10, *Optional[Sequence[Tuple[DaskDMatrix, str]]] evals=None, Optional[Objective] obj=None, Optional[Metric] feval=None, Optional[int] early_stopping_rounds=None, Optional[Booster] xgb_model=None, Union[int, bool] verbose_eval=True, Optional[Sequence[TrainingCallback]] callbacks=None, Optional[Metric] custom_metric=None)

bool	_can_output_df (bool is_df, Tuple output_shape)

Any	_maybe_dataframe (Any data, Any prediction, List[int] columns, bool is_df)

_DaskCollection	_direct_predict_impl (Callable mapped_predict, "distributed.Future" booster, _DataT data, Optional[_DaskCollection] base_margin, Tuple[int,...] output_shape, Dict[int, str] meta)

Tuple[Tuple[int,...], Dict[int, str]]	_infer_predict_output (Booster booster, int features, bool is_df, bool inplace, **Any kwargs)

"distributed.Future"	_get_model_future ("distributed.Client" client, Union[Booster, Dict, "distributed.Future"] model)

_DaskCollection	_predict_async ("distributed.Client" client, Dict[str, Any] global_config, Union[Booster, Dict, "distributed.Future"] model, _DataT data, bool output_margin, float missing, bool pred_leaf, bool pred_contribs, bool approx_contribs, bool pred_interactions, bool validate_features, Tuple[int, int] iteration_range, bool strict_shape)

Any	predict (Optional["distributed.Client"] client, Union[TrainReturnT, Booster, "distributed.Future"] model, Union[DaskDMatrix, _DataT] data, bool output_margin=False, float missing=numpy.nan, bool pred_leaf=False, bool pred_contribs=False, bool approx_contribs=False, bool pred_interactions=False, bool validate_features=True, Tuple[int, int] iteration_range=(0, 0), bool strict_shape=False)

_DaskCollection	_inplace_predict_async ("distributed.Client" client, Dict[str, Any] global_config, Union[Booster, Dict, "distributed.Future"] model, _DataT data, Tuple[int, int] iteration_range, str predict_type, float missing, bool validate_features, Optional[_DaskCollection] base_margin, bool strict_shape)

Any	inplace_predict (Optional["distributed.Client"] client, Union[TrainReturnT, Booster, "distributed.Future"] model, _DataT data, Tuple[int, int] iteration_range=(0, 0), str predict_type="value", float missing=numpy.nan, bool validate_features=True, Optional[_DaskCollection] base_margin=None, bool strict_shape=False)

Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]]	_async_wrap_evaluation_matrices (Optional["distributed.Client"] client, Optional[str] tree_method, Optional[int] max_bin, **Any kwargs)

Generator	_set_worker_client ("DaskScikitLearnBase" model, "distributed.Client" client)

Variables
	dd = LazyLoader("dd", globals(), "dask.dataframe")

	da = LazyLoader("da", globals(), "dask.array")

	dask = LazyLoader("dask", globals(), "dask")

	distributed = LazyLoader("distributed", globals(), "dask.distributed")

	_DaskCollection = Union["da.Array", "dd.DataFrame", "dd.Series"]

	_DataT = Union["da.Array", "dd.DataFrame"]

	TrainReturnT

	LOGGER = logging.getLogger("[xgboost.dask]")

	_MapRetT = TypeVar("_MapRetT")

	_DataParts = List[Dict[str, Any]]

Detailed Description

Dask extensions for distributed training
----------------------------------------

See :doc:`Distributed XGBoost with Dask </tutorials/dask>` for a simple tutorial.  Also
see :doc:`/python/dask-examples/index` for some examples.

There are two sets of APIs in this module: one is the functional API, including the
``train`` and ``predict`` functions.  The other is a stateful Scikit-Learn wrapper
inherited from the single-node Scikit-Learn interface.

The implementation is heavily influenced by dask_xgboost:
https://github.com/dask/dask-xgboost

Optional dask configuration
===========================

- **xgboost.scheduler_address**: Specify the scheduler address, see :ref:`tracker-ip`.

  .. versionadded:: 1.6.0

  .. code-block:: python

      dask.config.set({"xgboost.scheduler_address": "192.0.0.100"})
      # We can also specify the port.
      dask.config.set({"xgboost.scheduler_address": "192.0.0.100:12345"})

Function Documentation

◆ _async_wrap_evaluation_matrices()

Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]] xgboost.dask._async_wrap_evaluation_matrices	(	Optional["distributed.Client"]	client,
		Optional[str]	tree_method,
		Optional[int]	max_bin,
		**Any	kwargs
	)

protected

A switch function for async environment.

◆ _create_dmatrix()

DMatrix xgboost.dask._create_dmatrix	(	Optional[FeatureNames]	feature_names,
		Optional[Union[Any, List[Any]]]	feature_types,
		Optional[Any]	feature_weights,
		float	missing,
		int	nthread,
		bool	enable_categorical,
		Optional[_DataParts]	parts
	)

protected

Get data that is local to the worker from DaskDMatrix.

Returns
-------
A DMatrix object.

◆ _get_rabit_args()

Dict[str, Union[str, int]] xgboost.dask._get_rabit_args	(	int	n_workers,
		Optional[Dict[str, Any]]	dconfig,
		"distributed.Client"	client
	)

protected

Get rabit context arguments from data distribution in DaskDMatrix.

◆ _infer_predict_output()

Tuple[Tuple[int, ...], Dict[int, str]] xgboost.dask._infer_predict_output	(	Booster	booster,
		int	features,
		bool	is_df,
		bool	inplace,
		**Any	kwargs
	)

protected

Create a dummy test sample to infer output shape for prediction.

◆ _maybe_dataframe()

Any xgboost.dask._maybe_dataframe	(	Any	data,
		Any	prediction,
		List[int]	columns,
		bool	is_df
	)

protected

Return dataframe for prediction when applicable.

◆ _set_worker_client()

Generator xgboost.dask._set_worker_client	(	"DaskScikitLearnBase"	model,
		"distributed.Client"	client
	)

protected

Temporarily set the client for sklearn model.

◆ _start_tracker()

Dict[str, Union[int, str]] xgboost.dask._start_tracker	(	int	n_workers,
		Optional[str]	addr_from_dask,
		Optional[Tuple[str, int]]	addr_from_user
	)

protected

Start Rabit tracker, recurse to try different addresses.

◆ _xgb_get_client()

"distributed.Client" xgboost.dask._xgb_get_client ( Optional["distributed.Client"] client )

protected

Simple wrapper around testing None.

◆ dconcat()

_T xgboost.dask.dconcat ( Sequence[_T] value )

Concatenate sequence of partitions.

◆ inplace_predict()

Any xgboost.dask.inplace_predict	(	Optional["distributed.Client"]	client,
		Union[TrainReturnT, Booster, "distributed.Future"]	model,
		_DataT	data,
		Tuple[int, int]	iteration_range = `(0, 0)`,
		str	predict_type = `"value"`,
		float	missing = `numpy.nan`,
		bool	validate_features = `True`,
		Optional[_DaskCollection]	base_margin = `None`,
		bool	strict_shape = `False`
	)

Inplace prediction. See doc in :py:meth:`xgboost.Booster.inplace_predict` for
details.

.. versionadded:: 1.1.0

Parameters
----------
client:
    Specify the dask client used for prediction.  Use default client
    returned from dask if it's set to None.
model:
    See :py:func:`xgboost.dask.predict` for details.
data :
    dask collection.
iteration_range:
    See :py:meth:`xgboost.Booster.predict` for details.
predict_type:
    See :py:meth:`xgboost.Booster.inplace_predict` for details.
missing:
    Value in the input data that should be treated as a missing
    value. If None, defaults to np.nan.
base_margin:
    See :py:obj:`xgboost.DMatrix` for details.

    .. versionadded:: 1.4.0

strict_shape:
    See :py:meth:`xgboost.Booster.predict` for details.

    .. versionadded:: 1.4.0

Returns
-------
prediction :
    When input data is ``dask.array.Array``, the return value is an array, when
    input data is ``dask.dataframe.DataFrame``, return value can be
    ``dask.dataframe.Series``, ``dask.dataframe.DataFrame``, depending on the output
    shape.

◆ map_worker_partitions()

List[_MapRetT] xgboost.dask.map_worker_partitions	(	Optional["distributed.Client"]	client,
		Callable[..., _MapRetT]	func,
		*Any	refs,
		Sequence[str]	workers
	)

Map a function onto partitions of each worker.

◆ predict()

Any xgboost.dask.predict	(	Optional["distributed.Client"]	client,
		Union[TrainReturnT, Booster, "distributed.Future"]	model,
		Union[DaskDMatrix, _DataT]	data,
		bool	output_margin = `False`,
		float	missing = `numpy.nan`,
		bool	pred_leaf = `False`,
		bool	pred_contribs = `False`,
		bool	approx_contribs = `False`,
		bool	pred_interactions = `False`,
		bool	validate_features = `True`,
		Tuple[int, int]	iteration_range = `(0, 0)`,
		bool	strict_shape = `False`
	)

Run prediction with a trained booster.

.. note::

    Using ``inplace_predict`` might be faster when some features are not needed.
    See :py:meth:`xgboost.Booster.predict` for details on various parameters.  When
    output has more than 2 dimensions (shap value, leaf with strict_shape), input
    should be ``da.Array`` or ``DaskDMatrix``.

.. versionadded:: 1.0.0

Parameters
----------
client:
    Specify the dask client used for prediction.  Use default client
    returned from dask if it's set to None.
model:
    The trained model.  It can be a distributed.Future so that the user can
    pre-scatter it onto all workers.
data:
    Input data used for prediction.  When input is a dataframe object,
    prediction output is a series.
missing:
    Used when input data is not DaskDMatrix.  Specify the value
    considered as missing.

Returns
-------
prediction: dask.array.Array/dask.dataframe.Series
    When input data is ``dask.array.Array`` or ``DaskDMatrix``, the return value is
    an array, when input data is ``dask.dataframe.DataFrame``, return value can be
    ``dask.dataframe.Series``, ``dask.dataframe.DataFrame``, depending on the output
    shape.

◆ train()

Any xgboost.dask.train	(	"distributed.Client"	client,
		Dict[str, Any]	params,
		DaskDMatrix	dtrain,
		int	num_boost_round = `10`,
		*Optional[Sequence[Tuple[DaskDMatrix, str]]]	evals = `None`,
		Optional[Objective]	obj = `None`,
		Optional[Metric]	feval = `None`,
		Optional[int]	early_stopping_rounds = `None`,
		Optional[Booster]	xgb_model = `None`,
		Union[int, bool]	verbose_eval = `True`,
		Optional[Sequence[TrainingCallback]]	callbacks = `None`,
		Optional[Metric]	custom_metric = `None`
	)

Train XGBoost model.

.. versionadded:: 1.0.0

.. note::

    Other parameters are the same as :py:func:`xgboost.train` except for
    `evals_result`, which is returned as part of the function's return value
    instead of being passed as an argument.

Parameters
----------
client :
    Specify the dask client used for training.  Use default client returned from
    dask if it's set to None.

Returns
-------
results: dict
    A dictionary containing trained booster and evaluation history.  `history` field
    is the same as `evals_result` from `xgboost.train`.

    .. code-block:: python

        {'booster': xgboost.Booster,
         'history': {'train': {'logloss': ['0.48253', '0.35953']},
                     'eval': {'logloss': ['0.480385', '0.357756']}}}

Variable Documentation

◆ TrainReturnT

xgboost.dask.TrainReturnT

Initial value:

=  TypedDict(
    "TrainReturnT",
    {
        "booster": Booster,
        "history": Dict,
    },
)

Data Structures

Functions

Variables

Detailed Description

Function Documentation

◆ _async_wrap_evaluation_matrices()

◆ _create_dmatrix()

◆ _get_rabit_args()

◆ _infer_predict_output()

◆ _maybe_dataframe()

◆ _set_worker_client()

◆ _start_tracker()

◆ _xgb_get_client()

◆ dconcat()

◆ inplace_predict()

◆ map_worker_partitions()

◆ predict()

◆ train()

Variable Documentation

◆ TrainReturnT