Medial Code Documentation
Loading...
Searching...
No Matches
Data Structures | Functions | Variables
xgboost.dask Namespace Reference

Data Structures

class  CommunicatorContext
 
class  DaskDeviceQuantileDMatrix
 
class  DaskDMatrix
 
class  DaskPartitionIter
 
class  DaskQuantileDMatrix
 
class  DaskScikitLearnBase
 
class  DaskXGBClassifier
 
class  DaskXGBRanker
 
class  DaskXGBRegressor
 
class  DaskXGBRFClassifier
 
class  DaskXGBRFRegressor
 

Functions

Dict[str, Union[int, str]] _try_start_tracker (int n_workers, List[Union[Optional[str], Optional[Tuple[str, int]]]] addrs)
 
Dict[str, Union[int, str]] _start_tracker (int n_workers, Optional[str] addr_from_dask, Optional[Tuple[str, int]] addr_from_user)
 
None _assert_dask_support ()
 
_T dconcat (Sequence[_T] value)
 
"distributed.Client" _xgb_get_client (Optional["distributed.Client"] client)
 
List[_MapRetT] map_worker_partitions (Optional["distributed.Client"] client, Callable[..., _MapRetT] func, *Any refs, Sequence[str] workers)
 
Dict[str, List[Any]] _get_worker_parts (_DataParts list_of_parts)
 
QuantileDMatrix _create_quantile_dmatrix (Optional[FeatureNames] feature_names, Optional[Union[Any, List[Any]]] feature_types, Optional[Any] feature_weights, float missing, int nthread, Optional[_DataParts] parts, int max_bin, bool enable_categorical, Optional[DMatrix] ref=None)
 
DMatrix _create_dmatrix (Optional[FeatureNames] feature_names, Optional[Union[Any, List[Any]]] feature_types, Optional[Any] feature_weights, float missing, int nthread, bool enable_categorical, Optional[_DataParts] parts)
 
DMatrix _dmatrix_from_list_of_parts (bool is_quantile, **Any kwargs)
 
Dict[str, Union[str, int]] _get_rabit_args (int n_workers, Optional[Dict[str, Any]] dconfig, "distributed.Client" client)
 
Optional[Dict[str, Any]] _get_dask_config ()
 
List[str] _get_workers_from_data (DaskDMatrix dtrain, Optional[Sequence[Tuple[DaskDMatrix, str]]] evals)
 
Optional[TrainReturnT] _filter_empty (Booster booster, TrainingCallback.EvalsLog local_history, bool is_valid)
 
None _check_workers_are_alive (List[str] workers, "distributed.Client" client)
 
Optional[TrainReturnT] _train_async ("distributed.Client" client, Dict[str, Any] global_config, Optional[Dict[str, Any]] dconfig, Dict[str, Any] params, DaskDMatrix dtrain, int num_boost_round, Optional[Sequence[Tuple[DaskDMatrix, str]]] evals, Optional[Objective] obj, Optional[Metric] feval, Optional[int] early_stopping_rounds, Union[int, bool] verbose_eval, Optional[Booster] xgb_model, Optional[Sequence[TrainingCallback]] callbacks, Optional[Metric] custom_metric)
 
Any train ("distributed.Client" client, Dict[str, Any] params, DaskDMatrix dtrain, int num_boost_round=10, *Optional[Sequence[Tuple[DaskDMatrix, str]]] evals=None, Optional[Objective] obj=None, Optional[Metric] feval=None, Optional[int] early_stopping_rounds=None, Optional[Booster] xgb_model=None, Union[int, bool] verbose_eval=True, Optional[Sequence[TrainingCallback]] callbacks=None, Optional[Metric] custom_metric=None)
 
bool _can_output_df (bool is_df, Tuple output_shape)
 
Any _maybe_dataframe (Any data, Any prediction, List[int] columns, bool is_df)
 
_DaskCollection _direct_predict_impl (Callable mapped_predict, "distributed.Future" booster, _DataT data, Optional[_DaskCollection] base_margin, Tuple[int,...] output_shape, Dict[int, str] meta)
 
Tuple[Tuple[int,...], Dict[int, str]] _infer_predict_output (Booster booster, int features, bool is_df, bool inplace, **Any kwargs)
 
"distributed.Future" _get_model_future ("distributed.Client" client, Union[Booster, Dict, "distributed.Future"] model)
 
_DaskCollection _predict_async ("distributed.Client" client, Dict[str, Any] global_config, Union[Booster, Dict, "distributed.Future"] model, _DataT data, bool output_margin, float missing, bool pred_leaf, bool pred_contribs, bool approx_contribs, bool pred_interactions, bool validate_features, Tuple[int, int] iteration_range, bool strict_shape)
 
Any predict (Optional["distributed.Client"] client, Union[TrainReturnT, Booster, "distributed.Future"] model, Union[DaskDMatrix, _DataT] data, bool output_margin=False, float missing=numpy.nan, bool pred_leaf=False, bool pred_contribs=False, bool approx_contribs=False, bool pred_interactions=False, bool validate_features=True, Tuple[int, int] iteration_range=(0, 0), bool strict_shape=False)
 
_DaskCollection _inplace_predict_async ("distributed.Client" client, Dict[str, Any] global_config, Union[Booster, Dict, "distributed.Future"] model, _DataT data, Tuple[int, int] iteration_range, str predict_type, float missing, bool validate_features, Optional[_DaskCollection] base_margin, bool strict_shape)
 
Any inplace_predict (Optional["distributed.Client"] client, Union[TrainReturnT, Booster, "distributed.Future"] model, _DataT data, Tuple[int, int] iteration_range=(0, 0), str predict_type="value", float missing=numpy.nan, bool validate_features=True, Optional[_DaskCollection] base_margin=None, bool strict_shape=False)
 
Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]] _async_wrap_evaluation_matrices (Optional["distributed.Client"] client, Optional[str] tree_method, Optional[int] max_bin, **Any kwargs)
 
Generator _set_worker_client ("DaskScikitLearnBase" model, "distributed.Client" client)
 

Variables

 dd = LazyLoader("dd", globals(), "dask.dataframe")
 
 da = LazyLoader("da", globals(), "dask.array")
 
 dask = LazyLoader("dask", globals(), "dask")
 
 distributed = LazyLoader("distributed", globals(), "dask.distributed")
 
 _DaskCollection = Union["da.Array", "dd.DataFrame", "dd.Series"]
 
 _DataT = Union["da.Array", "dd.DataFrame"]
 
 TrainReturnT
 
 LOGGER = logging.getLogger("[xgboost.dask]")
 
 _MapRetT = TypeVar("_MapRetT")
 
 _DataParts = List[Dict[str, Any]]
 

Detailed Description

Dask extensions for distributed training
----------------------------------------

See :doc:`Distributed XGBoost with Dask </tutorials/dask>` for a simple tutorial.  Also
see :doc:`/python/dask-examples/index` for some examples.

There are two sets of APIs in this module.  One is the functional API, which includes
the ``train`` and ``predict`` methods.  The other is a stateful Scikit-Learn wrapper
inherited from the single-node Scikit-Learn interface.

The implementation is heavily influenced by dask_xgboost:
https://github.com/dask/dask-xgboost

Optional dask configuration
===========================

- **xgboost.scheduler_address**: Specify the scheduler address, see :ref:`tracker-ip`.

  .. versionadded:: 1.6.0

  .. code-block:: python

      dask.config.set({"xgboost.scheduler_address": "192.0.0.100"})
      # We can also specify the port.
      dask.config.set({"xgboost.scheduler_address": "192.0.0.100:12345"})

Function Documentation

◆ _async_wrap_evaluation_matrices()

Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]] xgboost.dask._async_wrap_evaluation_matrices ( Optional["distributed.Client"]  client,
Optional[str]  tree_method,
Optional[int]  max_bin,
**Any  kwargs 
)
protected
A switch function for async environment.

◆ _create_dmatrix()

DMatrix xgboost.dask._create_dmatrix ( Optional[FeatureNames]  feature_names,
Optional[Union[Any, List[Any]]]  feature_types,
Optional[Any]  feature_weights,
float  missing,
int  nthread,
bool  enable_categorical,
Optional[_DataParts]  parts 
)
protected
Get data that is local to the worker from DaskDMatrix.

Returns
-------
A DMatrix object.

◆ _get_rabit_args()

Dict[str, Union[str, int]] xgboost.dask._get_rabit_args ( int  n_workers,
Optional[Dict[str, Any]]  dconfig,
"distributed.Client"   client 
)
protected
Get rabit context arguments from data distribution in DaskDMatrix.

◆ _infer_predict_output()

Tuple[Tuple[int, ...], Dict[int, str]] xgboost.dask._infer_predict_output ( Booster  booster,
int  features,
bool  is_df,
bool  inplace,
**Any   kwargs 
)
protected
Create a dummy test sample to infer output shape for prediction.

◆ _maybe_dataframe()

Any xgboost.dask._maybe_dataframe ( Any  data,
Any  prediction,
List[int]  columns,
bool   is_df 
)
protected
Return dataframe for prediction when applicable.

◆ _set_worker_client()

Generator xgboost.dask._set_worker_client ( "DaskScikitLearnBase"  model,
"distributed.Client"   client 
)
protected
Temporarily set the client for sklearn model.

◆ _start_tracker()

Dict[str, Union[int, str]] xgboost.dask._start_tracker ( int  n_workers,
Optional[str]  addr_from_dask,
Optional[Tuple[str, int]]  addr_from_user 
)
protected
Start the Rabit tracker, recursing to try different addresses.

◆ _xgb_get_client()

"distributed.Client" xgboost.dask._xgb_get_client ( Optional["distributed.Client"]  client)
protected
Simple wrapper around testing None.

◆ dconcat()

_T xgboost.dask.dconcat ( Sequence[_T]  value)
Concatenate sequence of partitions.

◆ inplace_predict()

Any xgboost.dask.inplace_predict ( Optional["distributed.Client"]  client,
Union[TrainReturnT, Booster, "distributed.Future"]  model,
_DataT  data,
Tuple[int, int]   iteration_range = (0, 0),
str   predict_type = "value",
float   missing = numpy.nan,
bool   validate_features = True,
Optional[_DaskCollection]   base_margin = None,
bool   strict_shape = False 
)
Inplace prediction. See doc in :py:meth:`xgboost.Booster.inplace_predict` for
details.

.. versionadded:: 1.1.0

Parameters
----------
client:
    Specify the dask client used for prediction.  Use the default client
    returned from dask if it's set to None.
model:
    See :py:func:`xgboost.dask.predict` for details.
data :
    dask collection.
iteration_range:
    See :py:meth:`xgboost.Booster.predict` for details.
predict_type:
    See :py:meth:`xgboost.Booster.inplace_predict` for details.
missing:
    Value in the input data that should be treated as a missing
    value. If None, defaults to np.nan.
base_margin:
    See :py:obj:`xgboost.DMatrix` for details.

    .. versionadded:: 1.4.0

strict_shape:
    See :py:meth:`xgboost.Booster.predict` for details.

    .. versionadded:: 1.4.0

Returns
-------
prediction :
    When input data is ``dask.array.Array``, the return value is an array.  When
    input data is ``dask.dataframe.DataFrame``, the return value can be
    ``dask.dataframe.Series`` or ``dask.dataframe.DataFrame``, depending on the
    output shape.

◆ map_worker_partitions()

List[_MapRetT] xgboost.dask.map_worker_partitions ( Optional["distributed.Client"]  client,
Callable[..., _MapRetT]  func,
*Any  refs,
Sequence[str]  workers 
)
Map a function onto partitions of each worker.

◆ predict()

Any xgboost.dask.predict ( Optional["distributed.Client"]  client,
Union[TrainReturnT, Booster, "distributed.Future"]  model,
Union[DaskDMatrix, _DataT]  data,
bool   output_margin = False,
float   missing = numpy.nan,
bool   pred_leaf = False,
bool   pred_contribs = False,
bool   approx_contribs = False,
bool   pred_interactions = False,
bool   validate_features = True,
Tuple[int, int]   iteration_range = (0, 0),
bool   strict_shape = False 
)
Run prediction with a trained booster.

.. note::

    Using ``inplace_predict`` might be faster when some features are not needed.
    See :py:meth:`xgboost.Booster.predict` for details on various parameters.  When
    output has more than 2 dimensions (shap value, leaf with strict_shape), input
    should be ``da.Array`` or ``DaskDMatrix``.

.. versionadded:: 1.0.0

Parameters
----------
client:
    Specify the dask client used for prediction.  Use the default client
    returned from dask if it's set to None.
model:
    The trained model.  It can be a distributed.Future so that the user can
    pre-scatter it onto all workers.
data:
    Input data used for prediction.  When input is a dataframe object,
    prediction output is a series.
missing:
    Used when input data is not DaskDMatrix.  Specify the value
    considered as missing.

Returns
-------
prediction: dask.array.Array/dask.dataframe.Series
    When input data is ``dask.array.Array`` or ``DaskDMatrix``, the return value is
    an array.  When input data is ``dask.dataframe.DataFrame``, the return value can
    be ``dask.dataframe.Series`` or ``dask.dataframe.DataFrame``, depending on the
    output shape.

◆ train()

Any xgboost.dask.train ( "distributed.Client"  client,
Dict[str, Any]  params,
DaskDMatrix  dtrain,
int   num_boost_round = 10,
*Optional[Sequence[Tuple[DaskDMatrix, str]]]   evals = None,
Optional[Objective]   obj = None,
Optional[Metric]   feval = None,
Optional[int]   early_stopping_rounds = None,
Optional[Booster]   xgb_model = None,
Union[int, bool]   verbose_eval = True,
Optional[Sequence[TrainingCallback]]   callbacks = None,
Optional[Metric]   custom_metric = None 
)
Train XGBoost model.

.. versionadded:: 1.0.0

.. note::

    Other parameters are the same as :py:func:`xgboost.train` except for
    `evals_result`, which is returned as part of the function's return value
    instead of being passed as an argument.

Parameters
----------
client :
    Specify the dask client used for training.  Use default client returned from
    dask if it's set to None.

Returns
-------
results: dict
    A dictionary containing the trained booster and evaluation history.  The `history`
    field is the same as `evals_result` from `xgboost.train`.

    .. code-block:: python

        {'booster': xgboost.Booster,
         'history': {'train': {'logloss': ['0.48253', '0.35953']},
                     'eval': {'logloss': ['0.480385', '0.357756']}}}

Variable Documentation

◆ TrainReturnT

xgboost.dask.TrainReturnT
Initial value:
= TypedDict(
    "TrainReturnT",
    {
        "booster": Booster,
        "history": Dict,
    },
)