gbtree.h
Header defining XGBoost's gradient boosted tree (gbtree) booster: its training parameters, the layer-to-tree helpers, and the GBTree class itself.

#ifndef XGBOOST_GBM_GBTREE_H_
#define XGBOOST_GBM_GBTREE_H_

#include <dmlc/omp.h>

#include <algorithm>
#include <cstdint>  // std::int32_t
#include <map>
#include <memory>
#include <numeric>  // std::iota
#include <string>
#include <tuple>    // std::tie
#include <unordered_map>
#include <utility>
#include <vector>

#include "../common/common.h"
#include "../common/timer.h"
#include "../tree/param.h"  // TrainParam
#include "gbtree_model.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/gbm.h"
#include "xgboost/host_device_vector.h"  // HostDeviceVector
#include "xgboost/json.h"
#include "xgboost/logging.h"
#include "xgboost/parameter.h"
#include "xgboost/predictor.h"
#include "xgboost/tree_updater.h"  // TreeUpdater
namespace xgboost {
enum class TreeMethod : int {
  kAuto = 0, kApprox = 1, kExact = 2, kHist = 3,
  kGPUHist = 5
};

// boosting process types
enum class TreeProcessType : int {
  kDefault = 0,
  kUpdate = 1
};
}  // namespace xgboost

DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);

namespace xgboost::gbm {
/*! \brief training parameters */
struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
  /*! \brief tree updater sequence */
  std::string updater_seq;
  /*! \brief type of boosting process to run */
  TreeProcessType process_type;
  // tree construction method
  TreeMethod tree_method;
  // declare parameters
  DMLC_DECLARE_PARAMETER(GBTreeTrainParam) {
    DMLC_DECLARE_FIELD(updater_seq).describe("Tree updater sequence.").set_default("");
    DMLC_DECLARE_FIELD(process_type)
        .set_default(TreeProcessType::kDefault)
        .add_enum("default", TreeProcessType::kDefault)
        .add_enum("update", TreeProcessType::kUpdate)
        .describe("Whether to run the normal boosting process that creates new trees,"
                  " or to update the trees in an existing model.");
    DMLC_DECLARE_ALIAS(updater_seq, updater);
    DMLC_DECLARE_FIELD(tree_method)
        .set_default(TreeMethod::kAuto)
        .add_enum("auto", TreeMethod::kAuto)
        .add_enum("approx", TreeMethod::kApprox)
        .add_enum("exact", TreeMethod::kExact)
        .add_enum("hist", TreeMethod::kHist)
        .add_enum("gpu_hist", TreeMethod::kGPUHist)
        .describe("Choice of tree construction method.");
  }
};
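As a quick illustration of how a dmlc parameter struct like this is consumed, the sketch below fills it from string key/value pairs through XGBoostParameter::UpdateAllowUnknown. This is a minimal sketch assuming the dmlc-core parameter machinery is linked in; the helper name is hypothetical and not part of this header.

#include "xgboost/base.h"     // xgboost::Args
#include "xgboost/logging.h"  // CHECK

// Parse string arguments into the typed parameter struct.
inline xgboost::gbm::GBTreeTrainParam MakeTrainParam() {
  xgboost::gbm::GBTreeTrainParam param;
  param.UpdateAllowUnknown(
      xgboost::Args{{"tree_method", "hist"}, {"process_type", "update"}});
  // The enum fields now hold the parsed values.
  CHECK(param.tree_method == xgboost::TreeMethod::kHist);
  CHECK(param.process_type == xgboost::TreeProcessType::kUpdate);
  return param;
}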

/*! \brief training parameters */
struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
  /*! \brief type of sampling algorithm */
  int sample_type;
  /*! \brief type of normalization algorithm */
  int normalize_type;
  /*! \brief fraction of trees to drop during the dropout */
  float rate_drop;
  /*! \brief whether at least one tree should always be dropped during the dropout */
  bool one_drop;
  /*! \brief probability of skipping the dropout during an iteration */
  float skip_drop;
  // declare parameters
  DMLC_DECLARE_PARAMETER(DartTrainParam) {
    DMLC_DECLARE_FIELD(sample_type)
        .set_default(0)
        .add_enum("uniform", 0)
        .add_enum("weighted", 1)
        .describe("Different types of sampling algorithm.");
    DMLC_DECLARE_FIELD(normalize_type)
        .set_default(0)
        .add_enum("tree", 0)
        .add_enum("forest", 1)
        .describe("Different types of normalization algorithm.");
    DMLC_DECLARE_FIELD(rate_drop)
        .set_range(0.0f, 1.0f)
        .set_default(0.0f)
        .describe("Fraction of trees to drop during the dropout.");
    DMLC_DECLARE_FIELD(one_drop)
        .set_default(false)
        .describe("Whether at least one tree should always be dropped during the dropout.");
    DMLC_DECLARE_FIELD(skip_drop)
        .set_range(0.0f, 1.0f)
        .set_default(0.0f)
        .describe("Probability of skipping the dropout during a boosting iteration.");
  }
};
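To make the interplay of these knobs concrete, here is an illustrative-only sketch of a single dropout draw. The helper name and the uniform-sampling logic are assumptions for exposition; the actual DART logic lives in gbtree.cc.

#include <random>
#include <vector>

// Illustration only: one dropout draw per boosting iteration.
inline std::vector<size_t> DrawDroppedTrees(xgboost::gbm::DartTrainParam const& p,
                                            size_t n_trees, std::mt19937* rng) {
  std::vector<size_t> dropped;
  if (std::bernoulli_distribution{p.skip_drop}(*rng)) {
    return dropped;  // skip dropout entirely this iteration
  }
  std::bernoulli_distribution drop{p.rate_drop};
  for (size_t i = 0; i < n_trees; ++i) {
    if (drop(*rng)) dropped.push_back(i);  // uniform sampling case
  }
  if (p.one_drop && dropped.empty() && n_trees > 0) {
    dropped.push_back((*rng)() % n_trees);  // force at least one drop
  }
  return dropped;
}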

namespace detail {
// From here on, a layer becomes concrete trees.
inline std::pair<bst_tree_t, bst_tree_t> LayerToTree(gbm::GBTreeModel const& model,
                                                     bst_layer_t begin, bst_layer_t end) {
  CHECK(!model.iteration_indptr.empty());
  end = end == 0 ? model.BoostedRounds() : end;
  CHECK_LE(end, model.BoostedRounds()) << "Out of range for tree layers.";
  bst_tree_t tree_begin = model.iteration_indptr[begin];
  bst_tree_t tree_end = model.iteration_indptr[end];
  if (model.trees.size() != 0) {
    CHECK_LE(tree_begin, tree_end);
  }
  return {tree_begin, tree_end};
}
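A worked example of the indptr lookup, re-created as a standalone demo with hypothetical data; the real function takes a GBTreeModel rather than a raw vector.

#include <cassert>
#include <utility>
#include <vector>

// Standalone re-creation of the lookup above, for illustration only.
std::pair<int, int> LayerToTreeDemo(std::vector<int> const& indptr, int begin, int end) {
  end = end == 0 ? static_cast<int>(indptr.size()) - 1 : end;
  return {indptr[begin], indptr[end]};
}

int main() {
  // Three boosting rounds with two trees per round (e.g. num_parallel_tree=2);
  // iteration_indptr[i] is the index of the first tree of round i.
  std::vector<int> iteration_indptr{0, 2, 4, 6};
  auto [tree_begin, tree_end] = LayerToTreeDemo(iteration_indptr, 1, 2);
  assert(tree_begin == 2 && tree_end == 4);  // layer 1 owns trees [2, 4)
  return 0;
}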

// Call fn for each pair of input/output trees. Returns true if the index is
// out of bounds.
template <typename Func>
bool SliceTrees(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GBTreeModel const& model,
                Func&& fn) {
  end = end == 0 ? model.BoostedRounds() : end;
  CHECK_GE(step, 1);
  if (step > end - begin) {
    return true;
  }
  if (end > model.BoostedRounds()) {
    return true;
  }

  // Number of output layers; matches the loop count below even when the
  // range is not evenly divisible by step.
  bst_layer_t n_layers = (end - begin + step - 1) / step;
  bst_layer_t out_l = 0;

  for (bst_layer_t l = begin; l < end; l += step) {
    auto [tree_begin, tree_end] = detail::LayerToTree(model, l, l + 1);
    if (tree_end > static_cast<bst_tree_t>(model.trees.size())) {
      return true;
    }

    for (bst_tree_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
      fn(tree_idx, out_l);
    }
    ++out_l;
  }

  CHECK_EQ(out_l, n_layers);
  return false;
}
}  // namespace detail
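The slicing contract mirrors Python's range(begin, end, step). The standalone demo below replays the traversal order over a toy indptr; it is an illustration under assumed data, not code from this header.

#include <cstdio>
#include <vector>

// Illustration only: replay the SliceTrees traversal over a toy indptr.
int main() {
  std::vector<int> indptr{0, 2, 4, 6, 8, 10, 12};  // six rounds, two trees each
  int begin = 0, end = 6, step = 2, out_layer = 0;
  for (int l = begin; l < end; l += step, ++out_layer) {
    for (int t = indptr[l]; t < indptr[l + 1]; ++t) {
      std::printf("input tree %d -> output layer %d\n", t, out_layer);
    }
  }
  // Prints trees {0,1} -> layer 0, {4,5} -> layer 1, {8,9} -> layer 2.
  return 0;
}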

// gradient boosted trees
class GBTree : public GradientBooster {
 public:
  explicit GBTree(LearnerModelParam const* booster_config, Context const* ctx)
      : GradientBooster{ctx}, model_(booster_config, ctx_) {
    monitor_.Init(__func__);
  }

  // Set the configuration of gradient boosting.
  void Configure(Args const& cfg) override;
  /**
   * \brief Optionally update the leaf value.
   */
  void UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const& predictions,
                      ObjFunction const* obj, std::int32_t group_idx,
                      std::vector<HostDeviceVector<bst_node_t>> const& node_position,
                      std::vector<std::unique_ptr<RegTree>>* p_trees);
  /**
   * \brief Carry out one iteration of boosting.
   */
  void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
               PredictionCacheEntry* predt, ObjFunction const* obj) override;

  // Whether the current booster uses GPU.
  [[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }

  [[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }

  // Load the model from a binary stream.
  void Load(dmlc::Stream* fi) override { model_.Load(fi); }
  // Save the model to a binary stream.
  void Save(dmlc::Stream* fo) const override { model_.Save(fo); }
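The binary stream hooks pair naturally with dmlc's in-memory streams. Below is a hedged sketch of a save/load round trip, assuming dmlc/memory_io.h is available and the destination booster is caller-provided; the helper name is hypothetical.

#include <dmlc/memory_io.h>  // dmlc::MemoryBufferStream

#include <string>

// Round-trip the binary tree model through an in-memory buffer.
inline void RoundTripModel(xgboost::gbm::GBTree const& src, xgboost::gbm::GBTree* dst) {
  std::string buf;
  dmlc::MemoryBufferStream fo(&buf);
  src.Save(&fo);   // serialize the trees into buf
  dmlc::MemoryBufferStream fi(&buf);
  dst->Load(&fi);  // read them back
}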

  // Load configuration from a JSON object.
  void LoadConfig(Json const& in) override;
  // Save configuration to a JSON object.
  void SaveConfig(Json* p_out) const override;

  // Save the model to a JSON object.
  void SaveModel(Json* p_out) const override;
  // Load the model from a JSON object.
  void LoadModel(Json const& in) override;

  // Slice the trees by boosting index; `out` must be already allocated.
  void Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GradientBooster* out,
             bool* out_of_bound) const override;

  // Number of boosted rounds.
  [[nodiscard]] std::int32_t BoostedRounds() const override { return this->model_.BoostedRounds(); }
  // Whether the model has already been trained.
  [[nodiscard]] bool ModelFitted() const override {
    return !model_.trees.empty() || !model_.trees_to_update.empty();
  }

  void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
                        bst_layer_t layer_begin, bst_layer_t layer_end) const;

  // Generate predictions for the given feature matrix.
  void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
                    bst_layer_t layer_begin, bst_layer_t layer_end) override;

  // Inplace prediction.
  void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
                      bst_layer_t layer_begin, bst_layer_t layer_end) const override;

  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
                    std::vector<bst_feature_t>* features,
                    std::vector<float>* scores) const override {
    // Features with no importance don't appear in the return value, so we
    // need another pair of vectors to store the values during computation.
    std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
    std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
    std::vector<int32_t> tree_idx;
    if (trees.empty()) {
      // Default to all trees in the model.
      tree_idx.resize(this->model_.trees.size());
      std::iota(tree_idx.begin(), tree_idx.end(), 0);
      trees = common::Span<int32_t const>(tree_idx);
    }

    auto total_n_trees = model_.trees.size();
    auto add_score = [&](auto fn) {
      for (auto idx : trees) {
        // idx must be a valid position in model_.trees.
        CHECK_LT(idx, total_n_trees) << "Invalid tree index.";
        auto const& p_tree = model_.trees[idx];
        p_tree->WalkTree([&](bst_node_t nidx) {
          auto const& node = (*p_tree)[nidx];
          if (!node.IsLeaf()) {
            split_counts[node.SplitIndex()]++;
            fn(p_tree, nidx, node.SplitIndex());
          }
          return true;
        });
      }
    };

    if (importance_type == "weight") {
      add_score([&](auto const&, bst_node_t, bst_feature_t split) {
        gain_map[split] = split_counts[split];
      });
    } else if (importance_type == "gain" || importance_type == "total_gain") {
      add_score([&](auto const& p_tree, bst_node_t nidx, bst_feature_t split) {
        gain_map[split] += p_tree->Stat(nidx).loss_chg;
      });
    } else if (importance_type == "cover" || importance_type == "total_cover") {
      add_score([&](auto const& p_tree, bst_node_t nidx, bst_feature_t split) {
        gain_map[split] += p_tree->Stat(nidx).sum_hess;
      });
    } else {
      LOG(FATAL) << "Unknown feature importance type, expected one of: "
                 << R"({"weight", "total_gain", "total_cover", "gain", "cover"}, got: )"
                 << importance_type;
    }
    // "gain" and "cover" are averaged over the number of splits.
    if (importance_type == "gain" || importance_type == "cover") {
      for (size_t i = 0; i < gain_map.size(); ++i) {
        gain_map[i] /= std::max(1.0f, static_cast<float>(split_counts[i]));
      }
    }

    features->clear();
    scores->clear();
    for (size_t i = 0; i < split_counts.size(); ++i) {
      if (split_counts[i] != 0) {
        features->push_back(i);
        scores->push_back(gain_map[i]);
      }
    }
  }
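A hedged usage sketch of the call above, with the helper name hypothetical. Passing an empty tree span scores every tree, and for "gain" the accumulated loss_chg is divided by the split count, so a feature split on three times with loss_chg 2, 4 and 6 has weight = 3, total_gain = 12 and gain = 4.

#include <cstdint>
#include <vector>

#include "xgboost/logging.h"  // LOG

// Rank features of an already-trained booster by average gain.
inline void PrintGainImportance(xgboost::gbm::GBTree const& booster) {
  std::vector<xgboost::bst_feature_t> features;
  std::vector<float> scores;
  // An empty span selects every tree in the model.
  booster.FeatureScore("gain", xgboost::common::Span<std::int32_t const>{},
                       &features, &scores);
  for (std::size_t i = 0; i < features.size(); ++i) {
    LOG(CONSOLE) << "feature " << features[i] << ": " << scores[i];
  }
}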

  void PredictInstance(const SparsePage::Inst& inst, std::vector<bst_float>* out_preds,
                       uint32_t layer_begin, uint32_t layer_end) override {
    std::uint32_t _, tree_end;
    std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
    cpu_predictor_->PredictInstance(inst, out_preds, model_, tree_end);
  }

  void PredictLeaf(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_preds,
                   uint32_t layer_begin, uint32_t layer_end) override {
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
                               "n_iteration), use model slicing instead.";
    this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
  }

  /**
   * \brief Feature contributions to individual predictions.
   */
  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
                           bst_layer_t layer_begin, bst_layer_t layer_end,
                           bool approximate) override {
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
                               "n_iteration), use model slicing instead.";
    this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
                                                   approximate);
  }

  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
                                       bst_layer_t layer_begin, bst_layer_t layer_end,
                                       bool approximate) override {
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
                               "n_iteration), use model slicing instead.";
    this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
                                                               tree_end, nullptr, approximate);
  }
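Per the interface documentation, the contribution output holds one value per feature plus a trailing bias term for each row. Below is a hedged indexing sketch for the single-output-group case; the helper and layout assumption are illustrative, not part of this header.

#include <cstddef>
#include <vector>

// Index into a flat contributions buffer laid out as [n_rows, n_features + 1];
// column n_features holds the bias term.
inline float GetContribution(std::vector<float> const& contribs, std::size_t row,
                             std::size_t n_features, std::size_t feature) {
  return contribs[row * (n_features + 1) + feature];
}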

  // Dump the model in the requested format.
  [[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
                                                   std::string format) const override {
    return model_.DumpModel(fmap, with_stats, this->ctx_->Threads(), format);
  }

 protected:
  void BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
                     std::vector<HostDeviceVector<bst_node_t>>* out_position,
                     std::vector<std::unique_ptr<RegTree>>* ret);

  [[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
      bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
      DMatrix* f_dmat = nullptr) const;

  // commit new trees all at once
  virtual void CommitModel(TreesOneIter&& new_trees);

  // --- data structure ---
  GBTreeModel model_;
  // training parameter
  GBTreeTrainParam tparam_;
  // tree training parameter
  tree::TrainParam tree_param_;
  bool specified_updater_{false};
  // the updaters that can be applied to each tree
  std::vector<std::unique_ptr<TreeUpdater>> updaters_;
  // predictors
  std::unique_ptr<Predictor> cpu_predictor_;
  std::unique_ptr<Predictor> gpu_predictor_{nullptr};
#if defined(XGBOOST_USE_ONEAPI)
  std::unique_ptr<Predictor> oneapi_predictor_;
#endif  // defined(XGBOOST_USE_ONEAPI)
  common::Monitor monitor_;
};

}  // namespace xgboost::gbm

#endif  // XGBOOST_GBM_GBTREE_H_
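Finally, a hedged sketch of the slicing contract from the caller's side. The output booster must already be allocated by the caller (for example via GradientBooster::Create), end == 0 selects all boosted rounds, and the helper name is hypothetical.

#include "xgboost/logging.h"  // CHECK

// Keep every other layer of a trained booster; illustration only.
inline void KeepEveryOtherLayer(xgboost::gbm::GBTree const& src,
                                xgboost::GradientBooster* out) {
  bool out_of_bound = false;
  src.Slice(/*begin=*/0, /*end=*/0, /*step=*/2, out, &out_of_bound);
  CHECK(!out_of_bound) << "Slice went past the last layer.";
}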