MR_LIBS/gpu__tree__learner_8h_source.html

#ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_

#define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_


#include <LightGBM/utils/random.h>

#include <LightGBM/utils/array_args.h>

#include <LightGBM/dataset.h>

#include <LightGBM/tree.h>

#include <LightGBM/feature_group.h>

#include "feature_histogram.hpp"

#include "serial_tree_learner.h"

#include "data_partition.hpp"

#include "split_info.hpp"

#include "leaf_splits.hpp"


#include <cstdio>

#include <vector>

#include <random>

#include <cmath>

#include <memory>


#ifdef USE_GPU


// Boost.Compute configuration macros. They are defined ahead of the
// Boost.Compute #include lines that follow so those headers can see them.
// NOTE(review): exact semantics of each macro are per the Boost.Compute
// configuration documentation -- confirm there before changing any of them.

// Presumably selects the thread-safe mode of Boost.Compute.
#define BOOST_COMPUTE_THREAD_SAFE

// Presumably tells Boost.Compute that thread-local storage is available.
#define BOOST_COMPUTE_HAVE_THREAD_LOCAL

// Use Boost.Compute on-disk kernel cache

#define BOOST_COMPUTE_USE_OFFLINE_CACHE

#include <boost/compute/core.hpp>

#include <boost/compute/container/vector.hpp>

#include <boost/align/aligned_allocator.hpp>


// NOTE(review): this using-directive sits at header scope, so every file that
// includes this header also gets all json11 names injected into its global
// namespace. It is what allows Train() below to spell its parameter type as
// plain `Json`; do not remove it without qualifying that use (json11::Json)
// and checking other includers.
using namespace json11;


namespace LightGBM {


// Tree learner built on SerialTreeLearner that carries OpenCL state through
// Boost.Compute. This header holds declarations only: apart from
// SetBaggingData(), every member function is defined out of line.
class GPUTreeLearner: public SerialTreeLearner {
public:
  // Builds the learner from the given tree configuration.
  explicit GPUTreeLearner(const Config* tree_config);
  ~GPUTreeLearner();

  // Overrides of the SerialTreeLearner training interface.
  void Init(const Dataset* train_data, bool is_constant_hessian) override;
  void ResetTrainingData(const Dataset* train_data) override;
  Tree* Train(const score_t* gradients, const score_t *hessians,
              bool is_constant_hessian, Json& forced_split_json) override;

  // Forwards to SerialTreeLearner::SetBaggingData and then records whether
  // bagging is active: use_bagging_ becomes true only when an index array is
  // supplied AND its length differs from num_data_; otherwise it is false.
  void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
    SerialTreeLearner::SetBaggingData(used_indices, num_data);
    // determine if we are using bagging before we construct the data partition
    // thus we can start data movement to GPU earlier
    use_bagging_ = (used_indices != nullptr) && (num_data != num_data_);
  }

protected:
  // Overrides of the SerialTreeLearner training hooks.
  void BeforeTrain() override;
  bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
  void FindBestSplits() override;
  void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
  void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;

private:
  // Four bytes grouped into one element.
  // NOTE(review): presumably four feature bin values packed per 32-bit word
  // for the OpenCL kernels -- confirm against the .cl sources.
  struct Feature4 {
    uint8_t s[4];
  };

  // One histogram bin: two score_t sums followed by a 32-bit counter.
  // NOTE(review): presumably must match the layout used by the kernels -- confirm.
  struct GPUHistogramBinEntry {
    score_t sum_gradients;
    score_t sum_hessians;
    uint32_t cnt;
  };

  // Helper routines; all are defined out of line.
  int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);

  void InitGPU(int platform_id, int device_id);

  void AllocateGPUMemory();

  void BuildGPUKernels();

  std::string GetBuildLog(const std::string &opts);

  void SetupKernelArguments();

  void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);

  // HistType is supplied explicitly by the caller; it does not appear in the
  // parameter list, so it cannot be deduced.
  template <typename HistType>
  void WaitAndGetHistograms(HistogramBinEntry* histograms);

  // Returns bool.
  // NOTE(review): the meaning of the return value is defined in the .cpp --
  // confirm there before relying on it.
  bool ConstructGPUHistogramsAsync(
    const std::vector<int8_t>& is_feature_used,
    const data_size_t* data_indices, data_size_t num_data,
    const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians);

  const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
  int preallocd_max_num_wg_ = 1024;

  // Set by SetBaggingData(). Defaults to false so the flag has a defined
  // value even if it is read before the first SetBaggingData() call.
  bool use_bagging_ = false;

  // Boost.Compute device, context and command queue objects.
  boost::compute::device dev_;
  boost::compute::context ctx_;
  boost::compute::command_queue queue_;

  // Kernel sources: each pointer is initialized with whatever expression the
  // included .cl file expands to, so each file has to yield a string literal.
  const char *kernel256_src_ =
  #include "ocl/histogram256.cl"
  ;
  const char *kernel64_src_ =
  #include "ocl/histogram64.cl"
  ;
  const char *kernel16_src_ =
  #include "ocl/histogram16.cl"
  ;
  std::string kernel_source_;
  std::string kernel_name_;

  // Compiled kernel objects, kept in three separate collections.
  std::vector<boost::compute::kernel> histogram_kernels_;
  std::vector<boost::compute::kernel> histogram_allfeats_kernels_;
  std::vector<boost::compute::kernel> histogram_fulldata_kernels_;

  // Scalar counters and sizes. They are zero-initialized here so none of them
  // is ever read with an indeterminate value; the out-of-line code is expected
  // to assign the real values.
  int num_feature_groups_ = 0;
  int num_dense_feature_groups_ = 0;
  int dword_features_ = 0;
  int num_dense_feature4_ = 0;
  int max_num_bin_ = 0;
  int device_bin_size_ = 0;
  size_t hist_bin_entry_sz_ = 0;

  std::vector<int> dense_feature_group_map_;
  std::vector<int> sparse_feature_group_map_;
  std::vector<int> device_bin_mults_;

  std::unique_ptr<boost::compute::vector<Feature4>> device_features_;

  // Gradients, hessians and feature masks each get a "device_" buffer, a
  // "pinned_" buffer and a raw host pointer that starts out null.
  // NOTE(review): how the pinned buffers are mapped to the raw pointers is
  // decided in the .cpp -- confirm there.
  boost::compute::buffer device_gradients_;
  boost::compute::buffer pinned_gradients_;
  void * ptr_pinned_gradients_ = nullptr;
  boost::compute::buffer device_hessians_;
  boost::compute::buffer pinned_hessians_;
  void * ptr_pinned_hessians_ = nullptr;
  // Host-side mask storage allocated with 4096-byte alignment.
  std::vector<char, boost::alignment::aligned_allocator<char, 4096>> feature_masks_;
  boost::compute::buffer device_feature_masks_;
  boost::compute::buffer pinned_feature_masks_;
  void * ptr_pinned_feature_masks_ = nullptr;

  std::unique_ptr<boost::compute::vector<data_size_t>> device_data_indices_;
  std::unique_ptr<boost::compute::vector<int>> sync_counters_;
  std::unique_ptr<boost::compute::vector<char>> device_subhistograms_;
  boost::compute::buffer device_histogram_outputs_;
  // Starts out null, consistent with the other raw host pointers above.
  void * host_histogram_outputs_ = nullptr;

  // Wait lists, future and events used to track outstanding asynchronous work.
  boost::compute::wait_list kernel_wait_obj_;
  boost::compute::wait_list histograms_wait_obj_;
  boost::compute::future<void> indices_future_;
  boost::compute::event gradients_future_;
  boost::compute::event hessians_future_;
};


}  // namespace LightGBM

#else


// When GPU support is not compiled in, quit with an error message


namespace LightGBM {


// Stand-in used when the library is compiled without USE_GPU. Constructing it
// builds the SerialTreeLearner base and then reports a fatal error telling the
// user to rebuild with GPU support.
class GPUTreeLearner: public SerialTreeLearner {
public:
  // C4702 is MSVC's "unreachable code" warning; `#pragma warning` is an
  // MSVC-specific pragma, so it is only emitted for that compiler. Other
  // compilers would otherwise report an unknown pragma.
  // NOTE(review): presumably the warning fires because Log::Fatal does not
  // return -- confirm. The disable is deliberately left un-popped, exactly as
  // before, so the MSVC warning state seen by includers is unchanged.
  #ifdef _MSC_VER
  #pragma warning(disable : 4702)
  #endif
  explicit GPUTreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) {
    Log::Fatal("GPU Tree Learner was not enabled in this build.\n"
               "Please recompile with CMake option -DUSE_GPU=1");
  }
};


}  // namespace LightGBM


#endif   // USE_GPU


#endif   // LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_


LightGBM::GPUTreeLearner
Definition gpu_tree_learner.h:271

LightGBM::SerialTreeLearner
Used for learning a tree by single machine.
Definition serial_tree_learner.h:34

LightGBM::SerialTreeLearner::num_data_
data_size_t num_data_
number of data
Definition serial_tree_learner.h:116

LightGBM::SerialTreeLearner::Init
void Init(const Dataset *train_data, bool is_constant_hessian) override
Initialize tree learner with training dataset.
Definition serial_tree_learner.cpp:44

LightGBM::SerialTreeLearner::Train
Tree * Train(const score_t *gradients, const score_t *hessians, bool is_constant_hessian, Json &forced_split_json) override
training tree model on dataset
Definition serial_tree_learner.cpp:157

LightGBM::SerialTreeLearner::SetBaggingData
void SetBaggingData(const data_size_t *used_indices, data_size_t num_data) override
Set bagging data.
Definition serial_tree_learner.h:54

LightGBM::SerialTreeLearner::Split
virtual void Split(Tree *tree, int best_leaf, int *left_leaf, int *right_leaf)
Partition tree and data according to the best split.
Definition serial_tree_learner.cpp:703

LightGBM::SerialTreeLearner::BeforeTrain
virtual void BeforeTrain()
Some initial work before training.
Definition serial_tree_learner.cpp:255

LightGBM::SerialTreeLearner::BeforeFindBestSplit
virtual bool BeforeFindBestSplit(const Tree *tree, int left_leaf, int right_leaf)
Some initial work before FindBestSplit.
Definition serial_tree_learner.cpp:348

json11::Json
Definition json11.hpp:79

LightGBM
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10

LightGBM::score_t
float score_t
Type of score, and gradients.
Definition meta.h:26

LightGBM::data_size_t
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14

LightGBM::Config
Definition config.h:27