1#ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_
2#define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_
4#include <LightGBM/utils/random.h>
5#include <LightGBM/utils/array_args.h>
6#include <LightGBM/dataset.h>
7#include <LightGBM/tree.h>
8#include <LightGBM/feature_group.h>
9#include "feature_histogram.hpp"
10#include "serial_tree_learner.h"
11#include "data_partition.hpp"
12#include "split_info.hpp"
13#include "leaf_splits.hpp"
23#define BOOST_COMPUTE_THREAD_SAFE
24#define BOOST_COMPUTE_HAVE_THREAD_LOCAL
26#define BOOST_COMPUTE_USE_OFFLINE_CACHE
27#include <boost/compute/core.hpp>
28#include <boost/compute/container/vector.hpp>
29#include <boost/align/aligned_allocator.hpp>
31using namespace json11;
38class GPUTreeLearner:
public SerialTreeLearner {
40 explicit GPUTreeLearner(
const Config* tree_config);
42 void Init(
const Dataset* train_data,
bool is_constant_hessian)
override;
43 void ResetTrainingData(
const Dataset* train_data)
override;
45 bool is_constant_hessian,
Json& forced_split_json)
override;
51 if (used_indices !=
nullptr) {
63 void FindBestSplits()
override;
64 void Split(Tree* tree,
int best_Leaf,
int* left_leaf,
int* right_leaf)
override;
65 void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
bool use_subtract)
override;
74 struct GPUHistogramBinEntry {
85 int GetNumWorkgroupsPerFeature(
data_size_t leaf_num_data);
93 void InitGPU(
int platform_id,
int device_id);
98 void AllocateGPUMemory();
103 void BuildGPUKernels();
110 std::string GetBuildLog(
const std::string &opts);
115 void SetupKernelArguments();
123 void GPUHistogram(
data_size_t leaf_num_data,
bool use_all_features);
129 template <
typename HistType>
130 void WaitAndGetHistograms(HistogramBinEntry* histograms);
147 bool ConstructGPUHistogramsAsync(
148 const std::vector<int8_t>& is_feature_used,
155 const int kMaxLogWorkgroupsPerFeature = 10;
158 int preallocd_max_num_wg_ = 1024;
164 boost::compute::device dev_;
166 boost::compute::context ctx_;
168 boost::compute::command_queue queue_;
170 const char *kernel256_src_ =
171 #include "ocl/histogram256.cl"
174 const char *kernel64_src_ =
175 #include "ocl/histogram64.cl"
178 const char *kernel16_src_ =
179 #include "ocl/histogram16.cl"
182 std::string kernel_source_;
184 std::string kernel_name_;
188 std::vector<boost::compute::kernel> histogram_kernels_;
191 std::vector<boost::compute::kernel> histogram_allfeats_kernels_;
194 std::vector<boost::compute::kernel> histogram_fulldata_kernels_;
196 int num_feature_groups_;
198 int num_dense_feature_groups_;
206 int num_dense_feature4_;
211 int device_bin_size_;
213 size_t hist_bin_entry_sz_;
215 std::vector<int> dense_feature_group_map_;
217 std::vector<int> sparse_feature_group_map_;
219 std::vector<int> device_bin_mults_;
221 std::unique_ptr<boost::compute::vector<Feature4>> device_features_;
223 boost::compute::buffer device_gradients_;
225 boost::compute::buffer pinned_gradients_;
227 void * ptr_pinned_gradients_ =
nullptr;
229 boost::compute::buffer device_hessians_;
231 boost::compute::buffer pinned_hessians_;
233 void * ptr_pinned_hessians_ =
nullptr;
235 std::vector<char, boost::alignment::aligned_allocator<char, 4096>> feature_masks_;
237 boost::compute::buffer device_feature_masks_;
239 boost::compute::buffer pinned_feature_masks_;
241 void * ptr_pinned_feature_masks_ =
nullptr;
243 std::unique_ptr<boost::compute::vector<data_size_t>> device_data_indices_;
245 std::unique_ptr<boost::compute::vector<int>> sync_counters_;
247 std::unique_ptr<boost::compute::vector<char>> device_subhistograms_;
249 boost::compute::buffer device_histogram_outputs_;
251 void * host_histogram_outputs_;
253 boost::compute::wait_list kernel_wait_obj_;
255 boost::compute::wait_list histograms_wait_obj_;
257 boost::compute::future<void> indices_future_;
259 boost::compute::event gradients_future_;
261 boost::compute::event hessians_future_;
273 #pragma warning(disable : 4702)
275 Log::Fatal(
"GPU Tree Learner was not enabled in this build.\n"
276 "Please recompile with CMake option -DUSE_GPU=1");
Definition gpu_tree_learner.h:271
Used for learning a tree on a single machine.
Definition serial_tree_learner.h:34
data_size_t num_data_
number of data
Definition serial_tree_learner.h:116
void Init(const Dataset *train_data, bool is_constant_hessian) override
Initialize tree learner with training dataset.
Definition serial_tree_learner.cpp:44
Tree * Train(const score_t *gradients, const score_t *hessians, bool is_constant_hessian, Json &forced_split_json) override
Train a tree model on the dataset.
Definition serial_tree_learner.cpp:157
void SetBaggingData(const data_size_t *used_indices, data_size_t num_data) override
Set bagging data.
Definition serial_tree_learner.h:54
virtual void Split(Tree *tree, int best_leaf, int *left_leaf, int *right_leaf)
Partition tree and data according to the best split.
Definition serial_tree_learner.cpp:703
virtual void BeforeTrain()
Some initial work before training.
Definition serial_tree_learner.cpp:255
virtual bool BeforeFindBestSplit(const Tree *tree, int left_leaf, int right_leaf)
Some initial work before FindBestSplit.
Definition serial_tree_learner.cpp:348
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of scores and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size; it is better to use a signed type.
Definition meta.h:14