Medial Code Documentation
Loading...
Searching...
No Matches
serial_tree_learner.h
1#ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
2#define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
3#include <LightGBM/tree_learner.h>
4
5#include <LightGBM/utils/random.h>
6#include <LightGBM/utils/array_args.h>
7
8#include <LightGBM/dataset.h>
9#include <LightGBM/tree.h>
10
11#include "feature_histogram.hpp"
12#include "split_info.hpp"
13#include "data_partition.hpp"
14#include "leaf_splits.hpp"
15
16#include <cstdio>
17#include <vector>
18#include <random>
19#include <cmath>
20#include <memory>
21#ifdef USE_GPU
22// Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled.
23// This is necessary to pin the two arrays in memory and make transferring faster.
24#include <boost/align/aligned_allocator.hpp>
25#endif
26
27using namespace json11;
28
29namespace LightGBM {
30
35public:
36 explicit SerialTreeLearner(const Config* config);
37
39
40 void Init(const Dataset* train_data, bool is_constant_hessian) override;
41
42 void ResetTrainingData(const Dataset* train_data) override;
43
44 void ResetConfig(const Config* config) override;
45
46 Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian,
47 Json& forced_split_json) override;
48
49 Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
50
51 Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred,
52 const score_t* gradients, const score_t* hessians) override;
53
54 void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
55 data_partition_->SetUsedDataIndices(used_indices, num_data);
56 }
57
58 void AddPredictionToScore(const Tree* tree, double* out_score) const override {
59 if (tree->num_leaves() <= 1) { return; }
60 CHECK(tree->num_leaves() <= data_partition_->num_leaves());
61 #pragma omp parallel for schedule(static)
62 for (int i = 0; i < tree->num_leaves(); ++i) {
63 double output = static_cast<double>(tree->LeafOutput(i));
64 data_size_t cnt_leaf_data = 0;
65 auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
66 for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
67 out_score[tmp_idx[j]] += output;
68 }
69 }
70 }
71
72 void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, const double* prediction,
73 data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
74
75 void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, double prediction,
76 data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
77
78protected:
82 virtual void BeforeTrain();
83
87 virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf);
88
89 virtual void FindBestSplits();
90
91 virtual void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
92
93 virtual void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
94
102 virtual void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf);
103
104 /* Force splits with forced_split_json dict and then return num splits forced.*/
105 virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf,
106 int* right_leaf, int* cur_depth,
107 bool *aborted_last_force_split);
108
114 inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
126 std::unique_ptr<DataPartition> data_partition_;
130 std::vector<int8_t> is_feature_used_;
137
139 std::vector<SplitInfo> best_split_per_leaf_;
140
142 std::unique_ptr<LeafSplits> smaller_leaf_splits_;
144 std::unique_ptr<LeafSplits> larger_leaf_splits_;
145 std::vector<int> valid_feature_indices_;
146
147#ifdef USE_GPU
149 std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_gradients_;
151 std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
152#else
154 std::vector<score_t> ordered_gradients_;
156 std::vector<score_t> ordered_hessians_;
157#endif
158
160 std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
162 bool has_ordered_bin_ = false;
164 std::vector<char> is_data_in_leaf_;
169 int num_threads_;
170 std::vector<int> ordered_bin_indices_;
171 bool is_constant_hessian_;
172};
173
175 if (leaf_idx >= 0) {
176 return data_partition_->leaf_count(leaf_idx);
177 } else {
178 return 0;
179 }
180}
181
182} // namespace LightGBM
183#endif // LightGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
The main class of data set, which are used to traning or validation.
Definition dataset.h:278
FeatureHistogram is used to construct and store a histogram for a feature.
Definition feature_histogram.hpp:29
Definition feature_histogram.hpp:646
The interface of Objective Function.
Definition objective_function.h:13
A wrapper for random generator.
Definition random.h:15
Used for learning a tree by single machine.
Definition serial_tree_learner.h:34
std::unique_ptr< LeafSplits > smaller_leaf_splits_
stores best thresholds for all feature for smaller leaf
Definition serial_tree_learner.h:142
const score_t * gradients_
gradients of current iteration
Definition serial_tree_learner.h:122
FeatureHistogram * parent_leaf_histogram_array_
pointer to histograms array of parent of current leaves
Definition serial_tree_learner.h:132
std::unique_ptr< LeafSplits > larger_leaf_splits_
stores best thresholds for all feature for larger leaf
Definition serial_tree_learner.h:144
std::vector< std::unique_ptr< OrderedBin > > ordered_bins_
Store ordered bin.
Definition serial_tree_learner.h:160
std::vector< char > is_data_in_leaf_
is_data_in_leaf_[i] != 0 means i-th data is marked
Definition serial_tree_learner.h:164
bool has_ordered_bin_
True if has ordered bin.
Definition serial_tree_learner.h:162
FeatureHistogram * larger_leaf_histogram_array_
pointer to histograms array of larger leaf
Definition serial_tree_learner.h:136
HistogramPool histogram_pool_
used to cache historical histogram to speed up
Definition serial_tree_learner.h:166
std::vector< SplitInfo > best_split_per_leaf_
store best split points for all leaves
Definition serial_tree_learner.h:139
data_size_t num_data_
number of data
Definition serial_tree_learner.h:116
Tree * FitByExistingTree(const Tree *old_tree, const score_t *gradients, const score_t *hessians) const override
use a existing tree to fit the new gradients and hessians.
Definition serial_tree_learner.cpp:223
void Init(const Dataset *train_data, bool is_constant_hessian) override
Initialize tree learner with training dataset.
Definition serial_tree_learner.cpp:44
Tree * Train(const score_t *gradients, const score_t *hessians, bool is_constant_hessian, Json &forced_split_json) override
training tree model on dataset
Definition serial_tree_learner.cpp:157
void AddPredictionToScore(const Tree *tree, double *out_score) const override
Using last trained tree to predict score then adding to out_score;.
Definition serial_tree_learner.h:58
FeatureHistogram * smaller_leaf_histogram_array_
pointer to histograms array of smaller leaf
Definition serial_tree_learner.h:134
std::unique_ptr< DataPartition > data_partition_
training data partition on leaves
Definition serial_tree_learner.h:126
int num_features_
number of features
Definition serial_tree_learner.h:118
std::vector< int8_t > is_feature_used_
used for sub feature training, is_feature_used_[i] = false means don't used feature i
Definition serial_tree_learner.h:130
void SetBaggingData(const data_size_t *used_indices, data_size_t num_data) override
Set bagging data.
Definition serial_tree_learner.h:54
virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const
Get the number of data in a leaf.
Definition serial_tree_learner.h:174
std::vector< score_t > ordered_hessians_
hessians of current iteration, ordered for cache optimized
Definition serial_tree_learner.h:156
const score_t * hessians_
hessians of current iteration
Definition serial_tree_learner.h:124
const Dataset * train_data_
training data
Definition serial_tree_learner.h:120
Random random_
used for generate used features
Definition serial_tree_learner.h:128
virtual void Split(Tree *tree, int best_leaf, int *left_leaf, int *right_leaf)
Partition tree and data according best split.
Definition serial_tree_learner.cpp:703
void ResetConfig(const Config *config) override
Reset tree configs.
Definition serial_tree_learner.cpp:128
std::vector< score_t > ordered_gradients_
gradients of current iteration, ordered for cache optimized
Definition serial_tree_learner.h:154
virtual void BeforeTrain()
Some initial works before training.
Definition serial_tree_learner.cpp:255
const Config * config_
config of tree learner
Definition serial_tree_learner.h:168
virtual bool BeforeFindBestSplit(const Tree *tree, int left_leaf, int right_leaf)
Some initial works before FindBestSplit.
Definition serial_tree_learner.cpp:348
Interface for tree learner.
Definition tree_learner.h:23
Tree model.
Definition tree.h:20
int num_leaves() const
Get Number of leaves.
Definition tree.h:121
double LeafOutput(int leaf) const
Get the output of one leaf.
Definition tree.h:78
Definition json11.hpp:79
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Definition config.h:27