Medial Code Documentation
Loading...
Searching...
No Matches
tree.h
1#ifndef LIGHTGBM_TREE_H_
2#define LIGHTGBM_TREE_H_
3
4#include <LightGBM/meta.h>
5#include <LightGBM/dataset.h>
6
7#include <string>
8#include <vector>
9#include <memory>
10#include <map>
11
12namespace LightGBM {
13
14#define kCategoricalMask (1)
15#define kDefaultLeftMask (2)
16
20class Tree {
21public:
26 explicit Tree(int max_leaves);
27
33 Tree(const char* str, size_t* used_len);
34
35 ~Tree();
36
53 int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
54 double threshold_double, double left_value, double right_value,
55 int left_cnt, int right_cnt, float gain, MissingType missing_type, bool default_left);
56
73 int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
74 const uint32_t* threshold, int num_threshold, double left_value, double right_value,
75 int left_cnt, int right_cnt, float gain, MissingType missing_type);
76
78 inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
79
81 inline void SetLeafOutput(int leaf, double output) {
82 leaf_value_[leaf] = output;
83 }
84
91 void AddPredictionToScore(const Dataset* data,
92 data_size_t num_data,
93 double* score) const;
94
102 void AddPredictionToScore(const Dataset* data,
103 const data_size_t* used_data_indices,
104 data_size_t num_data, double* score) const;
105
111 inline double Predict(const double* feature_values) const;
112 inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
113
114 inline int PredictLeafIndex(const double* feature_values) const;
115 inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;
116
117
118 inline void PredictContrib(const double* feature_values, int num_features, double* output);
119
121 inline int num_leaves() const { return num_leaves_; }
122
124 inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
125
127 inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
128
129 inline double split_gain(int split_idx) const { return split_gain_[split_idx]; }
130
132 inline int data_count(int node) const { return node >= 0 ? internal_count_[node] : leaf_count_[~node]; }
133
139 inline void Shrinkage(double rate) {
140 #pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
141 for (int i = 0; i < num_leaves_; ++i) {
142 leaf_value_[i] *= rate;
143 }
144 shrinkage_ *= rate;
145 }
146
147 inline double shrinkage() const {
148 return shrinkage_;
149 }
150
151 inline void AddBias(double val) {
152 #pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
153 for (int i = 0; i < num_leaves_; ++i) {
154 leaf_value_[i] = val + leaf_value_[i];
155 }
156 // force to 1.0
157 shrinkage_ = 1.0f;
158 }
159
160 inline void AsConstantTree(double val) {
161 num_leaves_ = 1;
162 shrinkage_ = 1.0f;
163 leaf_value_[0] = val;
164 }
165
167 std::string ToString() const;
168
170 std::string ToJSON() const;
171
173 std::string ToIfElse(int index, bool predict_leaf_index) const;
174
175 inline static bool IsZero(double fval) {
176 if (fval > -kZeroThreshold && fval <= kZeroThreshold) {
177 return true;
178 } else {
179 return false;
180 }
181 }
182
183 inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
184 return (decision_type & mask) > 0;
185 }
186
187 inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
188 if (input) {
189 (*decision_type) |= mask;
190 } else {
191 (*decision_type) &= (127 - mask);
192 }
193 }
194
195 inline static int8_t GetMissingType(int8_t decision_type) {
196 return (decision_type >> 2) & 3;
197 }
198
199 inline static void SetMissingType(int8_t* decision_type, int8_t input) {
200 (*decision_type) &= 3;
201 (*decision_type) |= (input << 2);
202 }
203
204 void RecomputeMaxDepth();
205
206private:
207 std::string NumericalDecisionIfElse(int node) const;
208
209 std::string CategoricalDecisionIfElse(int node) const;
210
211 inline int NumericalDecision(double fval, int node) const {
212 uint8_t missing_type = GetMissingType(decision_type_[node]);
213 if (std::isnan(fval)) {
214 if (missing_type != 2) {
215 fval = 0.0f;
216 }
217 }
218 if ((missing_type == 1 && IsZero(fval))
219 || (missing_type == 2 && std::isnan(fval))) {
220 if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
221 return left_child_[node];
222 } else {
223 return right_child_[node];
224 }
225 }
226 if (fval <= threshold_[node]) {
227 return left_child_[node];
228 } else {
229 return right_child_[node];
230 }
231 }
232
233 inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
234 uint8_t missing_type = GetMissingType(decision_type_[node]);
235 if ((missing_type == 1 && fval == default_bin)
236 || (missing_type == 2 && fval == max_bin)) {
237 if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
238 return left_child_[node];
239 } else {
240 return right_child_[node];
241 }
242 }
243 if (fval <= threshold_in_bin_[node]) {
244 return left_child_[node];
245 } else {
246 return right_child_[node];
247 }
248 }
249
250 inline int CategoricalDecision(double fval, int node) const {
251 uint8_t missing_type = GetMissingType(decision_type_[node]);
252 int int_fval = static_cast<int>(fval);
253 if (int_fval < 0) {
254 return right_child_[node];;
255 } else if (std::isnan(fval)) {
256 // NaN is always in the right
257 if (missing_type == 2) {
258 return right_child_[node];
259 }
260 int_fval = 0;
261 }
262 int cat_idx = int(threshold_[node]);
263 if (Common::FindInBitset(cat_threshold_.data() + cat_boundaries_[cat_idx],
264 cat_boundaries_[cat_idx + 1] - cat_boundaries_[cat_idx], int_fval)) {
265 return left_child_[node];
266 }
267 return right_child_[node];
268 }
269
270 inline int CategoricalDecisionInner(uint32_t fval, int node) const {
271 int cat_idx = int(threshold_in_bin_[node]);
272 if (Common::FindInBitset(cat_threshold_inner_.data() + cat_boundaries_inner_[cat_idx],
273 cat_boundaries_inner_[cat_idx + 1] - cat_boundaries_inner_[cat_idx], fval)) {
274 return left_child_[node];
275 }
276 return right_child_[node];
277 }
278
279 inline int Decision(double fval, int node) const {
280 if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
281 return CategoricalDecision(fval, node);
282 } else {
283 return NumericalDecision(fval, node);
284 }
285 }
286
287 inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
288 if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
289 return CategoricalDecisionInner(fval, node);
290 } else {
291 return NumericalDecisionInner(fval, node, default_bin, max_bin);
292 }
293 }
294
295 inline void Split(int leaf, int feature, int real_feature,
296 double left_value, double right_value, int left_cnt, int right_cnt, float gain);
302 inline int GetLeaf(const double* feature_values) const;
303 inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;
304
306 std::string NodeToJSON(int index) const;
307
309 std::string NodeToIfElse(int index, bool predict_leaf_index) const;
310
311 std::string NodeToIfElseByMap(int index, bool predict_leaf_index) const;
312
313 double ExpectedValue() const;
314
316 inline void RecomputeLeafDepths(int node = 0, int depth = 0);
317
321 struct PathElement {
322 int feature_index;
323 double zero_fraction;
324 double one_fraction;
325
326 // note that pweight is included for convenience and is not tied with the other attributes,
327 // the pweight of the i'th path element is the permuation weight of paths with i-1 ones in them
328 double pweight;
329
330 PathElement() {}
331 PathElement(int i, double z, double o, double w) : feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
332 };
333
335 void TreeSHAP(const double *feature_values, double *phi,
336 int node, int unique_depth,
337 PathElement *parent_unique_path, double parent_zero_fraction,
338 double parent_one_fraction, int parent_feature_index) const;
339
341 static void ExtendPath(PathElement *unique_path, int unique_depth,
342 double zero_fraction, double one_fraction, int feature_index);
343
345 static void UnwindPath(PathElement *unique_path, int unique_depth, int path_index);
346
348 static double UnwoundPathSum(const PathElement *unique_path, int unique_depth, int path_index);
349
351 int max_leaves_;
353 int num_leaves_;
354 // following values used for non-leaf node
356 std::vector<int> left_child_;
358 std::vector<int> right_child_;
360 std::vector<int> split_feature_inner_;
362 std::vector<int> split_feature_;
364 std::vector<uint32_t> threshold_in_bin_;
366 std::vector<double> threshold_;
367 int num_cat_;
368 std::vector<int> cat_boundaries_inner_;
369 std::vector<uint32_t> cat_threshold_inner_;
370 std::vector<int> cat_boundaries_;
371 std::vector<uint32_t> cat_threshold_;
373 std::vector<int8_t> decision_type_;
375 std::vector<float> split_gain_;
376 // used for leaf node
378 std::vector<int> leaf_parent_;
380 std::vector<double> leaf_value_;
382 std::vector<int> leaf_count_;
384 std::vector<double> internal_value_;
386 std::vector<int> internal_count_;
388 std::vector<int> leaf_depth_;
389 double shrinkage_;
390 int max_depth_;
391};
392
393inline void Tree::Split(int leaf, int feature, int real_feature,
394 double left_value, double right_value, int left_cnt, int right_cnt, float gain) {
395 int new_node_idx = num_leaves_ - 1;
396 // update parent info
397 int parent = leaf_parent_[leaf];
398 if (parent >= 0) {
399 // if cur node is left child
400 if (left_child_[parent] == ~leaf) {
401 left_child_[parent] = new_node_idx;
402 } else {
403 right_child_[parent] = new_node_idx;
404 }
405 }
406 // add new node
407 split_feature_inner_[new_node_idx] = feature;
408 split_feature_[new_node_idx] = real_feature;
409
410 split_gain_[new_node_idx] = Common::AvoidInf(gain);
411 // add two new leaves
412 left_child_[new_node_idx] = ~leaf;
413 right_child_[new_node_idx] = ~num_leaves_;
414 // update new leaves
415 leaf_parent_[leaf] = new_node_idx;
416 leaf_parent_[num_leaves_] = new_node_idx;
417 // save current leaf value to internal node before change
418 internal_value_[new_node_idx] = leaf_value_[leaf];
419 internal_count_[new_node_idx] = left_cnt + right_cnt;
420 leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
421 leaf_count_[leaf] = left_cnt;
422 leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
423 leaf_count_[num_leaves_] = right_cnt;
424 // update leaf depth
425 leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
426 leaf_depth_[leaf]++;
427}
428
429inline double Tree::Predict(const double* feature_values) const {
430 if (num_leaves_ > 1) {
431 int leaf = GetLeaf(feature_values);
432 return LeafOutput(leaf);
433 } else {
434 return leaf_value_[0];
435 }
436}
437
438inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
439 if (num_leaves_ > 1) {
440 int leaf = GetLeafByMap(feature_values);
441 return LeafOutput(leaf);
442 } else {
443 return leaf_value_[0];
444 }
445}
446
447inline int Tree::PredictLeafIndex(const double* feature_values) const {
448 if (num_leaves_ > 1) {
449 int leaf = GetLeaf(feature_values);
450 return leaf;
451 } else {
452 return 0;
453 }
454}
455
456inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
457 if (num_leaves_ > 1) {
458 int leaf = GetLeafByMap(feature_values);
459 return leaf;
460 } else {
461 return 0;
462 }
463}
464
465inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
466 output[num_features] += ExpectedValue();
467 // Run the recursion with preallocated space for the unique path data
468 if (num_leaves_ > 1) {
469 CHECK(max_depth_ >= 0);
470 const int max_path_len = max_depth_ + 1;
471 std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
472 TreeSHAP(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
473 }
474}
475
476inline void Tree::RecomputeLeafDepths(int node, int depth) {
477 if (node == 0) leaf_depth_.resize(num_leaves());
478 if (node < 0) {
479 leaf_depth_[~node] = depth;
480 } else {
481 RecomputeLeafDepths(left_child_[node], depth + 1);
482 RecomputeLeafDepths(right_child_[node], depth + 1);
483 }
484}
485
486inline int Tree::GetLeaf(const double* feature_values) const {
487 int node = 0;
488 if (num_cat_ > 0) {
489 while (node >= 0) {
490 node = Decision(feature_values[split_feature_[node]], node);
491 }
492 } else {
493 while (node >= 0) {
494 node = NumericalDecision(feature_values[split_feature_[node]], node);
495 }
496 }
497 return ~node;
498}
499
500inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
501 int node = 0;
502 if (num_cat_ > 0) {
503 while (node >= 0) {
504 node = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
505 }
506 } else {
507 while (node >= 0) {
508 node = NumericalDecision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
509 }
510 }
511 return ~node;
512}
513
514
515} // namespace LightGBM
516
517#endif // LightGBM_TREE_H_
The main class of data set, which are used to traning or validation.
Definition dataset.h:278
Tree model.
Definition tree.h:20
int split_feature(int split_idx) const
Get feature of specific split.
Definition tree.h:127
int leaf_depth(int leaf_idx) const
Get depth of specific leaf.
Definition tree.h:124
int num_leaves() const
Get Number of leaves.
Definition tree.h:121
double Predict(const double *feature_values) const
Prediction on one record.
Definition tree.h:429
void AddPredictionToScore(const Dataset *data, data_size_t num_data, double *score) const
Adding prediction value of this tree model to scores.
Definition tree.cpp:113
int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t *threshold_bin, int num_threshold_bin, const uint32_t *threshold, int num_threshold, double left_value, double right_value, int left_cnt, int right_cnt, float gain, MissingType missing_type)
Performing a split on tree leaves, with categorical feature.
Definition tree.cpp:70
void Shrinkage(double rate)
Shrinkage for the tree's output shrinkage rate (a.k.a learning rate) is used to tune the traning proc...
Definition tree.h:139
std::string ToIfElse(int index, bool predict_leaf_index) const
Serialize this object to if-else statement.
Definition tree.cpp:353
double LeafOutput(int leaf) const
Get the output of one leaf.
Definition tree.h:78
std::string ToString() const
Serialize this object to string.
Definition tree.cpp:207
int data_count(int node) const
Get the number of data points that fall at or below this node.
Definition tree.h:132
int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, int left_cnt, int right_cnt, float gain, MissingType missing_type, bool default_left)
Performing a split on tree leaves.
Definition tree.cpp:49
void SetLeafOutput(int leaf, double output)
Set the output of one leaf.
Definition tree.h:81
std::string ToJSON() const
Serialize this object to json.
Definition tree.cpp:242
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Definition tree_shap.h:108