Medial Code Documentation
feature_histogram.hpp
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_

#include "split_info.hpp"

#include <LightGBM/utils/array_args.h>
#include <LightGBM/dataset.h>

#include <cstring>
#include <cmath>

namespace LightGBM {

class FeatureMetainfo {
 public:
  int num_bin;
  MissingType missing_type;
  int8_t bias = 0;
  uint32_t default_bin;
  int8_t monotone_type;
  double penalty;
  /*! \brief pointer of tree config */
  const Config* config;
  BinType bin_type;
};

/*! \brief FeatureHistogram is used to construct and store a histogram for a feature. */
class FeatureHistogram {
 public:
  FeatureHistogram() {
    data_ = nullptr;
  }

  ~FeatureHistogram() {
  }

  /*! \brief Disable copy */
  FeatureHistogram& operator=(const FeatureHistogram&) = delete;
  /*! \brief Disable copy */
  FeatureHistogram(const FeatureHistogram&) = delete;

  /*!
  * \brief Init the feature histogram
  * \param data where to store histogram data
  * \param meta meta information of this feature
  */
  void Init(HistogramBinEntry* data, const FeatureMetainfo* meta) {
    meta_ = meta;
    data_ = data;
    if (meta_->bin_type == BinType::NumericalBin) {
      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1,
                                           std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
    } else {
      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1,
                                           std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
    }
  }

  HistogramBinEntry* RawData() {
    return data_;
  }

  /*! \brief Subtract another histogram from this one, in place */
  void Subtract(const FeatureHistogram& other) {
    for (int i = 0; i < meta_->num_bin - meta_->bias; ++i) {
      data_[i].cnt -= other.data_[i].cnt;
      data_[i].sum_gradients -= other.data_[i].sum_gradients;
      data_[i].sum_hessians -= other.data_[i].sum_hessians;
    }
  }

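  // Note: Subtract enables the histogram-subtraction trick. A parent's histogram is
  // the elementwise sum of its two children's histograms, so the tree learner only
  // constructs the histogram of the smaller child and derives the larger one as
  // parent - smaller. For example, if a parent bin holds {cnt: 100, sum_gradients:
  // -12.5} and the smaller child's matching bin holds {cnt: 30, sum_gradients: -4.0},
  // subtracting leaves {cnt: 70, sum_gradients: -8.5} for the larger child without
  // another pass over the data.
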
  void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
                         SplitInfo* output) {
    output->default_left = true;
    output->gain = kMinScore;
    find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output);
    output->gain *= meta_->penalty;
  }

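  // Note: the final gain is scaled by the per-feature penalty from FeatureMetainfo,
  // so a feature with penalty 0.5 must produce twice the raw gain of an unpenalized
  // feature to win a split. The small 2 * kEpsilon added to sum_hessian guards the
  // hessian denominators inside the threshold search against a zero total.
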
  void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
                                  SplitInfo* output) {
    is_splittable_ = false;
    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
                                         meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step);
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
    if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
      if (meta_->missing_type == MissingType::Zero) {
        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, true, false);
        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, true, false);
      } else {
        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, true);
        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, false, true);
      }
    } else {
      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, false);
      // fix the direction error when there are only two bins
      if (meta_->missing_type == MissingType::NaN) {
        output->default_left = false;
      }
    }
    output->gain -= min_gain_shift;
    output->monotone_type = meta_->monotone_type;
    output->min_constraint = min_constraint;
    output->max_constraint = max_constraint;
  }

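  // Note: when missing values exist, the threshold search runs twice, once sweeping
  // right-to-left (dir = -1) and once left-to-right (dir = 1), so that missing
  // values can be tried on either side of the split. For MissingType::Zero the
  // default bin is skipped during accumulation (skip_default_bin = true); for
  // MissingType::NaN the last bin collects the NaNs and is treated as missing
  // (use_na_as_missing = true). output keeps the best result across sweeps because
  // each sweep only overwrites it on a strictly better gain.
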
  void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
                                    double min_constraint, double max_constraint,
                                    SplitInfo* output) {
    output->default_left = false;
    double best_gain = kMinScore;
    data_size_t best_left_count = 0;
    double best_sum_left_gradient = 0;
    double best_sum_left_hessian = 0;
    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step);

    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
    bool is_full_categorical = meta_->missing_type == MissingType::None;
    int used_bin = meta_->num_bin - 1 + is_full_categorical;

    std::vector<int> sorted_idx;
    double l2 = meta_->config->lambda_l2;
    bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot;
    int best_threshold = -1;
    int best_dir = 1;

    if (use_onehot) {
      for (int t = 0; t < used_bin; ++t) {
        // skip if there is not enough data, or the sum of hessians is too small
        if (data_[t].cnt < meta_->config->min_data_in_leaf
            || data_[t].sum_hessians < meta_->config->min_sum_hessian_in_leaf) continue;
        data_size_t other_count = num_data - data_[t].cnt;
        // not enough data on the other side
        if (other_count < meta_->config->min_data_in_leaf) continue;

        double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
        // sum of hessians is too small
        if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue;

        double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
        // current split gain
        double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
                                            meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
                                            min_constraint, max_constraint, 0);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark as splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_threshold = t;
          best_sum_left_gradient = data_[t].sum_gradients;
          best_sum_left_hessian = data_[t].sum_hessians + kEpsilon;
          best_left_count = data_[t].cnt;
          best_gain = current_gain;
        }
      }
    } else {
      for (int i = 0; i < used_bin; ++i) {
        if (data_[i].cnt >= meta_->config->cat_smooth) {
          sorted_idx.push_back(i);
        }
      }
      used_bin = static_cast<int>(sorted_idx.size());

      l2 += meta_->config->cat_l2;

      auto ctr_fun = [this](double sum_grad, double sum_hess) {
        return (sum_grad) / (sum_hess + meta_->config->cat_smooth);
      };
      std::sort(sorted_idx.begin(), sorted_idx.end(),
                [this, &ctr_fun](int i, int j) {
                  return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians);
                });

      std::vector<int> find_direction(1, 1);
      std::vector<int> start_position(1, 0);
      find_direction.push_back(-1);
      start_position.push_back(used_bin - 1);
      const int max_num_cat = std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2);

      is_splittable_ = false;
      for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
        auto dir = find_direction[out_i];
        auto start_pos = start_position[out_i];
        data_size_t min_data_per_group = meta_->config->min_data_per_group;
        data_size_t cnt_cur_group = 0;
        double sum_left_gradient = 0.0f;
        double sum_left_hessian = kEpsilon;
        data_size_t left_count = 0;
        for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
          auto t = sorted_idx[start_pos];
          start_pos += dir;

          sum_left_gradient += data_[t].sum_gradients;
          sum_left_hessian += data_[t].sum_hessians;
          left_count += data_[t].cnt;
          cnt_cur_group += data_[t].cnt;

          if (left_count < meta_->config->min_data_in_leaf
              || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
          data_size_t right_count = num_data - left_count;
          if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group) break;

          double sum_right_hessian = sum_hessian - sum_left_hessian;
          if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break;

          if (cnt_cur_group < min_data_per_group) continue;

          cnt_cur_group = 0;

          double sum_right_gradient = sum_gradient - sum_left_gradient;
          double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
                                              meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
                                              min_constraint, max_constraint, 0);
          if (current_gain <= min_gain_shift) continue;
          is_splittable_ = true;
          if (current_gain > best_gain) {
            best_left_count = left_count;
            best_sum_left_gradient = sum_left_gradient;
            best_sum_left_hessian = sum_left_hessian;
            best_threshold = i;
            best_gain = current_gain;
            best_dir = dir;
          }
        }
      }
    }

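    // Note: in the many-vs-many case above, categories are first filtered by
    // cat_smooth and then sorted by the smoothed gradient/hessian ratio
    // sum_grad / (sum_hess + cat_smooth), which orders categories roughly by their
    // average response. Scanning prefixes of that ordering from both ends reduces
    // the exponential subset search to a linear one, analogous to the classic
    // optimal-partition result for categorical splits.
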
    if (is_splittable_) {
      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
                                                        meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
                                                        min_constraint, max_constraint);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
                                                         sum_hessian - best_sum_left_hessian,
                                                         meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
                                                         min_constraint, max_constraint);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain - min_gain_shift;
      if (use_onehot) {
        output->num_cat_threshold = 1;
        output->cat_threshold = std::vector<uint32_t>(1, static_cast<uint32_t>(best_threshold));
      } else {
        output->num_cat_threshold = best_threshold + 1;
        output->cat_threshold = std::vector<uint32_t>(output->num_cat_threshold);
        if (best_dir == 1) {
          for (int i = 0; i < output->num_cat_threshold; ++i) {
            auto t = sorted_idx[i];
            output->cat_threshold[i] = t;
          }
        } else {
          for (int i = 0; i < output->num_cat_threshold; ++i) {
            auto t = sorted_idx[used_bin - 1 - i];
            output->cat_threshold[i] = t;
          }
        }
      }
      output->monotone_type = 0;
      output->min_constraint = min_constraint;
      output->max_constraint = max_constraint;
    }
  }

  void GatherInfoForThreshold(double sum_gradient, double sum_hessian,
                              uint32_t threshold, data_size_t num_data, SplitInfo* output) {
    if (meta_->bin_type == BinType::NumericalBin) {
      GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
                                      num_data, output);
    } else {
      GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
                                        num_data, output);
    }
  }

  void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian,
                                       uint32_t threshold, data_size_t num_data,
                                       SplitInfo* output) {
    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
                                         meta_->config->lambda_l1, meta_->config->lambda_l2,
                                         meta_->config->max_delta_step);
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;

    const int8_t bias = meta_->bias;

    double sum_right_gradient = 0.0f;
    double sum_right_hessian = kEpsilon;
    data_size_t right_count = 0;

    // decide how missing values are handled for this feature
    bool use_na_as_missing = false;
    bool skip_default_bin = false;
    if (meta_->missing_type == MissingType::Zero) {
      skip_default_bin = true;
    } else if (meta_->missing_type == MissingType::NaN) {
      use_na_as_missing = true;
    }

    int t = meta_->num_bin - 1 - bias - use_na_as_missing;
    const int t_end = 1 - bias;

    // from right to left, and we don't need data in bin0
    for (; t >= t_end; --t) {
      if (static_cast<uint32_t>(t + bias) < threshold) { break; }

      // need to skip default bin
      if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }

      sum_right_gradient += data_[t].sum_gradients;
      sum_right_hessian += data_[t].sum_hessians;
      right_count += data_[t].cnt;
    }
    double sum_left_gradient = sum_gradient - sum_right_gradient;
    double sum_left_hessian = sum_hessian - sum_right_hessian;
    data_size_t left_count = num_data - right_count;
    double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
                                           meta_->config->lambda_l1, meta_->config->lambda_l2,
                                           meta_->config->max_delta_step)
        + GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
                           meta_->config->lambda_l1, meta_->config->lambda_l2,
                           meta_->config->max_delta_step);

    // gain with split is worse than without split
    if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
      output->gain = kMinScore;
      Log::Warning("'Forced Split' will be ignored since the gain would be worse.");
      return;
    }

    // update split information
    output->threshold = threshold;
    output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
                                                      meta_->config->lambda_l1, meta_->config->lambda_l2,
                                                      meta_->config->max_delta_step);
    output->left_count = left_count;
    output->left_sum_gradient = sum_left_gradient;
    output->left_sum_hessian = sum_left_hessian - kEpsilon;
    output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient,
                                                       sum_hessian - sum_left_hessian,
                                                       meta_->config->lambda_l1, meta_->config->lambda_l2,
                                                       meta_->config->max_delta_step);
    output->right_count = num_data - left_count;
    output->right_sum_gradient = sum_gradient - sum_left_gradient;
    output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
    output->gain = current_gain;
    output->gain -= min_gain_shift;
    output->default_left = true;
  }

  void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian,
                                         uint32_t threshold, data_size_t num_data, SplitInfo* output) {
    // get SplitInfo for a given one-hot categorical split.
    output->default_left = false;
    double gain_shift = GetLeafSplitGain(
        sum_gradient, sum_hessian,
        meta_->config->lambda_l1, meta_->config->lambda_l2,
        meta_->config->max_delta_step);
    double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
    bool is_full_categorical = meta_->missing_type == MissingType::None;
    int used_bin = meta_->num_bin - 1 + is_full_categorical;
    if (threshold >= static_cast<uint32_t>(used_bin)) {
      output->gain = kMinScore;
      Log::Warning("Invalid categorical threshold split");
      return;
    }

    double l2 = meta_->config->lambda_l2;
    data_size_t left_count = data_[threshold].cnt;
    data_size_t right_count = num_data - left_count;
    double sum_left_hessian = data_[threshold].sum_hessians + kEpsilon;
    double sum_right_hessian = sum_hessian - sum_left_hessian;
    double sum_left_gradient = data_[threshold].sum_gradients;
    double sum_right_gradient = sum_gradient - sum_left_gradient;
    // current split gain
    double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
                                           meta_->config->lambda_l1, l2,
                                           meta_->config->max_delta_step)
        + GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
                           meta_->config->lambda_l1, l2,
                           meta_->config->max_delta_step);
    if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
      output->gain = kMinScore;
      Log::Warning("'Forced Split' will be ignored since the gain would be worse.");
      return;
    }

    output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
                                                      meta_->config->lambda_l1, l2,
                                                      meta_->config->max_delta_step);
    output->left_count = left_count;
    output->left_sum_gradient = sum_left_gradient;
    output->left_sum_hessian = sum_left_hessian - kEpsilon;
    output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian,
                                                       meta_->config->lambda_l1, l2,
                                                       meta_->config->max_delta_step);
    output->right_count = right_count;
    output->right_sum_gradient = sum_gradient - sum_left_gradient;
    output->right_sum_hessian = sum_right_hessian - kEpsilon;
    output->gain = current_gain - min_gain_shift;
    output->num_cat_threshold = 1;
    output->cat_threshold = std::vector<uint32_t>(1, threshold);
  }

  /*! \brief Binary size of this histogram */
  int SizeOfHistgram() const {
    return (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry);
  }

  /*! \brief Restore histogram from memory */
  void FromMemory(char* memory_data) {
    std::memcpy(data_, memory_data, (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry));
  }

  /*! \brief True if this histogram can be split */
  bool is_splittable() { return is_splittable_; }

  /*! \brief Set splittable flag for this histogram */
  void set_is_splittable(bool val) { is_splittable_ = val; }

  static double ThresholdL1(double s, double l1) {
    const double reg_s = std::max(0.0, std::fabs(s) - l1);
    return Common::Sign(s) * reg_s;
  }

  static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step) {
    double ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2);
    if (max_delta_step <= 0.0f || std::fabs(ret) <= max_delta_step) {
      return ret;
    } else {
      return Common::Sign(ret) * max_delta_step;
    }
  }

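  // Note: ThresholdL1 is the soft-thresholding operator from L1 regularization, and
  // CalculateSplittedLeafOutput is the closed-form minimizer of the regularized
  // second-order objective for a leaf:
  //   output = -sign(G) * max(0, |G| - l1) / (H + l2)
  // where G and H are the sums of gradients and hessians in the leaf. For example,
  // with G = -10, H = 8, l1 = 1, l2 = 1 the output is -(-9) / 9 = 1.0; with
  // max_delta_step = 0.5 it would be clipped to 0.5.
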
 private:
  static double GetSplitGains(double sum_left_gradients, double sum_left_hessians,
                              double sum_right_gradients, double sum_right_hessians,
                              double l1, double l2, double max_delta_step,
                              double min_constraint, double max_constraint, int8_t monotone_constraint) {
    double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
    double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
    if (((monotone_constraint > 0) && (left_output > right_output)) ||
        ((monotone_constraint < 0) && (left_output < right_output))) {
      return 0;
    }
    return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output)
        + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output);
  }

  /*! \brief Calculate leaf output, then clamp it to the [min_constraint, max_constraint] range */
  static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step,
                                            double min_constraint, double max_constraint) {
    double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step);
    if (ret < min_constraint) {
      ret = min_constraint;
    } else if (ret > max_constraint) {
      ret = max_constraint;
    }
    return ret;
  }

  /*! \brief Gain of a leaf when it takes its optimal output */
  static double GetLeafSplitGain(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step) {
    double output = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step);
    return GetLeafSplitGainGivenOutput(sum_gradients, sum_hessians, l1, l2, output);
  }

  static double GetLeafSplitGainGivenOutput(double sum_gradients, double sum_hessians, double l1, double l2, double output) {
    const double sg_l1 = ThresholdL1(sum_gradients, l1);
    return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output);
  }

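  // Note: GetLeafSplitGainGivenOutput evaluates the (negated) regularized objective
  // at a given leaf output w: -(2 * G_l1 * w + (H + l2) * w^2). Plugging in the
  // optimal w = -G_l1 / (H + l2) yields the familiar gain G_l1^2 / (H + l2), so a
  // split is accepted when G_left^2 / (H_left + l2) + G_right^2 / (H_right + l2)
  // exceeds the parent term plus min_gain_to_split (the min_gain_shift computed by
  // the callers above).
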
  void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
                                 double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) {
    const int8_t bias = meta_->bias;

    double best_sum_left_gradient = NAN;
    double best_sum_left_hessian = NAN;
    double best_gain = kMinScore;
    data_size_t best_left_count = 0;
    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);

    if (dir == -1) {
      double sum_right_gradient = 0.0f;
      double sum_right_hessian = kEpsilon;
      data_size_t right_count = 0;

      int t = meta_->num_bin - 1 - bias - use_na_as_missing;
      const int t_end = 1 - bias;

      // from right to left, and we don't need data in bin0
      for (; t >= t_end; --t) {
        // need to skip default bin
        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }

        sum_right_gradient += data_[t].sum_gradients;
        sum_right_hessian += data_[t].sum_hessians;
        right_count += data_[t].cnt;
        // skip if there is not enough data, or the sum of hessians is too small
        if (right_count < meta_->config->min_data_in_leaf
            || sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
        data_size_t left_count = num_data - right_count;
        // not enough data on the left side
        if (left_count < meta_->config->min_data_in_leaf) break;

        double sum_left_hessian = sum_hessian - sum_right_hessian;
        // sum of hessians is too small
        if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) break;

        double sum_left_gradient = sum_gradient - sum_right_gradient;
        // current split gain
        double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
                                            meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
                                            min_constraint, max_constraint, meta_->monotone_type);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark as splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          // left is <= threshold, right is > threshold. so this is t-1
          best_threshold = static_cast<uint32_t>(t - 1 + bias);
          best_gain = current_gain;
        }
      }
    } else {
      double sum_left_gradient = 0.0f;
      double sum_left_hessian = kEpsilon;
      data_size_t left_count = 0;

      int t = 0;
      const int t_end = meta_->num_bin - 2 - bias;

      if (use_na_as_missing && bias == 1) {
        sum_left_gradient = sum_gradient;
        sum_left_hessian = sum_hessian - kEpsilon;
        left_count = num_data;
        for (int i = 0; i < meta_->num_bin - bias; ++i) {
          sum_left_gradient -= data_[i].sum_gradients;
          sum_left_hessian -= data_[i].sum_hessians;
          left_count -= data_[i].cnt;
        }
        t = -1;
      }

      for (; t <= t_end; ++t) {
        // need to skip default bin
        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
        if (t >= 0) {
          sum_left_gradient += data_[t].sum_gradients;
          sum_left_hessian += data_[t].sum_hessians;
          left_count += data_[t].cnt;
        }
        // skip if there is not enough data, or the sum of hessians is too small
        if (left_count < meta_->config->min_data_in_leaf
            || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
        data_size_t right_count = num_data - left_count;
        // not enough data on the right side
        if (right_count < meta_->config->min_data_in_leaf) break;

        double sum_right_hessian = sum_hessian - sum_left_hessian;
        // sum of hessians is too small
        if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break;

        double sum_right_gradient = sum_gradient - sum_left_gradient;
        // current split gain
        double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
                                            meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
                                            min_constraint, max_constraint, meta_->monotone_type);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark as splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          best_threshold = static_cast<uint32_t>(t + bias);
          best_gain = current_gain;
        }
      }
    }

    if (is_splittable_ && best_gain > output->gain) {
      // update split information
      output->threshold = best_threshold;
      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
                                                        meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
                                                        min_constraint, max_constraint);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
                                                         sum_hessian - best_sum_left_hessian,
                                                         meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
                                                         min_constraint, max_constraint);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain;
      output->default_left = dir == -1;
    }
  }
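
  // Note on bias: when a feature's default bin is bin 0, that bin is not stored and
  // bias is 1, so data_[t] holds the statistics of bin t + bias. That is why
  // thresholds are reported as t + bias (or t - 1 + bias when sweeping from the
  // right, since "left" means values <= threshold) and why loops cover
  // num_bin - bias entries.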

  const FeatureMetainfo* meta_;
  HistogramBinEntry* data_;
  // std::vector<HistogramBinEntry> data_;
  bool is_splittable_ = true;

  std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_;
};

class HistogramPool {
 public:
  /*! \brief Constructor */
  HistogramPool() {
    cache_size_ = 0;
    total_size_ = 0;
  }

  /*! \brief Destructor */
  ~HistogramPool() {
  }

  /*!
  * \brief Reset pool size
  * \param cache_size number of histogram slots kept in memory
  * \param total_size total number of indices that may request a slot
  */
  void Reset(int cache_size, int total_size) {
    cache_size_ = cache_size;
    // at least need 2 buckets to store the smaller leaf and the larger leaf
    CHECK(cache_size_ >= 2);
    total_size_ = total_size;
    if (cache_size_ > total_size_) {
      cache_size_ = total_size_;
    }
    is_enough_ = (cache_size_ == total_size_);
    if (!is_enough_) {
      mapper_.resize(total_size_);
      inverse_mapper_.resize(cache_size_);
      last_used_time_.resize(cache_size_);
      ResetMap();
    }
  }

  /*! \brief Reset mapper */
  void ResetMap() {
    if (!is_enough_) {
      cur_time_ = 0;
      std::fill(mapper_.begin(), mapper_.end(), -1);
      std::fill(inverse_mapper_.begin(), inverse_mapper_.end(), -1);
      std::fill(last_used_time_.begin(), last_used_time_.end(), 0);
    }
  }

  void DynamicChangeSize(const Dataset* train_data, const Config* config, int cache_size, int total_size) {
    if (feature_metas_.empty()) {
      int num_feature = train_data->num_features();
      feature_metas_.resize(num_feature);
      #pragma omp parallel for schedule(static, 512) if (num_feature >= 1024)
      for (int i = 0; i < num_feature; ++i) {
        feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
        feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
        feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
        feature_metas_[i].monotone_type = train_data->FeatureMonotone(i);
        feature_metas_[i].penalty = train_data->FeaturePenalte(i);
        if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
          feature_metas_[i].bias = 1;
        } else {
          feature_metas_[i].bias = 0;
        }
        feature_metas_[i].config = config;
        feature_metas_[i].bin_type = train_data->FeatureBinMapper(i)->bin_type();
      }
    }
    uint64_t num_total_bin = train_data->NumTotalBin();
    Log::Info("Total Bins %d", static_cast<int>(num_total_bin));
    int old_cache_size = static_cast<int>(pool_.size());
    Reset(cache_size, total_size);

    if (cache_size > old_cache_size) {
      pool_.resize(cache_size);
      data_.resize(cache_size);
    }

    OMP_INIT_EX();
    #pragma omp parallel for schedule(static)
    for (int i = old_cache_size; i < cache_size; ++i) {
      OMP_LOOP_EX_BEGIN();
      pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
      data_[i].resize(num_total_bin);
      uint64_t offset = 0;
      for (int j = 0; j < train_data->num_features(); ++j) {
        offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
        pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]);
        auto num_bin = train_data->FeatureNumBin(j);
        if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
          num_bin -= 1;
        }
        offset += static_cast<uint64_t>(num_bin);
      }
      CHECK(offset == num_total_bin);
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
  }

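  // Note: each cached entry is one contiguous array of HistogramBinEntry shared by
  // all features: feature j's histogram starts at the running offset, which advances
  // by SubFeatureBinOffset(j) plus the feature's bin count (minus one when bin 0 is
  // the default bin and therefore not stored). The CHECK asserts that the per-feature
  // slices exactly tile the NumTotalBin() entries.
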
  void ResetConfig(const Config* config) {
    int size = static_cast<int>(feature_metas_.size());
    #pragma omp parallel for schedule(static, 512) if (size >= 1024)
    for (int i = 0; i < size; ++i) {
      feature_metas_[i].config = config;
    }
  }

  /*!
  * \brief Get histogram data for the given index
  * \param idx index of the data
  * \param out output histogram pointer
  * \return true if the returned slot already holds this index's data, false if a slot was (re)assigned and its contents must be rebuilt
  */
  bool Get(int idx, FeatureHistogram** out) {
    if (is_enough_) {
      *out = pool_[idx].get();
      return true;
    } else if (mapper_[idx] >= 0) {
      int slot = mapper_[idx];
      *out = pool_[slot].get();
      last_used_time_[slot] = ++cur_time_;
      return true;
    } else {
      // choose the least recently used slot
      int slot = static_cast<int>(ArrayArgs<int>::ArgMin(last_used_time_));
      *out = pool_[slot].get();
      last_used_time_[slot] = ++cur_time_;

      // reset previous mapper
      if (inverse_mapper_[slot] >= 0) mapper_[inverse_mapper_[slot]] = -1;

      // update current mapper
      mapper_[idx] = slot;
      inverse_mapper_[slot] = idx;
      return false;
    }
  }

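  // Note: this is a least-recently-used cache. Suppose cache_size_ = 2 and
  // total_size_ = 3: Get(0) and Get(1) both miss (returning false) and claim the two
  // slots; a later Get(2) evicts whichever of 0 and 1 was used longer ago, and the
  // caller must rebuild histograms for index 2 because the returned memory still
  // holds the evicted index's data.
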
  /*!
  * \brief Move data from one index to another
  * \param src_idx source index
  * \param dst_idx destination index
  */
  void Move(int src_idx, int dst_idx) {
    if (is_enough_) {
      std::swap(pool_[src_idx], pool_[dst_idx]);
      return;
    }
    if (mapper_[src_idx] < 0) {
      return;
    }
    // get slot of src idx
    int slot = mapper_[src_idx];
    // reset src_idx
    mapper_[src_idx] = -1;

    // move to dst idx
    mapper_[dst_idx] = slot;
    last_used_time_[slot] = ++cur_time_;
    inverse_mapper_[slot] = dst_idx;
  }

 private:
  std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
  std::vector<std::vector<HistogramBinEntry>> data_;
  std::vector<FeatureMetainfo> feature_metas_;
  int cache_size_;
  int total_size_;
  bool is_enough_ = false;
  std::vector<int> mapper_;
  std::vector<int> inverse_mapper_;
  std::vector<int> last_used_time_;
  int cur_time_ = 0;
};

}  // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
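
A minimal sketch of how these pieces fit together during training, assuming a fully initialized FeatureMetainfo (including its config pointer); the function name, totals, and numbers below are illustrative only and not part of the header:

#include <limits>
#include <vector>

void ExampleFindSplit(const LightGBM::FeatureMetainfo* meta) {
  using namespace LightGBM;
  // One entry per stored bin for this feature (bin 0 may be dropped; see bias).
  std::vector<HistogramBinEntry> entries(meta->num_bin - meta->bias);
  FeatureHistogram hist;
  hist.Init(entries.data(), meta);

  // ... the tree learner accumulates per-bin gradient/hessian sums into entries ...

  SplitInfo best;
  // Leaf totals are hypothetical; +/- max disables the monotone value constraints.
  hist.FindBestThreshold(/*sum_gradient=*/-25.0, /*sum_hessian=*/100.0,
                         /*num_data=*/1000,
                         -std::numeric_limits<double>::max(),
                         std::numeric_limits<double>::max(),
                         &best);
  if (hist.is_splittable()) {
    // best.threshold, best.gain, best.left_output, best.right_output are now set.
  }
}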