51 if (meta_->bin_type == BinType::NumericalBin) {
52 find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical,
this, std::placeholders::_1
53 , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
55 find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical,
this, std::placeholders::_1
56 , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
68 for (
int i = 0; i < meta_->num_bin - meta_->bias; ++i) {
69 data_[i].
cnt -= other.data_[i].
cnt;
75 void FindBestThreshold(
double sum_gradient,
double sum_hessian,
data_size_t num_data,
double min_constraint,
double max_constraint,
77 output->default_left =
true;
78 output->gain = kMinScore;
79 find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output);
80 output->gain *= meta_->penalty;
83 void FindBestThresholdNumerical(
double sum_gradient,
double sum_hessian,
data_size_t num_data,
double min_constraint,
double max_constraint,
85 is_splittable_ =
false;
86 double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
88 double min_gain_shift = gain_shift + meta_->
config->min_gain_to_split;
89 if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
90 if (meta_->missing_type == MissingType::Zero) {
91 FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1,
true,
false);
92 FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1,
true,
false);
94 FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1,
false,
true);
95 FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1,
false,
true);
98 FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1,
false,
false);
100 if (meta_->missing_type == MissingType::NaN) {
101 output->default_left =
false;
104 output->gain -= min_gain_shift;
105 output->monotone_type = meta_->monotone_type;
106 output->min_constraint = min_constraint;
107 output->max_constraint = max_constraint;
110 void FindBestThresholdCategorical(
double sum_gradient,
double sum_hessian,
data_size_t num_data,
111 double min_constraint,
double max_constraint,
113 output->default_left =
false;
114 double best_gain = kMinScore;
116 double best_sum_left_gradient = 0;
117 double best_sum_left_hessian = 0;
118 double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->
config->lambda_l1, meta_->
config->lambda_l2, meta_->
config->max_delta_step);
120 double min_gain_shift = gain_shift + meta_->
config->min_gain_to_split;
121 bool is_full_categorical = meta_->missing_type == MissingType::None;
122 int used_bin = meta_->num_bin - 1 + is_full_categorical;
124 std::vector<int> sorted_idx;
125 double l2 = meta_->
config->lambda_l2;
126 bool use_onehot = meta_->num_bin <= meta_->
config->max_cat_to_onehot;
127 int best_threshold = -1;
131 for (
int t = 0; t < used_bin; ++t) {
133 if (data_[t].cnt < meta_->config->min_data_in_leaf
134 || data_[t].sum_hessians < meta_->
config->min_sum_hessian_in_leaf)
continue;
137 if (other_count < meta_->config->min_data_in_leaf)
continue;
139 double sum_other_hessian = sum_hessian - data_[t].
sum_hessians - kEpsilon;
141 if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf)
continue;
143 double sum_other_gradient = sum_gradient - data_[t].
sum_gradients;
145 double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
146 meta_->
config->lambda_l1, l2, meta_->
config->max_delta_step,
147 min_constraint, max_constraint, 0);
149 if (current_gain <= min_gain_shift)
continue;
152 is_splittable_ =
true;
154 if (current_gain > best_gain) {
157 best_sum_left_hessian = data_[t].
sum_hessians + kEpsilon;
158 best_left_count = data_[t].
cnt;
159 best_gain = current_gain;
163 for (
int i = 0; i < used_bin; ++i) {
164 if (data_[i].cnt >= meta_->
config->cat_smooth) {
165 sorted_idx.push_back(i);
168 used_bin =
static_cast<int>(sorted_idx.size());
170 l2 += meta_->
config->cat_l2;
172 auto ctr_fun = [
this](
double sum_grad,
double sum_hess) {
173 return (sum_grad) / (sum_hess + meta_->
config->cat_smooth);
175 std::sort(sorted_idx.begin(), sorted_idx.end(),
176 [
this, &ctr_fun](
int i,
int j) {
177 return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians);
180 std::vector<int> find_direction(1, 1);
181 std::vector<int> start_position(1, 0);
182 find_direction.push_back(-1);
183 start_position.push_back(used_bin - 1);
184 const int max_num_cat = std::min(meta_->
config->max_cat_threshold, (used_bin + 1) / 2);
186 is_splittable_ =
false;
187 for (
size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
188 auto dir = find_direction[out_i];
189 auto start_pos = start_position[out_i];
192 double sum_left_gradient = 0.0f;
193 double sum_left_hessian = kEpsilon;
195 for (
int i = 0; i < used_bin && i < max_num_cat; ++i) {
196 auto t = sorted_idx[start_pos];
201 left_count += data_[t].
cnt;
202 cnt_cur_group += data_[t].
cnt;
204 if (left_count < meta_->config->min_data_in_leaf
205 || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf)
continue;
207 if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group)
break;
209 double sum_right_hessian = sum_hessian - sum_left_hessian;
210 if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf)
break;
212 if (cnt_cur_group < min_data_per_group)
continue;
216 double sum_right_gradient = sum_gradient - sum_left_gradient;
217 double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
218 meta_->
config->lambda_l1, l2, meta_->
config->max_delta_step,
219 min_constraint, max_constraint, 0);
220 if (current_gain <= min_gain_shift)
continue;
221 is_splittable_ =
true;
222 if (current_gain > best_gain) {
223 best_left_count = left_count;
224 best_sum_left_gradient = sum_left_gradient;
225 best_sum_left_hessian = sum_left_hessian;
227 best_gain = current_gain;
234 if (is_splittable_) {
235 output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
236 meta_->
config->lambda_l1, l2, meta_->
config->max_delta_step,
237 min_constraint, max_constraint);
238 output->left_count = best_left_count;
239 output->left_sum_gradient = best_sum_left_gradient;
240 output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
241 output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
242 sum_hessian - best_sum_left_hessian,
243 meta_->
config->lambda_l1, l2, meta_->
config->max_delta_step,
244 min_constraint, max_constraint);
245 output->right_count = num_data - best_left_count;
246 output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
247 output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
248 output->gain = best_gain - min_gain_shift;
250 output->num_cat_threshold = 1;
251 output->cat_threshold = std::vector<uint32_t>(1,
static_cast<uint32_t
>(best_threshold));
253 output->num_cat_threshold = best_threshold + 1;
254 output->cat_threshold = std::vector<uint32_t>(output->num_cat_threshold);
256 for (
int i = 0; i < output->num_cat_threshold; ++i) {
257 auto t = sorted_idx[i];
258 output->cat_threshold[i] = t;
261 for (
int i = 0; i < output->num_cat_threshold; ++i) {
262 auto t = sorted_idx[used_bin - 1 - i];
263 output->cat_threshold[i] = t;
267 output->monotone_type = 0;
268 output->min_constraint = min_constraint;
269 output->max_constraint = max_constraint;
273 void GatherInfoForThreshold(
double sum_gradient,
double sum_hessian,
274 uint32_t threshold,
data_size_t num_data, SplitInfo *output) {
275 if (meta_->bin_type == BinType::NumericalBin) {
276 GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
279 GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
284 void GatherInfoForThresholdNumerical(
double sum_gradient,
double sum_hessian,
287 double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
289 meta_->
config->max_delta_step);
290 double min_gain_shift = gain_shift + meta_->
config->min_gain_to_split;
293 const int8_t bias = meta_->bias;
295 double sum_right_gradient = 0.0f;
296 double sum_right_hessian = kEpsilon;
300 bool use_na_as_missing =
false;
301 bool skip_default_bin =
false;
302 if (meta_->missing_type == MissingType::Zero) {
303 skip_default_bin =
true;
304 }
else if (meta_->missing_type == MissingType::NaN) {
305 use_na_as_missing =
true;
308 int t = meta_->num_bin - 1 - bias - use_na_as_missing;
309 const int t_end = 1 - bias;
312 for (; t >= t_end; --t) {
313 if (
static_cast<uint32_t
>(t + bias) < threshold) {
break; }
316 if (skip_default_bin && (t + bias) ==
static_cast<int>(meta_->default_bin)) {
continue; }
320 right_count += data_[t].
cnt;
322 double sum_left_gradient = sum_gradient - sum_right_gradient;
323 double sum_left_hessian = sum_hessian - sum_right_hessian;
325 double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
327 meta_->
config->max_delta_step)
328 + GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
330 meta_->
config->max_delta_step);
333 if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
334 output->gain = kMinScore;
335 Log::Warning(
"'Forced Split' will be ignored since the gain getting worse. ");
340 output->threshold = threshold;
341 output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
343 meta_->
config->max_delta_step);
344 output->left_count = left_count;
345 output->left_sum_gradient = sum_left_gradient;
346 output->left_sum_hessian = sum_left_hessian - kEpsilon;
347 output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient,
348 sum_hessian - sum_left_hessian,
350 meta_->
config->max_delta_step);
351 output->right_count = num_data - left_count;
352 output->right_sum_gradient = sum_gradient - sum_left_gradient;
353 output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
354 output->gain = current_gain;
355 output->gain -= min_gain_shift;
356 output->default_left =
true;
359 void GatherInfoForThresholdCategorical(
double sum_gradient,
double sum_hessian,
360 uint32_t threshold,
data_size_t num_data, SplitInfo *output) {
362 output->default_left =
false;
363 double gain_shift = GetLeafSplitGain(
364 sum_gradient, sum_hessian,
366 meta_->
config->max_delta_step);
367 double min_gain_shift = gain_shift + meta_->
config->min_gain_to_split;
368 bool is_full_categorical = meta_->missing_type == MissingType::None;
369 int used_bin = meta_->num_bin - 1 + is_full_categorical;
370 if (threshold >=
static_cast<uint32_t
>(used_bin)) {
371 output->gain = kMinScore;
372 Log::Warning(
"Invalid categorical threshold split");
376 double l2 = meta_->
config->lambda_l2;
379 double sum_left_hessian = data_[threshold].
sum_hessians + kEpsilon;
380 double sum_right_hessian = sum_hessian - sum_left_hessian;
382 double sum_right_gradient = sum_gradient - sum_left_gradient;
384 double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
385 meta_->
config->lambda_l1, l2,
386 meta_->
config->max_delta_step)
387 + GetLeafSplitGain(sum_left_gradient, sum_right_hessian,
388 meta_->
config->lambda_l1, l2,
389 meta_->
config->max_delta_step);
390 if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
391 output->gain = kMinScore;
392 Log::Warning(
"'Forced Split' will be ignored since the gain getting worse. ");
396 output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
397 meta_->
config->lambda_l1, l2,
398 meta_->
config->max_delta_step);
399 output->left_count = left_count;
400 output->left_sum_gradient = sum_left_gradient;
401 output->left_sum_hessian = sum_left_hessian - kEpsilon;
402 output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian,
403 meta_->
config->lambda_l1, l2,
404 meta_->
config->max_delta_step);
405 output->right_count = right_count;
406 output->right_sum_gradient = sum_gradient - sum_left_gradient;
407 output->right_sum_hessian = sum_right_hessian - kEpsilon;
408 output->gain = current_gain - min_gain_shift;
409 output->num_cat_threshold = 1;
410 output->cat_threshold = std::vector<uint32_t>(1, threshold);
425 std::memcpy(data_, memory_data, (meta_->num_bin - meta_->bias) *
sizeof(
HistogramBinEntry));
438 static double ThresholdL1(
double s,
double l1) {
439 const double reg_s = std::max(0.0, std::fabs(s) - l1);
440 return Common::Sign(s) * reg_s;
443 static double CalculateSplittedLeafOutput(
double sum_gradients,
double sum_hessians,
double l1,
double l2,
double max_delta_step) {
444 double ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2);
445 if (max_delta_step <= 0.0f || std::fabs(ret) <= max_delta_step) {
448 return Common::Sign(ret) * max_delta_step;
453 static double GetSplitGains(
double sum_left_gradients,
double sum_left_hessians,
454 double sum_right_gradients,
double sum_right_hessians,
455 double l1,
double l2,
double max_delta_step,
456 double min_constraint,
double max_constraint, int8_t monotone_constraint) {
457 double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
458 double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
459 if (((monotone_constraint > 0) && (left_output > right_output)) ||
460 ((monotone_constraint < 0) && (left_output < right_output))) {
463 return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output)
464 + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output);
473 static double CalculateSplittedLeafOutput(
double sum_gradients,
double sum_hessians,
double l1,
double l2,
double max_delta_step,
474 double min_constraint,
double max_constraint) {
475 double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step);
476 if (ret < min_constraint) {
477 ret = min_constraint;
478 }
else if (ret > max_constraint) {
479 ret = max_constraint;
490 static double GetLeafSplitGain(
double sum_gradients,
double sum_hessians,
double l1,
double l2,
double max_delta_step) {
491 double output = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step);
492 return GetLeafSplitGainGivenOutput(sum_gradients, sum_hessians, l1, l2, output);
495 static double GetLeafSplitGainGivenOutput(
double sum_gradients,
double sum_hessians,
double l1,
double l2,
double output) {
496 const double sg_l1 = ThresholdL1(sum_gradients, l1);
497 return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output);
500 void FindBestThresholdSequence(
double sum_gradient,
double sum_hessian,
data_size_t num_data,
double min_constraint,
double max_constraint,
501 double min_gain_shift, SplitInfo* output,
int dir,
bool skip_default_bin,
bool use_na_as_missing) {
502 const int8_t bias = meta_->bias;
504 double best_sum_left_gradient = NAN;
505 double best_sum_left_hessian = NAN;
506 double best_gain = kMinScore;
508 uint32_t best_threshold =
static_cast<uint32_t
>(meta_->num_bin);
511 double sum_right_gradient = 0.0f;
512 double sum_right_hessian = kEpsilon;
515 int t = meta_->num_bin - 1 - bias - use_na_as_missing;
516 const int t_end = 1 - bias;
519 for (; t >= t_end; --t) {
521 if (skip_default_bin && (t + bias) ==
static_cast<int>(meta_->default_bin)) {
continue; }
525 right_count += data_[t].
cnt;
527 if (right_count < meta_->config->min_data_in_leaf
528 || sum_right_hessian < meta_->config->min_sum_hessian_in_leaf)
continue;
531 if (left_count < meta_->config->min_data_in_leaf)
break;
533 double sum_left_hessian = sum_hessian - sum_right_hessian;
535 if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf)
break;
537 double sum_left_gradient = sum_gradient - sum_right_gradient;
539 double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
541 min_constraint, max_constraint, meta_->monotone_type);
543 if (current_gain <= min_gain_shift)
continue;
546 is_splittable_ =
true;
548 if (current_gain > best_gain) {
549 best_left_count = left_count;
550 best_sum_left_gradient = sum_left_gradient;
551 best_sum_left_hessian = sum_left_hessian;
553 best_threshold =
static_cast<uint32_t
>(t - 1 + bias);
554 best_gain = current_gain;
558 double sum_left_gradient = 0.0f;
559 double sum_left_hessian = kEpsilon;
563 const int t_end = meta_->num_bin - 2 - bias;
565 if (use_na_as_missing && bias == 1) {
566 sum_left_gradient = sum_gradient;
567 sum_left_hessian = sum_hessian - kEpsilon;
568 left_count = num_data;
569 for (
int i = 0; i < meta_->num_bin - bias; ++i) {
572 left_count -= data_[i].
cnt;
577 for (; t <= t_end; ++t) {
579 if (skip_default_bin && (t + bias) ==
static_cast<int>(meta_->default_bin)) {
continue; }
583 left_count += data_[t].
cnt;
586 if (left_count < meta_->config->min_data_in_leaf
587 || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf)
continue;
590 if (right_count < meta_->config->min_data_in_leaf)
break;
592 double sum_right_hessian = sum_hessian - sum_left_hessian;
594 if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf)
break;
596 double sum_right_gradient = sum_gradient - sum_left_gradient;
598 double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
600 min_constraint, max_constraint, meta_->monotone_type);
602 if (current_gain <= min_gain_shift)
continue;
605 is_splittable_ =
true;
607 if (current_gain > best_gain) {
608 best_left_count = left_count;
609 best_sum_left_gradient = sum_left_gradient;
610 best_sum_left_hessian = sum_left_hessian;
611 best_threshold =
static_cast<uint32_t
>(t + bias);
612 best_gain = current_gain;
617 if (is_splittable_ && best_gain > output->gain) {
619 output->threshold = best_threshold;
620 output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
622 min_constraint, max_constraint);
623 output->left_count = best_left_count;
624 output->left_sum_gradient = best_sum_left_gradient;
625 output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
626 output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
627 sum_hessian - best_sum_left_hessian,
629 min_constraint, max_constraint);
630 output->right_count = num_data - best_left_count;
631 output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
632 output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
633 output->gain = best_gain;
634 output->default_left = dir == -1;
638 const FeatureMetainfo* meta_;
640 HistogramBinEntry* data_;
642 bool is_splittable_ =
true;
644 std::function<void(
double,
double,
data_size_t,
double,
double, SplitInfo*)> find_best_threshold_fun_;