#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_

#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>

#include <algorithm>
#include <chrono>
#include <vector>

#include "gbdt.h"
#include "score_updater.hpp"
std::chrono::duration<double, std::milli> subset_time;
std::chrono::duration<double, std::milli> re_init_tree_time;
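// Profiling accumulators: total time spent building the GOSS data subset and
// re-initializing the tree learner on it (reported by the destructor below).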
    Log::Info("GOSS::subset costs %f", subset_time.count() * 1e-3);
    Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time.count() * 1e-3);
  void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
            const std::vector<const Metric*>& training_metrics) override {
    GBDT::Init(config, train_data, objective_function, training_metrics);
  void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
                         const std::vector<const Metric*>& training_metrics) override {
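    // Both Init and ResetTrainingData delegate to the GBDT base class and then re-run the
    // GOSS-specific setup that follows (bagging check, buffer allocation, optional subset).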
    if (config_->bagging_freq > 0 && config_->bagging_fraction != 1.0f) {
      Log::Fatal("Cannot use bagging in GOSS");
    }
    Log::Info("Using GOSS");
    is_use_subset_ = false;
    bag_data_cnt = std::max(1, bag_data_cnt);
    tmp_subset_.reset(new Dataset(bag_data_cnt));
    is_use_subset_ = true;
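    // When the expected sample is small enough, a temporary Dataset (tmp_subset_) holding only the
    // sampled rows is materialized and used by the tree learner; otherwise the learner keeps the
    // full data and only receives the list of in-bag indices.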
    std::vector<score_t> tmp_gradients(cnt, 0.0f);
    size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
    top_k = std::max(1, top_k);
    ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k - 1);
    score_t threshold = tmp_gradients[top_k - 1];
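    // ArgMaxAtK partially orders tmp_gradients (per-row gradient magnitudes, accumulated over the
    // trees of the current iteration in the full source) so that the top_k-th largest value sits at
    // index top_k - 1; that value is the threshold above which rows are always kept, i.e. the
    // "top_rate" fraction of GOSS.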
    size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
    if (grad >= threshold) {
      buffer[cur_left_cnt++] = start + i;
    data_size_t sampled = cur_left_cnt - big_weight_cnt;
    data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt);
    double prob = rest_need / static_cast<double>(rest_all);
    if (cur_rand.NextFloat() < prob) {
      buffer[cur_left_cnt++] = start + i;
      size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
    buffer_right[cur_right_cnt++] = start + i;
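    // Rows at or above the threshold are always kept; the remaining rows are kept with probability
    // rest_need / rest_all (roughly the fraction needed to end up with about other_rate of the data),
    // and rejected rows are collected in buffer_right. In the full source, the kept small-gradient
    // rows also have their gradients and hessians amplified so the sampled gradient sum stays
    // approximately unbiased.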
    if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
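    // After this warm-up early-out (no GOSS subsampling during the first 1 / learning_rate
    // iterations), the data range is split into per-thread chunks of at least min_inner_size rows
    // and each chunk is sampled independently.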
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
    #pragma omp parallel for schedule(static, 1)
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt,
                                                 tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
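      // Each thread writes the indices it keeps into tmp_indices_ and the rejected ones into
      // tmp_indice_right_ (both offset by cur_start); the per-thread counts are then concatenated
      // into the in-bag index list (bag_data_indices_ in the GBDT base), and the number of kept
      // rows becomes the new bag_data_cnt_.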
    #pragma omp parallel for schedule(static, 1)
    if (!is_use_subset_) {
    auto start_time = std::chrono::steady_clock::now();
    subset_time += std::chrono::steady_clock::now() - start_time;
    start_time = std::chrono::steady_clock::now();
    re_init_tree_time += std::chrono::steady_clock::now() - start_time;
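    // When is_use_subset_ is set, the in-bag rows are copied into tmp_subset_ and the tree learner
    // is re-initialized on that smaller Dataset; subset_time and re_init_tree_time above accumulate
    // the cost of the copy and of the re-initialization, respectively.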
  std::vector<data_size_t> tmp_indice_right_;