1#ifndef LIGHTGBM_BOOSTING_GBDT_H_
2#define LIGHTGBM_BOOSTING_GBDT_H_
4#include <LightGBM/boosting.h>
5#include <LightGBM/objective_function.h>
6#include <LightGBM/prediction_early_stop.h>
7#include <LightGBM/json11.hpp>
9#include "score_updater.hpp"
19using namespace json11;
47 const std::vector<const Metric*>& training_metrics)
override;
54 auto other_gbdt =
reinterpret_cast<const GBDT*
>(other);
56 auto original_models = std::move(
models_);
57 models_ = std::vector<std::unique_ptr<Tree>>();
59 for (
const auto& tree : other_gbdt->models_) {
60 auto new_tree = std::unique_ptr<Tree>(
new Tree(*(tree.get())));
61 models_.push_back(std::move(new_tree));
65 for (
const auto& tree : original_models) {
66 auto new_tree = std::unique_ptr<Tree>(
new Tree(*(tree.get())));
67 models_.push_back(std::move(new_tree));
74 start_iter = std::max(0, start_iter);
76 end_iter = total_iter;
78 end_iter = std::min(total_iter, end_iter);
79 auto original_models = std::move(
models_);
80 std::vector<int> indices(total_iter);
81 for (
int i = 0; i < total_iter; ++i) {
85 for (
int i = start_iter; i < end_iter - 1; ++i) {
86 int j = tmp_rand.
NextShort(i + 1, end_iter);
89 models_ = std::vector<std::unique_ptr<Tree>>();
90 for (
int i = 0; i < total_iter; ++i) {
93 auto new_tree = std::unique_ptr<Tree>(
new Tree(*(original_models[tree_idx].get())));
94 models_.push_back(std::move(new_tree));
106 const std::vector<const Metric*>& training_metrics)
override;
120 const std::vector<const Metric*>& valid_metrics)
override;
127 void Train(
int snapshot_freq,
const std::string& model_output_path)
override;
129 void RefitTree(
const std::vector<std::vector<int>>& tree_leaf_prediction)
override;
166 std::vector<double>
GetEvalAt(
int data_idx)
const override;
195 void GetPredictAt(
int data_idx,
double* out_result, int64_t* out_len)
override;
204 inline int NumPredictOneRow(
int num_iteration,
bool is_pred_leaf,
bool is_pred_contrib)
const override {
208 if (num_iteration > 0) {
209 num_preb_in_one_row *=
static_cast<int>(std::min(max_iteration, num_iteration));
211 num_preb_in_one_row *= max_iteration;
213 }
else if (is_pred_contrib) {
216 return num_preb_in_one_row;
219 void PredictRaw(
const double* features,
double* output,
222 void PredictRawByMap(
const std::unordered_map<int, double>& features,
double* output,
225 void Predict(
const double* features,
double* output,
228 void PredictByMap(
const std::unordered_map<int, double>& features,
double* output,
231 void PredictLeafIndex(
const double* features,
double* output)
const override;
233 void PredictLeafIndexByMap(
const std::unordered_map<int, double>& features,
double* output)
const override;
244 std::string
DumpModel(
int start_iteration,
int num_iteration)
const override;
268 virtual bool SaveModelToFile(
int start_iteration,
int num_iterations,
const char* filename)
const override;
277 virtual std::string
SaveModelToString(
int start_iteration,
int num_iterations)
const override;
291 std::vector<double>
FeatureImportance(
int num_iteration,
int importance_type)
const override;
329 inline void InitPredict(
int num_iteration,
bool is_pred_contrib)
override {
331 if (num_iteration > 0) {
334 if (is_pred_contrib) {
335 #pragma omp parallel for schedule(static)
336 for (
int i = 0; i < static_cast<int>(
models_.size()); ++i) {
337 models_[i]->RecomputeMaxDepth();
342 inline double GetLeafValue(
int tree_idx,
int leaf_idx)
const override {
343 CHECK(tree_idx >= 0 &&
static_cast<size_t>(tree_idx) <
models_.size());
344 CHECK(leaf_idx >= 0 && leaf_idx <
models_[tree_idx]->num_leaves());
345 return models_[tree_idx]->LeafOutput(leaf_idx);
348 inline void SetLeafValue(
int tree_idx,
int leaf_idx,
double val)
override {
349 CHECK(tree_idx >= 0 &&
static_cast<size_t>(tree_idx) <
models_.size());
350 CHECK(leaf_idx >= 0 && leaf_idx <
models_[tree_idx]->num_leaves());
351 models_[tree_idx]->SetLeafOutput(leaf_idx, val);
374 virtual void Bagging(
int iter);
401 virtual std::vector<double>
EvalOneMetric(
const Metric* metric,
const double* score)
const;
410 double BoostFromAverage(
int class_id,
bool update_scorer);
468 std::vector<std::string> feature_infos_;
481 std::unique_ptr<Dataset> tmp_subset_;
483 std::vector<bool> class_need_train_;
484 bool is_constant_hessian_;
485 std::unique_ptr<ObjectiveFunction> loaded_objective_;
486 bool average_output_;
487 bool need_re_bagging_;
488 std::string loaded_parameter_;
490 Json forced_splits_json_;
The interface for Boosting.
Definition boosting.h:22
The main class of data set, which is used for training or validation.
Definition dataset.h:278
data_size_t num_data() const
Get Number of data.
Definition dataset.h:577
Definition boosting.h:298
GBDT algorithm implementation, including training, prediction, and bagging.
Definition gbdt.h:26
int NumModelPerIteration() const override
Get number of trees per iteration.
Definition gbdt.h:321
std::vector< score_t > hessians_
Second order derivative of training data.
Definition gbdt.h:445
virtual void Bagging(int iter)
Implement bagging logic.
Definition gbdt.cpp:180
void PredictContrib(const double *features, double *output, const PredictionEarlyStopInstance *earlyStop) const override
Feature contributions for the model's prediction of one record.
Definition gbdt.cpp:564
void AddValidDataset(const Dataset *valid_data, const std::vector< const Metric * > &valid_metrics) override
Adding a validation dataset.
Definition gbdt.cpp:117
int num_iteration_for_pred_
number of used model
Definition gbdt.h:461
std::vector< const Metric * > training_metrics_
Metrics for training data.
Definition gbdt.h:425
int LabelIdx() const override
Get index of label column.
Definition gbdt.h:309
void InitPredict(int num_iteration, bool is_pred_contrib) override
Initial work for the prediction.
Definition gbdt.h:329
std::vector< std::unique_ptr< Tree > > models_
Trained models(trees)
Definition gbdt.h:439
data_size_t BaggingHelper(Random &cur_rand, data_size_t start, data_size_t cnt, data_size_t *buffer)
Helper function for bagging, used for multi-threading optimization.
Definition gbdt.cpp:159
void ResetBaggingConfig(const Config *config, bool is_change_dataset)
reset config for bagging
Definition gbdt.cpp:689
int GetCurrentIteration() const override
Get current iteration.
Definition gbdt.h:147
data_size_t num_data_
Number of training data.
Definition gbdt.h:453
std::string DumpModel(int start_iteration, int num_iteration) const override
Dump model to json format string.
Definition gbdt_model_text.cpp:15
~GBDT()
Destructor.
Definition gbdt.cpp:42
virtual int64_t GetNumPredictAt(int data_idx) const override
Get size of prediction at data_idx data.
Definition gbdt.h:180
std::unique_ptr< TreeLearner > tree_learner_
Tree learner, will use this class to learn trees.
Definition gbdt.h:419
virtual void UpdateScore(const Tree *tree, const int cur_tree_id)
updating score after tree was trained
Definition gbdt.cpp:451
bool NeedAccuratePrediction() const override
Can use early stopping for prediction or not.
Definition gbdt.h:153
void Init(const Config *gbdt_config, const Dataset *train_data, const ObjectiveFunction *objective_function, const std::vector< const Metric * > &training_metrics) override
Initialization logic.
Definition gbdt.cpp:45
int num_threads_
number of threads
Definition gbdt.h:470
std::string ModelToIfElse(int num_iteration) const override
Translate model to if-else statement.
Definition gbdt_model_text.cpp:60
virtual std::vector< double > EvalOneMetric(const Metric *metric, const double *score) const
eval results for one metric
Definition gbdt.cpp:472
std::vector< data_size_t > tmp_indices_
Store the indices of in-bag data.
Definition gbdt.h:451
std::vector< data_size_t > right_write_pos_buf_
Buffer for multi-threading bagging.
Definition gbdt.h:480
virtual const char * SubModelName() const override
Get Type name of this boosting object.
Definition gbdt.h:357
std::vector< data_size_t > left_write_pos_buf_
Buffer for multi-threading bagging.
Definition gbdt.h:478
std::vector< std::vector< std::string > > best_msg_
output message of best iteration
Definition gbdt.h:437
std::vector< double > FeatureImportance(int num_iteration, int importance_type) const override
Calculate feature importances.
Definition gbdt_model_text.cpp:513
int early_stopping_round_
Number of rounds for early stopping.
Definition gbdt.h:431
void ResetTrainingData(const Dataset *train_data, const ObjectiveFunction *objective_function, const std::vector< const Metric * > &training_metrics) override
Reset the training data.
Definition gbdt.cpp:622
void RollbackOneIter() override
Rollback one iteration.
Definition gbdt.cpp:414
std::vector< std::vector< double > > best_score_
Best score(s) for early stopping.
Definition gbdt.h:435
int NumberOfClasses() const override
Get number of classes.
Definition gbdt.h:327
const ObjectiveFunction * objective_function_
Objective function.
Definition gbdt.h:421
int num_class_
Number of classes.
Definition gbdt.h:457
void ResetConfig(const Config *gbdt_config) override
Reset Boosting Config.
Definition gbdt.cpp:676
virtual void Boosting()
Calculate the objective function.
Definition gbdt.cpp:149
virtual bool TrainOneIter(const score_t *gradients, const score_t *hessians) override
Training logic.
Definition gbdt.cpp:333
std::vector< data_size_t > bag_data_indices_
Store the indices of in-bag data.
Definition gbdt.h:447
std::vector< std::vector< const Metric * > > valid_metrics_
Metric for validation data.
Definition gbdt.h:429
std::vector< std::vector< int > > best_iter_
Best iteration(s) for early stopping.
Definition gbdt.h:433
std::vector< std::unique_ptr< ScoreUpdater > > valid_score_updater_
Store and update validation data's scores.
Definition gbdt.h:427
std::unique_ptr< Config > config_
Config of gbdt.
Definition gbdt.h:417
data_size_t bag_data_cnt_
Number of in-bag data.
Definition gbdt.h:449
int num_tree_per_iteration_
Number of trees per iteration.
Definition gbdt.h:455
std::string OutputMetric(int iter)
Print metric result of current iteration.
Definition gbdt.cpp:476
bool SaveModelToIfElse(int num_iteration, const char *filename) const override
Translate model to if-else statement.
Definition gbdt_model_text.cpp:219
virtual bool EvalAndCheckEarlyStopping()
Print eval result and check early stopping.
Definition gbdt.cpp:432
int iter_
current iteration
Definition gbdt.h:413
void PredictLeafIndex(const double *features, double *output) const override
Prediction for one record with leaf index.
Definition gbdt_prediction.cpp:73
int max_feature_idx_
Max feature index of training data.
Definition gbdt.h:441
void MergeFrom(const Boosting *other) override
Merge the model from another boosting object. The other object's trees are inserted in front of the current object's trees.
Definition gbdt.h:53
bool LoadModelFromString(std::string str)
Restore from a serialized buffer.
Definition gbdt.h:282
GBDT()
Constructor.
Definition gbdt.cpp:22
void PredictRaw(const double *features, double *output, const PredictionEarlyStopInstance *earlyStop) const override
Prediction for one record, not sigmoid transform.
Definition gbdt_prediction.cpp:9
void Predict(const double *features, double *output, const PredictionEarlyStopInstance *earlyStop) const override
Prediction for one record, sigmoid transformation will be used if needed.
Definition gbdt_prediction.cpp:49
std::vector< std::string > FeatureNames() const override
Get feature names of this model.
Definition gbdt.h:303
const Dataset * train_data_
Pointer to training data.
Definition gbdt.h:415
std::vector< data_size_t > right_cnts_buf_
Buffer for multi-threading bagging.
Definition gbdt.h:476
std::unique_ptr< ScoreUpdater > train_score_updater_
Store and update training data's score.
Definition gbdt.h:423
int NumPredictOneRow(int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const override
Get the number of predictions for one row of data.
Definition gbdt.h:204
double shrinkage_rate_
Shrinkage rate for one iteration.
Definition gbdt.h:463
std::vector< score_t > gradients_
First order derivative of training data.
Definition gbdt.h:443
data_size_t label_idx_
Index of label column.
Definition gbdt.h:459
void RefitTree(const std::vector< std::vector< int > > &tree_leaf_prediction) override
Update the tree output by new training data.
Definition gbdt.cpp:263
void ShuffleModels(int start_iter, int end_iter) override
Shuffle Existing Models.
Definition gbdt.h:72
virtual const double * GetTrainingScore(int64_t *out_len) override
Get current training score.
Definition gbdt.cpp:559
std::vector< data_size_t > offsets_buf_
Buffer for multi-threading bagging.
Definition gbdt.h:472
void GetPredictAt(int data_idx, double *out_result, int64_t *out_len) override
Get prediction result at data_idx data.
Definition gbdt.cpp:585
int num_init_iteration_
Number of loaded initial models.
Definition gbdt.h:465
std::vector< std::string > feature_names_
Feature names.
Definition gbdt.h:467
int NumberOfTotalModel() const override
Get number of weak sub-models.
Definition gbdt.h:315
void Train(int snapshot_freq, const std::string &model_output_path) override
Perform a full training procedure.
Definition gbdt.cpp:243
std::vector< data_size_t > left_cnts_buf_
Buffer for multi-threading bagging.
Definition gbdt.h:474
virtual bool SaveModelToFile(int start_iteration, int num_iterations, const char *filename) const override
Save model to file.
Definition gbdt_model_text.cpp:332
std::vector< double > GetEvalAt(int data_idx) const override
Get evaluation result at data_idx data.
Definition gbdt.cpp:536
std::string SaveModelToString(int num_iterations)
Save model to string.
Definition gbdt.h:276
int MaxFeatureIdx() const override
Get max feature index of this model.
Definition gbdt.h:297
The interface of metric. Metric is used to calculate metric result.
Definition metric.h:20
The interface of Objective Function.
Definition objective_function.h:13
virtual bool NeedAccuratePrediction() const
The prediction should be accurate or not. True will disable early stopping for prediction.
Definition objective_function.h:63
A wrapper for random generator.
Definition random.h:15
int NextShort(int lower_bound, int upper_bound)
Generate random integer, int16 range. [0, 65536].
Definition random.h:38
Tree model.
Definition tree.h:20
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
NLOHMANN_BASIC_JSON_TPL_DECLARATION void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL &j1, nlohmann::NLOHMANN_BASIC_JSON_TPL &j2) noexcept(//NOLINT(readability-inconsistent-declaration-parameter-name, cert-dcl58-cpp) is_nothrow_move_constructible< nlohmann::NLOHMANN_BASIC_JSON_TPL >::value &&//NOLINT(misc-redundant-expression) is_nothrow_move_assignable< nlohmann::NLOHMANN_BASIC_JSON_TPL >::value)
exchanges the values of two JSON objects
Definition json.hpp:24418
Definition prediction_early_stop.h:11