Medial Code Documentation
Loading...
Searching...
No Matches
MedXGB.h
1#ifndef __MEDXGB_H__
2#define __MEDXGB_H__
3#pragma once
5#include <MedProcessTools/MedProcessTools/MedProcessUtils.h>
6#include <xgboost/learner.h>
7#include <xgboost/data.h>
8#include <xgboost/c_api.h>
9#include "MedProcessTools/MedProcessTools/MedSamples.h"
10
12 string booster; // gbtree or gblinear
13 string objective; // binary:logistic is logistic regression loss function for binary classification
14 float eta; // step size shrinkage
15 float gamma; // minimum loss reduction required to make a further partition
16 int min_child_weight; // minimum sum of instance weight(hessian) needed in a child
17 int max_depth; // maximum depth of a tree
18 int num_round; // the number of rounds to do boosting
19 vector<string> eval_metric; // when not silent, report this metric
20 int silent; // debug mode
21 float missing_value; // which value in the input is representing missing
22 int num_class; // needed for multi:softmax
23 float colsample_bytree;
24 float colsample_bylevel;
25 float subsample;
26 float scale_pos_weight;
27 string tree_method;
28 float lambda;
29 float alpha;
30 int seed; // randomization seed
31 int verbose_eval;
32 float validate_frac; // how much of the training set is used as validation for evaluation. should be between 0 and 1.
33 string split_penalties; // feature-dependent splitting penalty. string format is "number:value,number:value,..."
34 string monotone_constraints; // feature-dependent monotonic constraint. string format is "part_of_feature_name:part_of_feature_name,number:value,..."
35
36
37 MedXGBParams() {
38 booster = "gbtree"; objective = "binary:logistic"; eta = 1.0; gamma = 1.0;
39 min_child_weight = 1; max_depth = 3; num_round = 500; silent = 1; eval_metric.push_back("auc"); missing_value = MED_MAT_MISSING_VALUE;
40 num_class = 1; //only set when multiclass
41 colsample_bytree = 1.0; colsample_bylevel = 1.0; subsample = 1.0; scale_pos_weight = 1.0; tree_method = "auto"; lambda = 1; alpha = 0;
42 seed = 0;
43 verbose_eval = 0;
44 validate_frac = 0;
45 }
46
47 ADD_CLASS_NAME(MedXGBParams)
48 ADD_SERIALIZATION_FUNCS(booster, objective, eta, gamma, min_child_weight, max_depth, num_round, eval_metric, silent, missing_value, num_class,
49 colsample_bytree, colsample_bylevel, subsample, scale_pos_weight, tree_method, lambda, alpha, seed, verbose_eval, validate_frac, split_penalties, monotone_constraints)
50};
51
53public:
54 explicit XGBBooster(const std::vector<std::shared_ptr<xgboost::DMatrix> >& cache_mats)
55 : configured_(false),
56 initialized_(false),
57 learner_(xgboost::Learner::Create(cache_mats)) {}
58
59 inline xgboost::Learner* learner() {
60 return learner_.get();
61 }
62
63 inline void SetParam(const std::string& name, const std::string& val) {
64 auto it = std::find_if(cfg_.begin(), cfg_.end(),
65 [&name, &val](decltype(*cfg_.begin()) &x) {
66 if (name == "eval_metric") {
67 return x.first == name && x.second == val;
68 }
69 return x.first == name;
70 });
71 if (it == cfg_.end()) {
72 cfg_.push_back(std::make_pair(name, val));
73 }
74 else {
75 (*it).second = val;
76 }
77 if (configured_) {
78 learner_->SetParams(cfg_);
79 learner_->Configure();
80 }
81 }
82
83 inline void LazyInit() {
84 if (!configured_) {
85 learner_->SetParams(cfg_);
86 learner_->Configure();
87 configured_ = true;
88 }
89 if (!initialized_) {
90 //learner_->Configure();
91 initialized_ = true;
92 }
93 }
94
95 inline void LoadModel(dmlc::Stream* fi) {
96 learner_->Load(fi);
97 initialized_ = true;
98 }
99
100public:
101 bool configured_;
102 bool initialized_;
103 std::unique_ptr<xgboost::Learner> learner_;
104 std::vector<std::pair<std::string, std::string> > cfg_;
105};
106
107class MedXGB : public MedPredictor {
108public:
109 BoosterHandle my_learner = NULL;
110 xgboost::DMatrix* dvalidate = NULL;
111 MedXGBParams params;
112 void init_defaults();
113 int feat_contrib_flags = 0;
114 virtual int init(void *classifier_params) { this->params = *((MedXGBParams*)classifier_params); return 0; };
117 virtual int set_params(map<string, string>& initialization_map);
118
119 // Function
120 MedXGB() { init_defaults(); };
121 ~MedXGB();
122
123 int Learn(float *x, float *y, const float *w, int nsamples, int nftrs);
124 int Learn(float *x, float *y, int nsamples, int nftrs);
125 int Predict(float *x, float *&preds, int nsamples, int nftrs) const;
126 void prepare_mat_handle(float *x, float *y, const float *w, int nsamples, int nftrs, DMatrixHandle &matrix_handle);
127
128 virtual void print(FILE *fp, const string& prefix, int level = 0) const;
129
130 void calc_feature_importance(vector<float> &features_importance_scores,
131 const string &general_params, const MedFeatures *features);
132
134
135 void calc_feature_contribs_conditional(MedMat<float> &mat_x_in, unordered_map<string, float> &contiditional_variables, MedMat<float> &mat_x_out, MedMat<float> &mat_contribs);
136
137
138 void export_predictor(const string &output_fname);
139
140 int n_preds_per_sample() const;
141
142 void pre_serialization() {
143 const char* out_dptr;
145 string cfg_js = "{ \"format\":\"json\" }";
146 if (my_learner != NULL) {
147 if (XGBoosterSaveModelToBuffer(my_learner, cfg_js.c_str(), &len, &out_dptr) != 0)
148 throw runtime_error("failed XGBoosterGetModelRaw\n");
149 serial_xgb.resize(len);
150 memcpy(&serial_xgb[0], out_dptr, len);
151 }
152 else
153 serial_xgb.clear();
154 }
155
156 void post_deserialization() {
157 if (this->my_learner != NULL)
158 XGBoosterFree(this->my_learner);
159 if (!serial_xgb.empty()) {
160 DMatrixHandle h_train_empty[1];
161 if (XGBoosterCreate(h_train_empty, 0, &my_learner) != 0)
162 throw runtime_error("failed XGBoosterCreate\n");
163 if (XGBoosterLoadModelFromBuffer(my_learner, &serial_xgb[0], serial_xgb.size()) != 0)
164 throw runtime_error("failed XGBoosterLoadModelFromBuffer\n");
165 serial_xgb.clear();
166 }
167 }
168
169 void prepare_predict_single();
170 void predict_single(const vector<float> &x, vector<float> &preds) const;
171
172 void get_json(const char ***json, int& len, string type) {
173 if (my_learner != NULL) {
174 string no_fmap = "";
176 int succ = XGBoosterDumpModelEx(my_learner, no_fmap.c_str(), 1, type.c_str(), &_len, json);
177 if (succ < 0)
178 HMTHROW_AND_ERR("Error MedXGB::get_json - can't get model\n");
179 len = (int)_len;
180 }
181 else
182 len = 0;
183 }
184
185 ADD_CLASS_NAME(MedXGB)
186 ADD_SERIALIZATION_FUNCS(classifier_type, serial_xgb, params, model_features, features_count, _mark_learn_done)
187
188private:
189 bool _mark_learn_done;
190 bool prepared_single;
191 vector<BoosterHandle> learner_per_thread;
192
193 void translate_split_penalties(string& split_penalties_s);
194 void translate_monotone_constraints(string& monotone_constraints_s);
195 void calc_feature_importance_local(vector<float> &features_importance_scores, string &importance_type);
196 vector<char> serial_xgb;
197};
198
199//=================================================================
200// Joining the MedSerialize Wagon
201//=================================================================
204
205//#endif
206#endif
MedAlgo - APIs to different algorithms: Linear Models, RF, GBM, KNN, and more.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
Definition MedMat.h:63
Base Interface for predictor.
Definition MedAlgo.h:78
int features_count
The model features count used in Learn, to validate when calling predict.
Definition MedAlgo.h:96
MedPredictorTypes classifier_type
The Predictor enum type.
Definition MedAlgo.h:80
vector< string > model_features
The model features used in Learn, to validate when calling predict.
Definition MedAlgo.h:93
Definition MedXGB.h:107
virtual int set_params(map< string, string > &initialization_map)
The parsed fields from init command.
Definition MedXGB.cpp:438
void calc_feature_contribs(MedMat< float > &x, MedMat< float > &contribs)
Feature contributions explains the prediction on each sample (aka BUT_WHY)
Definition MedXGB.cpp:75
int Predict(float *x, float *&preds, int nsamples, int nftrs) const
Predict should be implemented for each model.
Definition MedXGB.cpp:57
int n_preds_per_sample() const
Number of predictions per sample. typically 1 - but some models return several per sample (for exampl...
Definition MedXGB.cpp:34
int Learn(float *x, float *y, const float *w, int nsamples, int nftrs)
Learn should be implemented for each model.
Definition MedXGB.cpp:152
Definition SerializableObject.h:32
Definition MedXGB.h:52
interface of stream I/O for serialization
Definition io.h:30
Internal data structured used by XGBoost during training.
Definition data.h:509
Learner class that does training and prediction. This is the user facing module of xgboost training....
Definition learner.h:65
static Learner * Create(const std::vector< std::shared_ptr< DMatrix > > &cache_data)
Create a new instance of learner.
Definition learner.cc:1485
XGB_DLL int XGBoosterFree(BoosterHandle handle)
free obj in handle
Definition c_api.cc:896
XGB_DLL int XGBoosterDumpModelEx(BoosterHandle handle, const char *fmap, int with_stats, const char *format, bst_ulong *out_len, const char ***out_dump_array)
dump model, return array of strings representing model dump
Definition c_api.cc:1464
XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], bst_ulong len, BoosterHandle *out)
create xgboost learner
Definition c_api.cc:882
void * DMatrixHandle
handle to DMatrix
Definition c_api.h:49
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, const void *buf, bst_ulong len)
load model from in memory buffer
Definition c_api.cc:1277
XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *config, bst_ulong *out_len, char const **out_dptr)
Save model into raw bytes, return header of the array. User must copy the result out,...
Definition c_api.cc:1288
Copyright 2015-2023 by XGBoost Contributors.
Copyright 2015-2023 by XGBoost Contributors.
uint64_t bst_ulong
unsigned long integers
Definition base.h:95
Definition MedXGB.h:11
Copyright 2015~2023 by XGBoost Contributors.