Medial Code Documentation
Loading...
Searching...
No Matches
QRF.h
1//
2// Quantized Random Forest - an attempt to ultra fast RF version
3//
4// Currently - only binary categories (y values - 0/1)
5//
6
7#ifndef __QRF__H__
8#define __QRF__H__
9
11#include <vector>
12#include <map>
13#include <random>
14
15#define QRF_MODE_QRF 0
16
17enum QRF_TreeType {
18 QRF_BINARY_TREE = 0,
19 QRF_REGRESSION_TREE = 1,
20 QRF_CATEGORICAL_CHI2_TREE = 2,
21 QRF_CATEGORICAL_ENTROPY_TREE = 3,
22 QRF_MULTILABEL_ENTROPY_TREE = 4,
23 QRF_LAST = 4
24};
25
26#define REGRESSION_SPLIT_IMPROVE 0.999999
27
28//#define REGRESSION_SPLIT_IMPROVE 0.8
29
30#define MIN_SPLIT_NODE_SIZE 50
31#define MIN_SPLIT_SPREAD 0.1
32
33#define PREDS_REGRESSION_AVG 0
34#define PREDS_REGRESSION_WEIGHTED_AVG 1
35#define PREDS_REGRESSION_QUANTILE 2
36#define PREDS_REGRESSION_WEIGHTED_QUANTILE 3
37#define PREDS_REGRESSION_SAMPLE 4
38#define PROBS_CATEG_MAJORITY_AVG 0
39#define PROBS_CATEG_AVG_PROBS 1
40#define PROBS_CATEG_AVG_COUNTS 2
41#define PREDS_CATEG_MAJORITY_AVG 4
42#define PREDS_CATEG_AVG_PROBS 5
43#define PREDS_CATEG_AVG_COUNTS 6
44
45#define COLLECTED_VALUES_SIZE 200000
46
47using namespace std;
48
49struct ValInd {
50 float val;
51 int idx;
52};
53
54class QRF_Node {
55
56public:
57 // next used for both learn & predict
58 short split_feat;
59 float split_val;
60 int counts[2];
61 char is_leaf;
62 int r_ind;
63 int l_ind;
64 float pred; // #1 / (#1 + #0)
65 float s2; // avg sq err for regression
66 int depth;
67
68 // next used only for learn
69 short split_q_idx;
70 int from_sample;
71 int to_sample;
72 int size() { return (to_sample - from_sample + 1); }
73
74
75 size_t estimated_size() { return 12; }
76};
77
78class QRF_Tree {
79public:
80 // next used for both learn & predict
81 vector<QRF_Node> nodes;
82 int n_nodes;
83 int max_nodes;
84
85 // next used only for learn
86 vector<int> sample_ids;
87 map<short, short> feat_chosen; // used in random feature choices within a node while building the tree.
88 vector<int> hist[2]; // histogram for binary case (here to avoid reallocation)
89 vector<ValInd> qy; // holding and sorting y values in a node when searching a split
90 vector<int> inds;
91
92 vector<double> histr_sum;
93 vector<int> histr_num;
94
95
96 size_t estimated_size() { size_t s = histr_num.size() + histr_num.size() + inds.size() + qy.size() * 2 + hist[0].size() + hist[1].size() + feat_chosen.size() * 4 + sample_ids.size(); for (auto &n : nodes) s += n.estimated_size(); return s; }
97
98 mt19937 rand_gen;
99
100
101
102
103 void print(FILE *f);
104 void init_rand_state();
105};
106
108 float sum;
109 int n_tests;
110 float mean;
111 float std;
112 vector<int> cnts;
113 vector<float> probs;
114};
115
117 // this class is designed to hold the minimal needed information in a node in order to use a tree as a predictor
118public:
119 int mode;
120 short ifeat;
121 float split_val;
122 int is_leaf;
123 int left;
124 int right;
125 float pred;
126 int n_size;
127 vector<int> counts; // none for regression, 2 for binary, ncateg (defined in QRF_Forest) for categorized case
128 vector<int> values; // Counting of values in learning set in regression learning
129 vector<pair<int, unsigned int>> value_counts; // Counting of values in learning set in regression learning when working in sparse mode
130 int tot_n_values;
131 int majority; // for categories cases
132
133
134 size_t estimated_size() { return 10 + counts.size() + values.size() + 2*value_counts.size(); }
135
136 void get_scores(int mode, int get_counts_flag, int n_categ, vector<float> &scores) const;
137
138 ADD_CLASS_NAME(QRF_ResNode)
139 ADD_SERIALIZATION_FUNCS(n_size, mode, ifeat, split_val, is_leaf, left, right, pred, counts, values, value_counts, tot_n_values, majority)
140
141};
142
143
145public:
146 vector<QRF_ResNode> qnodes;
147
148 size_t estimated_size() { size_t s = 0; for (auto &q : qnodes) s += q.estimated_size(); return s; }
149
150 ADD_CLASS_NAME(QRF_ResTree)
152};
153
155
156public:
157
158 int tree_mode; // QRF_BINARY_TREE or QRF_REGRESSION_TREE or QRF_CATEGORICAL_CHI2_TREE
159 int NSamples, NFeat, MaxQ;
160 vector<short> max_q;
161 vector<char> y;
162 vector<int> ids[2];
163 vector<vector<float>> quant_values;
164 vector<vector<short>> q_data;
165 vector<float> yr; // y for regression trees
166 vector<vector<int>> yr_multilabel; // y for multilabel splitting
167 vector<float> w; // weights
168
169 vector<double> log_table;
170
171 vector<OOB_Result> cv;
172
173 vector<char> test_s;
174
175 vector<int> groups;
176
177 int n_groups;
178
179
180 int init_all(float *X, int *Y, float *Yr, const float *W, int nfeat, int nsamples, int maxq);
181
182 void init_groups(vector<int> &groups_in);
183 // binary problem related
184 int init(float *X, int *Y, int nfeat, int nsamples, int maxq);
185 int get_Tree(int *sampsize, int ntry, QRF_Tree &tree);
186 int collect_Tree_oob_scores(float *x, int nfeat, QRF_ResTree &resTree, vector<int>& sample_ids);
187 void complete_oob_cv();
188 void score_tree_by_index(float *x, int nfeat, QRF_ResTree &tree, int id, float& score, int& majority, vector<int> &counts);
189
190
191 double get_cross_validation_auc();
192
193 // regresssion problem
194 int init_regression(float *X, float *Y, const float *W, int nfeat, int nsamples, int maxq);
195 int get_regression_Tree(int *sampsize, int ntry, QRF_Tree &tree);
196
197 // categorized, split by chi square problem
198 int n_categ;
199
200 // stopping criteria related
201 int min_split_node_size;
202 float min_split_spread;
203
204
205 int n_called;
206
207 void clear();
208
209 bool take_all_samples;
210
211 int max_depth;
212
213private:
214 //int quantize_feature(vector<ValInd> &x, int nsamples, int maxq, vector<float> &quant_val, vector<short> &qd);
215 int quantize_no_loss(vector<ValInd> &vals, int nsamples, int maxq, vector<float> &quant_val, vector<short> &qd);
216
217 // binary problem
218 int find_best_split(QRF_Tree &tree, int node, int ntry);
219 int split_node(QRF_Tree &tree, int node);
220
221 // regression problem
222 int find_best_regression_split(QRF_Tree &tree, int node, int ntry);
223 int split_regression_node(QRF_Tree &tree, int node);
224
225 // categorized, split by chi square problem
226 int find_best_categories_chi2_split(QRF_Tree &tree, int node, int ntry);
227 int find_best_categories_entropy_split(QRF_Tree &tree, int node, int ntry);
228 int find_best_categories_entropy_split_multilabel(QRF_Tree &tree, int node, int ntry);
229
230};
231
233 const float *x;
234 const vector<QRF_ResTree> *trees;
235 int from;
236 int to;
237 int nfeat;
238 int nsamples;
239 float *res;
240
241 int serial;
242 int state;
243
244 int mode;
245 int n_categ;
246 vector<float> *quantiles;
247 const vector<float> *sorted_values;
248 bool sparse_values;
249
250 int get_counts; // for CATEGORICAL runs there's such an option, 0 - don't get, 1 - sum counts 2 - sum probs
251 // thread th_handle;
252};
253
254//============================================================================================================================
256public:
257 int mode; // 0 - regular mode (only QRF)
258 int min_node_size; // for stop criteria - min size of a node above which we keep on splitting (for categorical or regression)
259 float min_spread; // for stop criteria - min spread of a node above which we keep on splitting (for regression)
260 int n_categ; // categories in y must be 0,1,...,n_categ-1 in this case
261 int get_counts_flag; // how to get results
262 int get_only_this_categ; // in case of categorical model, return only probs for this category in predictions (def is -1: return all)
263
264 bool keep_all_values; // Keep all values in each node in regression mode
265 bool sparse_values; // Keep all values in sparse mode (as histogram)
266 vector<float> quantiles; // Quantiles to predict in quantile-regression mode
267 vector<float> sorted_values; // sorted prediction values actually appearing in learning set in regression mode
268
269 vector<int> groups; // if given should be at length nsamples. It maps each sample to a group (for example samples from the same patient at different times)
270 // this is important when we randomize elements to each tree and when we test oob.
271
272 vector<QRF_ResTree> qtrees; // actual collection of trees built or deserialized
273
274 // out of bag related arrays
275 int collect_oob;
276 vector<vector<float> > oob_scores; // used to keep oob scores which we use in some cases - if regression : (mean,std) ; if classification : vector of probs
277
278 // constructor:
279 QRF_Forest() {
280 qtrees.clear(); mode = 0; collect_oob = 0; oob_scores.clear(); keep_all_values = false; sparse_values = true; nthreads = 1; min_node_size = MIN_SPLIT_NODE_SIZE; get_only_this_categ = -1;
281 min_spread = 0; n_categ = -1; get_counts_flag = 0; take_all_samples = false; max_depth = 0;
282 }
283
284 int nthreads; // number of threads to use in building a forest, and in scoring it.
285
286 //full samples to take - similar algorithm to knn where k=min_node. use with n_trees=1
287 bool take_all_samples;
288
289 //max depth features to take in branch of tree
290 int max_depth;
291
292 // Learn Methods :
293 //-----------------
294
295 // binary problem ::
296 // y must be 0/1 values
297 // sampsize is either NULL (all taken for bagging) or of size 2 (sampsize[0], sampsize[1] - bagging with these sizes)
298 int get_forest(double *x, double *y, int nfeat, int nsamples, int *sampsize, int ntry, int ntrees, int maxq);
299 int get_forest(float *x, int *y, int nfeat, int nsamples, int *sampsize, int ntry, int ntrees, int maxq);
300
301 // regression problem
302 // sampsize - 0: take all, other: bag using given size.
303 int get_forest_regression_trees(float *x, float *y, int nfeat, int nsamples, int sampsize, int ntry, int ntrees, int maxq, int min_node, float spread);
304
305 // categorical chi2/entropy problem - Splitting method: QRF_CATEGORICAL_ENTROPY_TREE/QRF_CATEGORICAL_CHI2_TREE
306 // y must be in 0 ... (ncateg-1) values.
307 // sampsize : if NULL all bagging size is the number of samples, otherwise sampsize for each category is expected (sampsize[0]....sampsize[ncateg-1])
308 int get_forest_categorical(float *x, float *y, const float *w, int nfeat, int nsamples, int *sampsize, int ntry, int ntrees, int maxq, int min_node, int ncateg, int splitting_method);
309
310 // Predict Methods :
311 //------------------
312
313 int score_samples(float *x, int nfeat, int nsamples, float *&res) const; // same as next one but get_counts = 0;
314 // res should be allocated outside.
315 // for classification nsamples x n_categ. get_counts : 0 - average on majority vote of each tree ; 1 - average on probabilities in each tree ; 2 (or otherwise) - average on counts in each tree
316 // 4 - prediction (based on majority)
317 // for regression nsamples x 1. get_counts : 0 - average prediction ; 1 (or otherwise) - weighted average.
318 int score_samples(float *x_in, int nfeat, int nsamples, float *&res, int get_counts) const;
319
320 int score_samples_t(double *x, int nfeat, int nsamples, double *&res); // tailored for predictor formats
321
322 void get_single_score_fast(qrf_scoring_thread_params &params, const vector<float> &x, vector<float> &preds) const;
323
324 // OOB Methods:
325 //--------------
326 //int collect_Tree_oob_scores_threaded(float *x, int nfeat, QRF_ResTree &resTree, vector<int>& sample_ids);
327
328
329 // serialization
330 ADD_CLASS_NAME(QRF_Forest)
331 ADD_SERIALIZATION_FUNCS(qtrees, mode, min_node_size, min_spread, n_categ, get_counts_flag, get_only_this_categ, keep_all_values, sparse_values, quantiles, sorted_values, nthreads, take_all_samples, max_depth)
332
333 // IO Methods :
334 //-------------
335 void write(FILE *fp);
336
337 // Variable Importance:
338 //---------------------
339
340 void variableImportance(vector<pair<short, double> >& rankedFeatures, unsigned int nFeatures);
341
342private:
343 // learn for all modes
344 int get_forest_trees_all_modes(float *x, void *y, const float *w, int nfeat, int nsamples, int *sampsize, int ntry, int ntrees, int maxq, int mode);
345
346 // scoring with threads for all modes
347 void score_with_threads(float *x, int nfeat, int nsamples, float *res) const;
348
349 // transferring from inner algo data structure to exposed on
350 int transfer_to_forest(vector<QRF_Tree> &trees, QuantizedRF &qrf, int mode);
351 int transfer_tree_to_res_tree(QuantizedRF &qrf, QRF_Tree &tree, QRF_ResTree &qt, int mode, map<float, int> &all_values);
352 int init_keep_all_values(QuantizedRF &qrf, int mode, map<float, int> &all_values);
353};
354
358
359#endif
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Definition QRF.h:255
Definition QRF.h:54
Definition QRF.h:116
Definition QRF.h:144
Definition QRF.h:78
Definition QRF.h:154
Definition SerializableObject.h:32
Definition StdDeque.h:58
Definition QRF.h:107
Definition QRF.h:49
Definition QRF.h:232