Medial Code Documentation
Loading...
Searching...
No Matches
TQRF.h
1#ifndef __TQRF_H__
2#define __TQRF_H__
3//
4// TQRF
5//
6
9#include <MedProcessTools/MedProcessTools/MedFeatures.h>
10#include <MedMat/MedMat/MedMat.h>
12#include <queue>
13
14using namespace std;
15
/// Integer codes for the supported tree flavours.
/// Values are written out explicitly and must stay stable.
/// NOTE(review): presumably these are the values kept in the int tree_type fields - confirm.
enum TQRF_TreeTypes {
    TQRF_TREE_ENTROPY             = 0,
    TQRF_TREE_REGRESSION          = 1,
    TQRF_TREE_LIKELIHOOD          = 2,
    TQRF_TREE_WEIGHTED_LIKELIHOOD = 3,
    TQRF_TREE_DEV                 = 4,  ///< free slot for experimenting with new score ideas
    TQRF_TREE_UNDEFINED           = 5
};
24
/// Integer codes for the available missing-value handling strategies.
/// Values are written out explicitly and must stay stable.
/// NOTE(review): presumably the int missing_method parameter holds one of these - confirm.
enum TQRF_Missing_Value_Method {
    TQRF_MISSING_VALUE_MEAN             = 0,
    TQRF_MISSING_VALUE_MEDIAN           = 1,
    TQRF_MISSING_VALUE_LARGER_NODE      = 2,
    TQRF_MISSING_VALUE_LEFT             = 3,
    TQRF_MISSING_VALUE_RIGHT            = 4,
    TQRF_MISSING_VALUE_RAND_ALL         = 5,
    TQRF_MISSING_VALUE_RAND_EACH_SAMPLE = 6,
    TQRF_MISSING_VALUE_NOTHING          = 7
};
35
36
/// Life-cycle stage of a node while a tree is being built:
/// 0 - created, 1 - in process, 2 - done with.
enum TQRF_Node_Working_State {
    TQRF_Node_State_Initiated   = 0,
    TQRF_Node_State_In_Progress = 1,
    TQRF_Node_State_Done        = 2
};
42
/// Where a sample with a missing value is routed at a split:
/// 0 - left, 1 - right, 2 - randomize for each sample.
enum TQRF_Missing_Direction {
    TQRF_MISSING_DIRECTION_LEFT             = 0,
    TQRF_MISSING_DIRECTION_RIGHT            = 1,
    TQRF_MISSING_DIRECTION_RAND_EACH_SAMPLE = 2
};
48
// Numeric limits used by the time-slicing code.
#define TQRF_MAX_TIME_SLICE        10000000
#define MIN_ELEMENTS_IN_TIME_SLICE 100
// Tiny negative float sentinel.
// NOTE(review): the name suggests it marks a beta that was not set yet - confirm at the use sites.
#define UNSET_BETA                 (static_cast<float>(-1e-10))
52
53
54//==========================================================================================================================
56public:
57
58 //========================================================================================================================
59 // params list
60 //========================================================================================================================
61 string init_string = "";
62 string samples_time_unit = "Date";
63 int samples_time_unit_i;
64
65 int ncateg = 2;
66
67 // time slices
68 string time_slice_unit = "Days";
69 int time_slice_unit_i;
70 int time_slice_size = -1;
71 int n_time_slices = 1;
72 vector<int> time_slices = {};
73
74 vector<float> time_slices_wgts ={};
75 int censor_cases = 0;
76
77 // quantization
78 int max_q = 200;
79 //int max_q_sample = 100000; /// the max number of values to use when deciding q limits
80
81 // trees and stopping criteria
82 string tree_type = "";
83 int tree_type_i = -1;
84 int ntrees = 50;
85 int max_depth = 100;
87 int min_node = 10;
89
90 // feature sampling
91 int ntry = -1;
92 float ntry_prob = (float)0.1;
93 int nsplits = -1;
94
95 // speedup by subsample control
97
98 // bagging control
101 float bag_prob = (float)0.5;
102 float bag_ratio = -1;
103 float bag_feat = (float)1.0;
105
106 // categorial features
108 vector<string> categorial_str;
109 vector<string> categorial_tags;
110 //vector<int> categorial; /// calculated from the above (in learning, once train data is given. In testing - already ready)
111
112 // missing value
113 float missing_val = MED_MAT_MISSING_VALUE;
114 string missing_method_str = "median";
115 int missing_method = -1;
116
117 // sanities
118 int test_for_inf = 1;
120
121 // prediction configuration
126 int predict_sum_times = 0;
127
128
129 // weights
130 float case_wgt = 1;
131
132 // ada boost mode
133 int nrounds = 1;
134 float min_p = (float)0.01;
135 float max_p = (float)0.99;
136 float alpha = 1;
137 float wgts_pow = 2;
138
139 // lists
140 float tuning_size = 0;
143
144 // tuning gradient descent parameters
145 float gd_rate = (float)0.01;
146 int gd_batch = 1000;
147 float gd_momentum = (float)0.95;
148 float gd_lambda = 0;
149 int gd_epochs = 0;
150
151 // verbosity
152 int verbosity = 0;
153 int ids_to_print = 30;
154 int debug = 0;
155
156 //========================================================================================================================
157
158
160 int init(map<string, string>& map);
161
162 // next are non serialized helpers we keep here as they are common to ALL the forest
163 vector<double> log_table;
164
165
166 // Serialization
167 ADD_CLASS_NAME(TQRF_Params)
168 ADD_SERIALIZATION_FUNCS(init_string, samples_time_unit, samples_time_unit_i, ncateg, time_slice_unit, time_slice_unit_i, time_slice_size, n_time_slices,
174
175 //ADD_SERIALIZATION_FUNCS(init_string, samples_time_unit, time_slice_unit, time_slice_size, time_slices, max_q, max_q_sample, tree_type, ntrees, max_depth, min_node_last_slice, min_node, )
176};
177
178
179//==========================================================================================================================
180// contains all the needed data for training including all quantizations (features, time slices) that are needed
181//==========================================================================================================================
183
184public:
185 vector<vector<short>> qx;
186 vector<vector<float>> q_to_val;
189
190 int nfeat = 0;
191 vector<string> feature_names;
192 int ncateg = 0;
193 vector<const vector<float> *> orig_data;
194 vector<string> feat_names;
195 vector<float> y;
196 vector<int> y_i;
197 vector<int> last_time_slice;
199 vector<int> slice_counts[2];
200
201 vector<vector<int>> lists;
204
206
207 // next are pre computed for bagging purposes
208 vector<vector<vector<int>>> time_categ_pids;
209 vector<vector<vector<int>>> time_categ_idx;
210 vector<vector<int>> categ_pids;
211 vector<vector<int>> categ_idx;
212 unordered_map<int, vector<vector<vector<int>>>> pid2time_categ_idx;
213
214 // next are helper arrays used when doing adaboost
215 vector<float> wgts;
216 vector<float> orig_wgts;
217 vector<float> probs;
218 vector<float> w_to_sum;
219 vector<vector<float>> sum_over_trees;
220 float alpha0;
221
222 int init(const MedFeatures &medf, TQRF_Params &params);
223
224 ~Quantized_Feat() { pid2time_categ_idx.clear(); }
225
226private:
227 int quantize_feat(int i_feat, TQRF_Params &params);
228 int init_time_slices(const MedFeatures &medf, TQRF_Params &params);
229 int init_pre_bagging(TQRF_Params &params);
230 int init_lists(const MedFeatures &medf, TQRF_Params &params);
231
232};
233
234
235//==========================================================================================================================
236// a basic node class : currently a single node type serves all trees .... could be changed to
237//==========================================================================================================================
239public:
240 // Next are must for every node and are ALWAYS serialized
241 int node_idx = -1;
242 int i_feat = -1;
243 float bound = (float)-1e10;
244 int is_terminal = 0;
245 int left_node = -1;
246 int right_node = -1;
247 int depth = -1;
248 int missing_direction = TQRF_MISSING_DIRECTION_RAND_EACH_SAMPLE;
249
250 // next are needed while learning , and if asked to keep samples in nodes - we keep them always for now
251 int from_idx = -1;
252 int to_idx = -1;
253 int size() { return to_idx-from_idx+1; }
254
255 int node_serialization_mask = 0x1;
256 int beta_idx = -1;
257
258 // categorical : mask |= 0x1 , time_categ_count[t][c] : how many counts in this node are in timeslice t and category c
259 vector<vector<float>> time_categ_count;
260
261 // regression : mask |= 0x2
262 //float pred_mean = (float)-1e10;
263 //float pred_std = (float)1;
264
265 // quantiles: mask |= 0x4
266 //vector<pair<float, float>> quantiles;
267
268
269 // following are never serialized - only for learn time
270 int state = TQRF_Node_State_Initiated; // 0 - created 1 - in process 2 - done with
271
272 ADD_CLASS_NAME(TQRF_Node)
273 ADD_SERIALIZATION_FUNCS(node_idx, i_feat, bound, is_terminal, left_node, right_node, depth, missing_direction, from_idx, to_idx,
274 node_serialization_mask, beta_idx, time_categ_count, state)
275};
276
277//----------------------------------------------------------------------------------------------------------------------
279
280};
281
282//==========================================================================================================================
283// Split_Stat contains the quantized data structures used in order to make a split decision
284// Basically :
285// for categorial outcomes:
286// for each time slot, for each quanta -> counts for each category
287// for regression outcomes:
288// for each time slot, for each quanta -> nvals and sum (??)
289//==========================================================================================================================
291
292public:
293 // categorial case
294 // vector<vector<vector<int>>> counts; /// counts[t][q][c] = how many counts were in time slot t, quanta q, and category c.
295
296 // suggestion (categorial case):
297 // TQRF_Split will get a node with indexes, and then:
298 // (1) Go over all the samples in the node and for each one add +1 to the relevant counts[t][q][c]
299 // (2) Will enable going over counts and choose the best q for splitting given some params (score type, minimal size, minimal change, etc)
300 //
301 // This may allow a very elegant code for a tree in which all the hard stuff is implemented inside.
302 //
303 // issues to think about:
304 // parallelism over nodes
305 // memory allocation - we want it single time
306 // tricks for efficient calculation of counts and the scores
307
308 virtual ~TQRF_Split_Stat() {};
309
310 virtual int init(Quantized_Feat &qf, TQRF_Params &params) { return 0;};
311 virtual int prep_histograms(int i_feat, TQRF_Node &node, vector<int> &indexes, Quantized_Feat &qf, TQRF_Params &params) { return 0; };
312 virtual int get_best_split(TQRF_Params &params, int &best_q, double &best_score) { return 0; };
313
314 int get_q_test_points(int feat_max_q, TQRF_Params &params, vector<int> &qpoints);
315
316
317 // helper vector for qpoints
318 vector<int> qpoints;
319
320 // debug
321 virtual void print_histograms() { return; };
322
323 // the actual number of q values used (full or after qpoints squeeze if it was done)
324 int counts_q = 0;
325
326
327 static TQRF_Split_Stat *make_tqrf_split_stat(int tree_type);
328};
329
330//==========================================================================================================================
332public:
333 // categorial case : counts[t][q][c] : time_slice , quanta, category : number of elements
334 vector<vector<vector<int>>> counts;
335
336 // sums[t][c] = number of samples in time slice t and category c summed on all q vals
337 // this is needed for a more efficient computation of scores later
338 vector<vector<int>> sums;
339
340 // sums_t[t] = number of samples in time slice t (needed later for more efficient calculations)
341 vector<int> sums_t;
342
343 int total_sum = 0; // sum of the sum_t vector
344
346
347 // next are for easy access
348 int ncateg = 0;
349 int nslices = 0;
350 int maxq = 0; // overall
351
352 // API's
353 int init(Quantized_Feat &qf, TQRF_Params &params);
354 int prep_histograms(int i_feat, TQRF_Node &node, vector<int> &indexes, Quantized_Feat &qf, TQRF_Params &params);
355 //virtual int get_best_split(TQRF_Params &params, int &best_q, float &best_score);
356
357 void print_histograms();
358};
359
360
361//==========================================================================================================================
363public:
364 int get_best_split(TQRF_Params &params, int &best_q, double &best_score);
365};
366
367
368//==========================================================================================================================
370public:
371 int get_best_split(TQRF_Params &params, int &best_q, double &best_score);
372};
373
374//==========================================================================================================================
376public:
377 // categorial case : counts[t][q][c] : time_slice , quanta, category : number of elements
378 vector<vector<vector<float>>> counts;
379
380 // sums[t][c] = number of samples in time slice t and category c summed on all q vals
381 // this is needed for a more efficient computation of scores later
382 vector<vector<float>> sums;
383
384 // sums_t[t] = number of samples in time slice t (needed later for more efficient calculations)
385 vector<float> sums_t;
386
387 float total_sum = 0; // sum of the sum_t vector
388
390
391 // next are for easy access
392 int ncateg = 0;
393 int nslices = 0;
394 int maxq = 0; // overall
395
396 // API's
397 int init(Quantized_Feat &qf, TQRF_Params &params);
398 int prep_histograms(int i_feat, TQRF_Node &node, vector<int> &indexes, Quantized_Feat &qf, TQRF_Params &params);
399 //virtual int get_best_split(TQRF_Params &params, int &best_q, float &best_score);
400
401 void print_histograms();
402};
403
404//==========================================================================================================================
406public:
407 int get_best_split(TQRF_Params &params, int &best_q, double &best_score);
408};
409
410//==========================================================================================================================
412public:
413 ~TQRF_Split_Dev() {};
414 int get_best_split(TQRF_Params &params, int &best_q, double &best_score) { return 0; };
415};
416
417//==========================================================================================================================
419public:
420 // categorial case
421 vector<vector<vector<pair<float,int>>>> sum_num;
422
423 int init(Quantized_Feat &qf, TQRF_Params &params) { return 0; };
424 int prep_histograms(int i_feat, TQRF_Node &node, vector<int> &indexes, Quantized_Feat &qf, TQRF_Params &params) { return 0; };
425 int get_best_split(TQRF_Params &params, int &best_q, double &best_score) { return 0; };
426};
427
428//==========================================================================================================================
429// A tree base class
430//==========================================================================================================================
432
433public:
434 // next are needed also for predictions, and hence should be serialized
435 int tree_type;
436 int id; // for debug prints - a specific tree identifier
437 int keep_indexes = 0;
438 vector<int> indexes; // indexes[i] = an index of a sample in the given Quantized_Feat
439 vector<TQRF_Node> nodes; // this node supports currently all possible nodes for all trees... to save ugly templated code
440
441 // next variables are no-need-to-serialize helpers
442 vector<int> i_feats; // feature indexes to be used in this tree (they can be bagged as well)
443
444 TQRF_Tree() {};
445
446 void init(Quantized_Feat &qfeat, TQRF_Params &params) { _qfeat = &qfeat; _params = &params; }
447 int Train(Quantized_Feat &qfeat, TQRF_Params &params) { init(qfeat, params); return Train(); }
448
449 const TQRF_Node *Get_Node_for_predict(MedMat<float> &x, int i_row, float missing_val, int &beta_idx) const;
450 TQRF_Node *Get_Node(MedMat<float> &x, int i_row, float missing_val);
451
452 int Train();
453
454 // helpers inside Train:
455
456 // get indexes vector ready
457 int get_bagged_indexes();
458
459 // initialize root node
460 int init_root_node();
461
462 // get the next node to work on
463 int get_next_node(int curr_node);
464
465 // get the list of features to work on
466 int get_feats_to_test(vector<int> &feats_to_test);
467
468 // init the vector for splits: to nfeat, right sizes, and right type + a free mem api
469 int init_split_stats(vector<TQRF_Split_Stat *> &tqs);
470 void free_split_stats(vector<TQRF_Split_Stat *> &tqs);
471
472 // close work on current node and make the split if needed
473 int node_splitter(int i_curr_node, int i_best, int q_best);
474
475 // once a node is finalized : prepares its internal counts (with or without taking weights into account)
476 float prep_node_counts(int i_curr_node, int use_wgts_flag);
477
478
479 void pre_serialization() { if (keep_indexes == 0) indexes.clear(); }
480 ADD_CLASS_NAME(TQRF_Tree)
481 ADD_SERIALIZATION_FUNCS(tree_type, id, keep_indexes, indexes, nodes)
482
483private:
484 Quantized_Feat *_qfeat;
485 TQRF_Params *_params;
486
487 // next used to manage nodes while building
488 int n_nodes_in_process = 0;
489 int i_last_node_in_process = 0;
490
491 int bag_chooser(float p, int _t, int _c, /* OUT APPEND */ vector<int> &_indexes);
492 int bag_chooser(int choose_with_repeats, int single_sample_per_id, float p, vector<int> &pids, vector<int> &idx, unordered_map<int, vector<int>> &pid2idx, /* OUT APPEND */ vector<int> &_indexes);
493
494
495};
496
497
498
499//==========================================================================================================================
501
502public:
503
504 TQRF_Params params;
505 vector<TQRF_Tree> trees;
506 vector<float> alphas;
507 vector<float> betas;
508
509 int init(map<string, string>& map) { return params.init(map); }
510 int init_from_string(string init_string) { params.init_string = init_string; return SerializableObject::init_from_string(init_string); }
511
512 void init_tables(Quantized_Feat &qfeat);
513
518 int Train(const MedFeatures &medf, const MedMat<float> &Y);
519 int Train(const MedFeatures &medf) { MedMat<float> dummy; return Train(medf, dummy); }
520
521 int Train_AdaBoost(const MedFeatures &medf, const MedMat<float> &Y);
522 int update_counts(vector<vector<float>> &sample_counts, MedMat<float> &x, Quantized_Feat &qf, int zero_counts, int round);
523
526 int tune_betas(Quantized_Feat &qfeat);
527 int solve_betas_gd(MedMat<float>& C, MedMat<float>& S, vector<float> &b);
528
531 int Predict(MedMat<float> &x, vector<float> &preds) const;
532 int n_preds_per_sample() const;
533
534 int Predict_Categorial(MedMat<float> &x, vector<float> &preds) const; // currently like this... with time should consider inheritance to do it right.
535
536 // print average bagging reports
537 void print_average_bagging(int _n_time_slices, int _n_categ);
538
539 // simple helpers
540 static int get_tree_type(const string &str);
541 static int get_missing_value_method(const string &str);
542
543 ADD_CLASS_NAME(TQRF_Forest)
544 ADD_SERIALIZATION_FUNCS(params, trees, alphas, betas)
545
546private:
547
548};
549
550// helper struct
552 int i_tree = -1;
553 int i_node = -1;
554 TreeNodeIdx(int i_t, int i_n) { i_tree = i_t; i_node = i_n; }
555};
556
557
560
561public:
562
563 vector<vector<int>> all_indexes;
564
565 void init_all_indexes(vector<TQRF_Tree> &trees) {
566 for (auto &tree : trees)
567 all_indexes.push_back(tree.indexes);
568 }
569
570 ADD_CLASS_NAME(AllIndexes)
571 ADD_SERIALIZATION_FUNCS(all_indexes)
572};
573
574//========================================
575// Join the serialization Waggon
576//========================================
582
583#endif
Logger.h - allowing logs with more control.
MedTime.h.
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
next is for debugging
Definition TQRF.h:559
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
Definition MedMat.h:63
Definition TQRF.h:182
int nfeat
pointer to the original MedFeatures
Definition TQRF.h:190
vector< vector< float > > q_to_val
a vector of features that mimics the input x_in features matrix, but coded into quantized values
Definition TQRF.h:186
vector< string > feature_names
just an easy helper that = qx.size()
Definition TQRF.h:191
const MedFeatures * orig_medf
from a q value to float value : q=0 is reserved for missing value the range for q>0 is : [q_to_val[q]...
Definition TQRF.h:188
vector< string > feat_names
pointers to the original data given
Definition TQRF.h:194
vector< const vector< float > * > orig_data
ncateg 0 is regression, otherwise categories are assumed to be 0 ... ncateg-1
Definition TQRF.h:193
int ncateg
useful for debugging
Definition TQRF.h:192
vector< float > y
as given in train
Definition TQRF.h:195
int n_time_slices
when there's more than 1 time slice there may be censoring involved and the last_time_slice is the la...
Definition TQRF.h:198
vector< int > slice_counts[2]
1 time slice is simply the regular case of a label for the whole future
Definition TQRF.h:199
vector< int > is_categorial_feat
lists[0] is always the lines used for training the trees in round 1 the others can be used for later ...
Definition TQRF.h:205
vector< vector< int > > lists
counts of elements in slices (in case of non regression trees). slices with no variability are not in...
Definition TQRF.h:201
Definition SerializableObject.h:32
int init_from_string(string init_string)
Init from string.
Definition SerializableObject.cpp:121
Definition TQRF.h:500
int Train(const MedFeatures &medf, const MedMat< float > &Y)
The basic train matrix for TQRF is MedFeatures (!!) the reason is that it contains everything in one ...
Definition TQRF.cpp:622
int tune_betas(Quantized_Feat &qfeat)
tuning : solving a gd problem of finding the optimal betas for nodes at some certain chosen depth in ...
Definition TQRF.cpp:2266
int init(map< string, string > &map)
Virtual to init object from parsed fields.
Definition TQRF.h:509
int Predict(MedMat< float > &x, vector< float > &preds) const
However - the basic predict for this model is MedMat !! , as here it is much simpler : we only need t...
Definition TQRF.cpp:882
Definition TQRF.h:278
Definition TQRF.h:238
int from_idx
0: left , 1: right , 2: randomize each sample
Definition TQRF.h:251
int is_terminal
samples with <= bound go to Left , the other to Right
Definition TQRF.h:244
int to_idx
the node elements are those given in its tree indexes from place from_idx, to to_idx.
Definition TQRF.h:252
int i_feat
for debugging and prints
Definition TQRF.h:242
float bound
index of feature used in this node
Definition TQRF.h:243
int beta_idx
choose which of the following to serialize
Definition TQRF.h:256
Definition TQRF.h:55
float gd_lambda
gradient descent momentum
Definition TQRF.h:148
float gd_momentum
gradient descent batch size
Definition TQRF.h:147
int single_sample_per_pid
when a node is bigger than this number : choose this number of random samples to make decisions
Definition TQRF.h:99
int nrounds
the weight to use for cases with y!=0 in a weighted case
Definition TQRF.h:133
float bag_prob
whether to bag with repeats or not
Definition TQRF.h:101
string time_slice_unit
number of categories (1 for regression)
Definition TQRF.h:68
int min_node
stopping criteria : minimal number of samples in a node in the last time slice
Definition TQRF.h:87
int max_node_test_samples
-1: check all splits for each feature , then split the max, > 0: choose this number of split points a...
Definition TQRF.h:96
int tune_min_node_size
max depth of a node to get a weight for. 0 means 1 weight per tree.
Definition TQRF.h:142
float tuning_size
power for the pow(-log(p), wgts_pow) used for adaboost weights
Definition TQRF.h:140
int gd_batch
gradient descent step size
Definition TQRF.h:146
int debug
control debug prints in certain places
Definition TQRF.h:154
vector< string > categorial_str
features with number of different values below nvals_for_categ will be assumed categorial
Definition TQRF.h:108
float case_wgt
will sum predictions over different times
Definition TQRF.h:130
int gd_epochs
regularization
Definition TQRF.h:149
int max_depth
number of trees to learn
Definition TQRF.h:85
int predict_from_slice
relevant only to categorial predictions: -1: give all categs, 0 and above: give only those categs rem...
Definition TQRF.h:124
float random_split_prob
stopping criteria : minimal number of samples in a node in the first time slice
Definition TQRF.h:88
int nvals_for_categorial
if > 0 : will only choose this random number of points to test split points at, otherwise will test a...
Definition TQRF.h:107
int test_for_missing
will fail on non finite values in input data
Definition TQRF.h:119
float alpha
maximal probability to trip to when recalculating weights
Definition TQRF.h:136
int min_node_last_slice
maximal depth of tree
Definition TQRF.h:86
vector< float > time_slices_wgts
if not empty: defines the borders of all the time lines. Enables a very flexible time slicing strateg...
Definition TQRF.h:74
int predict_to_slice
will give predictions for slices [predict_from_slice,...,predict_to_slice]. if negative: all slices.
Definition TQRF.h:125
string tree_type
maximal quantization
Definition TQRF.h:82
float bag_ratio
random choice of samples for each tree prob
Definition TQRF.h:102
int time_slice_size
calculated upon init
Definition TQRF.h:70
int ncateg
calculated upon init
Definition TQRF.h:65
float max_p
minimal probability to trim to when recalculating weights
Definition TQRF.h:135
float ntry_prob
-1: use the ntry_prob rule, > 0 : choose this number of features.
Definition TQRF.h:92
int ntrees
tree type code : calculated from the tree_type string
Definition TQRF.h:84
int ids_to_print
for debug prints
Definition TQRF.h:153
int tune_max_depth
size of group to tune tree weights by.
Definition TQRF.h:141
float missing_val
all features containing these tags will be assumed categorial
Definition TQRF.h:113
int n_time_slices
the size of the basic time slice, -1: is like infinity: a single time slice like a regular QRF
Definition TQRF.h:71
float min_p
a single round means simply running TQRF as defined with no boosting applied
Definition TQRF.h:134
string missing_method_str
missing value
Definition TQRF.h:114
int test_for_inf
to be initialized from missing_method_str
Definition TQRF.h:118
float gd_rate
min node size for a node to have a weight
Definition TQRF.h:145
float wgts_pow
shrinkage factor
Definition TQRF.h:137
float bag_feat
control ratio of #0 : #NonZero of labels, if < -1 , leave as is.
Definition TQRF.h:103
int init(map< string, string > &map)
extra param for use when debugging
Definition TQRF.cpp:19
int bag_with_repeats
when bagging select a single sample per pid (which in itself can be repeated)
Definition TQRF.h:100
vector< string > categorial_tags
all features containing one of the strings defined here in their name will be assumed categorial
Definition TQRF.h:109
int max_q
when calculating the time slices distributions we have an option to NOT count the preceding 0's of non...
Definition TQRF.h:78
int censor_cases
default is all 1.0 , but can be assigned by the user, will be used to weight the scores from differen...
Definition TQRF.h:75
int nsplits
choose ntry_prob * nfeat features each time
Definition TQRF.h:93
vector< int > time_slices
if time_slices vector is not given, one will be created using time_slice_size and this parameter.
Definition TQRF.h:72
int ntry
at this probability we will split a node in a random manner, in order to add noise to the tree.
Definition TQRF.h:91
int only_this_categ
will fail if missing value found in data
Definition TQRF.h:122
int verbosity
0 : stop automatically , Otherwise: do this number of epochs
Definition TQRF.h:152
int qpoints_per_split
proportion of random features chosen for each tree
Definition TQRF.h:104
string samples_time_unit
sometimes it helps to keep it for debugging
Definition TQRF.h:62
int tree_type_i
options: regression, entropy, logrank
Definition TQRF.h:83
int missing_method
how to handle missing values: median , left, right, mean, rand
Definition TQRF.h:115
Definition TQRF.h:331
Definition TQRF.h:411
Definition TQRF.h:369
Definition TQRF.h:362
Definition TQRF.h:418
Definition TQRF.h:290
Definition TQRF.h:375
Definition TQRF.h:405
Definition TQRF.h:431
Definition StdDeque.h:58
Definition TQRF.h:551