9#include <MedProcessTools/MedProcessTools/MedFeatures.h>
10#include <MedMat/MedMat/MedMat.h>
17 TQRF_TREE_ENTROPY = 0,
18 TQRF_TREE_REGRESSION = 1,
19 TQRF_TREE_LIKELIHOOD = 2,
20 TQRF_TREE_WEIGHTED_LIKELIHOOD = 3,
22 TQRF_TREE_UNDEFINED = 5
/// Codes for the available ways of treating missing feature values.
/// NOTE(review): presumably the values that get_missing_value_method() maps a
/// method string onto - confirm against the implementation file.
enum TQRF_Missing_Value_Method {
	TQRF_MISSING_VALUE_MEAN = 0,
	TQRF_MISSING_VALUE_MEDIAN = 1,
	TQRF_MISSING_VALUE_LARGER_NODE = 2,
	TQRF_MISSING_VALUE_LEFT = 3,
	TQRF_MISSING_VALUE_RIGHT = 4,
	TQRF_MISSING_VALUE_RAND_ALL = 5,
	TQRF_MISSING_VALUE_RAND_EACH_SAMPLE = 6,
	TQRF_MISSING_VALUE_NOTHING = 7
};
/// Processing state of a tree node; a node's `state` member starts out as
/// TQRF_Node_State_Initiated.
enum TQRF_Node_Working_State {
	TQRF_Node_State_Initiated = 0,
	TQRF_Node_State_In_Progress = 1,
	TQRF_Node_State_Done = 2
};
/// Direction a node sends samples with a missing value:
/// 0: left, 1: right, 2: randomize each sample.
enum TQRF_Missing_Direction {
	TQRF_MISSING_DIRECTION_LEFT = 0,
	TQRF_MISSING_DIRECTION_RIGHT = 1,
	TQRF_MISSING_DIRECTION_RAND_EACH_SAMPLE = 2
};
// NOTE(review): presumably an upper bound / "infinite" marker for a time slice - confirm.
#define TQRF_MAX_TIME_SLICE 10000000
// NOTE(review): presumably the minimal sample count required in a time slice - confirm.
#define MIN_ELEMENTS_IN_TIME_SLICE 100
// Tiny negative float; NOTE(review): presumably a sentinel for a beta that was never assigned - confirm.
#define UNSET_BETA ((float)-1e-10)
61 string init_string =
"";
63 int samples_time_unit_i;
69 int time_slice_unit_i;
126 int predict_sum_times = 0;
160 int init(map<string, string>& map);
163 vector<double> log_table;
169 time_slices,
time_slices_wgts,
censor_cases,
max_q,
tree_type,
tree_type_i,
ntrees,
max_depth,
min_node_last_slice,
min_node,
random_split_prob,
ntry,
ntry_prob,
170 nsplits,
max_node_test_samples,
single_sample_per_pid,
bag_with_repeats,
bag_prob,
bag_ratio,
bag_feat,
qpoints_per_split,
nvals_for_categorial,
categorial_str,
172 predict_sum_times,
case_wgt,
nrounds,
min_p,
max_p,
alpha,
wgts_pow,
tuning_size,
tune_max_depth,
tune_min_node_size,
gd_rate,
gd_batch,
gd_momentum,
185 vector<vector<short>> qx;
197 vector<int> last_time_slice;
208 vector<vector<vector<int>>> time_categ_pids;
209 vector<vector<vector<int>>> time_categ_idx;
210 vector<vector<int>> categ_pids;
211 vector<vector<int>> categ_idx;
212 unordered_map<int, vector<vector<vector<int>>>> pid2time_categ_idx;
216 vector<float> orig_wgts;
218 vector<float> w_to_sum;
219 vector<vector<float>> sum_over_trees;
227 int quantize_feat(
int i_feat,
TQRF_Params ¶ms);
248 int missing_direction = TQRF_MISSING_DIRECTION_RAND_EACH_SAMPLE;
255 int node_serialization_mask = 0x1;
259 vector<vector<float>> time_categ_count;
270 int state = TQRF_Node_State_Initiated;
273 ADD_SERIALIZATION_FUNCS(node_idx,
i_feat,
bound,
is_terminal, left_node, right_node, depth, missing_direction,
from_idx,
to_idx,
274 node_serialization_mask,
beta_idx, time_categ_count, state)
312 virtual int get_best_split(
TQRF_Params ¶ms,
int &best_q,
double &best_score) {
return 0; };
314 int get_q_test_points(
int feat_max_q,
TQRF_Params ¶ms, vector<int> &qpoints);
321 virtual void print_histograms() {
return; };
334 vector<vector<vector<int>>> counts;
338 vector<vector<int>> sums;
357 void print_histograms();
364 int get_best_split(
TQRF_Params ¶ms,
int &best_q,
double &best_score);
371 int get_best_split(
TQRF_Params ¶ms,
int &best_q,
double &best_score);
378 vector<vector<vector<float>>> counts;
382 vector<vector<float>> sums;
385 vector<float> sums_t;
401 void print_histograms();
407 int get_best_split(
TQRF_Params ¶ms,
int &best_q,
double &best_score);
414 int get_best_split(
TQRF_Params ¶ms,
int &best_q,
double &best_score) {
return 0; };
421 vector<vector<vector<pair<float,int>>>> sum_num;
425 int get_best_split(
TQRF_Params ¶ms,
int &best_q,
double &best_score) {
return 0; };
437 int keep_indexes = 0;
439 vector<TQRF_Node> nodes;
449 const TQRF_Node *Get_Node_for_predict(
MedMat<float> &x,
int i_row,
float missing_val,
int &beta_idx)
const;
457 int get_bagged_indexes();
460 int init_root_node();
463 int get_next_node(
int curr_node);
466 int get_feats_to_test(vector<int> &feats_to_test);
469 int init_split_stats(vector<TQRF_Split_Stat *> &tqs);
470 void free_split_stats(vector<TQRF_Split_Stat *> &tqs);
473 int node_splitter(
int i_curr_node,
int i_best,
int q_best);
476 float prep_node_counts(
int i_curr_node,
int use_wgts_flag);
479 void pre_serialization() {
if (keep_indexes == 0) indexes.clear(); }
488 int n_nodes_in_process = 0;
489 int i_last_node_in_process = 0;
491 int bag_chooser(
float p,
int _t,
int _c, vector<int> &_indexes);
492 int bag_chooser(
int choose_with_repeats,
int single_sample_per_id,
float p, vector<int> &pids, vector<int> &idx, unordered_map<
int, vector<int>> &pid2idx, vector<int> &_indexes);
505 vector<TQRF_Tree> trees;
506 vector<float> alphas;
509 int init(map<string, string>& map) {
return params.init(map); }
532 int n_preds_per_sample()
const;
534 int Predict_Categorial(
MedMat<float> &x, vector<float> &preds)
const;
537 void print_average_bagging(
int _n_time_slices,
int _n_categ);
540 static int get_tree_type(
const string &str);
541 static int get_missing_value_method(
const string &str);
554 TreeNodeIdx(
int i_t,
int i_n) { i_tree = i_t; i_node = i_n; }
563 vector<vector<int>> all_indexes;
565 void init_all_indexes(vector<TQRF_Tree> &trees) {
566 for (
auto &tree : trees)
567 all_indexes.push_back(tree.indexes);
Logger.h - allowing logs with more control.
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
next is for debugging
Definition TQRF.h:559
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
int nfeat
pointer to the original MedFeatures
Definition TQRF.h:190
vector< vector< float > > q_to_val
a vector of features that mimics the input x_in features matrix, but coded into quantized values
Definition TQRF.h:186
vector< string > feature_names
just an easy helper that = qx.size()
Definition TQRF.h:191
const MedFeatures * orig_medf
from a q value to float value : q=0 is reserved for missing value the range for q>0 is : [q_to_val[q]...
Definition TQRF.h:188
vector< string > feat_names
pointers to the original data given
Definition TQRF.h:194
vector< const vector< float > * > orig_data
ncateg 0 is regression, otherwise categories are assumed to be 0 ... ncateg-1
Definition TQRF.h:193
int ncateg
useful for debugging
Definition TQRF.h:192
vector< float > y
as given in train
Definition TQRF.h:195
int n_time_slices
when there's more than 1 time slice there may be censoring involved and the last_time_slice is the la...
Definition TQRF.h:198
vector< int > slice_counts[2]
1 time slice is simply the regular case of a label for the whole future
Definition TQRF.h:199
vector< int > is_categorial_feat
lists[0] is always the lines used for training the trees in round 1 the others can be used for later ...
Definition TQRF.h:205
vector< vector< int > > lists
counts of elements in slices (in case of non regression trees). slices with no variability are not in...
Definition TQRF.h:201
Definition SerializableObject.h:32
int init_from_string(string init_string)
Init from string.
Definition SerializableObject.cpp:121
int Train(const MedFeatures &medf, const MedMat< float > &Y)
The basic train matrix for TQRF is MedFeatures (!!) the reason is that it contains everything in one ...
Definition TQRF.cpp:622
int tune_betas(Quantized_Feat &qfeat)
tuning : solving a gd problem of finding the optimal betas for nodes at some certain chosen depth in ...
Definition TQRF.cpp:2266
int init(map< string, string > &map)
Virtual to init object from parsed fields.
Definition TQRF.h:509
int Predict(MedMat< float > &x, vector< float > &preds) const
However - the basic predict for this model is MedMat !! , as here it is much simpler : we only need t...
Definition TQRF.cpp:882
int from_idx
0: left , 1: right , 2: randomize each sample
Definition TQRF.h:251
int is_terminal
samples with <= bound go to Left , the other to Right
Definition TQRF.h:244
int to_idx
the node elements are those given in its tree indexes from place from_idx, to to_idx.
Definition TQRF.h:252
int i_feat
for debugging and prints
Definition TQRF.h:242
float bound
index of feature used in this node
Definition TQRF.h:243
int beta_idx
choose which of the following to serialize
Definition TQRF.h:256
float gd_lambda
gradient descent momentum
Definition TQRF.h:148
float gd_momentum
gradient descent batch size
Definition TQRF.h:147
int single_sample_per_pid
when a node is bigger than this number : choose this number of random samples to make decisions
Definition TQRF.h:99
int nrounds
the weight to use for cases with y!=0 in a weighted case
Definition TQRF.h:133
float bag_prob
whether to bag with repeats or not
Definition TQRF.h:101
string time_slice_unit
number of categories (1 for regression)
Definition TQRF.h:68
int min_node
stopping criteria : minimal number of samples in a node in the last time slice
Definition TQRF.h:87
int max_node_test_samples
-1: check all splits for each feature , then split the max, > 0: choose this number of split points a...
Definition TQRF.h:96
int tune_min_node_size
max depth of a node to get a weight for. 0 means 1 weight per tree.
Definition TQRF.h:142
float tuning_size
power for the pow(-log(p), wgts_pow) used for adaboost weights
Definition TQRF.h:140
int gd_batch
gradient descent step size
Definition TQRF.h:146
int debug
control debug prints in certain places
Definition TQRF.h:154
vector< string > categorial_str
features with number of different values below nvals_for_categ will be assumed categorial
Definition TQRF.h:108
float case_wgt
will sum predictions over different times
Definition TQRF.h:130
int gd_epochs
regularization
Definition TQRF.h:149
int max_depth
number of trees to learn
Definition TQRF.h:85
int predict_from_slice
relevant only to categorial predictions: -1: give all categs, 0 and above: give only those categs rem...
Definition TQRF.h:124
float random_split_prob
stopping criteria : minimal number of samples in a node in the first time slice
Definition TQRF.h:88
int nvals_for_categorial
if > 0 : will only choose this random number of points to test split points at, otherwise will test a...
Definition TQRF.h:107
int test_for_missing
will fail on non finite values in input data
Definition TQRF.h:119
float alpha
maximal probability to trim to when recalculating weights
Definition TQRF.h:136
int min_node_last_slice
maximal depth of tree
Definition TQRF.h:86
vector< float > time_slices_wgts
if not empty: defines the borders of all the time lines. Enables a very flexible time slicing strateg...
Definition TQRF.h:74
int predict_to_slice
will give predictions for slices [predict_from_slice,...,predict_to_slice]. if negative: all slices.
Definition TQRF.h:125
string tree_type
maximal quantization
Definition TQRF.h:82
float bag_ratio
random choice of samples for each tree prob
Definition TQRF.h:102
int time_slice_size
calculated upon init
Definition TQRF.h:70
int ncateg
calculated upon init
Definition TQRF.h:65
float max_p
minimal probability to trim to when recalculating weights
Definition TQRF.h:135
float ntry_prob
-1: use the ntry_prob rule, > 0 : choose this number of features.
Definition TQRF.h:92
int ntrees
tree type code : calculated from tree_type the string
Definition TQRF.h:84
int ids_to_print
for debug prints
Definition TQRF.h:153
int tune_max_depth
size of group to tune tree weights by.
Definition TQRF.h:141
float missing_val
all features containing these tags will be assumed categorial
Definition TQRF.h:113
int n_time_slices
the size of the basic time slice, -1: is like infinity: a single time slice like a regular QRF
Definition TQRF.h:71
float min_p
a single round means simply running TQRF as defined with no boosting applied
Definition TQRF.h:134
string missing_method_str
missing value
Definition TQRF.h:114
int test_for_inf
to be initialized from missing_method_str
Definition TQRF.h:118
float gd_rate
min node size for a node to have a weight
Definition TQRF.h:145
float wgts_pow
shrinkage factor
Definition TQRF.h:137
float bag_feat
control ratio of #0 : #NonZero of labels, if < -1 , leave as is.
Definition TQRF.h:103
int init(map< string, string > &map)
extra param for use when debugging
Definition TQRF.cpp:19
int bag_with_repeats
when bagging select a single sample per pid (which in itself can be repeated)
Definition TQRF.h:100
vector< string > categorial_tags
all features containing one of the strings defined here in their name will be assumed categorial
Definition TQRF.h:109
int max_q
when calculating the time slices distributions we have an option to NOT count the preceding 0's of non...
Definition TQRF.h:78
int censor_cases
default is all 1.0 , but can be assigned by the user, will be used to weight the scores from differen...
Definition TQRF.h:75
int nsplits
choose ntry_prob * nfeat features each time
Definition TQRF.h:93
vector< int > time_slices
if time_slices vector is not given, one will be created using time_slice_size and this parameter.
Definition TQRF.h:72
int ntry
at this probability we will split a node in a random manner, in order to add noise to the tree.
Definition TQRF.h:91
int only_this_categ
will fail if missing value found in data
Definition TQRF.h:122
int verbosity
0 : stop automatically , Otherwise: do this number of epochs
Definition TQRF.h:152
int qpoints_per_split
proportion of random features chosen for each tree
Definition TQRF.h:104
string samples_time_unit
sometimes it helps to keep it for debugging
Definition TQRF.h:62
int tree_type_i
options: regression, entropy, logrank
Definition TQRF.h:83
int missing_method
how to handle missing values: median , left, right, mean, rand
Definition TQRF.h:115