Medial Code Documentation
Loading...
Searching...
No Matches
IterativeImputer.h
1//#pragma once
2#ifndef _ITERATIVEIMPUTER_H_
3#define _ITERATIVEIMPUTER_H_
6#include <unordered_set>
7
8//=========================================================================
9// IterativeImputer :: Inner Class : the actual implementation
10//=========================================================================
12public:
13 // params
14 vector<string> features_to_impute = {}; // if empty : impute all
15 string regressor = "qrf";
16 string regressor_params = "type=regression;ntrees=30;min_node=100;spread=0.1;learn_nthreads=40;predict_nthreads=40";
17 // string regressor = "linear_model";
18 // string regressor_params = "rfactor=0.9";
19 string multi_categ_classifier = "qrf";
20 string multi_categ_classifier_params = "type=categorical_entropy;ntrees=30;min_node=10;learn_nthreads=40;predict_nthreads=40";
21 string add_ncateg_var_name = "n_categ"; // for algorithms that need it
22 string round1_strata = "Age,40,80,5:Gender,1,2,1";
23 int do_round1 = 1;
24 int round1_moment = 0; // 0 mean , 1 median
25 int categorial_bound = 0; // if n diff values for a feature < this bound --> we declare it categorial
26 int max_iterations = 1;
27 float p_validation = (float)0.1; // helping to print intermediate results
28 int min_vals_for_training = 10000;
29 float missing_value = MED_MAT_MISSING_VALUE;
30 int round_to_resolution = 1;
31 int verbose = 1;
32 float missing_bound = 0.5;
33
34 // a few constant params
35 int min_vals_for_impute = 1000; // min number of samples to run impute rounds.
36
37
38 int init(map<string, string>& mapper);
39
40 ADD_CLASS_NAME(IterativeImputerParams)
41 ADD_SERIALIZATION_FUNCS(features_to_impute, regressor, regressor_params, multi_categ_classifier, multi_categ_classifier_params, add_ncateg_var_name,
42 round1_strata, do_round1, round1_moment, categorial_bound, max_iterations, p_validation, min_vals_for_training, missing_value,
43 round_to_resolution, verbose, missing_bound);
44
45};
46
47
49public:
50 string name = "";
51 string full_name = "";
52 int n_diff_vals = 0;
53 int is_categorial = 0;
54 float min = (float)1e10;
55 float max = (float)-1e10;
56 float resolution = 0;
57 int predictor_type = 0;
58 vector<int> inds_for_pred;
59
60 // helper fields below are not serialized
61 int n_missing = 0;
62 int n_with_values = 0;
63 int n_with_non_zero_values = 0;
64 float *data = NULL;
65 int data_len = 0;
66 vector<char> is_missing;
67 vector<string> feats_for_pred; // full names of features to be used in the prediction matrix
68 vector<int> train_idx;
69 vector<int> test_idx;
70 vector<int> pred_idx;
71
72 int prep_feats_for_pred(MedFeatures &mfd);
73 int prep_indexes(const vector<int> &external_train_idx, const vector<int> &external_test_idx, float missing_value);
74
75
// Print a one-line summary of this feature's fields to stderr: name, full_name,
// data_len, n_missing (plus its percentage of data_len), n_with_values,
// n_with_non_zero_values, n_diff_vals, is_categorial, min, max and resolution.
// NOTE(review): the percentage is computed as 100 * n_missing / data_len in float;
// data_len is initialized to 0 in this class, so calling print() before data_len
// is set performs a floating-point division by zero -- confirm callers set it first.
76 void print() {
77 fprintf(stderr, "Feature Info :: %s :: %s :: data_len %d : n_missing %d ( %5.2f ): n_with %d ( non zero %d ): n_diff_vals %d : categorial %d : min %f : max %f : resolution %f\n",
78 name.c_str(), full_name.c_str(), data_len, n_missing, (float)100 * n_missing / (float)data_len, n_with_values, n_with_non_zero_values, n_diff_vals, is_categorial, min, max, resolution);
79 }
80
81 ADD_CLASS_NAME(feature_info)
82 ADD_SERIALIZATION_FUNCS(name, full_name, n_diff_vals, is_categorial, min, max, resolution, predictor_type, inds_for_pred);
83};
84
96public:
97
98 IterativeImputerParams params; // needs serialization
99
100 // helpers not for serialization
101 vector<int> train_idx, test_idx;
102 vector<char> is_train;
103 MedFeatures learn_features_map; // in the learn stage we must keep a copy of our MedFeatures since we change it with each step and iteration
104 unordered_set<int> train_ids, test_ids;
105
106
107 // helpers for (partial) serialization
108 vector<feature_info> feats;
109
110 // First round imputers (needs serialization)
111 vector<FeatureImputer> first_round_imputers;
112
113 // Iterations imputers for each split (needs serialization)
114 vector<int> predictors_order;
115 vector<vector<MedPredictor *>> predictors; // predictors[i][j] = the predictor at iteration i , for feature j
116
// Initialize from the parsed init fields (name -> value strings) by forwarding
// the map to params.init(); returns whatever params.init() returns.
119 int init(map<string, string>& mapper) { return params.init(mapper); }
120
121 int init_internals(MedFeatures &mfd);
122 int init_feature_info(MedFeatures &mfd, string feat_name);
123 int init_feature_info_update(MedFeatures &mfd, feature_info &fi);
124 int round_to_resolution(MedFeatures &mfd);
125 int round_arr(float *arr, int len, float resolution, float _min, float _max);
126 int learn_first_round(MedFeatures &mfd);
127 int learn_iteration(MedFeatures &mfd, int iter);
128 int apply_first_round(MedFeatures &mfd, bool learning);
129 int apply_iteration(MedFeatures &mfd, int iter);
130
131 int find_feats_to_learn_from(int f_idx);
132 int feats_for_pred_inds_to_names(feature_info &fi);
133
134
135 //int Learn(MedFeatures &mfd) { init_internals(mfd); }
136 //int Apply(MedFeatures &mfd) { fprintf(stderr, "IterativeImputer::Apply() NOT IMPLEMENTED YET\n"); }
137 int Learn(MedFeatures &mfd);
138 int Apply(MedFeatures &mfd, bool learning);
139
140 // Serialization
141 ADD_CLASS_NAME(IterativeImputer)
142 ADD_SERIALIZATION_FUNCS(params, feats, first_round_imputers, predictors_order, predictors)
143
144};
145
146
147//=============================================================================
148// FeatureIterativeImputer :: Wrapper of IterativeImputer as a FeatureProcessor
149//=============================================================================
151public:
152
153 // holding an IterativeImputer instance
154 IterativeImputer imputer;
155
156 // Constructor
158
// Initialize from the parsed init fields by forwarding the map to the held
// IterativeImputer instance; returns the result of imputer.init().
161 int init(map<string, string>& mapper) { return imputer.init(mapper); }
162
163 // Learn imputing model
164 int Learn(MedFeatures& features, unordered_set<int>& ids);
// Learn overload without an ids subset: forwards the features to
// imputer.Learn() and returns its result.
165 int Learn(MedFeatures& features) { return imputer.Learn(features); }
166
167 // Apply imputing model
168 int _apply(MedFeatures& features, unordered_set<int>& ids, bool learning);
// _apply overload without an ids subset: forwards the features and the
// 'learning' flag to imputer.Apply() and returns its result.
169 int _apply(MedFeatures& features, bool learning) { return imputer.Apply(features, learning); }
170
172 bool are_features_affected(unordered_set<string>& out_req_features);
173
175 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
176
177 // Serialization
178 ADD_CLASS_NAME(FeatureIterativeImputer)
179 ADD_SERIALIZATION_FUNCS(processor_type, imputer)
180
181};
182
183
184//=======================================================
185// Join the MedSerialize Wagon
186//=======================================================
191#endif
A virtual class of processes on MedFeatures; E.g.
@ FTR_PROCESS_ITERATIVE_IMPUTER
"iterative_imputer" to create IterativeImputer
Definition FeatureProcess.h:34
MedAlgo - APIs to different algorithms: Linear Models, RF, GBM, KNN, and more.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Definition IterativeImputer.h:150
int _apply(MedFeatures &features, unordered_set< int > &ids, bool learning)
Apply imputing model on subset of ids (TBI)
Definition IterativeImputer.cpp:598
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition IterativeImputer.cpp:586
int Learn(MedFeatures &features, unordered_set< int > &ids)
Learn imputing model on subset of ids (TBI)
Definition IterativeImputer.cpp:605
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition IterativeImputer.cpp:566
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition IterativeImputer.h:161
Definition FeatureProcess.h:51
Definition IterativeImputer.h:11
int init(map< string, string > &mapper)
Virtual to init object from parsed fields.
Definition IterativeImputer.cpp:59
IterativeImputer A general strong imputer that does the following: (1) Runs a simple stratified im...
Definition IterativeImputer.h:95
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition IterativeImputer.h:119
int init_feature_info(MedFeatures &mfd, string feat_name)
Definition IterativeImputer.cpp:119
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
Definition SerializableObject.h:32
Definition IterativeImputer.h:48