Medial Code Documentation
Loading...
Searching...
No Matches
FeatureProcess.h
Go to the documentation of this file.
1#ifndef _FTR_PROCESS_H_
2#define _FTR_PROCESS_H_
3
5#include <MedProcessTools/MedProcessTools/MedFeatures.h>
6#include <MedProcessTools/MedProcessTools/MedProcessUtils.h>
8#include <MedProcessTools/MedProcessTools/MedProcessUtils.h>
9#include <MedUtils/MedUtils/MedUtils.h>
10#include <MedAlgo/MedAlgo/BinSplitOptimizer.h>
11#include <unordered_set>
12
13#define DEFAULT_FEAT_CLNR_NTHREADS 24
14
15// Forward declaration of MedBootstrap
17
46
52public:
53
55 string feature_name = "unset_feature_name";
56 string resolved_feature_name;
57
59 virtual string select_learn_matrix(const vector<string> &matrix_tags) const { return ""; };
60
61 // Type
62 FeatureProcessorTypes processor_type = FTR_PROCESS_LAST;
63
64 // Threading
65 int learn_nthreads, clean_nthreads;
66
67 // Constructor/Destructor
68 FeatureProcessor() { init_defaults(); };
69 virtual ~FeatureProcessor() { clear(); };
70 virtual void clear() {};
71 void init_defaults() { learn_nthreads = DEFAULT_FEAT_CLNR_NTHREADS; clean_nthreads = DEFAULT_FEAT_CLNR_NTHREADS; processor_type = FTR_PROCESS_LAST; };
72
73 // Copy
74 virtual void copy(FeatureProcessor *processor) { *this = *processor; }
75
76 // Virtual Set Feature Name
77 virtual void set_feature_name(const string& feature_name) { this->feature_name = feature_name; }
78 virtual string get_feature_name() { return this->feature_name; }
79 virtual void get_feature_names(vector<string> & feature_names) { feature_names.clear(); feature_names.push_back(feature_name); };
80
81 // Learn cleaning model
82 virtual int Learn(MedFeatures& features, unordered_set<int>& ids) { return 0; }
83
91 int learn(MedFeatures& features);
92 int learn(MedFeatures& features, unordered_set<int>& ids) { return Learn(features, ids); }
93
94 // Apply feature processing
95 virtual int _apply(MedFeatures& features, unordered_set<int>& ids, bool learning) {
96 // For most processors - application is the same for model - learning/applying
97 return _apply(features, ids);
98 }
99
100 virtual int _apply(MedFeatures& features, unordered_set<int>& ids);
101 virtual int _conditional_apply(MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features, bool learning);
102
111 int apply(MedFeatures& features, bool learning);
112 int apply(MedFeatures& features, unordered_set<string>& req_features, bool learning);
113 int apply(MedFeatures& features, unordered_set<int>& ids, bool learning);
114 int apply(MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features, bool learning);
115
116 int apply(MedFeatures& features) { return apply(features, true); };
117 int apply(MedFeatures& features, unordered_set<string>& req_features) { return apply(features, req_features, true); };
118 int apply(MedFeatures& features, unordered_set<int>& ids) { return apply(features, ids, true); };
119 int apply(MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features) { return apply(features, ids, req_features, true); };
120
121 // Init
122 static FeatureProcessor *make_processor(string processor_name);
123 static FeatureProcessor *make_processor(FeatureProcessorTypes type);
124 static FeatureProcessor *make_processor(string processor_name, string params);
125 static FeatureProcessor *make_processor(FeatureProcessorTypes type, string params);
126
127 virtual int init(void *processor_params) { return 0; };
128 virtual int init(map<string, string>& mapper) { return 0; };
129
131 virtual int filter(unordered_set<string>& features) { return (features.find(feature_name) == features.end()) ? 0 : 1; };
132
134 string resolve_feature_name(MedFeatures& features, string substr);
135
137 virtual bool are_features_affected(unordered_set<string>& out_req_features) { return (out_req_features.empty() || out_req_features.find(resolved_feature_name) != out_req_features.end()); }
138
142 virtual void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features) { in_req_features = out_req_features; };
143
145 virtual bool is_selector() { return false; }
146
147 // Serialization (including type)
148 ADD_CLASS_NAME(FeatureProcessor)
149 ADD_SERIALIZATION_FUNCS(feature_name, resolved_feature_name, processor_type)
150 void *new_polymorphic(string derived_class_name);
151
152 size_t get_processor_size();
153 size_t processor_serialize(unsigned char *blob);
154
155 // debug prints
156 virtual void dprint(const string &pref, int rp_flag);
157
158};
159
160// Utilities
161FeatureProcessorTypes feature_processor_name_to_type(const string& cleaner_name);
162
170public:
171
172 // For generating processors only at learning, we need type + init_string
173 FeatureProcessorTypes members_type;
174 string init_string;
175 int duplicate;
176 string tag;
177 bool use_parallel_learn;
178 bool use_parallel_apply;
179
180 // Processors (if empty, will be generated upon learning for all features)
181 vector<FeatureProcessor *> processors;
182
183 string select_learn_matrix(const vector<string> &matrix_tags) const;
184
185 // Constructor/Destructor
186 MultiFeatureProcessor() { init_defaults(); };
187 ~MultiFeatureProcessor() { clear(); };
188
189 void init_defaults() { processor_type = FTR_PROCESS_MULTI; duplicate = 0; members_type = FTR_PROCESS_LAST; init_string = ""; tag = ""; use_parallel_learn = true; use_parallel_apply = true; };
190
191 void clear();
192
195 int init(map<string, string>& mapper);
196
197 // Copy
198 virtual void copy(FeatureProcessor *processor);
199
200 // Learn cleaning model
201 int Learn(MedFeatures& features, unordered_set<int>& ids);
202
203 // Apply cleaning model
204 int _apply(MedFeatures& features, unordered_set<int>& ids, bool learning);
205 int _conditional_apply(MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features, bool learning);
206
207 virtual void get_feature_names(vector<string>& feature_names);
208
209 // Add processors
210 void add_processors_set(FeatureProcessorTypes type, vector<string>& features);
211 void add_processors_set(FeatureProcessorTypes type, vector<string>& features, string init_string);
212
213 // Filter according to a subset of features
214 int filter(unordered_set<string>& features);
215
216 // debug print
217 void dprint(const string &pref, int fp_flag);
218
220 bool are_features_affected(unordered_set<string>& out_req_features);
221
223 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
224
225 // Serialization
226 ADD_CLASS_NAME(MultiFeatureProcessor)
227 ADD_SERIALIZATION_FUNCS(processor_type, members_type, init_string, duplicate, tag, processors, use_parallel_apply)
228};
229
230#define DEF_FTR_TRIMMING_SD_NUM 7
231#define DEF_FTR_REMOVING_SD_NUM 14
238public:
239
240 // Constructor
241 FeatureBasicOutlierCleaner() : FeatureProcessor() { init_defaults(); }
242 FeatureBasicOutlierCleaner(string& feature_name) : FeatureProcessor() { set_feature_name(feature_name); init_defaults(); }
243 FeatureBasicOutlierCleaner(string& feature_name, string init_string) : FeatureProcessor() { set_feature_name(feature_name); init_defaults(); init_from_string(init_string); }
244 FeatureBasicOutlierCleaner(string& feature_name, ValueCleanerParams *_params) : FeatureProcessor() { init_defaults(); set_feature_name(feature_name); MedValueCleaner::init(_params); }
245
246 void init_defaults() {
247 processor_type = FTR_PROCESS_BASIC_OUTLIER_CLEANER;
248 params.missing_value = MED_MAT_MISSING_VALUE;
249 params.trimming_sd_num = DEF_FTR_TRIMMING_SD_NUM;
250 params.removing_sd_num = DEF_FTR_REMOVING_SD_NUM;
251 params.nbrs_sd_num = 0;
252 params.take_log = 0;
253 params.doTrim = params.doRemove = true;
255
256 };
257
258 // Init
259 int init(void *processor_params) { return MedValueCleaner::init(processor_params); };
262 int init(map<string, string>& mapper);
263
264 // Copy
265 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<FeatureBasicOutlierCleaner *>(processor)); }
266
267 // Learn cleaning model
268 int Learn(MedFeatures& features, unordered_set<int>& ids);
269 int iterativeLearn(MedFeatures& features, unordered_set<int>& ids);
270 int quantileLearn(MedFeatures& features, unordered_set<int>& ids);
271
272 // Apply cleaning model
273 int _apply(MedFeatures& features, unordered_set<int>& ids);
274
275 // Serialization
276 ADD_CLASS_NAME(FeatureBasicOutlierCleaner)
277 ADD_SERIALIZATION_FUNCS(processor_type, feature_name, resolved_feature_name, params.doTrim, params.doRemove, trimMax, trimMin, removeMax, removeMin)
278
279};
280
287public:
288
291
294
297
299 float mean, sd;
300
302 int resolution = 0;
303
306
308 int max_samples = 0;
309
311 bool resolution_only = false;
312
314 int verbosity = 0;
315
318 float max_val_prctile = 1;
319 float max_val_for_triming = 2;
320 float prctile_th = (float)0.001;
321 float min_x, max_x;
322
323 // Constructor
324 FeatureNormalizer() : FeatureProcessor() { init_defaults(); }
325 FeatureNormalizer(const string& feature_name) : FeatureProcessor() { init_defaults(); set_feature_name(feature_name); }
326 FeatureNormalizer(const string& feature_name, string init_string) : FeatureProcessor() { init_defaults(); init_from_string(init_string); set_feature_name(feature_name); }
327
328 string select_learn_matrix(const vector<string> &matrix_tags) const;
329
330 // Learn cleaning model
331 int Learn(MedFeatures& features, unordered_set<int>& ids);
332
333 // Apply cleaning model
334 int _apply(MedFeatures& features, unordered_set<int>& ids);
335
337 void reverse_apply(float &feature_value) const;
338
341 int init(map<string, string>& mapper);
342 void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; normalizeSd = true; fillMissing = false; processor_type = FTR_PROCESS_NORMALIZER; resolution_bin = 0; };
343
344 // Copy
345 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<FeatureNormalizer *>(processor)); }
346
347 // Serialization
348 ADD_CLASS_NAME(FeatureNormalizer)
350 use_linear_transform, max_val_prctile, max_val_for_triming, prctile_th, min_x, max_x)
351
352};
353
354//.......................................................................................
355//.......................................................................................
356// Feature Imputer
357//.......................................................................................
358//.......................................................................................
359
// Kind of moment used by the imputer.
// Values of this enum are serialized (FeatureImputer::moment_type / moment_type_vec), so the
// numeric values are pinned explicitly, as is done for the other selection enums in this header.
// IMPUTE_MMNT_LAST is the end marker and must stay last.
typedef enum {
    IMPUTE_MMNT_MEAN = 0,
    IMPUTE_MMNT_MEDIAN = 1,
    IMPUTE_MMNT_COMMON = 2,
    IMPUTE_MMNT_SAMPLE = 3,
    IMPUTE_MMNT_LAST
} imputeMomentTypes;
367
369public:
370 string name;
371 float resolution, min, max;
372 int nValues;
373
374 featureStrata() {};
375 featureStrata(string& _name, float _resolution, float _min, float _max) { name = _name; resolution = _resolution; min = _min; max = _max; }
376
377 void SetNValues() { nValues = ((int)(max / resolution) - (int)(min / resolution) + 1); }
378 // returns the correct strata for a value.
379 // E.g. if "strata": "Age,40,80,5" 42 will return 0, the first bin
380 int getIndex(float value, float missing_val) const {
381 if (value == missing_val)
382 return nValues / 2;
383 else {
384 if (value >= max)
385 return nValues - 1;
386 else if (value <= min)
387 return 0;
388 else
389 return ((int)(value / resolution) - (int)(min / resolution));
390 }
391 }
392
393 // Serialization
394 ADD_CLASS_NAME(featureStrata)
395 ADD_SERIALIZATION_FUNCS(name, resolution, min, max, nValues)
396
397};
402public:
403 vector<featureStrata> stratas;
404 vector<int> factors;
405
406 size_t nStratas() const { return stratas.size(); }
407
408 void getFactors() {
409
410 if (stratas.size() == 0)
411 return;
412
413 factors.resize(stratas.size());
414
415 for (auto& strata : stratas)
416 strata.SetNValues();
417
418 factors[0] = 1;
419 for (int i = 1; i < stratas.size(); i++)
420 factors[i] = factors[i - 1] * stratas[i - 1].nValues;
421 }
422
423 int nValues() const {
424 if (stratas.size() == 0)
425 return 1;
426 else
427 return factors.back() * stratas.back().nValues;
428 }
429
430 int getIndex(float missing_val,
431 const vector<const vector<float> *> &strataValues, int row) const {
432 int index = 0;
433 for (int i = 0; i < nStratas(); i++)
434 index += factors[i] * stratas[i].getIndex(strataValues[i]->at(row), missing_val);
435 return index;
436 }
437
438 // Serialization
439 ADD_CLASS_NAME(featureSetStrata)
440 ADD_SERIALIZATION_FUNCS(stratas, factors)
441};
442
449private:
450 float round_to_closest(float val) const;
451public:
452
453 // Missing Value
454 float missing_value = MED_MAT_MISSING_VALUE;
455 bool verbose = true;
456 bool verbose_learn = false;
457
458 // Strata for setting moment
459 featureSetStrata imputerStrata;
460
461 // minimum samples required for learning
462 int min_samples = 50;
463
464 // if true, doesn't impute missing values that are left due to small stratas
465 int leave_missing_for_small_stratas = 0;
466
467 // if false (default), do NOT impute ANY case in which the strata contains a missing value
468 int impute_strata_with_missing = 0;
469
470 // Moment (learning/applying)
471 vector<imputeMomentTypes> moment_type_vec;
472 vector<float> default_moment_vec;
473 vector<vector<float>> moments_vec;
474
475 // For backward compatibility ...
476 imputeMomentTypes moment_type = IMPUTE_MMNT_MEAN;
477 float default_moment;
478 vector<float> moments;
479
480 // for sampling-imputation
481 vector < pair<float, float> > default_histogram;
482 vector < vector<pair<float, float> > > histograms;
483
484 vector<int> strata_sizes;
485
487 int max_samples = 100000;
488
489 bool round_to_existing_value = true;
490 vector<float> existing_values;
491
492 // Constructor
493 FeatureImputer() : FeatureProcessor() { init_defaults(); }
494 FeatureImputer(const string& feature_name) : FeatureProcessor() { init_defaults(); set_feature_name(feature_name); }
495 FeatureImputer(const string& feature_name, string init_string) : FeatureProcessor() { init_defaults(); init_from_string(init_string); set_feature_name(feature_name); }
496
497 // Add stratifier
498 void addStrata(string& init_string);
499 void addStrata(featureStrata& strata) { imputerStrata.stratas.push_back(strata); }
500 void addStrata(string& name, float resolution, float min, float max) { imputerStrata.stratas.push_back(featureStrata(name, resolution, min, max)); }
501
504 int init(map<string, string>& mapper);
505 void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; moment_type = IMPUTE_MMNT_MEAN; processor_type = FTR_PROCESS_IMPUTER; verbose = true; verbose_learn = false; };
506 imputeMomentTypes getMomentType(string& entry);
507
509 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
510
511 // Copy
512 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<FeatureImputer *>(processor)); }
513
514 // Learn cleaning model
515 int Learn(MedFeatures& features, unordered_set<int>& ids);
516
517 // Apply cleaning model
518 int _apply(MedFeatures& features, unordered_set<int>& ids, bool learning);
519
520 // Check strata names
521 void check_stratas_name(MedFeatures& features, map <string, string> &strata_name_conversion);
522
523 // Serialization
524 ADD_CLASS_NAME(FeatureImputer)
525 ADD_SERIALIZATION_FUNCS(processor_type, feature_name, resolved_feature_name, missing_value, imputerStrata, moment_type, moments, histograms, strata_sizes, default_moment, default_histogram,
526 moment_type_vec, moments_vec, default_moment_vec, leave_missing_for_small_stratas, impute_strata_with_missing, round_to_existing_value, existing_values)
527
528 void dprint(const string &pref, int fp_flag);
530 void print();
531
532};
533
534
539public:
540
542 float missing_value = (float)MED_MAT_MISSING_VALUE;
543
545 unordered_set<string> required;
546
548 vector<string> selected;
549
551 int numToSelect = 0;
552
555
556 // Constructor
557 FeatureSelector() : FeatureProcessor() { missing_value = MED_MAT_MISSING_VALUE; numToSelect = 0; }
558
560 virtual int Learn(MedFeatures& features, unordered_set<int>& ids);
561
563 virtual int _apply(MedFeatures& features, unordered_set<int>& ids);
// NOTE(review): the _conditional_apply declared in FeatureProcessor (and in MultiFeatureProcessor)
// takes a trailing `bool learning` parameter. This three-parameter declaration has a different
// signature, so it appears to hide the base virtual rather than override it - confirm this is intended.
564 virtual int _conditional_apply(MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& out_req_features);
565
566 bool is_selector() { return true; }
567 bool are_features_affected(unordered_set<string>& out_req_features);
568
570 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
571
572 ADD_CLASS_NAME(FeatureSelector)
574
575private:
577 virtual int _learn(MedFeatures& features, unordered_set<int>& ids) { return 0; }
578};
579
586public:
587 LassoSelector() : FeatureSelector() { init_defaults(); }
588
591 int init(map<string, string>& mapper);
592
594 float initMaxLambda = (float)0.005;
595
597 float lambdaRatio = (float)0.1;
598 vector<string> lax_lasso_features;
599
601 float rate = (float)0.01;
602
604 float momentum = (float)0.95;
605
607 float stop_at_err = (float)1e-4;
608
609 int nthreads = 12;
610
611 void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; processor_type = FTR_PROCESSOR_LASSO_SELECTOR; };
612
613 // Copy
614 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<LassoSelector *>(processor)); }
615
616 // Serialization
617 ADD_CLASS_NAME(LassoSelector)
619
620private:
621 // Find set of selected features
622 int _learn(MedFeatures& features, unordered_set<int>& ids);
623};
624
631public:
632
633 // Percentage covered by a single value at which a feature is defined as degenerate
634 float percentage = 1.0F;
635
636 // Constructor
637 DgnrtFeatureRemvoer() : FeatureSelector() { init_defaults(); }
638
641 int init(map<string, string>& mapper);
642 void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; processor_type = FTR_PROCESS_REMOVE_DGNRT_FTRS; numToSelect = 0; };
643
644 // Copy
645 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<DgnrtFeatureRemvoer *>(processor)); }
646
647 // Serialization
648 ADD_CLASS_NAME(DgnrtFeatureRemvoer)
649 ADD_SERIALIZATION_FUNCS(processor_type, percentage, missing_value, selected)
650
651private:
652 // Find set of selected features
653 int _learn(MedFeatures& features, unordered_set<int>& ids);
654};
655
// Statistic used for univariate feature selection.
// get_method() maps "pearson" to PRSN, "mi"/"mutual_information" to MI and
// "dcorr"/"dist_corr" to DCORR; UNIV_SLCT_LAST is the end marker / "unknown" value.
enum UnivariateSelectionMethod {
    UNIV_SLCT_PRSN = 0,
    UNIV_SLCT_MI = 1,
    UNIV_SLCT_DCORR = 2,
    UNIV_SLCT_LAST
};
662
664public:
665 UnivariateSelectionMethod method;
666 float minStat;
667
669 int nBins = 10;
670 MedBinningType binMethod = BIN_EQUIDIST;
671
673 int takeSquare = 0;
674
677
679 int max_samples = 10000;
680
681 UnivariateSelectionMethod get_method(string name) {
682
683 boost::algorithm::to_lower(name);
684 if (name == "pearson")
685 return UNIV_SLCT_PRSN;
686 else if (name == "mi" || name == "mutual_information" || name == "mutualinformation")
687 return UNIV_SLCT_MI;
688 else if (name == "dcorr" || name == "dist_corr" || name == "distcorr")
689 return UNIV_SLCT_DCORR;
690 else
691 return UNIV_SLCT_LAST;
692 }
693
694 MedBinningType get_binning_method(string name) {
695
696 boost::algorithm::to_lower(name);
697 if (name == "equi_dist")
698 return BIN_EQUIDIST;
699 else if (name == "equi_size")
700 return BIN_EQUISIZE;
701 else
702 return BIN_LAST;
703 }
704
705 ADD_CLASS_NAME(univariateSelectionParams)
706 ADD_SERIALIZATION_FUNCS(method, minStat, nBins, binMethod, takeSquare, pDistance, max_samples)
707};
708
715public:
716
719
720 // Constructor
721 UnivariateFeatureSelector() : FeatureSelector() { init_defaults(); }
722
725 int init(map<string, string>& mapper);
726 void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; processor_type = FTR_PROCESS_UNIVARIATE_SELECTOR; params.method = UNIV_SLCT_PRSN; params.minStat = 0.05F; };
727
728 // Copy
729 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<UnivariateFeatureSelector *>(processor)); }
730
731 // Serialization
732 ADD_CLASS_NAME(UnivariateFeatureSelector)
734
735private:
736 // Scores
737 int getAbsPearsonCorrs(MedFeatures& features, unordered_set<int>& ids, vector<float>& stats);
738 int getMIs(MedFeatures& features, unordered_set<int>& ids, vector<float>& stats);
739 int getDistCorrs(MedFeatures& features, unordered_set<int>& ids, vector<float>& stats);
740 // Find set of selected features
741 int _learn(MedFeatures& features, unordered_set<int>& ids);
742};
743
// Penalty method used by MRMRFeatureSelector (stored in its penaltyMethod member).
// MRMR_LAST is the end marker.
enum MRMRPenaltyMethod {
    MRMR_MAX = 0,
    MRMR_MEAN = 1,
    MRMR_LAST
};
749
756public:
759 float penalty;
760 MRMRPenaltyMethod penaltyMethod;
761
762 // Constructor
763 MRMRFeatureSelector() : FeatureSelector() { init_defaults(); }
764
767 int init(map<string, string>& mapper);
768 void init_defaults();
769 MRMRPenaltyMethod get_penalty_method(string _method);
770
771 // Copy
772 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<MRMRFeatureSelector *>(processor)); }
773
774 // Serialization
775 ADD_CLASS_NAME(MRMRFeatureSelector)
776 ADD_SERIALIZATION_FUNCS(processor_type, params, penalty, penaltyMethod, missing_value, required, selected, numToSelect)
777
778private:
779 // Find set of selected features
780 int _learn(MedFeatures& features, unordered_set<int>& ids);
781private:
782 // Scores
783 int fillStatsMatrix(MedFeatures& features, unordered_set<int>& ids, MedMat<float>& stats, int index);
784 int fillAbsPearsonCorrsMatrix(MedFeatures& features, unordered_set<int>& ids, MedMat<float>& stats, int index);
785 int fillMIsMatrix(MedFeatures& features, unordered_set<int>& ids, MedMat<float>& stats, int index);
786 int fillDistCorrsMatrix(MedFeatures& features, unordered_set<int>& ids, MedMat<float>& stats, int index);
787};
788
789//.......................................................................................
790//.......................................................................................
791// Additional implementations in other header files
792//.......................................................................................
793//.......................................................................................
794#include "IterativeImputer.h"
795
796//.......................................................................................
797//.......................................................................................
798// Utilities
799//.......................................................................................
800//.......................................................................................
801
802#define DEF_MAX_SAMPLE 1000
803void get_all_values(MedFeatures& features, string& signalName, unordered_set<int>& ids, vector<float>& values, int max_sample = DEF_MAX_SAMPLE);
804void get_all_outcomes(MedFeatures& features, unordered_set<int>& ids, vector<float>& values, int max_sample = DEF_MAX_SAMPLE);
805void smearBins(vector<int>& bins, int nBins, int reqNbins);
806
807/************************************************************************************/
814public:
815 int verbose = 0;
816 vector<string> selected_tags;
817 vector<string> removed_tags;
818 // Constructor
819 TagFeatureSelector() : FeatureSelector() { init_defaults(); }
820
823 int init(map<string, string>& mapper);
824 void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; processor_type = FTR_PROCESSOR_TAGS_SELECTOR; };
825
826 // Copy
827 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<TagFeatureSelector *>(processor)); }
828
829 //print function
830 void dprint(const string &pref, int fp_flag);
831
832 // Serialization
833 ADD_CLASS_NAME(TagFeatureSelector)
835private:
836 // Find set of selected features
837 int _learn(MedFeatures& features, unordered_set<int>& ids);
838};
839
846public:
847 string predictor;
850 float minStat;
851 bool verbose;
852 // Constructor
853 ImportanceFeatureSelector() : FeatureSelector() { init_defaults(); }
854
857 int init(map<string, string>& mapper);
858 virtual void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; processor_type = FTR_PROCESSOR_IMPORTANCE_SELECTOR; };
859
860 // Copy
861 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<ImportanceFeatureSelector *>(processor)); }
862
863
864 // Serialization
865 ADD_CLASS_NAME(ImportanceFeatureSelector)
867
868private:
869 // Find set of selected features
870 int _learn(MedFeatures& features, unordered_set<int>& ids);
871};
872
880public:
881 string predictor;
884 int nfolds = 5;
885 bool do_internal_cv = true;
886 vector<int> folds;
887 string mode = "top2bottom";
888 string rates = "50:1,100:2,500:5,5000:10";
890 string bootstrap_params = "sample_per_pid:1";
891 string msr_params = "AUC";
892 bool work_on_sets = false;
893 bool group_to_sigs = false;
894 unordered_set<string> ungroupd_names = { "Drug","RC","ICD9" };
895 unordered_set<string> ignored;
896 bool verbose;
897 string progress_file_path = "";
898 string grouping_mode = "BY_SIGNAL_CATEG";
899
900 vector<int> rates_vec;
901 vector<string> predictor_params_vec;
902 string measurement_name;
903 vector<string> report;
904
905 // Constructor
906 IterativeFeatureSelector() : FeatureSelector() { init_defaults(); }
907
910 int init(map<string, string>& mapper);
911 virtual void init_defaults() { missing_value = MED_MAT_MISSING_VALUE; processor_type = FTR_PROCESSOR_ITERATIVE_SELECTOR; };
912
913 // Bootstrapper initialization
914 void init_bootstrap_cohort(MedBootstrapResult& bootstrapper, string& init);
915 void init_bootstrap_params(MedBootstrapResult& bootstrapper, string& init);
916
917 // Copy
918 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<IterativeFeatureSelector *>(processor)); }
919
920 // Print report to file
921 void print_report(string& fileName);
922
923 // Retracing a pre-given order of families
924 void retrace(MedFeatures& features, unordered_set<int>& ids, vector<string>& families_order, int start, int end);
925 void retrace(MedFeatures& features, vector<string>& families_order, int start, int end) { unordered_set<int> empty; retrace(features, empty, families_order, start, end); }
926
927 // Serialization
928 ADD_CLASS_NAME(IterativeFeatureSelector)
931
932private:
933 // Resolved names of required signals
934 unordered_set<string> resolved_required, resolved_ignored;
935
936 // Find set of selected features
937 int _learn(MedFeatures& features, unordered_set<int>& ids);
938
939 // Parse rates string
940 void get_rates_vec();
941
942 // Read parameters file
943 void read_params_vec();
944
945 // Get Families of signals
946 void get_features_families(MedFeatures& features, map<string, vector<string> >& featureFamilies);
947
948 // Utilities
949 void prepare_for_iterations(MedBootstrapResult& bootstrapper, MedFeatures& features, vector<int>& folds, vector<vector<int>>& trainRows, vector<vector<int>>& testRows, vector<vector<float>>&trainLabels,
950 vector<vector<MedSample>>&testSamples, MedFeatures& bootstrapFeatures);
951 void pre_learn(MedFeatures& features, MedBootstrapResult& bootstrapper, map<string, vector<string> >& featureFamilies, vector<int>& orig_folds);
952
953 // Actual selection
954 void doTop2BottomSelection(MedFeatures& features, map<string, vector<string>>& featureFamilies, MedBootstrapResult& bootstrapper);
955 void doBottom2TopSelection(MedFeatures& features, map<string, vector<string>>& featureFamilies, MedBootstrapResult& bootstrapper);
956
957 // Actual retracing a pre-given order of families
958 void retraceTop2BottomSelection(MedFeatures& features, map<string, vector<string>>& featureFamilies, MedBootstrapResult& bootstrapper, vector<string>& order, int start, int end);
959 void retraceBottom2TopSelection(MedFeatures& features, map<string, vector<string>>& featureFamilies, MedBootstrapResult& bootstrapper, vector<string>& order, int start, int end);
960};
961
962
967public:
968
970 vector<string> names;
971
972 // Constructor
973 FeatureEncoder() : FeatureProcessor() { init_defaults(); }
974
976 bool are_features_affected(unordered_set<string>& out_req_features);
977
979 ADD_CLASS_NAME(FeatureEncoder)
980 ADD_SERIALIZATION_FUNCS(processor_type, names)
981
982 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
983
984};
985
998
1005public:
1008
1009 // Constructor
1010 FeaturePCA() : FeatureEncoder() { init_defaults(); }
1011
1014 int init(map<string, string>& mapper);
1015 void init_defaults() { processor_type = FTR_PROCESS_ENCODER_PCA; params.pca_cutoff = 0; params.pca_top = 100; };
1016
1017 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<FeaturePCA *>(processor)); }
1018
1019 ADD_CLASS_NAME(FeaturePCA)
1020 ADD_SERIALIZATION_FUNCS(processor_type, names, params, selected_indexes, W)
1021
1022private:
1023 MedMat<float> W;
1024 vector<int> selected_indexes;
1025
1026 int _learn(MedFeatures& features, unordered_set<int>& ids);
1027
1028 // Apply encoding
1029 int _apply(MedFeatures& features, unordered_set<int>& ids);
1030};
1031
1039public:
1040
1044 bool rem_origin = true;
1045 bool add_other = false;
1046 bool allow_other = false;
1047 bool remove_last = false;
1048 int max_values = 32;
1049 vector<string> regex_list;
1050 vector<string> regex_list_names;
1051 string other_suffix = "other";
1052
1053 //map<float, string> value2feature;
1054 map<float, vector<string>> value2feature;
1055
1056 // Constructor
1057 OneHotFeatProcessor() { init_defaults(); }
1058
1061 int init(map<string, string>& mapper);
1062
1063 void init_defaults() { processor_type = FTR_PROCESS_ONE_HOT; }
1064 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<OneHotFeatProcessor *>(processor)); }
1065
1067 bool are_features_affected(unordered_set<string>& out_req_features);
1068
1070 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
1071
1072 ADD_CLASS_NAME(OneHotFeatProcessor);
1075private:
1076 int Learn(MedFeatures& features, unordered_set<int>& ids);
1077 int _apply(MedFeatures& features, unordered_set<int>& ids);
1078 string get_feature_name(float value, const string &out_prefix, unordered_map<float, string> &value2Name, float missing_value);
1079};
1080
1089public:
1090
1091 float missing_value = MED_MAT_MISSING_VALUE;
1093 map<float, int> target_labels;
1094 map<float, string> feature_names;
1095 bool remove_origin = true;
1096 bool all_labels = false;
1097 int min_obs = 100;
1098
1099 vector<map<float, float>> probs;
1100 vector<float> overall_prob;
1101
1102 // Constructor
1103 GetProbFeatProcessor() : FeatureProcessor() { init_defaults(); processor_type = FTR_PROCESS_GET_PROB; }
1104 GetProbFeatProcessor(const string& feature_name) : FeatureProcessor() { init_defaults(); processor_type = FTR_PROCESS_GET_PROB; set_feature_name(feature_name); }
1105 GetProbFeatProcessor(const string& feature_name, string init_string) : FeatureProcessor() { init_defaults(); processor_type = FTR_PROCESS_GET_PROB;; init_from_string(init_string); set_feature_name(feature_name); }
1106
1107 // Learn probabilities
1108 int Learn(MedFeatures& features, unordered_set<int>& ids);
1109
1110 // Apply transformation
1111 int _apply(MedFeatures& features, unordered_set<int>& ids);
1112
1115 int init(map<string, string>& mapper);
1116
1117 // Copy
1118 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<GetProbFeatProcessor *>(processor)); }
1119
1120 // Serialization
1121 ADD_CLASS_NAME(GetProbFeatProcessor)
1122 ADD_SERIALIZATION_FUNCS(processor_type, feature_name, resolved_feature_name, missing_value, overall_count, probs,
1124
1125};
1126
1134public:
1135 vector<string> selected_tags;
1137 bool divide;
1138 bool verbose;
1139
1140 MultiplierProcessor() : FeatureProcessor() { init_defaults(); }
1141
1144 int init(map<string, string>& mapper);
1145 void init_defaults() { multiplier_name = ""; divide = false; verbose = true; processor_type = FTR_PROCESS_MULTIPLIER; };
1146
1147 // Copy
1148 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<MultiplierProcessor *>(processor)); }
1149
1150 //print function
1151 void dprint(const string &pref, int fp_flag);
1152
1153 // Serialization
1154 ADD_CLASS_NAME(MultiplierProcessor)
1156private:
1157 // Apply multiplier
1158 int _apply(MedFeatures& features, unordered_set<int>& ids);
1159};
1165public:
1167 string name;
1170
1171 MissingIndicatorProcessor() { init_defaults(); }
1172
1173 void init_defaults() { processor_type = FTR_PROCESS_MISSING_INDICATOR; missing_value = MED_MAT_MISSING_VALUE; name = "is_missing"; }
1174 int _apply(MedFeatures& features, unordered_set<int>& ids);
1175 int init(map<string, string>& mapper);
1176 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
1177
1178 // Serialization
1179 ADD_CLASS_NAME(MissingIndicatorProcessor)
1180 ADD_SERIALIZATION_FUNCS(processor_type, name, replace_value, new_feature_name, feature_name, resolved_feature_name)
1181};
1182
1183
1185public:
1186 vector<double> bin_cutoffs;
1187 vector<double> bin_repr_vals;
1188
1190
1192 int init(map<string, string>& mapper);
1193
1195 void load_bin_settings(const vector<float> &nums, vector<float> &y);
1196
1198 int get_idx(float v) const;
1199
1201 int num_of_bins() const;
1202
1204 float normalize(float v) const;
1205
1206 ADD_CLASS_NAME(Binning_Wrapper)
1208};
1209
1210
1218private:
1219 string get_bin_name(float num) const;
1220public:
1221
1222 float missing_value = MED_MAT_MISSING_VALUE;
1224 float missing_target_val = MED_MAT_MISSING_VALUE;
1226 bool remove_origin = true;
1228 bool one_hot = true;
1230 bool keep_original_val = false;
1232 string bin_format = "%2.1f";
1236 // Constructor
1237 BinningFeatProcessor() : FeatureProcessor() { init_defaults(); processor_type = FTR_PROCESS_BINNING; }
1238 BinningFeatProcessor(const string& feature_name) : FeatureProcessor() { init_defaults(); processor_type = FTR_PROCESS_BINNING; set_feature_name(feature_name); }
1239 BinningFeatProcessor(const string& feature_name, string init_string) : FeatureProcessor() { init_defaults(); processor_type = FTR_PROCESS_BINNING; init_from_string(init_string); set_feature_name(feature_name); }
1240
1241 // Learn probabilities
1242 int Learn(MedFeatures& features, unordered_set<int>& ids);
1243
1244 // Apply transformation
1245 int _apply(MedFeatures& features, unordered_set<int>& ids);
1246
1249 int init(map<string, string>& mapper);
1250
1251 // Copy
1252 virtual void copy(FeatureProcessor *processor) { *this = *(dynamic_cast<BinningFeatProcessor *>(processor)); }
1253
1254 // Serialization
1255 ADD_CLASS_NAME(BinningFeatProcessor)
1256 ADD_SERIALIZATION_FUNCS(processor_type, feature_name, resolved_feature_name, missing_value, missing_target_val,
1258
1259};
1260
1261
1262//=======================================
1263// Joining the MedSerialize wagon
1264//=======================================
1289#endif
FeatureProcessorTypes
Definition FeatureProcess.h:21
@ FTR_PROCESSOR_LASSO_SELECTOR
"lasso" to create LassoSelector
Definition FeatureProcess.h:29
@ FTR_PROCESSOR_ITERATIVE_SELECTOR
"iterative_selector" applies bottom-up or top-down iteration for feature selection....
Definition FeatureProcess.h:32
@ FTR_PROCESS_IMPUTER
"imputer" to create FeatureImputer
Definition FeatureProcess.h:25
@ FTR_PROCESSOR_TAGS_SELECTOR
"tags_selector" to create TagFeatureSelector
Definition FeatureProcess.h:30
@ FTR_PROCESS_NORMALIZER
"normalizer" to create FeatureNormalizer
Definition FeatureProcess.h:24
@ FTR_PROCESS_MULTIPLIER
"multiplier" to create MultiplierProcessor - to multiply feature by other feature
Definition FeatureProcess.h:39
@ FTR_PROCESS_DUPLICATE
"duplicate" to create DuplicateProcessor - duplicates samples in order to do multiple imputations.
Definition FeatureProcess.h:41
@ FTR_PROCESS_BINNING
"binning" to create BinningFeatProcessor - binning with one hot on the bins
Definition FeatureProcess.h:43
@ FTR_PROCESS_ONE_HOT
"one_hot" to create OneHotFeatProcessor - make one-hot features from a given feature
Definition FeatureProcess.h:36
@ FTR_PROCESS_GET_PROB
"get_prob" to create GetProbFeatProcessor - replace categorical feature with probability of outcome i...
Definition FeatureProcess.h:37
@ FTR_PROCESS_MISSING_INDICATOR
"missing_indicator" to create MissingIndicatorProcessor - creates a feature that indicates if a featu...
Definition FeatureProcess.h:42
@ FTR_PROCESS_UNIVARIATE_SELECTOR
"univariate_selector" to create UnivariateFeatureSelector
Definition FeatureProcess.h:27
@ FTR_PROCESS_BASIC_OUTLIER_CLEANER
"basic_outlier_cleaner" or "basic_cleaner" to create FeatureBasicOutlierCleaner
Definition FeatureProcess.h:23
@ FTR_PROCESSOR_IMPORTANCE_SELECTOR
"importance_selector" to create ImportanceFeatureSelector
Definition FeatureProcess.h:31
@ FTR_PROCESS_PREDICTOR_IMPUTER
"predictor_imputer" to create PredictorImputer
Definition FeatureProcess.h:38
@ FTR_PROCESS_ENCODER_PCA
"pca" to create FeaturePCA
Definition FeatureProcess.h:35
@ FTR_PROCESS_MULTI
"multi_processor" or "multi" to create MultiFeatureProcessor
Definition FeatureProcess.h:22
@ FTR_PROCESS_REMOVE_DGNRT_FTRS
"remove_deg" to create DgnrtFeatureRemvoer
Definition FeatureProcess.h:33
@ FTR_PROCESS_DO_CALC
"do_calc" to create DoCalcFeatProcessor
Definition FeatureProcess.h:26
@ FTR_PROCESSOR_MRMR_SELECTOR
"mrmr" or "mrmr_selector" to create MRMRFeatureSelector
Definition FeatureProcess.h:28
@ FTR_PROCESS_RESAMPLE_WITH_MISSING
"resample_with_missing" to create ResampleMissingProcessor - adds missing values to learn matrix
Definition FeatureProcess.h:40
@ FTR_PROCESS_ITERATIVE_IMPUTER
"iterative_imputer" to create IterativeImputer
Definition FeatureProcess.h:34
A parent class for single-value cleaners.
@ VAL_CLNR_ITERATIVE
"iterative"
Definition MedValueCleaner.h:12
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
BinningFeatProcessor: binning with one hot on the bins.
Definition FeatureProcess.h:1217
Binning_Wrapper bin_sett
"bin_sett" parameter - controls how to bin the feature.
Definition FeatureProcess.h:1235
bool one_hot
If true will split each bin value to one hot of 0/1.
Definition FeatureProcess.h:1228
string bin_format
formating of feature name after binning
Definition FeatureProcess.h:1232
float missing_value
Missing Value.
Definition FeatureProcess.h:1222
bool keep_original_val
only relevant in one hot mode.
Definition FeatureProcess.h:1230
bool remove_origin
If true will remove the original/source feature.
Definition FeatureProcess.h:1226
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition BinningFeatProcessor.cpp:83
float missing_target_val
missing value target mapping. converts missing value to this value
Definition FeatureProcess.h:1224
Definition FeatureProcess.h:1184
int num_of_bins() const
returns number of bins
Definition BinningFeatProcessor.cpp:74
int init(map< string, string > &mapper)
Definition BinningFeatProcessor.cpp:7
int get_idx(float v) const
returns index for each value
Definition BinningFeatProcessor.cpp:69
vector< double > bin_repr_vals
the representative value for each bin. With size := bin_cutoffs.size()+1
Definition FeatureProcess.h:1187
vector< double > bin_cutoffs
index i for value (v) := bin_cutoffs[i-1] < v <= bin_cutoffs[i]
Definition FeatureProcess.h:1186
void load_bin_settings(const vector< float > &nums, vector< float > &y)
if has use_bin_settings => will update bin_cutoffs, bin_repr_vals
Definition BinningFeatProcessor.cpp:42
string use_bin_settings
if not empty - will use bin setting to create bins:
Definition FeatureProcess.h:1189
float normalize(float v) const
normalize value into bin and represantative
Definition BinningFeatProcessor.cpp:78
Feature Selector : Remove Degenerate features.
Definition FeatureProcess.h:630
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:860
A simple cleaner considering each value of a certain feature separately.
Definition FeatureProcess.h:237
FeatureEncoder - General class for encoding features - PCA, autoencoder...
Definition FeatureProcess.h:966
vector< string > names
generated names
Definition FeatureProcess.h:970
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureEncoder.cpp:15
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureEncoder.cpp:32
Feature Imputer to complete missing values.
Definition FeatureProcess.h:448
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:1055
void print()
debug and print
Definition FeatureProcess.cpp:753
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureProcess.cpp:1127
bool verbose
If true will print how many missing values were in each feature.
Definition FeatureProcess.h:455
bool verbose_learn
If true will call print after learn.
Definition FeatureProcess.h:456
int max_samples
Utility : maximum number of samples to take for moments calculations.
Definition FeatureProcess.h:487
Feature Normalizer.
Definition FeatureProcess.h:286
float max_x
parameters of the transformation
Definition FeatureProcess.h:321
int resolution
resolution : if > 0 , will keep only the given number of digits after the point.
Definition FeatureProcess.h:302
float mean
Moments.
Definition FeatureProcess.h:299
bool use_linear_transform
If true will convert into linear transform from lower prctile to high prctile and has trimming value.
Definition FeatureProcess.h:317
int verbosity
verbosity
Definition FeatureProcess.h:314
void reverse_apply(float &feature_value) const
Reverse action of Apply - denorm feature value.
Definition FeatureProcess.cpp:703
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:725
bool fillMissing
Fill missing values with mean.
Definition FeatureProcess.h:296
float resolution_bin
A factor to divide by - take floor and then multiply by again. Used in resolution_only mode.
Definition FeatureProcess.h:305
bool normalizeSd
Normalize Standard Deviation.
Definition FeatureProcess.h:293
string select_learn_matrix(const vector< string > &matrix_tags) const
Will be called before learn to create new version for the matrix if needed - in parallel of existing ...
Definition FeatureProcess.cpp:569
bool resolution_only
if resolution only
Definition FeatureProcess.h:311
float missing_value
Missing Value.
Definition FeatureProcess.h:290
int max_samples
Utility : maximum number of samples to take for moments calculations.
Definition FeatureProcess.h:308
PCA Parameters class.
Definition FeatureProcess.h:989
int subsample_count
subsample in the pca rows to speed up
Definition FeatureProcess.h:993
float pca_cutoff
PCA variance threshold to stop.
Definition FeatureProcess.h:992
int pca_top
Max Number of PCA Components to take.
Definition FeatureProcess.h:991
FeaturePCA - PCA encoder.
Definition FeatureProcess.h:1004
FeaturePCAParams params
PCA parameters.
Definition FeatureProcess.h:1007
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureEncoder.cpp:206
Definition FeatureProcess.h:51
virtual int init(map< string, string > &mapper)
Virtual to init object from parsed fields.
Definition FeatureProcess.h:128
string resolve_feature_name(MedFeatures &features, string substr)
Utility : get corresponding name in MedFeatures.
Definition FeatureProcess.cpp:223
void * new_polymorphic(string derived_class_name)
for polymorphic classes that want to be able to serialize/deserialize a pointer * to the derived clas...
Definition FeatureProcess.cpp:83
virtual int filter(unordered_set< string > &features)
Filter according to a subset of features.
Definition FeatureProcess.h:131
int apply(MedFeatures &features, bool learning)
PostProcess of MedFeatures - on all or a subset of the ids calls virtual function "_apply/_conditiona...
Definition FeatureProcess.cpp:183
string feature_name
Feature name ( + name as appears in MedFeatures) ;.
Definition FeatureProcess.h:55
virtual bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureProcess.h:137
int learn(MedFeatures &features)
PostProcess of MedFeatures - on all ids.
Definition FeatureProcess.cpp:174
virtual string select_learn_matrix(const vector< string > &matrix_tags) const
Will be called before learn to create new version for the matrix if needed - in parallel of existing ...
Definition FeatureProcess.h:59
virtual void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor Empty sets = requir...
Definition FeatureProcess.h:142
virtual bool is_selector()
allows testing if this feature processor is a selector
Definition FeatureProcess.h:145
Feature Selector abstract class.
Definition FeatureProcess.h:538
bool is_selector()
allows testing if this feature processor is a selector
Definition FeatureProcess.h:566
virtual int _apply(MedFeatures &features, unordered_set< int > &ids)
Apply selection.
Definition FeatureSelector.cpp:75
int numToSelectDelta
Delta around numToSelect. will search to find [numToSelect - numToSelectDelta, numToSelect + numToSel...
Definition FeatureProcess.h:554
int numToSelect
Target number to select (if 0, ignored)
Definition FeatureProcess.h:551
float missing_value
Missing Value.
Definition FeatureProcess.h:542
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureSelector.cpp:107
unordered_set< string > required
Required Features.
Definition FeatureProcess.h:545
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureSelector.cpp:115
virtual int Learn(MedFeatures &features, unordered_set< int > &ids)
Find set of selected features - calls _learn function, and may be overridden directly.
Definition FeatureSelector.cpp:19
vector< string > selected
Selected Features (ordered)
Definition FeatureProcess.h:548
GetProbProcessor:
Definition FeatureProcess.h:1088
map< float, string > feature_names
feature names if multiple target_labels are given
Definition FeatureProcess.h:1094
bool remove_origin
determine whether to remove original if multiple target_labels are given
Definition FeatureProcess.h:1095
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:1509
float missing_value
Missing Value.
Definition FeatureProcess.h:1091
map< float, int > target_labels
if given, create a new feature per target label
Definition FeatureProcess.h:1093
int min_obs
minimal observations to calc prob - otherwise use prior
Definition FeatureProcess.h:1097
int overall_count
weight of overall probability
Definition FeatureProcess.h:1092
vector< float > overall_prob
default prob for unknown classes
Definition FeatureProcess.h:1100
bool all_labels
if given - take all labels as target-labels
Definition FeatureProcess.h:1096
vector< map< float, float > > probs
actual probability per class
Definition FeatureProcess.h:1099
ImportanceFeatureSelector - selector which uses feature importance method for specific model to rank ...
Definition FeatureProcess.h:845
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:988
float minStat
minimal threshold score to select the feature
Definition FeatureProcess.h:850
string predictor_params
the predictor parameters
Definition FeatureProcess.h:848
bool verbose
print all feature importance
Definition FeatureProcess.h:851
string importance_params
additional importance parameters for the feature importance
Definition FeatureProcess.h:849
string predictor
the predictor type - same as in the json file: qrf,lightgbm...
Definition FeatureProcess.h:847
IterativeFeatureSelector - Apply bottom-up or top-down iteration for feature selection.
Definition FeatureProcess.h:879
string grouping_mode
can also provide an external file with the grouping
Definition FeatureProcess.h:898
string predictor_params
the predictor parameters
Definition FeatureProcess.h:882
string rates
instruction on rate of selection - comma separated pairs : #-bound:step
Definition FeatureProcess.h:888
unordered_set< string > ignored
features to ignore in selection process
Definition FeatureProcess.h:895
ADD_SERIALIZATION_FUNCS(processor_type, predictor, predictor_params, predictor_params_vec, nfolds, folds, mode, rates_vec, cohort_params, bootstrap_params, msr_params, work_on_sets, required, ignored, numToSelect, selected, report, do_internal_cv, grouping_mode) private int _learn(MedFeatures &features, unordered_set< int > &ids)
Find set of selected features.
Definition FeatureSelector.cpp:1185
vector< int > folds
if given, perform only subset of the possible 'nfolds' folds in cross-validation
Definition FeatureProcess.h:886
string predictor
the predictor type - same as in the json file: qrf,lightgbm...
Definition FeatureProcess.h:881
string predictor_params_file
File with nFeatures-dependent predictor parameters.
Definition FeatureProcess.h:883
string msr_params
measurements parameters for bootstrap performance evaluation
Definition FeatureProcess.h:891
string bootstrap_params
parameters for bootstrapping ('/' separators)
Definition FeatureProcess.h:890
unordered_set< string > ungroupd_names
features-names (NAME in FTR_####.NAME) not to be grouped even in work_on_sets mode.
Definition FeatureProcess.h:894
string progress_file_path
file path to progress file
Definition FeatureProcess.h:897
bool group_to_sigs
If true will group ungroupd_names to signals.
Definition FeatureProcess.h:893
bool verbose
print all feature importance
Definition FeatureProcess.h:896
bool do_internal_cv
use nfolds and create internal splits (if false, uses original samples' splits)
Definition FeatureProcess.h:885
bool work_on_sets
work on sets of features according to signals
Definition FeatureProcess.h:892
int nfolds
number of folds for cross-validation
Definition FeatureProcess.h:884
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:1062
string cohort_params
cohort parameters for bootstrap performance evaluation (type:from,to/type:from,to/....
Definition FeatureProcess.h:889
string mode
'top2bottom' or 'bottom2top'
Definition FeatureProcess.h:887
Feature Selector : lasso.
Definition FeatureProcess.h:585
float rate
rate for SGD:
Definition FeatureProcess.h:601
float momentum
Momentum for SGD:
Definition FeatureProcess.h:604
float stop_at_err
Momentum for SGD:
Definition FeatureProcess.h:607
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:797
float initMaxLambda
Initial lambda.
Definition FeatureProcess.h:594
float lambdaRatio
Features less controlled in the selection stage (set lambda -> lambda*lambdaRatio)
Definition FeatureProcess.h:597
Feature Selector : MRMR.
Definition FeatureProcess.h:755
univariateSelectionParams params
Selection Params.
Definition FeatureProcess.h:758
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:405
A wrapper class which contains the MedBootstrap object and the results for later querying the scores f...
Definition MedBootstrap.h:324
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
Definition MedMat.h:63
Definition MedValueCleaner.h:61
float removeMax
Thresholds for removing.
Definition MedValueCleaner.h:71
float trimMax
Thresholds for trimming.
Definition MedValueCleaner.h:68
ValueCleanerParams params
Learning parameters.
Definition MedValueCleaner.h:65
FeatureMissingIndicator: creates a feature that indicates if a feature is missing or not.
Definition FeatureProcess.h:1164
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor Empty sets = requir...
Definition FeatureProcess.cpp:1588
int init(map< string, string > &mapper)
Virtual to init object from parsed fields.
Definition FeatureProcess.cpp:1538
string new_feature_name
generated feature name
Definition FeatureProcess.h:1169
string name
feature name postfix (new feature X is XXX.name)
Definition FeatureProcess.h:1167
float replace_value
if added, replace value in original matrix
Definition FeatureProcess.h:1168
float missing_value
missing value in original features matrix
Definition FeatureProcess.h:1166
A Processor which contains a vector of simpler processors Useful for applying same cleaners on a set ...
Definition FeatureProcess.h:169
int filter(unordered_set< string > &features)
Filter according to a subset of features.
Definition FeatureProcess.cpp:366
string select_learn_matrix(const vector< string > &matrix_tags) const
Will be called before learn to create new version for the matrix if needed - in parallel of existing ...
Definition FeatureProcess.cpp:458
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureProcess.cpp:410
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:439
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureProcess.cpp:426
MultiplierProcessor:
Definition FeatureProcess.h:1133
vector< string > selected_tags
the selected tags to activate on
Definition FeatureProcess.h:1135
bool divide
if true will divide instead of multiply
Definition FeatureProcess.h:1137
string multiplier_name
the name of the feature to multiply by
Definition FeatureProcess.h:1136
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition MultiplierProcessor.cpp:7
OneHotFeatProcessor:
Definition FeatureProcess.h:1038
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:1149
string removed_feature_name
name of feature to remove (if needed)
Definition FeatureProcess.h:1043
string other_feature_name
name of 'other' feature (if needed)
Definition FeatureProcess.h:1042
bool remove_last
if true, remove the feature corresponding to the last value to avoid linear dependency
Definition FeatureProcess.h:1047
bool rem_origin
if true, remove original feature after creating indices
Definition FeatureProcess.h:1044
bool add_other
if true, add an extra feature for values not in learning-set
Definition FeatureProcess.h:1045
vector< string > regex_list
define multilabel according to regexs list comma separated (don't check values in learn).
Definition FeatureProcess.h:1049
vector< string > regex_list_names
define the names for the columns in regex_list case.
Definition FeatureProcess.h:1050
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureProcess.cpp:1343
int max_values
maximal allowed number of different values
Definition FeatureProcess.h:1048
bool allow_other
if true, values in test, but not in learning-set are allowed
Definition FeatureProcess.h:1046
string index_feature_prefix
prefix of index features (names are prefix_value)
Definition FeatureProcess.h:1041
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureProcess.cpp:1321
Definition SerializableObject.h:32
int init_from_string(string init_string)
Init from string.
Definition SerializableObject.cpp:121
TagFeatureSelector - selector which leave us only with the selected "tags" given as param (if empty d...
Definition FeatureProcess.h:813
vector< string > removed_tags
tags to remove
Definition FeatureProcess.h:817
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:876
vector< string > selected_tags
the selected tags
Definition FeatureProcess.h:816
Feature Selector : Univariate.
Definition FeatureProcess.h:714
univariateSelectionParams params
Selection Params.
Definition FeatureProcess.h:718
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:183
Definition MedValueCleaner.h:17
When building strata on a set of several features, we build a cartesian product of their combination...
Definition FeatureProcess.h:401
Definition FeatureProcess.h:368
Definition FeatureProcess.h:663
int takeSquare
for correlation
Definition FeatureProcess.h:673
int max_samples
Utility : maximum number of samples to take for moments calculations.
Definition FeatureProcess.h:679
float pDistance
for samples distance correlation
Definition FeatureProcess.h:676
int nBins
for mutual information
Definition FeatureProcess.h:669