5#include <MedProcessTools/MedProcessTools/MedFeatures.h>
6#include <MedProcessTools/MedProcessTools/MedProcessUtils.h>
8#include <MedProcessTools/MedProcessTools/MedProcessUtils.h>
9#include <MedUtils/MedUtils/MedUtils.h>
10#include <MedAlgo/MedAlgo/BinSplitOptimizer.h>
11#include <unordered_set>
13#define DEFAULT_FEAT_CLNR_NTHREADS 24
56 string resolved_feature_name;
65 int learn_nthreads, clean_nthreads;
/// Overridable reset hook; this version has an empty body and does nothing.
70 virtual void clear() {};
/// Set default member values: both learn_nthreads and clean_nthreads are set to
/// DEFAULT_FEAT_CLNR_NTHREADS, and processor_type is set to FTR_PROCESS_LAST.
71 void init_defaults() { learn_nthreads = DEFAULT_FEAT_CLNR_NTHREADS; clean_nthreads = DEFAULT_FEAT_CLNR_NTHREADS; processor_type = FTR_PROCESS_LAST; };
/// Return a copy of this object's feature_name member.
78 virtual string get_feature_name() {
return this->
feature_name; }
/// Replace the contents of the output vector with a single entry: feature_name.
/// @param feature_names output vector; cleared first, so any previous content is discarded.
79 virtual void get_feature_names(vector<string> & feature_names) { feature_names.clear(); feature_names.push_back(
feature_name); };
/// Overridable learning step. This version ignores both arguments and returns 0.
/// @param features feature matrix (unused here)
/// @param ids set of ids (unused here)
/// @return always 0 in this version
82 virtual int Learn(
MedFeatures& features, unordered_set<int>& ids) {
return 0; }
/// Non-virtual entry point that forwards to the virtual Learn(features, ids).
/// @return whatever Learn returns
92 int learn(
MedFeatures& features, unordered_set<int>& ids) {
return Learn(features, ids); }
95 virtual int _apply(
MedFeatures& features, unordered_set<int>& ids,
bool learning) {
97 return _apply(features, ids);
100 virtual int _apply(
MedFeatures& features, unordered_set<int>& ids);
101 virtual int _conditional_apply(
MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features,
bool learning);
112 int apply(
MedFeatures& features, unordered_set<string>& req_features,
bool learning);
114 int apply(
MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features,
bool learning);
/// Convenience overload: forwards to apply(features, req_features, learning) with learning = true.
/// @return the result of the three-argument overload
117 int apply(
MedFeatures& features, unordered_set<string>& req_features) {
return apply(features, req_features,
true); };
/// Convenience overload: forwards to apply(features, ids, req_features, learning) with learning = true.
/// @return the result of the four-argument overload
119 int apply(
MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features) {
return apply(features, ids, req_features,
true); };
124 static FeatureProcessor *make_processor(
string processor_name,
string params);
/// Overridable initialization from an opaque parameter pointer.
/// This version ignores processor_params and returns 0.
127 virtual int init(
void *processor_params) {
return 0; };
/// Overridable initialization from a string-to-string map.
/// This version ignores mapper and returns 0.
128 virtual int init(map<string, string>& mapper) {
return 0; };
/// Test whether this processor's feature_name is a member of the given set.
/// @param features set of feature names to look in
/// @return 1 if feature_name is found in features, 0 otherwise
131 virtual int filter(unordered_set<string>& features) {
return (features.find(
feature_name) == features.end()) ? 0 : 1; };
/// Report whether this processor is relevant to the given set of required output features.
/// @param out_req_features required feature names
/// @return true when the set is empty, or when it contains resolved_feature_name; false otherwise
137 virtual bool are_features_affected(unordered_set<string>& out_req_features) {
return (out_req_features.empty() || out_req_features.find(resolved_feature_name) != out_req_features.end()); }
/// Overridable mapping from required outputs to required inputs.
/// This version copies out_req_features into in_req_features unchanged (previous content of in_req_features is overwritten).
142 virtual void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features) { in_req_features = out_req_features; };
152 size_t get_processor_size();
153 size_t processor_serialize(
unsigned char *blob);
156 virtual void dprint(
const string &pref,
int rp_flag);
177 bool use_parallel_learn;
178 bool use_parallel_apply;
181 vector<FeatureProcessor *> processors;
/// Set default member values: processor_type = FTR_PROCESS_MULTI, duplicate = 0,
/// members_type = FTR_PROCESS_LAST, init_string and tag empty, and both
/// use_parallel_learn and use_parallel_apply set to true.
189 void init_defaults() { processor_type =
FTR_PROCESS_MULTI; duplicate = 0; members_type = FTR_PROCESS_LAST; init_string =
""; tag =
""; use_parallel_learn =
true; use_parallel_apply =
true; };
195 int init(map<string, string>& mapper);
201 int Learn(
MedFeatures& features, unordered_set<int>& ids);
204 int _apply(
MedFeatures& features, unordered_set<int>& ids,
bool learning);
205 int _conditional_apply(
MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& req_features,
bool learning);
207 virtual void get_feature_names(vector<string>& feature_names);
214 int filter(unordered_set<string>& features);
217 void dprint(
const string &pref,
int fp_flag);
227 ADD_SERIALIZATION_FUNCS(processor_type, members_type, init_string, duplicate, tag, processors, use_parallel_apply)
230#define DEF_FTR_TRIMMING_SD_NUM 7
231#define DEF_FTR_REMOVING_SD_NUM 14
246 void init_defaults() {
248 params.missing_value = MED_MAT_MISSING_VALUE;
249 params.trimming_sd_num = DEF_FTR_TRIMMING_SD_NUM;
250 params.removing_sd_num = DEF_FTR_REMOVING_SD_NUM;
/// Initialize from an opaque parameter pointer by delegating to MedValueCleaner::init.
/// @return the value returned by MedValueCleaner::init
259 int init(
void *processor_params) {
return MedValueCleaner::init(processor_params); };
262 int init(map<string, string>& mapper);
268 int Learn(
MedFeatures& features, unordered_set<int>& ids);
269 int iterativeLearn(
MedFeatures& features, unordered_set<int>& ids);
270 int quantileLearn(
MedFeatures& features, unordered_set<int>& ids);
273 int _apply(
MedFeatures& features, unordered_set<int>& ids);
318 float max_val_prctile = 1;
319 float max_val_for_triming = 2;
320 float prctile_th = (float)0.001;
331 int Learn(
MedFeatures& features, unordered_set<int>& ids);
334 int _apply(
MedFeatures& features, unordered_set<int>& ids);
341 int init(map<string, string>& mapper);
349 ADD_SERIALIZATION_FUNCS(processor_type,
feature_name, resolved_feature_name,
mean, sd,
resolution,
normalizeSd,
fillMissing,
resolution_only,
verbosity,
resolution_bin,
371 float resolution, min, max;
/// Construct a strata description by storing the four arguments in name, resolution, min and max.
/// Note: nValues is not assigned in this constructor body.
375 featureStrata(
string& _name,
float _resolution,
float _min,
float _max) { name = _name; resolution = _resolution; min = _min; max = _max; }
/// Compute nValues from the current min, max and resolution:
/// (int)(max / resolution) - (int)(min / resolution) + 1.
/// Each quotient is truncated toward zero by the int cast before the subtraction.
/// NOTE(review): assumes resolution is non-zero - confirm callers guarantee this.
377 void SetNValues() { nValues = ((int)(max / resolution) - (int)(min / resolution) + 1); }
380 int getIndex(
float value,
float missing_val)
const {
381 if (value == missing_val)
386 else if (value <= min)
389 return ((
int)(value / resolution) - (
int)(min / resolution));
403 vector<featureStrata> stratas;
/// @return the number of entries currently held in the stratas vector
406 size_t nStratas()
const {
return stratas.size(); }
410 if (stratas.size() == 0)
413 factors.resize(stratas.size());
415 for (
auto& strata : stratas)
419 for (
int i = 1; i < stratas.size(); i++)
420 factors[i] = factors[i - 1] * stratas[i - 1].nValues;
423 int nValues()
const {
424 if (stratas.size() == 0)
427 return factors.back() * stratas.back().nValues;
430 int getIndex(
float missing_val,
431 const vector<
const vector<float> *> &strataValues,
int row)
const {
433 for (
int i = 0; i < nStratas(); i++)
434 index += factors[i] * stratas[i].getIndex(strataValues[i]->at(row), missing_val);
450 float round_to_closest(
float val)
const;
454 float missing_value = MED_MAT_MISSING_VALUE;
462 int min_samples = 50;
465 int leave_missing_for_small_stratas = 0;
468 int impute_strata_with_missing = 0;
471 vector<imputeMomentTypes> moment_type_vec;
472 vector<float> default_moment_vec;
473 vector<vector<float>> moments_vec;
476 imputeMomentTypes moment_type = IMPUTE_MMNT_MEAN;
477 float default_moment;
478 vector<float> moments;
481 vector < pair<float, float> > default_histogram;
482 vector < vector<pair<float, float> > > histograms;
484 vector<int> strata_sizes;
489 bool round_to_existing_value =
true;
490 vector<float> existing_values;
498 void addStrata(
string& init_string);
/// Append a copy of the given strata to imputerStrata.stratas.
499 void addStrata(
featureStrata& strata) { imputerStrata.stratas.push_back(strata); }
/// Build a featureStrata from (name, resolution, min, max) and append it to imputerStrata.stratas.
500 void addStrata(
string& name,
float resolution,
float min,
float max) { imputerStrata.stratas.push_back(
featureStrata(name, resolution, min, max)); }
504 int init(map<string, string>& mapper);
506 imputeMomentTypes getMomentType(
string& entry);
515 int Learn(
MedFeatures& features, unordered_set<int>& ids);
518 int _apply(
MedFeatures& features, unordered_set<int>& ids,
bool learning);
521 void check_stratas_name(
MedFeatures& features, map <string, string> &strata_name_conversion);
525 ADD_SERIALIZATION_FUNCS(processor_type,
feature_name, resolved_feature_name, missing_value, imputerStrata, moment_type, moments, histograms, strata_sizes, default_moment, default_histogram,
526 moment_type_vec, moments_vec, default_moment_vec, leave_missing_for_small_stratas, impute_strata_with_missing, round_to_existing_value, existing_values)
528 void dprint(
const string &pref,
int fp_flag);
564 virtual int _conditional_apply(
MedFeatures& features, unordered_set<int>& ids, unordered_set<string>& out_req_features);
/// Overridable selection-learning step. This version ignores both arguments and returns 0.
577 virtual int _learn(
MedFeatures& features, unordered_set<int>& ids) {
return 0; }
591 int init(map<string, string>& mapper);
598 vector<string> lax_lasso_features;
622 int _learn(
MedFeatures& features, unordered_set<int>& ids);
634 float percentage = 1.0F;
641 int init(map<string, string>& mapper);
653 int _learn(
MedFeatures& features, unordered_set<int>& ids);
661} UnivariateSelectionMethod;
665 UnivariateSelectionMethod method;
670 MedBinningType binMethod = BIN_EQUIDIST;
681 UnivariateSelectionMethod get_method(
string name) {
683 boost::algorithm::to_lower(name);
684 if (name ==
"pearson")
685 return UNIV_SLCT_PRSN;
686 else if (name ==
"mi" || name ==
"mutual_information" || name ==
"mutualinformation")
688 else if (name ==
"dcorr" || name ==
"dist_corr" || name ==
"distcorr")
689 return UNIV_SLCT_DCORR;
691 return UNIV_SLCT_LAST;
694 MedBinningType get_binning_method(
string name) {
696 boost::algorithm::to_lower(name);
697 if (name ==
"equi_dist")
699 else if (name ==
"equi_size")
725 int init(map<string, string>& mapper);
737 int getAbsPearsonCorrs(
MedFeatures& features, unordered_set<int>& ids, vector<float>& stats);
738 int getMIs(
MedFeatures& features, unordered_set<int>& ids, vector<float>& stats);
739 int getDistCorrs(
MedFeatures& features, unordered_set<int>& ids, vector<float>& stats);
741 int _learn(
MedFeatures& features, unordered_set<int>& ids);
760 MRMRPenaltyMethod penaltyMethod;
767 int init(map<string, string>& mapper);
768 void init_defaults();
769 MRMRPenaltyMethod get_penalty_method(
string _method);
780 int _learn(
MedFeatures& features, unordered_set<int>& ids);
794#include "IterativeImputer.h"
802#define DEF_MAX_SAMPLE 1000
803void get_all_values(
MedFeatures& features,
string& signalName, unordered_set<int>& ids, vector<float>& values,
int max_sample = DEF_MAX_SAMPLE);
804void get_all_outcomes(
MedFeatures& features, unordered_set<int>& ids, vector<float>& values,
int max_sample = DEF_MAX_SAMPLE);
805void smearBins(vector<int>& bins,
int nBins,
int reqNbins);
823 int init(map<string, string>& mapper);
830 void dprint(
const string &pref,
int fp_flag);
837 int _learn(
MedFeatures& features, unordered_set<int>& ids);
857 int init(map<string, string>& mapper);
870 int _learn(
MedFeatures& features, unordered_set<int>& ids);
888 string rates =
"50:1,100:2,500:5,5000:10";
900 vector<int> rates_vec;
901 vector<string> predictor_params_vec;
902 string measurement_name;
903 vector<string> report;
910 int init(map<string, string>& mapper);
921 void print_report(
string& fileName);
924 void retrace(
MedFeatures& features, unordered_set<int>& ids, vector<string>& families_order,
int start,
int end);
/// Convenience overload without an ids argument: creates an empty id set and forwards to
/// retrace(features, ids, families_order, start, end).
/// NOTE(review): an empty id set presumably means "no id filtering" - confirm against the five-argument overload.
925 void retrace(
MedFeatures& features, vector<string>& families_order,
int start,
int end) { unordered_set<int> empty; retrace(features, empty, families_order, start, end); }
929 ADD_SERIALIZATION_FUNCS(processor_type,
predictor,
predictor_params, predictor_params_vec,
nfolds,
folds,
mode, rates_vec,
cohort_params,
bootstrap_params,
msr_params,
work_on_sets,
934 unordered_set<string> resolved_required, resolved_ignored;
940 void get_rates_vec();
943 void read_params_vec();
946 void get_features_families(
MedFeatures& features, map<
string, vector<string> >& featureFamilies);
949 void prepare_for_iterations(
MedBootstrapResult& bootstrapper,
MedFeatures& features, vector<int>&
folds, vector<vector<int>>& trainRows, vector<vector<int>>& testRows, vector<vector<float>>&trainLabels,
950 vector<vector<MedSample>>&testSamples,
MedFeatures& bootstrapFeatures);
951 void pre_learn(
MedFeatures& features,
MedBootstrapResult& bootstrapper, map<
string, vector<string> >& featureFamilies, vector<int>& orig_folds);
958 void retraceTop2BottomSelection(
MedFeatures& features, map<
string, vector<string>>& featureFamilies,
MedBootstrapResult& bootstrapper, vector<string>& order,
int start,
int end);
959 void retraceBottom2TopSelection(
MedFeatures& features, map<
string, vector<string>>& featureFamilies,
MedBootstrapResult& bootstrapper, vector<string>& order,
int start,
int end);
1014 int init(map<string, string>& mapper);
1024 vector<int> selected_indexes;
1026 int _learn(
MedFeatures& features, unordered_set<int>& ids);
1029 int _apply(
MedFeatures& features, unordered_set<int>& ids);
1051 string other_suffix =
"other";
1054 map<float, vector<string>> value2feature;
1061 int init(map<string, string>& mapper);
1070 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
1076 int Learn(
MedFeatures& features, unordered_set<int>& ids);
1077 int _apply(
MedFeatures& features, unordered_set<int>& ids);
1078 string get_feature_name(
float value,
const string &out_prefix, unordered_map<float, string> &value2Name,
float missing_value);
1108 int Learn(
MedFeatures& features, unordered_set<int>& ids);
1111 int _apply(
MedFeatures& features, unordered_set<int>& ids);
1115 int init(map<string, string>& mapper);
1144 int init(map<string, string>& mapper);
1151 void dprint(
const string &pref,
int fp_flag);
1158 int _apply(
MedFeatures& features, unordered_set<int>& ids);
1174 int _apply(
MedFeatures& features, unordered_set<int>& ids);
1175 int init(map<string, string>& mapper);
1176 void update_req_features_vec(unordered_set<string>& out_req_features, unordered_set<string>& in_req_features);
1192 int init(map<string, string>& mapper);
1219 string get_bin_name(
float num)
const;
1242 int Learn(
MedFeatures& features, unordered_set<int>& ids);
1245 int _apply(
MedFeatures& features, unordered_set<int>& ids);
1249 int init(map<string, string>& mapper);
FeatureProcessorTypes
Definition FeatureProcess.h:21
@ FTR_PROCESSOR_LASSO_SELECTOR
"lasso" to create LassoSelector
Definition FeatureProcess.h:29
@ FTR_PROCESSOR_ITERATIVE_SELECTOR
"iterative_selector" applies bottom-up or top-down iteration for feature selection....
Definition FeatureProcess.h:32
@ FTR_PROCESS_IMPUTER
"imputer" to create FeatureImputer
Definition FeatureProcess.h:25
@ FTR_PROCESSOR_TAGS_SELECTOR
"tags_selector" to create TagFeatureSelector
Definition FeatureProcess.h:30
@ FTR_PROCESS_NORMALIZER
"normalizer" to create FeatureNormalizer
Definition FeatureProcess.h:24
@ FTR_PROCESS_MULTIPLIER
"multiplier" to create MultiplierProcessor - to multiply feature by other feature
Definition FeatureProcess.h:39
@ FTR_PROCESS_DUPLICATE
"duplicate" to create DuplicateProcessor - duplicates samples in order to do multiple imputations.
Definition FeatureProcess.h:41
@ FTR_PROCESS_BINNING
"binning" to create BinningFeatProcessor - binning with one hot on the bins
Definition FeatureProcess.h:43
@ FTR_PROCESS_ONE_HOT
"one_hot" to create OneHotFeatProcessor - make one-hot features from a given feature
Definition FeatureProcess.h:36
@ FTR_PROCESS_GET_PROB
"get_prob" to create GetProbFeatProcessor - replace categorical feature with probability of outcome i...
Definition FeatureProcess.h:37
@ FTR_PROCESS_MISSING_INDICATOR
"missing_indicator" to create MissingIndicatorProcessor - creates a feature that indicates if a featu...
Definition FeatureProcess.h:42
@ FTR_PROCESS_UNIVARIATE_SELECTOR
"univariate_selector" to create UnivariateFeatureSelector
Definition FeatureProcess.h:27
@ FTR_PROCESS_BASIC_OUTLIER_CLEANER
"basic_outlier_cleaner" or "basic_cleaner" to create FeatureBasicOutlierCleaner
Definition FeatureProcess.h:23
@ FTR_PROCESSOR_IMPORTANCE_SELECTOR
"importance_selector" to create ImportanceFeatureSelector
Definition FeatureProcess.h:31
@ FTR_PROCESS_PREDICTOR_IMPUTER
"predcitor_imputer" to create PredictorImputer
Definition FeatureProcess.h:38
@ FTR_PROCESS_ENCODER_PCA
"pca" to create FeaturePCA
Definition FeatureProcess.h:35
@ FTR_PROCESS_MULTI
"multi_processor" or "multi" to create MultiFeatureProcessor
Definition FeatureProcess.h:22
@ FTR_PROCESS_REMOVE_DGNRT_FTRS
"remove_deg" to create DgnrtFeatureRemvoer
Definition FeatureProcess.h:33
@ FTR_PROCESS_DO_CALC
"do_calc" to create DoCalcFeatProcessor
Definition FeatureProcess.h:26
@ FTR_PROCESSOR_MRMR_SELECTOR
"mrmr" or "mrmr_selector" to create MRMRFeatureSelector
Definition FeatureProcess.h:28
@ FTR_PROCESS_RESAMPLE_WITH_MISSING
"resample_with_missing" to create ResampleMissingProcessor - adds missing values to learn matrix
Definition FeatureProcess.h:40
@ FTR_PROCESS_ITERATIVE_IMPUTER
"iterative_imputer" to create IterativeImputer
Definition FeatureProcess.h:34
A parent class for single-value cleaners.
@ VAL_CLNR_ITERATIVE
"iterative"
Definition MedValueCleaner.h:12
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
GetProbProcessor:
Definition FeatureProcess.h:1217
Binning_Wrapper bin_sett
"bin_sett" parameter - controls how to bin the feature.
Definition FeatureProcess.h:1235
bool one_hot
If true will split each bin value to one hot of 0/1.
Definition FeatureProcess.h:1228
string bin_format
formatting of the feature name after binning
Definition FeatureProcess.h:1232
float missing_value
Missing Value.
Definition FeatureProcess.h:1222
bool keep_original_val
only relevant in one hot mode.
Definition FeatureProcess.h:1230
bool remove_origin
If true will remove the original/source feature.
Definition FeatureProcess.h:1226
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition BinningFeatProcessor.cpp:83
float missing_target_val
missing value target mapping. converts missing value to this value
Definition FeatureProcess.h:1224
Definition FeatureProcess.h:1184
int num_of_bins() const
returns number of bins
Definition BinningFeatProcessor.cpp:74
int init(map< string, string > &mapper)
Definition BinningFeatProcessor.cpp:7
int get_idx(float v) const
returns index for each value
Definition BinningFeatProcessor.cpp:69
vector< double > bin_repr_vals
the representative value for each bin. With size := bin_cutoffs.size()+1
Definition FeatureProcess.h:1187
vector< double > bin_cutoffs
index i for value (v) := bin_cutoffs[i-1] < v <= bin_cutoffs[i]
Definition FeatureProcess.h:1186
void load_bin_settings(const vector< float > &nums, vector< float > &y)
if has use_bin_settings => will update bin_cutoffs, bin_repr_vals
Definition BinningFeatProcessor.cpp:42
string use_bin_settings
if not empty - will use bin setting to create bins:
Definition FeatureProcess.h:1189
float normalize(float v) const
normalize value into bin and representative
Definition BinningFeatProcessor.cpp:78
Feature Selector : Remove Degenerate features.
Definition FeatureProcess.h:630
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:860
A simple cleaner considering each value of a certain feature separately.
Definition FeatureProcess.h:237
FeatureEncoder - General class for encoding features - PCA, autoencoder...
Definition FeatureProcess.h:966
vector< string > names
generated names
Definition FeatureProcess.h:970
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureEncoder.cpp:15
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureEncoder.cpp:32
Feature Imputer to complete missing values.
Definition FeatureProcess.h:448
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:1055
void print()
debug and print
Definition FeatureProcess.cpp:753
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureProcess.cpp:1127
bool verbose
If true will print how many missing values were in each feature.
Definition FeatureProcess.h:455
bool verbose_learn
If true will call print after learn.
Definition FeatureProcess.h:456
int max_samples
Utility : maximum number of samples to take for moments calculations.
Definition FeatureProcess.h:487
Feature Normalizer.
Definition FeatureProcess.h:286
float max_x
parameters of the transformation
Definition FeatureProcess.h:321
int resolution
resolution : if > 0 , will keep only the given number of digits after the point.
Definition FeatureProcess.h:302
float mean
Moments.
Definition FeatureProcess.h:299
bool use_linear_transform
If true will convert into a linear transform from the lower prctile to the high prctile, and has a trimming value.
Definition FeatureProcess.h:317
int verbosity
verbosity
Definition FeatureProcess.h:314
void reverse_apply(float &feature_value) const
Reverse action of Apply - denorm feature value.
Definition FeatureProcess.cpp:703
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:725
bool fillMissing
Fill missing values with mean.
Definition FeatureProcess.h:296
float resolution_bin
A factor to divide by - take floor and then multiply by again. Used in resolution_only mode.
Definition FeatureProcess.h:305
bool normalizeSd
Normalize Standard Deviation.
Definition FeatureProcess.h:293
string select_learn_matrix(const vector< string > &matrix_tags) const
Will be called before learn to create new version for the matrix if needed - in parallel of existing ...
Definition FeatureProcess.cpp:569
bool resolution_only
if resolution only
Definition FeatureProcess.h:311
float missing_value
Missing Value.
Definition FeatureProcess.h:290
int max_samples
Utility : maximum number of samples to take for moments calculations.
Definition FeatureProcess.h:308
PCA Parameters class.
Definition FeatureProcess.h:989
int subsample_count
subsample in the pca rows to speed up
Definition FeatureProcess.h:993
float pca_cutoff
PCA variance threshold to stop.
Definition FeatureProcess.h:992
int pca_top
Max Number of PCA Components to take.
Definition FeatureProcess.h:991
FeaturePCA - PCA encoder.
Definition FeatureProcess.h:1004
FeaturePCAParams params
PCA parameters.
Definition FeatureProcess.h:1007
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureEncoder.cpp:206
Definition FeatureProcess.h:51
virtual int init(map< string, string > &mapper)
Virtual to init object from parsed fields.
Definition FeatureProcess.h:128
string resolve_feature_name(MedFeatures &features, string substr)
Utility : get corresponding name in MedFeatures.
Definition FeatureProcess.cpp:223
void * new_polymorphic(string derived_class_name)
for polymorphic classes that want to be able to serialize/deserialize a pointer * to the derived clas...
Definition FeatureProcess.cpp:83
virtual int filter(unordered_set< string > &features)
Filter according to a subset of features.
Definition FeatureProcess.h:131
int apply(MedFeatures &features, bool learning)
PostProcess of MedFeatures - on all or a subset of the ids calls virtual function "_apply/_conditiona...
Definition FeatureProcess.cpp:183
string feature_name
Feature name (+ name as it appears in MedFeatures).
Definition FeatureProcess.h:55
virtual bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureProcess.h:137
int learn(MedFeatures &features)
PostProcess of MedFeatures - on all ids.
Definition FeatureProcess.cpp:174
virtual string select_learn_matrix(const vector< string > &matrix_tags) const
Will be called before learn to create new version for the matrix if needed - in parallel of existing ...
Definition FeatureProcess.h:59
virtual void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor Empty sets = requir...
Definition FeatureProcess.h:142
virtual bool is_selector()
allows testing if this feature processor is a selector
Definition FeatureProcess.h:145
Feature Selector abstract class.
Definition FeatureProcess.h:538
bool is_selector()
allows testing if this feature processor is a selector
Definition FeatureProcess.h:566
virtual int _apply(MedFeatures &features, unordered_set< int > &ids)
Apply selection.
Definition FeatureSelector.cpp:75
int numToSelectDelta
Delta around numToSelect. will search to find [numToSelect - numToSelectDelta, numToSelect + numToSel...
Definition FeatureProcess.h:554
int numToSelect
Target number to select (if 0, ignored)
Definition FeatureProcess.h:551
float missing_value
Missing Value.
Definition FeatureProcess.h:542
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureSelector.cpp:107
unordered_set< string > required
Required Features.
Definition FeatureProcess.h:545
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureSelector.cpp:115
virtual int Learn(MedFeatures &features, unordered_set< int > &ids)
Find set of selected features - calls the _learn function, and may be overridden directly.
Definition FeatureSelector.cpp:19
vector< string > selected
Selected Features (ordered)
Definition FeatureProcess.h:548
GetProbProcessor:
Definition FeatureProcess.h:1088
map< float, string > feature_names
feature names if multiple target_labels are given
Definition FeatureProcess.h:1094
bool remove_origin
determine whether to remove original if multiple target_labels are given
Definition FeatureProcess.h:1095
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:1509
float missing_value
Missing Value.
Definition FeatureProcess.h:1091
map< float, int > target_labels
if given, create a new feature per target label
Definition FeatureProcess.h:1093
int min_obs
minimal observations to calc prob - otherwise use prior
Definition FeatureProcess.h:1097
int overall_count
weight of overall probability
Definition FeatureProcess.h:1092
vector< float > overall_prob
default prob for unknown classes
Definition FeatureProcess.h:1100
bool all_labels
if given - take all labels as target-labels
Definition FeatureProcess.h:1096
vector< map< float, float > > probs
actual probability per class
Definition FeatureProcess.h:1099
ImportanceFeatureSelector - selector which uses feature importance method for specific model to rank ...
Definition FeatureProcess.h:845
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:988
float minStat
minimal threshold score to select the feature
Definition FeatureProcess.h:850
string predictor_params
the predictor parameters
Definition FeatureProcess.h:848
bool verbose
print all feature importance
Definition FeatureProcess.h:851
string importance_params
additional importance parameters for the feature importance
Definition FeatureProcess.h:849
string predictor
the predictor type - same as in the json file: qrf,lightgbm...
Definition FeatureProcess.h:847
IterativeFeatureSelector - Apply bottom-up or top-down iteration for feature selection.
Definition FeatureProcess.h:879
string grouping_mode
can also provide an external file with the grouping
Definition FeatureProcess.h:898
string predictor_params
the predictor parameters
Definition FeatureProcess.h:882
string rates
instruction on rate of selection - comma separated pairs : #-bound:step
Definition FeatureProcess.h:888
unordered_set< string > ignored
features to ignore in selection process
Definition FeatureProcess.h:895
ADD_SERIALIZATION_FUNCS(processor_type, predictor, predictor_params, predictor_params_vec, nfolds, folds, mode, rates_vec, cohort_params, bootstrap_params, msr_params, work_on_sets, required, ignored, numToSelect, selected, report, do_internal_cv, grouping_mode) private int _learn(MedFeatures &features, unordered_set< int > &ids)
Find set of selected features.
Definition FeatureSelector.cpp:1185
vector< int > folds
if given, perform only subset of the possible 'nfolds' folds in cross-validation
Definition FeatureProcess.h:886
string predictor
the predictor type - same as in the json file: qrf,lightgbm...
Definition FeatureProcess.h:881
string predictor_params_file
File with nFeatures-dependent predictor parameters.
Definition FeatureProcess.h:883
string msr_params
measurements parameters for bootstrap performance evaluation
Definition FeatureProcess.h:891
string bootstrap_params
parameters for bootstrapping ('/' separators)
Definition FeatureProcess.h:890
unordered_set< string > ungroupd_names
features-names (NAME in FTR_####.NAME) not to be grouped even in work_on_sets mode.
Definition FeatureProcess.h:894
string progress_file_path
file path to progress file
Definition FeatureProcess.h:897
bool group_to_sigs
If true will group ungroupd_names to signals.
Definition FeatureProcess.h:893
bool verbose
print all feature importance
Definition FeatureProcess.h:896
bool do_internal_cv
use nfolds and create internal splits (if false, uses original samples' splits)
Definition FeatureProcess.h:885
bool work_on_sets
work on sets of features according to signals
Definition FeatureProcess.h:892
int nfolds
number of folds for cross-validation
Definition FeatureProcess.h:884
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:1062
string cohort_params
cohort parameters for bootstrap performance evaluation (type:from,to/type:from,to/....
Definition FeatureProcess.h:889
string mode
'top2bottom' or 'bottom2top'
Definition FeatureProcess.h:887
Feature Selector : lasso.
Definition FeatureProcess.h:585
float rate
rate for SGD:
Definition FeatureProcess.h:601
float momentum
Momentum for SGD:
Definition FeatureProcess.h:604
float stop_at_err
Momentum for SGD:
Definition FeatureProcess.h:607
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:797
float initMaxLambda
Initial lambda.
Definition FeatureProcess.h:594
float lambdaRatio
Features less controlled in the selection stage (set lambda -> lambda*lambdaRatio)
Definition FeatureProcess.h:597
Feature Selector : MRMR.
Definition FeatureProcess.h:755
univariateSelectionParams params
Selection Params.
Definition FeatureProcess.h:758
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:405
A wrapper class which contains the MedBootstrap object and the results for later querying the scores f...
Definition MedBootstrap.h:324
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
Definition MedValueCleaner.h:61
float removeMax
Thresholds for removing.
Definition MedValueCleaner.h:71
float trimMax
Thresholds for trimming.
Definition MedValueCleaner.h:68
ValueCleanerParams params
Learning parameters.
Definition MedValueCleaner.h:65
FeatureMissingIndicator: creates a feature that indicates if a feature is missing or not.
Definition FeatureProcess.h:1164
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor Empty sets = requir...
Definition FeatureProcess.cpp:1588
int init(map< string, string > &mapper)
Virtual to init object from parsed fields.
Definition FeatureProcess.cpp:1538
string new_feature_name
generated feature name
Definition FeatureProcess.h:1169
string name
feature name postfix (new feature X is XXX.name)
Definition FeatureProcess.h:1167
float replace_value
if added, replace value in original matrix
Definition FeatureProcess.h:1168
float missing_value
missing value in original features matrix
Definition FeatureProcess.h:1166
A Processor which contains a vector of simpler processors Useful for applying same cleaners on a set ...
Definition FeatureProcess.h:169
int filter(unordered_set< string > &features)
Filter according to a subset of features.
Definition FeatureProcess.cpp:366
string select_learn_matrix(const vector< string > &matrix_tags) const
Will be called before learn to create new version for the matrix if needed - in parallel of existing ...
Definition FeatureProcess.cpp:458
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureProcess.cpp:410
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:439
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureProcess.cpp:426
MultiplierProcessor:
Definition FeatureProcess.h:1133
vector< string > selected_tags
the selected tags to activate on
Definition FeatureProcess.h:1135
bool divide
if true will divide instead of multiply
Definition FeatureProcess.h:1137
string multiplier_name
the name of the feature to multiply by
Definition FeatureProcess.h:1136
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition MultiplierProcessor.cpp:7
OneHotFeatProcessor:
Definition FeatureProcess.h:1038
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureProcess.cpp:1149
string removed_feature_name
name of feature to remove (if needed)
Definition FeatureProcess.h:1043
string other_feature_name
name of 'other' feature (if needed)
Definition FeatureProcess.h:1042
bool remove_last
if true, remove the feature corresponding to the last value to avoid linear dependency
Definition FeatureProcess.h:1047
bool rem_origin
if true, remove original feature after creating indices
Definition FeatureProcess.h:1044
bool add_other
if true, add an extra feature for values not in learning-set
Definition FeatureProcess.h:1045
vector< string > regex_list
define multilabel according to regexs list comma separated (don't check values in learn).
Definition FeatureProcess.h:1049
vector< string > regex_list_names
define the names for the columns in regex_list case.
Definition FeatureProcess.h:1050
void update_req_features_vec(unordered_set< string > &out_req_features, unordered_set< string > &in_req_features)
update sets of required as input according to set required as output to processor
Definition FeatureProcess.cpp:1343
int max_values
maximal allowed number of different values
Definition FeatureProcess.h:1048
bool allow_other
if true, values in test, but not in learning-set are allowed
Definition FeatureProcess.h:1046
string index_feature_prefix
prefix of index features (names are prefix_value)
Definition FeatureProcess.h:1041
bool are_features_affected(unordered_set< string > &out_req_features)
check if a set of features is affected by the current processor
Definition FeatureProcess.cpp:1321
Definition SerializableObject.h:32
int init_from_string(string init_string)
Init from string.
Definition SerializableObject.cpp:121
TagFeatureSelector - selector which leave us only with the selected "tags" given as param (if empty d...
Definition FeatureProcess.h:813
vector< string > removed_tags
tags to remove
Definition FeatureProcess.h:817
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:876
vector< string > selected_tags
the selected tags
Definition FeatureProcess.h:816
Feature Selector : Univariate.
Definition FeatureProcess.h:714
univariateSelectionParams params
Selection Params.
Definition FeatureProcess.h:718
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureSelector.cpp:183
Definition MedValueCleaner.h:17
When building strata on a set of several features, we build a cartesian product of their combination...
Definition FeatureProcess.h:401
Definition FeatureProcess.h:368
Definition FeatureProcess.h:663
int takeSquare
for correlation
Definition FeatureProcess.h:673
int max_samples
Utility : maximum number of samples to take for moments calculations.
Definition FeatureProcess.h:679
float pDistance
for samples distance correlation
Definition FeatureProcess.h:676
int nBins
for mutual information
Definition FeatureProcess.h:669