Medial Code Documentation
Loading...
Searching...
No Matches
MedFeatures.h
1// MedFeatures - holding data as a map from name to vector
2
3#ifndef __MED_FEATURES_H__
4#define __MED_FEATURES_H__
5
6#include <InfraMed/InfraMed/InfraMed.h>
8#include <MedProcessTools/MedProcessTools/MedProcessUtils.h>
9#include <MedProcessTools/MedProcessTools/MedSamples.h>
10#include <MedMat/MedMat/MedMat.h>
11#include <random>
12
13//.......................................................................................
14// A structure holding feature attributes
15//.......................................................................................
17public:
18 bool normalized = false;
19 bool imputed = false;
20 float denorm_mean = 0.0, denorm_sdv = 1.0;
21
22 unordered_map<float, string> value2Name;
23
24 // Serialization
25 ADD_CLASS_NAME(FeatureAttr)
27};
28
29//.......................................................................................
46//.......................................................................................
// A class for holding features data as a virtual matrix: each named feature is a
// column (vector<float>) and each entry of `samples` is a row.
47class MedFeatures final : public SerializableObject {
48public:
49
50 // Data
// data    : the actual matrix of values per sample, keyed by feature name
// weights : a vector of weight per sample
// samples : the samples representing the lines (rows)
51 map<string, vector<float> > data;
52 vector<float> weights;
53 vector<MedSample> samples;
54
// pid -> pair; get_pid_pos() returns .first and get_pid_len() returns .second.
// Feature generation assumes that all "rows" for a specific pid are adjacent.
57 map<int, pair<int, int>> pid_pos_len;
58
59 // Attributes
// attributes : a FeatureAttr per feature
// tags       : a set of tags per feature
60 map<string, FeatureAttr> attributes;
61 map<string, unordered_set<string> > tags;
62
63 // masks for cleaning , imputing
// cleaned_mask (0x01) and imputed_mask (0x02) are distinct single-bit flag values.
// masks: presumably one flag byte per sample, keyed by feature name - TODO confirm.
// medf_missing_value: value passed by the no-argument mark_imputed_in_masks() overload.
64 const static unsigned char cleaned_mask = (unsigned char)0x01;
65 const static unsigned char imputed_mask = (unsigned char)0x02;
66 map<string, vector<unsigned char>> masks;
67 float medf_missing_value = (float)MED_MAT_MISSING_VALUE;
68
69 // time Unit
// NOTE(review): the declarations of time_unit and global_serial_id_cnt (used by the
// constructors below) are not visible in this listing.
71
74
75 // Functions
76
// Constructor given time-unit.
78 MedFeatures(int _time_unit) { time_unit = _time_unit; }
// Default constructor: uses global_default_time_unit and also resets
// global_serial_id_cnt to 0.
// NOTE(review): the int constructor does not reset the counter - confirm this
// asymmetry is intended.
80 MedFeatures() { time_unit = global_default_time_unit; global_serial_id_cnt = 0; }
81
82 // Initialization
// Clear all containers (data, samples, pid_pos_len, attributes, weights, tags, masks).
// time_unit and medf_missing_value are left untouched.
84 void clear() { data.clear(); samples.clear(); pid_pos_len.clear(); attributes.clear(); weights.clear(); tags.clear(); masks.clear(); }
// Set the time unit.
86 void set_time_unit(int _time_unit) { time_unit = _time_unit; }
87
// Get a vector of feature names.
89 void get_feature_names(vector<string>& names) const;
// Get data (+attributes) as a MedMat.
// The overloads taking `names` (and `idx`) presumably restrict the output to the
// given features / row indexes - TODO confirm against MedFeatures.cpp.
91 void get_as_matrix(MedMat<float>& mat) const;
93 void get_as_matrix(MedMat<float>& mat, vector<string>& names) const;
95 void get_as_matrix(MedMat<float>& mat, const vector<string>& names, vector<int> &idx) const;
96
// Set data (+attributes) from MedMat.
98 void set_as_matrix(const MedMat<float>& mat);
99
// Append samples at end of samples vector (used for generating the samples set
// before generating features).
101 void append_samples(MedIdSamples& in_samples);
// Insert samples at position `index`, assuming the samples vector is properly allocated.
103 void insert_samples(MedIdSamples& in_samples, int index);
// Fill samples vector and initialize pid_pos_len from a vector of MedIdSamples:
// clears `samples`, appends every MedIdSamples, then rebuilds pid_pos_len.
105 void init_all_samples(vector<MedIdSamples> &in_samples) { samples.clear(); for (auto& id : in_samples) append_samples(id); init_pid_pos_len(); }
// Initialize pid_pos_len according to samples.
107 void init_pid_pos_len();
// Return the first row in the virtual matrix for an id (-1 if the pid is not found).
109 int get_pid_pos(int pid) const { if (pid_pos_len.find(pid) == pid_pos_len.end()) return -1; return (pid_pos_len.at(pid).first); }
// Return the number of rows in the virtual matrix for an id (0 if the pid is not found).
111 int get_pid_len(int pid) const { if (pid_pos_len.find(pid) == pid_pos_len.end()) return 0; return (pid_pos_len.at(pid).second); }
// Calculate a crc for the data (used mainly for debugging).
113 unsigned int get_crc();
// MLOG data in csv format.
115 void print_csv() const;
// Get the corresponding MedSamples object; assumes the samples vector is ordered.
117 void get_samples(MedSamples& outSamples) const;
// Return the max serial_id_cnt.
119 int get_max_serial_id_cnt() const;
120
// Write features (samples + weights + data) as csv with a header line.
123 int write_as_csv_mat(const string &csv_fname, bool write_attributes = false) const;
// NOTE(review): add_to_csv_mat / write_csv_data look like the append variant and the
// shared row writer for the csv output - confirm against MedFeatures.cpp.
124 int add_to_csv_mat(const string &csv_fname, bool write_attributes, int start_idx) const;
125 void write_csv_data(ofstream& out_f, bool write_attributes, vector<string>& col_names, int start_idx) const;
126
// Read features (samples + weights + data) from a csv file with a header line.
129 int read_from_csv_mat(const string &csv_fname, bool read_time_raw = true);
130
// Filter data (and attributes) to include only selected features.
133 int filter(unordered_set<string>& selectedFeatures);
134
// Prepare a list of all features that contain one of the given search strings as a substring.
136 int prep_selected_list(vector<string>& search_str, unordered_set<string> &selected);
137
138 // masks functions
139 int init_masks();
140 int get_masks_as_mat(MedMat<unsigned char> &masks_mat);
141 int mark_imputed_in_masks(float _missing_val);
// Convenience overload: forwards medf_missing_value as the missing value.
142 int mark_imputed_in_masks() { return mark_imputed_in_masks(medf_missing_value); }
143
144 void round_data(float r);
145 void noise_data(float r);
146
// Sort by id and time.
148 void samples_sort();
149
// Get the feature name that matches a substring.
151 string resolve_name(string& substr) const;
152
153 // Serialization
154 ADD_CLASS_NAME(MedFeatures)
156
157};
158
161
162
// medial namespace for free functions; medial::process holds row selection,
// sampling, matching and splitting helpers operating on MedFeatures / MedSample data.
165namespace medial {
169 namespace process {
// Commit a selection of indexes on a vector.
171 template<class T> void commit_selection(vector<T> &vec, const vector<int> &idx);
// Filter MedFeatures rows by the selected indexes.
173 void filter_row_indexes(MedFeatures &dataMat, vector<int> &selected_indexes, bool op_flag = false);
// Same filtering, taking selected_indexes by const reference (described as thread safe
// for selected_indexes).
175 void filter_row_indexes_safe(MedFeatures &dataMat, const vector<int> &selected_indexes, bool op_flag = false);
// Down sampling with ratio; selected_indexes is an optional output pointer (default NULL).
177 void down_sample(MedFeatures &dataMat, double take_ratio, bool with_repeats = false,
178 vector<int> *selected_indexes = NULL);
// Reweighting by the unique values of the given groups; returns weights and min_factor.
180 double reweight_by_general(MedFeatures &data_records, const vector<string> &groups,
181 vector<float> &weigths, bool print_verbose);
// Matching by the unique values of the given groups; also returns the filtered row ids.
183 void match_by_general(MedFeatures &data_records, const vector<string> &groups,
184 vector<int> &filtered_row_ids, float price_ratio, int min_grp_size, bool print_verbose);
// Overload with an additional max_ratio parameter.
186 void match_by_general(MedFeatures &data_records, const vector<string> &groups,
187 vector<int> &filtered_row_ids, float price_ratio, float max_ratio, int min_grp_size, bool print_verbose);
188
// Split matrix to train/test based on the iFold value; folds is the fold id for each sample.
190 void split_matrix(const MedFeatures& matrix, vector<int>& folds, int iFold,
191 MedFeatures& trainMatrix, MedFeatures& testMatrix, const vector<string> *selected_features = NULL);
// Overload taking folds as a map - presumably keyed by id rather than row; TODO confirm.
193 void split_matrix(const MedFeatures& matrix, unordered_map<int, int>& folds, int iFold,
194 MedFeatures& trainMatrix, MedFeatures& testMatrix, const vector<string> *selected_features = NULL);
// Convert each element of the feature vector to its percentile value.
196 void convert_prctile(vector<float> &features_prctiles);
// Matching to a specific target_prior; the selected row indexes are written to sel_idx.
// The overloads below accept raw outcome/group vectors, MedSamples, or MedFeatures.
202 void match_to_prior(const vector<float> &outcome,
203 const vector<float> &group_values, float target_prior, vector<int> &sel_idx);
204
206 double match_to_prior(MedSamples &samples, float target_prior, vector<int> &sel_idx);
207
209 double match_to_prior(MedFeatures &features, float target_prior, vector<int> &sel_idx);
210
216 void match_to_prior(MedFeatures &features,
217 const vector<string> &group_values, float target_prior, vector<int> &sel_idx, bool print_verbose = true);
218
// Return the number of splits (also checks for mismatches between idSample and the
// internal MedSamples).
220 int nSplits(vector<MedSample>& samples);
221
// Multi-class matching, on MedFeatures or directly on a vector of MedSample.
223 float match_multi_class(MedFeatures& data, const vector<string> &groups, vector<int> &filtered_row_ids, vector<float>& price_ratios, int nRand = 10000, int verbose = false);
224 float match_multi_class(vector<MedSample>& data, const vector<string> &groups, vector<int> &filtered_row_ids, vector<float>& price_ratios, int nRand = 10000, int verbose = false);
225
// NOTE(review): probs is taken by value (copied on each call); presumably the target
// class distribution to match to - confirm against MedFeatures.cpp.
226 void match_multi_class_to_dist(MedFeatures& data, const vector<string> &groups, vector<int> &filtered_row_ids, vector<float> probs);
227 void match_multi_class_to_dist(vector<MedSample>& data, const vector<string> &groups, vector<int> &filtered_row_ids, vector<float> probs);
228
229 }
230
231}
232
233#endif
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Definition MedFeatures.h:16
unordered_map< float, string > value2Name
map value to name (e.g. for naming one-hot feature processor)
Definition MedFeatures.h:22
bool imputed
indicator that the feature has been imputed and does not contain missing values
Definition MedFeatures.h:19
float denorm_sdv
Mean and Standard deviation for de-normalization.
Definition MedFeatures.h:20
bool normalized
indicator that the feature has been normalized
Definition MedFeatures.h:18
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
map< string, vector< float > > data
the actual matrix of values per sample
Definition MedFeatures.h:51
map< int, pair< int, int > > pid_pos_len
feature generation assumes that all "rows" for a specific pid are adjacent.
Definition MedFeatures.h:57
void insert_samples(MedIdSamples &in_samples, int index)
Insert samples at position index, assuming samples vector is properly allocated (used for generating s...
Definition MedFeatures.cpp:267
void init_all_samples(vector< MedIdSamples > &in_samples)
Fill samples vector and initialize pid_pos_len according to input vector of MedIdSamples.
Definition MedFeatures.h:105
void set_as_matrix(const MedMat< float > &mat)
Set data (+attributes) from MedMat.
Definition MedFeatures.cpp:226
void clear()
Clear all vectors.
Definition MedFeatures.h:84
int write_as_csv_mat(const string &csv_fname, bool write_attributes=false) const
Write features (samples + weights + data) as csv with a header line.
Definition MedFeatures.cpp:361
vector< MedSample > samples
The samples representing the lines.
Definition MedFeatures.h:53
map< string, unordered_set< string > > tags
a set of tags per feature
Definition MedFeatures.h:61
void append_samples(MedIdSamples &in_samples)
Append samples at end of samples vector (used for generating samples set before generating features)
Definition MedFeatures.cpp:261
void init_pid_pos_len()
initialize pid_pos_len vector according to samples
Definition MedFeatures.cpp:275
int filter(unordered_set< string > &selectedFeatures)
Filter data (and attributes) to include only selected features.
Definition MedFeatures.cpp:716
int get_pid_pos(int pid) const
Return the first row in the virtual matrix for an id (-1 if none)
Definition MedFeatures.h:109
int prep_selected_list(vector< string > &search_str, unordered_set< string > &selected)
Prepare a list of all features that contain one of the given search strings as a substring,...
Definition MedFeatures.cpp:783
void set_time_unit(int _time_unit)
set time unit
Definition MedFeatures.h:86
void print_csv() const
MLOG data in csv format.
Definition MedFeatures.cpp:325
void get_samples(MedSamples &outSamples) const
Get the corresponding MedSamples object. Assuming samples vector in features are ordered (all id's s...
Definition MedFeatures.cpp:749
MedFeatures(int _time_unit)
Constructor Given time-unit.
Definition MedFeatures.h:78
vector< float > weights
a vector of weight per sample
Definition MedFeatures.h:52
map< string, FeatureAttr > attributes
a FeatureAttr per feature
Definition MedFeatures.h:60
int time_unit
the time unit of the samples
Definition MedFeatures.h:70
int get_max_serial_id_cnt() const
Return the max serial_id_cnt.
Definition MedFeatures.cpp:767
unsigned int get_crc()
Calculate a crc for the data (used for debugging mainly)
Definition MedFeatures.cpp:299
static int global_serial_id_cnt
A global counter used to prevent identical names for two features by adding FTR_::_ before generated ...
Definition MedFeatures.h:73
void get_feature_names(vector< string > &names) const
Get a vector of feature names.
Definition MedFeatures.cpp:52
void get_as_matrix(MedMat< float > &mat) const
Get data (+attributes) as a MedMat.
Definition MedFeatures.cpp:63
string resolve_name(string &substr) const
Get feature name that matches a substring.
Definition MedFeatures.cpp:2191
int get_pid_len(int pid) const
Return the number of rows in the virtual matrix for an id (0 if none)
Definition MedFeatures.h:111
void samples_sort()
Sort by id and time.
Definition MedFeatures.cpp:2152
int read_from_csv_mat(const string &csv_fname, bool read_time_raw=true)
Read features (samples + weights + data) from a csv file with a header line.
Definition MedFeatures.cpp:509
MedIdSamples represents a collection of samples of a given id. Additional (optional) entries: split
Definition MedSamples.h:90
Definition MedMat.h:63
MedSamples represents a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:129
Definition SerializableObject.h:32
process(fname, allow_type)
Definition lint.py:152
void match_by_general(MedFeatures &data_records, const vector< string > &groups, vector< int > &filtered_row_ids, float price_ratio, int min_grp_size, bool print_verbose)
Matching by the unique values of the given groups; also returns the filtered row ids.
Definition MedFeatures.cpp:1083
void filter_row_indexes(MedFeatures &dataMat, vector< int > &selected_indexes, bool op_flag=false)
filtering MedFeatures by selected indexes rows
Definition MedFeatures.cpp:880
void commit_selection(vector< T > &vec, const vector< int > &idx)
commit selection of indexes on vector
Definition MedFeatures.cpp:794
int nSplits(vector< MedSample > &samples)
Return number of splits, also check mismatches between idSample and internal MedSamples and set idSam...
Definition MedFeatures.cpp:2006
float match_multi_class(MedFeatures &data, const vector< string > &groups, vector< int > &filtered_row_ids, vector< float > &price_ratios, int nRand=10000, int verbose=false)
multi-class matching.
Definition MedFeatures.cpp:1684
void down_sample(MedFeatures &dataMat, double take_ratio, bool with_repeats=false, vector< int > *selected_indexes=NULL)
down sampling with ratio
Definition MedFeatures.cpp:885
void convert_prctile(vector< float > &features_prctiles)
Convert each element of the feature vector to its percentile value.
Definition MedFeatures.cpp:1836
void filter_row_indexes_safe(MedFeatures &dataMat, const vector< int > &selected_indexes, bool op_flag=false)
filtering MedFeatures by selected indexes rows (thread safe for selected_indexes) no sort of selected...
Definition MedFeatures.cpp:807
void split_matrix(const MedFeatures &matrix, vector< int > &folds, int iFold, MedFeatures &trainMatrix, MedFeatures &testMatrix, const vector< string > *selected_features=NULL)
split matrix to train test based on iFold value. folds is fold id for each sample
Definition MedFeatures.cpp:1752
void match_to_prior(const vector< float > &outcome, const vector< float > &group_values, float target_prior, vector< int > &sel_idx)
does matching to specific target_prior.
Definition MedFeatures.cpp:1857
double reweight_by_general(MedFeatures &data_records, const vector< string > &groups, vector< float > &weigths, bool print_verbose)
Reweighting by the unique values of the given groups; returns the weights and min_factor.
Definition MedFeatures.cpp:913
medial namespace for function
Definition InfraMed.h:667