3#ifndef __MED_FEATURES_H__
4#define __MED_FEATURES_H__
6#include <InfraMed/InfraMed/InfraMed.h>
8#include <MedProcessTools/MedProcessTools/MedProcessUtils.h>
9#include <MedProcessTools/MedProcessTools/MedSamples.h>
10#include <MedMat/MedMat/MedMat.h>
/// The actual matrix of values per sample: one vector<float> per string key.
/// NOTE(review): the key is presumably the feature name and each vector
/// presumably has one entry per sample row — confirm against the class users.
51 map<string, vector<float> >
data;
/// A set of tags per feature (string key -> unordered set of tag strings).
61 map<string, unordered_set<string> >
tags;
/// Bit flag with value 0x01.
/// NOTE(review): presumably marks a value as "cleaned" inside `masks` — confirm.
64 const static unsigned char cleaned_mask = (
unsigned char)0x01;
/// Bit flag with value 0x02 (distinct bit from cleaned_mask).
/// NOTE(review): presumably marks a value as "imputed" inside `masks` — confirm.
65 const static unsigned char imputed_mask = (
unsigned char)0x02;
/// One byte vector per string key.
/// NOTE(review): presumably keyed by feature name with one mask byte per
/// sample, holding a combination of the flags above — confirm.
66 map<string, vector<unsigned char>> masks;
/// Missing-value marker, initialized to MED_MAT_MISSING_VALUE cast to float.
/// It is the value the no-argument mark_imputed_in_masks() forwards.
67 float medf_missing_value = (float)MED_MAT_MISSING_VALUE;
/// Write features (samples + weights + data) as csv with a header line.
/// @param csv_fname         name of the csv file to write.
/// @param write_attributes  defaults to false. NOTE(review): presumably
///                          controls whether feature attributes are written
///                          as well — confirm against the definition.
/// @return int status; the convention is not visible in this header — TODO confirm.
123 int write_as_csv_mat(
const string &csv_fname,
bool write_attributes =
false)
const;
/// Const member taking a csv file name, an attributes flag and a start index.
/// NOTE(review): presumably appends rows to an existing csv written by
/// write_as_csv_mat, beginning at start_idx — confirm against the definition.
/// @param csv_fname         name of the csv file.
/// @param write_attributes  no default here, unlike write_as_csv_mat.
/// @param start_idx         TODO confirm meaning (row offset?).
/// @return int status; the convention is not visible in this header.
124 int add_to_csv_mat(
const string &csv_fname,
bool write_attributes,
int start_idx)
const;
/// Const member that receives an already-constructed output file stream.
/// NOTE(review): presumably the shared row-writing helper behind
/// write_as_csv_mat / add_to_csv_mat — confirm against the definition.
/// @param out_f             output stream, passed by non-const reference.
/// @param write_attributes  TODO confirm meaning.
/// @param col_names         passed by non-const reference; whether it is
///                          read, modified or both is not visible here.
/// @param start_idx         TODO confirm meaning.
125 void write_csv_data(ofstream& out_f,
bool write_attributes, vector<string>& col_names,
int start_idx)
const;
/// Filter data (and attributes) to include only selected features.
/// @param selectedFeatures  set of features to keep; taken by non-const
///                          reference (whether it is modified is not visible here).
/// @return int status; the convention is not visible in this header — TODO confirm.
133 int filter(unordered_set<string>& selectedFeatures);
/// Overload taking an explicit missing-value marker.
/// NOTE(review): presumably updates `masks` with imputed_mask based on
/// _missing_val — confirm against the definition; not visible here.
/// @param _missing_val  value treated as "missing".
/// @return int; meaning not visible in this header — TODO confirm.
141 int mark_imputed_in_masks(
float _missing_val);
/// Convenience overload: forwards to mark_imputed_in_masks(float) using the
/// object's own medf_missing_value, and returns that call's result unchanged.
142 int mark_imputed_in_masks() {
return mark_imputed_in_masks(medf_missing_value); }
/// NOTE(review): presumably rounds the stored feature values to a
/// resolution of r — confirm against the definition; not visible here.
/// @param r  TODO confirm meaning (rounding resolution?).
144 void round_data(
float r);
/// NOTE(review): presumably perturbs the stored feature values with noise
/// scaled by r — confirm against the definition; not visible here.
/// @param r  TODO confirm meaning (noise magnitude?).
145 void noise_data(
float r);
/// Function template over the element type T of the vector being processed.
/// NOTE(review): presumably reduces/reorders vec in place to the elements at
/// the positions listed in idx — confirm against the definition.
/// @param vec  vector to modify (non-const reference).
/// @param idx  indices, read-only.
171 template<
class T>
void commit_selection(vector<T> &vec,
const vector<int> &idx);
178 vector<int> *selected_indexes = NULL);
181 vector<float> &weigths,
bool print_verbose);
184 vector<int> &filtered_row_ids,
float price_ratio,
int min_grp_size,
bool print_verbose);
187 vector<int> &filtered_row_ids,
float price_ratio,
float max_ratio,
int min_grp_size,
bool print_verbose);
203 const vector<float> &group_values,
float target_prior, vector<int> &sel_idx);
217 const vector<string> &group_values,
float target_prior, vector<int> &sel_idx,
bool print_verbose =
true);
/// NOTE(review): presumably returns the number of distinct splits found in
/// samples — confirm against the definition; not visible here.
/// @param samples  taken by non-const reference.
220 int nSplits(vector<MedSample>& samples);
/// Multi-class matching, overload operating on a MedFeatures object.
/// @param data              features object (non-const reference).
/// @param groups            group labels (read-only); presumably one per row — confirm.
/// @param filtered_row_ids  non-const reference; presumably receives the kept
///                          row ids — confirm.
/// @param price_ratios      non-const reference; TODO confirm whether input,
///                          output or both.
/// @param nRand             defaults to 10000; TODO confirm meaning.
/// @param verbose           declared int but defaulted with the bool literal
///                          `false`, which converts to 0.
/// @return float; meaning not visible in this header — TODO confirm.
223 float match_multi_class(
MedFeatures& data,
const vector<string> &groups, vector<int> &filtered_row_ids, vector<float>& price_ratios,
int nRand = 10000,
int verbose =
false);
/// Multi-class matching, overload operating directly on a vector of MedSample.
/// Remaining parameters and defaults are declared identically to the
/// MedFeatures overload (nRand = 10000; verbose is an int defaulted with
/// `false`, i.e. 0).
/// @return float; meaning not visible in this header — TODO confirm.
224 float match_multi_class(vector<MedSample>& data,
const vector<string> &groups, vector<int> &filtered_row_ids, vector<float>& price_ratios,
int nRand = 10000,
int verbose =
false);
/// Overload operating on a MedFeatures object.
/// NOTE(review): presumably selects rows so the group distribution matches
/// `probs` — confirm against the definition; not visible here.
/// @param data              features object (non-const reference).
/// @param groups            group labels (read-only).
/// @param filtered_row_ids  non-const reference; presumably receives the kept
///                          row ids — confirm.
/// @param probs             taken by value, so the vector is copied on each call.
226 void match_multi_class_to_dist(
MedFeatures& data,
const vector<string> &groups, vector<int> &filtered_row_ids, vector<float> probs);
/// Overload operating directly on a vector of MedSample; the remaining
/// parameters are declared identically to the MedFeatures overload
/// (`probs` is likewise taken by value).
227 void match_multi_class_to_dist(vector<MedSample>& data,
const vector<string> &groups, vector<int> &filtered_row_ids, vector<float> probs);
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Definition MedFeatures.h:16
unordered_map< float, string > value2Name
map value to name (e.g. for naming one-hot feature processor)
Definition MedFeatures.h:22
bool imputed
indicator that the feature has been imputed and does not contain missing values
Definition MedFeatures.h:19
float denorm_sdv
Mean and Standard deviation for de-normalization.
Definition MedFeatures.h:20
bool normalized
indicator that the feature has been normalized
Definition MedFeatures.h:18
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
map< string, vector< float > > data
the actual matrix of values per sample
Definition MedFeatures.h:51
map< int, pair< int, int > > pid_pos_len
feature generation assumes that all "rows" for a specific pid are adjacent.
Definition MedFeatures.h:57
void insert_samples(MedIdSamples &in_samples, int index)
Insert samples at position index, assuming samples vector is properly allocated (used for generating s...
Definition MedFeatures.cpp:267
void init_all_samples(vector< MedIdSamples > &in_samples)
Fill samples vector and initialize pid_pos_len according to input vector of MedIdSamples.
Definition MedFeatures.h:105
void set_as_matrix(const MedMat< float > &mat)
Set data (+attributes) from MedMat.
Definition MedFeatures.cpp:226
void clear()
Clear all vectors.
Definition MedFeatures.h:84
int write_as_csv_mat(const string &csv_fname, bool write_attributes=false) const
Write features (samples + weights + data) as csv with a header line.
Definition MedFeatures.cpp:361
vector< MedSample > samples
The samples representing the lines.
Definition MedFeatures.h:53
map< string, unordered_set< string > > tags
a set of tags per feature
Definition MedFeatures.h:61
void append_samples(MedIdSamples &in_samples)
Append samples at end of samples vector (used for generating samples set before generating features)
Definition MedFeatures.cpp:261
void init_pid_pos_len()
Initialize the pid_pos_len map according to samples.
Definition MedFeatures.cpp:275
int filter(unordered_set< string > &selectedFeatures)
Filter data (and attributes) to include only selected features.
Definition MedFeatures.cpp:716
int get_pid_pos(int pid) const
Return the first row in the virtual matrix for an id (-1 if none)
Definition MedFeatures.h:109
int prep_selected_list(vector< string > &search_str, unordered_set< string > &selected)
Prepare a list of all features that contain one of the given search strings as a substring,...
Definition MedFeatures.cpp:783
void set_time_unit(int _time_unit)
set time unit
Definition MedFeatures.h:86
void print_csv() const
MLOG data in csv format.
Definition MedFeatures.cpp:325
void get_samples(MedSamples &outSamples) const
Get the corresponding MedSamples object. Assumes the samples vector in features is ordered (all id's s...
Definition MedFeatures.cpp:749
MedFeatures(int _time_unit)
Constructor Given time-unit.
Definition MedFeatures.h:78
vector< float > weights
a vector of weights, one per sample
Definition MedFeatures.h:52
map< string, FeatureAttr > attributes
a FeatureAttr per feature
Definition MedFeatures.h:60
int time_unit
the time unit of the samples
Definition MedFeatures.h:70
int get_max_serial_id_cnt() const
Return the max serial_id_cnt.
Definition MedFeatures.cpp:767
unsigned int get_crc()
Calculate a crc for the data (used for debugging mainly)
Definition MedFeatures.cpp:299
static int global_serial_id_cnt
A global counter used to prevent identical names for two features by adding FTR_::_ before generated ...
Definition MedFeatures.h:73
void get_feature_names(vector< string > &names) const
Get a vector of feature names.
Definition MedFeatures.cpp:52
void get_as_matrix(MedMat< float > &mat) const
Get data (+attributes) as a MedMat.
Definition MedFeatures.cpp:63
string resolve_name(string &substr) const
Get feature name that matches a substring.
Definition MedFeatures.cpp:2191
int get_pid_len(int pid) const
Return the number of rows in the virtual matrix for an id (-1 if none)
Definition MedFeatures.h:111
void samples_sort()
Sort by id and time.
Definition MedFeatures.cpp:2152
int read_from_csv_mat(const string &csv_fname, bool read_time_raw=true)
Read features (samples + weights + data) from a csv file with a header line.
Definition MedFeatures.cpp:509
MedIdSamples represents a collection of samples of a given id. Additional (optional) entries: split
Definition MedSamples.h:90
MedSamples represents a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:129
Definition SerializableObject.h:32
process(fname, allow_type)
Definition lint.py:152