1#ifndef _FTR_GENERATOR_H_
2#define _FTR_GENERATOR_H_
4#include <InfraMed/InfraMed/InfraMed.h>
7#include <MedProcessTools/MedProcessTools/MedFeatures.h>
9#include <MedProcessTools/MedProcessTools/MedModelExceptions.h>
12#include <MedAlgo/MedAlgo/MedLM.h>
14#include <boost/regex.hpp>
16#define DEFAULT_FEAT_GNRTR_NTHREADS 8
/// Thread counts, both defaulting to 16. By their names they apply to the learn and
/// predict stages respectively (their use is not visible in this listing — confirm).
63 int learn_nthreads = 16, pred_nthreads = 16;
/// Default naming hook: the base implementation only empties the feature-name list.
/// Derived generators override it to fill `names`.
75 virtual void set_names()
{
	this->names.clear();
}
/// Raw float pointers owned elsewhere; filled through get_p_data(). Presumably one pointer
/// per generated feature column inside MedFeatures — TODO confirm against the definition.
78 vector <float *> p_data;
/// Fills `_p_data` from `features` (definition is not part of this listing).
82 virtual void get_p_data(
MedFeatures& features, vector<float *> &_p_data);
/// Convenience overload: forwards to the two-argument get_p_data(), writing into the
/// member vector `p_data`.
83 void get_p_data(MedFeatures& features)
{
	get_p_data(features, this->p_data);
}
/// Reset hook; the base implementation is empty.
88 virtual void clear() { };
/// Names of the signals this generator requires.
91 vector<string> req_signals;
/// Ids of the required signals (presumably resolved from req_signals — TODO confirm).
92 vector<int> req_signal_ids;
/// Reports required signal names into `signalNames` (definition not shown here).
94 void get_required_signal_names(unordered_set<string>& signalNames);
/// Reports required signal ids into `signalIds` (definition not shown here).
96 void get_required_signal_ids(unordered_set<int>& signalIds);
/// Adds every feature name held in `names` to the caller's set `names_list`.
/// Existing entries of `names_list` are kept.
99 virtual void get_generated_features(unordered_set<string>& names_list)
{
	names_list.insert(names.begin(), names.end());
}
/// Hook for resolving signal ids from `sigs`. The base implementation does nothing;
/// generators that need ids override it.
102 virtual void set_signal_ids(MedSignals& sigs)
{
	// intentionally empty
}
/// Learn entry point: refreshes the feature names via set_names(), then delegates to
/// _learn() with the given repository, samples and processors.
/// @return whatever _learn() returns.
112 int learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors)
{
	set_names();
	const int rc = _learn(rep, samples, processors);
	return rc;
}
/// Learn entry point without rep-processors: refreshes the feature names via set_names(),
/// then delegates to _learn() with an empty processor list.
/// @return whatever _learn() returns.
113 int learn(MedPidRepository& rep, const MedSamples& samples)
{
	set_names();
	const int rc = _learn(rep, samples, vector<RepProcessor *>());
	return rc;
}
/// Four-argument form: forwards to the five-argument _generate(), passing the member
/// vector `p_data` as the data-pointer list.
117 int _generate(PidDynamicRec& in_rep, MedFeatures& features, int index, int num)
{
	return _generate(in_rep, features, index, num, this->p_data);
}
/// Per-record generation hook. The base implementation ignores all of its arguments and
/// returns 0; generators override it to do the actual work.
122 virtual int _generate(PidDynamicRec& in_rep, MedFeatures& features, int index, int num,
	vector<float *> &_p_data)
{
	return 0;
}
/// Public per-record generation entry point; a thin wrapper around the four-argument
/// _generate().
124 int generate(PidDynamicRec& in_rep, MedFeatures& features, int index, int num)
{
	const int rc = _generate(in_rep, features, index, num);
	return rc;
}
/// Generation hook that works on `features` alone (no repository record).
/// The base implementation does nothing and returns 0.
130 virtual int _generate(MedFeatures& features)
{
	return 0;
}
/// Public entry point for features-only generation; a thin wrapper around
/// _generate(features).
131 int generate(MedFeatures& features)
{
	const int rc = _generate(features);
	return rc;
}
/// Init from an opaque parameter block. The base implementation ignores
/// `generator_params` and returns 0.
142 virtual int init(void *generator_params)
{
	return 0;
}
/// Initialize the object from parsed key/value fields (definition not shown in this listing).
143 virtual int init(map<string, string>& mapper);
/// Hook for setting default parameter values; the base implementation is empty.
144 virtual void init_defaults() {};
/// Number of features this generator produces, taken as the size of `names`
/// narrowed to int.
150 virtual int nfeatures()
{
	return static_cast<int>(names.size());
}
/// Optional summary hook; the base implementation does nothing.
162 virtual void make_summary() {};
/// Serialization helpers, declared here and defined elsewhere. By their names: the size of
/// the serialized generator, and a writer into `blob` returning a size — TODO confirm.
169 size_t get_generator_size();
170 size_t generator_serialize(
unsigned char *blob);
/// Debug print. The base implementation only reports on stderr that printing is not
/// implemented for this feature generator.
172 virtual void print()
{
	fprintf(stderr, "%s", "Print Not Implemented for feature\n");
}
/// Debug print taking a line prefix `pref` and a flag `fg_flag` (definition not shown;
/// the flag's meaning is to be confirmed there).
176 virtual void dprint(
const string &pref,
int fg_flag);
/// Window-statistic helpers of the basic feature generator (declarations only).
/// Each takes a signal vector, a time point, a window [_win_from, _win_to] and an outcome
/// time, and returns a float. The category_set variants also take the dynamic record.
/// The per-function notes below map each helper to a BasicFeatureTypes value purely by
/// name — confirm against the definitions before relying on them.

/// By name: "last" (FTR_LAST_VALUE).
244 float uget_last(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "last_nth" (FTR_LAST_NTH_VALUE).
245 float uget_last_nth(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "first" (FTR_FIRST_VALUE).
246 float uget_first(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "last2" (FTR_LAST2_VALUE).
247 float uget_last2(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "avg" (FTR_AVG_VALUE).
248 float uget_avg(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "max" (FTR_MAX_VALUE).
249 float uget_max(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "min" (FTR_MIN_VALUE).
250 float uget_min(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "sum" (FTR_SUM_VALUE).
251 float uget_sum(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "std" (FTR_STD_VALUE).
252 float uget_std(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "last_delta" (FTR_LAST_DELTA_VALUE).
253 float uget_last_delta(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "last_time" (FTR_LAST_DAYS).
254 float uget_last_time(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "last2_time" (FTR_LAST2_DAYS).
255 float uget_last2_time(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "slope" (FTR_SLOPE_VALUE).
256 float uget_slope(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "win_delta" (FTR_WIN_DELTA_VALUE); the only helper that takes a second
/// window [_d_win_from, _d_win_to].
257 float uget_win_delta(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int _d_win_from,
int _d_win_to,
int outcomeTime);
/// By name: "category_set" (FTR_CATEGORY_SET).
258 float uget_category_set(
PidDynamicRec &rec, UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "category_set_last_nth" (FTR_CATEGORY_SET_LAST_NTH).
259 float uget_category_set_last_nth(
PidDynamicRec &rec, UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "category_set_count" (FTR_CATEGORY_SET_COUNT).
260 float uget_category_set_count(
PidDynamicRec &rec, UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "category_set_sum" (FTR_CATEGORY_SET_SUM).
261 float uget_category_set_sum(
PidDynamicRec &rec, UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "nsamples" (FTR_NSAMPLES). Note the time parameter is named `time` here.
262 float uget_nsamples(UniversalSigVec &usv,
int time,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "exists" (FTR_EXISTS). Note the time parameter is named `time` here.
263 float uget_exists(UniversalSigVec &usv,
int time,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "range_width" (FTR_RANGE_WIDTH).
264 float uget_range_width(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "max_diff" (FTR_MAX_DIFF).
265 float uget_max_diff(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "first_time" (FTR_FIRST_DAYS).
266 float uget_first_time(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "category_set_first" (FTR_CATEGORY_SET_FIRST).
267 float uget_category_set_first(
PidDynamicRec &rec, UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "category_set_first_time" (FTR_CATEGORY_SET_FIRST_TIME).
268 float uget_category_set_first_time(
PidDynamicRec &rec, UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// By name: "time_since_last_change" (FTR_TIME_SINCE_LAST_CHANGE).
269 float uget_time_since_last_change(UniversalSigVec &usv,
int time_point,
int _win_from,
int _win_to,
int outcomeTime);
/// Map from a categorical value name to an integer id (how it is filled is not visible here).
275 map<string, int> categ_value2id;
/// Defaults to true; presumably marks that a category dictionary is needed — TODO confirm.
276 bool needs_categ_dict =
true;
/// Name of an optional time-range signal. The empty string means none is used
/// (the visible code tests `timeRangeSignalName != ""` before adding it to req_signals).
284 string timeRangeSignalName =
"";
/// Id of the time-range signal. No in-class initializer.
285 int timeRangeSignalId;
/// Optional alternative signal name, empty by default (use not visible in this listing).
308 string rename_signal =
"";
/// No in-class initializer; presumably controls whether categ_map is applied — TODO confirm.
313 bool apply_categ_map;
324 set(_signalName, _type, 0, 360000);
325 req_signals.assign(1, signalName);
326 if (timeRangeSignalName !=
"")
327 req_signals.push_back(timeRangeSignalName);
330 void set(
string& _signalName,
BasicFeatureTypes _type,
int _time_win_from,
int _time_win_to) {
331 signalName = _signalName; type = _type;
win_from = _time_win_from;
win_to = _time_win_to;
333 req_signals.assign(1, signalName);
334 if (timeRangeSignalName !=
"")
335 req_signals.push_back(timeRangeSignalName);
/// Initialize from the parsed fields of the init command (definition elsewhere).
347 int init(map<string, string>& mapper);
/// Set default parameter values (definition elsewhere).
348 void init_defaults();
355 time_unit_sig = rep.sigs.Sid2Info[rep.sigs.sid(signalName)].time_unit;
356 if (timeRangeSignalName !=
"")
/// Returns one feature value for record `rec`, given `index`, a `time` and an
/// `outcomeTime`. Definition not shown; presumably dispatches on `type` to one of the
/// uget_* helpers — TODO confirm.
363 float get_value(
PidDynamicRec& rec,
int index,
int time,
int outcomeTime);
/// Passes the listed members to ADD_SERIALIZATION_FUNCS. Only members named here are handed
/// to the macro; members visible in this listing but absent from the list include
/// needs_categ_dict, timeRangeSignalId and apply_categ_map.
/// NOTE(review): the order of the list may matter for previously stored objects — confirm
/// before reordering or inserting.
375 ADD_SERIALIZATION_FUNCS(
generator_type, type,
tags, serial_id,
win_from,
win_to,
d_win_from,
d_win_to,
time_unit_win,
time_channel,
val_channel,
sum_channel, min_value,
max_value, signalName,
sets,
376 names, req_signals,
in_set_name,
bound_outcomeTime, timeRangeSignalName, timeRangeType,
time_unit_sig,
N_th,
zero_missing,
missing_val, categ_value2id,
zero_missing_val,
full_name, rename_signal)
/// Naming for the age feature.
/// Adds the single feature name "FTR_<serial_id, 6 digits>.Age" when `names` is empty and
/// makes sure the "Age" tag is present in `tags`.
/// The function is idempotent: the learn() wrappers in this header call set_names() on
/// every invocation, and the previous unconditional push_back appended one more "Age" tag
/// per call.
400 void set_names() {
	if (names.empty())
		names.push_back("FTR_" + int_to_string_digits(serial_id, 6) + ".Age");

	// add the tag only once, keeping any other tags already present
	bool has_age_tag = false;
	for (const auto &tag : tags) {
		if (tag == "Age") {
			has_age_tag = true;
			break;
		}
	}
	if (!has_age_tag)
		tags.push_back("Age");
}
/// Initialize the object from parsed fields (definition elsewhere).
415 virtual int init(map<string, string>& mapper);
/// Lookup from a name to a float value (how it is filled is not visible here).
424 unordered_map<string, float> name2Value;
/// Float value per id; presumably the id-indexed counterpart of name2Value — TODO confirm.
425 vector<float> id2Value;
/// List of set names, empty by default.
434 vector<string> sets = {};
/// Initialize from the parsed fields of the init command (definition elsewhere).
448 int init(map<string, string>& mapper);
/// Category value names. In the visible fragment of this header it is cleared and refilled
/// from the "GENDER" dictionary section's Id2Name entries.
479 vector<string> category_values;
/// Naming for the gender feature.
/// Adds the single feature name "Gender" when `names` is empty and makes sure the "Gender"
/// tag is present in `tags`.
/// The function is idempotent: the learn() wrappers in this header call set_names() on
/// every invocation, and the previous unconditional push_back appended one more "Gender"
/// tag per call.
492 void set_names() {
	if (names.empty())
		names.push_back("Gender");

	// add the tag only once, keeping any other tags already present
	bool has_gender_tag = false;
	for (const auto &tag : tags) {
		if (tag == "Gender") {
			has_gender_tag = true;
			break;
		}
	}
	if (!has_gender_tag)
		tags.push_back("Gender");
}
/// Initialize from the parsed fields of the init command (definition elsewhere).
499 int init(map<string, string>& mapper);
507 req_signal_ids.assign(1, dict.id(
"GENDER"));
511 category_values.clear();
512 if (dict.SectionName2Id.find(
"GENDER") != dict.SectionName2Id.end()) {
513 int section_id = dict.section_id(
"GENDER");
514 for (
const auto &it : dict.dicts[section_id].Id2Name)
515 category_values.push_back(it.second);
/// Integer bin boundaries (units and meaning are not visible in this listing).
530 vector<int> bin_bounds;
/// Integer estimation points; presumably the time points at which estimates are produced —
/// TODO confirm.
535 vector<int> estimation_points;
546 BINNED_LM_TAKE_ALL = 0,
547 BINNED_LM_STOP_AT_FIRST = 1,
548 BINNED_LM_STOP_AT_LAST = 2,
/// Ids of the main signal and, by name, of the birth-date and gender signals.
559 int signalId, bdateId, genderId;
/// The linear models held by this generator.
563 vector<MedLM> models;
/// Per-model statistics; by name, means and standard deviations of x and y — TODO confirm.
564 vector<float> xmeans, xsdvs, ymeans, ysdvs;
/// Two float vectors, both initially empty (what each one holds is not visible here).
565 vector<vector<float>> means = { {}, {} };
/// Configure the generator for signal `_signalName` (definition elsewhere).
582 void set(
string& _signalName);
/// Set default parameter values (definition elsewhere).
585 void init_defaults();
/// Initialize from the parsed fields of the init command (definition elsewhere).
588 int init(map<string, string>& mapper);
/// Fills `_p_data` from `features` (definition elsewhere).
603 void get_p_data(
MedFeatures& features, vector<float *> &_p_data);
/// Select the sampling strategy from its string name (accepted names are not visible here).
609 void set_sampling_strategy(
string& strategy);
/// Age preparation from a dynamic record; writes through `age` and `byear`.
/// Exact semantics are to be confirmed in the definition.
612 void prepare_for_age(
PidDynamicRec& rec, UniversalSigVec& ageUsv,
int &age,
int &byear);
/// Age preparation from a repository and patient `id`; writes through `age` and `byear`.
613 void prepare_for_age(
MedPidRepository& rep,
int id, UniversalSigVec& ageUsv,
int &age,
int &byear);
/// Writes an age into `age` for the given `time` (in `time_unit_from`) and `byear` —
/// presumably a birth year; TODO confirm.
614 inline void get_age(
int time,
int time_unit_from,
int& age,
int byear);
/// Debug print taking a line prefix and a flag (definition elsewhere).
616 void dprint(
const string &pref,
int fg_flag);
/// Passes the listed members to ADD_SERIALIZATION_FUNCS.
/// NOTE(review): xmeans, xsdvs and ymeans are listed but `ysdvs`, which is declared next
/// to them, is not. Confirm whether that is intentional before relying on ysdvs after a
/// load. Adding it here would change the stored format.
620 ADD_SERIALIZATION_FUNCS(
generator_type, signalName,
names,
tags, req_signals,
time_unit_periods,
iGenerateWeights, params, xmeans, xsdvs, ymeans, means, models,
time_unit_sig, sampling_strategy)
642 FTR_RANGE_RECURRENCE_COUNT = 6,
647 FTR_RANGE_TIME_DIFF_START = 9,
/// Range-feature helpers (declarations only). Each takes a signal vector, an already
/// updated window [updated_win_from, updated_win_to] and a time, and returns a float.
/// The per-function notes map each helper to a RangeFeatureTypes value purely by name —
/// confirm against the definitions.

/// By name: "current" (FTR_RANGE_CURRENT).
658 float uget_range_current(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "latest" (FTR_RANGE_LATEST).
659 float uget_range_latest(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "min" (FTR_RANGE_MIN).
660 float uget_range_min(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "max" (FTR_RANGE_MAX).
661 float uget_range_max(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "ever" (FTR_RANGE_EVER).
662 float uget_range_ever(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "time_diff" (FTR_RANGE_TIME_DIFF).
663 float uget_range_time_diff(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: FTR_RANGE_RECURRENCE_COUNT.
664 float uget_range_recurrence_count(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "time_covered" (FTR_RANGE_TIME_COVERED).
665 float uget_range_time_covered(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "last_nth_time_len" (FTR_RANGE_LAST_NTH_TIME_LENGTH).
666 float uget_range_last_nth_time_len(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: FTR_RANGE_TIME_DIFF_START.
667 float uget_range_time_diff_start(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// By name: "time_inside" (FTR_RANGE_TIME_INSIDE).
668 float uget_range_time_inside(UniversalSigVec &usv,
int updated_win_from,
int updated_win_to,
int time);
/// Division factor, 1.0 by default (where it is applied is not visible in this listing).
682 float div_factor = 1.0f;
/// Name of an optional time-range signal; empty by default.
696 string timeRangeSignalName =
"";
/// Id of the time-range signal. No in-class initializer.
697 int timeRangeSignalId;
707 void set(
string& _signalName,
RangeFeatureTypes _type,
int _time_win_from,
int _time_win_to) {
709 set_names(); req_signals.assign(1,
signalName);
/// Initialize from the parsed fields of the init command (definition elsewhere).
719 int init(map<string, string>& mapper);
/// Set default parameter values (definition elsewhere).
720 void init_defaults();
739 ADD_SERIALIZATION_FUNCS(
generator_type,
signalName,
type,
win_from,
win_to,
val_channel,
names,
tags, req_signals,
sets,
check_first, timeRangeSignalName, timeRangeType,
recurrence_delta,
min_range_time,
/// Initialize from the parsed fields of the init command (definition elsewhere).
767 int init(map<string, string>& mapper);
/// Initialization step based on a model (definition elsewhere; which model is not visible here).
768 int init_from_model();
/// Takes `samples` by non-const reference together with a `time`; by name it changes the
/// sample times — TODO confirm the exact rule in the definition.
781 void modifySampleTime(
MedSamples& samples,
int time);
/// Declares the serialization functions through a macro; their definitions live elsewhere.
785 ADD_SERIALIZATION_HEADERS()
/// Three-level nested vector of float predictions (index meaning not visible here).
790 vector<vector<vector<float>>> preds;
/// Integer time bins and their display names; presumably parallel vectors — TODO confirm.
817 vector<int> time_bins;
818 vector<string> time_bin_names;
/// Initialize from the parsed fields of the init command (definition elsewhere).
829 int init(map<string, string>& mapper);
/// Converts a time-unit `name` to an int code (accepted names are not visible here).
830 int get_time_unit(
string name);
/// Parses bin information from `binsInfo` (its format is defined by the implementation).
831 int get_time_bins(
string& binsInfo);
/// Installs the default bins (definition elsewhere).
833 void set_default_bins();
/// Initialize from the parsed fields of the init command (definition elsewhere).
870 int init(map<string, string>& mapper);
883enum class category_stat_test {
/// Category id -> list of names for that id.
895 map<int, vector<string>> categoryId_to_name;
/// Hierarchy lookups: by name, member -> containing sets and set -> its members.
896 map<int, vector<int>> _member2Sets;
897 map<int, vector<int>> _set2Members;
/// By name, a cache of flattened member -> sets lookups — TODO confirm.
898 unordered_map<int, vector<int>> _member2Sets_flat_cache;
/// Selected codes (listed in the serialization macro of this class).
900 vector<string> top_codes;
/// Lookup tables stored as vectors of char; presumably one per generated feature or filter —
/// TODO confirm.
901 vector<vector<char>> luts;
902 vector<vector<char>> filter_luts;
/// Integer indexes related to the filter values (meaning not visible here).
903 vector<int> filter_vals_idx;
/// By name, the number of value channels of the input signal. No in-class initializer.
904 int input_sig_num_val_ch;
/// Collects into `parents` the parent codes of `codeGroup`. Two regular expressions are
/// supplied: `reg_pat` and `remove_reg_pat` — presumably a keep filter and a remove filter
/// over category names; confirm in the definition.
/// The third parameter name had been corrupted to "®_pat" by HTML-entity decoding of
/// "&reg_pat"; it is restored here.
906 void get_parents(
int codeGroup, vector<int> &parents,
const boost::regex &reg_pat,
const boost::regex & remove_reg_pat);
/// Computes per-code statistics from `categoryVal_to_stats` and `prior_per_bin` (both read
/// only) and writes them into the output vectors: signal values and indexes, value and
/// positive counts, lift, scores, p-values, positive ratio and degrees of freedom.
/// Declared const. The layout of the nested count vectors is defined by the implementation
/// and is not visible in this listing.
908 void get_stats(
const unordered_map<
int, vector<vector<vector<int>>>> &categoryVal_to_stats,
909 vector<int> &all_signal_values, vector<int> &signal_indexes, vector<double> &valCnts, vector<double> &posCnts,
910 vector<double> &lift, vector<double> &scores, vector<double> &p_values, vector<double> &pos_ratio, vector<int> &dof,
const vector<vector<double>> &prior_per_bin)
const;
/// By name, turns hierarchy filtering on or off — TODO confirm. No in-class initializer.
940 bool filter_hierarchy;
/// Set default parameter values (definition elsewhere).
964 void init_defaults();
/// Initialize from the parsed fields of the init command (definition elsewhere).
970 int init(map<string, string>& mapper);
/// Update the object from parsed fields (definition elsewhere).
972 int update(map<string, string>& mapper);
985 ADD_SERIALIZATION_FUNCS(
generator_type, req_signals, top_codes,
names,
signalName,
time_channel,
val_channel,
win_from,
win_to,
time_unit_win,
/// Takes a [-_win_to, -_win_from] window given in the window time unit and returns the
/// [_min_time, _max_time] window through the two reference parameters — presumably
/// expressed in the signal's time unit; confirm.
/// `boundOutcomeTime` defaults to false and `outcome_time` to -1. Presumably the window is
/// truncated at the outcome time when the flag is set — confirm in the definition.
997void get_window_in_sig_time(
int _win_from,
int _win_to,
int _time_unit_win,
int _time_unit_sig,
int _win_time,
int &_min_time,
int &_max_time,
998 bool boundOutcomeTime =
false,
int outcome_time = -1);
/// Adjusts the feature window — and, when `delta_win` is set, presumably the delta window
/// too — according to a time-range signal and a TimeRangeTypes value. Results are written
/// through the four `updated_*` reference parameters.
/// Definition elsewhere; exact rules are to be confirmed there.
1005void get_updated_time_window(UniversalSigVec& time_range_usv,
TimeRangeTypes type,
int time_unit_range_sig,
int time_unit_win,
int time_unit_sig,
int time,
1006 int win_from,
int& updated_win_from,
int win_to,
int& updated_win_to,
bool delta_win,
int d_win_from,
int& updated_d_win_from,
int d_win_to,
int& updated_d_win_to);
/// Overload that works on an explicit range [range_from, range_to] rather than a signal
/// vector. Writes the adjusted window through `updated_win_from` / `updated_win_to`.
/// Definition elsewhere.
1007void get_updated_time_window(
TimeRangeTypes type,
int range_from,
int range_to,
int time,
int _win_from,
int _win_to,
int& updated_win_from,
int& updated_win_to);
RangeFeatureTypes
Definition FeatureGenerator.h:630
@ FTR_RANGE_EVER
"ever" - boolean 0/1 - finds if there is intersection between signal time window and the defined time...
Definition FeatureGenerator.h:635
@ FTR_RANGE_MAX
"max" - finds the maximal value of the time range signal, that there is intersection of time signal r...
Definition FeatureGenerator.h:633
@ FTR_RANGE_CURRENT
"current" - finds the value of the time range signal that intersect with win_from....
Definition FeatureGenerator.h:631
@ FTR_RANGE_MIN
"min" - finds the minimal value of the time range signal, that there is intersection of time signal r...
Definition FeatureGenerator.h:634
@ FTR_RANGE_TIME_INSIDE
"time_inside" : checks if the prediction time point is currently INSIDE a range,...
Definition FeatureGenerator.h:648
@ FTR_RANGE_TIME_COVERED
"time_covered" : give a time window, sum up all the times in ranges that intersect the time window
Definition FeatureGenerator.h:644
@ FTR_RANGE_TIME_DIFF
"time_diff" - returns time differences between first intersection (if check_first is True) between sign...
Definition FeatureGenerator.h:639
@ FTR_RANGE_LAST_NTH_TIME_LENGTH
"last_nth_time_len" : gives the length (in win_time_unit) of the last_n range in the window....
Definition FeatureGenerator.h:646
@ FTR_RANGE_LATEST
"latest" - finds the last value of the time range signal, that there is intersection of time signal r...
Definition FeatureGenerator.h:632
FeatureGeneratorTypes
Definition FeatureGenerator.h:26
@ FTR_GEN_TIME
"time" - creating sample-time features (e.g. differentiate between times of day, season of year,...
Definition FeatureGenerator.h:40
@ FTR_GEN_EMBEDDING
"embedding" - allows applying a pre trained embedding model to incorporate features into matrix....
Definition FeatureGenerator.h:43
@ FTR_GEN_RANGE
"range" - creating RangeFeatGenerator
Definition FeatureGenerator.h:36
@ FTR_GEN_CATEGORY_DEPEND
"category_depend" - creates features from categorical signal that have statistical strength in sample...
Definition FeatureGenerator.h:42
@ FTR_GEN_DRG_INTAKE
"drugIntake" - creating drugs feature coverage of prescription time - DrugIntakeGenerator
Definition FeatureGenerator.h:37
@ FTR_GEN_KP_SMOKING
"kp_smoking" - creating smoking feature - KpSmokingGenerator
Definition FeatureGenerator.h:34
@ FTR_GEN_SINGLETON
"singleton" - take the value of a time-less signal - SingletonGenerator
Definition FeatureGenerator.h:30
@ FTR_GEN_BASIC
"basic" - creates basic statistic on time windows - BasicFeatGenerator
Definition FeatureGenerator.h:28
@ FTR_GEN_BINNED_LM
"binnedLm" or "binnedLM" - creating linear model for estimating feature in time points - BinnedLmEsti...
Definition FeatureGenerator.h:32
@ FTR_GEN_ATTR
"attr" - creating features from samples attributes. Creates AttrFeatGenerator
Definition FeatureGenerator.h:41
@ FTR_GEN_MODEL
"model" - creating ModelFeatGenerator
Definition FeatureGenerator.h:39
@ FTR_GEN_GENDER
"gender" - creating gender feature - GenderGenerator (special case of singleton)
Definition FeatureGenerator.h:31
@ FTR_GEN_UNIFIED_SMOKING
"unified_smoking" - creating smoking feature - UnifiedSmokingGenerator
Definition FeatureGenerator.h:35
@ FTR_GEN_DIABETES_FINDER
"diabetes_finder" - Diabetes Finder feature. Creates DiabetesFinderGenerator
Definition FeatureGenerator.h:46
@ FTR_GEN_SMOKING
"smoking" - creating smoking feature - SmokingGenerator
Definition FeatureGenerator.h:33
@ FTR_GEN_EXTRACT_TBL
"extract_tbl" - extract values from table with keys and rules to join with each patient....
Definition FeatureGenerator.h:44
@ FTR_GEN_AGE
"age" - creating age feature - AgeGenerator
Definition FeatureGenerator.h:29
@ FTR_GEN_ALCOHOL
"alcohol" - creating alcohol feature - AlcoholGenerator
Definition FeatureGenerator.h:38
@ FTR_GEN_ELIXHAUSER
Calculate Current Elixhauser given latest DRG and Diagnosis information. Creates ElixhauserGenerator.
Definition FeatureGenerator.h:45
TimeRangeTypes time_range_name_to_type(const string &name)
Conversion between time-range type and name.
Definition FeatureGenerator.cpp:2585
BasicFeatureTypes
Definition FeatureGenerator.h:199
@ FTR_LAST_VALUE
"last" - Last Value in Window
Definition FeatureGenerator.h:200
@ FTR_NSAMPLES
"nsamples" - counts the number of times the signal appears in the time window
Definition FeatureGenerator.h:215
@ FTR_MAX_VALUE
"max" - Max value in Window
Definition FeatureGenerator.h:204
@ FTR_FIRST_DAYS
"first_time" - time difference from prediction time to first time with signal
Definition FeatureGenerator.h:219
@ FTR_AVG_VALUE
"avg" - Mean value in Window
Definition FeatureGenerator.h:203
@ FTR_CATEGORY_SET
"category_set" - boolean 0/1 if the signal has the value in the given lut (which initialized by the "...
Definition FeatureGenerator.h:212
@ FTR_MIN_VALUE
"min" - Min value in Window
Definition FeatureGenerator.h:205
@ FTR_MAX_DIFF
"max_diff" maximum diff in window
Definition FeatureGenerator.h:218
@ FTR_EXISTS
"exists" - boolean 0/1 if the signal appears in the time window
Definition FeatureGenerator.h:216
@ FTR_CATEGORY_SET_SUM
"category_set_sum" - sums the values of appearances of sets in the time window
Definition FeatureGenerator.h:214
@ FTR_SUM_VALUE
"sum" - sum of values in window
Definition FeatureGenerator.h:222
@ FTR_TIME_SINCE_LAST_CHANGE
"time_since_last_change" : go over states signal, take last time since the value changed
Definition FeatureGenerator.h:225
@ FTR_LAST_DELTA_VALUE
"last_delta" - Last delta. last-previous_last value
Definition FeatureGenerator.h:207
@ FTR_STD_VALUE
"std" - Standard Dev. value in Window
Definition FeatureGenerator.h:206
@ FTR_CATEGORY_SET_FIRST
"category_set_first" - boolean 0/1 if the signal appears in the time window and did not appear ever be...
Definition FeatureGenerator.h:217
@ FTR_WIN_DELTA_VALUE
"win_delta" - difference in value in two time windows (only if both exist, otherwise missing_value)....
Definition FeatureGenerator.h:211
@ FTR_FIRST_VALUE
"first" - First Value in Window
Definition FeatureGenerator.h:201
@ FTR_LAST2_DAYS
"last2_time" - time difference from prediction time to one previous last time has signal in range of v...
Definition FeatureGenerator.h:209
@ FTR_CATEGORY_SET_FIRST_TIME
"category_set_first_time" - first time of category set found in the time window
Definition FeatureGenerator.h:221
@ FTR_SLOPE_VALUE
"slope" - calculating the slope over the points in the window
Definition FeatureGenerator.h:210
@ FTR_LAST_DAYS
"last_time" - time difference from prediction time to last time has signal in range of values
Definition FeatureGenerator.h:208
@ FTR_LAST2_VALUE
"last2" - One before last value in Window
Definition FeatureGenerator.h:202
@ FTR_CATEGORY_SET_COUNT
"category_set_count" - counts the number of appearances of sets in the time window
Definition FeatureGenerator.h:213
@ FTR_CATEGORY_SET_LAST_NTH
"category_set_last_nth" : (set also N_th parameter to use), check if the last N_th in window is in th...
Definition FeatureGenerator.h:224
@ FTR_LAST_NTH_VALUE
"last_nth" : (set also N_th parameter to use), get the last N_th in window, 0 is last,...
Definition FeatureGenerator.h:223
@ FTR_RANGE_WIDTH
"range_width" maximal value - minimal value in a given window time frame
Definition FeatureGenerator.h:220
BinnedLMSamplingStrategy
BinnedLinearModels : which time-points to take.
Definition FeatureGenerator.h:545
TimeRangeTypes
Definition FeatureGenerator.h:232
@ TIME_RANGE_CURRENT
"current" - consider only the current time-range
Definition FeatureGenerator.h:233
@ TIME_RANGE_BEFORE
"before" - consider anything before the current time-range
Definition FeatureGenerator.h:234
void get_window_in_sig_time(int _win_from, int _win_to, int _time_unit_win, int _time_unit_sig, int _win_time, int &_min_time, int &_max_time, bool boundOutcomeTime=false, int outcome_time=-1)
gets a [-_win_to, -_win_from] window in win time unit, and returns [_min_time, _max_time] window in s...
Definition FeatureGenerator.cpp:2560
TimeFeatTypes
Time Feature Generator: creating sample-time features (e.g.
Definition FeatureGenerator.h:798
@ FTR_TIME_HOUR
Hour of the day (0-23)
Definition FeatureGenerator.h:803
@ FTR_TIME_YEAR
Year (as is)
Definition FeatureGenerator.h:799
@ FTR_TIME_MONTH
Month of year (0-11)
Definition FeatureGenerator.h:800
@ FTR_TIME_DAY_IN_WEEK
Day of the week (0-6)
Definition FeatureGenerator.h:802
@ FTR_TIME_DAY_IN_MONTH
Day of the month (0-30)
Definition FeatureGenerator.h:801
@ FTR_TIME_MINUTE
Minute of the hour (0-59)
Definition FeatureGenerator.h:804
@ FTR_TIME_DATE
Completete date (as is)
Definition FeatureGenerator.h:805
Logger.h - allowing logs with more control.
MedAlgo - APIs to different algorithms: Linear Models, RF, GBM, KNN, and more.
RepProcessor is the parent class for processing a MedRepository or PidDynamicRec Basic functionalitie...
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Age Generator.
Definition FeatureGenerator.h:383
int signalId
Signal Id.
Definition FeatureGenerator.h:388
virtual int init(map< string, string > &mapper)
Virtual to init object from parsed fields.
Definition FeatureGenerator.cpp:739
Attribute Feature Generator: creating features from samples attributes.
Definition FeatureGenerator.h:852
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:2288
A Basic Stats Generator for calculating simple statistics on a time window.
Definition FeatureGenerator.h:241
int win_from
time window for feature: win_from is the minimal time before the prediction time
Definition FeatureGenerator.h:291
vector< float > categ_map
to be used when applying non FTR_CATEGORY_SET_* types to categorical data
Definition FeatureGenerator.h:312
void set_signal_ids(MedSignals &sigs)
Signal Ids.
Definition FeatureGenerator.cpp:521
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:651
int time_channel
n >= 0 : use time channel n , default: 0.
Definition FeatureGenerator.h:296
int d_win_to
delta time window for the FTR_WIN_DELTA_VALUE feature. the second time window
Definition FeatureGenerator.h:294
int full_name
add time and value channels even if 0
Definition FeatureGenerator.h:307
int time_unit_range_sig
the time unit in which the range signal is given. (set correctly from Repository in learn and _genera...
Definition FeatureGenerator.h:287
float max_value
values range for FTR_LAST(2)_DAYS
Definition FeatureGenerator.h:303
int time_unit_sig
the time unit in which the signal is given. (set correctly from Repository in learn and _generate)
Definition FeatureGenerator.h:300
int _generate(PidDynamicRec &rec, MedFeatures &features, int index, int num, vector< float * > &_p_data)
generate a new feature
Definition FeatureGenerator.cpp:501
int N_th
used in last_nth and category_set_last_nth
Definition FeatureGenerator.h:304
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:297
int time_unit_win
the time unit in which the windows are given. Default: Undefined
Definition FeatureGenerator.h:295
string in_set_name
set name (if not given - take list of members)
Definition FeatureGenerator.h:301
void init_tables(MedDictionarySections &dict)
Init required tables.
Definition FeatureGenerator.cpp:534
TimeRangeTypes time_range_name_to_type(const string &name)
Conversion between time-range type and name.
Definition FeatureGenerator.cpp:401
int sum_channel
for FTR_CATEGORY_SET_SUM
Definition FeatureGenerator.h:298
vector< string > sets
for FTR_CATEGORY_SET_* , the list of sets
Definition FeatureGenerator.h:299
bool bound_outcomeTime
If true will truncate time window till outcomeTime.
Definition FeatureGenerator.h:302
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:596
BasicFeatureTypes name_to_type(const string &name)
Converts a name to type - please refer to BasicFeatureTypes.
Definition FeatureGenerator.cpp:364
vector< char > lut
to be used when generating FTR_CATEGORY_SET_*
Definition FeatureGenerator.h:311
int _learn(MedPidRepository &rep, const MedSamples &samples, vector< RepProcessor * > processors)
Learn a generator.
Definition FeatureGenerator.h:354
int win_to
time window for feature: win_to is the maximal time before the prediction time
Definition FeatureGenerator.h:292
int zero_missing
in some cases of category_set (or others) we may want to get 0 instead of missing_value,...
Definition FeatureGenerator.h:305
float zero_missing_val
when zero_missing is on - whats the value to store in the missing value feature
Definition FeatureGenerator.h:306
int d_win_from
delta time window for the FTR_WIN_DELTA_VALUE feature. the second time window
Definition FeatureGenerator.h:293
BinnedLinearModels : Apply a set of linear models to generate features.
Definition FeatureGenerator.h:555
int _learn(MedPidRepository &rep, const MedSamples &samples, vector< RepProcessor * > processors)
Learn a generator.
Definition BinnedLmEstimates.cpp:180
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:570
int time_unit_sig
the time unit in which the signal is given. Default: Undefined
Definition FeatureGenerator.h:568
int time_channel
n >= 0 : use time channel n , default: 0.
Definition FeatureGenerator.h:569
int time_unit_periods
the time unit in which the periods are given. Default: Undefined
Definition FeatureGenerator.h:567
int _generate(PidDynamicRec &rec, MedFeatures &features, int index, int num, vector< float * > &_p_data)
generate new feature(s)
Definition BinnedLmEstimates.cpp:483
void set_names()
Naming.
Definition BinnedLmEstimates.cpp:34
int filter_features(unordered_set< string > &validFeatures)
Filter generated features according to a set. return number of valid features (does not affect single...
Definition BinnedLmEstimates.cpp:619
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition BinnedLmEstimates.cpp:112
Creates multiple features based on categorical values and statistical dependency strength by Age,...
Definition FeatureGenerator.h:891
bool generate_with_counts
If true will generate feature with counts not just as set.
Definition FeatureGenerator.h:945
float filter_child_lift_ratio
below this threshold of lift change to remove child category
Definition FeatureGenerator.h:930
int max_depth
maximal depth to go in hierarchy
Definition FeatureGenerator.h:937
float male_regression_cntrl_lower
lower limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:949
string regex_filter
regex filter for filtering categories in learn
Definition FeatureGenerator.h:922
bool use_fixed_lift
If true will also sort by lifts below 1.
Definition FeatureGenerator.h:939
string verbose_full_file
output file for verbose_full debug in learn
Definition FeatureGenerator.h:943
int update(map< string, string > &mapper)
Virtual to update object from parsed fields.
Definition CategoryDependencyGenerator.cpp:204
vector< string > filter_set_by_val_channel_names
naming for each set matched filter_set_by_val_channel variable
Definition FeatureGenerator.h:947
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition CategoryDependencyGenerator.cpp:275
category_stat_test stat_metric
statistical test
Definition FeatureGenerator.h:933
float male_regression_cntrl_upper
upper limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:950
int min_code_cnt
minimal number of occurrences to consider signal
Definition FeatureGenerator.h:924
int age_bin
age bin for testing statistical dependency
Definition FeatureGenerator.h:921
float filter_child_pval_diff
below this threshold of pvalue diff change to remove child category (with AND condition on average li...
Definition FeatureGenerator.h:929
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition CategoryDependencyGenerator.cpp:67
string remove_regex_filter
remove regex filter for filtering categories in learn
Definition FeatureGenerator.h:923
string signalName
the signal name
Definition FeatureGenerator.h:912
float female_regression_case_lower
lower limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:955
float male_regression_case_lower
lower limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:951
float lift_below
filter lift to keep below it
Definition FeatureGenerator.h:927
string feature_prefix
additional prefix to add to name to describe the feature
Definition FeatureGenerator.h:944
int time_channel
n >= 0 : use time channel n , default: 0.
Definition FeatureGenerator.h:914
int win_to
time window for feature: win_to is the maximal time before the prediction time
Definition FeatureGenerator.h:917
float female_regression_cntrl_lower
lower limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:953
int minimal_chi_cnt
chi_square arg to keep at least count to use row in calc
Definition FeatureGenerator.h:935
vector< vector< string > > filter_set_by_val_channel
filter set by value channels. can be initialized by "filter_set_by_val_channel_X":"string_set_for_val...
Definition FeatureGenerator.h:946
int max_parents
controls maximum parents count
Definition FeatureGenerator.h:938
float filter_child_removed_ratio
If child removed ratio is beyond this and has other child taken - remove parent.
Definition FeatureGenerator.h:932
float female_regression_case_upper
upper limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:956
int win_from
time window for feature: win_from is the minimal time before the prediction time
Definition FeatureGenerator.h:916
int max_age
maximal age for testing statistical dependency
Definition FeatureGenerator.h:920
float male_regression_case_upper
upper limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:952
int take_top
maximal number of features to create
Definition FeatureGenerator.h:926
float fdr
the FDR value
Definition FeatureGenerator.h:925
bool verbose_full
If true will print a lot - table of all stats for each code.
Definition FeatureGenerator.h:942
bool verbose
Apply hierarchy filtering.
Definition FeatureGenerator.h:941
int min_age
minimal age for testing statistical dependency
Definition FeatureGenerator.h:919
float female_regression_cntrl_upper
upper limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:954
float lift_above
filter lift to keep above it
Definition FeatureGenerator.h:928
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:915
int filter_features(unordered_set< string > &validFeatures)
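Filter generated features according to a set; returns the number of valid features. (The extracted text here, "prints summary of generator job.", appears to be a misplaced fragment of another member's documentation.)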
Definition CategoryDependencyGenerator.cpp:955
float filter_child_count_ratio
If child ratio count is too similar, small change from parent code - keep only parent code.
Definition FeatureGenerator.h:931
int time_unit_win
the time unit in which the windows are given. Default: Undefined
Definition FeatureGenerator.h:918
float chi_square_at_least
chi_square arg to test for at least that change in lift to measure bigger difference
Definition FeatureGenerator.h:934
int sort_by_chi
sort results by chi-square
Definition FeatureGenerator.h:936
Definition FeatureGenerator.h:53
int iGenerateWeights
Feature/Weights generator.
Definition FeatureGenerator.h:72
vector< string > tags
Tags - for defining labels or groups. may be used later for filtering for example.
Definition FeatureGenerator.h:69
FeatureGeneratorTypes generator_type
Type.
Definition FeatureGenerator.h:57
void * new_polymorphic(string derived_class_name)
for polymorphic classes that want to be able to serialize/deserialize a pointer * to the derived clas...
Definition FeatureGenerator.cpp:130
virtual void fit_for_repository(MedPidRepository &rep)
Preparation and adjustment for model based on repository.
Definition FeatureGenerator.h:108
virtual int filter_features(unordered_set< string > &validFeatures)
prints summary of generator job.
Definition FeatureGenerator.cpp:321
float missing_val
Missing value.
Definition FeatureGenerator.h:66
virtual void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.h:153
vector< string > names
Feature name.
Definition FeatureGenerator.h:60
Gender.
Definition FeatureGenerator.h:477
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:782
int genderId
Gender Id.
Definition FeatureGenerator.h:483
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:804
Definition MedDictionary.h:87
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
static int global_serial_id_cnt
A global counter used to prevent identical names for two features by adding FTR_::_ before generated ...
Definition MedFeatures.h:73
A model = repCleaner + featureGenerator + featureProcessor + MedPredictor.
Definition MedModel.h:56
Definition MedPidRepository.h:87
MedSamples represent a collection of samples per different id The data is contained in a vector of ...
Definition MedSamples.h:129
Definition MedSignals.h:719
static const int Undefined
undefined time unit
Definition MedTime.h:24
Use a model to generate predictions to be used as features.
Definition FeatureGenerator.h:746
string modelFile
File for serialized model.
Definition FeatureGenerator.h:749
int _learn(MedPidRepository &rep, const MedSamples &samples, vector< RepProcessor * > processors)
learn method
Definition FeatureGenerator.cpp:2417
void override_predictions(MedSamples &inSamples, MedSamples &modelSamples)
Use a given vector of predictions instead of applying model.
Definition FeatureGenerator.cpp:2446
void prepare(MedFeatures &features, MedPidRepository &rep, MedSamples &samples)
Do the actual prediction prior to feature generation ...
Definition FeatureGenerator.cpp:2484
int use_overriden_predictions
Use a given vector of predictions instead of applying model.
Definition FeatureGenerator.h:757
bool ensure_patient_ids
if true will ensure the ids are the same as current training samples
Definition FeatureGenerator.h:754
string modelName
name of final feature
Definition FeatureGenerator.h:751
int _generate(PidDynamicRec &rec, MedFeatures &features, int index, int num, vector< float * > &_p_data)
generate a new feature
Definition FeatureGenerator.cpp:2531
string model_train_samples
path train model samples.
Definition FeatureGenerator.h:753
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:2352
string model_json
path load json and train model for this.
Definition FeatureGenerator.h:752
int time_unit_win
the time unit in which the times are given. Default: global_default_windows_time_unit
Definition FeatureGenerator.h:758
int n_preds
how many features to create
Definition FeatureGenerator.h:755
void set_names()
Naming.
Definition FeatureGenerator.cpp:2326
MedModel * model
model
Definition FeatureGenerator.h:750
int impute_existing_feature
If true will use model to impute an existing feature (determined by model name. Otherwise - generate ...
Definition FeatureGenerator.h:756
int time_unit_sig
the time unit in which the signal is given. (set correctly from Repository in learn and Generate)
Definition FeatureGenerator.h:759
Definition MedPidRepository.h:127
RangeFeatGenerator : Generate features for a time range with value signal (for example drug)
Definition FeatureGenerator.h:655
int time_unit_range_sig
the time unit in which the range signal is given. (set correctly from Repository in learn and _genera...
Definition FeatureGenerator.h:699
bool regex_on_sets
if on , regex is applied on .*sets[i].* and aggregated.
Definition FeatureGenerator.h:691
int time_unit_win
the time unit in which the windows are given. Default: Undefined
Definition FeatureGenerator.h:678
int check_first
if 1 choose first occurrence of check_val otherwise choose last
Definition FeatureGenerator.h:681
int win_from
time window for feature: from is the minimal time before prediction time
Definition FeatureGenerator.h:676
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:680
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:1036
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:1104
int min_range_time
if different from -1, the minimum length for a range to be considered valid in window time units (els...
Definition FeatureGenerator.h:686
int time_unit_sig
the time unit in which the signal is given. (set correctly from Repository in learn and Generate)
Definition FeatureGenerator.h:679
int conditional_channel
in some cases (currently last_nth_len, and time_covered) we allow doing the calculation only on range...
Definition FeatureGenerator.h:690
string signalName
Signal to consider.
Definition FeatureGenerator.h:672
RangeFeatureTypes type
Type of comorbidity index to generate.
Definition FeatureGenerator.h:675
int N_th
the index of the N-th range in order to consider in the last_nth_time_len option
Definition FeatureGenerator.h:687
int strict_times
if on , will ignore cases in which the second time channel is after the prediction time
Definition FeatureGenerator.h:689
vector< string > sets
FTR_RANGE_EVER checks if the signal ever was in one of these sets/defs from the respective dict.
Definition FeatureGenerator.h:674
int win_to
time window for feature: to is the maximal time before prediction time
Definition FeatureGenerator.h:677
int recurrence_delta
maximum time for a subsequent range signal to be considered a recurrence, in window time units
Definition FeatureGenerator.h:685
RangeFeatureTypes name_to_type(const string &name)
please refer to RangeFeatureTypes to understand the options
Definition FeatureGenerator.cpp:1128
int zero_missing
in some cases we may want to get 0 instead of missing values
Definition FeatureGenerator.h:688
int first_evidence_time_channel
sometimes we have a different time channel stating WHEN the range started. We are strict and use the ...
Definition FeatureGenerator.h:693
vector< char > lut
dividing by this number in time_covered option
Definition FeatureGenerator.h:684
Definition SerializableObject.h:32
int init_from_string(string init_string)
Init from string.
Definition SerializableObject.cpp:121
Singleton.
Definition FeatureGenerator.h:421
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:960
string in_set_name
list of sets
Definition FeatureGenerator.h:435
string signalName
Signal Id.
Definition FeatureGenerator.h:431
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:913
Definition FeatureGenerator.h:810
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:2113
BinnedLinearModels : parameters.
Definition FeatureGenerator.h:529