Medial Code Documentation
Loading...
Searching...
No Matches
FeatureGenerator.h
Go to the documentation of this file.
1#ifndef _FTR_GENERATOR_H_
2#define _FTR_GENERATOR_H_
3
4#include <InfraMed/InfraMed/InfraMed.h>
7#include <MedProcessTools/MedProcessTools/MedFeatures.h>
9#include <MedProcessTools/MedProcessTools/MedModelExceptions.h>
12#include <MedAlgo/MedAlgo/MedLM.h>
13#include <cfloat>
14#include <boost/regex.hpp>
15
16#define DEFAULT_FEAT_GNRTR_NTHREADS 8
17
18// For ModelFeatureGenerator
19class MedModel;
20
21// TBD : Add wrapper for management of features list (read/write to file, etc.)
22
49
54public:
55
58
60 vector<string> names;
61
62 // Threading
63 int learn_nthreads = 16, pred_nthreads = 16;
64
66 float missing_val = (float)MED_MAT_MISSING_VALUE;
67
69 vector<string> tags;
70
73
74 // Naming
75 virtual void set_names() { names.clear(); }
76
77 // Helper - pointers to data vectors in MedFeatures (to save time in generation)
78 vector <float *> p_data;
79
80 // Prepare for feature generation
81 virtual void prepare(MedFeatures &features, MedPidRepository& rep, MedSamples& samples);
82 virtual void get_p_data(MedFeatures& features, vector<float *> &_p_data);
83 void get_p_data(MedFeatures& features) { get_p_data(features, p_data); }
84
85 // Constructor/Destructor
86 FeatureGenerator() { learn_nthreads = DEFAULT_FEAT_GNRTR_NTHREADS; pred_nthreads = DEFAULT_FEAT_GNRTR_NTHREADS; missing_val = MED_MAT_MISSING_VALUE; serial_id = ++MedFeatures::global_serial_id_cnt; };
87 virtual ~FeatureGenerator() { clear(); };
88 virtual void clear() { };
89
90 // Required Signals
91 vector<string> req_signals;
92 vector<int> req_signal_ids;
93
94 void get_required_signal_names(unordered_set<string>& signalNames);
95 virtual void set_required_signal_ids(MedDictionarySections& dict);
96 void get_required_signal_ids(unordered_set<int>& signalIds);
97
98 // generated features
99 virtual void get_generated_features(unordered_set<string>& names_list) { for (auto &s : names) names_list.insert(s); }
100
101 // Signal Ids
102 virtual void set_signal_ids(MedSignals& sigs) { return; }
103
104 // Init required tables
105 virtual void init_tables(MedDictionarySections& dict) { return; }
106
108 virtual void fit_for_repository(MedPidRepository &rep) { return; }
109
110 // Learn a generator
111 virtual int _learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors) { return 0; }
112 int learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors) { set_names(); return _learn(rep, samples, processors); }
113 int learn(MedPidRepository& rep, const MedSamples& samples) { set_names(); return _learn(rep, samples, vector<RepProcessor *>()); }
114
115 // generate feature data from repository
116 // We assume the corresponding MedSamples have been inserted to MedFeatures : either at the end or at position index
117 int _generate(PidDynamicRec& in_rep, MedFeatures& features, int index, int num) {return _generate(in_rep, features, index, num, p_data); }
118
119 // the following is the MAIN generation routine to implement.
120 // note that it is given a p_data of its own. This is in order to allow different records to write results to different places.
121 // the default run will use it with the generator p_data.
122 virtual int _generate(PidDynamicRec& in_rep, MedFeatures& features, int index, int num, vector<float *> &_p_data) { return 0; }
123
124 int generate(PidDynamicRec& in_rep, MedFeatures& features, int index, int num) { return _generate(in_rep, features, index, num); }
125 int generate(PidDynamicRec& in_rep, MedFeatures& features);
126 int generate(MedPidRepository& rep, int id, MedFeatures& features);
127 int generate(MedPidRepository& rep, int id, MedFeatures& features, int index, int num);
128
129 // generate feature data from other features
130 virtual int _generate(MedFeatures& features) { return 0; }
131 int generate(MedFeatures& features) { return _generate(features); }
132
133 // Init
134 // create a generator
135 static FeatureGenerator *make_generator(string name);
136 static FeatureGenerator *make_generator(string name, string params);
137 static FeatureGenerator *make_generator(FeatureGeneratorTypes type);
138 static FeatureGenerator *make_generator(FeatureGeneratorTypes type, string params);
139
140 static FeatureGenerator *create_generator(string &params); // must include fg_type
141
142 virtual int init(void *generator_params) { return 0; };
143 virtual int init(map<string, string>& mapper);
144 virtual void init_defaults() {};
145
146 // Copy
147 virtual void copy(FeatureGenerator *generator) { *this = *generator; }
148
149 // Number of features generated
150 virtual int nfeatures() { return (int)names.size(); }
151
153 virtual void get_required_signal_categories(unordered_map<string, vector<string>> &signal_categories_in_use) const {};
154
155 // Filter generated features according to a set. return number of valid features
156 virtual int filter_features(unordered_set<string>& validFeatures);
157
162 virtual void make_summary() {};
163
164 // Serialization
165 ADD_CLASS_NAME(FeatureGenerator)
167 void *new_polymorphic(string derived_class_name);
168
169 size_t get_generator_size();
170 size_t generator_serialize(unsigned char *blob);
171
172 virtual void print() { fprintf(stderr, "Print Not Implemented for feature\n"); }
173
174
175 // debug print for a feature generator. fg_flag can
176 virtual void dprint(const string &pref, int fg_flag);
177
178
179 int serial_id; // serial id of feature
180};
181
182FeatureGeneratorTypes ftr_generator_name_to_type(const string& generator_name);
183
184//..............................................................................................
185// FeatureSingleChannel -
186// This class is a mediator between FeatureGenerator and classes that generate
187// Features on a single variable (not including age and gender) and, within it, on a single channel.
188//..............................................................................................
189
190//.......................................................................................
191//.......................................................................................
192// Single signal features that do not require learning(e.g. last hemoglobin)
193//.......................................................................................
194//.......................................................................................
195
228
232typedef enum {
235 TIME_RANGE_LAST
237
242private:
243 // actual generators
244 float uget_last(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime); // Added the win as needed to be called on different ones in uget_win_delta
245 float uget_last_nth(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
246 float uget_first(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
247 float uget_last2(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
248 float uget_avg(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
249 float uget_max(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
250 float uget_min(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
251 float uget_sum(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
252 float uget_std(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
253 float uget_last_delta(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
254 float uget_last_time(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
255 float uget_last2_time(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
256 float uget_slope(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
257 float uget_win_delta(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int _d_win_from, int _d_win_to, int outcomeTime);
258 float uget_category_set(PidDynamicRec &rec, UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
259 float uget_category_set_last_nth(PidDynamicRec &rec, UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
260 float uget_category_set_count(PidDynamicRec &rec, UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
261 float uget_category_set_sum(PidDynamicRec &rec, UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
262 float uget_nsamples(UniversalSigVec &usv, int time, int _win_from, int _win_to, int outcomeTime);
263 float uget_exists(UniversalSigVec &usv, int time, int _win_from, int _win_to, int outcomeTime);
264 float uget_range_width(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
265 float uget_max_diff(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
266 float uget_first_time(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
267 float uget_category_set_first(PidDynamicRec &rec, UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
268 float uget_category_set_first_time(PidDynamicRec &rec, UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
269 float uget_time_since_last_change(UniversalSigVec &usv, int time_point, int _win_from, int _win_to, int outcomeTime);
270
271 // Applying non FTR_CATEGORY_SET_* types to categorical data
272 unordered_set<int> categ_require_dict = { FTR_LAST_VALUE, FTR_FIRST_VALUE, FTR_LAST2_VALUE, FTR_LAST_NTH_VALUE }; // Types that require a dictionary if applied to categorical data
273 unordered_set<int> categ_forbidden = { FTR_AVG_VALUE, FTR_MAX_VALUE, FTR_MIN_VALUE, FTR_STD_VALUE, FTR_LAST_DELTA_VALUE,
274 FTR_SLOPE_VALUE,FTR_WIN_DELTA_VALUE, FTR_MAX_DIFF,FTR_RANGE_WIDTH, FTR_SUM_VALUE }; // Types that are not allowed for categorical data
// Mapping from a string to an int id; presumably categorical value name -> id (it is part of the serialized fields) - confirm against the implementation
275 map<string, int> categ_value2id;
// Defaults to true; NOTE(review): where this flag is consulted is not visible in this header
276 bool needs_categ_dict = true;
277
278public:
279 // Feature Description
280 string signalName;
281 int signalId;
282
283 // Signal to determine allowed time-range (e.g. current stay/admission for inpatients)
284 string timeRangeSignalName = "";
285 int timeRangeSignalId;
286 TimeRangeTypes timeRangeType = TIME_RANGE_CURRENT;
288
289 // parameters (should be serialized)
290 BasicFeatureTypes type = FTR_LAST;
291 int win_from = 0;
292 int win_to = 360000;
293 int d_win_from = 360;
294 int d_win_to = 360000;
296 int time_channel = 0;
297 int val_channel = 0;
298 int sum_channel = 1;
299 vector<string> sets;
301 string in_set_name = "";
302 bool bound_outcomeTime = false;
303 float min_value = -FLT_MAX, max_value = FLT_MAX;
304 int N_th = 0;
305 int zero_missing = 0;
307 int full_name = 0;
308 string rename_signal = "";
309
310 // helpers
311 vector<char> lut;
312 vector<float> categ_map;
313 bool apply_categ_map;
314
315 // Naming
316 void set_names();
317
318 void get_required_signal_categories(unordered_map<string, vector<string>> &signal_categories_in_use) const;
319
320 // Constructor/Destructor
321 BasicFeatGenerator() : FeatureGenerator() { init_defaults(); };
322 //~BasicFeatGenerator() {};
323 void set(string& _signalName, BasicFeatureTypes _type) {
324 set(_signalName, _type, 0, 360000);
325 req_signals.assign(1, signalName);
326 if (timeRangeSignalName != "")
327 req_signals.push_back(timeRangeSignalName);
328 }
329
330 void set(string& _signalName, BasicFeatureTypes _type, int _time_win_from, int _time_win_to) {
331 signalName = _signalName; type = _type; win_from = _time_win_from; win_to = _time_win_to;
332 set_names();
333 req_signals.assign(1, signalName);
334 if (timeRangeSignalName != "")
335 req_signals.push_back(timeRangeSignalName);
336 }
337
339 BasicFeatureTypes name_to_type(const string &name);
340
342 TimeRangeTypes time_range_name_to_type(const string& name);
343 string time_range_type_to_name(TimeRangeTypes type);
344
347 int init(map<string, string>& mapper);
348 void init_defaults();
349
350 // Copy
351 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<BasicFeatGenerator *>(generator)); }
352
354 int _learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors) {
355 time_unit_sig = rep.sigs.Sid2Info[rep.sigs.sid(signalName)].time_unit;
356 if (timeRangeSignalName != "")
357 time_unit_range_sig = rep.sigs.Sid2Info[rep.sigs.sid(timeRangeSignalName)].time_unit;
358 return 0;
359 }
360
362 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
363 float get_value(PidDynamicRec& rec, int index, int time, int outcomeTime);
364
366 void set_signal_ids(MedSignals& sigs);
367
370
371 void prepare(MedFeatures &features, MedPidRepository& rep, MedSamples& samples);
372
373 // Serialization
374 ADD_CLASS_NAME(BasicFeatGenerator)
376 names, req_signals, in_set_name, bound_outcomeTime, timeRangeSignalName, timeRangeType, time_unit_sig, N_th, zero_missing, missing_val, categ_value2id, zero_missing_val, full_name, rename_signal)
377
378};
379
384public:
385
386 string signalName;
389
390 ~AgeGenerator() { clear(); }
391 void clear() { }
392
393 // Constructor/Destructor
394 AgeGenerator() {
395 generator_type = FTR_GEN_AGE; names.push_back("Age"); signalId = -1; signalName = "BDATE"; req_signals.assign(1, signalName);
396 }
397 //~AgeGenerator() {};
398
399 // Name
400 void set_names() { if (names.empty()) names.push_back("FTR_" + int_to_string_digits(serial_id, 6) + ".Age"); tags.push_back("Age"); }
401
402 // Copy
403 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<AgeGenerator *>(generator)); }
404
405 // generate a new feature
406 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
407
408 // Signal Ids
409 void set_signal_ids(MedSignals& sigs) { signalId = sigs.sid(signalName); }
410
411 // Serialization
412 ADD_CLASS_NAME(AgeGenerator)
414
415 virtual int init(map<string, string>& mapper);
416};
417
422private:
423 vector<char> lut;
424 unordered_map<string, float> name2Value;
425 vector<float> id2Value;
426
427 void get_id2Value(MedDictionarySections& dict);
428public:
429
432 int signalId;
433
434 vector<string> sets = {};
435 string in_set_name = "";
436
437 // Constructor/Destructor
// Default constructor: sets generator_type to FTR_GEN_SINGLETON, sets signalId to -1, and uses
// signalName both as the single entry pushed to names and as the single required signal.
// NOTE(review): signalName is not assigned in this constructor before being used - presumably it is
// still empty at this point unless it has an in-class initializer; confirm where it gets its value.
438 SingletonGenerator() : FeatureGenerator() { generator_type = FTR_GEN_SINGLETON; names.push_back(signalName); signalId = -1; req_signals.assign(1, signalName); }
// Same initialization as the default constructor, but signalId starts from the caller-supplied id.
439 SingletonGenerator(int _signalId) : FeatureGenerator() { generator_type = FTR_GEN_SINGLETON; names.push_back(signalName); signalId = _signalId; req_signals.assign(1, signalName); }
440
441 // Name
442 void set_names();
443
444 // Init LUT for categorial variable
445 void init_tables(MedDictionarySections& dict);
448 int init(map<string, string>& mapper);
449
450 // Copy
451 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<SingletonGenerator *>(generator)); }
452
453 // learn generator (learning name2Value)
454 int _learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors);
455
456 // generate a new feature
457 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
458
459 // Preparation - just fill the value2Name attribute
460 void prepare(MedFeatures &features, MedPidRepository& rep, MedSamples& samples);
461
462 // Signal Ids
463 void set_signal_ids(MedSignals& sigs) { signalId = sigs.sid(signalName); }
464 void set_required_signal_ids(MedDictionarySections& dict) { req_signal_ids.assign(1, dict.id(signalName)); }
465
466 void get_required_signal_categories(unordered_map<string, vector<string>> &signal_categories_in_use) const;
467
468 // Serialization
469 ADD_CLASS_NAME(SingletonGenerator)
470 ADD_SERIALIZATION_FUNCS(generator_type, req_signals, signalName, names, tags, iGenerateWeights, sets, lut, name2Value)
471};
472
473
478private:
479 vector<string> category_values;
480public:
481
484
485 // Constructor/Destructor
486 GenderGenerator() : FeatureGenerator() { generator_type = FTR_GEN_GENDER; names.push_back("Gender"); genderId = -1; req_signals.assign(1, "GENDER"); }
487 GenderGenerator(int _genderId) : FeatureGenerator() { generator_type = FTR_GEN_GENDER; names.push_back("Gender"); genderId = _genderId; req_signals.assign(1, "GENDER"); }
488
489 //~GenderGenerator() {};
490
491 // Name
492 void set_names() { if (names.empty()) names.push_back("Gender"); tags.push_back("Gender"); }
493
494 // Copy
495 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<GenderGenerator *>(generator)); }
496
499 int init(map<string, string>& mapper);
500
501 // generate a new feature
502 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
503
504 // Signal Ids
505 void set_signal_ids(MedSignals& sigs) { genderId = sigs.sid("GENDER"); }
506 void set_required_signal_ids(MedDictionarySections& dict) {
507 req_signal_ids.assign(1, dict.id("GENDER"));
508 }
509
510 void init_tables(MedDictionarySections& dict) {
511 category_values.clear();
512 if (dict.SectionName2Id.find("GENDER") != dict.SectionName2Id.end()) {
513 int section_id = dict.section_id("GENDER");
514 for (const auto &it : dict.dicts[section_id].Id2Name)
515 category_values.push_back(it.second);
516 }
517 }
518
519 void get_required_signal_categories(unordered_map<string, vector<string>> &signal_categories_in_use) const;
520
521 // Serialization
522 ADD_CLASS_NAME(GenderGenerator)
524};
525
530 vector<int> bin_bounds;
531 int min_period;
532 int max_period;
533 float rfactor;
534
535 vector<int> estimation_points;
536 ADD_CLASS_NAME(BinnedLmEstimatesParams)
537 ADD_SERIALIZATION_FUNCS(bin_bounds, min_period, max_period, rfactor, estimation_points)
538
539};
540
545typedef enum {
546 BINNED_LM_TAKE_ALL = 0,
547 BINNED_LM_STOP_AT_FIRST = 1,
548 BINNED_LM_STOP_AT_LAST = 2,
549 BINNED_LM_LAST
551
556public:
557 // Feature Description
558 string signalName;
559 int signalId, bdateId, genderId;
560
562 BinnedLMSamplingStrategy sampling_strategy = BINNED_LM_TAKE_ALL;
563 vector<MedLM> models;
564 vector<float> xmeans, xsdvs, ymeans, ysdvs;
565 vector<vector<float>> means = { {}, {} };
566
569 int time_channel = 0;
570 int val_channel = 0;
571
573 void set_names();
574
575 // Constructor/Destructor
576 BinnedLmEstimates() : FeatureGenerator() { signalName = ""; init_defaults(); };
577 BinnedLmEstimates(string _signalName) : FeatureGenerator() { signalName = _signalName; init_defaults(); req_signals.push_back(signalName); names.clear(); set_names(); };
578 BinnedLmEstimates(string _signalName, string init_string) : FeatureGenerator() { signalName = _signalName; init_defaults(); req_signals.push_back(signalName); init_from_string(init_string); };
579
580 //~BinnedLmEstimates() {};
581
582 void set(string& _signalName);
583 void set(string& _signalName, BinnedLmEstimatesParams* _params);
584
585 void init_defaults();
588 int init(map<string, string>& mapper);
589
590 // Copy
591 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<BinnedLmEstimates *>(generator)); }
592
594 int _learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors);
595
597 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
598
600 int filter_features(unordered_set<string>& validFeatures);
601
602 // get pointers to data
603 void get_p_data(MedFeatures& features, vector<float *> &_p_data);
604
605 // Signal Ids
606 void set_signal_ids(MedSignals& sigs);
607
608 // Sampling strategy
609 void set_sampling_strategy(string& strategy);
610
611 // Age Related functions
612 void prepare_for_age(PidDynamicRec& rec, UniversalSigVec& ageUsv, int &age, int &byear);
613 void prepare_for_age(MedPidRepository& rep, int id, UniversalSigVec& ageUsv, int &age, int &byear);
614 inline void get_age(int time, int time_unit_from, int& age, int byear);
615
616 void dprint(const string &pref, int fg_flag);
617
618 // Serialization
619 ADD_CLASS_NAME(BinnedLmEstimates)
620 ADD_SERIALIZATION_FUNCS(generator_type, signalName, names, tags, req_signals, time_unit_periods, iGenerateWeights, params, xmeans, xsdvs, ymeans, means, models, time_unit_sig, sampling_strategy)
621
622 // print
623 void print();
624};
625
626
630typedef enum {
642 FTR_RANGE_RECURRENCE_COUNT = 6,
647 FTR_RANGE_TIME_DIFF_START = 9,
649 FTR_RANGE_LAST
651
656private:
657 // actual generators
658 float uget_range_current(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
659 float uget_range_latest(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
660 float uget_range_min(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
661 float uget_range_max(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
662 float uget_range_ever(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
663 float uget_range_time_diff(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
664 float uget_range_recurrence_count(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
665 float uget_range_time_covered(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
666 float uget_range_last_nth_time_len(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
667 float uget_range_time_diff_start(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
668 float uget_range_time_inside(UniversalSigVec &usv, int updated_win_from, int updated_win_to, int time);
669
670public:
671
672 string signalName;
673 int signalId;
674 vector<string> sets;
676 int win_from = 0;
677 int win_to = 360000;
680 int val_channel = 0;
681 int check_first = 1;
682 float div_factor = 1.0f;
683
684 vector<char> lut;
685 int recurrence_delta = 30 * 24 * 60;
686 int min_range_time = -1;
687 int N_th = 0;
688 int zero_missing = 0;
689 int strict_times = 0;
691 bool regex_on_sets= false;
692
694
695 // Signal to determine allowed time-range (e.g. current stay/admission for inpatients)
696 string timeRangeSignalName = "";
697 int timeRangeSignalId;
698 TimeRangeTypes timeRangeType = TIME_RANGE_CURRENT;
700
701
702
703 // Constructor/Destructor
704 RangeFeatGenerator() : FeatureGenerator() { init_defaults(); };
705 //~RangeFeatGenerator() {};
706 void set(string& _signalName, RangeFeatureTypes _type) { set(_signalName, _type, 0, 360000); req_signals.assign(1, signalName); }
707 void set(string& _signalName, RangeFeatureTypes _type, int _time_win_from, int _time_win_to) {
708 signalName = _signalName; type = _type; win_from = _time_win_from; win_to = _time_win_to;
709 set_names(); req_signals.assign(1, signalName);
710 }
711
712 // Naming
713 void set_names();
714
715 void get_required_signal_categories(unordered_map<string, vector<string>> &signal_categories_in_use) const;
716
719 int init(map<string, string>& mapper);
720 void init_defaults();
721 RangeFeatureTypes name_to_type(const string &name);
722 void init_tables(MedDictionarySections& dict);
723 // Copy
724 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<RangeFeatGenerator *>(generator)); }
725
726 // Learn a generator
727 int _learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors) { time_unit_sig = rep.sigs.Sid2Info[rep.sigs.sid(signalName)].time_unit; return 0; }
728
729 // generate a new feature
730 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
731 float get_value(PidDynamicRec& rec, int index, int date);
732
733 // Signal Ids
734 void set_signal_ids(MedSignals& sigs) { signalId = sigs.sid(signalName); }
735
736
737 // Serialization
738 ADD_CLASS_NAME(RangeFeatGenerator)
741};
742
747public:
748
749 string modelFile = "";
750 MedModel *model = NULL;
751 string modelName = "";
752 string model_json = "";
754 bool ensure_patient_ids = true;
755 int n_preds = 1;
758 int time_unit_win = global_default_windows_time_unit;
759 int time_unit_sig = global_default_windows_time_unit;
760 vector<int> times;
761
763 void set_names();
764
767 int init(map<string, string>& mapper);
768 int init_from_model();
769
771 void override_predictions(MedSamples& inSamples, MedSamples& modelSamples);
772
774 void prepare(MedFeatures & features, MedPidRepository& rep, MedSamples& samples);
775
777 int _learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors);
779 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
780
781 void modifySampleTime(MedSamples& samples, int time);
782
783 // (De)Serialize
784 ADD_CLASS_NAME(ModelFeatGenerator)
785 ADD_SERIALIZATION_HEADERS()
786
787 //dctor:
789private:
790 vector<vector<vector<float>>> preds;
791};
792
793
808
809
811public:
812
813 // Time Unit
814 TimeFeatTypes time_unit = FTR_TIME_LAST;
815
816 // Binning of time units
817 vector<int> time_bins;
818 vector<string> time_bin_names;
819
820 // Constructor/Destructor
823
824 // Naming
825 void set_names();
826
829 int init(map<string, string>& mapper);
830 int get_time_unit(string name);
831 int get_time_bins(string& binsInfo);
832 int get_nBins();
833 void set_default_bins();
834 string time_unit_to_string(TimeFeatTypes time_unit);
835
836 // Copy
837 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<TimeFeatGenerator *>(generator)); }
838
839 // generate a new feature
840 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
841
842 // Serialization
843 ADD_CLASS_NAME(TimeFeatGenerator)
844 ADD_SERIALIZATION_FUNCS(generator_type, names, time_unit, time_bins, time_bin_names, tags)
845};
846
853public:
854
855 // Attribute to use
856 string attribute;
857
858 // Feature name (if empty - use attribute)
859 string ftr_name;
860
861 // Constructor/Destructor
864
865 // Naming
866 void set_names();
867
870 int init(map<string, string>& mapper);
871
872 // Copy
873 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<AttrFeatGenerator *>(generator)); }
874
875 // generate a new feature
876 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
877
878 // Serialization
879 ADD_CLASS_NAME(AttrFeatGenerator);
880 ADD_SERIALIZATION_FUNCS(generator_type, ftr_name, attribute, names);
881};
882
883enum class category_stat_test {
884 chi_square = 1,
885 mcnemar = 2
886};
887
892private:
893 int bdate_sid;
894 int gender_sid;
895 map<int, vector<string>> categoryId_to_name; //for regex filter
896 map<int, vector<int>> _member2Sets; //for hierarchy
897 map<int, vector<int>> _set2Members; //for hierarchy
898 unordered_map<int, vector<int>> _member2Sets_flat_cache; //for hierarchy cache in get_parents
899
900 vector<string> top_codes;
901 vector<vector<char>> luts;
902 vector<vector<char>> filter_luts;
903 vector<int> filter_vals_idx; // stores filter indexes
904 int input_sig_num_val_ch; // store num val channels for validation
905
906 void get_parents(int codeGroup, vector<int> &parents, const boost::regex &reg_pat, const boost::regex & remove_reg_pat);
907
908 void get_stats(const unordered_map<int, vector<vector<vector<int>>>> &categoryVal_to_stats,
909 vector<int> &all_signal_values, vector<int> &signal_indexes, vector<double> &valCnts, vector<double> &posCnts,
910 vector<double> &lift, vector<double> &scores, vector<double> &p_values, vector<double> &pos_ratio, vector<int> &dof, const vector<vector<double>> &prior_per_bin) const;
911public:
912 string signalName;
913 int signalId;
925 float fdr;
933 category_stat_test stat_metric;
936 int sort_by_chi = 0;
940 bool filter_hierarchy;
941 bool verbose;
946 vector<vector<string>> filter_set_by_val_channel;
948
957
958 void set_signal_ids(MedSignals& sigs);
959
960 void init_tables(MedDictionarySections& dict);
961
962 // Constructor/Destructor
963 CategoryDependencyGenerator() : FeatureGenerator() { init_defaults(); };
964 void init_defaults();
965
966 virtual void copy(FeatureGenerator *generator) { *this = *(dynamic_cast<CategoryDependencyGenerator *>(generator)); }
967
970 int init(map<string, string>& mapper);
971
972 int update(map<string, string>& mapper);
973
974 void set_names();
975 int filter_features(unordered_set<string>& validFeatures);
976
977 int _learn(MedPidRepository& rep, const MedSamples& samples, vector<RepProcessor *> processors);
978 int _generate(PidDynamicRec& rec, MedFeatures& features, int index, int num, vector<float *> &_p_data);
979
980 int nfeatures();
981
982 void get_required_signal_categories(unordered_map<string, vector<string>> &signal_categories_in_use) const;
983
984 ADD_CLASS_NAME(CategoryDependencyGenerator)
989};
990
991//=======================================
992// Helpers
993//=======================================
994
// Helper declared here and defined elsewhere.
// Inputs: a window (_win_from, _win_to), two time units (_time_unit_win, _time_unit_sig) and a time (_win_time).
// Outputs: _min_time and _max_time, written through the reference parameters.
// boundOutcomeTime / outcome_time are optional (defaults: false / -1).
// NOTE(review): presumably converts the window around _win_time into signal time units and, when
// boundOutcomeTime is set, limits it by outcome_time - confirm against the implementation.
997void get_window_in_sig_time(int _win_from, int _win_to, int _time_unit_win, int _time_unit_sig, int _win_time, int &_min_time, int &_max_time,
998 bool boundOutcomeTime = false, int outcome_time = -1);
999
1001TimeRangeTypes time_range_name_to_type(const string& name);
1002string time_range_type_to_name(TimeRangeTypes type);
1003
1004// update time window according to time-range signal
// First overload: takes the time-range signal vector itself plus the three time units, and writes the
// adjusted windows through updated_win_from / updated_win_to and updated_d_win_from / updated_d_win_to.
// NOTE(review): presumably the d_win outputs are only meaningful when delta_win is true - confirm.
1005void get_updated_time_window(UniversalSigVec& time_range_usv, TimeRangeTypes type, int time_unit_range_sig, int time_unit_win, int time_unit_sig, int time,
1006 int win_from, int& updated_win_from, int win_to, int& updated_win_to, bool delta_win, int d_win_from, int& updated_d_win_from, int d_win_to, int& updated_d_win_to);
// Second overload: works on an explicit range (range_from, range_to) and a single window;
// results are written through updated_win_from / updated_win_to.
1007void get_updated_time_window(TimeRangeTypes type, int range_from, int range_to, int time, int _win_from, int _win_to, int& updated_win_from, int& updated_win_to);
1008
1009
1010//=======================================
1011// Joining the MedSerialize wagon
1012//=======================================
1022
1023#endif
RangeFeatureTypes
Definition FeatureGenerator.h:630
@ FTR_RANGE_EVER
"ever" - boolean 0/1 - finds if there is intersection between signal time window and the defined time...
Definition FeatureGenerator.h:635
@ FTR_RANGE_MAX
"max" - finds the maximal value of the time range signal, that there is intersection of time signal r...
Definition FeatureGenerator.h:633
@ FTR_RANGE_CURRENT
"current" - finds the value of the time range signal that intersect with win_from....
Definition FeatureGenerator.h:631
@ FTR_RANGE_MIN
"min" - finds the minimal value of the time range signal, that there is intersection of time signal r...
Definition FeatureGenerator.h:634
@ FTR_RANGE_TIME_INSIDE
"time_inside" : checks if the prediction time point is currently INSIDE a range,...
Definition FeatureGenerator.h:648
@ FTR_RANGE_TIME_COVERED
"time_covered" : given a time window, sums up all the times in ranges that intersect the time window
Definition FeatureGenerator.h:644
@ FTR_RANGE_TIME_DIFF
"time_diff" - returns time differences between first intersection (if check_first is True) between sign...
Definition FeatureGenerator.h:639
@ FTR_RANGE_LAST_NTH_TIME_LENGTH
"last_nth_time_len" : gives the length (in win_time_unit) of the last_n range in the window....
Definition FeatureGenerator.h:646
@ FTR_RANGE_LATEST
"latest" - finds the last value of the time range signal, that there is intersection of time signal r...
Definition FeatureGenerator.h:632
FeatureGeneratorTypes
Definition FeatureGenerator.h:26
@ FTR_GEN_TIME
"time" - creating sample-time features (e.g. differentiate between times of day, season of year,...
Definition FeatureGenerator.h:40
@ FTR_GEN_EMBEDDING
"embedding" - allows applying a pre trained embedding model to incorporate features into matrix....
Definition FeatureGenerator.h:43
@ FTR_GEN_RANGE
"range" - creating RangeFeatGenerator
Definition FeatureGenerator.h:36
@ FTR_GEN_CATEGORY_DEPEND
"category_depend" - creates features from categorical signal that have statistical strength in sample...
Definition FeatureGenerator.h:42
@ FTR_GEN_DRG_INTAKE
"drugIntake" - creating drugs feature coverage of prescription time - DrugIntakeGenerator
Definition FeatureGenerator.h:37
@ FTR_GEN_KP_SMOKING
"kp_smoking" - creating smoking feature - KpSmokingGenerator
Definition FeatureGenerator.h:34
@ FTR_GEN_SINGLETON
"singleton" - take the value of a time-less signal - SingletonGenerator
Definition FeatureGenerator.h:30
@ FTR_GEN_BASIC
"basic" - creates basic statistic on time windows - BasicFeatGenerator
Definition FeatureGenerator.h:28
@ FTR_GEN_BINNED_LM
"binnedLm" or "binnedLM" - creating linear model for estimating feature in time points - BinnedLmEsti...
Definition FeatureGenerator.h:32
@ FTR_GEN_ATTR
"attr" - creating features from samples attributes. Creates AttrFeatGenerator
Definition FeatureGenerator.h:41
@ FTR_GEN_MODEL
"model" - creating ModelFeatGenerator
Definition FeatureGenerator.h:39
@ FTR_GEN_GENDER
"gender" - creating gender feature - GenderGenerator (special case of singleton)
Definition FeatureGenerator.h:31
@ FTR_GEN_UNIFIED_SMOKING
"unified_smoking" - creating smoking feature - UnifiedSmokingGenerator
Definition FeatureGenerator.h:35
@ FTR_GEN_DIABETES_FINDER
"diabetes_finder" - Diabetes Finder feature. Creates DiabetesFinderGenerator
Definition FeatureGenerator.h:46
@ FTR_GEN_SMOKING
"smoking" - creating smoking feature - SmokingGenerator
Definition FeatureGenerator.h:33
@ FTR_GEN_EXTRACT_TBL
"extract_tbl" - extract values from table with keys and rules to join with each patient....
Definition FeatureGenerator.h:44
@ FTR_GEN_AGE
"age" - creating age feature - AgeGenerator
Definition FeatureGenerator.h:29
@ FTR_GEN_ALCOHOL
"alcohol" - creating alcohol feature - AlcoholGenerator
Definition FeatureGenerator.h:38
@ FTR_GEN_ELIXHAUSER
Calculate Current Elixhauser given latest DRG and Diagnosis information. Creates ElixhauserGenerator.
Definition FeatureGenerator.h:45
TimeRangeTypes time_range_name_to_type(const string &name)
Conversion between time-range type and name.
Definition FeatureGenerator.cpp:2585
BasicFeatureTypes
Definition FeatureGenerator.h:199
@ FTR_LAST_VALUE
"last" - Last Value in Window
Definition FeatureGenerator.h:200
@ FTR_NSAMPLES
"nsamples" - counts the number of times the signal appears in the time window
Definition FeatureGenerator.h:215
@ FTR_MAX_VALUE
"max" - Max value in Window
Definition FeatureGenerator.h:204
@ FTR_FIRST_DAYS
"first_time" - time difference from prediction time to first time with signal
Definition FeatureGenerator.h:219
@ FTR_AVG_VALUE
"avg" - Mean value in Window
Definition FeatureGenerator.h:203
@ FTR_CATEGORY_SET
"category_set" - boolean 0/1 if the signal has the value in the given lut (which initialized by the "...
Definition FeatureGenerator.h:212
@ FTR_MIN_VALUE
"min" - Min value in Window
Definition FeatureGenerator.h:205
@ FTR_MAX_DIFF
"max_diff" maximum diff in window
Definition FeatureGenerator.h:218
@ FTR_EXISTS
"exists" - boolean 0/1 if the signal appears in the time window
Definition FeatureGenerator.h:216
@ FTR_CATEGORY_SET_SUM
"category_set_sum" - sums the values of appearances of sets in the time window
Definition FeatureGenerator.h:214
@ FTR_SUM_VALUE
"sum" - sum of values in window
Definition FeatureGenerator.h:222
@ FTR_TIME_SINCE_LAST_CHANGE
"time_since_last_change" : go over states signal, take last time since the value changed
Definition FeatureGenerator.h:225
@ FTR_LAST_DELTA_VALUE
"last_delta" - Last delta. last-previous_last value
Definition FeatureGenerator.h:207
@ FTR_STD_VALUE
"std" - Standard Dev. value in Window
Definition FeatureGenerator.h:206
@ FTR_CATEGORY_SET_FIRST
"category_set_first" - boolean 0/1 if the signal appears in the time window and did not appear ever be...
Definition FeatureGenerator.h:217
@ FTR_WIN_DELTA_VALUE
"win_delta" - difference in value in two time windows (only if both exist, otherwise missing_value)....
Definition FeatureGenerator.h:211
@ FTR_FIRST_VALUE
"first" - First Value in Window
Definition FeatureGenerator.h:201
@ FTR_LAST2_DAYS
"last2_time" - time difference from prediction time to one previous last time has signal in range of v...
Definition FeatureGenerator.h:209
@ FTR_CATEGORY_SET_FIRST_TIME
"category_set_first_time" - first time of category set found in the time window
Definition FeatureGenerator.h:221
@ FTR_SLOPE_VALUE
"slope" - calculating the slope over the points in the window
Definition FeatureGenerator.h:210
@ FTR_LAST_DAYS
"last_time" - time difference from prediction time to last time has signal in range of values
Definition FeatureGenerator.h:208
@ FTR_LAST2_VALUE
"last2" - One before last value in Window
Definition FeatureGenerator.h:202
@ FTR_CATEGORY_SET_COUNT
"category_set_count" - counts the number of appearances of sets in the time window
Definition FeatureGenerator.h:213
@ FTR_CATEGORY_SET_LAST_NTH
"category_set_last_nth" : (set also N_th parameter to use), checks if the last N_th in window is in th...
Definition FeatureGenerator.h:224
@ FTR_LAST_NTH_VALUE
"last_nth" : (set also N_th parameter to use), get the last N_th in window, 0 is last,...
Definition FeatureGenerator.h:223
@ FTR_RANGE_WIDTH
"range_width" maximal value - minimal value in a given window time frame
Definition FeatureGenerator.h:220
BinnedLMSamplingStrategy
BinnedLinearModels : which time-points to take.
Definition FeatureGenerator.h:545
TimeRangeTypes
Definition FeatureGenerator.h:232
@ TIME_RANGE_CURRENT
"current" - consider only the current time-range
Definition FeatureGenerator.h:233
@ TIME_RANGE_BEFORE
"before" - consider anything before the current time-range
Definition FeatureGenerator.h:234
void get_window_in_sig_time(int _win_from, int _win_to, int _time_unit_win, int _time_unit_sig, int _win_time, int &_min_time, int &_max_time, bool boundOutcomeTime=false, int outcome_time=-1)
gets a [-_win_to, -_win_from] window in win time unit, and returns [_min_time, _max_time] window in s...
Definition FeatureGenerator.cpp:2560
TimeFeatTypes
Time Feature Generator: creating sample-time features (e.g.
Definition FeatureGenerator.h:798
@ FTR_TIME_HOUR
Hour of the day (0-23)
Definition FeatureGenerator.h:803
@ FTR_TIME_YEAR
Year (as is)
Definition FeatureGenerator.h:799
@ FTR_TIME_MONTH
Month of year (0-11)
Definition FeatureGenerator.h:800
@ FTR_TIME_DAY_IN_WEEK
Day of the week (0-6)
Definition FeatureGenerator.h:802
@ FTR_TIME_DAY_IN_MONTH
Day of the month (0-30)
Definition FeatureGenerator.h:801
@ FTR_TIME_MINUTE
Minute of the hour (0-59)
Definition FeatureGenerator.h:804
@ FTR_TIME_DATE
Complete date (as is)
Definition FeatureGenerator.h:805
Logger.h - allowing logs with more control.
MedAlgo - APIs to different algorithms: Linear Models, RF, GBM, KNN, and more.
MedTime.h.
RepProcessor is the parent class for processing a MedRepository or PidDynamicRec Basic functionalitie...
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Age Generator.
Definition FeatureGenerator.h:383
int signalId
Signal Id.
Definition FeatureGenerator.h:388
virtual int init(map< string, string > &mapper)
Virtual to init object from parsed fields.
Definition FeatureGenerator.cpp:739
Attribute Feature Generator: creating features from samples attributes.
Definition FeatureGenerator.h:852
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:2288
A Basic Stats Generator for calculating simple statistics on a time window.
Definition FeatureGenerator.h:241
int win_from
time window for feature: win_from is the minimal time before from the prediction time
Definition FeatureGenerator.h:291
vector< float > categ_map
to be used when applying non FTR_CATEGORY_SET_* types to categorical data
Definition FeatureGenerator.h:312
void set_signal_ids(MedSignals &sigs)
Signal Ids.
Definition FeatureGenerator.cpp:521
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:651
int time_channel
n >= 0 : use time channel n , default: 0.
Definition FeatureGenerator.h:296
int d_win_to
delta time window for the FTR_WIN_DELTA_VALUE feature. the second time window
Definition FeatureGenerator.h:294
int full_name
add time and value channels even if 0
Definition FeatureGenerator.h:307
int time_unit_range_sig
the time unit in which the range signal is given. (set correctly from Repository in learn and _genera...
Definition FeatureGenerator.h:287
float max_value
values range for FTR_LAST(2)_DAYS
Definition FeatureGenerator.h:303
int time_unit_sig
the time unit in which the signal is given. (set correctly from Repository in learn and _generate)
Definition FeatureGenerator.h:300
int _generate(PidDynamicRec &rec, MedFeatures &features, int index, int num, vector< float * > &_p_data)
generate a new feature
Definition FeatureGenerator.cpp:501
int N_th
used in last_nth and category_set_last_nth
Definition FeatureGenerator.h:304
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:297
int time_unit_win
the time unit in which the windows are given. Default: Undefined
Definition FeatureGenerator.h:295
string in_set_name
set name (if not given - take list of members)
Definition FeatureGenerator.h:301
void init_tables(MedDictionarySections &dict)
Init required tables.
Definition FeatureGenerator.cpp:534
TimeRangeTypes time_range_name_to_type(const string &name)
Conversion between time-range type and name.
Definition FeatureGenerator.cpp:401
int sum_channel
for FTR_CATEGORY_SET_SUM
Definition FeatureGenerator.h:298
vector< string > sets
for FTR_CATEGORY_SET_* , the list of sets
Definition FeatureGenerator.h:299
bool bound_outcomeTime
If true will truncate time window till outcomeTime.
Definition FeatureGenerator.h:302
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:596
BasicFeatureTypes name_to_type(const string &name)
Converts a name to type - please refer to BasicFeatureTypes.
Definition FeatureGenerator.cpp:364
vector< char > lut
to be used when generating FTR_CATEGORY_SET_*
Definition FeatureGenerator.h:311
int _learn(MedPidRepository &rep, const MedSamples &samples, vector< RepProcessor * > processors)
Learn a generator.
Definition FeatureGenerator.h:354
int win_to
time window for feature: win_to is the maximal time before the prediction time
Definition FeatureGenerator.h:292
int zero_missing
in some cases of category_set (or others) we may want to get 0 instead of missing_value,...
Definition FeatureGenerator.h:305
float zero_missing_val
when zero_missing is on - whats the value to store in the missing value feature
Definition FeatureGenerator.h:306
int d_win_from
delta time window for the FTR_WIN_DELTA_VALUE feature. the second time window
Definition FeatureGenerator.h:293
BinnedLinearModels : Apply a set of linear models to generate features.
Definition FeatureGenerator.h:555
int _learn(MedPidRepository &rep, const MedSamples &samples, vector< RepProcessor * > processors)
Learn a generator.
Definition BinnedLmEstimates.cpp:180
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:570
int time_unit_sig
the time unit in which the signal is given. Default: Undefined
Definition FeatureGenerator.h:568
int time_channel
n >= 0 : use time channel n , default: 0.
Definition FeatureGenerator.h:569
int time_unit_periods
the time unit in which the periods are given. Default: Undefined
Definition FeatureGenerator.h:567
int _generate(PidDynamicRec &rec, MedFeatures &features, int index, int num, vector< float * > &_p_data)
generate new feature(s)
Definition BinnedLmEstimates.cpp:483
void set_names()
Naming.
Definition BinnedLmEstimates.cpp:34
int filter_features(unordered_set< string > &validFeatures)
Filter generated features according to a set. return number of valid features (does not affect single...
Definition BinnedLmEstimates.cpp:619
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition BinnedLmEstimates.cpp:112
Creates multiple features based on categorical values and statistical dependency strength by Age,...
Definition FeatureGenerator.h:891
bool generate_with_counts
If true will generate feature with counts not just as set.
Definition FeatureGenerator.h:945
float filter_child_lift_ratio
below this threshold of lift change to remove child category
Definition FeatureGenerator.h:930
int max_depth
maximal depth to go in hierarchy
Definition FeatureGenerator.h:937
float male_regression_cntrl_lower
lower limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:949
string regex_filter
regex filter for filtering categories in learn
Definition FeatureGenerator.h:922
bool use_fixed_lift
If true will also sort by lifts below 1.
Definition FeatureGenerator.h:939
string verbose_full_file
output file for verbose_full debug in learn
Definition FeatureGenerator.h:943
int update(map< string, string > &mapper)
Virtual to update object from parsed fields.
Definition CategoryDependencyGenerator.cpp:204
vector< string > filter_set_by_val_channel_names
naming for each set matched filter_set_by_val_channel variable
Definition FeatureGenerator.h:947
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition CategoryDependencyGenerator.cpp:275
category_stat_test stat_metric
statistical test
Definition FeatureGenerator.h:933
float male_regression_cntrl_upper
upper limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:950
int min_code_cnt
minimal number of occurrences to consider signal
Definition FeatureGenerator.h:924
int age_bin
age bin for testing statistical dependency
Definition FeatureGenerator.h:921
float filter_child_pval_diff
below this threshold of pvalue diff change to remove child category (with AND condition on average li...
Definition FeatureGenerator.h:929
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition CategoryDependencyGenerator.cpp:67
string remove_regex_filter
remove regex filter for filtering categories in learn
Definition FeatureGenerator.h:923
string signalName
the signal name
Definition FeatureGenerator.h:912
float female_regression_case_lower
lower limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:955
float male_regression_case_lower
lower limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:951
float lift_below
filter lift to keep below it
Definition FeatureGenerator.h:927
string feature_prefix
additional prefix to add to name to describe the feature
Definition FeatureGenerator.h:944
int time_channel
n >= 0 : use time channel n , default: 0.
Definition FeatureGenerator.h:914
int win_to
time window for feature: win_to is the maximal time before the prediction time
Definition FeatureGenerator.h:917
float female_regression_cntrl_lower
lower limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:953
int minimal_chi_cnt
chi_square arg to keep at least count to use row in calc
Definition FeatureGenerator.h:935
vector< vector< string > > filter_set_by_val_channel
filter set by value channels. can be initialized by "filter_set_by_val_channel_X":"string_set_for_val...
Definition FeatureGenerator.h:946
int max_parents
controls maximum parents count
Definition FeatureGenerator.h:938
float filter_child_removed_ratio
If child removed ratio is beyond this and has other child taken - remove parent.
Definition FeatureGenerator.h:932
float female_regression_case_upper
upper limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:956
int win_from
time window for feature: win_from is the minimal time before from the prediction time
Definition FeatureGenerator.h:916
int max_age
maximal age for testing statistical dependency
Definition FeatureGenerator.h:920
float male_regression_case_upper
upper limit mask on outcome for cases - important in regression
Definition FeatureGenerator.h:952
int take_top
maximal number of features to create
Definition FeatureGenerator.h:926
float fdr
the FDR value
Definition FeatureGenerator.h:925
bool verbose_full
If true will print a lot - table of all stats for each code.
Definition FeatureGenerator.h:942
bool verbose
Apply hierarchy filtering.
Definition FeatureGenerator.h:941
int min_age
minimal age for testing statistical dependency
Definition FeatureGenerator.h:919
float female_regression_cntrl_upper
upper limit mask on outcome for controls - important in regression
Definition FeatureGenerator.h:954
float lift_above
filter lift to keep above it
Definition FeatureGenerator.h:928
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:915
int filter_features(unordered_set< string > &validFeatures)
prints summary of generator job.
Definition CategoryDependencyGenerator.cpp:955
float filter_child_count_ratio
If child ratio count is too similar, small change from parent code - keep only parent code.
Definition FeatureGenerator.h:931
int time_unit_win
the time unit in which the windows are given. Default: Undefined
Definition FeatureGenerator.h:918
float chi_square_at_least
chi_square arg to test for at least that change in lift to measure bigger difference
Definition FeatureGenerator.h:934
int sort_by_chi
sort results by chi-square
Definition FeatureGenerator.h:936
Definition FeatureGenerator.h:53
int iGenerateWeights
Feature/Weights generator.
Definition FeatureGenerator.h:72
vector< string > tags
Tags - for defining labels or groups. may be used later for filtering for example.
Definition FeatureGenerator.h:69
FeatureGeneratorTypes generator_type
Type.
Definition FeatureGenerator.h:57
void * new_polymorphic(string derived_class_name)
for polymorphic classes that want to be able to serialize/deserialize a pointer * to the derived clas...
Definition FeatureGenerator.cpp:130
virtual void fit_for_repository(MedPidRepository &rep)
Preparation and adjustment for model based on repository.
Definition FeatureGenerator.h:108
virtual int filter_features(unordered_set< string > &validFeatures)
prints summary of generator job.
Definition FeatureGenerator.cpp:321
float missing_val
Missing value.
Definition FeatureGenerator.h:66
virtual void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.h:153
vector< string > names
Feature name.
Definition FeatureGenerator.h:60
Gender.
Definition FeatureGenerator.h:477
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:782
int genderId
Gender Id.
Definition FeatureGenerator.h:483
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:804
Definition MedDictionary.h:87
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
static int global_serial_id_cnt
A global counter used to prevent identical names for two features by adding FTR_::_ before generated ...
Definition MedFeatures.h:73
A model = repCleaner + featureGenerator + featureProcessor + MedPredictor.
Definition MedModel.h:56
Definition MedPidRepository.h:87
MedSamples represents a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:129
Definition MedSignals.h:719
static const int Undefined
undefined time unit
Definition MedTime.h:24
Use a model to generate predictions to be used as features.
Definition FeatureGenerator.h:746
string modelFile
File for serialized model.
Definition FeatureGenerator.h:749
int _learn(MedPidRepository &rep, const MedSamples &samples, vector< RepProcessor * > processors)
learn method
Definition FeatureGenerator.cpp:2417
void override_predictions(MedSamples &inSamples, MedSamples &modelSamples)
Use a given vector of predictions instead of applying model.
Definition FeatureGenerator.cpp:2446
void prepare(MedFeatures &features, MedPidRepository &rep, MedSamples &samples)
Do the actual prediction prior to feature generation ...
Definition FeatureGenerator.cpp:2484
int use_overriden_predictions
Use a given vector of predictions instead of applying model.
Definition FeatureGenerator.h:757
bool ensure_patient_ids
if true will ensure the ids are the same as current training samples
Definition FeatureGenerator.h:754
string modelName
name of final feature
Definition FeatureGenerator.h:751
int _generate(PidDynamicRec &rec, MedFeatures &features, int index, int num, vector< float * > &_p_data)
generate a new feature
Definition FeatureGenerator.cpp:2531
string model_train_samples
path train model samples.
Definition FeatureGenerator.h:753
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:2352
string model_json
path load json and train model for this.
Definition FeatureGenerator.h:752
int time_unit_win
the time unit in which the times are given. Default: global_default_windows_time_unit
Definition FeatureGenerator.h:758
int n_preds
how many features to create
Definition FeatureGenerator.h:755
void set_names()
Naming.
Definition FeatureGenerator.cpp:2326
MedModel * model
model
Definition FeatureGenerator.h:750
int impute_existing_feature
If true will use model to impute an existing feature (determined by model name. Otherwise - generate ...
Definition FeatureGenerator.h:756
int time_unit_sig
the time unit in which the signal is given. (set correctly from Repository in learn and Generate)
Definition FeatureGenerator.h:759
Definition MedPidRepository.h:127
RangeFeatGenerator : Generate features for a time range with value signal (for example drug)
Definition FeatureGenerator.h:655
int time_unit_range_sig
the time unit in which the range signal is given. (set correctly from Repository in learn and _genera...
Definition FeatureGenerator.h:699
bool regex_on_sets
if on , regex is applied on .*sets[i].* and aggregated.
Definition FeatureGenerator.h:691
int time_unit_win
the time unit in which the windows are given. Default: Undefined
Definition FeatureGenerator.h:678
int check_first
if 1 choose first occurrence of check_val otherwise choose last
Definition FeatureGenerator.h:681
int win_from
time window for feature: from is the minimal time before prediction time
Definition FeatureGenerator.h:676
int val_channel
n >= 0 : use val channel n , default : 0.
Definition FeatureGenerator.h:680
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:1036
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:1104
int min_range_time
if different from -1, the minimum length for a range to be considered valid in window time units (els...
Definition FeatureGenerator.h:686
int time_unit_sig
the time unit in which the signal is given. (set correctly from Repository in learn and Generate)
Definition FeatureGenerator.h:679
int conditional_channel
in some cases (currently last_nth_len, and time_covered) we allow doing the calculation only on range...
Definition FeatureGenerator.h:690
string signalName
Signal to consider.
Definition FeatureGenerator.h:672
RangeFeatureTypes type
Type of comorbidity index to generate.
Definition FeatureGenerator.h:675
int N_th
the index of the N-th range in order to consider in the last_nth_time_len option
Definition FeatureGenerator.h:687
int strict_times
if on , will ignore cases in which the second time channel is after the prediction time
Definition FeatureGenerator.h:689
vector< string > sets
FTR_RANGE_EVER checks if the signal ever was in one of these sets/defs from the respective dict.
Definition FeatureGenerator.h:674
int win_to
time window for feature: to is the maximal time before prediction time
Definition FeatureGenerator.h:677
int recurrence_delta
maximum time for a subsequent range signal to be considered a recurrence in window time units
Definition FeatureGenerator.h:685
RangeFeatureTypes name_to_type(const string &name)
please refer to RangeFeatureTypes to understand the options
Definition FeatureGenerator.cpp:1128
int zero_missing
in some cases we may want to get 0 instead of missing values
Definition FeatureGenerator.h:688
int first_evidence_time_channel
sometimes we have a different time channel stating WHEN the range started. We are strict and use the ...
Definition FeatureGenerator.h:693
vector< char > lut
dividing by this number in time_covered option
Definition FeatureGenerator.h:684
Definition SerializableObject.h:32
int init_from_string(string init_string)
Init from string.
Definition SerializableObject.cpp:121
Singleton.
Definition FeatureGenerator.h:421
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:960
string in_set_name
list of sets
Definition FeatureGenerator.h:435
string signalName
Signal Id.
Definition FeatureGenerator.h:431
void get_required_signal_categories(unordered_map< string, vector< string > > &signal_categories_in_use) const
returns for each used signal its used categories
Definition FeatureGenerator.cpp:913
Definition FeatureGenerator.h:810
int init(map< string, string > &mapper)
The parsed fields from init command.
Definition FeatureGenerator.cpp:2113
BinnedLinearModels : parameters.
Definition FeatureGenerator.h:529