Medial Code Documentation
Loading...
Searching...
No Matches
bootstrap.h
Go to the documentation of this file.
1#ifndef __BOOTSTRAP_ANALYSIS_H__
2#define __BOOTSTRAP_ANALYSIS_H__
3#include <vector>
4#include <string>
5#include <map>
6#include <random>
10#include <MedStat/MedStat/MedStat.h>
11
12using namespace std;
13
19static MedTime med_time;
20
27{
28public:
40 Lazy_Iterator(const vector<int> *p_pids, const vector<float> *p_preds,
41 const vector<float> *p_y, const vector<float> *p_w, float p_sample_ratio, int p_sample_per_pid, int max_loops, int seed, const vector<int> *p_preds_order = NULL);
42
43 void init(const vector<int> *p_pids, const vector<float> *p_preds,
44 const vector<float> *p_y, const vector<float> *p_w, float p_sample_ratio, int p_sample_per_pid, int max_loops, int seed);
45
49 inline bool fetch_next(int thread, float &ret_y, float &ret_pred, float &weight);
50 inline bool fetch_next(int thread, float &ret_y, const float *&ret_pred, float &weight, const int *&preds_order);
54 bool fetch_next_external(int thread, float &ret_y, float &ret_pred, float &weight);
58 bool fetch_next_external(int thread, float &ret_y, float &ret_pred, float &weight, const int *&preds_order);
59
63 void restart_iterator(int thread);
70 void set_static(const vector<float> *p_y, const vector<float> *p_preds, const vector<float> *p_w, const vector<int> *p_preds_order, int thread_num);
71
72 void set_thread_sample_all(int thread);
73
75
76 // sampling params:
81private:
82 // internal structure - one time init
83 static random_device rd;
84 vector<mt19937> rd_gen;
85 uniform_int_distribution<> rand_pids;
86 vector<int> ind_to_pid;
87 vector<vector<int>> pid_index_to_indexes; // for each pid_index retrieve the indexes in the original vectors
88 vector<uniform_int_distribution<>> internal_random;
89 int cohort_size;
90 int min_pid_start;
91 // init each time again
92 // save for each Thread!
93 vector<int> current_pos;
94 vector<int> inner_pos; // only used when sample_per_pid==0
95 vector<int> sel_pid_index; // only used when sample_per_pid==0
96 vector<int> vec_size;
97 vector<const float *> vec_y;
98 vector<const float *> vec_preds;
99 vector<const float *> vec_weights;
100 vector<const int *> vec_preds_order;
101 // original vectors
102 const float *preds;
103 const float *y;
104 const float *weights;
105 const vector<int> *pids;
106 const int *preds_order;
107
108 // threading:
109 int maxThreadCount;
110};
111
119{
120public:
121 // no calc
122 Mem_Iterator() {}
123
133 Mem_Iterator(const vector<int> &pids, const vector<int> &cohort_indexes, float p_sample_ratio, int p_sample_per_pid, int seed);
134
138 inline void fetch_selection(vector<int> &indexes) const;
139
143 void fetch_selection_external(vector<int> &indexes) const;
144
148 inline void fetch_selection(mt19937 &rd_gen, vector<int> &indexes) const;
149
153 void fetch_selection_external(mt19937 &rd_gen, vector<int> &indexes) const;
154
155 // sampling params:
158private:
159 // internal structure - one time init
160 mt19937 _rd_gen;
161
162 int cohort_size;
163 int tot_rec_cnt;
164 vector<vector<int>> pid_to_inds;
165 vector<int> ind_to_pid;
166 vector<int> cohort_idx;
167};
168
173{
174public:
177 virtual ~Measurement_Params() {};
178};
179
180#pragma region Measurements Functions
/// Measurement functions. Every function below takes a Lazy_Iterator, the thread number and a
/// Measurement_Params pointer, and returns a map from string to float - the same signature as the
/// MeasurementFunctions callback type.
/// NOTE(review): what each function computes is not visible in this header; presumably it matches the
/// function name - confirm against bootstrap.cpp.
188map<string, float> calc_npos_nneg(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params);
196map<string, float> calc_only_auc(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params);
205map<string, float> calc_roc_measures_with_inc(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params); // with PPV and PR
213map<string, float> calc_kandel_tau(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params);
221map<string, float> calc_multi_class(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params);
222
234map<string, float> calc_harrell_c_statistic(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params);
235
244map<string, float> calc_regression(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params);
245
246// For example we can put here statistical measures for regression problem or more measurements for classification..
247#pragma endregion
248
253{
254public:
255 // age bin config:
257 float min_age;
258 float max_age;
261 // male:
262 vector<vector<double>> male_labels_count_per_age;
263 // female:
264 vector<vector<double>> female_labels_count_per_age;
265
280 void read_from_text_file(const string &text_file);
282 void write_to_text_file(const string &text_file);
283
286};
287
343
348{
349public:
350 vector<int> top_n;
352 vector<float> dist_weights;
353 vector<vector<float>> dist_matrix;
354 string dist_name = "JACCARD";
355 string dist_file;
356 bool do_class_auc = false;
357
359 {
360 top_n = {1, 5};
361 n_categ = 1;
362 }
363 Multiclass_Params(const string &init_string);
364
365 int init(map<string, string> &map);
366 void read_dist_matrix_from_file(const string &fileName);
367
368 ADD_CLASS_NAME(Multiclass_Params)
369 ADD_SERIALIZATION_FUNCS(top_n, n_categ, dist_weights, dist_file, dist_matrix, dist_name, do_class_auc)
370};
371
376{
377public:
378 bool do_logloss = false;
379 double epsilon = 1e-5;
380 vector<float> coverage_quantile_percentages;
381
387 int init(map<string, string> &mapper);
388
389 ADD_CLASS_NAME(Regression_Params)
390 ADD_SERIALIZATION_FUNCS(do_logloss, coverage_quantile_percentages, epsilon)
391};
392
393#pragma region Cohort Functions
/// Filters samples based on a single Filter_Param object.
/// Has the FilterCohortFunc signature: record_info maps a name to a vector of per-sample values and
/// index selects the sample.
/// NOTE(review): cohort_params is presumably a pointer to a Filter_Param - confirm against bootstrap.cpp.
397bool filter_range_param(const map<string, vector<float>> &record_info, int index, void *cohort_params); // on single param
/// Filters samples based on multiple Filter_Param objects held in a vector, combined with an AND condition.
/// Has the FilterCohortFunc signature: record_info maps a name to a vector of per-sample values and
/// index selects the sample.
/// NOTE(review): cohort_params is presumably a pointer to a vector<Filter_Param> - confirm against bootstrap.cpp.
402bool filter_range_params(const map<string, vector<float>> &record_info, int index, void *cohort_params); // on vector of params
403#pragma endregion
404
409{ // for example Age and range for filter
410public:
411 string param_name;
412 float min_range;
413 float max_range;
414
421 Filter_Param(const string &init_string);
422
429 int init_from_string(string init_string);
430
433 int init(map<string, string> &map);
434
439
441};
442
444{
445 ROC_Params *roc_params;
446 vector<Filter_Param> *filter;
447};
448
449// Infra
/// Measurement callback: receives a Lazy_Iterator and the thread number for iterating the predictions
/// and labels, plus a Measurement_Params pointer; returns a map from string to float.
453typedef map<string, float> (*MeasurementFunctions)(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params);
/// Cohort filter callback: receives a map from feature name to a vector of all samples' values,
/// the sample index and opaque cohort parameters; returns a bool.
/// NOTE(review): presumably true means the sample belongs to the cohort - confirm against callers.
456typedef bool (*FilterCohortFunc)(const map<string, vector<float>> &record_info, int index, void *cohort_params);
/// Callback to process and manipulate the measurement function params based on the given cohort.
/// It receives both cohort-level vectors (y, pids, filtered_indexes) and the full vectors (y_full, pids_full).
459typedef void (*ProcessMeasurementParamFunc)(const map<string, vector<float>> &additional_info, const vector<float> &y, const vector<int> &pids, Measurement_Params *function_params,
460 const vector<int> &filtered_indexes, const vector<float> &y_full, const vector<int> &pids_full);
/// Callback to preprocess the prediction scores (binning, for example, to speed up bootstrap).
/// preds is passed by non-const reference, so the callback may modify the scores in place.
463typedef void (*PreprocessScoresFunc)(vector<float> &preds, Measurement_Params *function_params);
464
465#pragma region Process Measurement Param Functions
/// Calculates the incidence in each cohort as a preprocessing step on function_params.
/// Has the ProcessMeasurementParamFunc signature.
/// NOTE(review): presumably function_params points to a ROC_Params whose incidence fields are filled in -
/// confirm against bootstrap.cpp.
470void fix_cohort_sample_incidence(const map<string, vector<float>> &additional_info,
471 const vector<float> &y, const vector<int> &pids, Measurement_Params *function_params,
472 const vector<int> &filtered_indexes, const vector<float> &y_full, const vector<int> &pids_full);
473
/// Calculates the incidence in each cohort as a preprocessing step on function_params ("old" variant).
/// Has the ProcessMeasurementParamFunc signature.
/// NOTE(review): how this differs from fix_cohort_sample_incidence is not visible in this header -
/// confirm against bootstrap.cpp.
479void fix_cohort_sample_incidence_old(const map<string, vector<float>> &additional_info,
480 const vector<float> &y, const vector<int> &pids, Measurement_Params *function_params,
481 const vector<int> &filtered_indexes, const vector<float> &y_full, const vector<int> &pids_full);
482#pragma endregion
483
484#pragma region Process Scores Functions
/// Binning function of scores based on ROC_Params. Has the PreprocessScoresFunc signature;
/// preds is passed by non-const reference.
/// NOTE(review): function_params is presumably expected to point to a ROC_Params - confirm against bootstrap.cpp.
488void preprocess_bin_scores(vector<float> &preds, Measurement_Params *function_params);
489#pragma endregion
490
/// Formats a measurement name for a working point as "<init_str>_<wp>".
///
/// The working point is printed with 3 decimals and zero-padded to a minimum width of 6
/// characters ("%06.3f"), e.g. ("SENS@FPR", 0.05) -> "SENS@FPR_05.000".
///
/// @param init_str prefix of the measurement name
/// @param wp       working point value
/// @param perc     when true, wp is multiplied by 100 before formatting
/// @return the formatted name; the result is never truncated, whatever the length of init_str
inline std::string format_working_point(const std::string &init_str, float wp, bool perc = true)
{
    if (perc)
        wp *= 100;

    // First pass only measures the formatted length, so that a long prefix is not cut off
    // by a fixed-size buffer.
    const int len = std::snprintf(nullptr, 0, "%s_%06.3f", init_str.c_str(), wp);
    if (len < 0)
        return init_str; // formatting failure is not expected for %s/%f; fall back to the bare prefix

    std::vector<char> buf(static_cast<std::size_t>(len) + 1); // +1 for the terminating NUL
    std::snprintf(buf.data(), buf.size(), "%s_%06.3f", init_str.c_str(), wp);
    return std::string(buf.data(), static_cast<std::size_t>(len));
}
502
/// Formats a measurement name for a Top-N working point as "<init_str>_<wp>".
///
/// @param init_str prefix of the measurement name
/// @param wp       the integer working point, printed in decimal
/// @param perc     accepted for signature symmetry with format_working_point; it is not used
/// @return the formatted name; the result is never truncated, whatever the length of init_str
inline std::string format_working_point_topn(const std::string &init_str, int wp, bool perc = true)
{
    (void)perc; // intentionally unused
    // Build the string directly instead of going through a fixed-size char buffer,
    // which would cut off long prefixes.
    return init_str + "_" + std::to_string(wp);
}
509
/// Runs the bootstrap on a single cohort.
/// Returns a map from string to float; seed defaults to 0 and warn_cnt is passed by non-const reference.
/// NOTE(review): presumably the returned keys are measurement names and warn_cnt accumulates a warning
/// count - confirm against bootstrap.cpp.
513map<string, float> booststrap_analyze_cohort(const vector<float> &preds, const vector<int> &preds_order, const vector<float> &y,
514 const vector<int> &pids, float sample_ratio, int sample_per_pid, int loopCnt,
515 const vector<MeasurementFunctions> &meas_functions, const vector<Measurement_Params *> &function_params,
516 ProcessMeasurementParamFunc process_measurments_params,
517 const map<string, vector<float>> &additional_info, const vector<float> &y_full,
518 const vector<int> &pids_full, const vector<float> *weights, const vector<int> &filter_indexes, FilterCohortFunc cohort_def,
519 void *cohort_params, int &warn_cnt, const string &cohort_name, int seed = 0);
520
527
/// The main bootstrap function: runs the whole bootstrap process with all the arguments.
/// Defaults visible here: meas_functions = {calc_roc_measures_with_inc}, all pointer/callback arguments
/// NULL, sample_ratio = 1.0, sample_per_pid = 1, loopCnt = 500, seed = 0, binary_outcome = true.
/// filter_cohort maps a string to a FilterCohortFunc and cohort_params maps a string to an opaque pointer.
/// NOTE(review): presumably both maps are keyed by cohort name and the result is
/// cohort name -> (measurement name -> value) - confirm against bootstrap.cpp.
543map<string, map<string, float>> booststrap_analyze(
544 const vector<float> &preds,
545 const vector<int> &preds_order,
546 const vector<float> &y,
547 const vector<float> *weights,
548 const vector<int> &pids,
549 const map<string, vector<float>> &additional_info,
550 const map<string, FilterCohortFunc> &filter_cohort,
551 const vector<MeasurementFunctions> &meas_functions = {calc_roc_measures_with_inc},
552 const map<string, void *> *cohort_params = NULL,
553 const vector<Measurement_Params *> *function_params = NULL,
554 ProcessMeasurementParamFunc process_measurments_params = NULL,
555 PreprocessScoresFunc preprocess_scores = NULL,
556 Measurement_Params *preprocess_scores_params = NULL,
557 float sample_ratio = (float)1.0,
558 int sample_per_pid = 1,
559 int loopCnt = 500,
560 int seed = 0,
561 bool binary_outcome = true);
562
/// Preparation step for the bootstrap.
/// indexes is passed by non-const reference, while pids and additional_info are const.
/// NOTE(review): presumably fills indexes with the selected sample indexes for the cohort defined by
/// filter_cohort/cohort_params - this is not shown in the header; confirm against bootstrap.cpp.
576void prepare_for_bootstrap(const vector<int> &pids,
577 const map<string, vector<float>> &additional_info, FilterCohortFunc &filter_cohort, void *cohort_params, float sample_ratio, int sample_per_pid, int seed, vector<int> &indexes);
578
/// Outputs the bootstrap results into a file in TAB-delimited format.
/// run_id is optional and defaults to an empty string.
583void write_bootstrap_results(const string &file_name, const map<string, map<string, float>> &all_cohorts_measurments, const string &run_id = "");
/// Reads the bootstrap results from a file in TAB-delimited format.
/// The results are returned through all_cohorts_measurments, which is passed by non-const reference.
588void read_bootstrap_results(const string &file_name, map<string, map<string, float>> &all_cohorts_measurments);
589
/// Outputs the bootstrap results into a file in the new (pivot) format, whose columns start with
/// "Cohort$Measure_Name". run_id is optional and defaults to an empty string.
594void write_pivot_bootstrap_results(const string &file_name, const map<string, map<string, float>> &all_cohorts_measurments, const string &run_id = "");
/// Reads the bootstrap results from a file in the new (pivot) format, whose columns start with
/// "Cohort$Measure_Name". The results are returned through all_cohorts_measurments, which is passed
/// by non-const reference.
599void read_pivot_bootstrap_results(const string &file_name, map<string, map<string, float>> &all_cohorts_measurments);
600
606
607#endif // !__BOOTSTRAP_ANALYSIS_H__
Logger.h - allowing logs with more control.
MedTime.h.
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
map< string, float > booststrap_analyze_cohort(const vector< float > &preds, const vector< int > &preds_order, const vector< float > &y, const vector< int > &pids, float sample_ratio, int sample_per_pid, int loopCnt, const vector< MeasurementFunctions > &meas_functions, const vector< Measurement_Params * > &function_params, ProcessMeasurementParamFunc process_measurments_params, const map< string, vector< float > > &additional_info, const vector< float > &y_full, const vector< int > &pids_full, const vector< float > *weights, const vector< int > &filter_indexes, FilterCohortFunc cohort_def, void *cohort_params, int &warn_cnt, const string &cohort_name, int seed=0)
to run bootstrap on single cohort
Definition bootstrap.cpp:510
void read_pivot_bootstrap_results(const string &file_name, map< string, map< string, float > > &all_cohorts_measurments)
Will read the bootstrap results from a file in the new format with columns: "Cohort$Measure_Name",...
Definition bootstrap.cpp:1014
void(* ProcessMeasurementParamFunc)(const map< string, vector< float > > &additional_info, const vector< float > &y, const vector< int > &pids, Measurement_Params *function_params, const vector< int > &filtered_indexes, const vector< float > &y_full, const vector< int > &pids_full)
a function to process and manipulate function params based on the given cohort - for example storing...
Definition bootstrap.h:459
void prepare_for_bootstrap(const vector< int > &pids, const map< string, vector< float > > &additional_info, FilterCohortFunc &filter_cohort, void *cohort_params, float sample_ratio, int sample_per_pid, int seed, vector< int > &indexes)
Definition bootstrap.cpp:494
void fix_cohort_sample_incidence_old(const map< string, vector< float > > &additional_info, const vector< float > &y, const vector< int > &pids, Measurement_Params *function_params, const vector< int > &filtered_indexes, const vector< float > &y_full, const vector< int > &pids_full)
a function to calculate the incidence in each cohort - preprocessing of function_params and storing t...
Definition bootstrap.cpp:2901
map< string, float >(* MeasurementFunctions)(Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
Function which receives a Lazy_Iterator and the thread num for iterating the predictions and labels.
Definition bootstrap.h:453
void write_pivot_bootstrap_results(const string &file_name, const map< string, map< string, float > > &all_cohorts_measurments, const string &run_id="")
Will output the bootstrap results into a file in the new format with columns: "Cohort$Measure_Name",...
Definition bootstrap.cpp:983
bool filter_range_param(const map< string, vector< float > > &record_info, int index, void *cohort_params)
A function to filter samples based on single Filter_Param object.
Definition bootstrap.cpp:2703
void preprocess_bin_scores(vector< float > &preds, Measurement_Params *function_params)
Binning function of scores based on ROC_Params.
Definition bootstrap.cpp:3030
bool filter_range_params(const map< string, vector< float > > &record_info, int index, void *cohort_params)
A function to filter samples based on multiple Filter_Param objects - in a vector with AND condition ...
Definition bootstrap.cpp:2713
string format_working_point(const string &init_str, float wp, bool perc=true)
Format out measurement.
Definition bootstrap.h:494
void write_bootstrap_results(const string &file_name, const map< string, map< string, float > > &all_cohorts_measurments, const string &run_id="")
Will output the bootstrap results into a file in TAB-delimited format.
Definition bootstrap.cpp:915
void(* PreprocessScoresFunc)(vector< float > &preds, Measurement_Params *function_params)
a function to preprocess the prediction scores (binning for example to speed up bootstrap).
Definition bootstrap.h:463
bool(* FilterCohortFunc)(const map< string, vector< float > > &record_info, int index, void *cohort_params)
Function which receives a map from feature name to vector of all samples' values, sample index and cohort...
Definition bootstrap.h:456
map< string, map< string, float > > booststrap_analyze(const vector< float > &preds, const vector< int > &preds_order, const vector< float > &y, const vector< float > *weights, const vector< int > &pids, const map< string, vector< float > > &additional_info, const map< string, FilterCohortFunc > &filter_cohort, const vector< MeasurementFunctions > &meas_functions={calc_roc_measures_with_inc}, const map< string, void * > *cohort_params=NULL, const vector< Measurement_Params * > *function_params=NULL, ProcessMeasurementParamFunc process_measurments_params=NULL, PreprocessScoresFunc preprocess_scores=NULL, Measurement_Params *preprocess_scores_params=NULL, float sample_ratio=(float) 1.0, int sample_per_pid=1, int loopCnt=500, int seed=0, bool binary_outcome=true)
The main bootstrap function to run all bootstrap process with all the arguments.
Definition bootstrap.cpp:749
void read_bootstrap_results(const string &file_name, map< string, map< string, float > > &all_cohorts_measurments)
Will read the bootstrap results from a file in TAB-delimited format.
Definition bootstrap.cpp:951
void fix_cohort_sample_incidence(const map< string, vector< float > > &additional_info, const vector< float > &y, const vector< int > &pids, Measurement_Params *function_params, const vector< int > &filtered_indexes, const vector< float > &y_full, const vector< int > &pids_full)
a function to calculate the incidence in each cohort - preprocessing of function_params and storing t...
Definition bootstrap.cpp:2812
Parameter object for filter_params functions.
Definition bootstrap.h:409
int init(map< string, string > &map)
default init function for each parameter.
Definition bootstrap.cpp:3176
float max_range
the maximal range for the parameter
Definition bootstrap.h:413
Filter_Param()
default Ctor
Definition bootstrap.h:438
int init_from_string(string init_string)
initializing object in format: "PARAM_NAME:MIN_RANGE,MAX_RANGE".
Definition bootstrap.cpp:3160
float min_range
the minimal range for the parameter
Definition bootstrap.h:412
string param_name
The parameter name for the filtering.
Definition bootstrap.h:411
The Incident Object which holds the gender, age incidence stats.
Definition bootstrap.h:253
vector< vector< double > > male_labels_count_per_age
for each age_bin, histogram of outcome labels
Definition bootstrap.h:262
void write_to_text_file(const string &text_file)
Writing the file. please refer to read_from_text_file for the file format.
Definition bootstrap.cpp:3193
float max_age
the maximal age in the file
Definition bootstrap.h:258
int age_bin_years
age bin size in years
Definition bootstrap.h:256
void read_from_text_file(const string &text_file)
Reading the file.
Definition bootstrap.cpp:3215
vector< vector< double > > female_labels_count_per_age
for each age_bin, histogram of outcome labels
Definition bootstrap.h:264
float min_age
the minimal age in the file
Definition bootstrap.h:257
vector< float > sorted_outcome_labels
outcome_labels - sorted:
Definition bootstrap.h:260
A class which fetches the samples in bootstrap manner in lazy way.
Definition bootstrap.h:27
void set_static(const vector< float > *p_y, const vector< float > *p_preds, const vector< float > *p_w, const vector< int > *p_preds_order, int thread_num)
set the bootstrap to retrieve those vectors p_y,p_preds with no randomizations
Definition bootstrap.cpp:191
bool sample_all_no_sampling
for calculating Obs if true
Definition bootstrap.h:79
int sample_per_pid
how many samples to take for each patient. 0 means no sampling - take all samples for the patient
Definition bootstrap.h:78
bool fetch_next_external(int thread, float &ret_y, float &ret_pred, float &weight)
external function to fetch next pred,label couple in the bootstrap process for external implementatio...
Definition bootstrap.cpp:274
void restart_iterator(int thread)
to restart the iterator
Definition bootstrap.cpp:285
float sample_ratio
the sample ratio of the patients out of all patients in each bootstrap
Definition bootstrap.h:77
bool fetch_next(int thread, float &ret_y, float &ret_pred, float &weight)
Inline function to fetch next pred,label couple in the bootstrap process.
Definition bootstrap.cpp:203
size_t num_categories
number of categories (inferred)
Definition bootstrap.h:80
A base class for measurements parameter.
Definition bootstrap.h:173
bool show_warns
If true, will show warnings.
Definition bootstrap.h:175
The following is a class to handle time conversions There's a global instance of it declared below (m...
Definition MedTime.h:19
A class which fetches the samples in bootstrap manner in an in-memory way.
Definition bootstrap.h:119
void fetch_selection(vector< int > &indexes) const
Inline function to fetch indexes.
Definition bootstrap.cpp:472
int sample_per_pid
how many samples to take for each patient. 0 means no sampling - take all samples for the patient
Definition bootstrap.h:157
float sample_ratio
the sample ratio of the patients out of all patients in each bootstrap
Definition bootstrap.h:156
void fetch_selection_external(vector< int > &indexes) const
external function to fetch indexes
Definition bootstrap.cpp:477
Parameter object for Multiclass measure functions.
Definition bootstrap.h:348
int n_categ
Number of categories.
Definition bootstrap.h:351
vector< int > top_n
when looking on top predictions, this is the maximal index
Definition bootstrap.h:350
string dist_name
dist(i,j)
Definition bootstrap.h:354
int init(map< string, string > &map)
Virtual to init object from parsed fields.
Definition bootstrap.cpp:3387
vector< float > dist_weights
Vector of weights - for index i : dist_weights[i] = 1/sum(dist(i,k))
Definition bootstrap.h:352
Parameter object for calc_roc_measures functions.
Definition bootstrap.h:294
int min_score_quants_to_force_score_wp
The minimal count of unique score to force fetching scores only by score cutoffs.
Definition bootstrap.h:308
int score_bins
score bin count for speed up calculation. 0 means no binning
Definition bootstrap.h:304
bool use_score_working_points
If true will calculate all roc measurements based on scores working points.
Definition bootstrap.h:302
vector< float > working_point_Score
The score working point definition.
Definition bootstrap.h:301
vector< int > working_point_TOPN
The Top N working points.
Definition bootstrap.h:297
double incidence_fix
The final incidence calculation on the cohort (will be calculated)
Definition bootstrap.h:339
float score_resolution
score resolution to control binning for speeding up calculation. 0 means no binning resolution
Definition bootstrap.h:306
int init(map< string, string > &map)
Initializing each parameter from string in format: "parameter_name=value;...".
Definition bootstrap.cpp:3313
vector< float > working_point_PR
The Positive rate working point definition.
Definition bootstrap.h:299
Incident_Stats inc_stats
the incidence data, if provided, for the general population. See Incident_Stats for more info
Definition bootstrap.h:309
bool fix_label_to_binary
If True will change label value to be binary 0,1 (default is True)
Definition bootstrap.h:307
vector< float > working_point_SENS
The True Positive rate working point definition.
Definition bootstrap.h:298
float max_diff_working_point
The maximal diff in calculated working point to requested working point to drop.
Definition bootstrap.h:303
int score_min_samples
score bin min sample count for speed up calculation. 0 means no limit
Definition bootstrap.h:305
vector< float > working_point_auc
The partial auc working points definition.
Definition bootstrap.h:300
ROC_Params()
Default Ctor.
Definition bootstrap.h:314
vector< float > working_point_FPR
The False Positive rate working point definition.
Definition bootstrap.h:296
Parameter object for Regression measure functions.
Definition bootstrap.h:376
int init(map< string, string > &mapper)
Initializing each parameter from string in format: "parameter_name=value;...".
Definition bootstrap.cpp:3454
Definition SerializableObject.h:32
@ string
string value
Definition StdDeque.h:58
Definition bootstrap.h:444