This is the infrastructure of bootstrap. More...

#include <vector>
#include <string>
#include <map>
#include <random>
#include <MedTime/MedTime/MedTime.h>
#include <SerializableObject/SerializableObject/SerializableObject.h>
#include <Logger/Logger/Logger.h>
#include <MedStat/MedStat/MedStat.h>

Data Structures
class	Lazy_Iterator
	A class which fetches the samples in bootstrap manner in lazy way. More...

class	Mem_Iterator
	A class which fetches the samples in bootstrap manner in memory way. More...

class	Measurement_Params
	A base class for measurements parameter. More...

class	Incident_Stats
	The Incident Object which holds the gender, age incidence stats. More...

class	ROC_Params
	Parameter object for calc_roc_measures functions. More...

class	Multiclass_Params
	Parameter object for Multiclass measure functions. More...

class	Regression_Params
	Parameter object for Regression measure functions. More...

class	Filter_Param
	Parameter object for filter_params functions. More...

struct	ROC_And_Filter_Params

Typedefs
typedef map< string, float >(*	MeasurementFunctions) (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	Function which receives Lazy_Iterator and the thread num for iterating the predictions and labels.

typedef bool(*	FilterCohortFunc) (const map< string, vector< float > > &record_info, int index, void *cohort_params)
	Function which receives a map from feature name to vector of all samples' values, the sample index and the cohort definition params, and returns true/false indicating whether to include the sample in the cohort.

typedef void(*	ProcessMeasurementParamFunc) (const map< string, vector< float > > &additional_info, const vector< float > &y, const vector< int > &pids, Measurement_Params *function_params, const vector< int > &filtered_indexes, const vector< float > &y_full, const vector< int > &pids_full)
	A function to process and manipulate function params based on the given cohort - for example, storing incidence information for the cohort.

typedef void(*	PreprocessScoresFunc) (vector< float > &preds, Measurement_Params *function_params)
	A function to preprocess the prediction scores (binning for example to speed up bootstrap).

Functions
map< string, float >	calc_npos_nneg (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	A Function to calculate only NPOS,NNEG (already calculated in calc_roc_measures_with_inc).

map< string, float >	calc_only_auc (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	A Function to calculate only AUC (already calculated in calc_roc_measures_with_inc).

map< string, float >	calc_roc_measures_with_inc (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	A Function to calculate all roc measurements- AUC, Sensitivity, specificity positive rate, ppv... Implements MeasurementFunctions signature function.

map< string, float >	calc_kandel_tau (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	A Function to calculate calc_kandel_tau Implements MeasurementFunctions signature function.

map< string, float >	calc_multi_class (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	A Function to calculate performance measurements for multicategory Implements MeasurementFunctions signature function.

map< string, float >	calc_harrell_c_statistic (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	A Function to calculate performance measurements for harrell c statistic Implements MeasurementFunctions signature function Encoding: Case/Control => effect outcome/y sign.

map< string, float >	calc_regression (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)
	A Function to calculate performance measurements for regression problems Implements MeasurementFunctions signature function Accepted Regression_Params.

bool	filter_range_param (const map< string, vector< float > > &record_info, int index, void *cohort_params)
	A function to filter samples based on single Filter_Param object.

bool	filter_range_params (const map< string, vector< float > > &record_info, int index, void *cohort_params)
	A function to filter samples based on multiple Filter_Param objects - in a vector, with an AND condition between each parameter range.

void	fix_cohort_sample_incidence (const map< string, vector< float > > &additional_info, const vector< float > &y, const vector< int > &pids, Measurement_Params *function_params, const vector< int > &filtered_indexes, const vector< float > &y_full, const vector< int > &pids_full)
	a function to calculate the incidence in each cohort - preprocessing of function_params and storing the incidence inside of it.

void	fix_cohort_sample_incidence_old (const map< string, vector< float > > &additional_info, const vector< float > &y, const vector< int > &pids, Measurement_Params *function_params, const vector< int > &filtered_indexes, const vector< float > &y_full, const vector< int > &pids_full)
	a function to calculate the incidence in each cohort - preprocessing of function_params and storing the incidence inside of it.

void	preprocess_bin_scores (vector< float > &preds, Measurement_Params *function_params)
	Binning function of scores based on ROC_Params.

string	format_working_point (const string &init_str, float wp, bool perc=true)
	Format out measurement.

string	format_working_point_topn (const string &init_str, int wp, bool perc=true)

map< string, float >	booststrap_analyze_cohort (const vector< float > &preds, const vector< int > &preds_order, const vector< float > &y, const vector< int > &pids, float sample_ratio, int sample_per_pid, int loopCnt, const vector< MeasurementFunctions > &meas_functions, const vector< Measurement_Params * > &function_params, ProcessMeasurementParamFunc process_measurments_params, const map< string, vector< float > > &additional_info, const vector< float > &y_full, const vector< int > &pids_full, const vector< float > *weights, const vector< int > &filter_indexes, FilterCohortFunc cohort_def, void *cohort_params, int &warn_cnt, const string &cohort_name, int seed=0)
	to run bootstrap on single cohort

map< string, map< string, float > >	booststrap_analyze (const vector< float > &preds, const vector< int > &preds_order, const vector< float > &y, const vector< float > *weights, const vector< int > &pids, const map< string, vector< float > > &additional_info, const map< string, FilterCohortFunc > &filter_cohort, const vector< MeasurementFunctions > &meas_functions={calc_roc_measures_with_inc}, const map< string, void * > *cohort_params=NULL, const vector< Measurement_Params * > *function_params=NULL, ProcessMeasurementParamFunc process_measurments_params=NULL, PreprocessScoresFunc preprocess_scores=NULL, Measurement_Params *preprocess_scores_params=NULL, float sample_ratio=(float) 1.0, int sample_per_pid=1, int loopCnt=500, int seed=0, bool binary_outcome=true)
	The main bootstrap function to run all bootstrap process with all the arguments.

void	prepare_for_bootstrap (const vector< int > &pids, const map< string, vector< float > > &additional_info, FilterCohortFunc &filter_cohort, void *cohort_params, float sample_ratio, int sample_per_pid, int seed, vector< int > &indexes)

void	write_bootstrap_results (const string &file_name, const map< string, map< string, float > > &all_cohorts_measurments, const string &run_id="")
	Will output the bootstrap results into file in TAB delimited format.

void	read_bootstrap_results (const string &file_name, map< string, map< string, float > > &all_cohorts_measurments)
	Will read the bootstrap results from file in TAB delimited format.

void	write_pivot_bootstrap_results (const string &file_name, const map< string, map< string, float > > &all_cohorts_measurments, const string &run_id="")
	Will output the bootstrap results into file with the new format with columns: "Cohort$Measure_Name", "Value".

void	read_pivot_bootstrap_results (const string &file_name, map< string, map< string, float > > &all_cohorts_measurments)
	Will read the bootstrap results from file with the new format with columns: "Cohort$Measure_Name", "Value".

Detailed Description

This is the infrastructure of bootstrap.

Typedef Documentation

◆ MeasurementFunctions

typedef map< string, float >(* MeasurementFunctions) (Lazy_Iterator *iterator, int thread_num, Measurement_Params *function_params)

Function which receives Lazy_Iterator and the thread num for iterating the predictions and labels.

It also receives function_params, which are additional arguments for the function (working point definitions, for example).

◆ PreprocessScoresFunc

typedef void(* PreprocessScoresFunc) (vector< float > &preds, Measurement_Params *function_params)

A function to preprocess the prediction scores (binning for example to speed up bootstrap).

The function manipulates preds based on function_params.

Function Documentation

◆ booststrap_analyze()

map< string, map< string, float > > booststrap_analyze	(	const vector< float > &	preds,
		const vector< int > &	preds_order,
		const vector< float > &	y,
		const vector< float > *	weights,
		const vector< int > &	pids,
		const map< string, vector< float > > &	additional_info,
		const map< string, FilterCohortFunc > &	filter_cohort,
		const vector< MeasurementFunctions > &	meas_functions = `{calc_roc_measures_with_inc}`,
		const map< string, void * > *	cohort_params = `NULL`,
		const vector< Measurement_Params * > *	function_params = `NULL`,
		ProcessMeasurementParamFunc	process_measurments_params = `NULL`,
		PreprocessScoresFunc	preprocess_scores = `NULL`,
		Measurement_Params *	preprocess_scores_params = `NULL`,
		float	sample_ratio = `(float) 1.0`,
		int	sample_per_pid = `1`,
		int	loopCnt = `500`,
		int	seed = `0`,
		bool	binary_outcome = `true`
	)

The main bootstrap function to run all bootstrap process with all the arguments.

Parameters

preds	vector of predictions
y	labels
pids	patient ids [used for random draws]
additional_info	Dictionary used to keep values of features [for filtering]. Key is the feature name. Val contains vector of feature values.
filter_cohort	Dictionary, where key is name of cohort, value is a function which performs the filtering [keeps only entries that belong to this cohort]
meas_functions	Vector of metrics to calculate per one bootstrap (?) experiment
cohort_params	Key: name of a cohort, Value: additional parameters which are passed to functions from filter_cohort.values
function_params	Configuration parameters passed to "meas_functions" (like 2 for F2 metric).
process_measurments_params	Function to process the function_params before running on each cohort (helps to calc incidence for example)
preprocess_scores	A function to preprocess all scores - for example binning the scores (can sometimes speedup metrics calculation)
preprocess_scores_params	Additional parameters for the preprocess_scores function
sample_ratio	A number in range (0,1] for subsampling the samples [in order to speed-up the bootstrap]
sample_per_pid	How many samples to sample on each id [sampling with replacement]
loopCnt	How many bootstrap experiments(?) to do
seed	The random seed. If 0 will use random_device to create random seed
binary_outcome	A flag to indicate whether the labels are binary (used to validate the input labels)

Returns

Returns a map from each cohort name to the measurement results. Each measurement result is also a map from each measurement name to its value

◆ calc_harrell_c_statistic()

map< string, float > calc_harrell_c_statistic	(	Lazy_Iterator *	iterator,
		int	thread_num,
		Measurement_Params *	function_params
	)

A Function to calculate performance measurements for harrell c statistic Implements MeasurementFunctions signature function Encoding: Case/Control => effect outcome/y sign.

positive is case, negative controls. Can't handle event in time zero. Time to event => abs value of outcome/y Score => the prediction

Returns: A map from measurement name to its value

◆ calc_kandel_tau()

map< string, float > calc_kandel_tau	(	Lazy_Iterator *	iterator,
		int	thread_num,
		Measurement_Params *	function_params
	)

A Function to calculate calc_kandel_tau Implements MeasurementFunctions signature function.

Returns: A map from measurement name "Kendell-Tau" to its value

◆ calc_multi_class()

map< string, float > calc_multi_class	(	Lazy_Iterator *	iterator,
		int	thread_num,
		Measurement_Params *	function_params
	)

A Function to calculate performance measurements for multicategory Implements MeasurementFunctions signature function.

Returns: A map from measurement name to its value

◆ calc_npos_nneg()

map< string, float > calc_npos_nneg	(	Lazy_Iterator *	iterator,
		int	thread_num,
		Measurement_Params *	function_params
	)

A Function to calculate only NPOS,NNEG (already calculated in calc_roc_measures_with_inc).

Implements MeasurementFunctions signature function

Returns: A map from each measurement name ("NPOS" or "NNEG") to its value

◆ calc_only_auc()

map< string, float > calc_only_auc	(	Lazy_Iterator *	iterator,
		int	thread_num,
		Measurement_Params *	function_params
	)

A Function to calculate only AUC (already calculated in calc_roc_measures_with_inc).

Implements MeasurementFunctions signature function

Returns: A map from measurement name "AUC" to its value

◆ calc_regression()

map< string, float > calc_regression	(	Lazy_Iterator *	iterator,
		int	thread_num,
		Measurement_Params *	function_params
	)

A Function to calculate performance measurements for regression problems Implements MeasurementFunctions signature function Accepted Regression_Params.

Returns: A map from measurement name to its value

< can be called False

< can be called Negative

< can be called True

< can be called Positive

Counts how much in percentage [0-100] in the data points are "covered" (the L1 error is within threshold) The threshold is determined by percentages for mean outcome (the prior)

◆ calc_roc_measures_with_inc()

map< string, float > calc_roc_measures_with_inc	(	Lazy_Iterator *	iterator,
		int	thread_num,
		Measurement_Params *	function_params
	)

A Function to calculate all roc measurements- AUC, Sensitivity, specificity positive rate, ppv...
Implements MeasurementFunctions signature function.

Returns: A map from each measurement name to its value

counts also false positives weights if no fix_label_to_binary

◆ filter_range_param()

bool filter_range_param	(	const map< string, vector< float > > &	record_info,
		int	index,
		void *	cohort_params
	)

A function to filter samples based on single Filter_Param object.

it's a FilterCohortFunc signature

◆ filter_range_params()

bool filter_range_params	(	const map< string, vector< float > > &	record_info,
		int	index,
		void *	cohort_params
	)

A function to filter samples based on multiple Filter_Param objects - in a vector, with an AND condition between each parameter range.

it's a FilterCohortFunc signature

◆ fix_cohort_sample_incidence_old()

void fix_cohort_sample_incidence_old	(	const map< string, vector< float > > &	additional_info,
		const vector< float > &	y,
		const vector< int > &	pids,
		Measurement_Params *	function_params,
		const vector< int > &	filtered_indexes,
		const vector< float > &	y_full,
		const vector< int > &	pids_full
	)

a function to calculate the incidence in each cohort - preprocessing of function_params and storing the incidence inside of it.

The old has same implementation as old bootstrap only averaging incidence over the controls in the sample based on incidence in each group(age+gender)

◆ prepare_for_bootstrap()

void prepare_for_bootstrap	(	const vector< int > &	pids,
		const map< string, vector< float > > &	additional_info,
		FilterCohortFunc &	filter_cohort,
		void *	cohort_params,
		float	sample_ratio,
		int	sample_per_pid,
		int	seed,
		vector< int > &	indexes
	)

Parameters

pids	the pids vector
additional_info	the data vector for filtering
filter_cohort	The cohorts definition - the filtering function
cohort_params	Additional parameters for the filtering cohort function
sample_ratio	A number in range (0,1] for subsampling the samples
sample_per_pid	How many samples to sample on each id
seed	The random seed. If 0 will use random_device to create random seed
indexes	the selected indexes results for the bootstrap

Returns

Returns indexes vector

◆ preprocess_bin_scores()

void preprocess_bin_scores	(	vector< float > &	preds,
		Measurement_Params *	function_params
	)

Binning function of scores based on ROC_Params.

look at score_bins,score_resolution

◆ read_bootstrap_results()

void read_bootstrap_results	(	const string &	file_name,
		map< string, map< string, float > > &	all_cohorts_measurments
	)

Will read the bootstrap results from file in TAB delimited format.

Each line is a cohort and the columns are the measurements.

◆ write_bootstrap_results()

void write_bootstrap_results	(	const string &	file_name,
		const map< string, map< string, float > > &	all_cohorts_measurments,
		const string &	run_id = `""`
	)

Will output the bootstrap results into file in TAB delimited format.