Medial Code Documentation
Loading...
Searching...
No Matches
MedStat.h
1//
2// MedStat - Statistics utilities
3//
4
5#ifndef _MED_STAT_H_
6#define _MED_STAT_H_
7
8#include <stdlib.h>
9#include <stdarg.h>
10#include <stdio.h>
11
12#include "assert.h"
13#include "math.h"
14
15#include <vector>
16#include <map>
17#include <string>
18#include <algorithm>
19#include <cstring>
20
21#include "string.h"
22#include <MedMat/MedMat/MedMat.h>
23#include <bitset>
24
// Default constants: -1 as the missing-value marker, and -1e9 / 1e9 as the minimum / maximum
// trim limits. NOTE(review): none of the declarations visible in this header use these macros
// as default arguments (get_best_rounding spells out a literal -1) - confirm usage at call sites.
25#define MED_DEFAULT_MISSING_VALUE -1
26#define MED_DEFAULT_MIN_TRIM -1e9
27#define MED_DEFAULT_MAX_TRIM 1e9
28
29using namespace std;
30
31// General-purpose routines for computing statistics over two vectors (usually scores and labels)
32
33namespace medial {
34 namespace performance {
// Correlation declarations (definitions are not in this header).
// Each metric is declared twice: the `*_without_cleaning` form takes the two vectors as-is, and
// the other form additionally takes a missing-value marker and an int out-parameter `n`.
// NOTE(review): presumably entries equal to the marker are dropped and `n` receives the number of
// samples actually used - confirm in the implementation.
// `weights` is optional in every overload (NULL by default).
37 template <typename T> float pearson_corr_without_cleaning(const vector<T> &v1, const vector<T> &v2, const vector<float> *weights = NULL);
38 template <typename T> float pearson_corr(const vector<T> &v1, const vector<T> &v2, T missing_value, int& n, const vector<float> *weights = NULL);
39
// Unlike the Pearson pair, the `_without_cleaning` Spearman form accepts two different element
// types (T and S); the missing-value form requires both vectors to share the type T.
42 template <typename T, typename S> float spearman_corr_without_cleaning(const vector<T> &v1, const vector<S> &v2, const vector<float> *weights = NULL);
43 template <typename T> float spearman_corr(const vector<T> &v1, const vector<T> &v2, T missing_val, int &n, const vector<float> *weights = NULL);
44
// Error measures between a prediction vector `preds` and a target vector `y` (declarations only).
// All three come as a pair: a `*_without_cleaning` form, and a form that also takes a
// missing-value marker `missing_val` plus an int out-parameter `n`.
// NOTE(review): presumably `n` reports how many samples survived the missing-value filter -
// confirm in the implementation. `weights` is optional (NULL by default).
47 template <typename T>float rmse_without_cleaning(const vector<T> &preds, const vector<T> &y, const vector<float> *weights = NULL);
48 template <typename T>float rmse(const vector<T> &preds, const vector<T> &y, T missing_val, int &n, const vector<float> *weights = NULL);
49
52 template <typename T>float L1_dist_without_cleaning(const vector<T> &preds, const vector<T> &y, const vector<float> *weights = NULL);
53 template <typename T>float L1_dist(const vector<T> &preds, const vector<T> &y, T missing_val, int &n, const vector<float> *weights = NULL);
54
// NOTE(review): the normalisation used by the "relative" variant is not visible here - check the
// definition before relying on it (e.g. behaviour when a target value is 0).
57 template <typename T>float relative_L1_dist_without_cleaning(const vector<T> &preds, const vector<T> &y, const vector<float> *weights = NULL);
58 template <typename T>float relative_L1_dist(const vector<T> &preds, const vector<T> &y, T missing_val, int &n, const vector<float> *weights = NULL);
59
// Kendall tau declarations (definitions are not in this header). These return double, unlike the
// float-returning Pearson/Spearman declarations in this namespace.
// The first pair takes two flags, is01Vec1 / is01Vec2 (both false by default), and no weights.
// NOTE(review): the flag names suggest "this vector holds only 0/1 values" - confirm.
63 template <typename T, typename S> double kendall_tau_without_cleaning(const vector<T> &v1, const vector<S> &v2, bool is01Vec1 = false, bool is01Vec2 = false);
64 template <typename T> double kendall_tau(const vector<T> &v1, const vector<T> &v2, T missing_value, int &n, bool is01Vec1 = false, bool is01Vec2 = false);
65
// The `_q` pair drops the 0/1 flags and accepts optional weights instead. Its missing-value form
// takes a separate marker per vector (missing_val1 of type T, missing_val2 of type S).
// NOTE(review): `_q` presumably stands for a quicker algorithm - confirm in the implementation.
68 template <typename T, typename S> double kendall_tau_without_cleaning_q(const vector<T> &v1, const vector<S> &v2, const vector<float> *weights = NULL);
69 template <typename T, typename S> double kendall_tau_q(const vector<T> &v1, const vector<S> &v2, T missing_val1, S missing_val2, int& n,
70 const vector<float> *weights = NULL);
71
// Three overloads of mutual_information (declarations only):
//  - from two float vectors, with `n` as an int out-parameter;
//  - from precomputed count vectors, with `n` passed by value;
//  - from two int vectors, with `n` as an int out-parameter.
// NOTE(review): in the counts overload `coCounts` is taken by value (copied on every call) while
// xCounts / yCounts are non-const references; changing that requires touching the definition too.
74 float mutual_information(const vector<float>& x, const vector<float>& y, int &n);
77 float mutual_information(vector<int>& xCounts, vector<int>& yCounts, vector<int> coCounts, int n);
80 float mutual_information(vector<int>& x, vector<int>& y, int &n);
81
// AUC declarations (definitions are not in this header). `auc` takes non-const references and no
// weights; `auc_q` takes const references and optional weights (NULL by default).
// NOTE(review): whether `auc` reorders or otherwise modifies its arguments cannot be told from
// the declaration - check before passing vectors that must stay intact.
84 template<typename T> float auc(vector<T> &preds, vector<float> &y);
87 template<typename T> float auc_q(const vector<T> &preds, const vector<float> &y, const vector<float>* weights = NULL);
88
// get_preds_perf_cnts writes into `cnts` (a vector of int vectors); cnts_to_perf takes one int
// vector and writes four float out-parameters.
// NOTE(review): presumably sens/spec/ppv are sensitivity, specificity and positive predictive
// value; the meaning of `rr`, of `size` and of `direction` should be confirmed in the definition.
92 template<typename T> void get_preds_perf_cnts(vector<T> &preds, vector<float> &y, vector<float> &size, int direction, vector<vector<int>> &cnts);
95 void cnts_to_perf(vector<int> &cnt, float &sens, float &spec, float &ppv, float &rr);
96
// Accuracy declarations: predictions of type T are compared against float labels `y`, with
// optional weights (NULL by default). approx_accuracy additionally takes a tolerance `epsilon`.
// NOTE(review): presumably a prediction counts as correct when it is within `epsilon` of the
// label - confirm the exact comparison (strict vs. non-strict) in the definition.
99 template <typename T> float accuracy(const vector<T> &preds, const vector<float> &y, const vector<float> *weights = NULL);
100 template <typename T> float approx_accuracy(const vector<T> &preds, const vector<float> &y, T epsilon, const vector<float> *weights = NULL);
101
102 // Functions for distance correlation
// get_dMatrix fills a MedMat<T> from a value vector and a missing-value marker; get_dVar and
// get_dCov take such matrices (by non-const reference) and return a float.
// NOTE(review): presumably these are the distance matrix, distance variance and distance
// covariance of the distance-correlation statistic - confirm against the definitions.
104 template <typename T> void get_dMatrix(vector<T>& values, MedMat<T>& dMatrix, T missing_value);
106 template <typename T> float get_dVar(MedMat<T>& dMatrix);
108 template <typename T> float get_dCov(MedMat<T>& xDistMat, MedMat<T>& yDistMat);
109
// Multi-category prediction helpers (declarations only). All take a flat `probs` vector together
// with `nsamples` and `ncateg`, and report their results through reference out-parameters.
// NOTE(review): presumably `probs` holds nsamples * ncateg entries; the memory layout
// (sample-major vs. category-major) is not visible here - confirm in the definition.
// Note that max_pred is always vector<float>, whereas avg_pred uses the element type T.
112 template <typename T> void multicateg_get_max_pred(vector<T> &probs, int nsamples, int ncateg, vector<float> &max_pred);
114 template <typename T> void multicateg_get_avg_pred(vector<T> &probs, int nsamples, int ncateg, vector<T> &avg_pred);
116 template <typename T> void multicateg_get_error_rate(vector<T> &probs, vector<float> &y, int nsamples, int ncateg, float &err_rate, T &rms, T &avg_rms);
117
// Quantized breakdown: get_quantized_breakdown writes an int matrix `counts` from preds, y and
// `bounds`; print_quantized_breakdown takes such a matrix together with the bounds.
// NOTE(review): presumably counts(i, j) tallies samples by the bins of preds and y - confirm.
119 template <typename T> void get_quantized_breakdown(vector<T> &preds, vector<T> &y, vector<T> &bounds, MedMat<int> &counts);
121 template <typename T> void print_quantized_breakdown(MedMat<int> &cnt, vector<T> &bounds);
// Calibration measure over predicted probabilities and float labels; weights are optional.
123 template <typename T> double integrated_calibration_index(const vector<T> &predicted_prob, const vector<float> &y_label, const vector<float>* weights = NULL);
// Jaccard helpers operating on two ints.
// NOTE(review): presumably the ints are treated as bit sets (<bitset> is included by this
// header), and get_jaccard_matrix fills pairwise distances for values below `n` - confirm.
125 float jaccard_similarity(int val1, int val2);
127 float jaccard_distance(int val1, int val2);
129 void get_jaccard_matrix(int n, vector<vector<float>>& jaccard_dist);
130 }
131
132 namespace stats {
// Chi-square declarations taking a flat `cnts` vector plus the dimensions `n` and `m`.
// The second overload additionally takes a vector<double> `exp` by non-const reference.
// NOTE(review): presumably `cnts` holds n * m cells and `exp` receives the expected counts;
// the cell ordering is not visible here - confirm in the definition.
135 double chi2_n_x_m(vector<int> &cnts, int n, int m);
138 double chi2_n_x_m(vector<int> &cnts, int n, int m, vector<double> &exp);
139
140 // Moments
// Mean declarations: both return double regardless of T. The second form also takes a
// missing-value marker and an int out-parameter `n`; weights are optional (NULL by default).
// NOTE(review): presumably `n` receives the number of non-missing samples - confirm.
143 template <typename T> double mean_without_cleaning(const vector<T> &v1, const vector<float> *weights = NULL);
144 template <typename T> double mean(const vector<T> &v1, T missing_value, int& n, const vector<float> *weights = NULL);
145
// Standard-deviation declarations: the caller supplies the mean, typed as T (not double), so a
// double mean is converted to T before it gets here.
// NOTE(review): the function is literally named `std`; call it qualified
// (medial::stats::std) to keep it distinct from namespace std.
149 template <typename T> double std_without_cleaning(const vector<T> &v, T mean, const vector<float> *weights = NULL);
150 template <typename T> double std(const vector<T> &v, T mean, T missing_value, int& n, const vector<float> *weights = NULL);
151
153 template<typename T> void get_mean_and_std_without_cleaning(const vector<T> &v, T& mean, T& std, const vector<float> *weights = NULL) {
154 mean = medial::stats::mean_without_cleaning(v, weights); std = medial::stats::std_without_cleaning(v, mean, weights);
155 }
156 template<typename T> void get_mean_and_std(const vector<T> &v, T missing_value, int& n, T& mean, T& std, const vector<float> *weights = NULL) {
157 mean = medial::stats::mean(v, missing_value, n, weights); std = medial::stats::std(v, mean, missing_value, n, weights);
158 }
159
// Writes a transformed copy of `v` into `z`. Two separate markers are taken: one for the input
// (missing_value_in_v) and one for the output (missing_value_z).
// NOTE(review): presumably z = (v - mean) / std, with missing inputs mapped to missing_value_z -
// confirm in the definition.
160 template<typename T> void get_z_transform(const vector<T> &v, T missing_value_in_v, T missing_value_z, vector<T> &z);
161
// Raw-pointer overload of get_mean_and_std working on `size` floats with an optional-looking
// weight array `wgts`; results are returned through `mean`, `sd` and `n`.
// NOTE(review): presumably `do_missing` switches missing-value filtering on, and `wgts` may be
// NULL - neither is visible from the declaration, so confirm before relying on it.
163 void get_mean_and_std(float *values, const float* wgts, int size, float missing_value, float& mean, float&sd, int& n, bool do_missing);
164
// Median declarations. Both take `v` by non-const reference; the `_without_cleaning` form has an
// `in_place` flag (false by default).
// NOTE(review): presumably in_place = true lets the function reorder `v` instead of working on a
// copy - confirm. Whether median() may reorder `v` is not visible from the declaration.
167 template<typename T> T median_without_cleaning(vector<T>& v, bool in_place = false);
168 template<typename T> T median(vector<T>& v, T missing_value, int& n);
169
// Most-common-value declarations, in the same with/without missing-value pairing.
172 template<typename T> T most_common_without_cleaning(vector<T> &v);
173 template<typename T> T most_common(vector<T>& v, T missing_value, int& n);
174
// Histogram declarations: `hist` is an output vector of (value, float) pairs, and
// sample_from_histogram returns a T given such a vector.
// NOTE(review): whether the float is a raw count or a normalised frequency, and which random
// source the sampler uses, cannot be told from here - confirm in the definitions.
176 template<typename T> void get_histogram_without_cleaning(vector<T>& v, vector<pair<T, float> >& hist, bool in_place = false);
177 template<typename T> void get_histogram(vector<T>& v, T missing_value, int& n, vector<pair<T, float> >& hist);
178 template<typename T> T sample_from_histogram(vector<pair<T, float> >& hist);
179
180
// get_percentiles writes into `out_pvals`, given values `vals` and a vector `p`.
// NOTE(review): `only_positive_flag` is declared int but defaulted with the bool literal `false`
// (i.e. 0). Whether `p` is expressed in [0,1] or [0,100] is not visible here - confirm.
183 template<class T> void get_percentiles(vector<T> &vals, vector<float> &p, vector<T> &out_pvals, int only_positive_flag = false);
// get_quantile takes both `vals` and `w` by value, so each call copies the two vectors.
186 template<typename T> T get_quantile(vector<T> vals, vector<float> w, float q);
// get_best_rounding defaults missing_value to the literal -1 (not the MED_DEFAULT_MISSING_VALUE macro).
188 float get_best_rounding(vector<float>& vals, vector<float>& res, vector<int>& counts, float missing_value = -1);
189
// Chi-square over four cell values (two groups x controls/cases), with two optional tuning
// parameters that both default to 0.
// NOTE(review): the exact effect of smooth_balls and allowed_error is not visible here - confirm.
190 double chi_square_table(double grp1_cntrl, double grp1_cases, double grp2_cntrl, double grp2_cases,
191 int smooth_balls = 0, float allowed_error = 0);
192
// Three t-test declarations sharing one signature: two const sample vectors in, and three double
// out-parameters (t_value, degree_of_freedom, p_value). Nothing is returned directly.
// NOTE(review): the variance assumptions of each variant, and whether the p-value is one- or
// two-sided, are not visible from the declarations - confirm in the definitions.
194 template<typename T> void t_test(const vector<T> &grp1, const vector<T> &grp2, double &t_value, double &degree_of_freedom, double &p_value);
195
197 template<typename T> void t_test_unequal_sample_size(const vector<T> &grp1, const vector<T> &grp2, double &t_value, double &degree_of_freedom, double &p_value);
198
200 template<typename T> void welch_t_test(const vector<T> &grp1, const vector<T> &grp2, double &t_value, double &degree_of_freedom, double &p_value);
201
// KL_divergence takes two vectors p and q and a T-typed `epsilon`.
// NOTE(review): the default 1e-8 is converted to T, so for an integral T it becomes 0; pass an
// explicit epsilon (or use a floating-point T) if a non-zero value is required.
203 template<typename T> double KL_divergence(const vector<T>& p, const vector<T>& q, T epsilon = 1e-8);
204 }
205}
206
207#endif
Definition MedMat.h:63
Copyright 2015-2023 by XGBoost Contributors.
medial namespace for function
Definition InfraMed.h:667
Definition StdDeque.h:58