MR_LIBS/MedStat_8h_source.html

//

// MedStat - Statistics utilities

//


#ifndef _MED_STAT_H_

#define _MED_STAT_H_


#include <stdlib.h>

#include <stdarg.h>

#include <stdio.h>


#include "assert.h"

#include "math.h"


#include <vector>

#include <map>

#include <string>

#include <algorithm>

#include <cstring>


#include "string.h"

#include <MedMat/MedMat/MedMat.h>

#include <bitset>


// Library-wide default constants. Their consumers are not visible in this header, so the
// descriptions below are inferred from the macro names - confirm against the call sites.

// Presumably the default sentinel marking a missing value.
#define MED_DEFAULT_MISSING_VALUE       -1

// Presumably the default lower bound used when trimming values.
#define MED_DEFAULT_MIN_TRIM            -1e9

// Presumably the default upper bound used when trimming values.
#define MED_DEFAULT_MAX_TRIM             1e9


using namespace std;


// General-purpose routines for computing various statistics, mostly over two vectors (usually scores and labels)


namespace medial {

    namespace performance {

        // Pearson / Spearman correlation between v1 and v2. Only declarations are visible in this
        // header; the semantics noted here are inferred from names and signatures - TODO confirm
        // against the definitions.
        // weights is optional in every variant (defaults to NULL).

        // Pearson correlation; presumably uses every element as-is (no missing-value filtering).
        template <typename T> float pearson_corr_without_cleaning(const vector<T> &v1, const vector<T> &v2, const vector<float> *weights = NULL);

        // Pearson correlation taking a missing_value marker; n is an output reference, presumably
        // the number of entries that were actually used - TODO confirm.
        template <typename T> float pearson_corr(const vector<T> &v1, const vector<T> &v2, T missing_value, int& n, const vector<float> *weights = NULL);


        // Spearman correlation without cleaning. Unlike the other three declarations in this group,
        // v1 and v2 may have different element types (T and S).
        template <typename T, typename S> float spearman_corr_without_cleaning(const vector<T> &v1, const vector<S> &v2, const vector<float> *weights = NULL);

        // Spearman correlation taking a missing_val marker and an output count n (see the note on
        // pearson_corr - same presumed convention).
        template <typename T> float spearman_corr(const vector<T> &v1, const vector<T> &v2, T missing_val, int &n, const vector<float> *weights = NULL);


        // Error measures between predictions (preds) and targets (y). Only declarations are visible
        // here; the meaning of each measure is inferred from its name - TODO confirm against the
        // definitions. In every pair, the second overload adds a missing_val marker and an output
        // reference n (presumably the number of entries used). weights is optional (defaults to NULL).

        // Presumably the root-mean-square error between preds and y.
        template <typename T>float rmse_without_cleaning(const vector<T> &preds, const vector<T> &y, const vector<float> *weights = NULL);

        template <typename T>float rmse(const vector<T> &preds, const vector<T> &y, T missing_val, int &n, const vector<float> *weights = NULL);


        // Presumably an L1 (absolute-difference) distance between preds and y.
        template <typename T>float L1_dist_without_cleaning(const vector<T> &preds, const vector<T> &y, const vector<float> *weights = NULL);

        template <typename T>float L1_dist(const vector<T> &preds, const vector<T> &y, T missing_val, int &n, const vector<float> *weights = NULL);


        // Presumably an L1 distance taken relative to some reference magnitude; the exact
        // normalization is not visible in this header.
        template <typename T>float relative_L1_dist_without_cleaning(const vector<T> &preds, const vector<T> &y, const vector<float> *weights = NULL);

        template <typename T>float relative_L1_dist(const vector<T> &preds, const vector<T> &y, T missing_val, int &n, const vector<float> *weights = NULL);


        // Kendall tau between v1 and v2 (declarations only; semantics inferred from names - TODO
        // confirm against the definitions). All variants return double.

        // Variant without missing-value handling. is01Vec1 / is01Vec2 default to false; presumably
        // they flag that the corresponding vector holds only 0/1 values - TODO confirm.
        template <typename T, typename S> double kendall_tau_without_cleaning(const vector<T> &v1, const vector<S> &v2, bool is01Vec1 = false, bool is01Vec2 = false);

        // Variant taking a single missing_value marker (both vectors share type T) and an output
        // reference n, presumably the number of entries used.
        template <typename T> double kendall_tau(const vector<T> &v1, const vector<T> &v2, T missing_value, int &n, bool is01Vec1 = false, bool is01Vec2 = false);


        // "_q" variants: these accept optional weights (default NULL) instead of the is01 flags.
        // What "q" stands for is not visible here (possibly "quick") - TODO confirm.
        template <typename T, typename S> double kendall_tau_without_cleaning_q(const vector<T> &v1, const vector<S> &v2, const vector<float> *weights = NULL);

        // "_q" variant with a separate missing-value marker per vector (missing_val1 for v1,
        // missing_val2 for v2) and an output reference n.
        template <typename T, typename S> double kendall_tau_q(const vector<T> &v1, const vector<S> &v2, T missing_val1, S missing_val2, int& n,

            const vector<float> *weights = NULL);


        // Mutual-information overloads (declarations only; semantics inferred from names - TODO
        // confirm against the definitions).

        // Float-valued inputs; n is a non-const reference, presumably an output count.
        float mutual_information(const vector<float>& x, const vector<float>& y, int &n);

        // Overload working from counts; presumably xCounts / yCounts are marginal counts and
        // coCounts the joint counts, with n the total - TODO confirm layout.
        // NOTE(review): coCounts is taken by value (a copy per call) while xCounts / yCounts are
        // non-const references - confirm this asymmetry is intended.
        float mutual_information(vector<int>& xCounts, vector<int>& yCounts, vector<int> coCounts, int n);

        // Integer-valued inputs; x and y are non-const references, n is a non-const reference.
        float mutual_information(vector<int>& x, vector<int>& y, int &n);


        // Classification performance measures for predictions (preds) against labels (y).
        // Declarations only; semantics are inferred from names - TODO confirm against the definitions.

        // Presumably the area under the ROC curve. Takes preds and y by non-const reference.
        template<typename T> float auc(vector<T> &preds, vector<float> &y);

        // Variant taking const inputs and optional weights (default NULL); the meaning of the "_q"
        // suffix is not visible here.
        template<typename T> float auc_q(const vector<T> &preds, const vector<float> &y, const vector<float>* weights = NULL);


        // Fills cnts (a vector of int vectors) from preds / y; size and direction steer the
        // computation in a way not visible in this header - TODO document from the definition.
        template<typename T> void get_preds_perf_cnts(vector<T> &preds, vector<float> &y, vector<float> &size, int direction, vector<vector<int>> &cnts);

        // Converts one count vector into four output references: sens, spec, ppv and rr
        // (presumably sensitivity, specificity, positive predictive value and relative risk -
        // the expected layout of cnt is not visible here).
        void cnts_to_perf(vector<int> &cnt, float &sens, float &spec, float &ppv, float &rr);


        // Presumably the fraction of predictions matching y; weights is optional (default NULL).
        template <typename T> float accuracy(const vector<T> &preds, const vector<float> &y, const vector<float> *weights = NULL);

        // Like accuracy but with an extra epsilon parameter, presumably a tolerance within which a
        // prediction counts as correct - TODO confirm.
        template <typename T> float approx_accuracy(const vector<T> &preds, const vector<float> &y, T epsilon, const vector<float> *weights = NULL);


        // Functions for distance correlation
        // (declarations only; the notes below are inferred from names - TODO confirm against the
        // definitions).

        // Fills dMatrix from values; presumably a pairwise distance matrix, with missing_value
        // marking entries to be treated as missing.
        template <typename T> void get_dMatrix(vector<T>& values, MedMat<T>& dMatrix, T missing_value);

        // Presumably the distance variance computed from a single distance matrix.
        template <typename T> float get_dVar(MedMat<T>& dMatrix);

        // Presumably the distance covariance computed from two distance matrices.
        template <typename T> float get_dCov(MedMat<T>& xDistMat, MedMat<T>& yDistMat);


        // Helpers for multi-category predictions. probs is accompanied by nsamples and ncateg,
        // so it presumably holds nsamples * ncateg values in a flat layout - the exact ordering is
        // not visible here, TODO confirm against the definitions.

        // Writes one value per sample into max_pred (always vector<float>, regardless of T);
        // presumably the category with the highest probability.
        template <typename T> void multicateg_get_max_pred(vector<T> &probs, int nsamples, int ncateg, vector<float> &max_pred);

        // Writes into avg_pred (vector<T>); presumably a probability-weighted average category.
        template <typename T> void multicateg_get_avg_pred(vector<T> &probs, int nsamples, int ncateg, vector<T> &avg_pred);

        // Compares probs against labels y and reports three outputs by reference:
        // err_rate (float), rms and avg_rms (both T). Their precise definitions are not visible here.
        template <typename T> void multicateg_get_error_rate(vector<T> &probs, vector<float> &y, int nsamples, int ncateg, float &err_rate, T &rms, T &avg_rms);


        // Declarations only; the notes below are inferred from names and signatures - TODO confirm
        // against the definitions.

        // Fills the integer matrix counts from preds and y using bounds; presumably a
        // cross-tabulation of both vectors after binning them by the given bounds.
        template <typename T> void get_quantized_breakdown(vector<T> &preds, vector<T> &y, vector<T> &bounds, MedMat<int> &counts);

        // Presumably prints a count matrix such as the one filled by get_quantized_breakdown,
        // labelled by bounds.
        template <typename T> void print_quantized_breakdown(MedMat<int> &cnt, vector<T> &bounds);

        // Presumably the integrated calibration index (ICI) of predicted probabilities against
        // labels; weights is optional (default NULL). Returns double.
        template <typename T> double integrated_calibration_index(const vector<T> &predicted_prob, const vector<float> &y_label, const vector<float>* weights = NULL);

        // Jaccard similarity of two ints; presumably val1 / val2 are interpreted as bit sets
        // (this header includes <bitset>) - TODO confirm.
        float jaccard_similarity(int val1, int val2);

        // Jaccard distance of two ints; presumably 1 - jaccard_similarity - TODO confirm.
        float jaccard_distance(int val1, int val2);

        // Fills jaccard_dist (vector of float vectors) for a problem of size n; the exact meaning
        // of n and the matrix dimensions are not visible here.
        void get_jaccard_matrix(int n, vector<vector<float>>& jaccard_dist);

    }


    namespace stats {

        // Declarations only; the notes below are inferred from names and signatures - TODO confirm
        // against the definitions.

        // Presumably the chi-square statistic of an n x m contingency table stored flat in cnts
        // (row/column ordering not visible here).
        double chi2_n_x_m(vector<int> &cnts, int n, int m);

        // Same, with an additional non-const reference exp - presumably receiving the expected
        // counts (or supplying them; direction not visible here).
        double chi2_n_x_m(vector<int> &cnts, int n, int m, vector<double> &exp);


        // Moments
        // In each pair, the *_without_cleaning variant has no missing-value parameter, while the
        // other takes missing_value plus an output reference n (presumably the number of entries
        // used). weights is optional (default NULL). All four return double.

        template <typename T> double mean_without_cleaning(const vector<T> &v1, const vector<float> *weights = NULL);

        template <typename T> double mean(const vector<T> &v1, T missing_value, int& n, const vector<float> *weights = NULL);


        // The standard-deviation functions take the mean as an input (of type T) rather than
        // computing it themselves.
        template <typename T> double std_without_cleaning(const vector<T> &v, T mean, const vector<float> *weights = NULL);

        template <typename T> double std(const vector<T> &v, T mean, T missing_value, int& n, const vector<float> *weights = NULL);


        // Computes the mean and standard deviation of v in one call, without any
        // missing-value handling.
        //   v       - input values
        //   mean    - output: result of mean_without_cleaning(v, weights), converted to T
        //   std     - output: result of std_without_cleaning(v, mean, weights), converted to T
        //   weights - optional weights forwarded to both helpers (defaults to NULL)
        template<typename T>
        void get_mean_and_std_without_cleaning(const vector<T> &v, T& mean, T& std, const vector<float> *weights = NULL) {
            // The mean must be stored first: the standard-deviation helper receives it as input.
            mean = medial::stats::mean_without_cleaning(v, weights);
            std = medial::stats::std_without_cleaning(v, mean, weights);
        }

        // Computes the mean and standard deviation of v in one call, forwarding a
        // missing-value marker to the underlying helpers.
        //   v             - input values
        //   missing_value - marker forwarded to mean() and std()
        //   n             - passed by reference to both helpers; on return it holds whatever
        //                   the std() call left in it
        //   mean          - output: result of medial::stats::mean(...), converted to T
        //   std           - output: result of medial::stats::std(...), converted to T
        //   weights       - optional weights forwarded to both helpers (defaults to NULL)
        template<typename T>
        void get_mean_and_std(const vector<T> &v, T missing_value, int& n, T& mean, T& std, const vector<float> *weights = NULL) {
            // The parameters named mean and std hide the functions of the same name inside this
            // body, so both calls have to be fully qualified.
            mean = medial::stats::mean(v, missing_value, n, weights);
            // The freshly stored mean is an input of the standard-deviation computation.
            std = medial::stats::std(v, mean, missing_value, n, weights);
        }


        // Declarations only; the notes in this group are inferred from names and signatures -
        // TODO confirm against the definitions.

        // Writes a transformed copy of v into z; presumably (x - mean) / std per element, with
        // missing_value_in_v marking missing inputs and missing_value_z used for them in the output.
        template<typename T> void get_z_transform(const vector<T> &v, T missing_value_in_v, T missing_value_z, vector<T> &z);


        // Raw-array overload: values / wgts point to size elements (presumably; wgts may be allowed
        // to be null - not visible here). mean, sd and n are outputs by reference; do_missing
        // presumably switches missing_value filtering on or off.
        void get_mean_and_std(float *values, const float* wgts, int size, float missing_value, float& mean, float&sd, int& n, bool do_missing);


        // Median of v. v is a non-const reference; in_place (default false) presumably allows the
        // function to reorder v instead of working on a copy - TODO confirm.
        template<typename T> T median_without_cleaning(vector<T>& v, bool in_place = false);

        // Median with a missing_value marker and an output reference n.
        template<typename T> T median(vector<T>& v, T missing_value, int& n);


        // Presumably the most frequent value in v.
        template<typename T> T most_common_without_cleaning(vector<T> &v);

        // Same, with a missing_value marker and an output reference n.
        template<typename T> T most_common(vector<T>& v, T missing_value, int& n);


        // Fills hist with (value, float) pairs; whether the float is a count or a relative
        // frequency is not visible here. in_place defaults to false.
        template<typename T> void get_histogram_without_cleaning(vector<T>& v, vector<pair<T, float> >& hist, bool in_place = false);

        // Same, with a missing_value marker and an output reference n.
        template<typename T> void get_histogram(vector<T>& v, T missing_value, int& n, vector<pair<T, float> >& hist);

        // Presumably draws a random value according to the weights stored in hist.
        template<typename T> T sample_from_histogram(vector<pair<T, float> >& hist);


        // Writes into out_pvals one value per requested percentile in p (presumably).
        // NOTE(review): only_positive_flag is declared int but defaulted with the bool literal
        // false (i.e. 0); its effect is not visible here.
        template<class T> void get_percentiles(vector<T> &vals, vector<float> &p, vector<T> &out_pvals, int only_positive_flag = false);

        // Presumably the weighted q-quantile of vals with weights w.
        // NOTE(review): vals and w are taken by value, so both vectors are copied on each call -
        // possibly intentional if the implementation reorders them; confirm.
        template<typename T> T get_quantile(vector<T> vals, vector<float> w, float q);

        // Returns a float and fills res and counts from vals; the exact outputs are not visible
        // here. missing_value defaults to -1.
        float get_best_rounding(vector<float>& vals, vector<float>& res, vector<int>& counts, float missing_value = -1);


        // Hypothesis tests and divergence measures. Declarations only; the notes below are inferred
        // from names and signatures - TODO confirm against the definitions.

        // Presumably a chi-square statistic for a 2x2 table of controls / cases in two groups.
        // smooth_balls (default 0) and allowed_error (default 0) tune the computation in a way not
        // visible in this header.
        double chi_square_table(double grp1_cntrl, double grp1_cases, double grp2_cntrl, double grp2_cases,

            int smooth_balls = 0, float allowed_error = 0);


        // The three t-test declarations below share one interface: two input groups and three
        // outputs by reference (t_value, degree_of_freedom, p_value).

        // Presumably Student's t-test for two groups (equal sample sizes, judging by the sibling
        // declaration below).
        template<typename T> void t_test(const vector<T> &grp1, const vector<T> &grp2, double &t_value, double &degree_of_freedom, double &p_value);


        // Presumably the two-sample t-test variant for groups of different sizes.
        template<typename T> void t_test_unequal_sample_size(const vector<T> &grp1, const vector<T> &grp2, double &t_value, double &degree_of_freedom, double &p_value);


        // Presumably Welch's t-test (unequal variances).
        template<typename T> void welch_t_test(const vector<T> &grp1, const vector<T> &grp2, double &t_value, double &degree_of_freedom, double &p_value);


        // Presumably the Kullback-Leibler divergence between p and q. epsilon defaults to 1e-8
        // (converted to T); it is presumably a guard against zero probabilities - TODO confirm.
        template<typename T> double KL_divergence(const vector<T>& p, const vector<T>& q, T epsilon = 1e-8);

    }

}


#endif

MedMat
Definition MedMat.h:63

math.h
Copyright 2015-2023 by XGBoost Contributors.

medial
medial namespace for function
Definition InfraMed.h:667

std
Definition StdDeque.h:58