Medial Code Documentation
Loading...
Searching...
No Matches
MedMat.h
1//
2// General routines for easier manipulation of a general 2d matrix
3// To be used to hold data points, split them (train, test, cv),
4// and create features matrices.
5//
6// Besides the major MedMat class, contains also several routines
7// to handle Matrices and vectors.
8//
9
10#ifndef __MED_MAT_H__
11#define __MED_MAT_H__
12
13#ifndef _CRT_SECURE_NO_WARNINGS
14#define _CRT_SECURE_NO_WARNINGS
15#endif
16
17
18
19#include <vector>
20#include <math.h>
22#include <MedIO/MedIO/MedIO.h>
24#include <MedMat/MedMat/MedMatConstants.h>
25#include <MedUtils/MedUtils/MedGlobalRNG.h>
26
27using namespace std;
28
31public:
32 RecordData() {};
33 RecordData(int id, int date, long outcomeTime, int split, float weight, float label, float pred) {
34 this->id = id;
35 this->date = date;
36 this->outcomeTime = outcomeTime;
37 this->split = split;
38 this->weight = weight;
39 this->label = label;
40 this->pred = pred;
41 }
42 int id;
43 int date;
44 int outcomeTime;
45 int split;
46 float label;
47
48 float pred;
49 float weight;
50
51 ADD_CLASS_NAME(RecordData)
52 ADD_SERIALIZATION_FUNCS(id, date, outcomeTime, split, label, pred, weight)
53};
54
55
55// General Class for a 2d Matrix
57// The matrix can contain elements of any type <T> (typically <T> is float or double)
58// There are several basic methods to construct, load, and append to a matrix (even from a general other type <S>)
59// Also reordering, choosing a sub matrix or transposing (while keeping the transpose state) are possible.
60// One can serialize/deserialize a matrix and then use other utils to IO it, or use a direct method.
61// mat(i,j) can be used to access the (i,j) element in the matrix for read or write.
62template <class T>
63class MedMat : public SerializableObject {
64private:
65 vector<T> m;
66public:
67
68 const static int Normalize_Cols = 1;
69 const static int Normalize_Rows = 2;
70
71 // data holders (major)
72 //vector<T> m;
73 unsigned long long nrows = 0;
74 unsigned long long ncols = 0;
75 unsigned long long size() const { return (unsigned long long)nrows*ncols; }
76 vector<T> &get_vec() { return m; }
77 void copy_vec(vector<T> &c_vec) { c_vec.clear(); c_vec = m; }
78
79 // metadata holders
80 vector<int> row_ids;
81 vector<RecordData> recordsMetadata;
82 vector<string> signals;
83 //int time_unit;
84
85 // Normalization/DeNormalization
86 vector<T> avg;
87 vector<T> std;
88
89 int normalized_flag = 0; // 0 - non normalized 1 - cols normalized, 2 - rows normalized
90 int transposed_flag = 0; // 0 - as was when matrix was created/loaded, 1 - transpose of it
91
92 T missing_value;
93
94 // get/set
95 inline T operator ()(size_t i, size_t j) const { return m[i*ncols + j]; }
96
97 inline T &operator ()(size_t i, size_t j) { return m[i*ncols + j]; }
98
99 inline T get(size_t i, size_t j) const { return m[i*ncols + j]; }
100
101 inline T& set(size_t i, size_t j) { return m[i*ncols + j]; } // use var.set(i,j) = ....
102
103 inline void copy_header(MedMat<T> &other) {
104 if (this != &other) {
105 nrows = other.nrows;
106 ncols = other.ncols;
107 signals = other.signals;
108 avg = other.avg;
109 std = other.std;
110 normalized_flag = other.normalized_flag;
111 transposed_flag = other.transposed_flag;
112 missing_value = other.missing_value;
113 }
114 }
115
116 inline MedMat<T>& operator=(MedMat<T> &&other) noexcept {
117 if (this != &other) {
118 m = move(other.m);
119 nrows = other.nrows;
120 ncols = other.ncols;
121 row_ids = move(other.row_ids);
122 signals = move(other.signals);
123 avg = move(other.avg);
124 std = move(other.std);
125 normalized_flag = other.normalized_flag;
126 transposed_flag = other.transposed_flag;
127 missing_value = other.missing_value;
128 recordsMetadata = move(other.recordsMetadata);
129 }
130 return *this;
131 }
132 inline MedMat<T>& operator=(const MedMat<T> &other) noexcept {
133 if (this != &other) {
134 m = other.m;
135 nrows = other.nrows;
136 ncols = other.ncols;
137 row_ids = other.row_ids;
138 signals = other.signals;
139 avg = other.avg;
140 std = other.std;
141 normalized_flag = other.normalized_flag;
142 transposed_flag = other.transposed_flag;
143 missing_value = other.missing_value;
144 recordsMetadata = other.recordsMetadata;
145 }
146 return *this;
147 }
148
149 // init
150 MedMat() { clear(); }
151 MedMat(int n_rows, int n_cols) { clear(); nrows = n_rows; ncols = n_cols; m.resize(nrows*ncols); zero(); };
152 MedMat(const MedMat<T> &other) {
153 m = other.m;
154 nrows = other.nrows;
155 ncols = other.ncols;
156 row_ids = other.row_ids;
157 signals = other.signals;
158 avg = other.avg;
159 std = other.std;
160 normalized_flag = other.normalized_flag;
161 transposed_flag = other.transposed_flag;
162 missing_value = other.missing_value;
163 recordsMetadata = other.recordsMetadata;
164 }
165 template <class S> MedMat(S *x, int n_rows, int n_cols) { clear(); load(x, n_rows, n_cols); }
166 template <class S> MedMat(const vector<S> &x, int n_cols) { clear(); load(x, n_cols); }
167 template <class S> MedMat(MedMat<S> &x) { clear(); load(x); }
168
169 template <class S> void load(S *x, int n_rows, int n_cols);
170 template <class S> void load_transposed(S *x, int n_rows, int n_cols);
171 template <class S> void load(const vector<S> &x, int n_cols);
172 template <class S> void load(MedMat<S> &x);
173
174 void zero() { fill(m.begin(), m.end(), (T)0); }
175 void set_val(T val) { fill(m.begin(), m.end(), val); } // set all matrix to a certain value.
176
177 // basic
178 void clear() { m.clear(); row_ids.clear(); signals.clear(); recordsMetadata.clear(); nrows = 0; ncols = 0; normalized_flag = 0; transposed_flag = 0; missing_value = (T)MED_MAT_MISSING_VALUE; }
179 T *data_ptr() { if (m.size() > 0) return &m[0]; else return NULL; }
180 const T *data_ptr() const { if (m.size() > 0) return &m[0]; else return NULL; }
181 T *data_ptr(size_t r, size_t c) { if (m.size() > r*ncols + c) return &m[r*ncols + c]; else return NULL; }
182 int get_nrows() { return (int)nrows; }
183 int get_ncols() { return (int)ncols; }
184 void resize(int n_rows, int n_cols) { nrows = n_rows; ncols = n_cols; m.resize(nrows*ncols); }
185
186 // i/o from specific format files
187 int read_from_bin_file(const string &fname);
188 int write_to_bin_file(const string &fname);
189 int write_to_csv_file(const string &fname);
190 int read_from_csv_file(const string &fname, int titles_line_flag);
191 //int read_from_csv_file(const string &fname, int titles_line_flag, vector<string>& fields_out);
192
193 // serialize(), deserialize()
194 //size_t get_size();
195 //size_t serialize(unsigned char *buf);
196 //size_t deserialize(unsigned char *buf);
197 ADD_SERIALIZATION_FUNCS(m, nrows, ncols, row_ids, recordsMetadata, signals, avg, std, normalized_flag, transposed_flag, missing_value);
198
199 // simple handling options
200 void transpose();
201 void get_sub_mat(vector<int> &rows_to_take, vector<int> &cols_to_take); // empty list means - take them all
202 void get_sub_mat_by_flags(vector<int> &rows_to_take_flag, vector<int> &cols_to_take_flag);
203 void reorder_by_row(vector<int> &row_order);
204 void reorder_by_col(vector<int> &col_order);
205
206 void random_split_mat_by_ids(MedMat<T> &mat_0, MedMat<T> &mat_1, float p0, vector<int> &inds0, vector<int> &inds1);
207
208 template <class S> void add_rows(MedMat<S> &m_add);
209 template <class S> void add_rows(S *m_add, int nrows_to_add);
210 template <class S> void add_rows(vector<S> &m_add);
211 template <class S> void add_cols(MedMat<S> &m_add);
212 template <class S> void add_cols(S *m_add, int ncols_to_add); // packed as nrows x ncols_to_add
213 template <class S> void add_cols(vector<S> &m_add);
214
215 // get a row or a column to a vector
216 void get_row(int i_row, vector<T> &rowv) const;
217 void get_col(int i_col, vector<T> &colv) const;
218
219 // normalization (norm_type = 1 for cols (default), 2 for rows)
220 void normalize(int norm_type, float *wgts);
221 void normalize(int norm_type, vector<float> &wgts) { return normalize(norm_type, &wgts[0]); }
222 void normalize(int norm_type = Normalize_Cols) { return normalize(norm_type, NULL); }
223
224 template <class S> void normalize(const vector<S>& external_avg, const vector<S>& external_std, int norm_type = 1);
225
226 void get_cols_avg_std(vector<T>& _avg, vector<T>& _std);
227
228 void print_row(FILE *fout, const string &prefix, const string &format, int i_row);
229
230 void set_signals(vector<string> & sigs);
231
232 //return true iff the matrix contains only valid floating point vals (not nan/infinite)
233 //if the type of the matrix is not floating point, always returns true
234 //if output = true, output the first invalid entry encountered to cerr
235 bool is_valid(bool output = false) {
236 if (std::is_floating_point<T>::value == false)
237 return true;
238
239 for (size_t i = 0; i < nrows; i++) {
240 for (size_t j = 0; j < ncols; j++) {
241 double x = (double)(m[i*ncols + j]);
242 if (!isfinite(x)) {
243 if (output)
244 cerr << "invalid element(" << i << ", " << j << ") = " << x << endl;
245
246 return false;
247 }
248 }
249 }
250
251 return true;
252 }
253
254 string my_class_name() const { return "MedMat"; }
255};
256
257// a few related util functions
// Builds an index list from a flag vector: inds receives the positions i for which
// flags[i] != 0 (this is the contract stated by the stand-alone "inds[j] = i iff ..."
// comment further down in this header).
// NOTE(review): the definition is not in this header; whether inds is cleared before
// being filled should be confirmed in the implementation.
258void flags_to_indexes(vector<int> &flags, vector<int> &inds);
259
260#include "MedMat_imp.h"
261
262// a few basic tools for MedMat<float> mats
263int get_rand_medmat(MedMat<float> &A); // fills mat with uniform 0-1 numbers
264int multiply_medmat(MedMat<float> &A, MedMat<float> &B, MedMat<float> &C); // A:n x m B:m x k --> gets C=A*B C:n x k
265int fast_multiply_medmat_(const MedMat<float> &A, const MedMat<float> &B, MedMat<float> &C); // A:n x m B:m x k --> gets C=A*B C:n x k :: Uses Eigen to get performance
266int fast_multiply_medmat(const MedMat<float> &A, const MedMat<float> &B, MedMat<float> &C, float s); // A:n x m B:m x k --> gets C=s*A*B C:n x k :: Uses Eigen to get performance
267int fast_sum_medmat_rows(MedMat<float> &A, MedMat<float> &Asum, float factor); // A: n x m , output: Asum: 1 x m , summing all rows, done with matrix mult with factor (more efficient this way)
268int fast_sum_medmat_cols(MedMat<float> &A, MedMat<float> &Asum, float factor); // A: n x m , output: Asum: n x 1 , summing all cols, done with matrix mult with factor (more efficient this way)
269int fast_multiply_medmat_transpose(const MedMat<float> &A, const MedMat<float> &B, MedMat<float> &C, int transpose_flag); // A:n x m B:m x k --> gets C=A*B C:n x k , but allows transposing each mat
270int fast_multiply_medmat_transpose(const MedMat<float> &A, const MedMat<float> &B, MedMat<float> &C, int transpose_flag, float s); // A:n x m B:m x k --> gets C=s*A*B C:n x k , but allows transposing each mat
271
272int fast_multiply_scalar_vector(vector<float> &v, float s); //v = s * v;
273int fast_multiply_scalar_vector(vector<float> &v, float s, vector<float> &w); //w = s * v;
274int fast_element_dot_vector_vector(vector<float> &v, vector<float> &u, vector<float> &w); //w = v * u elementwise
275int fast_element_dot_vector_vector(vector<float> &v, vector<float> &u); //v = v * u elementwise
276int fast_element_affine_scalar(vector<float> &v, float s, vector<float> &u); // v = v + s*u element wise
277int fast_element_affine_scalar(float s1, vector<float> &v, float s2, vector<float> &u); // v = s1*v + s2*u element wise
278int fast_element_affine_scalar(vector<float> &v, float s, vector<float> &u, vector<float> &w); // w = v + s*u element wise
279
280// flags_to_indexes (declared above, before the MedMat_imp.h include): gets inds[j] = i iff flags[i] != 0
281
282
283// pearson correlation between two columns (A,B can be the same mat)
284//template <class T> double corr_mats_cols(MedMat<T> &A, int Acol, MedMat &B, int Bcol);
285
286// next are useful to split matrices randomly to train/test
287//void get_rand_binary_vec(vector<int> &v, double p, int len);
288//template <class T> void split_mat_by_rows(MedMat<T> &A, vector<int> &flag, MedMat<T> &B, MedMat<T> &C);
289//template <class T> void get_mat_by_rows(MedMat<T> &A, vector<int> &flag, MedMat<T> &B);
290//void split_vector_by_flag(vector<int> &A, vector<int>flag, vector<int> &B, vector<int> &C);
291
292// summations
293//template <class T> template <class S> void medmat_sum_rows(MedMat<T> &A, MedMat<S> &B);
294//template <class T> template <class S> void medmat_sum_cols(MedMat<T> &A, MedMat<S> &B);
295//template <class T> template <class S> void medmat_avg_rows(MedMat<T> &A, MedMat<S> &B);
296//template <class T> template <class S> void medmat_avg_cols(MedMat<T> &A, MedMat<S> &B);
297//template <class T> template <class S> void medmat_scalar_mult(MedMat<T> &A, S &s);
298
299
300//========================================================
301// Joining the MedSerialize Wagon
302//========================================================
307
308#endif
MedTime.h.
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Definition MedMat.h:63
string my_class_name() const
For better handling of serializations it is highly recommended that each SerializableObject inheritin...
Definition MedMat.h:254
TODO: this class should be refactored and merged with MedSample.
Definition MedMat.h:30
Definition SerializableObject.h:32
Copyright 2015-2023 by XGBoost Contributors.
Definition StdDeque.h:58