5#ifndef _CRT_SECURE_NO_WARNINGS
6#define _CRT_SECURE_NO_WARNINGS
9#define MED_FLT_EPSILON 1.192092896e-07F
// Loads a row-major buffer `x` into the matrix, casting every element to T.
//   x       - source buffer, read at x[i*ncols + j]
//   n_rows  - number of rows in the source
//   n_cols  - number of columns in the source
// NOTE(review): this text is a rendered listing with lines dropped (braces and
// listing lines 13-15 are absent). The statements that copy n_rows/n_cols into
// the members nrows/ncols are presumably among the missing lines — confirm
// against the real header.
12template <
class T>
template <
class S>
void MedMat<T>::load(S *x,
int n_rows,
int n_cols)
// Size the backing vector for nrows x ncols elements.
16 m.resize(nrows*ncols);
// Copy element (i,j) from source offset i*ncols + j.
18 for (
size_t i=0; i<nrows; i++)
19 for (
size_t j=0; j<ncols; j++)
20 set(i,j) = (T)x[i*ncols + j];
// Transposing load: source element (i,j) of an n_rows x n_cols row-major buffer
// is stored at destination (j,i).
// NOTE(review): the enclosing signature and the nrows/ncols assignments are not
// visible in this listing (probably a load_transposed-style method — confirm).
29 m.resize(nrows*ncols);
31 for (
size_t i=0; i<n_rows; i++)
32 for (
size_t j=0; j<n_cols; j++)
33 set(j,i) = (T)x[i*n_cols + j];
// Loads a flat vector as a matrix, casting every element to T.
//   x       - source values, read at x[i*ncols + j]
//   n_cols  - number of columns
// The row count is x.size()/ncols (integer division), so trailing elements that
// do not fill a whole row are never read.
// NOTE(review): the division uses the member ncols; the assignment from n_cols
// is presumably on a listing line that was dropped (42). If ncols is 0 at that
// point the division is undefined — confirm a guard exists.
38template <
class T>
template <
class S>
void MedMat<T>::load(
const vector<S> &x,
int n_cols)
43 nrows = x.size()/ncols;
44 m.resize(nrows*ncols);
46 for (
size_t i=0; i<nrows; i++)
47 for (
size_t j=0; j<ncols; j++)
48 set(i,j) = (T)x[i*ncols + j];
// Copy from another matrix-like object `x`.
// NOTE(review): the signature, the nrows/ncols assignments and the inner loop
// body are on listing lines that are not visible here.
57 m.resize(nrows*ncols);
59 for (
size_t i=0; i<nrows; i++)
60 for (
size_t j=0; j<ncols; j++)
// Take over x's records metadata. The clear() is redundant with the assignment
// that follows, but harmless.
65 recordsMetadata.clear();
66 recordsMetadata = x.recordsMetadata;
// Transpose fragment.
// NOTE(review): the lines that snapshot m into m_orig and update nrows/ncols are
// not visible in this listing; the comments below assume they sit between the
// first statement and the loops — confirm.
// Remember the column count used as the row stride of m_orig.
74 size_t ncols_orig = ncols;
// Destination (i,j) takes source (j,i), read from m_orig at j*ncols_orig + i.
78 for (
size_t i=0; i<nrows; i++)
79 for (
size_t j=0; j<ncols; j++)
80 set(i,j) = m_orig[j*ncols_orig+i];
// Flip the flag (0 <-> 1 when it holds 0 or 1).
81 transposed_flag = 1 - transposed_flag;
// Restricts the matrix to the rows in rows_to_take and the columns in
// cols_to_take, in the order given. An empty index list means "keep all" for
// that dimension. Records metadata and column names (signals) are re-selected
// the same way when they are present.
// NOTE(review): the signature, the copy of m into m_orig and the final
// nrows/ncols update are on listing lines not visible here. Indices are not
// range-checked in the visible code.
91 size_t new_n_rows = (rows_to_take.size() == 0 ? nrows : (int) rows_to_take.size());
92 size_t new_n_cols = (cols_to_take.size() == 0 ? ncols : (int) cols_to_take.size());
// Keep the old metadata and column names so they can be re-selected below.
95 vector<RecordData> r_orig = recordsMetadata;
96 vector<string> c_orig = signals;
97 recordsMetadata.clear(); signals.clear();
99 m.resize(new_n_rows*new_n_cols);
101 for (
size_t i=0; i<new_n_rows; i++) {
// r is the source row that feeds output row i.
102 size_t r = (rows_to_take.size() == 0 ? i : rows_to_take[i]);
103 if (r_orig.size() > 0)
104 recordsMetadata.push_back(r_orig[r]);
105 for (
size_t j=0; j<new_n_cols; j++) {
// c is the source column that feeds output column j.
106 size_t c = (cols_to_take.size() == 0 ? j : cols_to_take[j]);
// Column names are rebuilt once, while processing the first output row.
107 if ((i == 0) && (c_orig.size() > 0))
108 signals.push_back(c_orig[c]);
// ncols is used as the source row stride, so it must still hold the
// pre-selection column count at this point.
110 m[i*new_n_cols + j] = m_orig[r*ncols+c];
// Flag-based sub-matrix selection: converts the row and column flag arrays to
// index lists with flags_to_indexes(), then delegates to get_sub_mat().
// NOTE(review): the signature is not visible in this listing. get_sub_mat()
// treats an empty index list as "keep all", so flags that select nothing would
// keep everything — confirm that is intended.
121 vector<int> rows_to_take;
122 vector<int> cols_to_take;
124 flags_to_indexes(rows_to_take_flag, rows_to_take);
125 flags_to_indexes(cols_to_take_flag, cols_to_take);
127 get_sub_mat(rows_to_take, cols_to_take);
// Appends all rows of the matrix m_add below the rows of this matrix.
// Throws runtime_error when either matrix carries records metadata (merging
// metadata is not supported) or when the column counts differ.
// NOTE(review): the signature is on a listing line not visible here.
133 if (recordsMetadata.size() > 0 || m_add.recordsMetadata.size() > 0)
134 throw runtime_error(
"concating matrices with metadata is not supported yet");
135 if (ncols != m_add.ncols)
136 throw runtime_error(
"can not concat matrices with different number of cols");
// data() rather than &m_add.m[0]: indexing element 0 of an empty vector is
// undefined behaviour, and m_add may legitimately hold zero rows.
138 add_rows(m_add.m.data(), m_add.nrows);
// Appends nrows_to_add rows from the flat buffer m_add, which is read as
// nrows_to_add*ncols consecutive elements and cast to T.
// NOTE(review): the signature and the statement controlled by the first `if`
// are on listing lines not visible here (presumably an early return).
144 if (nrows_to_add <= 0)
// Grow the storage, then write the new elements right after the existing
// nrows*ncols ones.
147 m.resize((nrows+(
size_t)nrows_to_add)*ncols);
148 for (
size_t j=0; j<nrows_to_add*ncols; j++)
149 m[ncols*nrows+j] = (T)m_add[j];
150 nrows += nrows_to_add;
// Appends rows held in the flat container m_add; its length must be a whole
// multiple of ncols, and the number of rows is m_add.size()/ncols.
// NOTE(review): the signature and the statement controlled by the `if` (the
// rejection of a zero ncols or a non-multiple length) are on listing lines not
// visible here.
156 if (ncols == 0 || (
int)(m_add.size() % ncols) != 0)
159 int nrows_to_add =(int) (m_add.size()/ncols);
// data() rather than &m_add[0]: an empty m_add passes the check above (0 is a
// multiple of ncols) and indexing element 0 of an empty vector is undefined.
161 add_rows(m_add.data(), nrows_to_add);
// Appends all columns of the matrix m_add to the right of this matrix.
// Throws runtime_error when either matrix carries column names (merging them is
// not supported) or when the row counts differ.
// NOTE(review): the signature is on a listing line not visible here.
168 if (signals.size() > 0 || m_add.signals.size() > 0)
169 throw runtime_error(
"concating matrices with metadata is not supported yet");
170 if (m_add.nrows != nrows)
171 throw runtime_error(
"can not concat matrices with different number of rows");
// data() rather than &m_add.m[0]: indexing element 0 of an empty vector is
// undefined behaviour, and m_add may legitimately hold zero columns.
172 add_cols(m_add.m.data(), m_add.ncols);
// Appends ncols_to_add columns from the flat buffer m_add, which is read
// row-major as nrows x ncols_to_add and cast to T.
// NOTE(review): the signature, the statement controlled by the first `if`
// (presumably an early return) and the final ncols update are on listing lines
// not visible here.
179 if (ncols_to_add == 0)
// Snapshot the current contents; m is re-laid-out with a wider row stride below.
182 vector<T> m_orig = m;
184 size_t new_ncols = ncols+ncols_to_add;
186 m.resize(nrows*new_ncols);
187 for (
size_t i=0; i<nrows; i++) {
// Existing columns keep their position at the start of each widened row.
188 for (
size_t j=0; j<ncols; j++)
189 m[i*new_ncols+j] = (T)m_orig[i*ncols+j];
// New columns follow, starting at column index ncols.
190 for (
size_t j=0; j<ncols_to_add; j++)
191 m[i*new_ncols+ncols+j] = (T)m_add[i*(size_t)ncols_to_add + j];
// Appends columns held in the flat container m_add; its length must be a whole
// multiple of nrows, and the number of columns is m_add.size()/nrows.
// NOTE(review): the signature and the statement controlled by the `if` (the
// rejection of a zero nrows or a non-multiple length) are on listing lines not
// visible here.
200 if (nrows == 0 || (
int)(m_add.size() % nrows) != 0)
203 int ncols_to_add = (int)(m_add.size()/nrows);
// data() rather than &m_add[0]: an empty m_add passes the check above (0 is a
// multiple of nrows) and indexing element 0 of an empty vector is undefined.
204 add_cols(m_add.data(), ncols_to_add);
// Row reordering: selects rows in row_order; the empty column list makes
// get_sub_mat() keep every column. (Signature not visible in this listing.)
210 get_sub_mat(row_order, vector<int>());
// Column reordering: selects columns in col_order; the empty row list makes
// get_sub_mat() keep every row. (Signature not visible in this listing.)
216 get_sub_mat(vector<int>(), col_order);
// Copies row i_row (ncols contiguous elements starting at i_row*ncols) into
// rowv with a raw memcpy; skipped when the matrix has no rows or no columns.
// NOTE(review): the signature and the sizing of rowv are on listing lines not
// visible here; rowv must already hold at least ncols elements. i_row is not
// range-checked in the visible code.
223 if (ncols > 0 && nrows > 0)
224 memcpy(&(rowv[0]),&(m[(
size_t)i_row*ncols]),ncols*
sizeof(T));
// Copies column i_col into colv, one element per row (source stride is ncols);
// skipped when the matrix has no rows or no columns.
// NOTE(review): the signature and the sizing of colv are on listing lines not
// visible here; colv must already hold at least nrows elements. i_col is not
// range-checked in the visible code.
231 if (ncols > 0 && nrows > 0) {
232 for (
size_t i=0; i<nrows; i++)
233 colv[i] = m[i*ncols + (
size_t)i_col];
// Reader for the serialized file format: loads the whole file into a buffer
// with read_binary_data_alloc() and hands it to deserialize().
// NOTE(review): the signature, the declaration of `data`, the error returns and
// any release of the allocated buffer are on listing lines not visible here.
405 unsigned long long size;
// A negative return from the helper is treated as a read failure and reported.
406 if (read_binary_data_alloc(fname, data, size) < 0) {
407 fprintf(stderr,
"Error reading file %s\n", fname.c_str()); fflush(stderr);
// deserialize() is checked against (size_t)-1 as its failure value.
412 if (deserialize(data) == (
size_t)-1)
// Reads the matrix from a raw binary file. Layout, in read order:
//   nrows, ncols            - sizeof(int) bytes each
//   nrows*ncols elements    - sizeof(T) bytes each
//   r_size                  - int, number of record-metadata entries
//   r_size records          - id, date, time, split, weight, label, pred
//   c_size                  - int, number of column names
//   c_size names            - int length followed by that many bytes
// NOTE(review): the signature, stream/variable declarations, open-failure
// handling and returns are on listing lines not visible here. Only sizeof(int)
// bytes of nrows/ncols are filled; if those members are wider than int (the CSV
// code prints them with %lld) their remaining bytes keep the previous contents
// — confirm. No read result is checked in the visible code.
423 if (!file_exists(fname)) {
424 cerr <<
"File " << fname <<
" doesn't exist\n";
427 cerr <<
"reading binary data from " << fname <<
"\n";
431 inf.open(fname,ios::in|ios::binary);
439 inf.read((
char *)(&nrows),
sizeof(
int));
440 inf.read((
char *)(&ncols),
sizeof(
int));
442 m.resize(nrows*ncols);
// data() rather than &m[0]: a file describing an empty matrix leaves m empty,
// and indexing element 0 of an empty vector is undefined behaviour.
444 char *d = (
char *)m.data();
445 inf.read(d, nrows*ncols*
sizeof(T));
448 inf.read((
char *)(&r_size),
sizeof(
int));
449 for (
int i = 0; i < r_size; i++) {
// Fields are read one by one, in the order the writer emits them.
451 inf.read((
char *)(&r.id),
sizeof(r.id));
452 inf.read((
char *)(&r.date),
sizeof(r.date));
453 inf.read((
char *)(&r.time),
sizeof(r.time));
454 inf.read((
char *)(&r.split),
sizeof(r.split));
455 inf.read((
char *)(&r.weight),
sizeof(r.weight));
456 inf.read((
char *)(&r.label),
sizeof(r.label));
457 inf.read((
char *)(&r.pred),
sizeof(r.pred));
458 recordsMetadata.push_back(r);
462 inf.read((
char *)(&c_size),
sizeof(
int));
463 for (
int i = 0; i < c_size; i++) {
// Each column name is stored as a length followed by the raw characters.
465 inf.read((
char *)(&len),
sizeof(
int));
466 std::vector<char> tmp(len);
467 inf.read(tmp.data(), len);
469 name.assign(tmp.data(), len);
470 signals.push_back(name);
// Writer for the serialized file format: serializes the object into a byte
// buffer of get_size() bytes and writes it with write_binary_data().
// NOTE(review): the signature and return statements are on listing lines not
// visible here.
484 vector<unsigned char> serialized;
485 size_t size = get_size();
// One extra byte is allocated, so &serialized[0] is valid even when size is 0;
// only `size` bytes are written out.
486 serialized.resize(size+1);
487 serialize(&serialized[0]);
// A negative return from the helper is treated as a write failure and reported.
488 if (write_binary_data(fname, &serialized[0], size) < 0) {
489 fprintf(stderr,
"MedMat write_to_bon_file ERROR: failed writing to %s\n", fname.c_str());
// Writes the matrix to a raw binary file. Layout, in write order:
//   nrows, ncols            - sizeof(int) bytes each
//   nrows*ncols elements    - sizeof(T) bytes each
//   r_size                  - int, number of record-metadata entries
//   per record              - id, date, time, split, weight, label, pred
//   c_size                  - int, number of column names
//   per name                - int length (the bytes themselves are written on a
//                             listing line not visible here)
// NOTE(review): the signature, the open-failure test, the loop header over
// recordsMetadata and the returns are on listing lines not visible here. Only
// sizeof(int) bytes of nrows/ncols are written, which is lossy if those members
// are wider than int — confirm.
498 of.open(fname, ios::out|ios::binary);
500 fprintf(stderr,
"Can not write to %s\n", fname.c_str());
505 cerr <<
"writing binary " << fname <<
" with " << nrows <<
"X" << ncols <<
" :: elem size " <<
sizeof(T) <<
"\n";
506 of.write((
char *)(&nrows),
sizeof(
int));
507 of.write((
char *)(&ncols),
sizeof(
int));
// data() rather than &m[0]: indexing element 0 of an empty vector is undefined
// behaviour, and an empty matrix is a valid thing to write.
508 of.write((
char *)(m.data()),
sizeof(T)*nrows*ncols);
509 int r_size = (int)recordsMetadata.size();
510 of.write((
char *)(&r_size),
sizeof(
int));
511 cerr <<
"writing additional data for " << r_size <<
" records\n";
// Fields are written one by one, in the order the reader consumes them.
513 of.write((
char *)(&r.id),
sizeof(r.id));
514 of.write((
char *)(&r.date),
sizeof(r.date));
515 of.write((
char *)(&r.time),
sizeof(r.time));
516 of.write((
char *)(&r.split),
sizeof(r.split));
517 of.write((
char *)(&r.weight),
sizeof(r.weight));
518 of.write((
char *)(&r.label),
sizeof(r.label));
519 of.write((
char *)(&r.pred),
sizeof(r.pred));
522 int c_size = (int)signals.size();
523 of.write((
char *)(&c_size),
sizeof(
int));
524 cerr <<
"writing additional data for " << c_size <<
" columns\n";
// Iterate by const reference: the names are only read, so copying each string
// per iteration is unnecessary.
525 for (
const string &name: signals) {
526 int len = (int)name.size();
527 of.write((
char *)(&len),
sizeof(len));
// Reads a comma-separated text file into the matrix.
// With titles_line_flag == 1 the first line is a header laid out as
//   pid,date,outcomeTime,split,weight,<signal names...>,label,pred
// and every data line carries the same 5 leading and 2 trailing metadata
// fields, which are stored in recordsMetadata. Otherwise all fields are data.
// Data fields are parsed with stof() and cast to T.
// Throws std::exception when the file is missing or cannot be opened, and
// runtime_error when a line has an unexpected number of fields.
// NOTE(review): the signature, stream declarations, the header/data branching
// and the row append are on listing lines not visible here.
545 if (!file_exists(fname)) {
546 fprintf(stderr,
"File %s doesn't exist\n",fname.c_str());
547 throw std::exception();
549 fprintf(stderr,
"reading data from %s\n", fname.c_str());
551 inf.open(fname, ios::in);
553 cerr <<
"can not open file\n";
554 throw std::exception();
// Metadata columns exist only in files that carry a titles line.
558 int METADATA_COLUMNS_PREFIX = 0;
559 int METADATA_COLUMNS_SUFFIX = 0;
560 if (titles_line_flag == 1) {
561 METADATA_COLUMNS_PREFIX = 5;
562 METADATA_COLUMNS_SUFFIX = 2;
564 int METADATA_COLUMNS = METADATA_COLUMNS_PREFIX + METADATA_COLUMNS_SUFFIX;
566 while (getline(inf, curr_line)) {
567 boost::trim(curr_line);
568 vector<string> fields;
569 boost::split(fields, curr_line, boost::is_any_of(
","));
571 if (titles_line_flag) {
572 assert(fields[0].compare(
"pid") == 0);
573 assert(fields[1].compare(
"date") == 0);
574 assert(fields[2].compare(
"outcomeTime") == 0);
575 assert(fields[3].compare(
"split") == 0);
576 assert(fields[4].compare(
"weight") == 0);
// The bound is written as an addition on the signed side: the original
// `fields.size() - SUFFIX` is unsigned and wraps to a huge value when a header
// has fewer fields than the suffix, which (with asserts compiled out) would
// read far past the end of `fields`.
577 for (
int i = METADATA_COLUMNS_PREFIX; i + METADATA_COLUMNS_SUFFIX < (int)fields.size(); i++)
578 signals.push_back(fields[i]);
580 assert(fields.end()[-2].compare(
"label") == 0);
581 assert(fields.end()[-1].compare(
"pred") == 0);
582 ncols = (int)fields.size() - METADATA_COLUMNS;
587 ncols = (int)fields.size();
591 if (fields.size() != ncols + METADATA_COLUMNS) {
// Leading space added before "fields": the message used to read "got 7fields".
593 string msg =
"expected " + to_string(ncols + METADATA_COLUMNS) +
" fields, got " + to_string((
int)fields.size()) +
" fields in line: " + curr_line.c_str() +
"\n";
595 throw runtime_error(msg.c_str());
597 if (METADATA_COLUMNS > 0) {
// Prefix fields 0..4 and the last two fields form the record metadata.
598 RecordData sample(stoi(fields[0]), stoi(fields[1]), stol(fields[2]), stoi(fields[3]),
stof(fields[4]),
599 stof(fields.end()[-2]),
stof(fields.end()[-1]));
600 recordsMetadata.push_back(sample);
602 vector<T> new_row(ncols);
603 for (
int i = 0; i < ncols; i++)
604 new_row[i] = (T)
stof(fields[i + METADATA_COLUMNS_PREFIX]);
609 fprintf(stderr,
"read %lldX%lld data\n", nrows, ncols);
// Writes the matrix as comma-separated text.
// Column names are written only when there is exactly one name per column, and
// record metadata only when there is exactly one entry per row; a mismatch is
// reported on cerr. When both are available the header is
//   pid,date,outcomeTime,split,weight,<signal names...>,label,pred
// Throws std::exception when the output file cannot be opened.
// NOTE(review): the signature, the open-failure test and the conditions
// guarding the header loop and the per-row metadata output are on listing lines
// not visible here. The %lld conversions require nrows/ncols to be long
// long-sized; their declarations are not visible — confirm.
614 fprintf(stderr,
"writing %s with %lldX%lld data\n", fname.c_str(), nrows, ncols);
616 of.open(fname, ios::out);
618 fprintf(stderr,
"Error: failed opening file %s\n", fname.c_str());
620 throw std::exception();
622 bool with_signals = (signals.size() == ncols);
623 bool with_records = (recordsMetadata.size() == nrows);
625 if (signals.size() != ncols)
626 cerr <<
"ncols: " << ncols <<
" number of column names: " << signals.size() <<
", not writing column names\n";
627 if (recordsMetadata.size() != nrows)
628 cerr <<
"nrows: " << nrows <<
" number of records metadata entries: " << recordsMetadata.size() <<
", not writing record metadata\n";
// Header: metadata prefix, one name per column, metadata suffix.
630 if (with_records && with_signals)
631 of <<
"pid,date,outcomeTime,split,weight,";
633 for (
int j = 0; j < ncols; j++) {
634 of << signals[j] <<
",";
636 if (with_records && with_signals)
637 of <<
"label,pred\n";
638 else if (with_signals)
// Data lines: metadata prefix, the row's values, metadata suffix.
642 for (
int i = 0; i < nrows; i++) {
// The date is rendered through med_time_converter in DateTimeString form.
644 of << recordsMetadata[i].id <<
"," << med_time_converter.convert_times_S(global_default_time_unit,
MedTime::DateTimeString, recordsMetadata[i].date)
645 <<
"," << recordsMetadata[i].outcomeTime <<
"," << recordsMetadata[i].split <<
"," <<
646 recordsMetadata[i].weight <<
",";
647 for (
int j = 0; j < ncols; j++) {
648 of << get(i, j) <<
",";
651 of << recordsMetadata[i].label <<
"," << recordsMetadata[i].pred;
// Converts running sums into a mean and a standard deviation.
//   num          - number of accumulated samples
//   sum          - sum of the samples
//   sum2         - used below as the sum of squared samples
//   mean, std    - outputs
//   missing_val  - value assigned to both outputs in the "no result" case
// NOTE(review): the conditions selecting each branch (presumably num == 0, and a
// guard for num == 1 before dividing by num-1) and the statements controlled by
// the final `if` are on listing lines not visible here — confirm.
661inline void calculate_moments(
int num,
double sum,
double sum2,
float& mean,
float&
std,
float missing_val) {
664 mean =
std = missing_val ;
666 mean = (float)(sum/(
double)num);
// Variance with an (num-1) denominator: (sum2 - sum*mean)/(num-1).
668 float val = (float)((sum2 - sum*mean)/(double)(num-1)) ;
// Only variances above MED_FLT_EPSILON take this branch.
669 if (val > MED_FLT_EPSILON)
// Computes a mean and standard deviation per column (norm_type ==
// Normalize_Cols) or per row (otherwise) via calculate_moments(), then applies
// them through the normalize(avg, std, norm_type) overload.
// Values equal to missing_value are left out of the statistics. When wgts is
// non-null each value is multiplied by wgts[i], where i is the row index in
// both modes.
// NOTE(review): the signature, the accumulation of num/sum/sum2 and the storing
// of av/sd into avg/std are on listing lines not visible here.
686 if (norm_type == Normalize_Cols) {
// Column mode: outer loop over columns, inner loop over rows.
691 for (
int j=0; j<ncols; j++) {
696 for (
int i=0; i<nrows; i++) {
697 val = (float)get(i,j);
698 if (val != missing_value) {
699 if (wgts != NULL) val *= wgts[i];
707 calculate_moments(num,sum,sum2,av,sd,(
float)missing_value) ;
// Row mode: outer loop over rows, inner loop over columns.
717 for (
int i=0; i<nrows; i++) {
721 for (
int j=0; j<ncols; j++) {
722 val = (float)get(i,j);
723 if (val != missing_value) {
724 if (wgts != NULL) val *= wgts[i];
732 calculate_moments(num,sum,sum2,av,sd,(
float)missing_value) ;
739 normalize(avg,
std,norm_type) ;
// Per-column average and standard deviation, computed in two passes and written
// to _avg[j] and _std[j]. Values equal to missing_value are skipped. The
// standard deviation divides by _num (not _num-1).
// NOTE(review): the signature, the declaration and counting of _num and the
// accumulation of _sum are on listing lines not visible here. If a column has
// no non-missing values and no guard exists, both divisions are by zero —
// confirm.
747 for (
int j=0; j<ncols; j++) {
749 double _sum = 0 , _sum2 = 0;
// Pass 1: accumulate the non-missing values of column j.
752 for (
int i=0; i<nrows; i++) {
753 float val = (float)get(i, j);
754 if (val != missing_value) {
762 _avg[j] = (T)(_sum/_num);
// Pass 2: accumulate squared deviations from the column average.
767 for (
int i=0; i<nrows; i++) {
769 if (val != missing_value) {
771 _sum2 += (double)(val - _avg[j])*(val - _avg[j]);
776 _std[j] = (T)sqrt((
double)_sum2/_num);
// Normalizes the matrix in place with externally supplied statistics.
//   external_mean - one mean per column (Normalize_Cols) or per row (otherwise)
//   external_std  - matching standard deviations; when this vector is empty the
//                   values are only centred, not scaled
//   norm_type     - stored in normalized_flag and used to pick the indexing
// Elements equal to missing_value take a separate branch.
// NOTE(review): what is assigned for a zero standard deviation and for missing
// elements is on listing lines not visible here. The vectors are indexed by
// column/row without a size check in the visible code.
786template <
class T>
template <
class S>
void MedMat<T>::normalize (
const vector<S>& external_mean,
const vector<S>& external_std,
int norm_type) {
788 normalized_flag = norm_type ;
// Work on a copy of the standard deviations so zero entries can be replaced.
789 vector<S> internal_std(external_std.size());
790 for (
int i=0; i<external_std.size(); i++) {
791 if (external_std[i] == 0)
794 internal_std[i] = external_std[i];
797 for (
size_t i=0; i<nrows; i++) {
798 for (
size_t j=0; j<ncols; j++) {
// Column mode: statistics are indexed by the column j.
799 if (normalized_flag == Normalize_Cols) {
800 if (m[i*ncols +j] == missing_value)
802 else if (internal_std.size())
803 m[i*ncols + j] = (m[i*ncols + j] - external_mean[j])/internal_std[j] ;
805 m[i*ncols + j] = (m[i*ncols + j] - external_mean[j]) ;
// Row mode: statistics are indexed by the row i.
807 if (m[i*ncols +j] == missing_value)
809 else if (internal_std.size())
810 m[i*ncols + j] = (m[i*ncols + j] - external_mean[i])/internal_std[i] ;
812 m[i*ncols + j] = (m[i*ncols + j] - external_mean[i]) ;
// Prints one matrix row to fout.
//   fout    - destination stream
//   prefix  - text printed before the row label
//   format  - printf format applied to each element; it is passed to fprintf
//             as-is, so it must contain exactly one conversion matching T
//   i_row   - index of the row to print (not range-checked in the visible code)
// NOTE(review): braces and any statements after the loop are on listing lines
// not visible here.
819template <
class T>
void MedMat<T>::print_row(FILE *fout,
const string &prefix,
const string &format,
int i_row)
821 fprintf(fout,
"%s :: [%d,:] :", prefix.c_str(),i_row);
822 for (
int i=0; i<ncols; i++)
823 fprintf(fout, format.c_str(), get(i_row, i));
// Appends every entry of sigs to the column-name list `signals`.
// NOTE(review): the signature and listing line 833 are not visible here, so
// whether `signals` is cleared first or entries are filtered cannot be told.
832 for (
auto &sig : sigs)
834 signals.push_back(sig);
// Splits the matrix by record id into mat_0 and mat_1: every row whose id is in
// ids_0 is assigned to one side, all remaining rows to the other, so rows that
// share an id always land in the same output matrix. Both outputs receive this
// matrix's header via copy_header(), and each copied row brings its records
// metadata entry and its id (stored in row_ids).
// Requires exactly one recordsMetadata entry per row; otherwise
// HMTHROW_AND_ERR is invoked.
// NOTE(review): the signature, the selection of ids into ids_0, the counting of
// nrows0/nrows1 and the if/else around the two copy branches are on listing
// lines not visible here.
841 if (recordsMetadata.size() != nrows)
842 HMTHROW_AND_ERR(
"ERROR: MedMat : Can't split a matrix by id with a non matching recordsMetadata (%d records != %d rows)\n", (
int)recordsMetadata.size(), (
int)nrows);
// Collect the distinct ids present in the matrix.
845 unordered_set<int> all_ids, ids_0;
846 for (
int i = 0; i < nrows; i++)
847 all_ids.insert(recordsMetadata[i].id);
848 for (
auto id : all_ids)
854 mat_0.copy_header(*
this);
855 mat_1.copy_header(*
this);
856 int nrows0 = 0, nrows1 = 0;
// First pass: decide the destination of every row.
859 vector<int> assignment(nrows, 0);
860 for (
int i = 0; i < nrows; i++) {
861 if (ids_0.find(recordsMetadata[i].id) != ids_0.end()) {
// Size both outputs before filling them.
872 mat_0.nrows = nrows0;
873 mat_0.m.resize(mat_0.nrows*mat_0.ncols);
874 mat_0.recordsMetadata.resize(mat_0.nrows);
875 mat_0.row_ids.resize(mat_0.nrows);
877 mat_1.nrows = nrows1;
878 mat_1.m.resize(mat_1.nrows*mat_1.ncols);
879 mat_1.recordsMetadata.resize(mat_1.nrows);
880 mat_1.row_ids.resize(mat_1.nrows);
// Second pass: copy each row, with its metadata and id, to its destination.
883 for (
int i = 0; i < nrows; i++) {
885 for (
int j = 0; j < ncols; j++)
886 mat_1(i1, j) = m[i*ncols + j];
887 mat_1.recordsMetadata[i1] = recordsMetadata[i];
888 mat_1.row_ids[i1] = recordsMetadata[i].id;
892 for (
int j = 0; j < ncols; j++)
893 mat_0(i0, j) = m[i*ncols + j];
// Fixed: these two lines used to write into mat_1 at index i0. That clobbered
// mat_1's entries, left mat_0's metadata and row ids default-initialised, and
// could index past the end of mat_1's vectors when mat_0 has more rows.
894 mat_0.recordsMetadata[i0] = recordsMetadata[i];
895 mat_0.row_ids[i0] = recordsMetadata[i].id;
static const int DateTimeString
string only format "YYYYMMDDHHMI"
Definition MedTime.h:31
TODO: this class should be refactored and merged with MedSample.
Definition MedMat.h:30
float stof(const std::string &value, size_t *pos=nullptr)
A faster implementation of stof(). See documentation of std::stof() for more information....
Definition strtonum.h:467