Medial Code Documentation
Loading...
Searching...
No Matches
MedSamples.h
1// Classes for holding samples
2
3#ifndef __MED_SAMPLES_H__
4#define __MED_SAMPLES_H__
5
6
9#include <unordered_set>
10#include <nlohmann/json.hpp>
11
12using json = nlohmann::json;
13
14class MedFeatures;
15
16//.......................................................................................
20//.......................................................................................
21
/// MedSample represents a single sample: id + time (date).
/// Additional (optional) entries: outcome, outcomeTime, split, prediction(s) and attributes.
/// NOTE(review): the class header line and the serialization-function macro line are not
/// part of this listing; only the members visible below are documented.
23public:
24 int id = -1; ///< Patient id.
25 int split = -1; ///< Cross-validation split. -1 if not given.
26 int time = 0; ///< Time (date) of the sample.
27 float outcome = 0; ///< Outcome.
28 int outcomeTime = 0; ///< Outcome time (date).
29 vector<float> prediction; ///< Prediction(s) - empty if none given.
30 map<string, float> attributes; ///< Numeric attribute(s) - empty if none given.
31 map<string, string> str_attributes; ///< String attribute(s) - empty if none given.
32 json jrec; ///< A json record that can be built alongside the sample to hold extra information.
33
/// Default constructor. Members keep their in-class defaults; prediction is left empty.
35 MedSample() { prediction.clear(); }
/// Constructor setting id, time, outcome and outcome time; split keeps its default and prediction is left empty.
/// NOTE(review): _outcome is declared int while the outcome member is float, so a fractional
/// value passed here is truncated before it is stored - confirm this is intended.
36 MedSample(int _id, int _time, int _outcome, int _outcomeTime) { id = _id; time = _time; outcome = _outcome; outcomeTime = _outcomeTime; prediction.clear(); }
/// Destructor. Clears the prediction vector.
38 ~MedSample() { prediction.clear(); }
39
/// Print the sample with the given prefix (defined outside this header).
41 void print(const string prefix);
42
/// Print the sample with an empty prefix.
44 void print() { print(""); }
45
/// Get sample from a tab-delimited string, in old or new format (split and prediction are optional).
/// NOTE(review): the meaning of the int return value is not visible here - confirm in MedSamples.cpp.
52 int parse_from_string(string &s, int time_unit);
/// Overload taking explicit maps/vectors (pos, pred_pos, attr_pos, str_attr_pos) and a delimiter (default "\t").
/// Presumably these give the positions of the fields inside the delimited line - TODO confirm.
58 int parse_from_string(string &s, const map <string, int> & pos, const vector<int>& pred_pos, const map<string, int>& attr_pos,
59 const map<string, int>& str_attr_pos, int time_unit, int raw_format, const string &delimeter = "\t");
/// Same as above, but takes the fields as a vector of strings instead of a single line; the delimiter has no default here.
60 int parse_from_string(const vector<string> &fields, const map<string, int> & pos, const vector<int>& pred_pos, const map<string, int>& attr_pos,
61 const map<string, int>& str_attr_pos, int time_unit, int raw_format, const string &delimeter);
62
/// Write the sample to string s in the new format.
/// Defaults: attributes are written, delimiter is "\t", pred_precision is -1.
64 void write_to_string(string &s, int time_unit, bool write_attrib = true, const string &delimeter = "\t", int pred_precision = -1) const;
65
/// Get sample attributes into the two output vectors (numeric and string attributes).
67 int get_all_attributes(vector<string>& attributes, vector<string>& str_attributes) const;
68
69 // Serialization
70 ADD_CLASS_NAME(MedSample)
72};
73
75inline bool comp_sample_pred(const MedSample &pr1, const MedSample &pr2) {
76 return pr1.prediction[0] < pr2.prediction[0];
77}
78
80inline bool comp_sample_id_time(const MedSample &pr1, const MedSample &pr2) {
81 if (pr1.id == pr2.id)
82 return pr1.time < pr2.time;
83 else
84 return pr1.id < pr2.id;
85}
86
87//.......................................................................................
91//.......................................................................................
/// MedIdSamples represents a collection of samples for a given id.
/// Additional (optional) entry: split.
/// NOTE(review): the class header line and the serialization-function macro line are not
/// part of this listing; only the members visible below are documented.
93public:
94 int id = -1; ///< Patient id.
96 int split = -1; ///< Split for cross-validation. Nothing here forces each MedSample in samples to carry the same id/split.
97 vector<MedSample> samples; ///< MedSamples for the given id.
98
/// Constructor with id. split is set to -1 and samples is left empty.
100 MedIdSamples(int _id) { id = _id; split = -1; samples.clear(); }
/// Constructor without id. id and split are set to -1 and samples is left empty.
102 MedIdSamples() { id = -1; split = -1; samples.clear(); }
103
/// Set split and copy it to every MedSample entry in samples.
105 void set_split(int _split) { split = _split; for (auto& s : samples) s.split = _split; }
106
/// Comparison function: mode 0 requires equal id/time, mode 1 requires equal outcome info.
/// NOTE(review): further modes may exist - confirm in MedSamples.cpp.
109 bool same_as(MedIdSamples &other, int mode);
110
/// Get a vector of all times for the pid. times is cleared first, then filled in sample order.
112 void get_times(vector<int> &times) const { times.clear(); for (auto &s : samples) times.push_back(s.time); }
113
114 // Serialization
115 ADD_CLASS_NAME(MedIdSamples)
117
118};
119
121inline bool comp_patient_id_time(const MedIdSamples &pr1, const MedIdSamples &pr2) {
122 return pr1.id < pr2.id;
123}
124
125//.......................................................................................
129//.......................................................................................
130
/// MedSamples represents a collection of samples per different id.
/// The data is contained in a vector of MedIdSamples.
/// NOTE(review): the time_unit member used below and the serialization-function macro line
/// are declared on lines that are not part of this listing.
131class MedSamples final : public SerializableObject {
132public:
134 vector<MedIdSamples> idSamples; ///< The vector of MedIdSamples.
135 int raw_format = 0; // read times as is, no conversions
136
/// Constructor. Initializes time_unit from global_default_time_unit.
138 MedSamples() { time_unit = global_default_time_unit; }
/// Destructor. Does nothing explicit.
139 ~MedSamples() {}
140
/// Clear data and reset time_unit to global_default_time_unit.
142 void clear() { time_unit = global_default_time_unit; idSamples.clear(); }
143
/// Extract predictions from MedFeatures and insert them into the corresponding samples.
149 int insert_preds(MedFeatures& featuresData);
150
/// Copy attributes from a MedSample vector.
157 int copy_attributes(const vector<MedSample>& samples);
158
/// Get all patient ids.
160 void get_ids(vector<int>& ids) const;
161
/// Append the MedIdSamples of newSamples at the end of the current ones (elements are copied).
/// NOTE(review): newSamples must not be this same object - the inserted range would then
/// refer into the vector being modified.
163 void append(MedSamples& newSamples) { idSamples.insert(idSamples.end(), newSamples.idSamples.begin(), newSamples.idSamples.end()); }
164
/// Read from bin file. Explicitly calls the SerializableObject version, since this class
/// declares its own read_from_file with a different signature.
167 int read_from_bin_file(const string& file_name) { return SerializableObject::read_from_file(file_name); }
/// Write to bin file. Explicitly calls the SerializableObject version, since this class
/// declares its own write_to_file overloads.
170 int write_to_bin_file(const string& file_name) { return SerializableObject::write_to_file(file_name); }
171
/// Read from text file. sort_rows defaults to true.
178 int read_from_file(const string& file_name, bool sort_rows = true);
179
/// Write to text file in new format. Defaults: pred_precision is -1, attributes are printed.
182 int write_to_file(const string &fname, int pred_precision=-1, bool print_attributes =true);
/// Overload writing to an already opened output stream; all options must be given explicitly.
183 void write_to_file(ofstream& of, int pred_precision, bool print_attributes, bool print_header);
184
/// Extract a single vector of concatenated predictions.
186 void get_preds(vector<float>& preds) const;
/// Presumably extracts the predictions of a single channel (index into each prediction vector) - TODO confirm.
187 void get_preds_channel(vector<float>& preds, int channel);
/// Extract a vector of all outcomes.
189 void get_y(vector<float>& y) const;
/// Get a list of all categories (different values) appearing in the outcome.
191 void get_categs(vector<float> &categs) const;
/// Get a vector corresponding to the given attr (name should include attr_).
193 void get_attr_values(const string& attr_name, vector<float>& values) const;
/// Get all MedSamples as a single vector.
195 void export_to_sample_vec(vector<MedSample> &vec_samples) const;
/// Set MedSamples from a single vector. allow_split_inconsistency defaults to false.
197 void import_from_sample_vec(const vector<MedSample> &vec_samples, bool allow_split_inconsistency = false);
198
/// Sort by id and then date.
200 void sort_by_id_date();
/// Make sure that: (1) every pid has one idSample at most and (2) everything is sorted.
202 void normalize();
203
/// Comparison function: mode 0 requires equal id/time, mode 1 requires equal outcome info.
/// NOTE(review): further modes may exist - confirm in MedSamples.cpp.
206 bool same_as(MedSamples &other, int mode);
207
/// Return number of samples.
209 int nSamples() const;
210
/// Return number of splits; also checks mismatches between the idSample split and its inner MedSample entries.
212 int nSplits();
213
/// Get predictions vector size into nPreds. Returns -1 if not consistent.
215 int get_predictions_size(int& nPreds);
216
/// Get all attributes. Returns -1 if not consistent.
218 int get_all_attributes(vector<string>& attributes, vector<string>& str_attributes) const;
219
/// Given a dilution probability prob, dilute the current samples.
221 void dilute(float prob);
222
/// Dilute 0-labeled samples (traditionally controls) with p0, and all the rest with p1.
224 void binary_dilute(float p0, float p1);
225
/// Remove all ids that appear in _dont_include from the current samples.
227 void subtract(MedSamples &_dont_include);
228
/// Split by id: p_test of the ids go into test, and the rest into train.
230 void split_train_test(MedSamples &train, MedSamples &test, float p_test);
231
/// Get a split number and divide the samples into lists in/off the split.
233 void split_by_split(MedSamples &in_split, MedSamples &off_split, int split);
234
/// Add splits to the samples, as given in an external file.
236 void add_splits_from_file(string f_splits);
237
/// Initialize all jrecs to contain pid and time.
239 void init_all_jrecs();
240
241
/// Presumably writes all samples into one flat vector - TODO confirm how it differs from export_to_sample_vec.
242 void flatten(vector<MedSample> &flat) const;
243
/// APIs for online insertions: main use case is a single time point for prediction per pid.
245 void insertRec(int pid, int time, float outcome, int outcomeTime);
/// Same as above, with a prediction value as well.
246 void insertRec(int pid, int time, float outcome, int outcomeTime, float pred);
/// Insert with pid and time only; forwards with outcome -1 and outcomeTime 0.
247 void insertRec(int pid, int time) { insertRec(pid, time, -1, 0); }
248
249 //Serialization, version 1: Added version, model_features, features_count to serialization
250 // version 2: Added attributes
251 ADD_CLASS_NAME(MedSamples)
253};
254
258namespace medial {
262 namespace print {
264 void print_samples_stats(const vector<MedSample> &samples, const string &log_file = "");
266 void print_samples_stats(const MedSamples &samples, const string &log_file = "");
268 void print_by(const vector<MedSample> &data_records, const vector<string> &groups,
269 bool unique_ids = false, const string &log_file = "");
271 void print_by(const MedSamples &data_records, const vector<string> &groups,
272 bool unique_ids = false, const string &log_file = "");
274 void print_by_year(const vector<MedSample> &data_records, int year_bin_size, bool unique_ids = false,
275 bool take_prediction_time = true, const string &log_file = "");
277 void print_by_year(const MedSamples &data_records, int year_bin_size, bool unique_ids = false,
278 bool take_prediction_time = true, const string &log_file = "");
279 }
283 namespace process {
285 void down_sample(MedSamples &samples, double take_ratio, bool with_repeats = false);
287 void down_sample_by_pid(MedSamples &samples, double take_ratio, bool with_repeats = false);
288
290 void down_sample(MedSamples &samples, int no_more_than, bool with_repeats = false);
292 void down_sample_by_pid(MedSamples &samples, int no_more_than, bool with_repeats = false);
293 }
294
295 namespace stats {
296 double kaplan_meir_on_samples(const MedSamples &incidence_samples, int time_period, const vector<pair<int, int>> *filtered_idx = NULL);
297 double kaplan_meir_on_samples(const vector<MedSample> &incidence_samples, int time_unit, int time_period, const vector<int> *filtered_idx = NULL);
298 }
299}
300
301//=======================================
302// Joining the MedSerialze wagon
303//=======================================
307
308#endif
MedTime.h.
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:156
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:142
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
MedIdSamples represents a collection of samples for a given id. Additional (optional) entries: split
Definition MedSamples.h:92
void get_times(vector< int > &times) const
get a vector of all times for the pid
Definition MedSamples.h:112
int id
Patient id.
Definition MedSamples.h:94
vector< MedSample > samples
MedSamples for the given id.
Definition MedSamples.h:97
void set_split(int _split)
Set split and export to all MedSample entries.
Definition MedSamples.h:105
bool same_as(MedIdSamples &other, int mode)
Comparison function : mode 0 requires equal id/time, mode 1 requires equal outcome info,...
Definition MedSamples.cpp:342
MedIdSamples(int _id)
Constructor with id.
Definition MedSamples.h:100
MedIdSamples()
Constructor without id.
Definition MedSamples.h:102
int split
Split for cross-validation. Note that nothing forces the id and split of each MedSample to be the sam...
Definition MedSamples.h:96
MedSample represents a single sample: id + time (date). Additional (optional) entries: outcome,...
Definition MedSamples.h:22
void write_to_string(string &s, int time_unit, bool write_attrib=true, const string &delimeter="\t", int pred_precision=-1) const
Write to string in new format.
Definition MedSamples.cpp:295
vector< float > prediction
Prediction(s) - empty if none given.
Definition MedSamples.h:29
int parse_from_string(string &s, int time_unit)
Get sample from tab-delimited string, in old or new format (<split> and <prediction> optional,...
Definition MedSamples.cpp:240
json jrec
a json record that can be built along side the sample to contain any information in a nice json forma...
Definition MedSamples.h:32
map< string, string > str_attributes
Attribute(s) - empty if none given.
Definition MedSamples.h:31
int get_all_attributes(vector< string > &attributes, vector< string > &str_attributes) const
Get sample attributes.
Definition MedSamples.cpp:658
float outcome
Outcome.
Definition MedSamples.h:27
int time
Time (Date)
Definition MedSamples.h:26
map< string, float > attributes
Attribute(s) - empty if none given.
Definition MedSamples.h:30
void print()
printing the sample
Definition MedSamples.h:44
~MedSample()
Destructor.
Definition MedSamples.h:38
int outcomeTime
Outcome time (date)
Definition MedSamples.h:28
MedSample()
Constructor.
Definition MedSamples.h:35
int id
Patient id.
Definition MedSamples.h:24
int split
Cross-validation split. -1 if not given. Proper use is to set the same split for all samples of a giv...
Definition MedSamples.h:25
MedSamples represents a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:131
int write_to_bin_file(const string &file_name)
Write to bin file.
Definition MedSamples.h:170
bool same_as(MedSamples &other, int mode)
Comparison function : mode 0 requires equal id/time, mode 1 requires equal outcome info,...
Definition MedSamples.cpp:984
void subtract(MedSamples &_dont_include)
removing all ids that appear in _dont_include from the current samples
Definition MedSamples.cpp:1004
int copy_attributes(const vector< MedSample > &samples)
Copy attributes from MedSample vector.
Definition MedSamples.cpp:400
void add_splits_from_file(string f_splits)
adding splits to the samples given in an external file
Definition MedSamples.cpp:1050
int nSamples() const
Return number of samples.
Definition MedSamples.cpp:806
void normalize()
Make sure that : (1) every pid has one idSample at most and (2) everything is sorted.
Definition MedSamples.cpp:781
void init_all_jrecs()
initializing all jrecs to contain pid and time
Definition MedSamples.cpp:1068
int nSplits()
Return number of splits, also check mismatches between idSample and internal MedSamples and set idSam...
Definition MedSamples.cpp:819
void dilute(float prob)
given a probability dilution prob, dilute current samples
Definition MedSamples.cpp:928
void import_from_sample_vec(const vector< MedSample > &vec_samples, bool allow_split_inconsistency=false)
Set MedSamples from a single vector.
Definition MedSamples.cpp:902
void split_by_split(MedSamples &in_split, MedSamples &off_split, int split)
gets a split number and splits samples to lists in/off the split
Definition MedSamples.cpp:1034
void export_to_sample_vec(vector< MedSample > &vec_samples) const
Get all MedSamples as a single vector.
Definition MedSamples.cpp:890
void get_attr_values(const string &attr_name, vector< float > &values) const
get a vector corresponding to given attr (name should include attr_)
Definition MedSamples.cpp:448
void get_ids(vector< int > &ids) const
Get all patient ids.
Definition MedSamples.cpp:420
int read_from_bin_file(const string &file_name)
Read from bin file.
Definition MedSamples.h:167
int insert_preds(MedFeatures &featuresData)
Extract predictions from MedFeatures and insert to corresponding samples Samples in MedFeatures are...
Definition MedSamples.cpp:383
int write_to_file(const string &fname, int pred_precision=-1, bool print_attributes=true)
Write to text file in new format.
Definition MedSamples.cpp:704
void get_categs(vector< float > &categs) const
Get a list of all categories (different values) appearing in the outcome.
Definition MedSamples.cpp:470
int read_from_file(const string &file_name, bool sort_rows=true)
Read from text file.
Definition MedSamples.cpp:541
void clear()
Clear data and init time_unit according to default.
Definition MedSamples.h:142
void get_preds(vector< float > &preds) const
Extract a single vector of concatenated predictions.
Definition MedSamples.cpp:430
void binary_dilute(float p0, float p1)
will dilute 0 labeled samples (traditionally controls) with p0, and all the rest with p1
Definition MedSamples.cpp:951
int get_predictions_size(int &nPreds)
Get predictions vector size. Return -1 if not-consistent.
Definition MedSamples.cpp:643
void get_y(vector< float > &y) const
Extract a vector of all outcomes.
Definition MedSamples.cpp:462
void sort_by_id_date()
Sort by id and then date.
Definition MedSamples.cpp:772
void insertRec(int pid, int time, float outcome, int outcomeTime)
APIs for online insertions: main use case is a single time point for prediction per pid.
Definition MedSamples.cpp:854
int time_unit
The time unit in which the samples are given. Default: Date.
Definition MedSamples.h:133
void append(MedSamples &newSamples)
Append new MedIdSamples at the end of current ones.
Definition MedSamples.h:163
MedSamples()
Constructor. init time_unit according to default.
Definition MedSamples.h:138
vector< MedIdSamples > idSamples
The vector of MedIdSamples.
Definition MedSamples.h:134
void split_train_test(MedSamples &train, MedSamples &test, float p_test)
Gets p_test and splits by id: p_test of the ids go into test, and the rest into train.
Definition MedSamples.cpp:1018
int get_all_attributes(vector< string > &attributes, vector< string > &str_attributes) const
Get all attributes. Return -1 if not-consistent.
Definition MedSamples.cpp:676
static const int Date
dates are in full regular format YYYYMMDD
Definition MedTime.h:25
Definition SerializableObject.h:33
virtual int write_to_file(const string &fname)
serialize model and write to file
Definition SerializableObject.cpp:81
virtual int read_from_file(const string &fname)
read and deserialize model
Definition SerializableObject.cpp:74
void print_by_year(const vector< MedSample > &data_records, int year_bin_size, bool unique_ids=false, bool take_prediction_time=true, const string &log_file="")
print samples stats by year
Definition MedSamples.cpp:1224
void print_by(const vector< MedSample > &data_records, const vector< string > &groups, bool unique_ids=false, const string &log_file="")
print samples stats by group
Definition MedSamples.cpp:1146
void print_samples_stats(const vector< MedSample > &samples, const string &log_file="")
print samples stats
Definition MedSamples.cpp:1078
void down_sample(MedFeatures &dataMat, double take_ratio, bool with_repeats=false, vector< int > *selected_indexes=NULL)
down sampling with ratio
Definition MedFeatures.cpp:886
void down_sample_by_pid(MedSamples &samples, double take_ratio, bool with_repeats=false)
down sample by selecting from pids
Definition MedSamples.cpp:1346
medial namespace for functions
Definition InfraMed.h:667