3#ifndef __MED_SAMPLES_H__
4#define __MED_SAMPLES_H__
9#include <unordered_set>
10#include <json/json.hpp>
39 void print(
const string prefix);
56 int parse_from_string(
string &s,
const map <string, int> & pos,
const vector<int>& pred_pos,
const map<string, int>& attr_pos,
57 const map<string, int>& str_attr_pos,
int time_unit,
int raw_format,
const string &delimeter =
"\t");
58 int parse_from_string(
const vector<string> &fields,
const map<string, int> & pos,
const vector<int>& pred_pos,
const map<string, int>& attr_pos,
59 const map<string, int>& str_attr_pos,
int time_unit,
int raw_format,
const string &delimeter);
62 void write_to_string(
string &s,
int time_unit,
bool write_attrib =
true,
const string &delimeter =
"\t",
int pred_precision = -1)
const;
82 return pr1.
id < pr2.
id;
110 void get_times(vector<int> &times)
const { times.clear();
for (
auto &s :
samples) times.push_back(s.time); }
120 return pr1.
id < pr2.
id;
158 void get_ids(vector<int>& ids)
const;
176 int read_from_file(
const string& file_name,
bool sort_rows =
true);
180 int write_to_file(
const string &fname,
int pred_precision=-1,
bool print_attributes =
true);
181 void write_to_file(ofstream& of,
int pred_precision,
bool print_attributes,
bool print_header);
184 void get_preds(vector<float>& preds)
const;
185 void get_preds_channel(vector<float>& preds,
int channel);
187 void get_y(vector<float>& y)
const;
191 void get_attr_values(
const string& attr_name, vector<float>& values)
const;
195 void import_from_sample_vec(
const vector<MedSample> &vec_samples,
bool allow_split_inconsistency =
false);
216 int get_all_attributes(vector<string>& attributes, vector<string>& str_attributes)
const;
240 void flatten(vector<MedSample> &flat)
const;
243 void insertRec(
int pid,
int time,
float outcome,
int outcomeTime);
244 void insertRec(
int pid,
int time,
float outcome,
int outcomeTime,
float pred);
266 void print_by(
const vector<MedSample> &data_records,
const vector<string> &groups,
267 bool unique_ids =
false,
const string &log_file =
"");
270 bool unique_ids =
false,
const string &log_file =
"");
272 void print_by_year(
const vector<MedSample> &data_records,
int year_bin_size,
bool unique_ids =
false,
273 bool take_prediction_time =
true,
const string &log_file =
"");
276 bool take_prediction_time =
true,
const string &log_file =
"");
294 double kaplan_meir_on_samples(
const MedSamples &incidence_samples,
int time_period,
const vector<pair<int, int>> *filtered_idx = NULL);
295 double kaplan_meir_on_samples(
const vector<MedSample> &incidence_samples,
int time_unit,
int time_period,
const vector<int> *filtered_idx = NULL);
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
A class for holding features data as a virtual matrix
Definition MedFeatures.h:47
MedIdSamples represents a collection of samples of a given id. Additional (optional) entries: split
Definition MedSamples.h:90
void get_times(vector< int > &times) const
get a vector of all times for the pid
Definition MedSamples.h:110
int id
Patient id.
Definition MedSamples.h:92
vector< MedSample > samples
MedSamples for the given id.
Definition MedSamples.h:95
void set_split(int _split)
Set split and export to all MedSample entries.
Definition MedSamples.h:103
bool same_as(MedIdSamples &other, int mode)
Comparison function : mode 0 requires equal id/time, mode 1 requires equal outcome info,...
Definition MedSamples.cpp:341
MedIdSamples(int _id)
Constructor with id.
Definition MedSamples.h:98
MedIdSamples()
Constructor without id.
Definition MedSamples.h:100
int split
Split for cross-validation. Note that nothing forces the id and split of each MedSample to be the sam...
Definition MedSamples.h:94
MedSample represents a single sample: id + time (date). Additional (optional) entries: outcome,...
Definition MedSamples.h:20
void write_to_string(string &s, int time_unit, bool write_attrib=true, const string &delimeter="\t", int pred_precision=-1) const
Write to string in new format.
Definition MedSamples.cpp:294
vector< float > prediction
Prediction(s) - empty if none given.
Definition MedSamples.h:27
int parse_from_string(string &s, int time_unit)
Get sample from tab-delimited string, in old or new format (<split> and <prediction> optional,...
Definition MedSamples.cpp:239
json jrec
A JSON record that can be built alongside the sample to contain any information in a nice json forma...
Definition MedSamples.h:30
map< string, string > str_attributes
Attribute(s) - empty if none given.
Definition MedSamples.h:29
int get_all_attributes(vector< string > &attributes, vector< string > &str_attributes) const
Get sample attributes.
Definition MedSamples.cpp:657
float outcome
Outcome.
Definition MedSamples.h:25
int time
Time (Date)
Definition MedSamples.h:24
map< string, float > attributes
Attribute(s) - empty if none given.
Definition MedSamples.h:28
void print()
printing the sample
Definition MedSamples.h:42
~MedSample()
Destructor.
Definition MedSamples.h:36
int outcomeTime
Outcome time (date)
Definition MedSamples.h:26
MedSample()
Constructor.
Definition MedSamples.h:33
int id
Patient id.
Definition MedSamples.h:22
int split
Cross-validation split. -1 if not given. Proper use is to set the same split for all samples of a giv...
Definition MedSamples.h:23
MedSamples represents a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:129
int write_to_bin_file(const string &file_name)
Write to bin file.
Definition MedSamples.h:168
bool same_as(MedSamples &other, int mode)
Comparison function : mode 0 requires equal id/time, mode 1 requires equal outcome info,...
Definition MedSamples.cpp:983
void subtract(MedSamples &_dont_include)
removing all ids that appear in _dont_include from the current samples
Definition MedSamples.cpp:1003
int copy_attributes(const vector< MedSample > &samples)
Copy attributes from MedSample vector.
Definition MedSamples.cpp:399
void add_splits_from_file(string f_splits)
adding splits to the samples given in an external file
Definition MedSamples.cpp:1049
int nSamples() const
Return number of samples.
Definition MedSamples.cpp:805
void normalize()
Make sure that : (1) every pid has one idSample at most and (2) everything is sorted.
Definition MedSamples.cpp:780
void init_all_jrecs()
initializing all jrecs to contain pid and time
Definition MedSamples.cpp:1067
int nSplits()
Return number of splits, also check mismatches between idSample and internal MedSamples and set idSam...
Definition MedSamples.cpp:818
void dilute(float prob)
given a probability dilution prob, dilute current samples
Definition MedSamples.cpp:927
void import_from_sample_vec(const vector< MedSample > &vec_samples, bool allow_split_inconsistency=false)
Set MedSamples from a single vector.
Definition MedSamples.cpp:901
void split_by_split(MedSamples &in_split, MedSamples &off_split, int split)
gets a split number and splits samples to lists in/off the split
Definition MedSamples.cpp:1033
void export_to_sample_vec(vector< MedSample > &vec_samples) const
Get all MedSamples as a single vector.
Definition MedSamples.cpp:889
void get_attr_values(const string &attr_name, vector< float > &values) const
get a vector corresponding to given attr (name should include attr_)
Definition MedSamples.cpp:447
void get_ids(vector< int > &ids) const
Get all patient ids.
Definition MedSamples.cpp:419
int read_from_bin_file(const string &file_name)
Read from bin file.
Definition MedSamples.h:165
int insert_preds(MedFeatures &featuresData)
Extract predictions from MedFeatures and insert into corresponding samples. Samples in MedFeatures are...
Definition MedSamples.cpp:382
int write_to_file(const string &fname, int pred_precision=-1, bool print_attributes=true)
Write to text file in new format.
Definition MedSamples.cpp:703
void get_categs(vector< float > &categs) const
Get a list of all categories (different values) appearing in the outcome.
Definition MedSamples.cpp:469
int read_from_file(const string &file_name, bool sort_rows=true)
Read from text file.
Definition MedSamples.cpp:540
void clear()
Clear data and init time_unit according to default.
Definition MedSamples.h:140
void get_preds(vector< float > &preds) const
Extract a single vector of concatenated predictions.
Definition MedSamples.cpp:429
void binary_dilute(float p0, float p1)
will dilute 0 labeled samples (traditionally controls) with p0, and all the rest with p1
Definition MedSamples.cpp:950
int get_predictions_size(int &nPreds)
Get predictions vector size. Return -1 if not-consistent.
Definition MedSamples.cpp:642
void get_y(vector< float > &y) const
Extract a vector of all outcomes.
Definition MedSamples.cpp:461
void sort_by_id_date()
Sort by id and then date.
Definition MedSamples.cpp:771
void insertRec(int pid, int time, float outcome, int outcomeTime)
APIs for online insertions: main use case is a single time point for prediction per pid.
Definition MedSamples.cpp:853
int time_unit
The time unit in which the samples are given. Default: Date.
Definition MedSamples.h:131
void append(MedSamples &newSamples)
Append new MedIdSamples at the end of current ones.
Definition MedSamples.h:161
MedSamples()
Constructor. init time_unit according to default.
Definition MedSamples.h:136
vector< MedIdSamples > idSamples
The vector of MedIdSamples.
Definition MedSamples.h:132
void split_train_test(MedSamples &train, MedSamples &test, float p_test)
Gets p_test and splits by id: p_test of the ids into test, and the rest into train
Definition MedSamples.cpp:1017
int get_all_attributes(vector< string > &attributes, vector< string > &str_attributes) const
Get all attributes. Return -1 if not-consistent.
Definition MedSamples.cpp:675
static const int Date
dates are in full regular format YYYYMMDD
Definition MedTime.h:25
Definition SerializableObject.h:32
virtual int write_to_file(const string &fname)
serialize model and write to file
Definition SerializableObject.cpp:92
virtual int read_from_file(const string &fname)
read and deserialize model
Definition SerializableObject.cpp:86
process(fname, allow_type)
Definition lint.py:152