Medial Code Documentation
Loading...
Searching...
No Matches
DataLoader.h
1#ifndef __DATALOADER_H
2#define __DATALOADER_H
3
4#define AM_DLL_IMPORT
5
6#include <AlgoMarker/AlgoMarker/AlgoMarker.h>
7#include <AlgoMarker/DynAMWrapper/DynAMWrapper.h>
8#include <AlgoMarker/CommonTestingTools/CommonTestingTools.h>
11#include <MedProcessTools/MedProcessTools/MedSamples.h>
12#include <MedIO/MedIO/MedIO.h>
13#include <boost/property_tree/ptree.hpp>
14#include <boost/property_tree/json_parser.hpp>
15#include <boost/algorithm/string/predicate.hpp>
16#include <boost/algorithm/string.hpp>
17#include <algorithm>
18
19#undef LOCAL_SECTION
20#define LOCAL_SECTION LOG_APP
21#define LOCAL_LEVEL LOG_DEF_LEVEL
22
23namespace CommonTestingTools {
24
25 class DataLoader {
26 public:
27 static const int base_pid;
28
29 MedModel model;
30 MedSamples samples;
32 vector<int> pids;
33 vector<string> sigs;
34 map<int, MedIdSamples* > pid2samples;
35 map<string, vector<map<int, string> > > sig_dict_cached;
36
37 void load(const string& rep_fname, const string& model_fname, const string& samples_fname = "", bool read_signals = true);
38
39 void get_sig_dict_cached(const string& cat_prefix = "", bool force_cat_prefix = false) {
40 sig_dict_cached = get_sig_dict(cat_prefix, force_cat_prefix);
41 }
42
43 map<string, vector<map<int, string> > > get_sig_dict(const string& cat_prefix = "", bool force_cat_prefix = false) {
44 map<string, vector<map<int, string> > > sig_dict;
45 for (auto& sig : sigs) {
46 vector<map<int, string > > chan_dict;
47 int section_id = rep.dict.section_id(sig);
48 int sid = rep.sigs.Name2Sid[sig];
49 int n_vchan = rep.sigs.Sid2Info[sid].n_val_channels;
50 for (int vchan = 0; vchan < n_vchan; ++vchan) {
51 if (rep.sigs.is_categorical_channel(sig, vchan)) {
52 map<int, string> new_dict;
53 const auto& Id2Names = rep.dict.dict(section_id)->Id2Names;
54 const auto& Member2Sets = rep.dict.dict(section_id)->Member2Sets;
55 for (const auto& entry : Id2Names) {
56 if (boost::starts_with(entry.second[0], cat_prefix)) {
57 new_dict[entry.first] = entry.second[0];
58 continue;
59 }
60 string new_ent = entry.second[0];
61 if (Member2Sets.count(entry.first) != 0)
62 for (const auto& setid : Member2Sets.at(entry.first)) {
63 if (Id2Names.count(setid) != 0 && boost::starts_with(Id2Names.at(setid)[0], cat_prefix)) {
64 if (!boost::starts_with(new_ent, cat_prefix) || new_ent.length() > Id2Names.at(setid)[0].length())
65 new_ent = Id2Names.at(setid)[0];
66 }
67 }
68 if (!force_cat_prefix || boost::starts_with(new_ent, cat_prefix))
69 new_dict[entry.first] = new_ent;
70 }
71
72 chan_dict.push_back(new_dict);
73 }
74 else chan_dict.push_back(map<int, string>());
75 }
76 sig_dict[sig] = chan_dict;
77 }
78 return sig_dict;
79 }
80
81 map<string, vector<map<string, int>* > > get_sig_reverse_dict() {
82 map<string, vector<map<string, int >* > > sig_dict;
83 MLOG("(II) Preparing signal reverse dictionary for signals\n");
84 for (auto& sig : sigs) {
85 //MLOG("(II) Preparing signal dictionary for signal '%s'\n", sig.c_str());
86 vector<map<string, int >* > chan_dict;
87 if (rep.sigs.Name2Sid.count(sig) == 0) {
88 MERR("no Name2Sid entry for signal '%s'\n", sig.c_str());
89 exit(-1);
90 }
91 int section_id = rep.dict.section_id(sig);
92 int sid = rep.sigs.Name2Sid[sig];
93 int n_vchan = rep.sigs.Sid2Info[sid].n_val_channels;
94 for (int vchan = 0; vchan < n_vchan; ++vchan) {
95 if (rep.sigs.is_categorical_channel(sig, vchan))
96 {
97 chan_dict.push_back(&(rep.dict.dict(section_id)->Name2Id));
98 }
99 else {
100 chan_dict.push_back(nullptr);
101 }
102 }
103 sig_dict[sig] = chan_dict;
104 }
105 return sig_dict;
106 }
107
108 void export_required_data(const string& fname, const string& cat_prefix, bool force_cat_prefix) {
109 ofstream outfile(fname, ios::binary | ios::out);
110
111 MLOG("(II) Preparing dictinaries to export\n", fname.c_str());
112
113 auto sig_dict = get_sig_dict(cat_prefix, force_cat_prefix);
114
115 MLOG("(II) Exporting required data to %s\n", fname.c_str());
116
117 UniversalSigVec usv;
118
119 for (int pid : pids) {
120 for (auto &sig : sigs) {
121 rep.uget(pid, sig, usv);
122 for (int i = 0; i < usv.len; ++i) {
123 stringstream outss;
124 outss << pid << '\t';
125 outss << sig;
126 for (int tchan = 0, n_tchan = usv.n_time_channels(); tchan < n_tchan; ++tchan) {
127 outss << '\t' << usv.Time(i, tchan);
128 }
129 bool ignore_line = false;
130 for (int vchan = 0, n_vchan = usv.n_val_channels(); vchan < n_vchan; ++vchan) {
131 if (sig_dict.at(sig)[vchan].size() == 0)
132 outss << '\t' << setprecision(10) << usv.Val(i, vchan);
133 else {
134 if (sig_dict.at(sig)[vchan].count((int)(usv.Val(i, vchan))) != 0) {
135 outss << '\t' << sig_dict.at(sig)[vchan].at((int)(usv.Val(i, vchan)));
136 }
137 else {
138 ignore_line = true;
139 }
140 }
141 }
142 if (!ignore_line)
143 outfile << outss.str() << '\n';
144 }
145 }
146 }
147 outfile.close();
148 }
149
150 static void convert_reqfile_to_data(const string& input_json_fname, const string& output_data_fname) {
151 ofstream outfile(output_data_fname, ios::binary | ios::out);
152 ifstream infile(input_json_fname, ios::binary | ios::in);
153
154 MLOG("(II) Exporting required data to %s\n", output_data_fname.c_str());
155
156 json j;
157 infile >> j;
158
159 MLOG("(II) num of requests = %d\n", j.size());
160
161 for (int pid = 0; pid < j.size(); ++pid) {
162 json j_req_signals;
163 if (j[pid].count("body") != 0)
164 j_req_signals = j[pid]["body"]["signals"];
165 else if (j[pid].count("signals") != 0)
166 j_req_signals = j[pid]["signals"];
167 else throw runtime_error("Unrecognized JSON fromat");
168
169 for (const auto& j_sig : j_req_signals)
170 {
171 string sig = j_sig["code"];
172 for (const auto& j_data : j_sig["data"]) {
173 outfile << pid + base_pid << '\t';
174 outfile << sig;
175 for (const auto& j_time : j_data["timestamp"]) {
176 outfile << '\t' << j_time;
177 }
178 for (const auto& j_val : j_data["value"]) {
179 if (boost::to_upper_copy(sig) == "GENDER")
180 outfile << '\t' << (boost::to_upper_copy(j_val.get<string>()) == "MALE" ? "1" : "2");
181 else
182 outfile << '\t' << j_val.get<string>();
183 }
184
185 outfile << "\n";
186 }
187
188 }
189 }
190 outfile.close();
191 }
192
193
194
195 void import_required_data(const string& fname);
196
197 void import_json_request_data(const string& fname);
198
199 int load_samples_from_dates_to_score(const string& fname)
200 {
201 // read scores file
202 vector<vector<string>> raw_scores;
203 if (read_text_file_cols(fname, " \t", raw_scores) < 0) {
204 MERR("Could not read scores file %s\n", fname.c_str());
205 return -1;
206 }
207 MLOG("(II) Read %d lines from scores file %s\n", raw_scores.size(), fname.c_str());
208
209 // prepare MedSamples
210 for (auto &v : raw_scores)
211 if (v.size() >= 2) {
212 samples.insertRec(stoi(v[0]), stoi(v[1]));
213 }
214 samples.normalize();
215 MLOG("(II) Prepared MedSamples\n");
216 for (auto &id : samples.idSamples)
217 pid2samples[id.id] = &id;
218 return 0;
219 }
220
221 void am_add_data(AlgoMarker *am, int pid, int max_date, bool force_add_data, vector<string> ignore_sig, json& json_out);
222
223 };
224
225}
226#endif // __DATALOADER_H
Logger.h - allowing logs with more control.
#define MLOG(fmt,...)
MLOG() - use LOCAL_SECTION and LOCAL_LEVEL.
Definition Logger.h:145
#define MERR(fmt,...)
MERR() - use LOCAL_SECTION , always print.
Definition Logger.h:151
Definition AlgoMarker.h:272
Definition DataLoader.h:25
A model = repCleaner + featureGenerator + featureProcessor + MedPredictor.
Definition MedModel.h:56
Definition MedPidRepository.h:87
MedSamples represents a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:131