Medial Code Documentation
Loading...
Searching...
No Matches
DataLoader.h
1#ifndef __DATALOADER_H
2#define __DATALOADER_H
3
4#define AM_DLL_IMPORT
5
6#include <AlgoMarker/AlgoMarker/AlgoMarker.h>
7#include <AlgoMarker/DynAMWrapper/DynAMWrapper.h>
8#include <AlgoMarker/CommonTestingTools/CommonTestingTools.h>
11#include <MedProcessTools/MedProcessTools/MedSamples.h>
12#include <MedIO/MedIO/MedIO.h>
13#include <boost/property_tree/ptree.hpp>
14#include <boost/property_tree/json_parser.hpp>
15#include <boost/algorithm/string/predicate.hpp>
16#include <boost/algorithm/string.hpp>
17#include <algorithm>
18
19#define LOCAL_SECTION LOG_APP
20#define LOCAL_LEVEL LOG_DEF_LEVEL
21
22namespace CommonTestingTools {
23
24 class DataLoader {
25 public:
26 static const int base_pid;
27
28 MedModel model;
29 MedSamples samples;
31 vector<int> pids;
32 vector<string> sigs;
33 map<int, MedIdSamples* > pid2samples;
34 map<string, vector<map<int, string> > > sig_dict_cached;
35
36 void load(const string& rep_fname, const string& model_fname, const string& samples_fname = "", bool read_signals = true);
37
38 void get_sig_dict_cached(const string& cat_prefix = "", bool force_cat_prefix = false) {
39 sig_dict_cached = get_sig_dict(cat_prefix, force_cat_prefix);
40 }
41
42 map<string, vector<map<int, string> > > get_sig_dict(const string& cat_prefix = "", bool force_cat_prefix = false) {
43 map<string, vector<map<int, string> > > sig_dict;
44 for (auto& sig : sigs) {
45 vector<map<int, string > > chan_dict;
46 int section_id = rep.dict.section_id(sig);
47 int sid = rep.sigs.Name2Sid[sig];
48 int n_vchan = rep.sigs.Sid2Info[sid].n_val_channels;
49 for (int vchan = 0; vchan < n_vchan; ++vchan) {
50 if (rep.sigs.is_categorical_channel(sig, vchan)) {
51 map<int, string> new_dict;
52 const auto& Id2Names = rep.dict.dict(section_id)->Id2Names;
53 const auto& Member2Sets = rep.dict.dict(section_id)->Member2Sets;
54 for (const auto& entry : Id2Names) {
55 if (boost::starts_with(entry.second[0], cat_prefix)) {
56 new_dict[entry.first] = entry.second[0];
57 continue;
58 }
59 string new_ent = entry.second[0];
60 if (Member2Sets.count(entry.first) != 0)
61 for (const auto& setid : Member2Sets.at(entry.first)) {
62 if (Id2Names.count(setid) != 0 && boost::starts_with(Id2Names.at(setid)[0], cat_prefix)) {
63 if (!boost::starts_with(new_ent, cat_prefix) || new_ent.length() > Id2Names.at(setid)[0].length())
64 new_ent = Id2Names.at(setid)[0];
65 }
66 }
67 if (!force_cat_prefix || boost::starts_with(new_ent, cat_prefix))
68 new_dict[entry.first] = new_ent;
69 }
70
71 chan_dict.push_back(new_dict);
72 }
73 else chan_dict.push_back(map<int, string>());
74 }
75 sig_dict[sig] = chan_dict;
76 }
77 return sig_dict;
78 }
79
80 map<string, vector<map<string, int>* > > get_sig_reverse_dict() {
81 map<string, vector<map<string, int >* > > sig_dict;
82 MLOG("(II) Preparing signal reverse dictionary for signals\n");
83 for (auto& sig : sigs) {
84 //MLOG("(II) Preparing signal dictionary for signal '%s'\n", sig.c_str());
85 vector<map<string, int >* > chan_dict;
86 if (rep.sigs.Name2Sid.count(sig) == 0) {
87 MERR("no Name2Sid entry for signal '%s'\n", sig.c_str());
88 exit(-1);
89 }
90 int section_id = rep.dict.section_id(sig);
91 int sid = rep.sigs.Name2Sid[sig];
92 int n_vchan = rep.sigs.Sid2Info[sid].n_val_channels;
93 for (int vchan = 0; vchan < n_vchan; ++vchan) {
94 if (rep.sigs.is_categorical_channel(sig, vchan))
95 {
96 chan_dict.push_back(&(rep.dict.dict(section_id)->Name2Id));
97 }
98 else {
99 chan_dict.push_back(nullptr);
100 }
101 }
102 sig_dict[sig] = chan_dict;
103 }
104 return sig_dict;
105 }
106
107 void export_required_data(const string& fname, const string& cat_prefix, bool force_cat_prefix) {
108 ofstream outfile(fname, ios::binary | ios::out);
109
110 MLOG("(II) Preparing dictinaries to export\n", fname.c_str());
111
112 auto sig_dict = get_sig_dict(cat_prefix, force_cat_prefix);
113
114 MLOG("(II) Exporting required data to %s\n", fname.c_str());
115
116 UniversalSigVec usv;
117
118 for (int pid : pids) {
119 for (auto &sig : sigs) {
120 rep.uget(pid, sig, usv);
121 for (int i = 0; i < usv.len; ++i) {
122 stringstream outss;
123 outss << pid << '\t';
124 outss << sig;
125 for (int tchan = 0, n_tchan = usv.n_time_channels(); tchan < n_tchan; ++tchan) {
126 outss << '\t' << usv.Time(i, tchan);
127 }
128 bool ignore_line = false;
129 for (int vchan = 0, n_vchan = usv.n_val_channels(); vchan < n_vchan; ++vchan) {
130 if (sig_dict.at(sig)[vchan].size() == 0)
131 outss << '\t' << setprecision(10) << usv.Val(i, vchan);
132 else {
133 if (sig_dict.at(sig)[vchan].count((int)(usv.Val(i, vchan))) != 0) {
134 outss << '\t' << sig_dict.at(sig)[vchan].at((int)(usv.Val(i, vchan)));
135 }
136 else {
137 ignore_line = true;
138 }
139 }
140 }
141 if (!ignore_line)
142 outfile << outss.str() << '\n';
143 }
144 }
145 }
146 outfile.close();
147 }
148
149 static void convert_reqfile_to_data(const string& input_json_fname, const string& output_data_fname) {
150 ofstream outfile(output_data_fname, ios::binary | ios::out);
151 ifstream infile(input_json_fname, ios::binary | ios::in);
152
153 MLOG("(II) Exporting required data to %s\n", output_data_fname.c_str());
154
155 json j;
156 infile >> j;
157
158 MLOG("(II) num of requests = %d\n", j.size());
159
160 for (int pid = 0; pid < j.size(); ++pid) {
161 json j_req_signals;
162 if (j[pid].count("body") != 0)
163 j_req_signals = j[pid]["body"]["signals"];
164 else if (j[pid].count("signals") != 0)
165 j_req_signals = j[pid]["signals"];
166 else throw runtime_error("Unrecognized JSON fromat");
167
168 for (const auto& j_sig : j_req_signals)
169 {
170 string sig = j_sig["code"];
171 for (const auto& j_data : j_sig["data"]) {
172 outfile << pid + base_pid << '\t';
173 outfile << sig;
174 for (const auto& j_time : j_data["timestamp"]) {
175 outfile << '\t' << j_time;
176 }
177 for (const auto& j_val : j_data["value"]) {
178 if (boost::to_upper_copy(sig) == "GENDER")
179 outfile << '\t' << (boost::to_upper_copy(j_val.get<string>()) == "MALE" ? "1" : "2");
180 else
181 outfile << '\t' << j_val.get<string>();
182 }
183
184 outfile << "\n";
185 }
186
187 }
188 }
189 outfile.close();
190 }
191
192
193
194 void import_required_data(const string& fname);
195
196 void import_json_request_data(const string& fname);
197
198 int load_samples_from_dates_to_score(const string& fname)
199 {
200 // read scores file
201 vector<vector<string>> raw_scores;
202 if (read_text_file_cols(fname, " \t", raw_scores) < 0) {
203 MERR("Could not read scores file %s\n", fname.c_str());
204 return -1;
205 }
206 MLOG("(II) Read %d lines from scores file %s\n", raw_scores.size(), fname.c_str());
207
208 // prepare MedSamples
209 for (auto &v : raw_scores)
210 if (v.size() >= 2) {
211 samples.insertRec(stoi(v[0]), stoi(v[1]));
212 }
213 samples.normalize();
214 MLOG("(II) Prepared MedSamples\n");
215 for (auto &id : samples.idSamples)
216 pid2samples[id.id] = &id;
217 return 0;
218 }
219
220 void am_add_data(AlgoMarker *am, int pid, int max_date, bool force_add_data, vector<string> ignore_sig, json& json_out);
221
222 };
223
224}
225#endif // __DATALOADER_H
Logger.h - allowing logs with more control.
#define MLOG(fmt,...)
MLOG() - use LOCAL_SECTION and LOCAL_LEVEL.
Definition Logger.h:145
#define MERR(fmt,...)
MERR() - use LOCAL_SECTION , always print.
Definition Logger.h:151
Definition AlgoMarker.h:272
Definition DataLoader.h:24
A model = repCleaner + featureGenerator + featureProcessor + MedPredictor.
Definition MedModel.h:56
Definition MedPidRepository.h:87
MedSamples represents a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:129