Medial Code Documentation
Loading...
Searching...
No Matches
MedEmbed.h
1#pragma once
2//
3// MedEmbed :
4// Contains a set of structures and tools to allow for preparing a training of an embedding,
5// and using it to create features by generating the sparse categorial line and running it
6// through a pre trained model (currently we use keras models)
7//
8
12#include <InfraMed/InfraMed/MedPidRepository.h>
13#include <MedSparseMat/MedSparseMat/MedSparseMat.h>
14
15#include <unordered_map>
16#include <unordered_set>
17
18enum EmbeddedCodeType { ECTYPE_CATEGORIAL = 0, ECTYPE_CONTINUOUS, ECTYPE_AGE, ECTYPE_DUMMY , ECTYPE_MODEL, ECTYPE_UNDEFINED}; // type tag held by the 'type' member declared below; enumerators are numbered 0..5 in declaration order
19
20//======================================================================================
21// needed info on each signal participating
22// containing all the information needed to create the codes
23//======================================================================================
25
26public:
27 string sig;
28 EmbeddedCodeType type = ECTYPE_CATEGORIAL;
29 int add_hierarchy = 1;
30 int do_shrink = 1; // keeping the bit of whether to shrink this one when shrinking is done.
31 int do_counts = 1; // collecting data as counts if =1 , or just as 0/1 if =0
32
33 // next is used for categorial signals. It holds the string names of the categories we want to embed.
34 // initialization is with the categories= parameter , it is a comma (,) separated list of names (best initialize with the "list:" option)
35 vector<string> categories_to_embed;
36 string regex_filter = "";
37
38 // for categorial: ranges to use (empty is all). The sequence is:
39 // a value and all its hierarchy (if asked for) are created. Then only those within the asked for ranges are actually added and used.
40
41 // for non categorial: defines value ranges r[i][0] <= val < r[i][1] means it will be counted into original category i
42 // This allows creating categorial features out of continuous signals
43 vector<vector<float>> ranges;
44
45 int time_chan = 0;
46 int val_chan = 0;
47 int win_from = 0;
48 int win_to = 365;
49 int sig_time_unit = global_default_time_unit;
50 int win_time_unit = global_default_windows_time_unit;
51
52 // next are for the model type
53 // a model can be initiated with a model file (you have to pretrain it) .
54 // It is highly recommended to use models creating matrices that are imputed AND normalized.
55 // the features generated will be copied into the sparse matrix
56 // features generated like this are never shrunk
57 MedModel *model = NULL; // only object needed for serializations
58 string model_file = "";
59 vector<string> model_req_sigs;
60 vector<string> model_features_names;
61 vector<float *> feat_ptrs; // after matrix was created, holds pointers to each model feature for faster access
62 map<pair<int, int>, int> pidtime2idx; // since order in batch prep is not necessarily the same as the one in the generation, we need this mapping
63
64
65 // for categorials only : all sets of a value : we need to build this table before each apply using the given dictionary (and the known categories)
66 unordered_map<int, vector<int>> sig_members2sets;
67
68 // for categorials: after limiting to sets in range only ( = orig values)
69 unordered_map<int, vector<int>> sig_members2sets_in_range;
70
71 // orig to code and name
72 map<string, int> Name2Id; // Only relevant for categorial cases : keeping the sub Name2Id table that was used to build the Orig2X tables.
73 // when using a new different repository we need to translate the ids to these ones.
74 // this is done in the categ_convert table
75 vector<int> categ_convert; // relevant also for the shrunk case
76 map<int, int> Orig2Code;
77 map<int, string> Orig2Name;
78
79 // orig to shrunk code
80 map<int, int> Orig2ShrunkCode;
81
82 void clear_tables() { sig_members2sets.clear(); sig_members2sets_in_range.clear(); Name2Id.clear(); Orig2Code.clear(); Orig2Name.clear(); Orig2ShrunkCode.clear(); } // empties the six lookup tables named in the body; categ_convert is not touched
83
84 // simple API's
85
86 // appends the orig values to the given codes vector , returns number of elements added
87 int get_categ_orig(int val, vector<int> &codes) const;
88
89 // appends the codes to the given codes vector , returns number of elements added
90 int get_categ_codes(int val, vector<int> &codes, int use_shrink = 1) const;
91
92 // appends the shrunk codes to the given codes vector , returns number of elements added
93 int get_categ_shrunk_codes(int val, vector<int> &codes) const;
94
95 // appends the orig values to the given codes vector , returns number of elements added
96 int get_continuous_orig(float val) const;
97
98 // appends the codes to the given codes vector , returns number of elements added
99 int get_continuous_codes(float val, int use_shrink = 1) const;
100
101 // appends the shrunk codes to the given codes vector , returns number of elements added
102 int get_continuous_shrunk_codes(float val) const;
103
104 // helper and not needed to serialize params
105 int sid = -1;
106
107 // initialization from string
108 int init(map<string, string>& _map);
109
110
111 // initializing a categorial case : need to get a dictionary and init : Orig2Code, Orig2Name (see also init_categorial_tables)
112 // this is needed in order to make the embedding independent of the actual values given in the dictionary and rely on names only.
113 // This has the potential of allowing to transfer embeddings between different data sets, as long as they use the same signal names with the same category names in the dictionary.
114 int init_categorial(MedDictionarySections &dict, int &curr_code);
115
116 // the next is special for the categorial case:
117 // We need to initialize the Name2Id table (only if it is not empty !! , as it may be full from the original mapping that was used to build the Orig tables)
118 // once we have that table, we need to initialize the following tables:
119 // sig_members2sets, sig_members2sets_in_range, and also categ_convert
120 int init_categorial_tables(MedDictionarySections &dict);
121
122 // initialize a continuous or age case : preparing the Orig2X tables based on the given ranges.
123 int init_continous(int &curr_code);
124
125 // initialize a dummy case : simple constant variable always added to make sure we have at least one entry per sample (helps in some cases)
126 int init_dummy();
127
128
129 // actually collecting matrix lines
130 int add_sig_to_lines(UniversalSigVec &usv, int pid, int time, int use_shrink, map<int, map<int, float>> &out_lines) const;
131 int get_codes(UniversalSigVec &usv, int pid, int time, int use_shrink, vector<int> &codes) const;
132 int add_codes_to_line(vector<int> &codes, map<int, float> &out_line) const;
133 int add_to_line(UniversalSigVec &usv, int pid, int time, int use_shrink, map<int, float> &out_line) const;
134
135 // preparing a batch of model results (will also initialize the feat_ptr vector, and the pidtime2idx map)
136 int prep_model_batch(MedPidRepository &rep, MedSamples &samples);
137
138
139 EmbeddedCodeType type_name_to_code(string name);
140
141 string print_to_string(int verbosity);
142
143 // next can be used after shrinking was done
144 // it keeps the minimal structures needed in order to allow matrix creation and lower scheme file size.
145 int minimize();
146
147 ADD_CLASS_NAME(EmbeddingSig)
148 ADD_SERIALIZATION_FUNCS(sig, type, add_hierarchy, do_shrink, ranges, time_chan, val_chan, win_from, win_to, categories_to_embed, Name2Id, Orig2Code, Orig2Name, Orig2ShrunkCode, model) // NOTE(review): do_counts, regex_filter, sig_time_unit, win_time_unit and model_file are declared above but not listed here - confirm the omission is intentional
149};
150
151
152//============================================================================================================================
153// EmbedMatsCreator : major class for creating sparse embedding matrices for a given setup + list of times, window_lens, etc
154//============================================================================================================================
156
157public:
158 vector<string> sigs_to_load;
159
160 int rep_time_unit = MedTime::Date;
161 int win_time_unit = MedTime::Days;
162 int byear_time_unit = MedTime::Years;
163
164 vector<EmbeddingSig> embed_sigs; // containing all the information on each of the sigs to embed
165
166 // general high level operations
167
168 // prepare needs to be run before creating a matrix for the first time:
169 // (1) initializes the sigs_to_load vector
170 // (2) initializes the embed_sigs objects up to the pre shrinking stage
171 // When starting with a serialized object there's no need to call this one.
172 int prepare(MedPidRepository &rep);
173
174 // adding all the needed lines for a pid. Written for a dynamic record, to allow easy connection to MedProcessTools
175 int add_pid_lines(PidDynamicRec &pdr, MedSparseMat &smat, vector<int> &times, int use_shrink);
176 int get_pid_out_line(PidDynamicRec &pdr, int ver, int time, int use_shrink, map<int, float> &out_line);
177
178 // another api to generate a matrix given a list of pids and times, that promises the SAME order as in the given input
179 // the input pair vector has pids on first, and times on second
180 // works directly through the rep (not the PidDynamicRec path)
181 int get_sparse_mat(MedPidRepository &rep, vector<pair<int, int>> &pids_times, int use_shrink, MedSparseMat &smat);
182
183 // sometimes easier to use, BUT the ORDER of lines in the matrix is the order of normalized samples,
184 // this makes it a problem when needing to produce a matrix with a different order for lines.
185 int get_sparse_mat(MedPidRepository &rep, MedSamples &samples , int use_outcome_time, int use_shrink, MedSparseMat &smat);
186
187 // helper for es preparation
188 void prep_memebers_to_sets(MedPidRepository &rep, EmbeddingSig &es);
189
190 // shrinking calculation
191 // gets an smat that had been produced with the non shrinked dictionary,
192 // then selects the columns that will stay (es with do_shrink = 0, or those with at least min_p-max_p rows containing it).
193 int get_shrinked_dictionary(MedSparseMat &smat, float min_p, float max_p);
194
195 // apply shrinking to a given matrix
196 // (other better option is to build it with the use_shrink=1 flag)
197 int shrink_mat(MedSparseMat &smat, MedSparseMat &shrunk_smat);
198
199 // initialization from string
200 int init(map<string, string>& _map);
201
202 // needed before we start using the class on a specific rep, but AFTER params and embed_sigs were initialized.
203 void init_sids(MedSignals &sigs);
204
205 // next must be called after coming from serialization, at the moment we get hold of dict: runs init_categorial_tables(dict) on every entry of embed_sigs (the int return values are ignored).
206 void init_tables(MedDictionarySections &dict) { for (auto &es : embed_sigs) es.init_categorial_tables(dict); }
207
208
209 // API to write the dictionary to a file, to have a readable interpretation of the codes.
210 int write_dict_to_file(string fname, int only_shrink);
211
212
213 // printing object to string
214 string print_to_string(int verbosity);
215
216 // minimizing size of shrunk categorials for smaller scheme files: calls minimize() on every entry of embed_sigs, ignores their return values and always returns 0.
217 // if this is run before serialization one will only be able to create the shrunk version (which is what is needed...)
218 int minimize() { for (auto &es : embed_sigs) es.minimize(); return 0; };
219
220 // next is needed in order to allow for batch preparations of model es: calls prep_model_batch(rep, samples) on every entry of embed_sigs (the int return values are ignored).
221 void prep_models_batches(MedPidRepository &rep, MedSamples &samples) { for (auto &es : embed_sigs) es.prep_model_batch(rep, samples); }
222
223
224 ADD_CLASS_NAME(EmbedMatCreator)
225 ADD_SERIALIZATION_FUNCS(sigs_to_load, rep_time_unit, win_time_unit, byear_time_unit, embed_sigs)
226
227private:
228 // helpers
229 int curr_code = 1; // needed in codes allocation process
230
231};
232
233
234//============================================================================================================================
236 int pid;
237 int x_time;
238 int y_time;
239};
240
241
242//============================================================================================================================
243// train matrices creation class
244//============================================================================================================================
246{
247
248public:
249
250 // params
251 string x_params;
252 string y_params;
253 int use_same_dictionaries = 1; // if on : x,y must have the SAME es order, the same Orig2Code, Orig2Name in each, and we will copy
254 // the x shrinking dictionary to y.
255
256 // next params are to generate an xy list
257 int min_time = 20060101;
258 int max_time = 20160101;
259 int min_age = 30;
260 int max_age = 100;
261 int npoints_per_pid = 1;
262 float min_p = (float)0.001;
263 float max_p = (float)0.95;
264 vector<int> time_dist_range;
265 vector<int> time_dist_points ={ -365, 0, 365 };
266
267 // general technical params needed for production
268 int rep_time_unit = MedTime::Date;
269 int win_time_unit = MedTime::Days;
270 int byear_time_unit = MedTime::Years;
271 string prefix = "smat";
272
273 // matrices params
274 float p_train = (float)0.8;
275
276 EmbedMatCreator x_creator, y_creator;
277
278 // generate x,y matrices for a given xy-file, and write them to files (including dictionaries)
279 int generate_from_xy_file(string xy_fname, string rep_fname, string out_prefix);
280
281 // generate an xy list and write it to file, input is a list of pids and a repository
282 int generate_xy_list(string xy_fname, string pids_fname, string rep_fname);
283
284 // helpers: read/write a file of <pid> <xtime> <ytime> records
285 int read_xy_records(string xy_fname, vector<EmbedXYRecord> &xy);
286 int write_xy_records(string xy_fname, vector<EmbedXYRecord> &xy);
287
288 // init
289 int init(map<string, string>& _map);
290};
291
292
293
294
295//=================================================================
296// Joining the MedSerialize Wagon
297//=================================================================
298
301
302
MedTime.h.
An Abstract class that can be serialized and written/read from file.
#define ADD_SERIALIZATION_FUNCS(...)
Definition SerializableObject.h:122
#define MEDSERIALIZE_SUPPORT(Type)
Definition SerializableObject.h:108
Definition MedEmbed.h:155
int init(map< string, string > &_map)
Virtual to init object from parsed fields.
Definition MedEmbed.cpp:490
Definition MedEmbed.h:246
int init(map< string, string > &_map)
Virtual to init object from parsed fields.
Definition MedEmbed.cpp:880
Definition MedEmbed.h:24
int init(map< string, string > &_map)
Virtual to init object from parsed fields.
Definition MedEmbed.cpp:27
Definition MedDictionary.h:87
A model = repCleaner + featureGenerator + featureProcessor + MedPredictor.
Definition MedModel.h:56
Definition MedPidRepository.h:87
MedSamples represent a collection of samples per different id. The data is contained in a vector of ...
Definition MedSamples.h:129
Definition MedSignals.h:719
Definition MedSparseMat.h:19
static const int Days
days since 1900/01/01
Definition MedTime.h:28
static const int Years
years since 1900 (not since 0!)
Definition MedTime.h:26
static const int Date
dates are in full regular format YYYYMMDD
Definition MedTime.h:25
Definition MedPidRepository.h:127
Definition SerializableObject.h:32
Definition MedEmbed.h:235