Medial Code Documentation
Loading...
Searching...
No Matches
MedConvert.h
1//
2// a few tools to help converting data files into a repository
3//
4
5#ifndef __MED_CONVERT_H__
6#define __MED_CONVERT_H__
7
8#include "InfraMed.h"
9#include "MedDictionary.h"
10#include "MedSignals.h"
11
12#include <vector>
13#include <string>
14#include <fstream>
15
16using namespace std;
17
18#define MAX_COLLECTED_DATA_SIZE 56 // = previous sizeof(collected_data)
19
21public:
22 char buf[MAX_COLLECTED_DATA_SIZE];
23 /*
24 int type;
25 int date;
26 long long time ;
27 long long time2 ;
28 float val;
29 int date2;
30 short val1;
31 short val2;
32 short val3;
33 short val4;
34 long long longVal ;
35 float f_val2;
36 */
37 // next is a helper for speeding up parsing
38 int serial = -1;
39 //int idx = -1;
40 void zero() {
41 memset(buf, 0, sizeof(buf));
42 //type = 0; date = 0; val = 0; date2 = 0; time = 0; time2 = 0; longVal = 0; val1 = 0; val2 = 0; val3 = 0; val4 = 0; f_val2 = 0;
43 }
44};
45
46
47class pid_data {
48public:
49 int pid;
50 vector<vector<collected_data>> raw_data;
51};
52
/// Plain record describing one signal as seen by the converter.
///
/// Every field defaults to 0 so that a default-initialised object
/// (`sig_info s;`) is fully defined. 0 is the value that value-initialisation
/// (`sig_info{}`, `vector<sig_info>(n)`) already yielded, and brace
/// initialisation with explicit values keeps working.
struct sig_info {
	int sid = 0;     ///< signal id
	int serial = 0;  ///< serial number of the signal (presumably the index used by serial2sid / serial2siginfo - confirm)
	int fno = 0;     ///< presumably the file number the signal is mapped to (cf. sid2fno) - confirm
	int type = 0;    ///< signal type code; the meaning of the values is not visible in this header
};
59
/// Per-input-file bookkeeping: the file name, an id and a set of line counters.
struct file_stat {
	std::string fname;        ///< file name; empty until assigned
	int id;                   ///< id of the file; -1 until assigned
	int n_lines;              ///< line counter, starts at 0
	int n_relevant_lines;     ///< relevant-line counter, starts at 0
	int n_parsed_lines;       ///< parsed-line counter, starts at 0
	int n_bad_format_lines;   ///< bad-format-line counter, starts at 0

	/// Starts with an empty name, id == -1 and all counters at zero.
	file_stat()
		: fname(""),
		  id(-1),
		  n_lines(0),
		  n_relevant_lines(0),
		  n_parsed_lines(0),
		  n_bad_format_lines(0)
	{
	}
};
70
72public:
73 int mode; // 0/1 - original mode (currently default) 2 - new mode (data and index file for each signal)
74 int safe_mode; // 0/1 - in safe_mode==1 loading will exit in several inconsistencies
75 string rep_files_prefix; // general prefix for all files created in mode 2
76
77 string config_fname;
78
79 string path;
80 string out_path;
81 string code_to_signal_fname; // format: signal_code, signal_name
82 vector<string> dict_fnames;
83 vector<string> sig_fnames;
84 string signal_to_files_fname;
85 string prefixes_fname;
86 int relative; // if 1 - we will put "." in dir in .repository in case of OUTDIR
87 int default_time_unit;
90
91 // next files should be sorted by pid
92 string registry_fname; // format: pid , date, location(string), stage (number) // tab delimited
93 //string demographic_fname; // format: pid , birth year, M/F
94 vector<string> in_data_fnames; // format: pid , signal_code, date/time , value
95 vector<string> in_strings_data_fnames; // format: pid , signale_code, date/time ,
96 vector<string> forced; // signal names of signals that each id MUST have (like GENDER and/or BYEAR , etc), ids without those will not be loaded into the repository
97 vector<string> load_only; // loading signals only from this list and leave others as is, usefull for fixes and updates
98 // note that for efficiency it is recommended to have in the data file only the files needed for forced and load_only
99 // the default is an empty load_only, which means load all possible signals in the sig file.
100 vector<int> sids_to_load;
101
102 //running parameters for load:
104 int read_lines_buffer = 100000;
109 double max_bad_line_ratio = 0.05;
110 double min_parsed_line_ratio = 0.01;
111 bool verbose_open_files = false;
112 bool run_parallel = true;
113 bool run_parallel_files = false;
115
116 void init_load_params(const string &init_str);
117
118
119
120 // outputs
121 string repository_config_fname;
122 vector<string> prefix_names;
123 vector<string> index_fnames;
124 vector<string> data_fnames;
125 string description;
126
127 // next are for debug and statistics
128 vector<file_stat> fstats;
129 map<string, int> missing_forced_signals;
130
131 // internal variables
132 unordered_map<string, string> codes2names;
134 MedSignals sigs;
135 map<int, int> sid2fno;
136 unordered_map<int, int> sid2serial;
137 vector<int> serial2sid;
138 vector<sig_info> serial2siginfo;
139
140 vector<int> pid_in_file;
141
142 vector<IndexTable> indexes;
143
144 void clear();
145
146 MedConvert(const string &prefix) { rep_files_prefix = prefix; }
147 MedConvert() { rep_files_prefix = "rep"; }
148
149 // main entry points
150 int create_rep(const string &config_fname, int _mode) { mode = _mode; return read_all(config_fname); }
151 int create_rep(const string &config_fname) { return create_rep(config_fname, 1); }
152
153 // configuration and preparations
154 int read_config(const string &fname);
155 int read_code_to_signal(const string &fname);
156 int read_prefix_names(const string &fname);
157 int read_signal_to_files(const string &fname);
158
159 // mode 2 related
160 int generate_prefix_names();
161
162 // general prep function
163 int read_all(const string &config_fname);
164
165 int n_open_in_files;
166 // actually reading data and creating index and data files
167 void collect_lines(vector<string> &lines, vector<int> &f_i, int file_i, vector<string> &buffered_lines, int &buffer_pos, ifstream &inf, int file_type, pid_data &curr, int &fpid, file_stat& curr_fstat, map<pair<string, string>, int>&);
168 void get_next_signal_all_lines(vector<string> &lines, vector<int> &f_i, pid_data &curr, vector<file_stat> &fstat, map<pair<string, string>, int>&);
169 void parse_fields_into_gsv(string &curr_line, vector<string> &fields, int sid, GenericSigVec &cd_sv);
170 int create_indexes();
171 int create_repository_config();
172 int create_signals_config();
173
174 // legacy
175 //void get_next_signal(vector<string> &buffered_lines, int &buffer_pos, ifstream &inf, int file_type, pid_data &curr, int &fpid, file_stat& curr_fstat, map<pair<string, string>, int>&);
176 //int write_indexes(pid_data &curr);
177 //void get_next_signal_new_modes(vector<string> &buffered_lines, int &buffer_pos, ifstream &inf, int file_type, pid_data &curr, int &fpid, file_stat& curr_fstat, map<pair<string, string>, int>&);
178
179
180 // output files related
181 ofstream signals_config_f;
182 ofstream repository_config_f;
183 vector<ofstream *> index_f;
184 vector<ofstream *> data_f;
185 vector<unsigned long long> data_f_pos;
186 int open_indexes();
187 int write_all_indexes(vector<int> &all_pids);
188 int write_indexes_new_modes(pid_data &curr);
189 int close_indexes();
190
191 // loading subsets
192 int prep_sids_to_load();
193private:
195 void test_for_load_error(const map<pair<string, string>, int> &missing_dict_vals, int n_pids_extracted, bool final_test
196 ,int prev_total_missings, int &total_missing, const map<string, int> &prev_missing_forced_signals) const;
197
198 ofstream err_log_file;
199};
200
201#endif
Definition MedSignals.h:915
Definition MedConvert.h:71
int allowed_missing_pids_from_forced_cnt
how many pids are allowed to be missing in forced signals. 0 means no limit
Definition MedConvert.h:107
int allowed_unknown_catgory_cnt
how many unknown categories are allowed
Definition MedConvert.h:106
bool run_parallel
If true will load in parallel.
Definition MedConvert.h:112
double max_bad_line_ratio
maximal ratio for bad lines in file
Definition MedConvert.h:109
double allowed_missing_pids_from_forced_ratio
how many pids are allowed to be missing in forced signals. 0 means no limit
Definition MedConvert.h:108
string full_error_file
provide full path to error file
Definition MedConvert.h:114
bool run_parallel_files
If true will read files in parallel.
Definition MedConvert.h:113
string registry_fname
internal representation for all dates.
Definition MedConvert.h:92
bool verbose_open_files
If true will print when opening files.
Definition MedConvert.h:111
double min_parsed_line_ratio
minimal ratio for parsed lines in file
Definition MedConvert.h:110
int read_lines_buffer
how many lines to read for a file
Definition MedConvert.h:104
int check_for_error_pid_cnt
after how many pids to check for error. If 0 only at the end
Definition MedConvert.h:103
int test_run_max_pids
If bigger than 1 - will run in dry run till that number of pids.
Definition MedConvert.h:105
Definition MedDictionary.h:87
Definition MedSignals.h:719
Definition MedConvert.h:20
Definition MedConvert.h:47
Definition StdDeque.h:58
Definition MedConvert.h:60
Definition MedConvert.h:53