Medial Code Documentation
Loading...
Searching...
No Matches
text_reader.h
1#ifndef LIGHTGBM_UTILS_TEXT_READER_H_
2#define LIGHTGBM_UTILS_TEXT_READER_H_
3
4#include <LightGBM/utils/pipeline_reader.h>
5#include <LightGBM/utils/log.h>
6#include <LightGBM/utils/random.h>
7
8#include <cstdio>
9#include <sstream>
10
11#include <vector>
12#include <string>
13#include <functional>
14
15namespace LightGBM {
16
20template<typename INDEX_T>
22public:
28 TextReader(const char* filename, bool is_skip_first_line):
29 filename_(filename), is_skip_first_line_(is_skip_first_line) {
30 if (is_skip_first_line_) {
31 auto reader = VirtualFileReader::Make(filename);
32 if (!reader->Init()) {
33 Log::Fatal("Could not open %s", filename);
34 }
35 std::stringstream str_buf;
36 char read_c;
37 size_t nread = reader->Read(&read_c, 1);
38 while (nread == 1) {
39 if (read_c == '\n' || read_c == '\r') {
40 break;
41 }
42 str_buf << read_c;
43 ++skip_bytes_;
44 nread = reader->Read(&read_c, 1);
45 }
46 if (read_c == '\r') {
47 reader->Read(&read_c, 1);
48 ++skip_bytes_;
49 }
50 if (read_c == '\n') {
51 reader->Read(&read_c, 1);
52 ++skip_bytes_;
53 }
54 first_line_ = str_buf.str();
55 Log::Debug("Skipped header \"%s\" in file %s", first_line_.c_str(), filename_);
56 }
57 }
62 Clear();
63 }
67 inline void Clear() {
68 lines_.clear();
69 lines_.shrink_to_fit();
70 }
74 inline std::string first_line() {
75 return first_line_;
76 }
81 inline std::vector<std::string>& Lines() { return lines_; }
82
83 INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
84 last_line_ = "";
85 INDEX_T total_cnt = 0;
86 PipelineReader::Read(filename_, skip_bytes_,
87 [&]
88 (const char* buffer_process, size_t read_cnt) {
89 size_t cnt = 0;
90 size_t i = 0;
91 size_t last_i = 0;
92 // skip the break between \r and \n
93 if (last_line_.size() == 0 && buffer_process[0] == '\n') {
94 i = 1;
95 last_i = i;
96 }
97 while (i < read_cnt) {
98 if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
99 if (last_line_.size() > 0) {
100 last_line_.append(buffer_process + last_i, i - last_i);
101 process_fun(total_cnt, last_line_.c_str(), last_line_.size());
102 last_line_ = "";
103 }
104 else {
105 process_fun(total_cnt, buffer_process + last_i, i - last_i);
106 }
107 ++cnt;
108 ++i;
109 ++total_cnt;
110 // skip end of line
111 while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
112 last_i = i;
113 }
114 else {
115 ++i;
116 }
117 }
118 if (last_i != read_cnt) {
119 last_line_.append(buffer_process + last_i, read_cnt - last_i);
120 }
121 return cnt;
122 });
123 // if last line of file doesn't contain end of line
124 if (last_line_.size() > 0) {
125 Log::Info("Warning: last line of %s has no end of line, still using this line", filename_);
126 process_fun(total_cnt, last_line_.c_str(), last_line_.size());
127 ++total_cnt;
128 last_line_ = "";
129 }
130 return total_cnt;
131 }
132
137 INDEX_T ReadAllLines() {
138 return ReadAllAndProcess(
139 [=](INDEX_T, const char* buffer, size_t size) {
140 lines_.emplace_back(buffer, size);
141 });
142 }
143
144 std::vector<char> ReadContent(size_t* out_len) {
145 std::vector<char> ret;
146 *out_len = 0;
147 auto reader = VirtualFileReader::Make(filename_);
148 if (!reader->Init()) {
149 return ret;
150 }
151 const size_t buffer_size = 16 * 1024 * 1024;
152 auto buffer_read = std::vector<char>(buffer_size);
153 size_t read_cnt = 0;
154 do {
155 read_cnt = reader->Read(buffer_read.data(), buffer_size);
156 ret.insert(ret.end(), buffer_read.begin(), buffer_read.begin() + read_cnt);
157 *out_len += read_cnt;
158 } while (read_cnt > 0);
159 return ret;
160 }
161
162 INDEX_T SampleFromFile(Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
163 INDEX_T cur_sample_cnt = 0;
164 return ReadAllAndProcess(
165 [&]
166 (INDEX_T line_idx, const char* buffer, size_t size) {
167 if (cur_sample_cnt < sample_cnt) {
168 out_sampled_data->emplace_back(buffer, size);
169 ++cur_sample_cnt;
170 }
171 else {
172 const size_t idx = static_cast<size_t>(random.NextInt(0, static_cast<int>(line_idx + 1)));
173 if (idx < static_cast<size_t>(sample_cnt)) {
174 out_sampled_data->operator[](idx) = std::string(buffer, size);
175 }
176 }
177 });
178 }
185 INDEX_T ReadAndFilterLines(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices) {
186 out_used_data_indices->clear();
187 INDEX_T total_cnt = ReadAllAndProcess(
188 [&]
189 (INDEX_T line_idx , const char* buffer, size_t size) {
190 bool is_used = filter_fun(line_idx);
191 if (is_used) { out_used_data_indices->push_back(line_idx); }
192 if (is_used) { lines_.emplace_back(buffer, size); }
193 });
194 return total_cnt;
195 }
196
197 INDEX_T SampleAndFilterFromFile(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices,
198 Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
199 INDEX_T cur_sample_cnt = 0;
200 out_used_data_indices->clear();
201 INDEX_T total_cnt = ReadAllAndProcess(
202 [&]
203 (INDEX_T line_idx, const char* buffer, size_t size) {
204 bool is_used = filter_fun(line_idx);
205 if (is_used) { out_used_data_indices->push_back(line_idx); }
206 if (is_used) {
207 if (cur_sample_cnt < sample_cnt) {
208 out_sampled_data->emplace_back(buffer, size);
209 ++cur_sample_cnt;
210 }
211 else {
212 const size_t idx = static_cast<size_t>(random.NextInt(0, static_cast<int>(out_used_data_indices->size())));
213 if (idx < static_cast<size_t>(sample_cnt)) {
214 out_sampled_data->operator[](idx) = std::string(buffer, size);
215 }
216 }
217 }
218 });
219 return total_cnt;
220 }
221
222 INDEX_T CountLine() {
223 return ReadAllAndProcess(
224 [=](INDEX_T, const char*, size_t) {
225 });
226 }
227
228 INDEX_T ReadAllAndProcessParallelWithFilter(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun, const std::function<bool(INDEX_T, INDEX_T)>& filter_fun) {
229 last_line_ = "";
230 INDEX_T total_cnt = 0;
231 INDEX_T used_cnt = 0;
232 PipelineReader::Read(filename_, skip_bytes_,
233 [&]
234 (const char* buffer_process, size_t read_cnt) {
235 size_t cnt = 0;
236 size_t i = 0;
237 size_t last_i = 0;
238 INDEX_T start_idx = used_cnt;
239 // skip the break between \r and \n
240 if (last_line_.size() == 0 && buffer_process[0] == '\n') {
241 i = 1;
242 last_i = i;
243 }
244 while (i < read_cnt) {
245 if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
246 if (last_line_.size() > 0) {
247 last_line_.append(buffer_process + last_i, i - last_i);
248 if (filter_fun(used_cnt, total_cnt)) {
249 lines_.push_back(last_line_);
250 ++used_cnt;
251 }
252 last_line_ = "";
253 }
254 else {
255 if (filter_fun(used_cnt, total_cnt)) {
256 lines_.emplace_back(buffer_process + last_i, i - last_i);
257 ++used_cnt;
258 }
259 }
260 ++cnt;
261 ++i;
262 ++total_cnt;
263 // skip end of line
264 while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
265 last_i = i;
266 }
267 else {
268 ++i;
269 }
270 }
271 process_fun(start_idx, lines_);
272 lines_.clear();
273 if (last_i != read_cnt) {
274 last_line_.append(buffer_process + last_i, read_cnt - last_i);
275 }
276 return cnt;
277 });
278 // if last line of file doesn't contain end of line
279 if (last_line_.size() > 0) {
280 Log::Info("Warning: last line of %s has no end of line, still using this line", filename_);
281 if (filter_fun(used_cnt, total_cnt)) {
282 lines_.push_back(last_line_);
283 process_fun(used_cnt, lines_);
284 }
285 lines_.clear();
286 ++total_cnt;
287 ++used_cnt;
288 last_line_ = "";
289 }
290 return total_cnt;
291 }
292
293 INDEX_T ReadAllAndProcessParallel(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
294 return ReadAllAndProcessParallelWithFilter(process_fun, [](INDEX_T, INDEX_T) { return true; });
295 }
296
297 INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
298 return ReadAllAndProcessParallelWithFilter(process_fun,
299 [&used_data_indices](INDEX_T used_cnt, INDEX_T total_cnt) {
300 if (static_cast<size_t>(used_cnt) < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) {
301 return true;
302 }
303 else {
304 return false;
305 }
306 });
307 }
308
309private:
/*! \brief Path of the file to read; this is the pointer given to the constructor, the string itself is not copied */
311 const char* filename_;
/*! \brief Cache of lines; returned by Lines(), released by Clear(), also used as the batch buffer of the parallel readers */
313 std::vector<std::string> lines_;
/*! \brief Unfinished line carried over from one read buffer to the next */
315 std::string last_line_;
/*! \brief First line of the file, filled by the constructor when the first line is skipped */
317 std::string first_line_ = "";
/*! \brief True when the first line of the file is skipped */
319 bool is_skip_first_line_ = false;
/*! \brief Number of bytes at the start of the file that PipelineReader::Read is asked to skip */
321 int skip_bytes_ = 0;
322};
323
324} // namespace LightGBM
325
326#endif // LIGHTGBM_UTILS_TEXT_READER_H_
static size_t Read(const char *filename, int skip_bytes, const std::function< size_t(const char *, size_t)> &process_fun)
Read data from a file using the pipeline method.
Definition pipeline_reader.h:27
A wrapper for random generator.
Definition random.h:15
int NextInt(int lower_bound, int upper_bound)
Generate random integer, int32 range.
Definition random.h:48
Read text data from file.
Definition text_reader.h:21
void Clear()
Clear cached data.
Definition text_reader.h:67
std::vector< std::string > & Lines()
Get the text data that was read from the file.
Definition text_reader.h:81
TextReader(const char *filename, bool is_skip_first_line)
Constructor.
Definition text_reader.h:28
INDEX_T ReadAndFilterLines(const std::function< bool(INDEX_T)> &filter_fun, std::vector< INDEX_T > *out_used_data_indices)
Read part of the text data from the file into memory, using filter_fun to select the lines to keep.
Definition text_reader.h:185
INDEX_T ReadAllLines()
Read all text data from the file into memory.
Definition text_reader.h:137
~TextReader()
Destructor.
Definition text_reader.h:61
std::string first_line()
Return the first line of the data.
Definition text_reader.h:74
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
static std::unique_ptr< VirtualFileReader > Make(const std::string &filename)
Create appropriate reader for filename.
Definition file_io.cpp:153