// Constructor: stores the file name pointer and the skip-first-line flag. When the
// flag is set, the visible code reads the file one character at a time and saves
// the captured text of the first line in first_line_.
// NOTE(review): this listing carries embedded line numbers with gaps (31, 34, 36, ...),
// so several statements, including the creation of reader, are not shown here.
28 TextReader(
const char* filename,
bool is_skip_first_line):
29 filename_(filename), is_skip_first_line_(is_skip_first_line) {
// The file is only touched here when the caller asked to skip the first line.
30 if (is_skip_first_line_) {
// Report a fatal error through Log::Fatal when the reader cannot be initialized.
32 if (!reader->Init()) {
33 Log::Fatal(
"Could not open %s", filename);
// Buffer that the first line is collected into (see the str() call below).
35 std::stringstream str_buf;
// Read a single character; nread receives the value returned by Read().
37 size_t nread = reader->Read(&read_c, 1);
// Both LF and CR are recognized as line terminators.
39 if (read_c ==
'\n' || read_c ==
'\r') {
44 nread = reader->Read(&read_c, 1);
// NOTE(review): the results of the next two reads are discarded; the conditions
// that guard them are not visible in this listing. TODO confirm against the full file.
47 reader->Read(&read_c, 1);
51 reader->Read(&read_c, 1);
// Store the collected text in first_line_ and log it at debug level.
54 first_line_ = str_buf.str();
55 Log::Debug(
"Skipped header \"%s\" in file %s", first_line_.c_str(), filename_);
69 lines_.shrink_to_fit();
// Accessor for the cached lines. Returns a non-const reference to lines_, so the
// caller can modify the cache in place.
81 inline std::vector<std::string>&
Lines() {
return lines_; }
// Splits the incoming byte buffers into lines and hands every line to process_fun.
// process_fun is invoked as (total_cnt, pointer to the line bytes, line length).
// NOTE(review): the listing omits lines (else branches, counter increments, the
// return statement); presumably total_cnt is advanced per line and returned. TODO confirm.
83 INDEX_T ReadAllAndProcess(
const std::function<
void(INDEX_T,
const char*,
size_t)>& process_fun) {
// Passed to process_fun as the index of the line being delivered.
85 INDEX_T total_cnt = 0;
// Parameter list of a callback that receives one buffer and its byte count; the
// lambda introducer and the call it is passed to are not visible here.
88 (
const char* buffer_process,
size_t read_cnt) {
// Special case: no carried-over text and the buffer starts with LF. The branch
// body is not shown; presumably it skips the LF that completes a CR LF pair split
// across two buffers. TODO confirm.
93 if (last_line_.size() == 0 && buffer_process[0] ==
'\n') {
// Scan the buffer for line terminators (LF or CR).
97 while (i < read_cnt) {
98 if (buffer_process[i] ==
'\n' || buffer_process[i] ==
'\r') {
// A partial line was carried over from the previous buffer: append the bytes
// in [last_i, i) to it and deliver the joined line.
99 if (last_line_.size() > 0) {
100 last_line_.append(buffer_process + last_i, i - last_i);
101 process_fun(total_cnt, last_line_.c_str(), last_line_.size());
// Delivers the bytes in [last_i, i) straight from the buffer (presumably the else
// branch of the check above; the else line itself is not shown).
105 process_fun(total_cnt, buffer_process + last_i, i - last_i);
// Skip a run of consecutive terminators.
// NOTE(review): buffer_process[i] is evaluated before the i < read_cnt check, so
// when i equals read_cnt the element one past the counted bytes is read. Testing
// i < read_cnt first would avoid that read.
111 while ((buffer_process[i] ==
'\n' || buffer_process[i] ==
'\r') && i < read_cnt) { ++i; }
// Bytes after the last terminator form an incomplete line; keep them in last_line_.
118 if (last_i != read_cnt) {
119 last_line_.append(buffer_process + last_i, read_cnt - last_i);
// A non-empty last_line_ here is logged as a line without an end of line and is
// still delivered to process_fun (presumably after all buffers were consumed).
124 if (last_line_.size() > 0) {
125 Log::Info(
"Warning: last line of %s has no end of line, still using this line", filename_);
126 process_fun(total_cnt, last_line_.c_str(), last_line_.size());
// Fragment whose enclosing signature is not shown in this listing (presumably
// ReadAllLines; TODO confirm). It returns the result of ReadAllAndProcess, giving it
// a callback that ignores the line index and appends each line to lines_.
138 return ReadAllAndProcess(
// The first parameter is unnamed because it is not used.
139 [=](INDEX_T,
const char* buffer,
size_t size) {
// Construct the std::string in place from the pointer and length.
140 lines_.emplace_back(buffer, size);
// Reads bytes through reader in fixed-size pieces and collects them in a
// std::vector<char>; the byte counts returned by Read() are added to *out_len.
// NOTE(review): in the visible lines *out_len is only incremented; confirm that it
// is set to zero before the loop (that line is not shown in this listing).
144 std::vector<char> ReadContent(
size_t* out_len) {
145 std::vector<char> ret;
// Handling of a reader that fails to initialize; the branch body is not shown.
148 if (!reader->Init()) {
// Read in pieces of 16 * 1024 * 1024 bytes.
151 const size_t buffer_size = 16 * 1024 * 1024;
152 auto buffer_read = std::vector<char>(buffer_size);
// Append only the read_cnt bytes that Read() reported, not the whole buffer.
155 read_cnt = reader->Read(buffer_read.data(), buffer_size);
156 ret.insert(ret.end(), buffer_read.begin(), buffer_read.begin() + read_cnt);
157 *out_len += read_cnt;
158 }
// Keep going until a read reports zero bytes.
while (read_cnt > 0);
// Runs ReadAllAndProcess over the file while keeping a sample of lines in
// *out_sampled_data, and returns the value ReadAllAndProcess returns.
// NOTE(review): the lambda introducer and the update of cur_sample_cnt are not
// shown in this listing.
162 INDEX_T SampleFromFile(Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
163 INDEX_T cur_sample_cnt = 0;
164 return ReadAllAndProcess(
166 (INDEX_T line_idx,
const char* buffer,
size_t size) {
// While fewer than sample_cnt lines are held, every line is appended.
167 if (cur_sample_cnt < sample_cnt) {
168 out_sampled_data->emplace_back(buffer, size);
// Draw an index with random.NextInt(0, line_idx + 1); the exact bounds semantics
// of NextInt are defined elsewhere. TODO confirm whether the upper bound is exclusive.
172 const size_t idx =
static_cast<size_t>(random.NextInt(0,
static_cast<int>(line_idx + 1)));
// A draw below sample_cnt overwrites the sample stored at that position.
173 if (idx <
static_cast<size_t>(sample_cnt)) {
174 out_sampled_data->operator[](idx) = std::string(buffer, size);
// Reads the file line by line and keeps only the lines accepted by filter_fun.
// The index of each accepted line is appended to *out_used_data_indices and its
// text is appended to lines_. The output vector is cleared first.
// NOTE(review): the return statement is not shown; presumably total_cnt is returned.
185 INDEX_T
ReadAndFilterLines(
const std::function<
bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices) {
186 out_used_data_indices->clear();
187 INDEX_T total_cnt = ReadAllAndProcess(
189 (INDEX_T line_idx ,
const char* buffer,
size_t size) {
// filter_fun is called exactly once per line; its answer drives both updates below.
190 bool is_used = filter_fun(line_idx);
191 if (is_used) { out_used_data_indices->push_back(line_idx); }
192 if (is_used) { lines_.emplace_back(buffer, size); }
// Combines filtering and sampling in one pass: indices of lines accepted by
// filter_fun are recorded in *out_used_data_indices, and a sample of lines is kept
// in *out_sampled_data.
// NOTE(review): lines are missing between the filter check and the sampling code,
// so whether sampling is restricted to accepted lines cannot be seen here
// (presumably it is). TODO confirm. The return statement is not shown either.
197 INDEX_T SampleAndFilterFromFile(
const std::function<
bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices,
198 Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
199 INDEX_T cur_sample_cnt = 0;
// Start from an empty index list.
200 out_used_data_indices->clear();
201 INDEX_T total_cnt = ReadAllAndProcess(
203 (INDEX_T line_idx,
const char* buffer,
size_t size) {
204 bool is_used = filter_fun(line_idx);
205 if (is_used) { out_used_data_indices->push_back(line_idx); }
// While fewer than sample_cnt lines are held, the line is appended to the sample.
207 if (cur_sample_cnt < sample_cnt) {
208 out_sampled_data->emplace_back(buffer, size);
// Draw an index with random.NextInt(0, number of accepted lines so far).
212 const size_t idx =
static_cast<size_t>(random.
NextInt(0,
static_cast<int>(out_used_data_indices->size())));
// A draw below sample_cnt overwrites the sample stored at that position.
213 if (idx <
static_cast<size_t>(sample_cnt)) {
214 out_sampled_data->operator[](idx) = std::string(buffer, size);
// Returns the value produced by ReadAllAndProcess when it is given a callback that
// names none of its arguments, so the line contents are not used. The callback body
// is not shown in this listing.
222 INDEX_T CountLine() {
223 return ReadAllAndProcess(
224 [=](INDEX_T,
const char*,
size_t) {
// Splits incoming buffers into lines, stores the lines accepted by filter_fun in
// lines_, and hands the collected batch to process_fun.
// filter_fun is called as (used_cnt, total_cnt); process_fun is called as
// (index of the first line of the batch, the batch of lines).
// NOTE(review): counter increments, clearing of lines_ between batches and the
// return statement are not shown in this listing. TODO confirm against the full file.
228 INDEX_T ReadAllAndProcessParallelWithFilter(
const std::function<
void(INDEX_T,
const std::vector<std::string>&)>& process_fun,
const std::function<
bool(INDEX_T, INDEX_T)>& filter_fun) {
// total_cnt and used_cnt are the two values handed to filter_fun.
230 INDEX_T total_cnt = 0;
231 INDEX_T used_cnt = 0;
// Parameter list of a callback that receives one buffer and its byte count; the
// lambda introducer and the call it is passed to are not visible here.
234 (
const char* buffer_process,
size_t read_cnt) {
// Remember the value of used_cnt at the start of this buffer; it is passed to
// process_fun as the start index of the batch.
238 INDEX_T start_idx = used_cnt;
// Special case: no carried-over text and the buffer starts with LF. The branch
// body is not shown.
240 if (last_line_.size() == 0 && buffer_process[0] ==
'\n') {
// Scan the buffer for line terminators (LF or CR).
244 while (i < read_cnt) {
245 if (buffer_process[i] ==
'\n' || buffer_process[i] ==
'\r') {
// A partial line was carried over: complete it with the bytes in [last_i, i)
// and store a copy when the filter accepts it.
246 if (last_line_.size() > 0) {
247 last_line_.append(buffer_process + last_i, i - last_i);
248 if (filter_fun(used_cnt, total_cnt)) {
249 lines_.push_back(last_line_);
// Line lies fully inside this buffer: build it in place from [last_i, i).
255 if (filter_fun(used_cnt, total_cnt)) {
256 lines_.emplace_back(buffer_process + last_i, i - last_i);
// Skip a run of consecutive terminators.
// NOTE(review): buffer_process[i] is evaluated before the i < read_cnt check, so
// when i equals read_cnt the element one past the counted bytes is read. Testing
// i < read_cnt first would avoid that read.
264 while ((buffer_process[i] ==
'\n' || buffer_process[i] ==
'\r') && i < read_cnt) { ++i; }
// Hand the lines collected from this buffer to the consumer.
271 process_fun(start_idx, lines_);
// Bytes after the last terminator form an incomplete line; keep them in last_line_.
273 if (last_i != read_cnt) {
274 last_line_.append(buffer_process + last_i, read_cnt - last_i);
// A non-empty last_line_ here is logged as a line without an end of line; when the
// filter accepts it, it is stored and process_fun is called with used_cnt.
279 if (last_line_.size() > 0) {
280 Log::Info(
"Warning: last line of %s has no end of line, still using this line", filename_);
281 if (filter_fun(used_cnt, total_cnt)) {
282 lines_.push_back(last_line_);
283 process_fun(used_cnt, lines_);
// Convenience wrapper: forwards process_fun to ReadAllAndProcessParallelWithFilter
// together with a filter that returns true for every line, and returns its result.
293 INDEX_T ReadAllAndProcessParallel(
const std::function<
void(INDEX_T,
const std::vector<std::string>&)>& process_fun) {
// The filter ignores both of its arguments.
294 return ReadAllAndProcessParallelWithFilter(process_fun, [](INDEX_T, INDEX_T) {
return true; });
// Forwards process_fun to ReadAllAndProcessParallelWithFilter with a filter built
// from used_data_indices, and returns its result.
// NOTE(review): the body of the filter after the condition is not shown in this
// listing; presumably it returns true when the condition holds and false otherwise.
297 INDEX_T ReadPartAndProcessParallel(
const std::vector<INDEX_T>& used_data_indices,
const std::function<
void(INDEX_T,
const std::vector<std::string>&)>& process_fun) {
298 return ReadAllAndProcessParallelWithFilter(process_fun,
// used_data_indices is captured by reference, so no copy of the vector is made.
299 [&used_data_indices](INDEX_T used_cnt, INDEX_T total_cnt) {
// The size check comes first so that used_data_indices[used_cnt] is only evaluated
// for a valid position; the line matches when total_cnt equals that entry.
300 if (
static_cast<size_t>(used_cnt) < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) {
// File name as passed to the constructor. Only the pointer is stored, not a copy
// of the characters, and it is still used later in log messages.
311 const char* filename_;
// Lines collected by the reading functions; exposed through Lines().
313 std::vector<std::string> lines_;
// Holds the incomplete trailing line of one buffer until the next buffer arrives.
315 std::string last_line_;
// First line captured by the constructor when skipping is enabled; empty by default.
317 std::string first_line_ =
"";
// Whether the constructor should read and set aside the first line; off by default.
319 bool is_skip_first_line_ =
false;
Read text data from file.
Definition text_reader.h:21
void Clear()
Clear cached data.
Definition text_reader.h:67
std::vector< std::string > & Lines()
Get the text data that was read from the file.
Definition text_reader.h:81
TextReader(const char *filename, bool is_skip_first_line)
Constructor.
Definition text_reader.h:28
INDEX_T ReadAndFilterLines(const std::function< bool(INDEX_T)> &filter_fun, std::vector< INDEX_T > *out_used_data_indices)
Read part of the text data from the file into memory, using filter_fun to filter the data.
Definition text_reader.h:185
INDEX_T ReadAllLines()
Read all text data from the file into memory.
Definition text_reader.h:137
~TextReader()
Destructor.
Definition text_reader.h:61
std::string first_line()
Return the first line of the data.
Definition text_reader.h:74