7#ifndef DMLC_DATA_TEXT_PARSER_H_
8#define DMLC_DATA_TEXT_PARSER_H_
27template <
typename IndexType,
typename DType = real_t>
32 : bytes_read_(0), source_(source) {
33 int maxthread = std::max(omp_get_num_procs() / 2 - 4, 1);
34 nthread_ = std::min(maxthread, nthread);
55 virtual void ParseBlock(
const char *begin,
const char *end,
70 for (; bptr != begin; --bptr) {
71 if (*bptr ==
'\n' || *bptr ==
'\r')
81 static inline void IgnoreUTF8BOM(
const char **begin,
const char **end) {
83 for (count = 0; *begin != *end && count < 3; count++, ++*begin) {
84 if (!begin || !*begin)
86 if (**begin !=
'\xEF' && count == 0)
88 if (**begin !=
'\xBB' && count == 1)
90 if (**begin !=
'\xBF' && count == 2)
109template <
typename IndexType,
typename DType>
113 if (!source_->NextChunk(&chunk))
return false;
114 const int nthread = this->nthread_;
116 data->resize(nthread);
117 bytes_read_ += chunk.
size;
118 CHECK_NE(chunk.
size, 0U);
119 const char *head =
reinterpret_cast<char *
>(chunk.
dptr);
121 std::vector<std::thread> threads;
122 for (
int tid = 0; tid < nthread; ++tid) {
123 threads.push_back(std::thread([&chunk, head, data, nthread, tid,
this] {
124 this->omp_exc_.Run([&] {
125 size_t nstep = (chunk.
size + nthread - 1) / nthread;
126 size_t sbegin = std::min(tid * nstep, chunk.
size);
127 size_t send = std::min((tid + 1) * nstep, chunk.
size);
128 const char *pbegin = BackFindEndLine(head + sbegin, head);
130 if (tid + 1 == nthread) {
133 pend = BackFindEndLine(head + send, head);
135 ParseBlock(pbegin, pend, &(*data)[tid]);
139 for (
int i = 0; i < nthread; ++i) {
OMP exception class that catches, saves, and rethrows exceptions from OMP blocks.
Definition common.h:53
base class for parser to parse data
Definition parser.h:24
Text parser that parses the input lines and returns rows in input data.
Definition text_parser.h:28
virtual void ParseBlock(const char *begin, const char *end, RowBlockContainer< IndexType, DType > *out)=0
parse data into out
static void IgnoreUTF8BOM(const char **begin, const char **end)
Ignore UTF-8 BOM if present.
Definition text_parser.h:81
virtual bool ParseNext(std::vector< RowBlockContainer< IndexType, DType > > *data)
read in the next several blocks of data
Definition text_parser.h:45
virtual size_t BytesRead(void) const
Definition text_parser.h:42
virtual void BeforeFirst(void)
set the position to before the first item
Definition text_parser.h:39
bool FillData(std::vector< RowBlockContainer< IndexType, DType > > *data)
read in the next several blocks of data
Definition text_parser.h:110
static const char * BackFindEndLine(const char *bptr, const char *begin)
start from bptr, go backward, and find the first end of line
Definition text_parser.h:69
defines common input data structures and the interface for handling the input data
namespace for dmlc
Definition array_view.h:12
header to handle OpenMP compatibility issues
additional data structure to support RowBlock data structure
dynamic data structure that holds a row block of data
Definition row_block.h:27
defines some common utility function.