Medial Code Documentation
Loading...
Searching...
No Matches
text_parser.h
Go to the documentation of this file.
1
7#ifndef DMLC_DATA_TEXT_PARSER_H_
8#define DMLC_DATA_TEXT_PARSER_H_
9
10#include <dmlc/data.h>
11#include <dmlc/omp.h>
12#include <dmlc/common.h>
13#include <thread>
14#include <mutex>
15#include <vector>
16#include <cstring>
17#include <algorithm>
18#include "./row_block.h"
19#include "./parser.h"
20
21namespace dmlc {
22namespace data {
27template <typename IndexType, typename DType = real_t>
28class TextParserBase : public ParserImpl<IndexType, DType> {
29 public:
30 explicit TextParserBase(InputSplit *source,
31 int nthread)
32 : bytes_read_(0), source_(source) {
33 int maxthread = std::max(omp_get_num_procs() / 2 - 4, 1);
34 nthread_ = std::min(maxthread, nthread);
35 }
36 virtual ~TextParserBase() {
37 delete source_;
38 }
39 virtual void BeforeFirst(void) {
40 source_->BeforeFirst();
41 }
42 virtual size_t BytesRead(void) const {
43 return bytes_read_;
44 }
45 virtual bool ParseNext(std::vector<RowBlockContainer<IndexType, DType> > *data) {
46 return FillData(data);
47 }
48
49 protected:
55 virtual void ParseBlock(const char *begin, const char *end,
62 inline bool FillData(std::vector<RowBlockContainer<IndexType, DType>> *data);
/*!
 * \brief start from bptr, walk backward and find the first end-of-line character
 * \param bptr position to start scanning from (this byte is inspected)
 * \param begin lower bound of the scan; the byte at begin is never inspected
 * \return pointer to the first '\n' or '\r' met while walking back from bptr,
 *         or begin when none is found before reaching it
 */
static inline const char *BackFindEndLine(const char *bptr, const char *begin) {
  const char *cursor = bptr;
  while (cursor != begin) {
    const char ch = *cursor;
    if (ch == '\n' || ch == '\r') {
      return cursor;
    }
    --cursor;
  }
  return begin;
}
/*!
 * \brief Ignore UTF-8 BOM if present.
 *
 * If the range [*begin, *end) starts with the three bytes EF BB BF, *begin is
 * advanced past them. In every other case (partial BOM, range shorter than
 * three bytes, no BOM) *begin is left unchanged. *end is never modified.
 *
 * \param begin address of the pointer to the start of the text
 * \param end address of the pointer to the end of the text
 */
static inline void IgnoreUTF8BOM(const char **begin, const char **end) {
  // Validate the pointers before touching them; a null argument is a no-op.
  if (begin == nullptr || end == nullptr || *begin == nullptr) {
    return;
  }
  const char kBom[] = {'\xEF', '\xBB', '\xBF'};
  const char *cursor = *begin;
  int matched = 0;
  // Stop at the end of the range, at the first mismatch, or after 3 bytes.
  while (matched < 3 && cursor != *end && *cursor == kBom[matched]) {
    ++cursor;
    ++matched;
  }
  // Only commit the advance when the complete marker was seen.
  if (matched == 3) {
    *begin = cursor;
  }
}
96
97 private:
98 // nthread
99 int nthread_;
100 // number of bytes read
101 size_t bytes_read_;
102 // source split that provides the data
103 InputSplit *source_;
104 // OMPException object to catch and rethrow exceptions in omp blocks
105 dmlc::OMPException omp_exc_;
106};
107
108// implementation
109template <typename IndexType, typename DType>
111 std::vector<RowBlockContainer<IndexType, DType> > *data) {
112 InputSplit::Blob chunk;
113 if (!source_->NextChunk(&chunk)) return false;
114 const int nthread = this->nthread_;
115 // reserve space for data
116 data->resize(nthread);
117 bytes_read_ += chunk.size;
118 CHECK_NE(chunk.size, 0U);
119 const char *head = reinterpret_cast<char *>(chunk.dptr);
120
121 std::vector<std::thread> threads;
122 for (int tid = 0; tid < nthread; ++tid) {
123 threads.push_back(std::thread([&chunk, head, data, nthread, tid, this] {
124 this->omp_exc_.Run([&] {
125 size_t nstep = (chunk.size + nthread - 1) / nthread;
126 size_t sbegin = std::min(tid * nstep, chunk.size);
127 size_t send = std::min((tid + 1) * nstep, chunk.size);
128 const char *pbegin = BackFindEndLine(head + sbegin, head);
129 const char *pend;
130 if (tid + 1 == nthread) {
131 pend = head + send;
132 } else {
133 pend = BackFindEndLine(head + send, head);
134 }
135 ParseBlock(pbegin, pend, &(*data)[tid]);
136 });
137 }));
138 }
139 for (int i = 0; i < nthread; ++i) {
140 threads[i].join();
141 }
142 omp_exc_.Rethrow();
143
144 this->data_ptr_ = 0;
145 return true;
146}
147
148} // namespace data
149} // namespace dmlc
150#endif // DMLC_DATA_TEXT_PARSER_H_
input split creates that allows reading of records from split of data, independent part that covers a...
Definition io.h:155
virtual void BeforeFirst(void)=0
reset the position of InputSplit to beginning
OMP Exception class catches, saves and rethrows exception from OMP blocks.
Definition common.h:53
base class for parser to parse data
Definition parser.h:24
Text parser that parses the input lines and returns rows in input data.
Definition text_parser.h:28
virtual void ParseBlock(const char *begin, const char *end, RowBlockContainer< IndexType, DType > *out)=0
parse data into out
static void IgnoreUTF8BOM(const char **begin, const char **end)
Ignore UTF-8 BOM if present.
Definition text_parser.h:81
virtual bool ParseNext(std::vector< RowBlockContainer< IndexType, DType > > *data)
read in next several blocks of data
Definition text_parser.h:45
virtual size_t BytesRead(void) const
Definition text_parser.h:42
virtual void BeforeFirst(void)
reset the position to before the first item
Definition text_parser.h:39
bool FillData(std::vector< RowBlockContainer< IndexType, DType > > *data)
read in next several blocks of data
Definition text_parser.h:110
static const char * BackFindEndLine(const char *bptr, const char *begin)
start from bptr, go backward and find the first end-of-line character
Definition text_parser.h:69
defines common input data structure, and interface for handling the input data
namespace for dmlc
Definition array_view.h:12
header to handle OpenMP compatibility issues
additional data structure to support RowBlock data structure
a blob of memory region
Definition io.h:158
size_t size
size of the memory region
Definition io.h:162
void * dptr
points to start of the memory region
Definition io.h:160
dynamic data structure that holds a row block of data
Definition row_block.h:27
defines some common utility function.