Medial Code Documentation
Loading...
Searching...
No Matches
csv_parser.h
Go to the documentation of this file.
1
7#ifndef DMLC_DATA_CSV_PARSER_H_
8#define DMLC_DATA_CSV_PARSER_H_
9
10#include <dmlc/data.h>
11#include <dmlc/strtonum.h>
12#include <dmlc/parameter.h>
13#include <cmath>
14#include <cstring>
15#include <map>
16#include <string>
17#include <limits>
18#include "./row_block.h"
19#include "./text_parser.h"
20
21namespace dmlc {
22namespace data {
23
24struct CSVParserParam : public Parameter<CSVParserParam> {
25 std::string format;
26 int label_column;
27 std::string delimiter;
28 int weight_column;
29 // declare parameters
30 DMLC_DECLARE_PARAMETER(CSVParserParam) {
31 DMLC_DECLARE_FIELD(format).set_default("csv")
32 .describe("File format.");
33 DMLC_DECLARE_FIELD(label_column).set_default(-1)
34 .describe("Column index (0-based) that will put into label.");
35 DMLC_DECLARE_FIELD(delimiter).set_default(",")
36 .describe("Delimiter used in the csv file.");
37 DMLC_DECLARE_FIELD(weight_column).set_default(-1)
38 .describe("Column index that will put into instance weights.");
39 }
40};
41
42
50template <typename IndexType, typename DType = real_t>
51class CSVParser : public TextParserBase<IndexType, DType> {
52 public:
53 explicit CSVParser(InputSplit *source,
54 const std::map<std::string, std::string>& args,
55 int nthread)
56 : TextParserBase<IndexType, DType>(source, nthread) {
57 param_.Init(args);
58 CHECK_EQ(param_.format, "csv");
59 CHECK(param_.label_column != param_.weight_column
60 || param_.label_column < 0)
61 << "Must have distinct columns for labels and instance weights";
62 }
63
64 protected:
65 virtual void ParseBlock(const char *begin,
66 const char *end,
68
69 private:
70 CSVParserParam param_;
71};
72
73template <typename IndexType, typename DType>
75ParseBlock(const char *begin,
76 const char *end,
78 out->Clear();
79 const char * lbegin = begin;
80 const char * lend = lbegin;
81 // advance lbegin if it points to newlines
82 while ((lbegin != end) && (*lbegin == '\n' || *lbegin == '\r')) ++lbegin;
83 while (lbegin != end) {
84 // get line end
85 this->IgnoreUTF8BOM(&lbegin, &end);
86 lend = lbegin + 1;
87 while (lend != end && *lend != '\n' && *lend != '\r') ++lend;
88
89 const char* p = lbegin;
90 int column_index = 0;
91 IndexType idx = 0;
92 real_t weight = std::numeric_limits<real_t>::quiet_NaN();
93
94 while (p != lend) {
95 char *endptr;
96 DType v;
97 // if DType is float32
98 if (std::is_same<DType, real_t>::value) {
99 v = strtof(p, &endptr);
100 // If DType is int32
101 } else if (std::is_same<DType, int32_t>::value) {
102 v = static_cast<int32_t>(strtoll(p, &endptr, 0));
103 // If DType is int64
104 } else if (std::is_same<DType, int64_t>::value) {
105 v = static_cast<int64_t>(strtoll(p, &endptr, 0));
106 // If DType is all other types
107 } else {
108 LOG(FATAL) << "Only float32, int32, and int64 are supported for the time being";
109 }
110
111 if (column_index == param_.label_column) {
112 out->label.push_back(v);
113 } else if (std::is_same<DType, real_t>::value
114 && column_index == param_.weight_column) {
115 weight = v;
116 } else {
117 if (std::distance(p, static_cast<char const*>(endptr)) != 0) {
118 out->value.push_back(v);
119 out->index.push_back(idx++);
120 } else {
121 idx++;
122 }
123 }
124 p = (endptr >= lend) ? lend : endptr;
125 ++column_index;
126 while (*p != param_.delimiter[0] && p != lend) ++p;
127 if (p == lend && idx == 0) {
128 LOG(FATAL) << "Delimiter \'" << param_.delimiter << "\' is not found in the line. "
129 << "Expected \'" << param_.delimiter
130 << "\' as the delimiter to separate fields.";
131 }
132 if (p != lend) ++p;
133 }
134 // skip empty line
135 while ((*lend == '\n' || *lend == '\r') && lend != end) ++lend;
136 lbegin = lend;
137 if (!std::isnan(weight)) {
138 out->weight.push_back(weight);
139 }
140 out->offset.push_back(out->index.size());
141 }
142 CHECK(out->label.size() == 0 || out->label.size() + 1 == out->offset.size());
143 CHECK(out->weight.size() == 0 || out->weight.size() + 1 == out->offset.size());
144}
145} // namespace data
146} // namespace dmlc
147#endif // DMLC_DATA_CSV_PARSER_H_
Input split: allows reading of records from a split of the data, an independent part that covers a...
Definition io.h:155
CSVParser, parses a dense csv format. All columns are treated as real dense data. Label will be empty...
Definition csv_parser.h:51
virtual void ParseBlock(const char *begin, const char *end, RowBlockContainer< IndexType, DType > *out)
parse data into out
Definition csv_parser.h:75
Text parser that parses the input lines and returns rows in input data.
Definition text_parser.h:28
defines common input data structure, and interface for handling the input data
Provide lightweight util to do parameter setup and checking.
namespace for dmlc
Definition array_view.h:12
float strtof(const char *nptr, char **endptr)
A faster implementation of strtof(). See documentation of std::strtof() for more information....
Definition strtonum.h:268
float real_t
this defines the float point that will be used to store feature values
Definition data.h:26
additional data structure to support RowBlock data structure
A faster implementation of strtof and strtod.
Definition csv_parser.h:24
dynamic data structure that holds a row block of data
Definition row_block.h:27
iterator parser to parse text format