Medial Code Documentation
Loading...
Searching...
No Matches
libsvm_parser.h
Go to the documentation of this file.
1
7#ifndef DMLC_DATA_LIBSVM_PARSER_H_
8#define DMLC_DATA_LIBSVM_PARSER_H_
9
10#include <dmlc/data.h>
11#include <dmlc/strtonum.h>
12#include <dmlc/parameter.h>
13#include <map>
14#include <string>
15#include <limits>
16#include <algorithm>
17#include <cstring>
18#include "./row_block.h"
19#include "./text_parser.h"
20
21namespace dmlc {
22namespace data {
23
24struct LibSVMParserParam : public Parameter<LibSVMParserParam> {
25 std::string format;
26 int indexing_mode;
27 // declare parameters
28 DMLC_DECLARE_PARAMETER(LibSVMParserParam) {
29 DMLC_DECLARE_FIELD(format).set_default("libsvm")
30 .describe("File format");
31 DMLC_DECLARE_FIELD(indexing_mode).set_default(0)
32 .describe(
33 "If >0, treat all feature indices as 1-based. "
34 "If =0, treat all feature indices as 0-based. "
35 "If <0, use heuristic to automatically detect mode of indexing. "
36 "See https://en.wikipedia.org/wiki/Array_data_type#Index_origin "
37 "for more details on indexing modes.");
38 }
39};
40
45template <typename IndexType, typename DType = real_t>
46class LibSVMParser : public TextParserBase<IndexType> {
47 public:
48 explicit LibSVMParser(InputSplit *source, int nthread)
49 : LibSVMParser(source, std::map<std::string, std::string>(), nthread) {}
50 explicit LibSVMParser(InputSplit *source,
51 const std::map<std::string, std::string>& args,
52 int nthread)
53 : TextParserBase<IndexType>(source, nthread) {
54 param_.Init(args);
55 CHECK_EQ(param_.format, "libsvm");
56 }
57
58 protected:
59 virtual void ParseBlock(const char *begin,
60 const char *end,
62
63 private:
64 LibSVMParserParam param_;
65};
66
/*!
 * \brief Compute how far to advance from beg to skip leading blanks and,
 *  if one starts there, a comment running to the end of the line.
 * \tparam kSymbol character that introduces a comment
 * \param beg first character to inspect
 * \param line_end one past the last character of the line
 * \return offset from beg of the first character that is neither blank nor
 *  kSymbol; the full length of [beg, line_end) when kSymbol is reached
 *  first or when only blanks remain
 */
template <char kSymbol = '#'>
std::ptrdiff_t IgnoreCommentAndBlank(char const* beg,
                                     char const* line_end) {
  for (char const* p = beg; p != line_end; ++p) {
    if (*p == kSymbol) {
      // comment: consume the rest of the line so the caller sees an empty line
      return std::distance(beg, line_end);
    }
    if (!isblank(*p)) {
      // first meaningful character: advance exactly up to it
      return std::distance(beg, p);
    }
  }
  // nothing but blanks (or an empty span): consume the whole line
  return std::distance(beg, line_end);
}
85
86template <typename IndexType, typename DType>
88ParseBlock(const char *begin,
89 const char *end,
91 out->Clear();
92 const char * lbegin = begin;
93 const char * lend = lbegin;
94 IndexType min_feat_id = std::numeric_limits<IndexType>::max();
95 while (lbegin != end) {
96 // get line end
97 lend = lbegin + 1;
98 while (lend != end && *lend != '\n' && *lend != '\r') ++lend;
99 // parse label[:weight]
100 const char * p = lbegin;
101 const char * q = NULL;
102 real_t label;
103 real_t weight;
104 std::ptrdiff_t advanced = IgnoreCommentAndBlank(p, lend);
105 p += advanced;
106 int r = ParsePair<real_t, real_t>(p, lend, &q, label, weight);
107 if (r < 1) {
108 // empty line
109 lbegin = lend;
110 continue;
111 }
112 if (r == 2) {
113 // has weight
114 out->weight.push_back(weight);
115 }
116 if (out->label.size() != 0) {
117 out->offset.push_back(out->index.size());
118 }
119 out->label.push_back(label);
120 // parse qid:id
121 uint64_t qid;
122 p = q;
123 while (p != end && *p == ' ') ++p;
124 if (p != lend && (strncmp(p, "qid:", 4) == 0)) {
125 p += 4;
126 qid = static_cast<uint64_t>(atoll(p));
127 while (p != lend && isdigitchars(*p)) ++p;
128 out->qid.push_back(qid);
129 }
130 // parse feature[:value]
131 while (p != lend) {
132 IndexType featureId;
133 real_t value;
134 std::ptrdiff_t advanced = IgnoreCommentAndBlank(p, lend);
135 p += advanced;
136 int r = ParsePair<IndexType, real_t>(p, lend, &q, featureId, value);
137 if (r < 1) {
138 // q is set to line end by `ParsePair', here is p. The latter terminates
139 // while loop of parsing features.
140 p = q;
141 continue;
142 }
143 out->index.push_back(featureId);
144 min_feat_id = std::min(featureId, min_feat_id);
145 if (r == 2) {
146 // has value
147 out->value.push_back(value);
148 }
149 p = q;
150 }
151 // next line
152 lbegin = lend;
153 }
154 if (out->label.size() != 0) {
155 out->offset.push_back(out->index.size());
156 }
157 CHECK(out->label.size() + 1 == out->offset.size());
158
159 // detect indexing mode
160 // heuristic adopted from sklearn.datasets.load_svmlight_file
161 // If all feature id's exceed 0, then detect 1-based indexing
162 if (param_.indexing_mode > 0
163 || (param_.indexing_mode < 0 && !out->index.empty() && min_feat_id > 0)) {
164 // convert from 1-based to 0-based indexing
165 for (IndexType& e : out->index) {
166 --e;
167 }
168 }
169}
170
171} // namespace data
172} // namespace dmlc
173#endif // DMLC_DATA_LIBSVM_PARSER_H_
Input split: allows reading records from a split of the data, an independent part that covers a...
Definition io.h:155
Text parser that parses the input lines and returns rows in input data.
Definition libsvm_parser.h:46
virtual void ParseBlock(const char *begin, const char *end, RowBlockContainer< IndexType, DType > *out)
parse data into out
Definition libsvm_parser.h:88
Text parser that parses the input lines and returns rows in input data.
Definition text_parser.h:28
defines common input data structure, and interface for handling the input data
Provide lightweight util to do parameter setup and checking.
namespace for dmlc
Definition array_view.h:12
bool isblank(char c)
Inline implementation of isblank(). Tests whether the given character is a space or tab character.
Definition strtonum.h:36
bool isdigitchars(char c)
Tests whether the given character is a valid letter in the string representation of a floating-point ...
Definition strtonum.h:70
float real_t
Defines the floating-point type that will be used to store feature values.
Definition data.h:26
additional data structure to support RowBlock data structure
A faster implementation of strtof and strtod.
Definition libsvm_parser.h:24
dynamic data structure that holds a row block of data
Definition row_block.h:27
iterator parser to parse text format