Medial Code Documentation
Loading...
Searching...
No Matches
data.cc
1// Copyright by Contributors
2#include <dmlc/base.h>
3#include <dmlc/io.h>
4#include <dmlc/logging.h>
5#include <dmlc/data.h>
6#include <dmlc/registry.h>
7#include <cstring>
8#include <string>
9#include "io/uri_spec.h"
10#include "data/parser.h"
11#include "data/basic_row_iter.h"
12#include "data/disk_row_iter.h"
13#include "data/libsvm_parser.h"
14#include "data/libfm_parser.h"
15#include "data/csv_parser.h"
16
17#ifdef DMLC_USE_PARQUET
18#include "data/parquet_parser.h"
19#endif
20
21namespace dmlc {
23namespace data {
24
// Factory for the "libsvm" format: constructs a LibSVMParser<IndexType> from
// `source` and `args`, optionally wraps it in a ThreadedParser, and returns it.
//   path       - converted with c_str() and passed to the call that yields `source`
//   args       - forwarded unchanged to the LibSVMParser constructor
//   part_index - forwarded to the call that yields `source`
//   num_parts  - forwarded to the call that yields `source`
// DType is not referenced anywhere in the visible body.
// NOTE(review): this listing skips its own lines 26 and 31 (the return type and
// the statement that declares `source`); presumably an InputSplit::Create(...)
// call -- confirm against the original data.cc.
25template<typename IndexType, typename DType = real_t>
27CreateLibSVMParser(const std::string& path,
28 const std::map<std::string, std::string>& args,
29 unsigned part_index,
30 unsigned num_parts) {
// Tail of the statement whose head is missing; "text" is its last argument.
32 path.c_str(), part_index, num_parts, "text");
// NOTE(review): the meaning of the literal 2 is not visible here -- TODO confirm
// in libsvm_parser.h.
33 ParserImpl<IndexType> *parser = new LibSVMParser<IndexType>(source, args, 2);
34#if DMLC_ENABLE_STD_THREAD
// Only when DMLC_ENABLE_STD_THREAD is non-zero: the ThreadedParser is built
// from the inner parser pointer and replaces it as the returned object.
35 parser = new ThreadedParser<IndexType>(parser);
36#endif
37 return parser;
38}
39
// Factory for the "libfm" format: constructs a LibFMParser<IndexType> from
// `source` and `args`, optionally wraps it in a ThreadedParser, and returns it.
//   path       - converted with c_str() and passed to the call that yields `source`
//   args       - forwarded unchanged to the LibFMParser constructor
//   part_index - forwarded to the call that yields `source`
//   num_parts  - forwarded to the call that yields `source`
// DType is not referenced anywhere in the visible body.
// NOTE(review): this listing skips its own lines 41 and 46 (the return type and
// the statement that declares `source`); presumably an InputSplit::Create(...)
// call -- confirm against the original data.cc.
40template<typename IndexType, typename DType = real_t>
42CreateLibFMParser(const std::string& path,
43 const std::map<std::string, std::string>& args,
44 unsigned part_index,
45 unsigned num_parts) {
// Tail of the statement whose head is missing; "text" is its last argument.
47 path.c_str(), part_index, num_parts, "text");
// NOTE(review): the meaning of the literal 2 is not visible here -- TODO confirm
// in libfm_parser.h.
48 ParserImpl<IndexType> *parser = new LibFMParser<IndexType>(source, args, 2);
49#if DMLC_ENABLE_STD_THREAD
// Only when DMLC_ENABLE_STD_THREAD is non-zero: the ThreadedParser is built
// from the inner parser pointer and replaces it as the returned object.
50 parser = new ThreadedParser<IndexType>(parser);
51#endif
52 return parser;
53}
54
// Factory for the "csv" format: returns a new CSVParser<IndexType, DType>
// built from `source` and `args`.
//   path       - converted with c_str() and passed to the call that yields `source`
//   args       - forwarded unchanged to the CSVParser constructor
//   part_index - forwarded to the call that yields `source`
//   num_parts  - forwarded to the call that yields `source`
// Unlike the libsvm/libfm factories in this file, DType is used here and the
// result is not wrapped in a ThreadedParser.
// NOTE(review): this listing skips its own lines 56 and 61 (the return type and
// the statement that declares `source`); presumably an InputSplit::Create(...)
// call -- confirm against the original data.cc.
55template<typename IndexType, typename DType = real_t>
57CreateCSVParser(const std::string& path,
58 const std::map<std::string, std::string>& args,
59 unsigned part_index,
60 unsigned num_parts) {
// Tail of the statement whose head is missing; "text" is its last argument.
62 path.c_str(), part_index, num_parts, "text");
// NOTE(review): the meaning of the literal 2 is not visible here -- TODO confirm
// in csv_parser.h.
63 return new CSVParser<IndexType, DType>(source, args, 2);
64}
65
// Factory for the "parquet" format; compiled only when DMLC_USE_PARQUET is
// defined. Constructs a ParquetParser<IndexType> directly from `path` and
// `args` and returns it.
//   path - forwarded unchanged to the ParquetParser constructor
//   args - forwarded unchanged to the ParquetParser constructor
// The two unsigned parameters are unnamed (names kept only as comments) and
// are therefore ignored. DType is not referenced in the visible body.
// NOTE(review): this listing skips its own line 68 (the return type) --
// confirm against the original data.cc.
66#ifdef DMLC_USE_PARQUET
67template<typename IndexType, typename DType = real_t>
69CreateParquetParser(const std::string& path,
70 const std::map<std::string, std::string>& args,
71 unsigned /*part_index*/,
72 unsigned /*num_parts*/) {
73 ParserImpl<IndexType> *parser = new ParquetParser<IndexType>(path, args);
74 return parser;
75}
76#endif
77
// Resolves a parser type name and invokes the matching factory entry.
//   uri_       - parsed into an io::URISpec; its `uri` and `args` members are
//                what the factory receives
//   part_index - passed to URISpec and to the factory
//   num_parts  - passed to URISpec and to the factory
//   type       - parser type name; must be non-null because it is used to
//                construct a std::string. The value "auto" selects the URI's
//                "format" argument when present and "libsvm" otherwise.
// Returns whatever the selected entry's `body` callable returns.
// Logs at FATAL level with "Unknown data type <name>" when the lookup result
// `e` is NULL.
// NOTE(review): this listing skips its own lines 79, 94 and 95 (the return type
// and the statement that declares `e`); presumably a registry Find(ptype)
// lookup -- confirm against the original data.cc.
78template<typename IndexType, typename DType = real_t>
80CreateParser_(const char *uri_,
81 unsigned part_index,
82 unsigned num_parts,
83 const char *type) {
84 std::string ptype = type;
85 io::URISpec spec(uri_, part_index, num_parts);
// "auto" is a placeholder: replace it with the URI-supplied format, or fall
// back to "libsvm" when the URI carries no "format" argument.
86 if (ptype == "auto") {
87 if (spec.args.count("format") != 0) {
88 ptype = spec.args.at("format");
89 } else {
90 ptype = "libsvm";
91 }
92 }
93
96 if (e == NULL) {
97 LOG(FATAL) << "Unknown data type " << ptype;
98 }
99 // call the entry's factory body with the parsed URI and its arguments
100 return (*e->body)(spec.uri, spec.args, part_index, num_parts);
101}
102
// Creates a parser for the URI and wraps it in a row-block iterator.
//   uri_       - parsed into an io::URISpec; `spec.uri` goes to CreateParser_
//                and `spec.cache_file` selects the iterator kind
//   part_index - passed to URISpec and to CreateParser_
//   num_parts  - passed to URISpec and to CreateParser_
//   type       - parser type name, forwarded unchanged to CreateParser_
// Returns a new DiskRowIter when a cache file is given and
// DMLC_ENABLE_STD_THREAD is non-zero, or a new BasicRowIter when no cache file
// is given. With a cache file but without DMLC_ENABLE_STD_THREAD it logs at
// FATAL level and has a `return NULL` after the log statement.
// NOTE(review): this listing skips its own line 104 (the return type) --
// confirm against the original data.cc.
103template<typename IndexType, typename DType = real_t>
105CreateIter_(const char *uri_,
106 unsigned part_index,
107 unsigned num_parts,
108 const char *type) {
109 using namespace std;
110 io::URISpec spec(uri_, part_index, num_parts);
// NOTE(review): spec.uri (not uri_) is forwarded, so CreateParser_ builds its
// own URISpec from the already-parsed URI. If URISpec removes the argument
// part from `uri`, a "format" argument would not reach CreateParser_ --
// confirm this is intended.
111 Parser<IndexType, DType> *parser = CreateParser_<IndexType, DType>
112 (spec.uri.c_str(), part_index, num_parts, type);
// A non-empty cache_file selects the disk-backed iterator.
113 if (spec.cache_file.length() != 0) {
114#if DMLC_ENABLE_STD_THREAD
// NOTE(review): the meaning of the trailing `true` is not visible here --
// TODO confirm in disk_row_iter.h.
115 return new DiskRowIter<IndexType, DType>(parser, spec.cache_file.c_str(), true);
116#else
117 LOG(FATAL) << "compile with c++0x or c++11 to enable cache file";
118 return NULL;
119#endif
120 } else {
121 return new BasicRowIter<IndexType, DType>(parser);
122 }
123}
124
// Invokes DMLC_REGISTER_PARAMETER once for each parser parameter struct used
// by the factories in this file; the Parquet one only when DMLC_USE_PARQUET
// is defined.
125DMLC_REGISTER_PARAMETER(LibSVMParserParam);
126DMLC_REGISTER_PARAMETER(LibFMParserParam);
127DMLC_REGISTER_PARAMETER(CSVParserParam);
128#ifdef DMLC_USE_PARQUET
129DMLC_REGISTER_PARAMETER(ParquetParserParam);
130#endif
131} // namespace data
132
133// template specialization
// Explicit specialization that forwards its four arguments to
// data::CreateIter_<uint32_t, real_t> and returns the result.
// NOTE(review): this listing skips its own lines 135-136 (the return type and
// the declarator that introduces `uri`); presumably
// RowBlockIter<uint32_t, real_t>::Create -- confirm against the original data.cc.
134template<>
137 unsigned part_index,
138 unsigned num_parts,
139 const char *type) {
140 return data::CreateIter_<uint32_t, real_t>(uri, part_index, num_parts, type);
141}
142
// Explicit specialization returning RowBlockIter<uint64_t, real_t>*; forwards
// its four arguments to data::CreateIter_<uint64_t, real_t>.
// NOTE(review): this listing skips its own line 145 (the declarator that
// introduces `uri`); presumably RowBlockIter<uint64_t, real_t>::Create -- confirm.
143template<>
144RowBlockIter<uint64_t, real_t> *
146 unsigned part_index,
147 unsigned num_parts,
148 const char *type) {
149 return data::CreateIter_<uint64_t, real_t>(uri, part_index, num_parts, type);
150}
151
// Explicit specialization returning RowBlockIter<uint32_t, int32_t>*; forwards
// its four arguments to data::CreateIter_<uint32_t, int32_t>.
// NOTE(review): this listing skips its own line 154 (the declarator that
// introduces `uri`); presumably RowBlockIter<uint32_t, int32_t>::Create -- confirm.
152template<>
153RowBlockIter<uint32_t, int32_t> *
155 unsigned part_index,
156 unsigned num_parts,
157 const char *type) {
158 return data::CreateIter_<uint32_t, int32_t>(uri, part_index, num_parts, type);
159}
160
// Explicit specialization returning RowBlockIter<uint64_t, int32_t>*; forwards
// its four arguments to data::CreateIter_<uint64_t, int32_t>.
// NOTE(review): this listing skips its own line 163 (the declarator that
// introduces `uri`); presumably RowBlockIter<uint64_t, int32_t>::Create -- confirm.
161template<>
162RowBlockIter<uint64_t, int32_t> *
164 unsigned part_index,
165 unsigned num_parts,
166 const char *type) {
167 return data::CreateIter_<uint64_t, int32_t>(uri, part_index, num_parts, type);
168}
169
// Explicit specialization returning RowBlockIter<uint32_t, int64_t>*; forwards
// its four arguments to data::CreateIter_<uint32_t, int64_t>.
// NOTE(review): this listing skips its own line 172 (the declarator that
// introduces `uri`); presumably RowBlockIter<uint32_t, int64_t>::Create -- confirm.
170template<>
171RowBlockIter<uint32_t, int64_t> *
173 unsigned part_index,
174 unsigned num_parts,
175 const char *type) {
176 return data::CreateIter_<uint32_t, int64_t>(uri, part_index, num_parts, type);
177}
178
// Explicit specialization returning RowBlockIter<uint64_t, int64_t>*; forwards
// its four arguments to data::CreateIter_<uint64_t, int64_t>.
// NOTE(review): this listing skips its own line 181 (the declarator that
// introduces `uri`); presumably RowBlockIter<uint64_t, int64_t>::Create -- confirm.
179template<>
180RowBlockIter<uint64_t, int64_t> *
182 unsigned part_index,
183 unsigned num_parts,
184 const char *type) {
185 return data::CreateIter_<uint64_t, int64_t>(uri, part_index, num_parts, type);
186}
187
// Explicit specialization of Parser<uint32_t, real_t>::Create: forwards
// uri_, part_index, num_parts and type unchanged to
// data::CreateParser_<uint32_t, real_t> and returns its result.
188template<>
189Parser<uint32_t, real_t> *
190Parser<uint32_t, real_t>::Create(const char *uri_,
191 unsigned part_index,
192 unsigned num_parts,
193 const char *type) {
194 return data::CreateParser_<uint32_t, real_t>(uri_, part_index, num_parts, type);
195}
196
// Explicit specialization of Parser<uint64_t, real_t>::Create: forwards
// uri_, part_index, num_parts and type unchanged to
// data::CreateParser_<uint64_t, real_t> and returns its result.
197template<>
198Parser<uint64_t, real_t> *
199Parser<uint64_t, real_t>::Create(const char *uri_,
200 unsigned part_index,
201 unsigned num_parts,
202 const char *type) {
203 return data::CreateParser_<uint64_t, real_t>(uri_, part_index, num_parts, type);
204}
205
// Explicit specialization of Parser<uint32_t, int32_t>::Create: forwards
// uri_, part_index, num_parts and type unchanged to
// data::CreateParser_<uint32_t, int32_t> and returns its result.
206template<>
207Parser<uint32_t, int32_t> *
208Parser<uint32_t, int32_t>::Create(const char *uri_,
209 unsigned part_index,
210 unsigned num_parts,
211 const char *type) {
212 return data::CreateParser_<uint32_t, int32_t>(uri_, part_index, num_parts, type);
213}
214
// Explicit specialization of Parser<uint64_t, int32_t>::Create: forwards
// uri_, part_index, num_parts and type unchanged to
// data::CreateParser_<uint64_t, int32_t> and returns its result.
215template<>
216Parser<uint64_t, int32_t> *
217Parser<uint64_t, int32_t>::Create(const char *uri_,
218 unsigned part_index,
219 unsigned num_parts,
220 const char *type) {
221 return data::CreateParser_<uint64_t, int32_t>(uri_, part_index, num_parts, type);
222}
223
// Explicit specialization of Parser<uint32_t, int64_t>::Create: forwards
// uri_, part_index, num_parts and type unchanged to
// data::CreateParser_<uint32_t, int64_t> and returns its result.
224template<>
225Parser<uint32_t, int64_t> *
226Parser<uint32_t, int64_t>::Create(const char *uri_,
227 unsigned part_index,
228 unsigned num_parts,
229 const char *type) {
230 return data::CreateParser_<uint32_t, int64_t>(uri_, part_index, num_parts, type);
231}
232
// Explicit specialization of Parser<uint64_t, int64_t>::Create: forwards
// uri_, part_index, num_parts and type unchanged to
// data::CreateParser_<uint64_t, int64_t> and returns its result.
233template<>
234Parser<uint64_t, int64_t> *
235Parser<uint64_t, int64_t>::Create(const char *uri_,
236 unsigned part_index,
237 unsigned num_parts,
238 const char *type) {
239 return data::CreateParser_<uint64_t, int64_t>(uri_, part_index, num_parts, type);
240}
241
242// registry
// One short alias per ParserFactoryReg instantiation: index type uint32_t or
// uint64_t combined with value type real_t, int32_t or int64_t. The aliases
// give each DMLC_REGISTRY_ENABLE call below a single-identifier argument with
// no comma in it.
243typedef ParserFactoryReg<uint32_t, real_t> Reg32flt;
244typedef ParserFactoryReg<uint32_t, int32_t> Reg32int32;
245typedef ParserFactoryReg<uint32_t, int64_t> Reg32int64;
246typedef ParserFactoryReg<uint64_t, real_t> Reg64flt;
247typedef ParserFactoryReg<uint64_t, int32_t> Reg64int32;
248typedef ParserFactoryReg<uint64_t, int64_t> Reg64int64;
// Enable the registry for each of the six entry types; these calls sit inside
// namespace dmlc.
249DMLC_REGISTRY_ENABLE(Reg32flt);
250DMLC_REGISTRY_ENABLE(Reg32int32);
251DMLC_REGISTRY_ENABLE(Reg32int64);
252DMLC_REGISTRY_ENABLE(Reg64flt);
253DMLC_REGISTRY_ENABLE(Reg64int32);
254DMLC_REGISTRY_ENABLE(Reg64int64);
255
// Argument lists that bind a format name (libsvm, libfm, csv, parquet) to a
// factory instantiation for a given (index type, value type) pair.
// Visible coverage: libsvm and libfm only with real_t; csv with real_t,
// int32_t and int64_t; parquet only with real_t and only under DMLC_USE_PARQUET.
// __DMLC_COMMA stands between the two template arguments; presumably it
// expands to a comma so the template-id survives as one macro argument --
// TODO confirm.
// NOTE(review): this listing skips the line that precedes each argument list
// (its own lines 256, 258, ..., 279); presumably each is
// `DMLC_REGISTER_DATA_PARSER(` -- confirm against the original data.cc.
257 uint32_t, real_t, libsvm, data::CreateLibSVMParser<uint32_t __DMLC_COMMA real_t>);
259 uint64_t, real_t, libsvm, data::CreateLibSVMParser<uint64_t __DMLC_COMMA real_t>);
261 uint32_t, real_t, libfm, data::CreateLibFMParser<uint32_t __DMLC_COMMA real_t>);
263 uint64_t, real_t, libfm, data::CreateLibFMParser<uint64_t __DMLC_COMMA real_t>);
265 uint32_t, real_t, csv, data::CreateCSVParser<uint32_t __DMLC_COMMA real_t>);
267 uint64_t, real_t, csv, data::CreateCSVParser<uint64_t __DMLC_COMMA real_t>);
269 uint32_t, int32_t, csv, data::CreateCSVParser<uint32_t __DMLC_COMMA int32_t>);
271 uint64_t, int32_t, csv, data::CreateCSVParser<uint64_t __DMLC_COMMA int32_t>);
273 uint32_t, int64_t, csv, data::CreateCSVParser<uint32_t __DMLC_COMMA int64_t>);
275 uint64_t, int64_t, csv, data::CreateCSVParser<uint64_t __DMLC_COMMA int64_t>);
276#ifdef DMLC_USE_PARQUET
278 uint32_t, real_t, parquet, data::CreateParquetParser<uint32_t __DMLC_COMMA real_t>);
280 uint64_t, real_t, parquet, data::CreateParquetParser<uint64_t __DMLC_COMMA real_t>);
281#endif
282
283} // namespace dmlc
row-based iterator that loads everything into memory and returns
input split that allows reading of records from a split of the data, independent part that covers a...
Definition io.h:155
static InputSplit * Create(const char *uri, unsigned part_index, unsigned num_parts, const char *type)
factory function: create input split given a uri
Definition io.cc:74
parser interface that parses input data used to load dmlc data format into your own data format Diffe...
Definition data.h:293
static Parser< IndexType, DType > * Create(const char *uri_, unsigned part_index, unsigned num_parts, const char *type)
create a new instance of parser based on the "type"
Registry class. Registry can be used to register global singletons. The most common use cases are fa...
Definition registry.h:27
static const EntryType * Find(const std::string &name)
Find the entry with corresponding name.
Definition registry.h:48
Data structure that holds the data Row block iterator interface that gets RowBlocks Difference betwee...
Definition data.h:254
static RowBlockIter< IndexType, DType > * Create(const char *uri, unsigned part_index, unsigned num_parts, const char *type)
create a new instance of iterator that returns rowbatch; by default, an in-memory based iterator will b...
basic set of row iterators that provides
Definition basic_row_iter.h:24
CSVParser, parses a dense csv format. All columns are treated as real dense data. Label will be empty...
Definition csv_parser.h:51
Text parser that parses the input lines and returns rows in input data.
Definition libfm_parser.h:46
Text parser that parses the input lines and returns rows in input data.
Definition libsvm_parser.h:46
Definition parquet_parser.h:50
base class for parser to parse data
Definition parser.h:24
declare thread class
Definition parser.h:20
a superset of URI that allows sugars to be passed around. Example:
Definition uri_spec.h:28
std::string cache_file
the path to cache file
Definition uri_spec.h:35
std::map< std::string, std::string > args
arguments in the URL
Definition uri_spec.h:33
std::string uri
the real URI
Definition uri_spec.h:31
iterator parser to parse csv format
defines configuration macros
defines common input data structure, and interface for handling the input data
#define DMLC_REGISTER_DATA_PARSER(IndexType, DataType, TypeName, FactoryFunction)
Register a new distributed parser to dmlc-core.
Definition data.h:358
defines serializable interface of dmlc
defines logging macros of dmlc; allows use of GLOG, falls back to internal implementation when disabled
iterator parser to parse libfm format
iterator parser to parse libsvm format
namespace for dmlc
Definition array_view.h:12
float real_t
this defines the floating-point type that will be used to store feature values
Definition data.h:26
Definition StdDeque.h:58
iterator parser to parse parquet format
Registry utility that helps to build registry singletons.
#define DMLC_REGISTRY_ENABLE(EntryType)
Macro to enable the registry of EntryType. This macro must be used under namespace dmlc,...
Definition registry.h:234
registry entry of parser factory
Definition data.h:330
Definition csv_parser.h:24
Definition libfm_parser.h:24
Definition libsvm_parser.h:24
Definition parquet_parser.h:31
common specification of sugars in URI string passed to dmlc Create functions such as local file cache