10#include "data/parser.h"
12#include "data/disk_row_iter.h"
17#ifdef DMLC_USE_PARQUET
25template<
typename IndexType,
typename DType = real_t>
27CreateLibSVMParser(
const std::string& path,
28 const std::map<std::string, std::string>& args,
32 path.c_str(), part_index, num_parts,
"text");
34#if DMLC_ENABLE_STD_THREAD
40template<
typename IndexType,
typename DType = real_t>
42CreateLibFMParser(
const std::string& path,
43 const std::map<std::string, std::string>& args,
47 path.c_str(), part_index, num_parts,
"text");
49#if DMLC_ENABLE_STD_THREAD
55template<
typename IndexType,
typename DType = real_t>
57CreateCSVParser(
const std::string& path,
58 const std::map<std::string, std::string>& args,
62 path.c_str(), part_index, num_parts,
"text");
66#ifdef DMLC_USE_PARQUET
67template<
typename IndexType,
typename DType = real_t>
69CreateParquetParser(
const std::string& path,
70 const std::map<std::string, std::string>& args,
78template<
typename IndexType,
typename DType = real_t>
80CreateParser_(
const char *uri_,
84 std::string ptype = type;
86 if (ptype ==
"auto") {
87 if (spec.
args.count(
"format") != 0) {
88 ptype = spec.
args.at(
"format");
97 LOG(FATAL) <<
"Unknown data type " << ptype;
100 return (*e->body)(spec.
uri, spec.
args, part_index, num_parts);
103template<
typename IndexType,
typename DType = real_t>
105CreateIter_(
const char *uri_,
112 (spec.
uri.c_str(), part_index, num_parts, type);
114#if DMLC_ENABLE_STD_THREAD
115 return new DiskRowIter<IndexType, DType>(parser, spec.
cache_file.c_str(),
true);
117 LOG(FATAL) <<
"compile with c++0x or c++11 to enable cache file";
128#ifdef DMLC_USE_PARQUET
140 return data::CreateIter_<uint32_t, real_t>(uri, part_index, num_parts, type);
144RowBlockIter<uint64_t, real_t> *
149 return data::CreateIter_<uint64_t, real_t>(uri, part_index, num_parts, type);
153RowBlockIter<uint32_t, int32_t> *
158 return data::CreateIter_<uint32_t, int32_t>(uri, part_index, num_parts, type);
162RowBlockIter<uint64_t, int32_t> *
167 return data::CreateIter_<uint64_t, int32_t>(uri, part_index, num_parts, type);
171RowBlockIter<uint32_t, int64_t> *
176 return data::CreateIter_<uint32_t, int64_t>(uri, part_index, num_parts, type);
180RowBlockIter<uint64_t, int64_t> *
185 return data::CreateIter_<uint64_t, int64_t>(uri, part_index, num_parts, type);
189Parser<uint32_t, real_t> *
194 return data::CreateParser_<uint32_t, real_t>(uri_, part_index, num_parts, type);
198Parser<uint64_t, real_t> *
203 return data::CreateParser_<uint64_t, real_t>(uri_, part_index, num_parts, type);
207Parser<uint32_t, int32_t> *
212 return data::CreateParser_<uint32_t, int32_t>(uri_, part_index, num_parts, type);
216Parser<uint64_t, int32_t> *
221 return data::CreateParser_<uint64_t, int32_t>(uri_, part_index, num_parts, type);
225Parser<uint32_t, int64_t> *
230 return data::CreateParser_<uint32_t, int64_t>(uri_, part_index, num_parts, type);
234Parser<uint64_t, int64_t> *
239 return data::CreateParser_<uint64_t, int64_t>(uri_, part_index, num_parts, type);
243typedef ParserFactoryReg<uint32_t, real_t> Reg32flt;
244typedef ParserFactoryReg<uint32_t, int32_t> Reg32int32;
245typedef ParserFactoryReg<uint32_t, int64_t> Reg32int64;
246typedef ParserFactoryReg<uint64_t, real_t> Reg64flt;
247typedef ParserFactoryReg<uint64_t, int32_t> Reg64int32;
248typedef ParserFactoryReg<uint64_t, int64_t> Reg64int64;
257 uint32_t,
real_t, libsvm, data::CreateLibSVMParser<uint32_t __DMLC_COMMA real_t>);
259 uint64_t,
real_t, libsvm, data::CreateLibSVMParser<uint64_t __DMLC_COMMA real_t>);
261 uint32_t,
real_t, libfm, data::CreateLibFMParser<uint32_t __DMLC_COMMA real_t>);
263 uint64_t,
real_t, libfm, data::CreateLibFMParser<uint64_t __DMLC_COMMA real_t>);
265 uint32_t,
real_t, csv, data::CreateCSVParser<uint32_t __DMLC_COMMA real_t>);
267 uint64_t,
real_t, csv, data::CreateCSVParser<uint64_t __DMLC_COMMA real_t>);
269 uint32_t, int32_t, csv, data::CreateCSVParser<uint32_t __DMLC_COMMA int32_t>);
271 uint64_t, int32_t, csv, data::CreateCSVParser<uint64_t __DMLC_COMMA int32_t>);
273 uint32_t, int64_t, csv, data::CreateCSVParser<uint32_t __DMLC_COMMA int64_t>);
275 uint64_t, int64_t, csv, data::CreateCSVParser<uint64_t __DMLC_COMMA int64_t>);
276#ifdef DMLC_USE_PARQUET
278 uint32_t,
real_t, parquet, data::CreateParquetParser<uint32_t __DMLC_COMMA real_t>);
280 uint64_t,
real_t, parquet, data::CreateParquetParser<uint64_t __DMLC_COMMA real_t>);
row based iterator that loads in everything into memory and returns
parser interface that parses input data, used to load dmlc data format into your own data format. Diffe...
Definition data.h:293
static Parser< IndexType, DType > * Create(const char *uri_, unsigned part_index, unsigned num_parts, const char *type)
create a new instance of parser based on the "type"
Registry class. Registry can be used to register global singletons. The most common use cases are fa...
Definition registry.h:27
static const EntryType * Find(const std::string &name)
Find the entry with corresponding name.
Definition registry.h:48
Data structure that holds the data. Row block iterator interface that gets RowBlocks. Difference betwee...
Definition data.h:254
static RowBlockIter< IndexType, DType > * Create(const char *uri, unsigned part_index, unsigned num_parts, const char *type)
create a new instance of iterator that returns rowbatch; by default, an in-memory based iterator will b...
basic set of row iterators that provides
Definition basic_row_iter.h:24
CSVParser, parses a dense csv format. All columns are treated as real dense data. Label will be empty...
Definition csv_parser.h:51
Text parser that parses the input lines and returns rows in input data.
Definition libfm_parser.h:46
Text parser that parses the input lines and returns rows in input data.
Definition libsvm_parser.h:46
Definition parquet_parser.h:50
base class for parser to parse data
Definition parser.h:24
declare thread class
Definition parser.h:20
some superset of URI that allows sugars to be passed around. Example:
Definition uri_spec.h:28
std::string cache_file
the path to cache file
Definition uri_spec.h:35
std::map< std::string, std::string > args
arguments in the URL
Definition uri_spec.h:33
std::string uri
the real URI
Definition uri_spec.h:31
iterator parser to parse csv format
defines configuration macros
defines common input data structure, and interface for handling the input data
#define DMLC_REGISTER_DATA_PARSER(IndexType, DataType, TypeName, FactoryFunction)
Register a new distributed parser to dmlc-core.
Definition data.h:358
defines serializable interface of dmlc
defines logging macros of dmlc; allows use of GLOG, falling back to an internal implementation when disabled
iterator parser to parse libfm format
iterator parser to parse libsvm format
namespace for dmlc
Definition array_view.h:12
float real_t
this defines the floating-point type that will be used to store feature values
Definition data.h:26
iterator parser to parse parquet format
Registry utility that helps to build registry singletons.
#define DMLC_REGISTRY_ENABLE(EntryType)
Macro to enable the registry of EntryType. This macro must be used under namespace dmlc,...
Definition registry.h:234
registry entry of parser factory
Definition data.h:330
Definition csv_parser.h:24
Definition libfm_parser.h:24
Definition libsvm_parser.h:24
Definition parquet_parser.h:31
common specification of sugars in URI string passed to dmlc Create functions such as local file cache