Medial Code Documentation
Loading...
Searching...
No Matches
io.cc
1// Copyright by Contributors
2
3#include <dmlc/base.h>
4#include <dmlc/io.h>
5#include <dmlc/logging.h>
6#include <cstring>
7#include "io/uri_spec.h"
8#include "io/line_split.h"
9#include "io/recordio_split.h"
12#include "io/local_filesys.h"
15
16#if DMLC_USE_HDFS
17#include "io/hdfs_filesys.h"
18#endif
19
20#if DMLC_USE_S3
21#include "io/s3_filesys.h"
22#endif
23
24#if DMLC_USE_AZURE
25#include "io/azure_filesys.h"
26#endif
27
28namespace dmlc {
29namespace io {
30FileSystem *FileSystem::GetInstance(const URI &path) {
31 if (path.protocol == "file://" || path.protocol.length() == 0) {
33 }
34 if (path.protocol == "hdfs://" || path.protocol == "viewfs://") {
35#if DMLC_USE_HDFS
36 if (path.host.length() == 0) {
37 return HDFSFileSystem::GetInstance("default");
38 } else if (path.protocol == "viewfs://") {
39 char* defaultFS = nullptr;
40 hdfsConfGetStr("fs.defaultFS", &defaultFS);
41 if (path.host.length() != 0) {
42 CHECK("viewfs://" + path.host == defaultFS)
43 << "viewfs is only supported as a fs.defaultFS.";
44 }
45 return HDFSFileSystem::GetInstance("default");
46 } else {
47 return HDFSFileSystem::GetInstance(path.host);
48 }
49#else
50 LOG(FATAL) << "Please compile with DMLC_USE_HDFS=1 to use hdfs";
51#endif
52 }
53 if (path.protocol == "s3://" || path.protocol == "http://" || path.protocol == "https://") {
54#if DMLC_USE_S3
56#else
57 LOG(FATAL) << "Please compile with DMLC_USE_S3=1 to use S3";
58#endif
59 }
60
61 if (path.protocol == "azure://") {
62#if DMLC_USE_AZURE
64#else
65 LOG(FATAL) << "Please compile with DMLC_USE_AZURE=1 to use Azure";
66#endif
67 }
68
69 LOG(FATAL) << "unknown filesystem protocol " + path.protocol;
70 return NULL;
71}
72} // namespace io
73
75 unsigned part,
76 unsigned nsplit,
77 const char *type) {
78 return Create(uri_, nullptr, part, nsplit, type);
79}
80
82 const char *index_uri_,
83 unsigned part,
84 unsigned nsplit,
85 const char *type,
86 const bool shuffle,
87 const int seed,
88 const size_t batch_size,
89 const bool recurse_directories) {
90 using namespace std;
91 using namespace dmlc::io;
92 // allow cachefile in format path#cachefile
93 io::URISpec spec(uri_, part, nsplit);
94 if (!strcmp(spec.uri.c_str(), "stdin")) {
95 return new SingleFileSplit(spec.uri.c_str());
96 }
97 CHECK(part < nsplit) << "invalid input parameter for InputSplit::Create";
98 URI path(spec.uri.c_str());
99 InputSplitBase *split = NULL;
100 if (!strcmp(type, "text")) {
101 split = new LineSplitter(FileSystem::GetInstance(path),
102 spec.uri.c_str(), part, nsplit);
103 } else if (!strcmp(type, "indexed_recordio")) {
104 if (index_uri_ != nullptr) {
105 io::URISpec index_spec(index_uri_, part, nsplit);
106 split = new IndexedRecordIOSplitter(FileSystem::GetInstance(path),
107 spec.uri.c_str(), index_spec.uri.c_str(), part, nsplit,
108 batch_size, shuffle, seed);
109 } else {
110 LOG(FATAL) << "need to pass index file to use IndexedRecordIO";
111 }
112 } else if (!strcmp(type, "recordio")) {
113 split = new RecordIOSplitter(FileSystem::GetInstance(path),
114 spec.uri.c_str(), part, nsplit,
115 recurse_directories);
116 } else {
117 LOG(FATAL) << "unknown input split type " << type;
118 }
119#if DMLC_ENABLE_STD_THREAD
120 if (spec.cache_file.length() == 0) {
121 return new ThreadedInputSplit(split, batch_size);
122 } else {
123 return new CachedInputSplit(split, spec.cache_file.c_str());
124 }
125#else
126 CHECK(spec.cache_file.length() == 0)
127 << "to enable cached file, compile with c++11";
128 return split;
129#endif
130}
131
// Factory: opens a stream for `uri`.
// Parses `uri` into an io::URI, picks the filesystem with
// io::FileSystem::GetInstance(path), and returns whatever that filesystem's
// Open(path, flag, try_create) returns.
// `flag` and `try_create` are forwarded unchanged; the declaration of this
// function names the last parameter `allow_null` (default false).
// NOTE(review): GetInstance ends with `return NULL` after LOG(FATAL) for an
// unknown protocol; the dereference below presumably relies on LOG(FATAL)
// not returning -- confirm.
132Stream *Stream::Create(const char *uri,
133 const char * const flag,
134 bool try_create) {
135 io::URI path(uri);
136 return io::FileSystem::
137 GetInstance(path)->Open(path, flag, try_create);
138}
139
// Factory: opens a read-only seekable stream for `uri`.
// Parses `uri` into an io::URI, picks the filesystem with
// io::FileSystem::GetInstance(path), and returns whatever that filesystem's
// OpenForRead(path, try_create) returns.
// `try_create` is forwarded unchanged; the declaration of this function
// names the parameter `allow_null` (default false).
// NOTE(review): GetInstance ends with `return NULL` after LOG(FATAL) for an
// unknown protocol; the dereference below presumably relies on LOG(FATAL)
// not returning -- confirm.
140SeekStream *SeekStream::CreateForRead(const char *uri, bool try_create) {
141 io::URI path(uri);
142 return io::FileSystem::
143 GetInstance(path)->OpenForRead(path, try_create);
144}
145} // namespace dmlc
Azure access module.
InputSplit that reads from an existing InputSplit and caches the data to local disk,...
input split that allows reading of records from a split of the data, an independent part that covers a...
Definition io.h:155
static InputSplit * Create(const char *uri, unsigned part_index, unsigned num_parts, const char *type)
factory function: create input split given a uri
Definition io.cc:74
interface of i/o stream that support seek
Definition io.h:109
static SeekStream * CreateForRead(const char *uri, bool allow_null=false)
generic factory function to create a SeekStream for read only; the stream will close the underlying fil...
Definition io.cc:140
interface of stream I/O for serialization
Definition io.h:30
static Stream * Create(const char *uri, const char *const flag, bool allow_null=false)
generic factory function to create a stream; the stream will close the underlying files upon deletion
Definition io.cc:132
static AzureFileSystem * GetInstance(void)
get a singleton of AzureFileSystem when needed
Definition azure_filesys.h:38
static HDFSFileSystem * GetInstance(const std::string &namenode="default")
get a singleton of HDFSFileSystem when needed
Definition hdfs_filesys.h:59
class that splits the recordIO file by record
Definition indexed_recordio_split.h:23
class to construct input split from multiple files
Definition input_split_base.h:21
class that splits the files by line
Definition line_split.h:20
static LocalFileSystem * GetInstance(void)
get a singleton of LocalFileSystem when needed
Definition local_filesys.h:54
class that splits the recordio files by record
Definition recordio_split.h:21
static S3FileSystem * GetInstance(void)
get a singleton of S3FileSystem when needed
Definition s3_filesys.h:64
line split implementation from a single FILE; simply returns the lines of the file, used for stdin
Definition single_file_split.h:32
a superset of URI that allows sugars to be passed around. Example:
Definition uri_spec.h:28
std::string cache_file
the path to cache file
Definition uri_spec.h:35
std::string uri
the real URI
Definition uri_spec.h:31
defines configuration macros
defines serializable interface of dmlc
defines logging macros of dmlc; allows use of GLOG, falling back to an internal implementation when disabled
HDFS access module.
input split that splits indexed recordio files
base class implementation of input splitter
local access module
namespace for dmlc
Definition array_view.h:12
Definition StdDeque.h:58
input split that splits recordio files
S3 access module.
base implementation of line-splitter
a threaded version of InputSplit with a prefetch thread
common specification of sugars in URI string passed to dmlc Create functions such as local file cache