Medial Code Documentation
Loading...
Searching...
No Matches
input_split_base.h
Go to the documentation of this file.
1
7#ifndef DMLC_IO_INPUT_SPLIT_BASE_H_
8#define DMLC_IO_INPUT_SPLIT_BASE_H_
9
10#include <dmlc/io.h>
11#include <dmlc/filesystem.h>
12#include <cstdio>
13#include <cstring>
14#include <vector>
15#include <string>
16#include <algorithm>
17
18namespace dmlc {
19namespace io {
21class InputSplitBase : public InputSplit {
22 public:
27 struct Chunk {
28 char *begin;
29 char *end;
30 std::vector<uint32_t> data;
31 explicit Chunk(size_t buffer_size)
32 : begin(NULL), end(NULL),
33 data(buffer_size + 1) {}
34 // load chunk from split
35 bool Load(InputSplitBase *split, size_t buffer_size);
36 // append to chunk
37 bool Append(InputSplitBase *split, size_t buffer_size);
38 };
39 // 2M uint32_t words (8 MB of buffer); sizes here count uint32_t units, not bytes
40 static const size_t kBufferSize = 2UL << 20UL;
41 // destructor
42 virtual ~InputSplitBase(void);
43 // implement BeforeFirst
44 virtual void BeforeFirst(void);
45 virtual void HintChunkSize(size_t chunk_size) {
46 buffer_size_ = std::max(chunk_size / sizeof(uint32_t), buffer_size_);
47 }
48 virtual size_t GetTotalSize(void) {
49 return file_offset_.back();
50 }
51 // implement next record
52 virtual bool NextRecord(Blob *out_rec) {
53 while (!ExtractNextRecord(out_rec, &tmp_chunk_)) {
54 if (!NextChunkEx(&tmp_chunk_)) return false;
55 }
56 return true;
57 }
58 // implement next chunk
59 virtual bool NextChunk(Blob *out_chunk) {
60 while (!ExtractNextChunk(out_chunk, &tmp_chunk_)) {
61 if (!NextChunkEx(&tmp_chunk_)) return false;
62 }
63 return true;
64 }
65 // implement ResetPartition.
66 virtual void ResetPartition(unsigned rank, unsigned nsplit);
78 virtual bool ReadChunk(void *buf, size_t *size);
86 bool ExtractNextChunk(Blob *out_rchunk, Chunk *chunk);
94 virtual bool ExtractNextRecord(Blob *out_rec, Chunk *chunk) = 0;
100 virtual bool IsTextParser(void) = 0;
106 virtual bool NextChunkEx(Chunk *chunk) {
107 if (!chunk->Load(this, buffer_size_)) return false;
108 return true;
109 }
115 virtual bool NextBatchEx(Chunk *chunk, size_t /*n_records*/) {
116 return NextChunkEx(chunk);
117 }
118
119 protected:
121 FileSystem *filesys_;
123 std::vector<size_t> file_offset_;
131 std::vector<FileInfo> files_;
135 size_t file_ptr_;
142 // constructor
144 : fs_(NULL),
145 tmp_chunk_(kBufferSize),
146 buffer_size_(kBufferSize),
147 align_bytes_(8) {}
158 void Init(FileSystem *fs,
159 const char *uri,
160 size_t align_bytes,
161 const bool recurse_directories = false);
162 // to be implemented by child class
168 virtual size_t SeekRecordBegin(Stream *fi) = 0;
176 virtual const char*
177 FindLastRecordBegin(const char *begin, const char *end) = 0;
178
180 std::vector<URI> ConvertToURIs(const std::string& uri);
182 size_t Read(void *ptr, size_t size);
183
184 private:
186 size_t align_bytes_;
188 std::string overflow_;
190 void InitInputFileInfo(const std::string& uri,
191 const bool recurse_directories);
193 std::string StripEnd(std::string str, char ch);
194};
195} // namespace io
196} // namespace dmlc
197#endif // DMLC_IO_INPUT_SPLIT_BASE_H_
input split that allows reading of records from a split of the data, an independent part that covers a...
Definition io.h:155
interface of i/o stream that support seek
Definition io.h:109
interface of stream I/O for serialization
Definition io.h:30
class to construct input split from multiple files
Definition input_split_base.h:21
virtual bool NextChunkEx(Chunk *chunk)
fill the given chunk with new data without using internal temporary chunk
Definition input_split_base.h:106
size_t offset_begin_
beginning of offset
Definition input_split_base.h:127
size_t offset_end_
end of the offset
Definition input_split_base.h:129
virtual void HintChunkSize(size_t chunk_size)
hint to the InputSplit how large a chunk it should return when implementing NextChunk; this is a h...
Definition input_split_base.h:45
virtual bool NextRecord(Blob *out_rec)
get the next record, the returning value is valid until next call to NextRecord, NextChunk or NextBat...
Definition input_split_base.h:52
SeekStream * fs_
current input stream
Definition input_split_base.h:133
size_t file_ptr_end_
file pointer where the end of file lies
Definition input_split_base.h:137
void Init(FileSystem *fs, const char *uri, size_t align_bytes, const bool recurse_directories=false)
initialize the base before doing anything
Definition input_split_base.cc:13
std::vector< FileInfo > files_
information about files
Definition input_split_base.h:131
virtual bool IsTextParser(void)=0
query whether this object is a text parser
FileSystem * filesys_
FileSystem.
Definition input_split_base.h:121
std::vector< size_t > file_offset_
byte-offset of each file
Definition input_split_base.h:123
virtual bool NextBatchEx(Chunk *chunk, size_t)
fill the given chunk with new batch of data without using internal temporary chunk
Definition input_split_base.h:115
virtual bool NextChunk(Blob *out_chunk)
get a chunk of memory that can contain multiple records, the caller needs to parse the content of the...
Definition input_split_base.h:59
virtual size_t GetTotalSize(void)
get the total size of the InputSplit
Definition input_split_base.h:48
size_t file_ptr_
file pointer of which file to read on
Definition input_split_base.h:135
virtual size_t SeekRecordBegin(Stream *fi)=0
seek to the beginning of the first record in current file pointer
bool ExtractNextChunk(Blob *out_rchunk, Chunk *chunk)
extract next chunk from the chunk
Definition input_split_base.cc:300
virtual void BeforeFirst(void)
reset the position of InputSplit to beginning
Definition input_split_base.cc:66
std::vector< URI > ConvertToURIs(const std::string &uri)
split string list of files into vector of URIs
Definition input_split_base.cc:96
virtual bool ReadChunk(void *buf, size_t *size)
read a chunk of data into buf; the data can span multiple records, but cannot contain partial records
Definition input_split_base.cc:221
size_t offset_curr_
get the current offset
Definition input_split_base.h:125
size_t buffer_size_
buffer size
Definition input_split_base.h:141
size_t Read(void *ptr, size_t size)
same as stream.Read
Definition input_split_base.cc:177
virtual const char * FindLastRecordBegin(const char *begin, const char *end)=0
find the last occurrence of record header
Chunk tmp_chunk_
temporary chunk
Definition input_split_base.h:139
virtual bool ExtractNextRecord(Blob *out_rec, Chunk *chunk)=0
extract next record from the chunk
virtual void ResetPartition(unsigned rank, unsigned nsplit)
reset the InputSplit to a certain part id; the InputSplit will then point to the head of the new spe...
Definition input_split_base.cc:30
Utilities to manipulate files.
defines serializable interface of dmlc
namespace for dmlc
Definition array_view.h:12
a blob of memory region
Definition io.h:158
helper struct to hold chunk data with internal pointer to move along the record
Definition input_split_base.h:27