6#ifndef DMLC_IO_INDEXED_RECORDIO_SPLIT_H_
7#define DMLC_IO_INDEXED_RECORDIO_SPLIT_H_
21const unsigned INDEXED_RECORDIO_ALIGN = 4;
27 const char *index_uri,
30 const size_t batch_size,
33 this->shuffle_ = shuffle;
34 if (shuffle) SetRandomSeed(seed);
35 this->batch_size_ = batch_size;
36 this->
Init(fs, uri, INDEXED_RECORDIO_ALIGN);
37 this->ReadIndexFile(fs, index_uri);
45 bool ReadChunk(
void *buf,
size_t *size)
override;
48 bool NextBatch(Blob *out_chunk,
size_t n_records)
override;
56 void SetRandomSeed(
size_t seed) {
57 rnd_.seed(kRandMagic + seed);
59 void SetBatchSize(
int batch_size) {
60 this->batch_size_ = batch_size;
65 bool NextBatchEx(Chunk *out_chunk,
size_t n_records)
override;
71 virtual void ReadIndexFile(FileSystem *fs,
const std::string& index_uri);
74 std::vector<std::pair<size_t, size_t> > index_;
75 std::vector<size_t> permutation_;
77 size_t current_index_;
82 const int kRandMagic = 111;
interface of stream I/O for serialization
Definition io.h:30
class that splits the recordIO file by record
Definition indexed_recordio_split.h:23
size_t SeekRecordBegin(Stream *fi) override
seek to the beginning of the first record in current file pointer
Definition indexed_recordio_split.cc:66
bool NextBatchEx(Chunk *out_chunk, size_t n_records) override
fill the given chunk with new batch of data without using internal temporary chunk
Definition indexed_recordio_split.cc:159
bool NextRecord(Blob *out_rec) override
get the next record; the returned value is valid until the next call to NextRecord, NextChunk or NextBat...
Definition indexed_recordio_split.h:49
bool NextChunk(Blob *out_chunk) override
get a chunk of memory that can contain multiple records, the caller needs to parse the content of the...
Definition indexed_recordio_split.cc:155
const char * FindLastRecordBegin(const char *begin, const char *end) override
find the last occurrence of record header
Definition indexed_recordio_split.cc:86
bool ReadChunk(void *buf, size_t *size) override
read a chunk of data into buf; the data can span multiple records, but cannot contain partial records
Definition indexed_recordio_split.cc:144
bool ExtractNextRecord(Blob *out_rec, Chunk *chunk) override
extract next record from the chunk
Definition indexed_recordio_split.cc:104
void ResetPartition(unsigned rank, unsigned nsplit) override
reset the InputSplit to a certain part id. The InputSplit will be pointed to the head of the new spe...
Definition indexed_recordio_split.cc:12
void BeforeFirst(void) override
reset the position of InputSplit to beginning
Definition indexed_recordio_split.cc:221
bool NextBatch(Blob *out_chunk, size_t n_records) override
get a chunk of memory that can contain multiple records, with a hint for how many records are needed,...
Definition indexed_recordio_split.cc:214
bool NextChunkEx(Chunk *out_chunk) override
fill the given chunk with new data without using internal temporary chunk
Definition indexed_recordio_split.h:62
bool IsTextParser(void) override
query whether this object is a text parser
Definition indexed_recordio_split.h:41
defines serializable interface of dmlc
namespace for dmlc
Definition array_view.h:12
recordio that is able to pack binary data into a splittable format, useful to exchange data in binary...