Medial Code Documentation
|
class that splits the files by line More...
#include <line_split.h>
Public Member Functions | |
LineSplitter (FileSystem *fs, const char *uri, unsigned rank, unsigned nsplit) | |
bool | IsTextParser (void) |
query whether this object is a text parser | |
virtual bool | ExtractNextRecord (Blob *out_rec, Chunk *chunk) |
extract next record from the chunk | |
![]() | |
virtual void | BeforeFirst (void) |
reset the position of InputSplit to beginning | |
virtual void | HintChunkSize (size_t chunk_size) |
hint to the InputSplit how large a chunk it should return when implementing NextChunk; this is only a hint, so it may not be enforced, but InputSplit will try to adjust its internal buffer size to the hinted value | |
virtual size_t | GetTotalSize (void) |
get the total size of the InputSplit | |
virtual bool | NextRecord (Blob *out_rec) |
get the next record; the returned value is valid until the next call to NextRecord, NextChunk or NextBatch; the caller can modify the memory content of out_rec | |
virtual bool | NextChunk (Blob *out_chunk) |
get a chunk of memory that can contain multiple records; the caller needs to parse the content of the resulting chunk; for text files, out_chunk can contain data of multiple lines; for recordio, out_chunk can contain multiple records (including headers) | |
virtual void | ResetPartition (unsigned rank, unsigned nsplit) |
reset the InputSplit to a certain part id. The InputSplit will be pointed to the head of the newly specified segment. This feature may not be supported by every implementation of InputSplit. | |
virtual bool | ReadChunk (void *buf, size_t *size) |
read a chunk of data into buf; the data can span multiple records, but cannot contain partial records | |
bool | ExtractNextChunk (Blob *out_rchunk, Chunk *chunk) |
extract next chunk from the chunk | |
virtual bool | NextChunkEx (Chunk *chunk) |
fill the given chunk with new data without using internal temporary chunk | |
virtual bool | NextBatchEx (Chunk *chunk, size_t) |
fill the given chunk with new batch of data without using internal temporary chunk | |
![]() | |
virtual bool | NextBatch (Blob *out_chunk, size_t) |
get a chunk of memory that can contain multiple records, with a hint for how many records are needed; the caller needs to parse the content of the resulting chunk; for text files, out_chunk can contain data of multiple lines; for recordio, out_chunk can contain multiple records (including headers) | |
virtual | ~InputSplit (void) DMLC_THROW_EXCEPTION |
destructor | |
Protected Member Functions | |
virtual size_t | SeekRecordBegin (Stream *fi) |
seek to the beginning of the first record in current file pointer | |
virtual const char * | FindLastRecordBegin (const char *begin, const char *end) |
find the last occurrence of record header | |
![]() | |
void | Init (FileSystem *fs, const char *uri, size_t align_bytes, const bool recurse_directories=false) |
initialize the base before doing anything | |
std::vector< URI > | ConvertToURIs (const std::string &uri) |
split string list of files into vector of URIs | |
size_t | Read (void *ptr, size_t size) |
same as stream.Read | |
Additional Inherited Members | |
![]() | |
static InputSplit * | Create (const char *uri, unsigned part_index, unsigned num_parts, const char *type) |
factory function: create input split given a uri | |
static InputSplit * | Create (const char *uri, const char *index_uri, unsigned part_index, unsigned num_parts, const char *type, const bool shuffle=false, const int seed=0, const size_t batch_size=256, const bool recurse_directories=false) |
factory function: create input split given a uri for input and index | |
![]() | |
static const size_t | kBufferSize = 2UL << 20UL |
![]() | |
FileSystem * | filesys_ |
FileSystem. | |
std::vector< size_t > | file_offset_ |
byte-offset of each file | |
size_t | offset_curr_ |
get the current offset | |
size_t | offset_begin_ |
beginning of offset | |
size_t | offset_end_ |
end of the offset | |
std::vector< FileInfo > | files_ |
information about files | |
SeekStream * | fs_ |
current input stream | |
size_t | file_ptr_ |
file pointer of which file to read on | |
size_t | file_ptr_end_ |
file pointer where the end of file lies | |
Chunk | tmp_chunk_ |
temporary chunk | |
size_t | buffer_size_ |
buffer size | |
class that splits the files by line
extract next record from the chunk
out_rec | the output record |
chunk | the chunk information |
Implements dmlc::io::InputSplitBase.
|
protected virtual |
find the last occurrence of record header
begin | beginning of the buffer |
end | end of the buffer |
Implements dmlc::io::InputSplitBase.
|
inline virtual |
query whether this object is a text parser
Implements dmlc::io::InputSplitBase.
|
protected virtual |
seek to the beginning of the first record in current file pointer
Implements dmlc::io::InputSplitBase.