Medial Code Documentation
Loading...
Searching...
No Matches
file_iterator.h
1
4#ifndef XGBOOST_DATA_FILE_ITERATOR_H_
5#define XGBOOST_DATA_FILE_ITERATOR_H_
6
7#include <algorithm> // for max_element
8#include <cstddef> // for size_t
9#include <cstdint> // for uint32_t
10#include <memory> // for unique_ptr
11#include <string> // for string
12#include <utility> // for move
13
14#include "dmlc/data.h" // for RowBlock, Parser
15#include "xgboost/c_api.h" // for XGDMatrixSetDenseInfo, XGDMatrixFree, XGProxyDMatrixCreate
16#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec
17#include "xgboost/logging.h" // for CHECK
18
19namespace xgboost::data {
20[[nodiscard]] std::string ValidateFileFormat(std::string const& uri);
21
28 // URI of the input file; it encodes parameters such as whether the index is 1-based. The
29 // dmlc parser decodes this information.
30 std::string uri_;
31 // Equal to rank_id in distributed training; used to split the file into parts, one for
32 // each worker.
33 uint32_t part_idx_;
34 // Equal to the total number of workers.
35 uint32_t n_parts_;
36
37 DMatrixHandle proxy_;
38
39 std::unique_ptr<dmlc::Parser<uint32_t>> parser_;
40 // Temporary reference to stage the data.
42 // Storage for the array interface strings.
43 std::string indptr_;
44 std::string values_;
45 std::string indices_;
46
47 public:
48 FileIterator(std::string uri, unsigned part_index, unsigned num_parts)
49 : uri_{ValidateFileFormat(std::move(uri))}, part_idx_{part_index}, n_parts_{num_parts} {
50 XGProxyDMatrixCreate(&proxy_);
51 }
53 XGDMatrixFree(proxy_);
54 }
55
56 int Next() {
57 CHECK(parser_);
58 if (parser_->Next()) {
59 row_block_ = parser_->Value();
60 using linalg::MakeVec;
61
62 indptr_ = ArrayInterfaceStr(MakeVec(row_block_.offset, row_block_.size + 1));
63 values_ = ArrayInterfaceStr(MakeVec(row_block_.value, row_block_.offset[row_block_.size]));
64 indices_ = ArrayInterfaceStr(MakeVec(row_block_.index, row_block_.offset[row_block_.size]));
65
66 size_t n_columns = *std::max_element(row_block_.index,
67 row_block_.index + row_block_.offset[row_block_.size]);
68 // dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore
69 // this condition and just add 1 to n_columns
70 n_columns += 1;
71
72 XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(),
73 values_.c_str(), n_columns);
74
75 if (row_block_.label) {
76 XGDMatrixSetDenseInfo(proxy_, "label", row_block_.label, row_block_.size, 1);
77 }
78 if (row_block_.qid) {
79 XGDMatrixSetDenseInfo(proxy_, "qid", row_block_.qid, row_block_.size, 1);
80 }
81 if (row_block_.weight) {
82 XGDMatrixSetDenseInfo(proxy_, "weight", row_block_.weight, row_block_.size, 1);
83 }
84 // Continue iteration
85 return true;
86 } else {
87 // Stop iteration
88 return false;
89 }
90 }
91
92 auto Proxy() -> decltype(proxy_) { return proxy_; }
93
94 void Reset() {
95 parser_.reset(dmlc::Parser<uint32_t>::Create(uri_.c_str(), part_idx_, n_parts_, "auto"));
96 }
97};
98
99namespace fileiter {
100inline void Reset(DataIterHandle self) {
101 static_cast<FileIterator*>(self)->Reset();
102}
103
104inline int Next(DataIterHandle self) {
105 return static_cast<FileIterator*>(self)->Next();
106}
107} // namespace fileiter
108} // namespace xgboost::data
109#endif // XGBOOST_DATA_FILE_ITERATOR_H_
parser interface that parses input data used to load dmlc data format into your own data format Diffe...
Definition data.h:293
An iterator for implementing external memory support with file inputs.
Definition file_iterator.h:27
defines common input data structure, and interface for handling the input data
XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void const *data, bst_ulong size, int type)
Set meta info from dense matrix. Valid field names are:
Definition c_api.cc:672
XGB_DLL int XGDMatrixFree(DMatrixHandle handle)
free space in data matrix
Definition c_api.cc:585
XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle *out)
Second set of callback functions, used by constructing Quantile DMatrix or external memory DMatrix us...
Definition c_api.cc:359
XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr, char const *indices, char const *data, bst_ulong ncol)
Set data on a DMatrix proxy.
Definition c_api.cc:406
defines console logging options for xgboost. Use to enforce unified print behavior.
Copyright 2021-2023 by XGBoost Contributors.
Copyright 2019-2023, XGBoost Contributors.
Definition data.py:1
auto MakeVec(T *ptr, size_t s, int32_t device=-1)
Create a vector view from contiguous memory.
Definition linalg.h:649
a block of data, containing several rows in sparse matrix This is useful for (streaming-style) algor...
Definition data.h:175
const DType * label
array[size] label of each instance
Definition data.h:181
size_t size
batch size
Definition data.h:177
const real_t * weight
With weight: array[size] weight of each instance, otherwise nullptr.
Definition data.h:183
const IndexType * index
feature index
Definition data.h:189
const DType * value
feature value, can be NULL, indicating all values are 1
Definition data.h:191
const uint64_t * qid
With qid: array[size] session id of each instance, otherwise nullptr.
Definition data.h:185
const size_t * offset
array[size+1], row pointer to beginning of each rows
Definition data.h:179
Copyright 2015~2023 by XGBoost Contributors.