Medial Code Documentation
Loading...
Searching...
No Matches
sparse_page_dmatrix.h
Go to the documentation of this file.
1
7#ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
8#define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
9
10#include <algorithm>
11#include <map>
12#include <memory>
13#include <string>
14#include <utility>
15#include <vector>
16
17#include "ellpack_page_source.h"
18#include "gradient_index_page_source.h"
19#include "sparse_page_source.h"
20#include "xgboost/data.h"
21#include "xgboost/logging.h"
22
23namespace xgboost::data {
57class SparsePageDMatrix : public DMatrix {
58 MetaInfo info_;
59 BatchParam batch_param_;
60 std::map<std::string, std::shared_ptr<Cache>> cache_info_;
61
62 DMatrixHandle proxy_;
63 DataIterHandle iter_;
66
67 float missing_;
68 Context fmat_ctx_;
69 std::string cache_prefix_;
70 uint32_t n_batches_{0};
71 // sparse page is the source to other page types, we make a special member function.
72 void InitializeSparsePage(Context const *ctx);
73 // Non-virtual version that can be used in constructor
74 BatchSet<SparsePage> GetRowBatchesImpl(Context const *ctx);
75
76 public:
77 explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
78 XGDMatrixCallbackNext *next, float missing, int32_t nthreads,
79 std::string cache_prefix);
80
81 ~SparsePageDMatrix() override {
82 // Clear out all resources before deleting the cache file.
83 sparse_page_source_.reset();
84 ellpack_page_source_.reset();
85 column_source_.reset();
86 sorted_column_source_.reset();
87 ghist_index_source_.reset();
88
89 for (auto const &kv : cache_info_) {
90 CHECK(kv.second);
91 auto n = kv.second->ShardName();
92 TryDeleteCacheFile(n);
93 }
94 }
95
96 MetaInfo &Info() override;
97 const MetaInfo &Info() const override;
98 Context const *Ctx() const override { return &fmat_ctx_; }
99 // The only DMatrix implementation that returns false.
100 bool SingleColBlock() const override { return false; }
101 DMatrix *Slice(common::Span<int32_t const>) override {
102 LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
103 return nullptr;
104 }
105 DMatrix *SliceCol(int, int) override {
106 LOG(FATAL) << "Slicing DMatrix columns is not supported for external memory.";
107 return nullptr;
108 }
109
110 private:
111 BatchSet<SparsePage> GetRowBatches() override;
112 BatchSet<CSCPage> GetColumnBatches(Context const *ctx) override;
113 BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const *ctx) override;
114 BatchSet<EllpackPage> GetEllpackBatches(Context const *ctx, const BatchParam &param) override;
115 BatchSet<GHistIndexMatrix> GetGradientIndex(Context const *ctx, const BatchParam &) override;
116 BatchSet<ExtSparsePage> GetExtBatches(Context const *, BatchParam const &) override {
117 LOG(FATAL) << "Can not obtain a single CSR page for external memory DMatrix";
119 }
120
121 // source data pointers.
122 std::shared_ptr<SparsePageSource> sparse_page_source_;
123 std::shared_ptr<EllpackPageSource> ellpack_page_source_;
124 std::shared_ptr<CSCPageSource> column_source_;
125 std::shared_ptr<SortedCSCPageSource> sorted_column_source_;
126 std::shared_ptr<GradientIndexPageSource> ghist_index_source_;
127
128 bool EllpackExists() const override { return static_cast<bool>(ellpack_page_source_); }
129 bool GHistIndexExists() const override { return static_cast<bool>(ghist_index_source_); }
130 bool SparsePageExists() const override { return static_cast<bool>(sparse_page_source_); }
131};
132
133inline std::string MakeId(std::string prefix, SparsePageDMatrix *ptr) {
134 std::stringstream ss;
135 ss << ptr;
136 return prefix + "-" + ss.str();
137}
138
139inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
140 std::map<std::string, std::shared_ptr<Cache>> *out) {
141 auto &cache_info = *out;
142 auto name = MakeId(prefix, ptr);
143 auto id = name + format;
144 auto it = cache_info.find(id);
145 if (it == cache_info.cend()) {
146 cache_info[id].reset(new Cache{false, name, format});
147 LOG(INFO) << "Make cache:" << cache_info[id]->ShardName() << std::endl;
148 }
149 return id;
150}
151} // namespace xgboost::data
152#endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
Definition svm.cpp:71
Definition data.h:458
Definition data.h:494
Meta information about the dataset; it always sits in memory.
Definition data.h:48
Span class implementation, based on the ISO C++20 span<T>. The interface should be the same.
Definition span.h:424
Definition core.py:748
DMatrix used for external memory.
Definition sparse_page_dmatrix.h:57
XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter)
Callback function prototype for getting next batch of data.
XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle)
Callback function prototype for resetting external iterator.
Copyright 2015-2023 by XGBoost Contributors.
Defines console logging options for xgboost. Used to enforce unified print behavior.
Copyright 2019-2023, XGBoost Contributors.
Definition data.py:1
Copyright 2014-2023, XGBoost Contributors.
Parameters for constructing histogram index batches.
Definition data.h:244
Runtime context for XGBoost.
Definition context.h:84