Medial Code Documentation
gradient_index.h
5 #ifndef XGBOOST_DATA_GRADIENT_INDEX_H_
6 #define XGBOOST_DATA_GRADIENT_INDEX_H_
7
8 #include <algorithm> // for min
9 #include <atomic> // for atomic
10 #include <cinttypes> // for uint32_t
11 #include <cstddef> // for size_t
12 #include <memory> // for make_unique
13 #include <vector>
14
15 #include "../common/categorical.h"
16 #include "../common/error_msg.h" // for InfInData
17 #include "../common/hist_util.h"
18 #include "../common/numeric.h"
19 #include "../common/ref_resource_view.h" // for RefResourceView
20 #include "../common/threading_utils.h"
21 #include "../common/transform_iterator.h" // for MakeIndexTransformIter
22 #include "adapter.h"
23 #include "xgboost/base.h"
24 #include "xgboost/data.h"
25
26 namespace xgboost {
27 namespace common {
28 class ColumnMatrix;
29 class AlignedFileWriteStream;
30 } // namespace common
31
38 class GHistIndexMatrix {
39 // Get the size of each row
40 template <typename AdapterBatchT>
41 auto GetRowCounts(AdapterBatchT const& batch, float missing, int32_t n_threads) {
42 std::vector<size_t> valid_counts(batch.Size(), 0);
43 common::ParallelFor(batch.Size(), n_threads, [&](size_t i) {
44 auto line = batch.GetLine(i);
45 for (size_t j = 0; j < line.Size(); ++j) {
46 data::COOTuple elem = line.GetElement(j);
47 if (data::IsValidFunctor {missing}(elem)) {
48 valid_counts[i]++;
49 }
50 }
51 });
52 return valid_counts;
53 }
54
59 void PushBatch(SparsePage const& batch, common::Span<FeatureType const> ft, int32_t n_threads);
60
61 template <typename Batch, typename BinIdxType, typename GetOffset, typename IsValid>
62 void SetIndexData(common::Span<BinIdxType> index_data_span, size_t rbegin,
63 common::Span<FeatureType const> ft, size_t batch_threads, Batch const& batch,
64 IsValid&& is_valid, size_t nbins, GetOffset&& get_offset) {
65 auto batch_size = batch.Size();
66 BinIdxType* index_data = index_data_span.data();
67 auto const& ptrs = cut.Ptrs();
68 auto const& values = cut.Values();
69 std::atomic<bool> valid{true};
70 common::ParallelFor(batch_size, batch_threads, [&](size_t i) {
71 auto line = batch.GetLine(i);
72 size_t ibegin = row_ptr[rbegin + i]; // index of first entry for current block
73 size_t k = 0;
74 auto tid = omp_get_thread_num();
75 for (size_t j = 0; j < line.Size(); ++j) {
76 data::COOTuple elem = line.GetElement(j);
77 if (is_valid(elem)) {
78 if (XGBOOST_EXPECT((std::isinf(elem.value)), false)) {
79 valid = false;
80 }
81 bst_bin_t bin_idx{-1};
82 if (common::IsCat(ft, elem.column_idx)) {
83 bin_idx = cut.SearchCatBin(elem.value, elem.column_idx, ptrs, values);
84 } else {
85 bin_idx = cut.SearchBin(elem.value, elem.column_idx, ptrs, values);
86 }
87 index_data[ibegin + k] = get_offset(bin_idx, j);
88 ++hit_count_tloc_[tid * nbins + bin_idx];
89 ++k;
90 }
91 }
92 });
93
94 CHECK(valid) << error::InfInData();
95 }
96
97 // Gather hit_count from all threads
98 void GatherHitCount(int32_t n_threads, bst_bin_t n_bins_total) {
99 CHECK_EQ(hit_count.size(), n_bins_total);
100 common::ParallelFor(n_bins_total, n_threads, [&](bst_omp_uint idx) {
101 for (int32_t tid = 0; tid < n_threads; ++tid) {
102 hit_count[idx] += hit_count_tloc_[tid * n_bins_total + idx];
103 hit_count_tloc_[tid * n_bins_total + idx] = 0; // reset for next batch
104 }
105 });
106 }
107
108 template <typename Batch, typename IsValid>
109 void PushBatchImpl(int32_t n_threads, Batch const& batch, size_t rbegin, IsValid&& is_valid,
110 common::Span<FeatureType const> ft) {
111 // The number of threads is pegged to the batch size. If the OMP block is parallelized
112 // on anything other than the batch/block size, it should be reassigned
113 size_t batch_threads =
114 std::max(static_cast<size_t>(1), std::min(batch.Size(), static_cast<size_t>(n_threads)));
115
116 auto n_bins_total = cut.TotalBins();
117 const size_t n_index = row_ptr[rbegin + batch.Size()]; // number of entries in this page
118 ResizeIndex(n_index, isDense_);
119 if (isDense_) {
120 index.SetBinOffset(cut.Ptrs());
121 }
122 if (isDense_) {
123 common::DispatchBinType(index.GetBinTypeSize(), [&](auto dtype) {
124 using T = decltype(dtype);
125 common::Span<T> index_data_span = {index.data<T>(), index.Size()};
126 SetIndexData(index_data_span, rbegin, ft, batch_threads, batch, is_valid, n_bins_total,
127 index.MakeCompressor<T>());
128 });
129 } else {
130 common::Span<uint32_t> index_data_span = {index.data<uint32_t>(), n_index};
131 // no compression
132 SetIndexData(index_data_span, rbegin, ft, batch_threads, batch, is_valid, n_bins_total,
133 [](auto idx, auto) { return idx; });
134 }
135 this->GatherHitCount(n_threads, n_bins_total);
136 }
137
138 public:
139 /** \brief row pointer to rows by element position. */
140 common::RefResourceView<std::size_t> row_ptr;
141 /** \brief data storage for index. */
142 common::RefResourceView<std::uint8_t> data;
143 /** \brief The histogram index. */
144 common::Index index;
145 /** \brief hit count of each index, used for constructing the ColumnMatrix. */
146 common::RefResourceView<std::size_t> hit_count;
147 /** \brief The corresponding cuts. */
148 common::HistogramCuts cut;
149 /** \brief max_bin for each feature. */
150 bst_bin_t max_numeric_bins_per_feat;
151
152 bst_row_t base_rowid{0};
153
154 [[nodiscard]] bst_bin_t MaxNumBinPerFeat() const {
155 return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
156 }
157
158 ~GHistIndexMatrix();
162 GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat,
163 double sparse_thresh, bool sorted_sketch, common::Span<float const> hess = {});
168 GHistIndexMatrix(MetaInfo const& info, common::HistogramCuts&& cuts, bst_bin_t max_bin_per_feat);
173 GHistIndexMatrix(Context const* ctx, MetaInfo const& info, EllpackPage const& page,
174 BatchParam const& p);
175
179 GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
180 common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
181 double sparse_thresh, int32_t n_threads);
182 GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
183
184 template <typename Batch>
185 void PushAdapterBatch(Context const* ctx, size_t rbegin, size_t prev_sum, Batch const& batch,
186 float missing, common::Span<FeatureType const> ft, double sparse_thresh,
187 size_t n_samples_total) {
188 auto n_bins_total = cut.TotalBins();
189 hit_count_tloc_.clear();
190 hit_count_tloc_.resize(ctx->Threads() * n_bins_total, 0);
191
192 auto n_threads = ctx->Threads();
193 auto valid_counts = GetRowCounts(batch, missing, n_threads);
194
195 auto it = common::MakeIndexTransformIter([&](size_t ridx) { return valid_counts[ridx]; });
196 common::PartialSum(n_threads, it, it + batch.Size(), prev_sum, row_ptr.begin() + rbegin);
197 auto is_valid = data::IsValidFunctor{missing};
198
199 PushBatchImpl(ctx->Threads(), batch, rbegin, is_valid, ft);
200
201 if (rbegin + batch.Size() == n_samples_total) {
202 // finished
203 CHECK(!std::isnan(sparse_thresh));
204 this->columns_ = std::make_unique<common::ColumnMatrix>(*this, sparse_thresh);
205 }
206 }
207
208 // Call ColumnMatrix::PushBatch
209 template <typename Batch>
210 void PushAdapterBatchColumns(Context const* ctx, Batch const& batch, float missing,
211 size_t rbegin);
212
213 void ResizeIndex(const size_t n_index, const bool isDense);
214
215 void GetFeatureCounts(size_t* counts) const {
216 auto nfeature = cut.Ptrs().size() - 1;
217 for (unsigned fid = 0; fid < nfeature; ++fid) {
218 auto ibegin = cut.Ptrs()[fid];
219 auto iend = cut.Ptrs()[fid + 1];
220 for (auto i = ibegin; i < iend; ++i) {
221 counts[fid] += hit_count[i];
222 }
223 }
224 }
225
226 [[nodiscard]] bool IsDense() const { return isDense_; }
227 void SetDense(bool is_dense) { isDense_ = is_dense; }
231 [[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
232
233 [[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
234 [[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
235
236 [[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi);
237 [[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const;
238
239 [[nodiscard]] common::ColumnMatrix const& Transpose() const;
240
241 [[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
242
243 [[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
244 [[nodiscard]] float GetFvalue(std::vector<std::uint32_t> const& ptrs,
245 std::vector<float> const& values, std::vector<float> const& mins,
246 bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
247
248 [[nodiscard]] common::HistogramCuts& Cuts() { return cut; }
249 [[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; }
250
251 private:
252 std::unique_ptr<common::ColumnMatrix> columns_;
253 std::vector<size_t> hit_count_tloc_;
254 bool isDense_;
255 };
256
264 template <typename Fn>
265 void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
266 auto const batch_size = page.Size();
267 auto const& ptrs = page.cut.Ptrs();
268 std::size_t k{0};
269
270 auto dense = page.IsDense();
271
272 common::DispatchBinType(page.index.GetBinTypeSize(), [&](auto t) {
273 using BinT = decltype(t);
274 auto const& index = page.index;
275 for (std::size_t ridx = 0; ridx < batch_size; ++ridx) {
276 auto r_beg = page.row_ptr[ridx];
277 auto r_end = page.row_ptr[ridx + 1];
278 bst_feature_t fidx{0};
279 if (dense) {
280 // compressed, use the operator to obtain the true value.
281 for (std::size_t j = r_beg; j < r_end; ++j) {
282 bst_feature_t fidx = j - r_beg;
283 std::uint32_t bin_idx = index[k];
284 assign(bin_idx, k, ridx, fidx);
285 ++k;
286 }
287 } else {
288 // not compressed
289 auto const* row_index = index.data<BinT>() + page.row_ptr[page.base_rowid];
290 for (std::size_t j = r_beg; j < r_end; ++j) {
291 std::uint32_t bin_idx = row_index[k];
292 // find the feature index for current bin.
293 while (bin_idx >= ptrs[fidx + 1]) {
294 fidx++;
295 }
296 assign(bin_idx, k, ridx, fidx);
297 ++k;
298 }
299 }
300 }
301 });
302 }
303 } // namespace xgboost
304 #endif // XGBOOST_DATA_GRADIENT_INDEX_H_
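As a usage illustration for AssignColumnBinIndex defined just above: the following sketch (not part of the header) tallies how many entries fall into each global histogram bin of a page. It assumes a fully built GHistIndexMatrix named page is available; CountBinOccurrences is a hypothetical helper name. The callback parameters mirror the assign(bin_idx, k, ridx, fidx) calls in the listing, and the resulting counts correspond to what the page itself accumulates in hit_count.

#include <cstddef>
#include <cstdint>
#include <vector>

#include "gradient_index.h" // GHistIndexMatrix, AssignColumnBinIndex

namespace xgboost {
// Hypothetical helper: count occurrences of every global histogram bin in `page`.
std::vector<std::size_t> CountBinOccurrences(GHistIndexMatrix const& page) {
  // One counter per bin; all features share a single global bin index space.
  std::vector<std::size_t> counts(page.Cuts().TotalBins(), 0);
  AssignColumnBinIndex(page, [&](std::uint32_t bin_idx, std::size_t /*k*/,
                                 std::size_t /*ridx*/, bst_feature_t /*fidx*/) {
    ++counts[bin_idx];
  });
  return counts;
}
}  // namespace xgboost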
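A second sketch shows the in-memory construction path through the constructor declared at line 162, followed by the basic accessors Size(), Features(), IsDense() and GetFeatureCounts(). The Context and DMatrix are assumed to be set up elsewhere, the include paths are assumed to match the ones used by this header, and the parameter values (256 bins per feature, a 0.2 sparse threshold, no sorted sketch) are purely illustrative.

#include <cstddef>
#include <vector>

#include "gradient_index.h"
#include "xgboost/context.h" // Context
#include "xgboost/data.h"    // DMatrix

// Hypothetical driver: quantize an existing DMatrix into a gradient index.
void BuildQuantizedIndex(xgboost::Context const* ctx, xgboost::DMatrix* dmat) {
  xgboost::GHistIndexMatrix gmat{ctx, dmat, /*max_bins_per_feat=*/256,
                                 /*sparse_thresh=*/0.2, /*sorted_sketch=*/false};

  std::size_t n_rows = gmat.Size();   // rows in this page
  auto n_features = gmat.Features();  // == cut.Ptrs().size() - 1
  bool dense = gmat.IsDense();

  // Per-feature occurrence counts, accumulated from the per-bin hit counts.
  std::vector<std::size_t> feature_counts(n_features, 0);
  gmat.GetFeatureCounts(feature_counts.data());

  (void)n_rows;
  (void)dense;
}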
Referenced declarations:

xgboost::GHistIndexMatrix: preprocessed global index matrix, in CSR format. Defined at gradient_index.h:38.
common::RefResourceView<std::size_t> row_ptr: row pointer to rows by element position. Defined at gradient_index.h:140.
common::Index index: the histogram index. Defined at gradient_index.h:144.
std::size_t RowIdx(size_t ridx) const: get the local row index. Defined at gradient_index.h:231.
common::RefResourceView<std::size_t> hit_count: hit count of each index, used for constructing the ColumnMatrix. Defined at gradient_index.h:146.
bst_bin_t max_numeric_bins_per_feat: max_bin for each feature. Defined at gradient_index.h:150.
common::RefResourceView<std::uint8_t> data: data storage for index. Defined at gradient_index.h:142.
common::HistogramCuts cut: the corresponding cuts. Defined at gradient_index.h:148.
SparsePage: in-memory storage unit of sparse batch, stored in CSR format. Defined at data.h:328.
common::HistogramCuts: defined at hist_util.h:37.
common::Index: optionally compressed gradient index. Defined at hist_util.h:210.
common::RefResourceView: a vector-like type that holds a reference counted resource. Defined at ref_resource_view.h:26.
common::Span: span class implementation, based on ISO C++20 span<T>; the interface should be the same. Defined at span.h:424.
auto DispatchBinType(BinTypeSize type, Fn&& fn): dispatch on the bin type; fn is a function that accepts a scalar of the bin type (see the usage sketch at the end of this page). Defined at hist_util.h:187.
namespace xgboost: the xgboost namespace. Defined at base.h:90.
void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign): helper for recovering the feature index from row-based storage of histogram bins. Defined at gradient_index.h:265.
dmlc::omp_uint bst_omp_uint: unsigned integer type for OpenMP loops. Defined at base.h:324.
std::size_t bst_row_t: type for data row index. Defined at base.h:110.
int32_t bst_bin_t: type for histogram bin index. Defined at base.h:103.
std::int32_t Context::Threads() const: returns the automatically chosen number of threads based on the nthread parameter and the system setting. Defined at context.cc:203.
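Finally, a small sketch of the DispatchBinType pattern referenced above (see also its use inside PushBatchImpl and AssignColumnBinIndex in the listing). BytesPerBinEntry is a hypothetical helper; it assumes a built GHistIndexMatrix page and only reports the width of the bin storage type selected for the page's index.

#include <cstddef>

#include "gradient_index.h"

// Hypothetical helper: report how many bytes each stored bin index occupies.
std::size_t BytesPerBinEntry(xgboost::GHistIndexMatrix const& page) {
  std::size_t n_bytes = 0;
  xgboost::common::DispatchBinType(page.index.GetBinTypeSize(), [&](auto t) {
    using BinT = decltype(t);  // concrete storage type chosen by the dispatcher
    n_bytes = sizeof(BinT);
  });
  return n_bytes;
}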