152 struct MissingIndicator {
154 using T =
typename BitFieldT::value_type;
158 static_assert(std::is_same_v<T, std::uint32_t>);
/**
 * @brief Produce the fill pattern used to initialise storage.
 *
 * Only available for types that are not signed (the return type is removed from overload
 * resolution otherwise).
 *
 * @tparam U   Non-signed element type of the fill value.
 * @param init When true every bit of the result is set, otherwise the result is zero.
 * @return All-ones value of type U if @p init is true, a zero value of type U if it is false.
 */
template <typename U>
[[nodiscard]] static std::enable_if_t<!std::is_signed_v<U>, U> InitValue(bool init) {
  if (init) {
    // Bitwise complement of zero sets every bit of U.
    return ~U{0};
  }
  return U{0};
}
/** @brief Compiler-generated default constructor; takes no size or initial value. */
165 MissingIndicator() =
default;
170 MissingIndicator(std::size_t n_elements,
bool init) {
171 auto m_size = missing.ComputeStorageSize(n_elements);
176 void SetValid(
typename LBitField32::index_type i) { missing.Clear(i); }
182 void GrowTo(std::size_t n_elements,
bool init) {
183 CHECK(storage.
Resource()->Type() == ResourceHandler::kMalloc)
184 <<
"[Internal Error]: Cannot grow the vector when external memory is used.";
185 auto m_size = missing.ComputeStorageSize(n_elements);
186 CHECK_GE(m_size, storage.size());
187 if (m_size == storage.size()) {
191 auto resource = std::dynamic_pointer_cast<common::MallocResource>(storage.
Resource());
193 resource->Resize(m_size *
sizeof(T), InitValue<std::byte>(init));
202 template <
typename ColumnBinT,
typename BinT,
typename RIdx>
203 void SetBinSparse(BinT bin_id, RIdx rid,
bst_feature_t fid, ColumnBinT* local_index) {
204 if (type_[fid] == kDenseColumn) {
205 ColumnBinT* begin = &local_index[feature_offsets_[fid]];
206 begin[rid] = bin_id - index_base_[fid];
210 missing_.SetValid(feature_offsets_[fid] + rid);
212 ColumnBinT* begin = &local_index[feature_offsets_[fid]];
213 begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
214 row_ind_[feature_offsets_[fid] + num_nonzeros_[fid]] = rid;
215 ++num_nonzeros_[fid];
227 this->InitStorage(gmat, sparse_threshold);
237 this->InitStorage(gmat, sparse_threshold);
239 this->
PushBatch(n_threads, batch, std::numeric_limits<float>::quiet_NaN(), gmat, 0);
250 auto n_threads = ctx->
Threads();
254 n_features = gmat.Features()](
auto t) {
255 using RowBinIdxT = decltype(t);
256 SetIndexNoMissing(gmat.base_rowid, gmat.index.data<RowBinIdxT>(), size, n_features,
264 [[nodiscard]]
bool IsInitialized()
const {
return !type_.empty(); }
273 template <
typename Batch>
283 n_features = gmat.Features()](
auto t) {
284 using RowBinIdxT = decltype(t);
285 SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), size, n_features, n_threads);
293 void SetTypeSize(
size_t max_bin_per_feat) {
294 if ((max_bin_per_feat - 1) <=
static_cast<int>(std::numeric_limits<uint8_t>::max())) {
295 bins_type_size_ = kUint8BinsTypeSize;
296 }
else if ((max_bin_per_feat - 1) <=
static_cast<int>(std::numeric_limits<uint16_t>::max())) {
297 bins_type_size_ = kUint16BinsTypeSize;
299 bins_type_size_ = kUint32BinsTypeSize;
303 template <
typename BinIdxType>
305 const size_t feature_offset = feature_offsets_[fidx];
306 const size_t column_size = feature_offsets_[fidx + 1] - feature_offset;
307 common::Span<const BinIdxType> bin_index = {
308 reinterpret_cast<const BinIdxType*
>(&index_[feature_offset * bins_type_size_]),
310 return SparseColumnIter<BinIdxType>(bin_index, index_base_[fidx],
311 {&row_ind_[feature_offset], column_size}, first_row_idx);
314 template <
typename BinIdxType,
bool any_missing>
316 const size_t feature_offset = feature_offsets_[fidx];
317 const size_t column_size = feature_offsets_[fidx + 1] - feature_offset;
318 common::Span<const BinIdxType> bin_index = {
319 reinterpret_cast<const BinIdxType*
>(&index_[feature_offset * bins_type_size_]),
321 return std::move(DenseColumnIter<BinIdxType, any_missing>{
322 bin_index,
static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
327 template <
typename RowBinIdxT>
328 void SetIndexNoMissing(
bst_row_t base_rowid, RowBinIdxT
const* row_index,
const size_t n_samples,
329 const size_t n_features, int32_t n_threads) {
330 missing_.GrowTo(feature_offsets_[n_features],
false);
333 using ColumnBinT =
decltype(t);
334 auto column_index = Span<ColumnBinT>{
reinterpret_cast<ColumnBinT*
>(index_.data()),
335 index_.size() /
sizeof(ColumnBinT)};
336 ParallelFor(n_samples, n_threads, [&](
auto rid) {
338 const size_t ibegin = rid * n_features;
339 const size_t iend = (rid + 1) * n_features;
340 for (
size_t i = ibegin, j = 0; i < iend; ++i, ++j) {
341 const size_t idx = feature_offsets_[j];
343 column_index[idx + rid] = row_index[i];
352 template <
typename Batch>
355 auto n_features = gmat.Features();
357 missing_.GrowTo(feature_offsets_[n_features],
true);
358 auto const* row_index = gmat.
index.data<std::uint32_t>() + gmat.
row_ptr[base_rowid];
359 if (num_nonzeros_.empty()) {
362 CHECK_EQ(num_nonzeros_.size(), n_features);
368 using ColumnBinT =
decltype(t);
369 ColumnBinT* local_index =
reinterpret_cast<ColumnBinT*
>(index_.data());
370 size_t const batch_size = batch.Size();
372 for (
size_t rid = 0; rid < batch_size; ++rid) {
373 auto line = batch.GetLine(rid);
374 for (
size_t i = 0; i < line.Size(); ++i) {
375 auto coo = line.GetElement(i);
377 auto fid = coo.column_idx;
378 const uint32_t bin_id = row_index[k];
379 SetBinSparse(bin_id, rid + base_rowid, fid, local_index);
392 auto n_features = gmat.Features();
394 missing_ = MissingIndicator{feature_offsets_[n_features],
true};
398 using ColumnBinT =
decltype(t);
399 ColumnBinT* local_index =
reinterpret_cast<ColumnBinT*
>(index_.data());
400 CHECK(this->any_missing_);
402 [&](
auto bin_idx, std::size_t, std::size_t ridx,
bst_feature_t fidx) {
403 SetBinSparse(bin_idx, ridx, fidx, local_index);
408 [[nodiscard]] BinTypeSize GetTypeSize()
const {
return bins_type_size_; }
409 [[nodiscard]]
auto GetColumnType(
bst_feature_t fidx)
const {
return type_[fidx]; }
412 [[nodiscard]]
bool AnyMissing()
const {
return any_missing_; }
/**
 * @brief Read from an aligned resource stream (declaration only; the definition is not in view).
 *
 * @param fi         Input stream.
 * @param index_base Pointer to 32-bit index base values.
 *                   NOTE(review): presumably stored in index_base_ - confirm in the definition.
 * @return Boolean status. NOTE(review): presumably true on success - confirm in the definition.
 */
415 [[nodiscard]]
bool Read(AlignedResourceReadStream* fi, uint32_t
const* index_base);
/**
 * @brief Write to an aligned file stream (declaration only; the definition is not in view).
 *
 * @param fo Output stream.
 * @return A size value. NOTE(review): presumably the number of bytes written - confirm in the
 *         definition.
 */
416 [[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo)
const;
417 [[nodiscard]] MissingIndicator
const& Missing()
const {
return missing_; }
/**
 * Byte buffer holding the bin indices of all columns. The index setters reinterpret it as an
 * array of the column bin type, and the column accessors locate a feature's data at byte
 * offset feature_offsets_[fidx] * bins_type_size_.
 */
420 RefResourceView<std::uint8_t> index_;
/** Column type per feature (compared against kDenseColumn); empty until initialised. */
422 RefResourceView<ColumnType> type_;
/**
 * Row ids written by SetBinSparse when a column is not handled by the dense branch, at
 * position feature_offsets_[fid] + num_nonzeros_[fid].
 */
424 RefResourceView<std::size_t> row_ind_;
/**
 * Start offset of each feature's column; feature_offsets_[fidx + 1] - feature_offsets_[fidx]
 * is used as the column size and feature_offsets_[n_features] as the total element count.
 */
426 RefResourceView<std::size_t> feature_offsets_;
/** Per-feature count of entries written so far through the non-dense branch of SetBinSparse. */
428 RefResourceView<std::size_t> num_nonzeros_;
/**
 * Per-feature base subtracted from a bin id before it is stored.
 * NOTE(review): raw pointer, presumably not owned by this class - confirm the lifetime.
 */
431 std::uint32_t
const* index_base_;
/** Missing-value indicator; dense entries are marked valid as they are written. */
433 MissingIndicator missing_;
/** Bin index width selected by SetTypeSize; used as the bytes-per-bin multiplier into index_. */
435 BinTypeSize bins_type_size_;
In-memory storage unit of sparse batch, stored in CSR format.
Definition data.h:328
Runtime context for XGBoost.
Definition context.h:84
std::int32_t Threads() const
Returns the automatically chosen number of threads based on the nthread parameter and the system settings.
Definition context.cc:203