16#include <xgboost/span.h>
17#include <xgboost/string_view.h>
41enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
43enum class DataSplitMode :
int { kRow = 0, kCol = 1 };
112 void Validate(int32_t device)
const;
156 const void** out_dptr)
const;
158 void SetFeatureInfo(
const char *key,
const char **info,
const bst_ulong size);
159 void GetFeatureInfo(
const char *field, std::vector<std::string>* out_str_vecs)
const;
172 void Extend(
MetaInfo const& that,
bool accumulate_rows,
bool check_column);
212 mutable std::vector<size_t> label_order_cache_;
233 static bool CmpIndex(
Entry const& a,
Entry const& b) {
236 inline bool operator==(
const Entry& other)
const {
237 return (this->index == other.index && this->fvalue == other.fvalue);
285 [[nodiscard]]
bool ParamNotEqual(
BatchParam const& other)
const {
296 [[nodiscard]]
bool Initialized()
const {
return max_bin != 0; }
305 p.forbid_regen =
false;
316 Inst operator[](
size_t i)
const {
317 auto size = *(offset.data() + i + 1) - *(offset.data() + i);
318 return {data.data() + *(offset.data() + i),
319 static_cast<Inst::index_type
>(size)};
322 [[nodiscard]]
size_t Size()
const {
return offset.size() == 0 ? 0 : offset.size() - 1; }
335 size_t base_rowid {0};
341 return {offset.ConstHostSpan(),
data.ConstHostSpan()};
356 [[nodiscard]]
size_t Size()
const {
357 return offset.Size() == 0 ? 0 : offset.Size() - 1;
362 return offset.Size() *
sizeof(size_t) +
data.Size() *
sizeof(
Entry);
368 auto& offset_vec = offset.HostVector();
370 offset_vec.push_back(0);
371 data.HostVector().clear();
379 [[nodiscard]]
SparsePage GetTranspose(
int num_columns, int32_t n_threads)
const;
392 void Reindex(uint64_t feature_offset, int32_t n_threads);
394 void SortRows(int32_t n_threads);
406 template <
typename AdapterBatchT>
407 uint64_t
Push(
const AdapterBatchT& batch,
float missing,
int nthread);
433 std::shared_ptr<SparsePage const> page;
434 explicit ExtSparsePage(std::shared_ptr<SparsePage const> p) : page{std::move(p)} {}
449 using iterator_category = std::forward_iterator_tag;
451 virtual const T& operator*()
const = 0;
453 [[nodiscard]]
virtual bool AtEnd()
const = 0;
454 virtual std::shared_ptr<T const> Page()
const = 0;
460 using iterator_category = std::forward_iterator_tag;
465 CHECK(impl_ !=
nullptr);
470 const T& operator*()
const {
471 CHECK(impl_ !=
nullptr);
476 CHECK(impl_ !=
nullptr);
477 return !impl_->AtEnd();
480 [[nodiscard]]
bool AtEnd()
const {
481 CHECK(impl_ !=
nullptr);
482 return impl_->AtEnd();
485 [[nodiscard]] std::shared_ptr<T const> Page()
const {
486 return impl_->Page();
490 std::shared_ptr<BatchIteratorImpl<T>> impl_;
515 virtual void SetInfo(
const char* key,
const void* dptr,
DataType dtype,
size_t num) {
516 auto const& ctx = *this->
Ctx();
519 virtual void SetInfo(
const char* key, std::string
const& interface_str) {
520 auto const& ctx = *this->
Ctx();
537 template <
typename T>
539 template <
typename T>
541 template <
typename T>
543 template <
typename T>
544 [[nodiscard]]
bool PageExists()
const;
566 static DMatrix*
Load(
const std::string& uri,
bool silent =
true,
567 DataSplitMode data_split_mode = DataSplitMode::kRow);
581 template <
typename AdapterT>
582 static DMatrix*
Create(AdapterT* adapter,
float missing,
int nthread,
583 const std::string& cache_prefix =
"",
584 DataSplitMode data_split_mode = DataSplitMode::kRow);
607 static DMatrix*
Create(DataIterHandle iter, DMatrixHandle proxy, std::shared_ptr<DMatrix> ref,
629 template <
typename DataIterHandle,
typename DMatrixHandle,
631 static DMatrix *
Create(DataIterHandle iter, DMatrixHandle proxy,
634 int32_t nthread, std::string cache);
656 [[nodiscard]]
virtual bool EllpackExists()
const = 0;
657 [[nodiscard]]
virtual bool GHistIndexExists()
const = 0;
658 [[nodiscard]]
virtual bool SparsePageExists()
const = 0;
663 return GetRowBatches();
667inline bool DMatrix::PageExists<EllpackPage>()
const {
668 return this->EllpackExists();
672inline bool DMatrix::PageExists<GHistIndexMatrix>()
const {
673 return this->GHistIndexExists();
677inline bool DMatrix::PageExists<SparsePage>()
const {
678 return this->SparsePageExists();
682inline BatchSet<SparsePage> DMatrix::GetBatches(Context
const*) {
683 return GetRowBatches();
687inline BatchSet<CSCPage> DMatrix::GetBatches(Context
const* ctx) {
688 return GetColumnBatches(ctx);
692inline BatchSet<SortedCSCPage> DMatrix::GetBatches(Context
const* ctx) {
693 return GetSortedColumnBatches(ctx);
697inline BatchSet<EllpackPage> DMatrix::GetBatches(Context
const* ctx, BatchParam
const& param) {
698 return GetEllpackBatches(ctx, param);
702inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(Context
const* ctx, BatchParam
const& param) {
703 return GetGradientIndex(ctx, param);
707inline BatchSet<ExtSparsePage> DMatrix::GetBatches(Context
const* ctx, BatchParam
const& param) {
708 return GetExtBatches(ctx, param);
717namespace serializer {
722 strm->
Write(data.index);
723 strm->
Write(data.fvalue);
727 return strm->
Read(&data->index) && strm->
Read(&data->fvalue);
interface of stream I/O for serialization
Definition io.h:30
virtual void Write(const void *ptr, size_t size)=0
writes data to a stream
virtual size_t Read(void *ptr, size_t size)=0
reads data from a stream
Internal data structure used by XGBoost during training.
Definition data.h:509
virtual const MetaInfo & Info() const =0
meta information of the dataset
virtual Context const * Ctx() const =0
Get the context object of this DMatrix.
virtual MetaInfo & Info()=0
meta information of the dataset
static DMatrix * Create(AdapterT *adapter, float missing, int nthread, const std::string &cache_prefix="", DataSplitMode data_split_mode=DataSplitMode::kRow)
Creates a new DMatrix from an external data adapter.
Definition data.cc:975
virtual ~DMatrix()
virtual destructor
Definition data.cc:822
virtual DMatrix * SliceCol(int num_slices, int slice_id)=0
Slice a DMatrix by columns.
BatchSet< T > GetBatches()
Gets batches.
virtual bool SingleColBlock() const =0
static DMatrix * Load(const std::string &uri, bool silent=true, DataSplitMode data_split_mode=DataSplitMode::kRow)
Load DMatrix from URI.
Definition data.cc:853
XGBAPIThreadLocalEntry & GetThreadLocal() const
Get thread local memory for returning data from DMatrix.
Definition data.cc:818
bool IsDense() const
Whether the matrix is dense.
Definition data.h:553
DMatrix()=default
default constructor
A page stored in ELLPACK format.
Definition ellpack_page.h:21
Sparse page for exporting DMatrix.
Definition data.h:431
preprocessed global index matrix, in CSR format.
Definition gradient_index.h:38
Definition host_device_vector.h:87
Data structure representing JSON format.
Definition json.h:357
In-memory storage unit of sparse batch, stored in CSR format.
Definition data.h:328
SparsePage()
constructor
Definition data.h:345
void SetBaseRowId(size_t row_id)
Set the base row id for this page.
Definition data.h:375
void Reindex(uint64_t feature_offset, int32_t n_threads)
Reindex the column index with an offset.
Definition data.cc:1081
uint64_t Push(const AdapterBatchT &batch, float missing, int nthread)
Pushes external data batch onto this page.
Definition data.cc:1117
void PushCSC(const SparsePage &batch)
Push a SparsePage stored in CSC format.
Definition data.cc:1216
bool IsIndicesSorted(int32_t n_threads) const
Check whether the column index is sorted.
Definition data.cc:1053
void SortIndices(int32_t n_threads)
Sort the column index.
Definition data.cc:1070
HostDeviceVector< Entry > data
the data of the segments
Definition data.h:333
size_t MemCostBytes() const
Definition data.h:361
void Clear()
clear the page
Definition data.h:366
size_t Size() const
Definition data.h:356
span class implementation, based on ISO C++20 span<T>. The interface should be the same.
Definition span.h:424
A tensor storage.
Definition linalg.h:742
defines configuration macros
defines common input data structure, and interface for handling the input data
XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter)
Callback function prototype for getting next batch of data.
XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle)
Callback function prototype for resetting external iterator.
A device-and-host vector abstraction layer.
Copyright 2015-2023 by XGBoost Contributors.
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition base.h:64
#define DECLARE_FIELD_ENUM_CLASS(EnumClass)
Specialization of FieldEntry for enum class (backed by int)
Definition parameter.h:50
Copyright 2021-2023 by XGBoost Contributors.
namespace for dmlc
Definition array_view.h:12
Definition feature_weights.py:1
namespace of xgboost
Definition base.h:90
uint32_t bst_feature_t
Type for data column (feature) index.
Definition base.h:101
uint64_t bst_ulong
unsigned long integers
Definition base.h:95
int32_t bst_bin_t
Type for histogram bin index.
Definition base.h:103
DataType
data type accepted by xgboost interface
Definition data.h:33
float bst_float
float type, used for storing statistics
Definition base.h:97
serializer template class that helps serialization. This file does not need to be directly used by most...
generic serialization handler
Definition serializer.h:259
static void Write(Stream *strm, const T &data)
write data to stream
Definition serializer.h:265
static bool Read(Stream *strm, T *data)
read data from stream
Definition serializer.h:283
Parameters for constructing histogram index batches.
Definition data.h:244
bool forbid_regen
Forbid regenerating the gradient index.
Definition data.h:261
bst_bin_t max_bin
Maximum number of bins per feature for histograms.
Definition data.h:248
common::Span< float const > hess
Hessian, used for sketching with future approx implementation.
Definition data.h:252
bool regen
Whether we should force DMatrix to regenerate the batch.
Definition data.h:257
BatchParam()=default
Exact or others that don't need histogram.
double sparse_thresh
Parameter used to generate column matrix for hist.
Definition data.h:265
BatchParam(bst_bin_t max_bin, common::Span< float const > hessian, bool regenerate)
Used by the approx tree method.
Definition data.h:282
BatchParam MakeCache() const
Make a copy of self for DMatrix to describe how its existing index was generated.
Definition data.h:300
BatchParam(bst_bin_t max_bin, double sparse_thresh)
Used by the hist tree method.
Definition data.h:274
Runtime context for XGBoost.
Definition context.h:84
Element from a sparse vector.
Definition data.h:216
XGBOOST_DEVICE Entry(bst_feature_t index, bst_float fvalue)
constructor with index and value
Definition data.h:228
Entry()=default
default constructor
bst_feature_t index
feature index
Definition data.h:218
bst_float fvalue
feature value
Definition data.h:220
static bool CmpValue(const Entry &a, const Entry &b)
reversely compare feature values
Definition data.h:230
Definition string_view.h:15
entry to easily hold returning information
Definition api_entry.h:16
#define DMLC_DECLARE_TRAITS(Trait, Type, Value)
macro to quickly declare traits information
Definition type_traits.h:126