5#ifndef XGBOOST_DATA_ADAPTER_H_
6#define XGBOOST_DATA_ADAPTER_H_
19#include "../c_api/c_api_error.h"
20#include "../common/error_msg.h"
21#include "../common/math.h"
27#include "xgboost/span.h"
28#include "xgboost/string_view.h"
81 : row_idx(row_idx), column_idx(column_idx), value(value) {}
94 return !(common::CheckNAN(value) || value == missing);
98 return !(common::CheckNAN(e.value) || e.value == missing);
102 return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
111template <
typename DType>
131 const float* Labels()
const {
return nullptr; }
132 const float* Weights()
const {
return nullptr; }
133 const uint64_t* Qid()
const {
return nullptr; }
134 const float* BaseMargin()
const {
return nullptr; }
143 Line(
size_t row_idx,
size_t size,
const unsigned* feature_idx,
147 feature_idx_(feature_idx),
150 size_t Size()
const {
return size_; }
151 COOTuple GetElement(
size_t idx)
const {
152 return COOTuple{row_idx_, feature_idx_[idx], values_[idx]};
158 const unsigned* feature_idx_;
159 const float* values_;
162 const float* values,
size_t num_rows,
size_t,
size_t)
164 feature_idx_(feature_idx),
166 num_rows_(num_rows) {}
167 const Line GetLine(
size_t idx)
const {
168 size_t begin_offset = row_ptr_[idx];
169 size_t end_offset = row_ptr_[idx + 1];
170 return Line(idx, end_offset - begin_offset, &feature_idx_[begin_offset],
171 &values_[begin_offset]);
173 size_t Size()
const {
return num_rows_; }
174 static constexpr bool kIsRowMajor =
true;
177 const size_t* row_ptr_;
178 const unsigned* feature_idx_;
179 const float* values_;
185 CSRAdapter(
const size_t* row_ptr,
const unsigned* feature_idx,
186 const float* values,
size_t num_rows,
size_t num_elements,
188 : batch_(row_ptr, feature_idx, values, num_rows, num_elements,
191 num_columns_(num_features) {}
193 size_t NumRows()
const {
return num_rows_; }
194 size_t NumColumns()
const {
return num_columns_; }
197 CSRAdapterBatch batch_;
207 num_features_(num_features) {}
212 Line(
const float* values,
size_t size,
size_t row_idx)
213 : row_idx_(row_idx), size_(size), values_(values) {}
215 size_t Size()
const {
return size_; }
216 COOTuple GetElement(
size_t idx)
const {
217 return COOTuple{row_idx_, idx, values_[idx]};
223 const float* values_;
227 size_t Size()
const {
return num_rows_; }
228 const Line GetLine(
size_t idx)
const {
229 return Line(values_ + idx * num_features_, num_features_, idx);
231 static constexpr bool kIsRowMajor =
true;
234 const float* values_;
236 size_t num_features_;
241 DenseAdapter(
const float* values,
size_t num_rows,
size_t num_features)
242 : batch_(values, num_rows, num_features),
244 num_columns_(num_features) {}
247 size_t NumRows()
const {
return num_rows_; }
248 size_t NumColumns()
const {
return num_columns_; }
251 DenseAdapterBatch batch_;
258 static constexpr bool kIsRowMajor =
true;
269 : array_interface_{std::move(array_interface)}, ridx_{ridx} {}
271 size_t Size()
const {
return array_interface_.Shape(1); }
273 COOTuple GetElement(
size_t idx)
const {
274 return {ridx_, idx, array_interface_(ridx_, idx)};
280 Line
const GetLine(
size_t idx)
const {
281 return Line{array_interface_, idx};
284 size_t NumRows()
const {
return array_interface_.Shape(0); }
285 size_t NumCols()
const {
return array_interface_.Shape(1); }
286 size_t Size()
const {
return this->NumRows(); }
289 : array_interface_{std::move(array_interface)} {}
305 [[nodiscard]] std::size_t NumRows()
const {
return array_interface_.Shape(0); }
306 [[nodiscard]] std::size_t NumColumns()
const {
return array_interface_.Shape(1); }
309 ArrayAdapterBatch batch_;
310 ArrayInterface<2> array_interface_;
328 : indices_{std::move(indices)}, values_{std::move(values)}, ridx_{ridx},
331 COOTuple GetElement(std::size_t idx)
const {
335 size_t Size()
const {
336 return values_.Shape(0);
341 static constexpr bool kIsRowMajor =
true;
347 : indptr_{std::move(indptr)},
348 indices_{std::move(indices)},
349 values_{std::move(values)},
350 n_features_{n_features} {
353 size_t NumRows()
const {
354 size_t size = indptr_.Shape(0);
355 size = size == 0 ? 0 : size - 1;
358 size_t NumCols()
const {
return n_features_; }
359 size_t Size()
const {
return this->NumRows(); }
361 Line
const GetLine(
size_t idx)
const {
365 auto indices = indices_;
366 auto values = values_;
369 auto offset = indices.strides[0] * begin_no_stride;
371 indices.shape[0] = end_no_stride - begin_no_stride;
372 values.shape[0] = end_no_stride - begin_no_stride;
374 return Line{indices, values, idx, offset};
387 : indptr_{indptr}, indices_{indices}, values_{values}, num_cols_{num_cols} {
395 size_t NumRows()
const {
396 size_t size = indptr_.Shape(0);
397 size = size == 0 ? 0 : size - 1;
400 size_t NumColumns()
const {
return num_cols_; }
403 CSRArrayAdapterBatch batch_;
404 ArrayInterface<1> indptr_;
405 ArrayInterface<1> indices_;
406 ArrayInterface<1> values_;
413 const float* values,
size_t num_features)
417 num_features_(num_features) {}
422 Line(
size_t col_idx,
size_t size,
const unsigned* row_idx,
424 : col_idx_(col_idx), size_(size), row_idx_(row_idx), values_(values) {}
426 size_t Size()
const {
return size_; }
427 COOTuple GetElement(
size_t idx)
const {
428 return COOTuple{row_idx_[idx], col_idx_, values_[idx]};
434 const unsigned* row_idx_;
435 const float* values_;
439 size_t Size()
const {
return num_features_; }
440 const Line GetLine(
size_t idx)
const {
441 size_t begin_offset = col_ptr_[idx];
442 size_t end_offset = col_ptr_[idx + 1];
443 return Line(idx, end_offset - begin_offset, &row_idx_[begin_offset],
444 &values_[begin_offset]);
446 static constexpr bool kIsRowMajor =
false;
449 const size_t* col_ptr_;
450 const unsigned* row_idx_;
451 const float* values_;
452 size_t num_features_;
457 CSCAdapter(
const size_t* col_ptr,
const unsigned* row_idx,
458 const float* values,
size_t num_features,
size_t num_rows)
459 : batch_(col_ptr, row_idx, values, num_features),
461 num_columns_(num_features) {}
465 size_t NumRows()
const {
468 size_t NumColumns()
const {
return num_columns_; }
471 CSCAdapterBatch batch_;
482 std::size_t column_idx_;
490 row_idx_{std::move(row_idx)},
491 values_{std::move(values)},
494 std::size_t Size()
const {
return values_.Shape(0); }
495 COOTuple GetElement(std::size_t idx)
const {
497 values_(offset_ + idx)};
502 static constexpr bool kIsRowMajor =
false;
506 : indptr_{std::move(indptr)}, indices_{std::move(indices)}, values_{std::move(values)} {}
508 std::size_t Size()
const {
return indptr_.n - 1; }
509 Line GetLine(std::size_t idx)
const {
513 auto indices = indices_;
514 auto values = values_;
517 auto offset = indices.strides[0] * begin_no_stride;
518 indices.shape[0] = end_no_stride - begin_no_stride;
519 values.shape[0] = end_no_stride - begin_no_stride;
521 return Line{idx, indices, values, offset};
545 size_t NumColumns()
const {
return indptr_.n - 1; }
550 enum class DTType : std::uint8_t {
561 static DTType DTGetType(std::string type_string) {
562 if (type_string ==
"float32") {
563 return DTType::kFloat32;
564 }
else if (type_string ==
"float64") {
565 return DTType::kFloat64;
566 }
else if (type_string ==
"bool8") {
567 return DTType::kBool8;
568 }
else if (type_string ==
"int32") {
569 return DTType::kInt32;
570 }
else if (type_string ==
"int8") {
571 return DTType::kInt8;
572 }
else if (type_string ==
"int16") {
573 return DTType::kInt16;
574 }
else if (type_string ==
"int64") {
575 return DTType::kInt64;
577 LOG(FATAL) <<
"Unknown data table type.";
578 return DTType::kUnknown;
584 std::size_t num_rows, std::size_t num_features)
585 : data_(data), num_rows_(num_rows) {
586 CHECK(feature_types_.empty());
587 std::transform(feature_stypes, feature_stypes + num_features,
588 std::back_inserter(feature_types_),
589 [](
char const* stype) {
return DTGetType(stype); });
594 std::size_t row_idx_;
595 void const*
const*
const data_;
596 std::vector<DTType>
const& feature_types_;
598 float DTGetValue(
void const* column, DTType dt_type, std::size_t ridx)
const {
599 float missing = std::numeric_limits<float>::quiet_NaN();
601 case DTType::kFloat32: {
602 float val =
reinterpret_cast<const float*
>(column)[ridx];
603 return std::isfinite(val) ? val : missing;
605 case DTType::kFloat64: {
606 double val =
reinterpret_cast<const double*
>(column)[ridx];
607 return std::isfinite(val) ?
static_cast<float>(val) : missing;
609 case DTType::kBool8: {
610 bool val =
reinterpret_cast<const bool*
>(column)[ridx];
611 return static_cast<float>(val);
613 case DTType::kInt32: {
614 int32_t val =
reinterpret_cast<const int32_t*
>(column)[ridx];
615 return val != (-2147483647 - 1) ?
static_cast<float>(val) : missing;
617 case DTType::kInt8: {
618 int8_t val =
reinterpret_cast<const int8_t*
>(column)[ridx];
619 return val != -128 ?
static_cast<float>(val) : missing;
621 case DTType::kInt16: {
622 int16_t val =
reinterpret_cast<const int16_t*
>(column)[ridx];
623 return val != -32768 ?
static_cast<float>(val) : missing;
625 case DTType::kInt64: {
626 int64_t val =
reinterpret_cast<const int64_t*
>(column)[ridx];
627 return val != -9223372036854775807 - 1 ?
static_cast<float>(val) : missing;
630 LOG(FATAL) <<
"Unknown data table type.";
637 Line(std::size_t ridx,
void const*
const*
const data, std::vector<DTType>
const& ft)
638 : row_idx_{ridx}, data_{data}, feature_types_{ft} {}
639 std::size_t Size()
const {
return feature_types_.size(); }
640 COOTuple GetElement(std::size_t idx)
const {
641 return COOTuple{row_idx_, idx, DTGetValue(data_[idx], feature_types_[idx], row_idx_)};
646 size_t Size()
const {
return num_rows_; }
647 const Line GetLine(std::size_t ridx)
const {
return {ridx, data_, feature_types_}; }
648 static constexpr bool kIsRowMajor =
true;
651 void const*
const*
const data_;
653 std::vector<DTType> feature_types_;
654 std::size_t num_rows_;
659 DataTableAdapter(
void** data,
const char** feature_stypes, std::size_t num_rows,
660 std::size_t num_features)
661 : batch_(data, feature_stypes, num_rows, num_features),
663 num_columns_(num_features) {}
665 std::size_t NumRows()
const {
return num_rows_; }
666 std::size_t NumColumns()
const {
return num_columns_; }
669 DataTableAdapterBatch batch_;
670 std::size_t num_rows_;
671 std::size_t num_columns_;
678 Line(
size_t row_idx,
const uint32_t *feature_idx,
const float *value,
681 feature_idx_(feature_idx),
685 size_t Size() {
return size_; }
687 float fvalue = value_ ==
nullptr ? 1.0f : value_[idx];
688 return COOTuple{row_idx_, feature_idx_[idx], fvalue};
693 const uint32_t* feature_idx_;
698 : block_(block), row_offset_(row_offset) {}
699 Line GetLine(
size_t idx)
const {
700 auto begin = block_->
offset[idx];
701 auto end = block_->
offset[idx + 1];
702 return Line{idx + row_offset_, &block_->
index[begin], &block_->
value[begin],
705 const float* Labels()
const {
return block_->
label; }
706 const float* Weights()
const {
return block_->
weight; }
707 const uint64_t* Qid()
const {
return block_->
qid; }
708 const float* BaseMargin()
const {
return nullptr; }
710 size_t Size()
const {
return block_->
size; }
711 static constexpr bool kIsRowMajor =
true;
731 bool next = parser_->
Next();
733 row_offset_ += parser_->
Value().size;
741 size_t row_offset_{0};
742 std::unique_ptr<FileAdapterBatch> batch_;
748template <
typename DataIterHandle,
typename XGBCallbackDataIterNext,
typename XGBoostBatchCSR>
753 data_handle_(data_handle),
754 next_callback_(next_callback) {}
758 CHECK(at_first_) <<
"Cannot reset IteratorAdapter";
762 if ((*next_callback_)(
778 return *batch_.get();
790 if (batch.
label !=
nullptr) {
791 label_.insert(label_.end(), batch.
label, batch.
label + batch.
size);
793 if (batch.
weight !=
nullptr) {
796 if (batch.
index !=
nullptr) {
797 index_.insert(index_.end(), batch.
index + offset_[0],
798 batch.
index + offset_.back());
800 if (batch.
value !=
nullptr) {
801 value_.insert(value_.end(), batch.
value + offset_[0],
802 batch.
value + offset_.back());
804 if (offset_[0] != 0) {
805 size_t base = offset_[0];
806 for (
size_t &item : offset_) {
811 <<
"Number of columns between batches changed from " << columns_
812 <<
" to " << batch.columns;
814 columns_ = batch.columns;
820 block_.
qid =
nullptr;
821 block_.
field =
nullptr;
825 batch_.reset(
new FileAdapterBatch(&block_, row_offset_));
826 row_offset_ += offset_.size() - 1;
829 size_t NumColumns()
const {
return columns_; }
833 std::vector<size_t> offset_;
834 std::vector<dmlc::real_t> label_;
835 std::vector<dmlc::real_t> weight_;
836 std::vector<uint32_t> index_;
837 std::vector<dmlc::real_t> value_;
840 size_t row_offset_{0};
842 bool at_first_{
true};
849 std::unique_ptr<FileAdapterBatch> batch_;
852enum ColumnDType : uint8_t {
870 Column(
size_t col_idx,
size_t length,
size_t null_count,
const uint8_t* bitmap)
871 : col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {}
881 bool IsValid(
size_t row_idx)
const {
882 return (!bitmap_ || (bitmap_[row_idx/8] & (1 << (row_idx%8))));
885 virtual COOTuple GetElement(
size_t row_idx)
const = 0;
887 virtual bool IsValidElement(
size_t row_idx)
const = 0;
889 virtual std::vector<float> AsFloatVector()
const = 0;
891 virtual std::vector<uint64_t> AsUint64Vector()
const = 0;
893 size_t Length()
const {
return length_; }
899 const uint8_t* bitmap_;
908 static constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();
912 const uint8_t* bitmap,
const T* data,
float missing)
913 :
Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {}
915 COOTuple GetElement(
size_t row_idx)
const override {
916 CHECK(data_ && row_idx < length_) <<
"Column is empty or out-of-bound index of the column";
917 return { row_idx, col_idx_, IsValidElement(row_idx) ?
918 static_cast<float>(data_[row_idx]) : kNaN };
921 bool IsValidElement(
size_t row_idx)
const override {
923 return IsValid(row_idx)
924 && std::isfinite(
static_cast<double>(data_[row_idx]))
925 &&
static_cast<float>(data_[row_idx]) != missing_;
928 std::vector<float> AsFloatVector()
const override {
929 CHECK(data_) <<
"Column is empty";
930 std::vector<float> fv(length_);
931 std::transform(data_, data_ + length_, fv.begin(),
932 [](T v) { return static_cast<float>(v); });
936 std::vector<uint64_t> AsUint64Vector()
const override {
937 CHECK(data_) <<
"Column is empty";
938 std::vector<uint64_t> iv(length_);
939 std::transform(data_, data_ + length_, iv.begin(),
940 [](T v) { return static_cast<uint64_t>(v); });
951 ColumnDType type{ColumnDType::kUnknown};
957 std::vector<ColumnarMetaInfo> columns;
960 static ColumnDType FormatMap(
char const* format_str) {
961 CHECK(format_str) <<
"Format string cannot be empty";
962 switch (format_str[0]) {
964 return ColumnDType::kInt8;
966 return ColumnDType::kUInt8;
968 return ColumnDType::kInt16;
970 return ColumnDType::kUInt16;
972 return ColumnDType::kInt32;
974 return ColumnDType::kUInt32;
976 return ColumnDType::kInt64;
978 return ColumnDType::kUInt64;
980 return ColumnDType::kFloat;
982 return ColumnDType::kDouble;
984 CHECK(
false) <<
"Column data type not supported by XGBoost";
985 return ColumnDType::kUnknown;
991 CHECK(std::string(schema->format) ==
"+s");
992 CHECK(columns.empty());
993 for (
auto i = 0; i < schema->n_children; ++i) {
994 std::string name{schema->children[i]->name};
995 ColumnDType type = FormatMap(schema->children[i]->format);
997 columns.push_back(col_info);
999 if (schema->release) {
1000 schema->release(schema);
1009 : rb_{rb}, schema_{schema} {
1010 CHECK(rb_) <<
"Cannot import non-existent record batch";
1011 CHECK(!schema_->columns.empty()) <<
"Cannot import record batch without a schema";
1014 size_t Import(
float missing) {
1015 auto& infov = schema_->columns;
1016 for (
size_t i = 0; i < infov.size(); ++i) {
1017 columns_.push_back(CreateColumn(i, infov[i], missing));
1021 auto batch_size = rb_->length;
1022 auto num_columns = columns_.size();
1023 row_offsets_.resize(batch_size + 1, 0);
1024 for (
auto i = 0; i < batch_size; ++i) {
1025 row_offsets_[i+1] = row_offsets_[i];
1026 for (
size_t j = 0; j < num_columns; ++j) {
1027 if (GetColumn(j).IsValidElement(i)) {
1028 row_offsets_[i+1]++;
1033 return row_offsets_.back();
1042 if (rb_ && rb_->release) {
1049 size_t Size()
const {
return rb_ ? rb_->length : 0; }
1051 size_t NumColumns()
const {
return columns_.size(); }
1053 size_t NumElements()
const {
return row_offsets_.back(); }
1055 const Column& GetColumn(
size_t col_idx)
const {
1056 return *columns_[col_idx];
1059 void ShiftRowOffsets(
size_t batch_offset) {
1060 std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(),
1061 [=](
size_t c) { return c + batch_offset; });
1064 const std::vector<size_t>& RowOffsets()
const {
return row_offsets_; }
1067 std::shared_ptr<Column> CreateColumn(
size_t idx,
1069 float missing)
const {
1074 auto loc_in_batch = info.loc;
1075 auto length = rb_->length;
1076 auto null_count = rb_->null_count;
1077 auto buffers0 = rb_->children[loc_in_batch]->buffers[0];
1078 auto buffers1 = rb_->children[loc_in_batch]->buffers[1];
1079 const uint8_t* bitmap = buffers0 ?
reinterpret_cast<const uint8_t*
>(buffers0) :
nullptr;
1080 const uint8_t* data = buffers1 ?
reinterpret_cast<const uint8_t*
>(buffers1) :
nullptr;
1083 if (null_count < 0) {
1087 null_count = length;
1088 for (
auto i = 0; i < length; ++i) {
1089 if (bitmap[i/8] & (1 << (i%8))) {
1096 switch (info.type) {
1097 case ColumnDType::kInt8:
1098 return std::make_shared<PrimitiveColumn<int8_t>>(
1099 idx, length, null_count, bitmap,
1100 reinterpret_cast<const int8_t*
>(data), missing);
1101 case ColumnDType::kUInt8:
1102 return std::make_shared<PrimitiveColumn<uint8_t>>(
1103 idx, length, null_count, bitmap, data, missing);
1104 case ColumnDType::kInt16:
1105 return std::make_shared<PrimitiveColumn<int16_t>>(
1106 idx, length, null_count, bitmap,
1107 reinterpret_cast<const int16_t*
>(data), missing);
1108 case ColumnDType::kUInt16:
1109 return std::make_shared<PrimitiveColumn<uint16_t>>(
1110 idx, length, null_count, bitmap,
1111 reinterpret_cast<const uint16_t*
>(data), missing);
1112 case ColumnDType::kInt32:
1113 return std::make_shared<PrimitiveColumn<int32_t>>(
1114 idx, length, null_count, bitmap,
1115 reinterpret_cast<const int32_t*
>(data), missing);
1116 case ColumnDType::kUInt32:
1117 return std::make_shared<PrimitiveColumn<uint32_t>>(
1118 idx, length, null_count, bitmap,
1119 reinterpret_cast<const uint32_t*
>(data), missing);
1120 case ColumnDType::kInt64:
1121 return std::make_shared<PrimitiveColumn<int64_t>>(
1122 idx, length, null_count, bitmap,
1123 reinterpret_cast<const int64_t*
>(data), missing);
1124 case ColumnDType::kUInt64:
1125 return std::make_shared<PrimitiveColumn<uint64_t>>(
1126 idx, length, null_count, bitmap,
1127 reinterpret_cast<const uint64_t*
>(data), missing);
1128 case ColumnDType::kFloat:
1129 return std::make_shared<PrimitiveColumn<float>>(
1130 idx, length, null_count, bitmap,
1131 reinterpret_cast<const float*
>(data), missing);
1132 case ColumnDType::kDouble:
1133 return std::make_shared<PrimitiveColumn<double>>(
1134 idx, length, null_count, bitmap,
1135 reinterpret_cast<const double*
>(data), missing);
1143 std::vector<std::shared_ptr<Column>> columns_;
1144 std::vector<size_t> row_offsets_;
1147using ArrowColumnarBatchVec = std::vector<std::unique_ptr<ArrowColumnarBatch>>;
1151 : next_callback_{next_callback}, nbatches_{nbatch} {}
1154 CHECK(at_first_) <<
"Cannot reset RecordBatchesIterAdapter";
1159 while (batches_.size() <
static_cast<size_t>(nbatches_) && (*next_callback_)(
this) != 0) {
1163 if (batches_.size() > 0) {
1175 if (at_first_ && schema) {
1176 schema_.Import(schema);
1178 if (schema && schema->release) {
1179 schema->release(schema);
1183 batches_.push_back(std::make_unique<ArrowColumnarBatch>(rb, &schema_));
1187 const ArrowColumnarBatchVec&
Value()
const override {
1191 size_t NumColumns()
const {
return schema_.columns.size(); }
1196 bool at_first_{
true};
1198 struct ArrowSchemaImporter schema_;
1199 ArrowColumnarBatchVec batches_;
1210 COOTuple GetElement(
size_t idx)
const {
return {ridx, inst[idx].index, inst[idx].fvalue}; }
1211 size_t Size()
const {
return n; }
1215 Line GetLine(
size_t ridx)
const {
return Line{page_[ridx].data(), page_[ridx].size(), ridx}; }
1216 size_t Size()
const {
return page_.Size(); }
Copyright 2019-2023 by XGBoost Contributors.
Data iterator interface. This is not a C++-style iterator, but it is convenient for pulling data. This interface ...
Definition data.h:56
virtual bool Next(void)=0
move to next item
virtual void BeforeFirst(void)=0
set before first of the item
virtual const DType & Value(void) const =0
get current data
parser interface that parses input data used to load dmlc data format into your own data format Diffe...
Definition data.h:293
A type erased view over array_interface protocol defined by numpy.
Definition array_interface.h:388
static Json Load(StringView str, std::ios::openmode mode=std::ios::in)
Decode the JSON object.
Definition json.cc:652
Adapter for dense array on host, in Python that's numpy.ndarray.
Definition adapter.h:297
ArrayAdapterBatch const & Value() const override
get current data
Definition adapter.h:304
Definition adapter.h:1006
const CSCAdapterBatch & Value() const override
get current data
Definition adapter.h:462
CSC adapter with support for array interface.
Definition adapter.h:528
const CSCArrayAdapterBatch & Value() const override
get current data
Definition adapter.h:546
const CSRAdapterBatch & Value() const override
get current data
Definition adapter.h:192
Adapter for CSR array on host, in Python that's scipy.sparse.csr_matrix.
Definition adapter.h:383
CSRArrayAdapterBatch const & Value() const override
get current data
Definition adapter.h:392
const DataTableAdapterBatch & Value() const override
get current data
Definition adapter.h:664
const DenseAdapterBatch & Value() const override
get current data
Definition adapter.h:245
FileAdapter wraps dmlc::parser to read files and provide access in a common interface.
Definition adapter.h:720
const FileAdapterBatch & Value() const override
get current data
Definition adapter.h:724
bool Next() override
move to next item
Definition adapter.h:730
void BeforeFirst() override
set before first of the item
Definition adapter.h:725
Data iterator that takes callback to return data, used in JVM package for accepting data iterator.
Definition adapter.h:749
bool Next() override
move to next item
Definition adapter.h:761
FileAdapterBatch const & Value() const override
get current data
Definition adapter.h:777
void BeforeFirst() override
set before first of the item
Definition adapter.h:757
Definition adapter.h:1148
const ArrowColumnarBatchVec & Value() const override
get current data
Definition adapter.h:1187
bool Next() override
move to next item
Definition adapter.h:1157
void BeforeFirst() override
set before first of the item
Definition adapter.h:1153
Definition adapter.h:1202
Simplifies the use of DataIter when there is only one batch.
Definition adapter.h:112
void BeforeFirst() override
set before first of the item
Definition adapter.h:114
bool Next() override
move to next item
Definition adapter.h:115
defines common input data structure, and interface for handling the input data
XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter)
Callback function prototype for getting next batch of data.
void * DataIterHandle
handle to an external data iterator
Definition c_api.h:334
XGB_EXTERN_C typedef int XGBCallbackDataIterNext(DataIterHandle data_handle, XGBCallbackSetData *set_function, DataHolderHandle set_function_handle)
The data reading callback function. The iterator will be able to give subset of batch in the data.
Copyright 2015-2023 by XGBoost Contributors.
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition base.h:64
Copyright 2015-2023 by XGBoost Contributors.
defines console logging options for xgboost. Use to enforce unified print behavior.
detail namespace with internal helper functions
Definition json.hpp:249
T * BeginPtr(std::vector< T > &vec)
safely get the beginning address of a vector
Definition base.h:284
constexpr size_t kAdapterUnknownSize
External data formats should implement an adapter as below.
Definition adapter.h:76
namespace of xgboost
Definition base.h:90
uint32_t bst_feature_t
Type for data column (feature) index.
Definition base.h:101
std::size_t bst_row_t
Type for data row index.
Definition base.h:110
Definition arrow-cdi.h:47
Definition arrow-cdi.h:31
Mini batch used in XGBoost Data Iteration.
Definition c_api.h:340
float * label
labels of each instance
Definition c_api.h:354
int64_t * offset
row pointer to the rows in the data
Definition c_api.h:351
int * index
feature index
Definition c_api.h:358
float * value
feature values
Definition c_api.h:360
float * weight
weight of each instance, can be NULL
Definition c_api.h:356
size_t size
number of rows in the minibatch
Definition c_api.h:342
A block of data, containing several rows in a sparse matrix. This is useful for (streaming-style) algor...
Definition data.h:175
const DType * label
array[size] label of each instance
Definition data.h:181
size_t size
batch size
Definition data.h:177
const real_t * weight
With weight: array[size] weight of each instance, otherwise nullptr.
Definition data.h:183
const IndexType * index
feature index
Definition data.h:189
const DType * value
feature value, can be NULL, indicating all values are 1
Definition data.h:191
const uint64_t * qid
With qid: array[size] session id of each instance, otherwise nullptr.
Definition data.h:185
const IndexType * field
field id
Definition data.h:187
const size_t * offset
array[size+1], row pointer to the beginning of each row
Definition data.h:179
Element from a sparse vector.
Definition data.h:216
Definition string_view.h:15
Helper for type casting.
Definition array_interface.h:665
Definition adapter.h:1206