Medial Code Documentation
Loading...
Searching...
No Matches
helpers.h
1
4#pragma once
5
6#include <gtest/gtest.h>
7#include <sys/stat.h>
8#include <sys/types.h>
9#include <xgboost/base.h>
10#include <xgboost/context.h>
11#include <xgboost/json.h>
12#include <xgboost/learner.h> // for LearnerModelParam
13#include <xgboost/model.h> // for Configurable
14
15#include <cstdint> // std::int32_t
16#include <cstdio>
17#include <fstream>
18#include <iostream>
19#include <memory>
20#include <string>
21#include <thread>
22#include <vector>
23
24#include "../../src/collective/communicator-inl.h"
25#include "../../src/common/common.h"
26#include "../../src/common/threading_utils.h"
27#include "../../src/data/array_interface.h"
28#include "filesystem.h" // dmlc::TemporaryDirectory
29#include "xgboost/linalg.h"
30
31#if defined(__CUDACC__)
32#define DeclareUnifiedTest(name) GPU ## name
33#else
34#define DeclareUnifiedTest(name) name
35#endif
36
37#if defined(__CUDACC__)
38#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
39#else
40#define GPUIDX (-1)
41#endif
42
43#if defined(__CUDACC__)
44#define DeclareUnifiedDistributedTest(name) MGPU ## name
45#else
46#define DeclareUnifiedDistributedTest(name) name
47#endif
48
49namespace xgboost {
50class ObjFunction;
51class Metric;
52struct LearnerModelParam;
53class GradientBooster;
54}
55
/**
 * \brief Relative error of `l` with respect to the reference value `r`.
 *
 * Evaluates |1 - l / r|. Only floating point types are accepted; anything else is
 * rejected at compile time.
 *
 * \param l Value under test.
 * \param r Reference value, used as the denominator. No check for zero is made here.
 *
 * \return The absolute value of `1 - l / r`.
 */
template <typename Float>
Float RelError(Float l, Float r) {
  static_assert(std::is_floating_point<Float>::value);
  auto const ratio = l / r;
  return std::abs(static_cast<Float>(1) - ratio);
}
61
62bool FileExists(const std::string& filename);
63
64void CreateSimpleTestData(const std::string& filename);
65
66// Create a libsvm format file with 3 entries per-row. `zero_based` specifies whether it's
67// 0-based indexing.
68void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true);
69
70void CreateTestCSV(std::string const& path, size_t rows, size_t cols);
71
72void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
73 std::vector<xgboost::bst_float> preds,
74 std::vector<xgboost::bst_float> labels,
75 std::vector<xgboost::bst_float> weights,
76 std::vector<xgboost::bst_float> out_grad,
77 std::vector<xgboost::bst_float> out_hess);
78
79xgboost::Json CheckConfigReloadImpl(xgboost::Configurable* const configurable,
80 std::string name);
81
82template <typename T>
83xgboost::Json CheckConfigReload(std::unique_ptr<T> const& configurable,
84 std::string name = "") {
85 return CheckConfigReloadImpl(dynamic_cast<xgboost::Configurable*>(configurable.get()),
86 name);
87}
88
89void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
90 std::vector<xgboost::bst_float> preds,
91 std::vector<xgboost::bst_float> labels,
92 std::vector<xgboost::bst_float> weights,
93 std::vector<xgboost::bst_uint> groups,
94 std::vector<xgboost::bst_float> out_grad,
95 std::vector<xgboost::bst_float> out_hess);
96
97xgboost::bst_float GetMetricEval(
98 xgboost::Metric * metric,
100 std::vector<xgboost::bst_float> labels,
101 std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
102 std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>(),
103 xgboost::DataSplitMode data_split_Mode = xgboost::DataSplitMode::kRow);
104
105double GetMultiMetricEval(xgboost::Metric* metric,
108 std::vector<xgboost::bst_float> weights = {},
109 std::vector<xgboost::bst_uint> groups = {},
110 xgboost::DataSplitMode data_split_Mode = xgboost::DataSplitMode::kRow);
111
112namespace xgboost {
113
114float GetBaseScore(Json const &config);
115
125 private:
126 using StateType = uint64_t;
127 static StateType constexpr kDefaultInit = 3;
128 static StateType constexpr kDefaultAlpha = 61;
129 static StateType constexpr kMaxValue = (static_cast<StateType>(1) << 32) - 1;
130
131 StateType state_;
132 StateType const alpha_;
133 StateType const mod_;
134
135 public:
136 using result_type = StateType; // NOLINT
137
138 public:
139 SimpleLCG() : state_{kDefaultInit}, alpha_{kDefaultAlpha}, mod_{kMaxValue} {}
140 SimpleLCG(SimpleLCG const& that) = default;
141 SimpleLCG(SimpleLCG&& that) = default;
142
143 void Seed(StateType seed) { state_ = seed % mod_; }
150 explicit SimpleLCG(StateType state)
151 : state_{state == 0 ? kDefaultInit : state}, alpha_{kDefaultAlpha}, mod_{kMaxValue} {}
152
153 StateType operator()();
154 StateType Min() const;
155 StateType Max() const;
156
157 constexpr result_type static min() { return 0; }; // NOLINT
158 constexpr result_type static max() { return kMaxValue; } // NOLINT
159};
160
161template <typename ResultT>
163 private:
164 ResultT const lower_;
165 ResultT const upper_;
166
168 template <size_t Bits, typename GeneratorT>
169 ResultT GenerateCanonical(GeneratorT* rng) const {
170 static_assert(std::is_floating_point<ResultT>::value,
171 "Result type must be floating point.");
172 long double const r = (static_cast<long double>(rng->Max())
173 - static_cast<long double>(rng->Min())) + 1.0L;
174 auto const log2r = static_cast<size_t>(std::log(r) / std::log(2.0L));
175 size_t m = std::max<size_t>(1UL, (Bits + log2r - 1UL) / log2r);
176 ResultT sum_value = 0, r_k = 1;
177
178 for (size_t k = m; k != 0; --k) {
179 sum_value += static_cast<ResultT>((*rng)() - rng->Min()) * r_k;
180 r_k *= static_cast<ResultT>(r);
181 }
182
183 ResultT res = sum_value / r_k;
184 return res;
185 }
186
187 public:
188 SimpleRealUniformDistribution(ResultT l, ResultT u) :
189 lower_{l}, upper_{u} {}
190
191 template <typename GeneratorT>
192 ResultT operator()(GeneratorT* rng) const {
193 ResultT tmp = GenerateCanonical<std::numeric_limits<ResultT>::digits,
194 GeneratorT>(rng);
195 auto ret = (tmp * (upper_ - lower_)) + lower_;
196 // Correct floating point error.
197 return std::max(ret, lower_);
198 }
199};
200
201template <typename T>
202Json GetArrayInterface(HostDeviceVector<T> const* storage, size_t rows, size_t cols) {
203 Json array_interface{Object()};
204 array_interface["data"] = std::vector<Json>(2);
205 if (storage->DeviceCanRead()) {
206 array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(storage->ConstDevicePointer())};
207 array_interface["stream"] = nullptr;
208 } else {
209 array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(storage->ConstHostPointer())};
210 }
211 array_interface["data"][1] = Boolean(false);
212
213 array_interface["shape"] = std::vector<Json>(2);
214 array_interface["shape"][0] = rows;
215 array_interface["shape"][1] = cols;
216
217 char t = linalg::detail::ArrayInterfaceHandler::TypeChar<T>();
218 array_interface["typestr"] = String(std::string{"<"} + t + std::to_string(sizeof(T)));
219 array_interface["version"] = 3;
220 return array_interface;
221}
222
223// Generate in-memory random data without using DMatrix.
225 bst_row_t rows_;
226 size_t cols_;
227 float sparsity_;
228
229 float lower_{0.0f};
230 float upper_{1.0f};
231
232 bst_target_t n_targets_{1};
233
234 std::int32_t device_{Context::kCpuId};
235 std::size_t n_batches_{0};
236 std::uint64_t seed_{0};
237 SimpleLCG lcg_;
238
239 bst_bin_t bins_{0};
240 std::vector<FeatureType> ft_;
241 bst_cat_t max_cat_;
242
243 Json ArrayInterfaceImpl(HostDeviceVector<float>* storage, size_t rows, size_t cols) const;
244
245 void GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const;
246
247 public:
248 RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
249 : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lcg_{seed_} {}
250
251 RandomDataGenerator& Lower(float v) {
252 lower_ = v;
253 return *this;
254 }
255 RandomDataGenerator& Upper(float v) {
256 upper_ = v;
257 return *this;
258 }
259 RandomDataGenerator& Device(int32_t d) {
260 device_ = d;
261 return *this;
262 }
263 RandomDataGenerator& Batches(std::size_t n_batches) {
264 n_batches_ = n_batches;
265 return *this;
266 }
267 RandomDataGenerator& Seed(uint64_t s) {
268 seed_ = s;
269 lcg_.Seed(seed_);
270 return *this;
271 }
273 bins_ = b;
274 return *this;
275 }
277 CHECK_EQ(ft.size(), cols_);
278 ft_.resize(ft.size());
279 std::copy(ft.cbegin(), ft.cend(), ft_.begin());
280 return *this;
281 }
282 RandomDataGenerator& MaxCategory(bst_cat_t cat) {
283 max_cat_ = cat;
284 return *this;
285 }
286 RandomDataGenerator& Targets(bst_target_t n_targets) {
287 n_targets_ = n_targets;
288 return *this;
289 }
290
291 void GenerateDense(HostDeviceVector<float>* out) const;
292
293 std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;
294
305 std::pair<std::vector<std::string>, std::string> GenerateArrayInterfaceBatch(
306 HostDeviceVector<float>* storage, size_t batches) const;
307
308 std::string GenerateColumnarArrayInterface(std::vector<HostDeviceVector<float>>* data) const;
309
310 void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
311 HostDeviceVector<bst_feature_t>* columns) const;
312
313 [[nodiscard]] std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false,
314 bool float_label = true,
315 size_t classes = 1) const;
316
317 [[nodiscard]] std::shared_ptr<DMatrix> GenerateSparsePageDMatrix(std::string prefix,
318 bool with_label) const;
319
320#if defined(XGBOOST_USE_CUDA)
321 std::shared_ptr<DMatrix> GenerateDeviceDMatrix(bool with_label);
322#endif
323 std::shared_ptr<DMatrix> GenerateQuantileDMatrix(bool with_label);
324};
325
326// Generate an empty DMatrix, mostly for its meta info.
327inline std::shared_ptr<DMatrix> EmptyDMatrix() {
328 return RandomDataGenerator{0, 0, 0.0}.GenerateDMatrix();
329}
330
/**
 * \brief Generate one column of random categorical values.
 *
 * Every element is drawn uniformly from the integers [0, num_categories) with a
 * generator seeded with 0, so repeated calls with the same arguments produce the same
 * column. Afterwards the leading elements are overwritten with 0, 1, 2, ... so that
 * each category appears at least once, as far as the column has room for it.
 *
 * \param n              Number of values to generate.
 * \param num_categories Number of distinct categories. Must be at least 1.
 *
 * \return A vector of `n` category codes stored as float.
 */
inline std::vector<float> GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
  std::vector<float> x(n);
  std::mt19937 rng(0);
  std::uniform_int_distribution<size_t> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(), [&]() { return static_cast<float>(dist(rng)); });
  // Make sure each category is present. The count is capped by the column length so the
  // writes stay inside `x` when there are more categories than rows.
  auto const n_forced = std::min(x.size(), num_categories);
  for (size_t i = 0; i < n_forced; i++) {
    x[i] = static_cast<decltype(x)::value_type>(i);
  }
  return x;
}
342
343std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float>& x, std::size_t num_rows,
344 bst_feature_t num_columns);
345
356std::unique_ptr<DMatrix> CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features,
357 size_t n_batches, std::string prefix = "cache");
358
362std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");
363
381std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
382 size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
384
385std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs, size_t kRows,
386 size_t kCols,
387 LearnerModelParam const* learner_model_param,
388 Context const* generic_param);
389
390inline std::unique_ptr<HostDeviceVector<GradientPair>> GenerateGradients(
391 std::size_t rows, bst_target_t n_targets = 1) {
392 auto p_gradients = std::make_unique<HostDeviceVector<GradientPair>>(rows * n_targets);
393 auto& h_gradients = p_gradients->HostVector();
394
397
398 for (std::size_t i = 0; i < rows * n_targets; ++i) {
399 auto grad = dist(&gen);
400 auto hess = dist(&gen);
401 h_gradients[i] = GradientPair{grad, hess};
402 }
403
404 return p_gradients;
405}
406
410inline Context MakeCUDACtx(std::int32_t device) {
411 if (device == Context::kCpuId) {
412 return Context{};
413 }
414 return Context{}.MakeCUDA(device);
415}
416
417inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows,
418 float lower= 0.0f, float upper = 1.0f) {
421 std::vector<GradientPair> h_gpair(n_rows);
422 for (auto &gpair : h_gpair) {
423 bst_float grad = dist(&gen);
424 bst_float hess = dist(&gen);
425 gpair = GradientPair(grad, hess);
426 }
427 HostDeviceVector<GradientPair> gpair(h_gpair);
428 return gpair;
429}
430
431typedef void *DMatrixHandle; // NOLINT(*);
432
434 protected:
436 size_t iter_{0};
437 DMatrixHandle proxy_;
438 std::unique_ptr<RandomDataGenerator> rng_;
439
440 std::vector<std::string> batches_;
441 std::string interface_;
442 size_t rows_;
443 size_t cols_;
444 size_t n_batches_;
445
446 public:
447 size_t static constexpr Rows() { return 1024; }
448 size_t static constexpr Batches() { return 100; }
449 size_t static constexpr Cols() { return 13; }
450
451 public:
452 [[nodiscard]] std::string AsArray() const { return interface_; }
453
454 virtual int Next() = 0;
455 virtual void Reset() { iter_ = 0; }
456 [[nodiscard]] std::size_t Iter() const { return iter_; }
457 auto Proxy() -> decltype(proxy_) { return proxy_; }
458
459 explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches);
463 explicit ArrayIterForTest(Context const* ctx, HostDeviceVector<float> const& data,
464 std::size_t n_samples, bst_feature_t n_features, std::size_t n_batches);
465 virtual ~ArrayIterForTest();
466};
467
469 public:
470 explicit CudaArrayIterForTest(float sparsity, size_t rows = Rows(), size_t cols = Cols(),
471 size_t batches = Batches());
472 int Next() override;
473 ~CudaArrayIterForTest() override = default;
474};
475
477 public:
478 explicit NumpyArrayIterForTest(float sparsity, size_t rows = Rows(), size_t cols = Cols(),
479 size_t batches = Batches());
480 explicit NumpyArrayIterForTest(Context const* ctx, HostDeviceVector<float> const& data,
481 std::size_t n_samples, bst_feature_t n_features,
482 std::size_t n_batches)
483 : ArrayIterForTest{ctx, data, n_samples, n_features, n_batches} {}
484 int Next() override;
485 ~NumpyArrayIterForTest() override = default;
486};
487
488void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
489 std::vector<size_t> *p_row_ptr,
490 std::vector<bst_feature_t> *p_cids);
491
492typedef void *DataIterHandle; // NOLINT(*)
493
494inline void Reset(DataIterHandle self) {
495 static_cast<ArrayIterForTest*>(self)->Reset();
496}
497
498inline int Next(DataIterHandle self) {
499 return static_cast<ArrayIterForTest*>(self)->Next();
500}
501
502class RMMAllocator;
503using RMMAllocatorPtr = std::unique_ptr<RMMAllocator, void(*)(RMMAllocator*)>;
504RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
505
506/*
507 * \brief Make learner model param
508 */
509inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups,
510 int32_t device = Context::kCpuId) {
511 size_t shape[1]{1};
512 LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
513 n_groups, 1, MultiStrategy::kOneOutputPerTree);
514 return mparam;
515}
516
517inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
518
519template <bool use_nccl = false, typename Function, typename... Args>
520void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&... args) {
521 auto run = [&](auto rank) {
522 Json config{JsonObject()};
523 if constexpr (use_nccl) {
524 config["xgboost_communicator"] = String("in-memory-nccl");
525 } else {
526 config["xgboost_communicator"] = String("in-memory");
527 }
528 config["in_memory_world_size"] = world_size;
529 config["in_memory_rank"] = rank;
531
532 std::forward<Function>(function)(std::forward<Args>(args)...);
533
535 };
536#if defined(_OPENMP)
537 common::ParallelFor(world_size, world_size, run);
538#else
539 std::vector<std::thread> threads;
540 for (auto rank = 0; rank < world_size; rank++) {
541 threads.emplace_back(run, rank);
542 }
543 for (auto& thread : threads) {
544 thread.join();
545 }
546#endif
547}
548
549class BaseMGPUTest : public ::testing::Test {
550 protected:
551 int world_size_;
552 bool use_nccl_{false};
553
554 void SetUp() override {
555 auto const n_gpus = common::AllVisibleGPUs();
556 if (n_gpus <= 1) {
557 // Use a single GPU to simulate distributed environment.
558 world_size_ = 3;
559 // NCCL doesn't like sharing a single GPU, so we use the adapter instead.
560 use_nccl_ = false;
561 } else {
562 // Use multiple GPUs for real.
563 world_size_ = n_gpus;
564 use_nccl_ = true;
565 }
566 }
567
568 template <typename Function, typename... Args>
569 void DoTest(Function&& function, Args&&... args) {
570 if (use_nccl_) {
571 RunWithInMemoryCommunicator<true>(world_size_, function, args...);
572 } else {
573 RunWithInMemoryCommunicator<false>(world_size_, function, args...);
574 }
575 }
576};
577
578class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};
579
580} // namespace xgboost
Manager class for temporary directories. Whenever a new TemporaryDirectory object is constructed,...
Definition filesystem.h:54
Definition helpers.h:433
Definition helpers.h:549
Definition helpers.h:468
Internal data structure used by XGBoost during training.
Definition data.h:509
Definition host_device_vector.h:87
Definition json.h:190
Data structure representing JSON format.
Definition json.h:357
interface of evaluation metric used to evaluate model performance. This has nothing to do with traini...
Definition metric.h:29
Definition helpers.h:476
Definition helpers.h:224
std::pair< std::vector< std::string >, std::string > GenerateArrayInterfaceBatch(HostDeviceVector< float > *storage, size_t batches) const
Generate batches of array interface stored in consecutive memory.
Definition helpers.cc:310
Linear congruential generator.
Definition helpers.h:124
SimpleLCG(StateType state)
Initialize SimpleLCG.
Definition helpers.h:150
span class implementation, based on ISO C++20 span<T>. The interface should be the same.
Definition span.h:424
A tensor storage.
Definition linalg.h:742
Copyright 2014-2023, XGBoost Contributors.
void * DMatrixHandle
handle to DMatrix
Definition c_api.h:49
void * DataIterHandle
handle to an external data iterator
Definition c_api.h:334
Copyright 2015-2023 by XGBoost Contributors.
Copyright 2015-2023 by XGBoost Contributors.
Copyright 2021-2023 by XGBoost Contributors.
Defines the abstract interface for different components in XGBoost.
void Init(Json const &config)
Initialize the collective communicator.
Definition communicator-inl.h:60
void Finalize()
Finalize the collective communicator.
Definition communicator-inl.h:69
namespace of xgboost
Definition base.h:90
Context MakeCUDACtx(std::int32_t device)
Make a context that uses CUDA if device >= 0.
Definition helpers.h:410
uint32_t bst_feature_t
Type for data column (feature) index.
Definition base.h:101
std::uint32_t bst_target_t
Type for indexing into output targets.
Definition base.h:118
std::size_t bst_row_t
Type for data row index.
Definition base.h:110
int32_t bst_cat_t
Categorical value type.
Definition base.h:99
detail::GradientPairInternal< float > GradientPair
gradient statistics pair usually needed in gradient boosting
Definition base.h:256
std::unique_ptr< DMatrix > CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features, size_t n_batches, std::string prefix)
Create Sparse Page using data iterator.
Definition helpers.cc:514
int32_t bst_bin_t
Type for histogram bin index.
Definition base.h:103
float bst_float
float type, used for storing statistics
Definition base.h:97
std::unique_ptr< DMatrix > CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols, size_t page_size, bool deterministic, const dmlc::TemporaryDirectory &tempdir)
Deprecated, stop using it.
Definition helpers.cc:567
Definition model.h:31
Runtime context for XGBoost.
Definition context.h:84
Context MakeCUDA(bst_d_ordinal_t ordinal=0) const
Make a CUDA context based on the current context.
Definition context.h:160