Medial Code Documentation
Loading...
Searching...
No Matches
random.h
Go to the documentation of this file.
1
7#ifndef XGBOOST_COMMON_RANDOM_H_
8#define XGBOOST_COMMON_RANDOM_H_
9
10#include <xgboost/logging.h>
11
12#include <algorithm>
13#include <functional>
14#include <limits>
15#include <map>
16#include <memory>
17#include <numeric>
18#include <random>
19#include <utility>
20#include <vector>
21
22#include "../collective/communicator-inl.h"
23#include "algorithm.h" // ArgSort
24#include "common.h"
25#include "xgboost/context.h" // Context
27
28namespace xgboost {
29namespace common {
33using RandomEngine = std::mt19937;
34
35#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
42class CustomGlobalRandomEngine {
43 public:
45 using result_type = uint32_t;
47 inline static constexpr result_type min() {
48 return 0;
49 }
51 inline static constexpr result_type max() {
52 return std::numeric_limits<result_type>::max();
53 }
58 void seed(result_type val);
62 result_type operator()();
63};
64
68typedef CustomGlobalRandomEngine GlobalRandomEngine;
69
70#else
75#endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
76
82GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
83
84/*
85 * Original paper:
86 * Weighted Random Sampling (2005; Efraimidis, Spirakis)
87 *
88 * Blog:
89 * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
90*/
91template <typename T>
92std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
93 std::vector<float> const& weights, size_t n) {
94 // ES sampling.
95 CHECK_EQ(array.size(), weights.size());
96 std::vector<float> keys(weights.size());
97 std::uniform_real_distribution<float> dist;
98 auto& rng = GlobalRandom();
99 for (size_t i = 0; i < array.size(); ++i) {
100 auto w = std::max(weights.at(i), kRtEps);
101 auto u = dist(rng);
102 auto k = std::log(u) / w;
103 keys[i] = k;
104 }
105 auto ind = ArgSort<std::size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
106 ind.resize(n);
107
108 std::vector<T> results(ind.size());
109 for (size_t k = 0; k < ind.size(); ++k) {
110 auto idx = ind[k];
111 results[k] = array[idx];
112 }
113 return results;
114}
115
124 std::shared_ptr<HostDeviceVector<bst_feature_t>> feature_set_tree_;
125 std::map<int, std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_set_level_;
126 std::vector<float> feature_weights_;
127 float colsample_bylevel_{1.0f};
128 float colsample_bytree_{1.0f};
129 float colsample_bynode_{1.0f};
131 Context const* ctx_;
132
133 public:
134 std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
135 std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features, float colsample);
140 explicit ColumnSampler(uint32_t seed) {
141 rng_.seed(seed);
142 }
143
149 uint32_t seed = common::GlobalRandom()();
150 collective::Broadcast(&seed, sizeof(seed), 0);
151 rng_.seed(seed);
152 }
153
163 void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
164 float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
165 feature_weights_ = std::move(feature_weights);
166 colsample_bylevel_ = colsample_bylevel;
167 colsample_bytree_ = colsample_bytree;
168 colsample_bynode_ = colsample_bynode;
169 ctx_ = ctx;
170
171 if (feature_set_tree_ == nullptr) {
172 feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();
173 }
174 Reset();
175
176 feature_set_tree_->Resize(num_col);
177 std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0);
178
179 feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_);
180 }
181
185 void Reset() {
186 feature_set_tree_->Resize(0);
187 feature_set_level_.clear();
188 }
189
201 std::shared_ptr<HostDeviceVector<bst_feature_t>> GetFeatureSet(int depth) {
202 if (colsample_bylevel_ == 1.0f && colsample_bynode_ == 1.0f) {
203 return feature_set_tree_;
204 }
205
206 if (feature_set_level_.count(depth) == 0) {
207 // Level sampling, level does not yet exist so generate it
208 feature_set_level_[depth] = ColSample(feature_set_tree_, colsample_bylevel_);
209 }
210 if (colsample_bynode_ == 1.0f) {
211 // Level sampling
212 return feature_set_level_[depth];
213 }
214 // Need to sample for the node individually
215 return ColSample(feature_set_level_[depth], colsample_bynode_);
216 }
217};
218
219} // namespace common
220} // namespace xgboost
221#endif // XGBOOST_COMMON_RANDOM_H_
Definition host_device_vector.h:87
Handles selection of columns due to colsample_bytree, colsample_bylevel and colsample_bynode paramete...
Definition random.h:123
ColumnSampler()
Column sampler constructor.
Definition random.h:148
std::shared_ptr< HostDeviceVector< bst_feature_t > > GetFeatureSet(int depth)
Samples a feature set.
Definition random.h:201
ColumnSampler(uint32_t seed)
Column sampler constructor.
Definition random.h:140
void Init(Context const *ctx, int64_t num_col, std::vector< float > feature_weights, float colsample_bynode, float colsample_bylevel, float colsample_bytree)
Initialise this object before use.
Definition random.h:163
void Reset()
Resets this object.
Definition random.h:185
Copyright 2014-2023, XGBoost Contributors.
A device-and-host vector abstraction layer.
defines console logging options for xgboost. Use to enforce unified print behavior.
Definition feature_weights.py:1
void Broadcast(void *send_receive_buffer, size_t size, int root)
Broadcast a memory region to all others from root. This function is NOT thread-safe.
Definition communicator-inl.h:129
std::mt19937 RandomEngine
Define mt19937 as default type Random Engine.
Definition random.h:33
RandomEngine GlobalRandomEngine
global random engine
Definition random.h:74
GlobalRandomEngine & GlobalRandom()
global singleton of a random engine. This random engine is thread-local and only visible to current t...
Definition common.cc:23
namespace of xgboost
Definition base.h:90
constexpr bst_float kRtEps
small eps gap for minimum split decision.
Definition base.h:319
Runtime context for XGBoost.
Definition context.h:84
Copyright 2015-2023 by XGBoost Contributors.