ranking_utils.h
/**
 * Copyright 2023 by XGBoost Contributors
 */
#ifndef XGBOOST_COMMON_RANKING_UTILS_H_
#define XGBOOST_COMMON_RANKING_UTILS_H_
#include <algorithm>  // for min
#include <cmath>      // for log2, fabs, floor
#include <cstddef>    // for size_t
#include <cstdint>    // for uint32_t, uint8_t, int32_t
#include <limits>     // for numeric_limits
#include <string>     // for char_traits, string
#include <vector>     // for vector

#include "dmlc/parameter.h"              // for FieldEntry, DMLC_DECLARE_FIELD
#include "error_msg.h"                   // for GroupWeight, GroupSize, InvalidCUDAOrdinal
#include "xgboost/base.h"                // for XGBOOST_DEVICE, bst_group_t
#include "xgboost/context.h"             // for Context
#include "xgboost/data.h"                // for MetaInfo
#include "xgboost/host_device_vector.h"  // for HostDeviceVector
#include "xgboost/linalg.h"              // for Vector, VectorView, Tensor
#include "xgboost/logging.h"             // for CHECK_EQ, CHECK
#include "xgboost/parameter.h"           // for XGBoostParameter
#include "xgboost/span.h"                // for Span
#include "xgboost/string_view.h"         // for StringView
namespace xgboost::ltr {
/**
 * \brief Relevance degree.
 */
using rel_degree_t = std::uint32_t;  // NOLINT
/**
 * \brief Top-k position.
 */
using position_t = std::uint32_t;  // NOLINT

/**
 * \brief Maximum relevance degree for NDCG. The exponential gain is computed as
 *        2^rel - 1, which must fit into rel_degree_t, hence 31 for a 32-bit type.
 */
constexpr std::size_t MaxRel() { return sizeof(rel_degree_t) * 8 - 1; }
static_assert(MaxRel() == 31);

XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
  return static_cast<double>((1u << label) - 1);
}

XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
  return 1.0 / std::log2(static_cast<double>(idx) + 2.0);
}

XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
  auto inv_idcg = (idcg == 0.0 ? 0.0 : (1.0 / idcg));  // handle irrelevant document
  return inv_idcg;
}
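
// A worked example of how these helpers combine (illustrative numbers, not from the
// source): for a ranked list with relevance labels {3, 2, 0} and the exponential gain,
//
//   DCG = CalcDCGGain(3) * CalcDCGDiscount(0)   // 7 / log2(2) = 7.0
//       + CalcDCGGain(2) * CalcDCGDiscount(1)   // 3 / log2(3) ~ 1.893
//       + CalcDCGGain(0) * CalcDCGDiscount(2)   // 0 / log2(4) = 0.0
//                                               // ~ 8.893 in total
//
// and NDCG is obtained by multiplying with CalcInvIDCG of the label-sorted list's DCG.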

enum class PairMethod : std::int32_t {
  kTopK = 0,
  kMean = 1,
};
}  // namespace xgboost::ltr

DECLARE_FIELD_ENUM_CLASS(xgboost::ltr::PairMethod);

namespace xgboost::ltr {
struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
 private:
  static constexpr position_t DefaultK() { return 32; }
  static constexpr position_t DefaultSamplePairs() { return 1; }

 protected:
  // pairs
  // should be accessed by getter for auto configuration.
  // nolint so that we can keep the string name.
  PairMethod lambdarank_pair_method{PairMethod::kTopK};  // NOLINT
  std::size_t lambdarank_num_pair_per_sample{NotSet()};  // NOLINT

 public:
  static constexpr position_t NotSet() { return std::numeric_limits<position_t>::max(); }

  // unbiased
  bool lambdarank_unbiased{false};
  double lambdarank_bias_norm{1.0};
  // ndcg
  bool ndcg_exp_gain{true};

  bool operator==(LambdaRankParam const& that) const {
    return lambdarank_pair_method == that.lambdarank_pair_method &&
           lambdarank_num_pair_per_sample == that.lambdarank_num_pair_per_sample &&
           lambdarank_unbiased == that.lambdarank_unbiased &&
           lambdarank_bias_norm == that.lambdarank_bias_norm && ndcg_exp_gain == that.ndcg_exp_gain;
  }
  bool operator!=(LambdaRankParam const& that) const { return !(*this == that); }

  [[nodiscard]] double Regularizer() const { return 1.0 / (1.0 + this->lambdarank_bias_norm); }

  /**
   * \brief Get number of pairs for each sample.
   */
  [[nodiscard]] position_t NumPair() const {
    if (lambdarank_num_pair_per_sample == NotSet()) {
      switch (lambdarank_pair_method) {
        case PairMethod::kMean:
          return DefaultSamplePairs();
        case PairMethod::kTopK:
          return DefaultK();
      }
    } else {
      return lambdarank_num_pair_per_sample;
    }
    LOG(FATAL) << "Unreachable.";
    return 0;
  }
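
  // Illustrative behavior (not part of the source):
  //
  //   LambdaRankParam p;  // lambdarank_pair_method defaults to kTopK
  //   p.NumPair();        // DefaultK() == 32
  //   // with lambdarank_pair_method == kMean it would be DefaultSamplePairs() == 1;
  //   // an explicitly set lambdarank_num_pair_per_sample always takes precedence.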

  [[nodiscard]] bool HasTruncation() const { return lambdarank_pair_method == PairMethod::kTopK; }

  // Used for evaluation metric and cache initialization, iterate through top-k or the whole list
  [[nodiscard]] auto TopK() const {
    if (HasTruncation()) {
      return NumPair();
    } else {
      return NotSet();
    }
  }

  DMLC_DECLARE_PARAMETER(LambdaRankParam) {
    DMLC_DECLARE_FIELD(lambdarank_pair_method)
        .set_default(PairMethod::kTopK)
        .add_enum("mean", PairMethod::kMean)
        .add_enum("topk", PairMethod::kTopK)
        .describe("Method for constructing pairs.");
    DMLC_DECLARE_FIELD(lambdarank_num_pair_per_sample)
        .set_default(NotSet())
        .set_lower_bound(1)
        .describe("Number of pairs for each sample in the list.");
    DMLC_DECLARE_FIELD(lambdarank_unbiased)
        .set_default(false)
        .describe("Unbiased lambda mart. Use extended IPW to debias click position.");
    DMLC_DECLARE_FIELD(lambdarank_bias_norm)
        .set_default(1.0)
        .set_lower_bound(0.0)
        .describe("Lp regularization for unbiased lambdarank.");
    DMLC_DECLARE_FIELD(ndcg_exp_gain)
        .set_default(true)
        .describe("When set to true, the label gain is 2^rel - 1, otherwise it's rel.");
  }
};
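
// A minimal configuration sketch (assuming the usual dmlc parameter flow; `Args` is the
// vector of string pairs that XGBoostParameter accepts):
//
//   LambdaRankParam param;
//   param.UpdateAllowUnknown(Args{{"lambdarank_pair_method", "mean"},
//                                 {"lambdarank_num_pair_per_sample", "4"}});
//   CHECK_EQ(param.NumPair(), 4);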

/**
 * \brief Common cached items for ranking tasks.
 */
class RankingCache {
 private:
  void InitOnCPU(Context const* ctx, MetaInfo const& info);
  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
  // Cached parameter
  LambdaRankParam param_;
  // offset to data groups.
  HostDeviceVector<bst_group_t> group_ptr_;
  // store the sorted index of prediction.
  HostDeviceVector<std::size_t> sorted_idx_cache_;
  // Maximum size of group
  std::size_t max_group_size_{0};
  // Normalization for weight
  double weight_norm_{1.0};
  /**
   * CUDA cache
   */
  // offset to threads assigned to each group for gradient calculation
  HostDeviceVector<std::size_t> threads_group_ptr_;
  // Sorted index of label for finding buckets.
  HostDeviceVector<std::size_t> y_sorted_idx_cache_;
  // Cached labels sorted by the model
  HostDeviceVector<float> y_ranked_by_model_;
  // store rounding factor for objective for each group
  linalg::Vector<GradientPair> roundings_;
  // rounding factor for cost
  HostDeviceVector<double> cost_rounding_;
  // temporary storage for creating rounding factors. Stored as bytes to avoid having
  // CUDA data structures in here.
  HostDeviceVector<std::uint8_t> max_lambdas_;
  // total number of CUDA threads used for gradient calculation
  std::size_t n_cuda_threads_{0};

  // Create model rank list on GPU.
  common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
                                                 common::Span<float const> predt);
  // Create model rank list on CPU.
  common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
                                                common::Span<float const> predt);

 protected:
  [[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }

 public:
  RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
    CHECK(param_.GetInitialised());
    if (!info.group_ptr_.empty()) {
      CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
          << error::GroupSize() << "the size of label.";
    }
    if (ctx->IsCPU()) {
      this->InitOnCPU(ctx, info);
    } else {
      this->InitOnCUDA(ctx, info);
    }
    if (!info.weights_.Empty()) {
      CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
    }
  }
  [[nodiscard]] std::size_t MaxPositionSize() const {
    // Use truncation level as the bound.
    if (param_.HasTruncation()) {
      return param_.NumPair();
    }
    // Hardcoded maximum number of positions to track; we don't need too many of them as
    // the bias decreases exponentially.
    return std::min(max_group_size_, static_cast<std::size_t>(32));
  }
  // Constructed as [1, n_samples] if group ptr is not supplied by the user.
  common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
    group_ptr_.SetDevice(ctx->gpu_id);
    return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
  }

  [[nodiscard]] auto const& Param() const { return param_; }
  [[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
  [[nodiscard]] double WeightNorm() const { return weight_norm_; }

  // Create a rank list by model prediction.
  common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
    if (sorted_idx_cache_.Empty()) {
      sorted_idx_cache_.SetDevice(ctx->gpu_id);
      sorted_idx_cache_.Resize(predt.size());
    }
    if (ctx->IsCPU()) {
      return this->MakeRankOnCPU(ctx, predt);
    } else {
      return this->MakeRankOnCUDA(ctx, predt);
    }
  }
  // The function simply returns an uninitialized buffer, as this is only used by the
  // objective for creating pairs.
  common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
    CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
    if (y_sorted_idx_cache_.Empty()) {
      y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
      y_sorted_idx_cache_.Resize(n_samples);
    }
    return y_sorted_idx_cache_.DeviceSpan();
  }
  common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
    CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
    if (y_ranked_by_model_.Empty()) {
      y_ranked_by_model_.SetDevice(ctx->gpu_id);
      y_ranked_by_model_.Resize(n_samples);
    }
    return y_ranked_by_model_.DeviceSpan();
  }

  // CUDA cache getters. The cache is shared between metric and objective; some of these
  // fields are lazily initialized to avoid unnecessary allocation.
  [[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
    CHECK(!threads_group_ptr_.Empty());
    return threads_group_ptr_.ConstDeviceSpan();
  }
  [[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }

  linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
    if (roundings_.Size() == 0) {
      roundings_.SetDevice(ctx->gpu_id);
      roundings_.Reshape(Groups());
    }
    return roundings_.View(ctx->gpu_id);
  }
  common::Span<double> CUDACostRounding(Context const* ctx) {
    if (cost_rounding_.Size() == 0) {
      cost_rounding_.SetDevice(ctx->gpu_id);
      cost_rounding_.Resize(1);
    }
    return cost_rounding_.DeviceSpan();
  }
  template <typename Type>
  common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
    max_lambdas_.SetDevice(ctx->gpu_id);
    std::size_t bytes = n * sizeof(Type);
    if (bytes != max_lambdas_.Size()) {
      max_lambdas_.Resize(bytes);
    }
    return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
  }
};
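
// A rough usage sketch (illustrative only; `ctx`, `info`, and `predt` are assumed to be
// a valid Context, a MetaInfo with ranking labels, and a span of model predictions):
//
//   ltr::LambdaRankParam param;
//   param.UpdateAllowUnknown(Args{});
//   ltr::RankingCache cache{ctx, info, param};
//   auto ranks = cache.SortedIdx(ctx, predt);  // per-group argsort of predictions
//   auto groups = cache.DataGroupPtr(ctx);     // CSR-style group boundaries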

class NDCGCache : public RankingCache {
  // NDCG discount
  HostDeviceVector<double> discounts_;
  // 1.0 / IDCG
  linalg::Vector<double> inv_idcg_;
  /**
   * CUDA cache
   */
  // store the intermediate DCG calculation result for metric
  linalg::Vector<double> dcg_;

 public:
  void InitOnCPU(Context const* ctx, MetaInfo const& info);
  void InitOnCUDA(Context const* ctx, MetaInfo const& info);

  NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
      : RankingCache{ctx, info, p} {
    if (ctx->IsCPU()) {
      this->InitOnCPU(ctx, info);
    } else {
      this->InitOnCUDA(ctx, info);
    }
  }

  linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
    return inv_idcg_.View(ctx->gpu_id);
  }
  common::Span<double const> Discount(Context const* ctx) const {
    return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
  }
  linalg::VectorView<double> Dcg(Context const* ctx) {
    if (dcg_.Size() == 0) {
      dcg_.SetDevice(ctx->gpu_id);
      dcg_.Reshape(this->Groups());
    }
    return dcg_.View(ctx->gpu_id);
  }
};
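
// For reference, the cached quantities combine as the standard NDCG definition:
//
//   DCG@k  = sum_{i=0}^{k-1} CalcDCGGain(rel_i) * CalcDCGDiscount(i)
//   NDCG@k = DCG@k * CalcInvIDCG(IDCG@k)
//
// where IDCG@k is the DCG of the list sorted by label, so NDCG is 1 for a perfect
// ranking and 0 when the group contains no relevant document.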

/**
 * \brief Validate label for NDCG.
 *
 * \tparam NoneOf An implementation of std::none_of, passed in as a parameter so the same
 *                check can run on both CPU and GPU.
 */
template <typename NoneOf>
void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
                     NoneOf none_of) {
  auto d_labels = labels.Values();
  if (p.ndcg_exp_gain) {
    auto label_is_integer =
        none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
          auto l = std::floor(v);
          return std::fabs(l - v) > kRtEps || v < 0.0f;
        });
    CHECK(label_is_integer)
        << "When using relevance degree as target, the label must be a non-negative integer.";
    auto label_is_valid = none_of(d_labels.data(), d_labels.data() + d_labels.size(),
                                  [] XGBOOST_DEVICE(ltr::rel_degree_t v) { return v > MaxRel(); });
    CHECK(label_is_valid) << "Relevance degree must be less than or equal to " << MaxRel()
                          << " when the exponential NDCG gain function is used. "
                          << "Set `ndcg_exp_gain` to false to use a custom DCG gain.";
  }
}
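
// A minimal CPU-side invocation sketch (illustrative; `param` and a host view of the
// labels are assumed to be in scope). The callable forwards to std::none_of, while a
// GPU caller would pass a thrust-based equivalent:
//
//   CheckNDCGLabels(param, labels_host_view, [](auto first, auto last, auto op) {
//     return std::none_of(first, last, op);
//   });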

template <typename AllOf>
bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
  auto s_label = label.Values();
  return all_of(s_label.data(), s_label.data() + s_label.size(), [] XGBOOST_DEVICE(float y) {
    return std::abs(y - 1.0f) < kRtEps || std::abs(y - 0.0f) < kRtEps;
  });
}

/**
 * \brief Validate label for precision-based metric.
 *
 * \tparam AllOf An implementation of std::all_of, passed in as a parameter so the same
 *               check can run on both CPU and GPU.
 */
template <typename AllOf>
void CheckPreLabels(StringView name, linalg::VectorView<float const> label, AllOf all_of) {
  auto is_binary = IsBinaryRel(label, all_of);
  CHECK(is_binary) << name << " can only be used with binary labels.";
}

class PreCache : public RankingCache {
  HostDeviceVector<double> pre_;

  void InitOnCPU(Context const* ctx, MetaInfo const& info);
  void InitOnCUDA(Context const* ctx, MetaInfo const& info);

 public:
  PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
      : RankingCache{ctx, info, p} {
    if (ctx->IsCPU()) {
      this->InitOnCPU(ctx, info);
    } else {
      this->InitOnCUDA(ctx, info);
    }
  }

  common::Span<double> Pre(Context const* ctx) {
    if (pre_.Empty()) {
      pre_.SetDevice(ctx->gpu_id);
      pre_.Resize(this->Groups());
    }
    return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
  }
};

class MAPCache : public RankingCache {
  // Total number of relevant documents for each group
  HostDeviceVector<double> n_rel_;
  // \sum l_k/k
  HostDeviceVector<double> acc_;
  HostDeviceVector<double> map_;
  // Number of samples in this dataset.
  std::size_t n_samples_{0};

  void InitOnCPU(Context const* ctx, MetaInfo const& info);
  void InitOnCUDA(Context const* ctx, MetaInfo const& info);

 public:
  MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
      : RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
    if (ctx->IsCPU()) {
      this->InitOnCPU(ctx, info);
    } else {
      this->InitOnCUDA(ctx, info);
    }
  }

  common::Span<double> NumRelevant(Context const* ctx) {
    if (n_rel_.Empty()) {
      n_rel_.SetDevice(ctx->gpu_id);
      n_rel_.Resize(n_samples_);
    }
    return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
  }
  common::Span<double> Acc(Context const* ctx) {
    if (acc_.Empty()) {
      acc_.SetDevice(ctx->gpu_id);
      acc_.Resize(n_samples_);
    }
    return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
  }
  common::Span<double> Map(Context const* ctx) {
    if (map_.Empty()) {
      map_.SetDevice(ctx->gpu_id);
      map_.Resize(this->Groups());
    }
    return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
  }
};
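
// For reference, with binary labels l_k at (1-based) rank k within a group, the cached
// terms combine as the usual average precision:
//
//   AP = (1 / n_rel) * sum_k l_k * (sum_{j<=k} l_j) / k
//
// and MAP is the mean of AP over query groups. This note is editorial; the actual
// computation lives in the objective and metric implementations.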

/**
 * \brief Parse name for ranking metric given parameters.
 *
 * \param [in]  name  The metric name, e.g. `ndcg`.
 * \param [in]  param The parameter string, e.g. the `3-` in `ndcg@3-`.
 * \param [out] topn  Top-n documents parsed from `param`; unchanged if not specified.
 * \param [out] minus Whether the metric should be negated; unchanged if not specified.
 *
 * \return The full name of the metric.
 */
std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus);

/**
 * \brief Make a metric name from the components accepted by `ParseMetricName`.
 */
std::string MakeMetricName(StringView name, position_t topn, bool minus);
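
// A hedged usage sketch: for the metric string `ndcg@3-`, the expected split is
// name = "ndcg" and param = "3-", yielding topn == 3 and minus == true; the returned
// name should round-trip through MakeMetricName:
//
//   ltr::position_t topn{0};
//   bool minus{false};
//   auto full = ltr::ParseMetricName("ndcg", "3-", &topn, &minus);
//   // topn == 3, minus == true, full == MakeMetricName("ndcg", topn, minus)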
}  // namespace xgboost::ltr
#endif  // XGBOOST_COMMON_RANKING_UTILS_H_
Get number of pairs for each sample.
Definition ranking_utils.h:98