Medial Code Documentation
Loading...
Searching...
No Matches
bin.h
1#ifndef LIGHTGBM_BIN_H_
2#define LIGHTGBM_BIN_H_
3
4#include <LightGBM/meta.h>
5
6#include <LightGBM/utils/common.h>
7#include <LightGBM/utils/file_io.h>
8
9
10#include <vector>
11#include <functional>
12#include <unordered_map>
13#include <sstream>
14
15namespace LightGBM {
16
/*! \brief Kind of binning applied to a feature */
enum BinType {
  NumericalBin = 0,   // bins are described by per-bin upper bounds
  CategoricalBin = 1  // each bin corresponds to one integer category
};
21
/*! \brief Missing type: how missing values of a feature are represented */
enum MissingType {
  None = 0,  // no dedicated NaN bin; BinMapper::ValueToBin then bins a NaN input as 0
  Zero = 1,  // NOTE(review): presumably zero is treated as missing (cf. zero_as_missing in FindBin) - confirm
  NaN = 2    // NaN inputs are mapped to the last bin by BinMapper::ValueToBin
};
27
30public:
/*! \brief Sum of gradients on this bin */
32 double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
34 double sum_hessians = 0.0f;
40 inline static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len) {
41 comm_size_t used_size = 0;
42 const HistogramBinEntry* p1;
44 while (used_size < len) {
45 // convert
46 p1 = reinterpret_cast<const HistogramBinEntry*>(src);
47 p2 = reinterpret_cast<HistogramBinEntry*>(dst);
48 // add
49 p2->cnt += p1->cnt;
51 p2->sum_hessians += p1->sum_hessians;
52 src += type_size;
53 dst += type_size;
54 used_size += type_size;
55 }
56 }
57};
58
61class BinMapper {
62public:
63 BinMapper();
64 BinMapper(const BinMapper& other);
65 explicit BinMapper(const void* memory);
66 ~BinMapper();
67
68 bool CheckAlign(const BinMapper& other) const {
69 if (num_bin_ != other.num_bin_) {
70 return false;
71 }
72 if (missing_type_ != other.missing_type_) {
73 return false;
74 }
75 if (bin_type_ == BinType::NumericalBin) {
76 for (int i = 0; i < num_bin_; ++i) {
77 if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
78 return false;
79 }
80 }
81 } else {
82 for (int i = 0; i < num_bin_; i++) {
83 if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
84 return false;
85 }
86 }
87 }
88 return true;
89 }
90
92 inline int num_bin() const { return num_bin_; }
94 inline MissingType missing_type() const { return missing_type_; }
96 inline bool is_trivial() const { return is_trivial_; }
98 inline double sparse_rate() const { return sparse_rate_; }
103 void SaveBinaryToFile(const VirtualFileWriter* writer) const;
109 inline double BinToValue(uint32_t bin) const {
110 if (bin_type_ == BinType::NumericalBin) {
111 return bin_upper_bound_[bin];
112 } else {
113 return bin_2_categorical_[bin];
114 }
115 }
119 size_t SizesInByte() const;
125 inline uint32_t ValueToBin(double value) const;
126
131 inline uint32_t GetDefaultBin() const {
132 return default_bin_;
133 }
146 void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
147 bool use_missing, bool zero_as_missing);
148
154 static int SizeForSpecificBin(int bin);
155
160 void CopyTo(char* buffer) const;
161
166 void CopyFrom(const char* buffer);
167
171 inline BinType bin_type() const { return bin_type_; }
172
176 inline std::string bin_info() const {
177 if (bin_type_ == BinType::CategoricalBin) {
178 return Common::Join(bin_2_categorical_, ":");
179 } else {
180 std::stringstream str_buf;
181 str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
182 str_buf << '[' << min_val_ << ':' << max_val_ << ']';
183 return str_buf.str();
184 }
185 }
186
187private:
189 int num_bin_;
190 MissingType missing_type_;
192 std::vector<double> bin_upper_bound_;
194 bool is_trivial_;
196 double sparse_rate_;
198 BinType bin_type_;
200 std::unordered_map<int, unsigned int> categorical_2_bin_;
202 std::vector<int> bin_2_categorical_;
204 double min_val_;
206 double max_val_;
208 uint32_t default_bin_;
209};
210
220public:
/*! \brief virtual destructor */
222 virtual ~OrderedBin() {}
223
/*! \brief Initialization logic */
230 virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
231
/*! \brief Construct histogram by using this bin; overload taking gradients and hessians, result is written to out */
241 virtual void ConstructHistogram(int leaf, const score_t* gradients,
242 const score_t* hessians, HistogramBinEntry* out) const = 0;
243
/*! \brief Construct histogram by using this bin; overload taking gradients only, result is written to out */
252 virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
253
/*! \brief Split current bin, and perform re-order by leaf */
261 virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
262
/*! \brief NOTE(review): presumably the number of non-zero entries held for the given leaf - confirm against the implementations */
263 virtual data_size_t NonZeroCount(int leaf) const = 0;
264};
265
268public:
/*! \brief Get bin data on specific row index */
274 virtual uint32_t Get(data_size_t idx) = 0;
/*! \brief NOTE(review): presumably the stored bin value at idx without the adjustment applied by Get - confirm against the implementations */
275 virtual uint32_t RawGet(data_size_t idx) = 0;
/*! \brief NOTE(review): presumably repositions the iterator at row idx - confirm against the implementations */
276 virtual void Reset(data_size_t idx) = 0;
/*! \brief virtual destructor */
277 virtual ~BinIterator() = default;
278};
279
/*! \brief Interface for bin data. This class will store bin data for one feature. */
286class Bin {
287public:
/*! \brief virtual destructor */
289 virtual ~Bin() {}
/*! \brief Push one record; tid is the thread id */
296 virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
297
298
/*! \brief NOTE(review): presumably fills this bin with the rows of full_bin listed in used_indices - confirm against the implementations */
299 virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*! \brief Get bin iterator of this bin for specific feature */
307 virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
308
/*! \brief Save binary data to file */
313 virtual void SaveBinaryToFile(const VirtualFileWriter* writer) const = 0;
314
/*! \brief Load from memory */
320 virtual void LoadFromMemory(const void* memory,
321 const std::vector<data_size_t>& local_used_indices) = 0;
322
/*! \brief Get sizes in byte of this object */
326 virtual size_t SizesInByte() const = 0;
327
/*! \brief Number of all data */
329 virtual data_size_t num_data() const = 0;
330
/*! \brief NOTE(review): presumably changes the number of rows this bin holds to num_data - confirm against the implementations */
331 virtual void ReSize(data_size_t num_data) = 0;
332
/*! \brief Construct histogram of this feature from ordered gradients and ordered hessians; result is written to out */
346 virtual void ConstructHistogram(
347 const data_size_t* data_indices, data_size_t num_data,
348 const score_t* ordered_gradients, const score_t* ordered_hessians,
349 HistogramBinEntry* out) const = 0;
350
/*! \brief NOTE(review): the first line of this declaration (source line 351) is missing from this listing; restore it from the original header */
352 const score_t* ordered_gradients, const score_t* ordered_hessians,
353 HistogramBinEntry* out) const = 0;
354
/*! \brief Construct histogram of this feature from ordered gradients only; result is written to out */
367 virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
368 const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
369
/*! \brief NOTE(review): the first line of this declaration (source line 370) is missing from this listing; restore it from the original header */
371 const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
372
/*! \brief Split data according to threshold; if bin <= threshold, the row is put into the left side (lte_indices) */
387 virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
388 uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
389 data_size_t* data_indices, data_size_t num_data,
390 data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
391
/*! \brief Split data according to threshold, categorical variant taking num_threshold threshold entries */
405 virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin,
406 uint32_t default_bin, const uint32_t* threshold, int num_threshold,
407 data_size_t* data_indices, data_size_t num_data,
408 data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
409
/*! \brief Create the ordered bin for this bin */
414 virtual OrderedBin* CreateOrderedBin() const = 0;
415
/*! \brief To be called after all feature data has been pushed, so the bin data can be refactored */
419 virtual void FinishLoad() = 0;
420
/*! \brief Create object for bin data of one feature; calls CreateDenseBin or CreateSparseBin */
432 static Bin* CreateBin(data_size_t num_data, int num_bin,
433 double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
434
/*! \brief Create object for bin data of one feature, used for dense feature */
441 static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
442
/*! \brief Create object for bin data of one feature, used for sparse feature */
449 static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
450};
451
452inline uint32_t BinMapper::ValueToBin(double value) const {
453 if (std::isnan(value)) {
454 if (missing_type_ == MissingType::NaN) {
455 return num_bin_ - 1;
456 } else {
457 value = 0.0f;
458 }
459 }
460 if (bin_type_ == BinType::NumericalBin) {
461 // binary search to find bin
462 int l = 0;
463 int r = num_bin_ - 1;
464 if (missing_type_ == MissingType::NaN) {
465 r -= 1;
466 }
467 while (l < r) {
468 int m = (r + l - 1) / 2;
469 if (value <= bin_upper_bound_[m]) {
470 r = m;
471 } else {
472 l = m + 1;
473 }
474 }
475 return l;
476 } else {
477 int int_value = static_cast<int>(value);
478 // convert negative value to NaN bin
479 if (int_value < 0) {
480 return num_bin_ - 1;
481 }
482 if (categorical_2_bin_.count(int_value)) {
483 return categorical_2_bin_.at(int_value);
484 } else {
485 return num_bin_ - 1;
486 }
487 }
488}
489
490} // namespace LightGBM
491
492#endif // LIGHTGBM_BIN_H_
Iterator for one bin column.
Definition bin.h:267
virtual uint32_t Get(data_size_t idx)=0
Get bin data on specific row index.
This class is used to convert feature values into bins, and to store some meta information about the bins.
Definition bin.h:61
BinType bin_type() const
Get bin types.
Definition bin.h:171
void FindBin(double *values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing)
Construct the feature-value-to-bin mapper according to the feature values.
Definition bin.cpp:219
static int SizeForSpecificBin(int bin)
Use specific number of bin to calculate the size of this class.
Definition bin.cpp:416
MissingType missing_type() const
Missing Type.
Definition bin.h:94
void CopyFrom(const char *buffer)
Deserializing this object from buffer.
Definition bin.cpp:453
size_t SizesInByte() const
Get sizes in byte of this object.
Definition bin.cpp:499
uint32_t ValueToBin(double value) const
Mapping feature value into bin.
Definition bin.h:452
void SaveBinaryToFile(const VirtualFileWriter *writer) const
Save binary data to file.
Definition bin.cpp:483
double BinToValue(uint32_t bin) const
Mapping bin into feature value.
Definition bin.h:109
int num_bin() const
Get number of bins.
Definition bin.h:92
bool is_trivial() const
True if bin is trivial (contains only one bin)
Definition bin.h:96
std::string bin_info() const
Get bin info.
Definition bin.h:176
uint32_t GetDefaultBin() const
Get the default bin when value is 0.
Definition bin.h:131
double sparse_rate() const
Sparsity of this bin ( num_zero_bins / num_data )
Definition bin.h:98
void CopyTo(char *buffer) const
Serializing this object to buffer.
Definition bin.cpp:429
Interface for bin data. This class will store bin data for one feature. unlike OrderedBin,...
Definition bin.h:286
static Bin * CreateDenseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for dense feature.
Definition bin.cpp:534
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, const uint32_t *threshold, int num_threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const =0
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
virtual void Push(int tid, data_size_t idx, uint32_t value)=0
Push one record. \param tid Thread id.
virtual void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, HistogramBinEntry *out) const =0
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
virtual void SaveBinaryToFile(const VirtualFileWriter *writer) const =0
Save binary data to file.
static Bin * CreateSparseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for sparse feature.
Definition bin.cpp:546
virtual BinIterator * GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const =0
Get bin iterator of this bin for specific feature.
static Bin * CreateBin(data_size_t num_data, int num_bin, double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool *is_sparse)
Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "...
Definition bin.cpp:522
virtual void LoadFromMemory(const void *memory, const std::vector< data_size_t > &local_used_indices)=0
Load from memory.
virtual ~Bin()
virtual destructor
Definition bin.h:289
virtual OrderedBin * CreateOrderedBin() const =0
Create the ordered bin for this bin.
virtual size_t SizesInByte() const =0
Get sizes in byte of this object.
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const =0
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
virtual void FinishLoad()=0
After all feature data has been pushed, call this to allow a better refactoring of the bin data.
virtual void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, const score_t *ordered_hessians, HistogramBinEntry *out) const =0
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
virtual data_size_t num_data() const =0
Number of all data.
Interface for ordered bin data. efficient for construct histogram, especially for sparse bin There ar...
Definition bin.h:219
virtual ~OrderedBin()
virtual destructor
Definition bin.h:222
virtual void Split(int leaf, int right_leaf, const char *is_in_leaf, char mark)=0
Split current bin, and perform re-order by leaf.
virtual void ConstructHistogram(int leaf, const score_t *gradients, const score_t *hessians, HistogramBinEntry *out) const =0
Construct histogram by using this bin Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ...
virtual void ConstructHistogram(int leaf, const score_t *gradients, HistogramBinEntry *out) const =0
Construct histogram by using this bin Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ...
virtual void Init(const char *used_indices, data_size_t num_leaves)=0
Initialization logic.
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Store data for one histogram bin.
Definition bin.h:29
static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len)
Sum up (reducers) functions for histogram bin.
Definition bin.h:40
data_size_t cnt
Number of data on this bin.
Definition bin.h:36
double sum_hessians
Sum of hessians on this bin.
Definition bin.h:34
double sum_gradients
Sum of gradients on this bin.
Definition bin.h:32
An interface for writing files from buffers.
Definition file_io.h:15