4#include <LightGBM/meta.h>
6#include <LightGBM/utils/common.h>
7#include <LightGBM/utils/file_io.h>
12#include <unordered_map>
40 inline static void SumReducer(
const char *src,
char *dst,
int type_size, comm_size_t len) {
41 comm_size_t used_size = 0;
44 while (used_size < len) {
54 used_size += type_size;
68 bool CheckAlign(
const BinMapper& other)
const {
69 if (num_bin_ != other.num_bin_) {
72 if (missing_type_ != other.missing_type_) {
75 if (bin_type_ == BinType::NumericalBin) {
76 for (
int i = 0; i < num_bin_; ++i) {
77 if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
82 for (
int i = 0; i < num_bin_; i++) {
83 if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
/*! \brief Get number of bins (returns the num_bin_ member) */
92 inline int num_bin()
const {
return num_bin_; }
110 if (bin_type_ == BinType::NumericalBin) {
111 return bin_upper_bound_[bin];
113 return bin_2_categorical_[bin];
/*!
 * \brief Map a feature value into a bin
 * \param value Feature value to map
 * \return The bin for this value, as uint32_t
 */
125 inline uint32_t
ValueToBin(
double value)
const;
/*!
 * \brief Construct the feature-value-to-bin mapper according to the feature values
 * NOTE(review): the parameter descriptions below are inferred from the
 * parameter names only; confirm against the implementation.
 * \param values Feature values used to build the mapper
 * \param num_values Presumably the number of entries in values
 * \param total_sample_cnt Presumably the total number of sampled rows
 * \param max_bin Presumably the upper limit on the number of bins
 * \param min_data_in_bin Presumably the minimum number of data points per bin
 * \param min_split_data Presumably the minimum number of data points needed for a split
 * \param bin_type Type of bin to construct (a BinType value)
 * \param use_missing Presumably whether missing values get special handling
 * \param zero_as_missing Presumably whether zero is treated as missing
 */
146 void FindBin(
double* values,
int num_values,
size_t total_sample_cnt,
int max_bin,
int min_data_in_bin,
int min_split_data, BinType
bin_type,
147 bool use_missing,
bool zero_as_missing);
/*!
 * \brief Serialize this object to buffer
 * \param buffer Destination buffer
 * NOTE(review): the required buffer size is presumably what SizesInByte() reports; confirm.
 */
160 void CopyTo(
char* buffer)
const;
/*! \brief Get the bin type (returns the bin_type_ member) */
171 inline BinType
bin_type()
const {
return bin_type_; }
177 if (bin_type_ == BinType::CategoricalBin) {
178 return Common::Join(bin_2_categorical_,
":");
180 std::stringstream str_buf;
181 str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
182 str_buf <<
'[' << min_val_ <<
':' << max_val_ <<
']';
183 return str_buf.str();
190 MissingType missing_type_;
192 std::vector<double> bin_upper_bound_;
200 std::unordered_map<int, unsigned int> categorical_2_bin_;
202 std::vector<int> bin_2_categorical_;
208 uint32_t default_bin_;
/*!
 * \brief Split current bin, and perform re-order by leaf
 * NOTE(review): the parameter descriptions below are inferred from the
 * parameter names only; confirm against an implementing class.
 * \param leaf Presumably the index of the leaf being split
 * \param right_leaf Presumably the index of the new right leaf
 * \param is_in_leaf Presumably per-row markers indicating leaf membership
 * \param mark Presumably the marker value compared against is_in_leaf entries
 */
261 virtual void Split(
int leaf,
int right_leaf,
const char* is_in_leaf,
char mark) = 0;
/*!
 * \brief Pure virtual; takes a leaf index and returns a data_size_t count
 * NOTE(review): presumably the number of non-zero entries belonging to the
 * given leaf; confirm against an implementing class.
 * \param leaf Leaf index
 */
263 virtual data_size_t NonZeroCount(
int leaf)
const = 0;
321 const std::vector<data_size_t>& local_used_indices) = 0;
348 const score_t* ordered_gradients,
const score_t* ordered_hessians,
352 const score_t* ordered_gradients,
const score_t* ordered_hessians,
388 uint32_t default_bin, MissingType missing_type,
bool default_left, uint32_t threshold,
406 uint32_t default_bin,
const uint32_t* threshold,
int num_threshold,
433 double sparse_rate,
bool is_enable_sparse,
double sparse_threshold,
bool* is_sparse);
453 if (std::isnan(value)) {
454 if (missing_type_ == MissingType::NaN) {
460 if (bin_type_ == BinType::NumericalBin) {
463 int r = num_bin_ - 1;
464 if (missing_type_ == MissingType::NaN) {
468 int m = (r + l - 1) / 2;
469 if (value <= bin_upper_bound_[m]) {
477 int int_value =
static_cast<int>(value);
482 if (categorical_2_bin_.count(int_value)) {
483 return categorical_2_bin_.at(int_value);
Iterator for one bin column.
Definition bin.h:267
virtual uint32_t Get(data_size_t idx)=0
Get bin data on specific row index.
This class is used to convert feature values into bins, and to store some meta information for each bin.
Definition bin.h:61
BinType bin_type() const
Get bin types.
Definition bin.h:171
void FindBin(double *values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing)
Construct the feature-value-to-bin mapper according to the feature values.
Definition bin.cpp:219
static int SizeForSpecificBin(int bin)
Use a specific number of bins to calculate the size of this class.
Definition bin.cpp:416
MissingType missing_type() const
Missing Type.
Definition bin.h:94
void CopyFrom(const char *buffer)
Deserializing this object from buffer.
Definition bin.cpp:453
size_t SizesInByte() const
Get sizes in byte of this object.
Definition bin.cpp:499
uint32_t ValueToBin(double value) const
Mapping feature value into bin.
Definition bin.h:452
void SaveBinaryToFile(const VirtualFileWriter *writer) const
Save binary data to file.
Definition bin.cpp:483
double BinToValue(uint32_t bin) const
Mapping bin into feature value.
Definition bin.h:109
int num_bin() const
Get number of bins.
Definition bin.h:92
bool is_trivial() const
True if bin is trivial (contains only one bin)
Definition bin.h:96
std::string bin_info() const
Get bin info.
Definition bin.h:176
uint32_t GetDefaultBin() const
Get the default bin when value is 0.
Definition bin.h:131
double sparse_rate() const
Sparsity of this bin ( num_zero_bins / num_data )
Definition bin.h:98
void CopyTo(char *buffer) const
Serializing this object to buffer.
Definition bin.cpp:429
Interface for bin data. This class will store bin data for one feature. Unlike OrderedBin,...
Definition bin.h:286
static Bin * CreateDenseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for dense feature.
Definition bin.cpp:534
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, const uint32_t *threshold, int num_threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const =0
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
virtual void Push(int tid, data_size_t idx, uint32_t value)=0
Push one record. \param tid Thread id.
virtual void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, HistogramBinEntry *out) const =0
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
virtual void SaveBinaryToFile(const VirtualFileWriter *writer) const =0
Save binary data to file.
static Bin * CreateSparseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for sparse feature.
Definition bin.cpp:546
virtual BinIterator * GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const =0
Get bin iterator of this bin for specific feature.
static Bin * CreateBin(data_size_t num_data, int num_bin, double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool *is_sparse)
Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "...
Definition bin.cpp:522
virtual void LoadFromMemory(const void *memory, const std::vector< data_size_t > &local_used_indices)=0
Load from memory.
virtual ~Bin()
virtual destructor
Definition bin.h:289
virtual OrderedBin * CreateOrderedBin() const =0
Create the ordered bin for this bin.
virtual size_t SizesInByte() const =0
Get sizes in byte of this object.
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const =0
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
virtual void FinishLoad()=0
After all feature data has been pushed, call this to better refactor the bin data.
virtual void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, const score_t *ordered_hessians, HistogramBinEntry *out) const =0
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
virtual data_size_t num_data() const =0
Number of all data.
Interface for ordered bin data. Efficient for constructing histograms, especially for sparse bins. There ar...
Definition bin.h:219
virtual ~OrderedBin()
virtual destructor
Definition bin.h:222
virtual void Split(int leaf, int right_leaf, const char *is_in_leaf, char mark)=0
Split current bin, and perform re-order by leaf.
virtual void ConstructHistogram(int leaf, const score_t *gradients, const score_t *hessians, HistogramBinEntry *out) const =0
Construct histogram by using this bin Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ...
virtual void ConstructHistogram(int leaf, const score_t *gradients, HistogramBinEntry *out) const =0
Construct histogram by using this bin Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ...
virtual void Init(const char *used_indices, data_size_t num_leaves)=0
Initialization logic.
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Store data for one histogram bin.
Definition bin.h:29
static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len)
Sum-up (reducer) function for histogram bins.
Definition bin.h:40
data_size_t cnt
Number of data on this bin.
Definition bin.h:36
double sum_hessians
Sum of hessians on this bin.
Definition bin.h:34
double sum_gradients
Sum of gradients on this bin.
Definition bin.h:32
An interface for writing files from buffers.
Definition file_io.h:15