1#ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_
2#define LIGHTGBM_IO_SPARSE_BIN_HPP_
4#include <LightGBM/utils/log.h>
6#include <LightGBM/bin.h>
8#include <LightGBM/utils/openmp_wrapper.h>
17template <
typename VAL_T>
class SparseBin;
19const size_t kNumFastIndex = 64;
21template <
typename VAL_T>
25 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
26 : bin_data_(bin_data), min_bin_(
static_cast<VAL_T
>(min_bin)),
27 max_bin_(
static_cast<VAL_T
>(max_bin)),
28 default_bin_(
static_cast<VAL_T
>(default_bin)) {
29 if (default_bin_ == 0) {
37 : bin_data_(bin_data) {
45 VAL_T ret = InnerRawGet(idx);
46 if (ret >= min_bin_ && ret <= max_bin_) {
47 return ret - min_bin_ + bias_;
65template <
typename VAL_T>
66class OrderedSparseBin;
68template <
typename VAL_T>
80 num_threads = omp_get_num_threads();
82 push_buffers_.resize(num_threads);
93 auto cur_bin =
static_cast<VAL_T
>(value);
95 push_buffers_[tid].emplace_back(idx, cur_bin);
104 Log::Fatal(
"Using OrderedSparseBin->ConstructHistogram() instead");
110 Log::Fatal(
"Using OrderedSparseBin->ConstructHistogram() instead");
116 Log::Fatal(
"Using OrderedSparseBin->ConstructHistogram() instead");
122 Log::Fatal(
"Using OrderedSparseBin->ConstructHistogram() instead");
130 while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
133 delta |=
static_cast<data_size_t>(deltas_[*i_delta]) << shift;
136 if (*i_delta < num_vals_) {
139 *cur_pos = num_data_;
145 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type,
bool default_left,
150 VAL_T th =
static_cast<VAL_T
>(threshold + min_bin);
151 const VAL_T minb =
static_cast<VAL_T
>(min_bin);
152 const VAL_T maxb =
static_cast<VAL_T
>(max_bin);
153 VAL_T t_default_bin =
static_cast<VAL_T
>(min_bin + default_bin);
154 if (default_bin == 0) {
163 if (missing_type == MissingType::NaN) {
164 if (default_bin <= threshold) {
165 default_indices = lte_indices;
166 default_count = <e_count;
171 missing_default_indices = lte_indices;
172 missing_default_count = <e_count;
176 const VAL_T bin = iterator.InnerRawGet(idx);
177 if (bin < minb || bin > maxb || t_default_bin == bin) {
178 default_indices[(*default_count)++] = idx;
179 }
else if (bin == maxb) {
180 missing_default_indices[(*missing_default_count)++] = idx;
181 }
else if (bin > th) {
182 gt_indices[gt_count++] = idx;
184 lte_indices[lte_count++] = idx;
188 if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
189 default_indices = lte_indices;
190 default_count = <e_count;
194 const VAL_T bin = iterator.InnerRawGet(idx);
195 if (bin < minb || bin > maxb || t_default_bin == bin) {
196 default_indices[(*default_count)++] = idx;
197 }
else if (bin > th) {
198 gt_indices[gt_count++] = idx;
200 lte_indices[lte_count++] = idx;
208 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
217 if (Common::FindInBitset(threshold, num_threahold, default_bin)) {
218 default_indices = lte_indices;
219 default_count = <e_count;
223 uint32_t bin = iterator.InnerRawGet(idx);
224 if (bin < min_bin || bin > max_bin) {
225 default_indices[(*default_count)++] = idx;
226 }
else if (Common::FindInBitset(threshold, num_threahold, bin - min_bin)) {
227 lte_indices[lte_count++] = idx;
229 gt_indices[gt_count++] = idx;
242 for (
size_t i = 0; i < push_buffers_.size(); ++i) {
243 pair_cnt += push_buffers_[i].size();
245 std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs = push_buffers_[0];
246 idx_val_pairs.reserve(pair_cnt);
248 for (
size_t i = 1; i < push_buffers_.size(); ++i) {
249 idx_val_pairs.insert(idx_val_pairs.end(), push_buffers_[i].begin(), push_buffers_[i].end());
250 push_buffers_[i].clear();
251 push_buffers_[i].shrink_to_fit();
254 std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
255 [](
const std::pair<data_size_t, VAL_T>& a,
const std::pair<data_size_t, VAL_T>& b) {
256 return a.first < b.first;
259 LoadFromPair(idx_val_pairs);
262 void LoadFromPair(
const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) {
267 for (
size_t i = 0; i < idx_val_pairs.size(); ++i) {
268 const data_size_t cur_idx = idx_val_pairs[i].first;
269 const VAL_T bin = idx_val_pairs[i].second;
271 if (i > 0 && cur_delta == 0) {
continue; }
272 while (cur_delta >= 256) {
273 deltas_.push_back(cur_delta & 0xff);
277 deltas_.push_back(
static_cast<uint8_t
>(cur_delta));
278 vals_.push_back(bin);
282 deltas_.push_back(0);
283 num_vals_ =
static_cast<data_size_t>(vals_.size());
286 deltas_.shrink_to_fit();
287 vals_.shrink_to_fit();
293 void GetFastIndex() {
296 data_size_t mod_size = (num_data_ + kNumFastIndex - 1) / kNumFastIndex;
298 fast_index_shift_ = 0;
299 while (pow2_mod_size < mod_size) {
307 while (NextNonzero(&i_delta, &cur_pos)) {
308 while (next_threshold <= cur_pos) {
309 fast_index_.emplace_back(i_delta, cur_pos);
310 next_threshold += pow2_mod_size;
314 while (next_threshold < num_data_) {
315 fast_index_.emplace_back(num_vals_ - 1, cur_pos);
316 next_threshold += pow2_mod_size;
318 fast_index_.shrink_to_fit();
322 writer->
Write(&num_vals_,
sizeof(num_vals_));
323 writer->
Write(deltas_.data(),
sizeof(uint8_t) * (num_vals_ + 1));
324 writer->
Write(vals_.data(),
sizeof(VAL_T) * num_vals_);
328 return sizeof(num_vals_) +
sizeof(uint8_t) * (num_vals_ + 1)
329 +
sizeof(VAL_T) * num_vals_;
332 void LoadFromMemory(
const void* memory,
const std::vector<data_size_t>& local_used_indices)
override {
333 const char* mem_ptr =
reinterpret_cast<const char*
>(memory);
335 mem_ptr +=
sizeof(tmp_num_vals);
336 const uint8_t* tmp_delta =
reinterpret_cast<const uint8_t*
>(mem_ptr);
337 mem_ptr +=
sizeof(uint8_t) * (tmp_num_vals + 1);
338 const VAL_T* tmp_vals =
reinterpret_cast<const VAL_T*
>(mem_ptr);
342 num_vals_ = tmp_num_vals;
344 deltas_.push_back(tmp_delta[i]);
345 vals_.push_back(tmp_vals[i]);
347 deltas_.push_back(0);
349 deltas_.shrink_to_fit();
350 vals_.shrink_to_fit();
352 if (local_used_indices.empty()) {
356 std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
359 for (
data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) {
361 while (cur_pos < idx && j < num_vals_) {
362 NextNonzero(&j, &cur_pos);
364 if (cur_pos == idx && j < num_vals_) {
366 tmp_pair.emplace_back(i, vals_[j]);
369 LoadFromPair(tmp_pair);
378 if (num_used_indices > 0) {
379 start = used_indices[0];
381 SparseBinIterator<VAL_T> iterator(other_bin, start);
384 for (
data_size_t i = 0; i < num_used_indices; ++i) {
385 VAL_T bin = iterator.InnerRawGet(used_indices[i]);
388 while (cur_delta >= 256) {
389 deltas_.push_back(cur_delta & 0xff);
393 deltas_.push_back(
static_cast<uint8_t
>(cur_delta));
394 vals_.push_back(bin);
399 deltas_.push_back(0);
400 num_vals_ =
static_cast<data_size_t>(vals_.size());
403 deltas_.shrink_to_fit();
404 vals_.shrink_to_fit();
412 std::vector<uint8_t> deltas_;
413 std::vector<VAL_T> vals_;
415 std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
416 std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
420template <
typename VAL_T>
421inline uint32_t SparseBinIterator<VAL_T>::RawGet(
data_size_t idx) {
422 return InnerRawGet(idx);
425template <
typename VAL_T>
426inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(
data_size_t idx) {
427 while (cur_pos_ < idx) {
428 bin_data_->NextNonzero(&i_delta_, &cur_pos_);
430 if (cur_pos_ == idx) {
431 return bin_data_->vals_[i_delta_];
437template <
typename VAL_T>
438inline void SparseBinIterator<VAL_T>::Reset(
data_size_t start_idx) {
439 auto idx = start_idx >> bin_data_->fast_index_shift_;
440 if (
static_cast<size_t>(idx) < bin_data_->fast_index_.size()) {
441 const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
442 i_delta_ = fast_pair.first;
443 cur_pos_ = fast_pair.second;
450template <
typename VAL_T>
Iterator for one bin column.
Definition bin.h:267
Interface for bin data. This class will store bin data for one feature. unlike OrderedBin,...
Definition bin.h:286
Interface for ordered bin data. efficient for construct histogram, especially for sparse bin There ar...
Definition bin.h:219
Interface for ordered bin data. efficient for construct histogram, especially for sparse bin There ar...
Definition ordered_sparse_bin.hpp:26
Definition sparse_bin.hpp:22
uint32_t Get(data_size_t idx) override
Get bin data on specific row index.
Definition sparse_bin.hpp:44
Definition sparse_bin.hpp:69
void Push(int tid, data_size_t idx, uint32_t value) override
Push one record \pram tid Thread id.
Definition sparse_bin.hpp:92
OrderedBin * CreateOrderedBin() const override
Create the ordered bin for this bin.
Definition ordered_sparse_bin.hpp:206
data_size_t num_data() const override
Number of all data.
Definition sparse_bin.hpp:235
void FinishLoad() override
After pushed all feature data, call this could have better refactor for bin data.
Definition sparse_bin.hpp:239
void SaveBinaryToFile(const VirtualFileWriter *writer) const override
Save binary data to file.
Definition sparse_bin.hpp:321
void ConstructHistogram(const data_size_t *, data_size_t, const score_t *, HistogramBinEntry *) const override
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
Definition sparse_bin.hpp:113
size_t SizesInByte() const override
Get sizes in byte of this object.
Definition sparse_bin.hpp:327
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, const uint32_t *threshold, int num_threahold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
Definition sparse_bin.hpp:207
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
Definition sparse_bin.hpp:144
void LoadFromMemory(const void *memory, const std::vector< data_size_t > &local_used_indices) override
Load from memory.
Definition sparse_bin.hpp:332
BinIterator * GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override
Get bin iterator of this bin for specific feature.
Definition sparse_bin.hpp:451
void ConstructHistogram(const data_size_t *, data_size_t, const score_t *, const score_t *, HistogramBinEntry *) const override
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
Definition sparse_bin.hpp:101
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Store data for one histogram bin.
Definition bin.h:29
An interface for writing files from buffers.
Definition file_io.h:15
virtual size_t Write(const void *data, size_t bytes) const =0
Append buffer to file.