Medial Code Documentation
Loading...
Searching...
No Matches
sparse_bin.hpp
#ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_SPARSE_BIN_HPP_

#include <LightGBM/bin.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>
#include <vector>
15namespace LightGBM {
16
template <typename VAL_T> class SparseBin;

/*!
* \brief Number of buckets in the fast-position index built by
*        SparseBin::GetFastIndex(), used to accelerate SparseBinIterator::Reset().
*/
const size_t kNumFastIndex = 64;
20
21template <typename VAL_T>
23public:
24 SparseBinIterator(const SparseBin<VAL_T>* bin_data,
25 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
26 : bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
27 max_bin_(static_cast<VAL_T>(max_bin)),
28 default_bin_(static_cast<VAL_T>(default_bin)) {
29 if (default_bin_ == 0) {
30 bias_ = 1;
31 } else {
32 bias_ = 0;
33 }
34 Reset(0);
35 }
36 SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
37 : bin_data_(bin_data) {
38 Reset(start_idx);
39 }
40
41 inline uint32_t RawGet(data_size_t idx) override;
42 inline VAL_T InnerRawGet(data_size_t idx);
43
44 inline uint32_t Get(data_size_t idx) override {
45 VAL_T ret = InnerRawGet(idx);
46 if (ret >= min_bin_ && ret <= max_bin_) {
47 return ret - min_bin_ + bias_;
48 } else {
49 return default_bin_;
50 }
51 }
52
53 inline void Reset(data_size_t idx) override;
54
55private:
56 const SparseBin<VAL_T>* bin_data_;
57 data_size_t cur_pos_;
58 data_size_t i_delta_;
59 VAL_T min_bin_;
60 VAL_T max_bin_;
61 VAL_T default_bin_;
62 uint8_t bias_;
63};
64
65template <typename VAL_T>
66class OrderedSparseBin;
67
68template <typename VAL_T>
69class SparseBin: public Bin {
70public:
71 friend class SparseBinIterator<VAL_T>;
72 friend class OrderedSparseBin<VAL_T>;
73
75 : num_data_(num_data) {
76 int num_threads = 1;
77#pragma omp parallel
78#pragma omp master
79 {
80 num_threads = omp_get_num_threads();
81 }
82 push_buffers_.resize(num_threads);
83 }
84
85 ~SparseBin() {
86 }
87
88 void ReSize(data_size_t num_data) override {
89 num_data_ = num_data;
90 }
91
92 void Push(int tid, data_size_t idx, uint32_t value) override {
93 auto cur_bin = static_cast<VAL_T>(value);
94 if (cur_bin != 0) {
95 push_buffers_[tid].emplace_back(idx, cur_bin);
96 }
97 }
98
99 BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
100
102 const score_t*, HistogramBinEntry*) const override {
103 // Will use OrderedSparseBin->ConstructHistogram() instead
104 Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
105 }
106
108 const score_t*, HistogramBinEntry*) const override {
109 // Will use OrderedSparseBin->ConstructHistogram() instead
110 Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
111 }
112
114 HistogramBinEntry*) const override {
115 // Will use OrderedSparseBin->ConstructHistogram() instead
116 Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
117 }
118
120 HistogramBinEntry*) const override {
121 // Will use OrderedSparseBin->ConstructHistogram() instead
122 Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
123 }
124
125 inline bool NextNonzero(data_size_t* i_delta,
126 data_size_t* cur_pos) const {
127 ++(*i_delta);
128 data_size_t shift = 0;
129 data_size_t delta = deltas_[*i_delta];
130 while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
131 ++(*i_delta);
132 shift += 8;
133 delta |= static_cast<data_size_t>(deltas_[*i_delta]) << shift;
134 }
135 *cur_pos += delta;
136 if (*i_delta < num_vals_) {
137 return true;
138 } else {
139 *cur_pos = num_data_;
140 return false;
141 }
142 }
143
145 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
146 uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
147 data_size_t* lte_indices, data_size_t* gt_indices) const override {
148 // not need to split
149 if (num_data <= 0) { return 0; }
150 VAL_T th = static_cast<VAL_T>(threshold + min_bin);
151 const VAL_T minb = static_cast<VAL_T>(min_bin);
152 const VAL_T maxb = static_cast<VAL_T>(max_bin);
153 VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
154 if (default_bin == 0) {
155 th -= 1;
156 t_default_bin -= 1;
157 }
158 SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
159 data_size_t lte_count = 0;
160 data_size_t gt_count = 0;
161 data_size_t* default_indices = gt_indices;
162 data_size_t* default_count = &gt_count;
163 if (missing_type == MissingType::NaN) {
164 if (default_bin <= threshold) {
165 default_indices = lte_indices;
166 default_count = &lte_count;
167 }
168 data_size_t* missing_default_indices = gt_indices;
169 data_size_t* missing_default_count = &gt_count;
170 if (default_left) {
171 missing_default_indices = lte_indices;
172 missing_default_count = &lte_count;
173 }
174 for (data_size_t i = 0; i < num_data; ++i) {
175 const data_size_t idx = data_indices[i];
176 const VAL_T bin = iterator.InnerRawGet(idx);
177 if (bin < minb || bin > maxb || t_default_bin == bin) {
178 default_indices[(*default_count)++] = idx;
179 } else if (bin == maxb) {
180 missing_default_indices[(*missing_default_count)++] = idx;
181 } else if (bin > th) {
182 gt_indices[gt_count++] = idx;
183 } else {
184 lte_indices[lte_count++] = idx;
185 }
186 }
187 } else {
188 if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
189 default_indices = lte_indices;
190 default_count = &lte_count;
191 }
192 for (data_size_t i = 0; i < num_data; ++i) {
193 const data_size_t idx = data_indices[i];
194 const VAL_T bin = iterator.InnerRawGet(idx);
195 if (bin < minb || bin > maxb || t_default_bin == bin) {
196 default_indices[(*default_count)++] = idx;
197 } else if (bin > th) {
198 gt_indices[gt_count++] = idx;
199 } else {
200 lte_indices[lte_count++] = idx;
201 }
202 }
203 }
204 return lte_count;
205 }
206
208 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
209 const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data,
210 data_size_t* lte_indices, data_size_t* gt_indices) const override {
211 if (num_data <= 0) { return 0; }
212 data_size_t lte_count = 0;
213 data_size_t gt_count = 0;
214 SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
215 data_size_t* default_indices = gt_indices;
216 data_size_t* default_count = &gt_count;
217 if (Common::FindInBitset(threshold, num_threahold, default_bin)) {
218 default_indices = lte_indices;
219 default_count = &lte_count;
220 }
221 for (data_size_t i = 0; i < num_data; ++i) {
222 const data_size_t idx = data_indices[i];
223 uint32_t bin = iterator.InnerRawGet(idx);
224 if (bin < min_bin || bin > max_bin) {
225 default_indices[(*default_count)++] = idx;
226 } else if (Common::FindInBitset(threshold, num_threahold, bin - min_bin)) {
227 lte_indices[lte_count++] = idx;
228 } else {
229 gt_indices[gt_count++] = idx;
230 }
231 }
232 return lte_count;
233 }
234
235 data_size_t num_data() const override { return num_data_; }
236
237 OrderedBin* CreateOrderedBin() const override;
238
239 void FinishLoad() override {
240 // get total non zero size
241 size_t pair_cnt = 0;
242 for (size_t i = 0; i < push_buffers_.size(); ++i) {
243 pair_cnt += push_buffers_[i].size();
244 }
245 std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs = push_buffers_[0];
246 idx_val_pairs.reserve(pair_cnt);
247
248 for (size_t i = 1; i < push_buffers_.size(); ++i) {
249 idx_val_pairs.insert(idx_val_pairs.end(), push_buffers_[i].begin(), push_buffers_[i].end());
250 push_buffers_[i].clear();
251 push_buffers_[i].shrink_to_fit();
252 }
253 // sort by data index
254 std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
255 [](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
256 return a.first < b.first;
257 });
258 // load delta array
259 LoadFromPair(idx_val_pairs);
260 }
261
262 void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) {
263 deltas_.clear();
264 vals_.clear();
265 // transform to delta array
266 data_size_t last_idx = 0;
267 for (size_t i = 0; i < idx_val_pairs.size(); ++i) {
268 const data_size_t cur_idx = idx_val_pairs[i].first;
269 const VAL_T bin = idx_val_pairs[i].second;
270 data_size_t cur_delta = cur_idx - last_idx;
271 if (i > 0 && cur_delta == 0) { continue; }
272 while (cur_delta >= 256) {
273 deltas_.push_back(cur_delta & 0xff);
274 vals_.push_back(0);
275 cur_delta >>= 8;
276 }
277 deltas_.push_back(static_cast<uint8_t>(cur_delta));
278 vals_.push_back(bin);
279 last_idx = cur_idx;
280 }
281 // avoid out of range
282 deltas_.push_back(0);
283 num_vals_ = static_cast<data_size_t>(vals_.size());
284
285 // reduce memory cost
286 deltas_.shrink_to_fit();
287 vals_.shrink_to_fit();
288
289 // generate fast index
290 GetFastIndex();
291 }
292
293 void GetFastIndex() {
294 fast_index_.clear();
295 // get shift cnt
296 data_size_t mod_size = (num_data_ + kNumFastIndex - 1) / kNumFastIndex;
297 data_size_t pow2_mod_size = 1;
298 fast_index_shift_ = 0;
299 while (pow2_mod_size < mod_size) {
300 pow2_mod_size <<= 1;
301 ++fast_index_shift_;
302 }
303 // build fast index
304 data_size_t i_delta = -1;
305 data_size_t cur_pos = 0;
306 data_size_t next_threshold = 0;
307 while (NextNonzero(&i_delta, &cur_pos)) {
308 while (next_threshold <= cur_pos) {
309 fast_index_.emplace_back(i_delta, cur_pos);
310 next_threshold += pow2_mod_size;
311 }
312 }
313 // avoid out of range
314 while (next_threshold < num_data_) {
315 fast_index_.emplace_back(num_vals_ - 1, cur_pos);
316 next_threshold += pow2_mod_size;
317 }
318 fast_index_.shrink_to_fit();
319 }
320
321 void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
322 writer->Write(&num_vals_, sizeof(num_vals_));
323 writer->Write(deltas_.data(), sizeof(uint8_t) * (num_vals_ + 1));
324 writer->Write(vals_.data(), sizeof(VAL_T) * num_vals_);
325 }
326
327 size_t SizesInByte() const override {
328 return sizeof(num_vals_) + sizeof(uint8_t) * (num_vals_ + 1)
329 + sizeof(VAL_T) * num_vals_;
330 }
331
332 void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
333 const char* mem_ptr = reinterpret_cast<const char*>(memory);
334 data_size_t tmp_num_vals = *(reinterpret_cast<const data_size_t*>(mem_ptr));
335 mem_ptr += sizeof(tmp_num_vals);
336 const uint8_t* tmp_delta = reinterpret_cast<const uint8_t*>(mem_ptr);
337 mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1);
338 const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr);
339
340 deltas_.clear();
341 vals_.clear();
342 num_vals_ = tmp_num_vals;
343 for (data_size_t i = 0; i < num_vals_; ++i) {
344 deltas_.push_back(tmp_delta[i]);
345 vals_.push_back(tmp_vals[i]);
346 }
347 deltas_.push_back(0);
348 // reduce memory cost
349 deltas_.shrink_to_fit();
350 vals_.shrink_to_fit();
351
352 if (local_used_indices.empty()) {
353 // generate fast index
354 GetFastIndex();
355 } else {
356 std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
357 data_size_t cur_pos = 0;
358 data_size_t j = -1;
359 for (data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) {
360 const data_size_t idx = local_used_indices[i];
361 while (cur_pos < idx && j < num_vals_) {
362 NextNonzero(&j, &cur_pos);
363 }
364 if (cur_pos == idx && j < num_vals_) {
365 // new row index is i
366 tmp_pair.emplace_back(i, vals_[j]);
367 }
368 }
369 LoadFromPair(tmp_pair);
370 }
371 }
372
373 void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
374 auto other_bin = dynamic_cast<const SparseBin<VAL_T>*>(full_bin);
375 deltas_.clear();
376 vals_.clear();
377 data_size_t start = 0;
378 if (num_used_indices > 0) {
379 start = used_indices[0];
380 }
381 SparseBinIterator<VAL_T> iterator(other_bin, start);
382 // transform to delta array
383 data_size_t last_idx = 0;
384 for (data_size_t i = 0; i < num_used_indices; ++i) {
385 VAL_T bin = iterator.InnerRawGet(used_indices[i]);
386 if (bin > 0) {
387 data_size_t cur_delta = i - last_idx;
388 while (cur_delta >= 256) {
389 deltas_.push_back(cur_delta & 0xff);
390 vals_.push_back(0);
391 cur_delta >>= 8;
392 }
393 deltas_.push_back(static_cast<uint8_t>(cur_delta));
394 vals_.push_back(bin);
395 last_idx = i;
396 }
397 }
398 // avoid out of range
399 deltas_.push_back(0);
400 num_vals_ = static_cast<data_size_t>(vals_.size());
401
402 // reduce memory cost
403 deltas_.shrink_to_fit();
404 vals_.shrink_to_fit();
405
406 // generate fast index
407 GetFastIndex();
408 }
409
410protected:
411 data_size_t num_data_;
412 std::vector<uint8_t> deltas_;
413 std::vector<VAL_T> vals_;
414 data_size_t num_vals_;
415 std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
416 std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
417 data_size_t fast_index_shift_;
418};
419
420template <typename VAL_T>
421inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
422 return InnerRawGet(idx);
423}
424
425template <typename VAL_T>
426inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
427 while (cur_pos_ < idx) {
428 bin_data_->NextNonzero(&i_delta_, &cur_pos_);
429 }
430 if (cur_pos_ == idx) {
431 return bin_data_->vals_[i_delta_];
432 } else {
433 return 0;
434 }
435}
436
437template <typename VAL_T>
438inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
439 auto idx = start_idx >> bin_data_->fast_index_shift_;
440 if (static_cast<size_t>(idx) < bin_data_->fast_index_.size()) {
441 const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
442 i_delta_ = fast_pair.first;
443 cur_pos_ = fast_pair.second;
444 } else {
445 i_delta_ = -1;
446 cur_pos_ = 0;
447 }
448}
449
450template <typename VAL_T>
451BinIterator* SparseBin<VAL_T>::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
452 return new SparseBinIterator<VAL_T>(this, min_bin, max_bin, default_bin);
453}
454
455} // namespace LightGBM
456#endif // LightGBM_IO_SPARSE_BIN_HPP_
Iterator for one bin column.
Definition bin.h:267
Interface for bin data. This class will store bin data for one feature. unlike OrderedBin,...
Definition bin.h:286
Interface for ordered bin data. efficient for construct histogram, especially for sparse bin There ar...
Definition bin.h:219
Interface for ordered bin data. efficient for construct histogram, especially for sparse bin There ar...
Definition ordered_sparse_bin.hpp:26
Definition sparse_bin.hpp:22
uint32_t Get(data_size_t idx) override
Get bin data on specific row index.
Definition sparse_bin.hpp:44
Definition sparse_bin.hpp:69
void Push(int tid, data_size_t idx, uint32_t value) override
Push one record. \param tid Thread id.
Definition sparse_bin.hpp:92
OrderedBin * CreateOrderedBin() const override
Create the ordered bin for this bin.
Definition ordered_sparse_bin.hpp:206
data_size_t num_data() const override
Number of all data.
Definition sparse_bin.hpp:235
void FinishLoad() override
After all feature data has been pushed, calling this refactors the bin data into a more efficient representation.
Definition sparse_bin.hpp:239
void SaveBinaryToFile(const VirtualFileWriter *writer) const override
Save binary data to file.
Definition sparse_bin.hpp:321
void ConstructHistogram(const data_size_t *, data_size_t, const score_t *, HistogramBinEntry *) const override
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
Definition sparse_bin.hpp:113
size_t SizesInByte() const override
Get sizes in byte of this object.
Definition sparse_bin.hpp:327
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, const uint32_t *threshold, int num_threahold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
Definition sparse_bin.hpp:207
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
Definition sparse_bin.hpp:144
void LoadFromMemory(const void *memory, const std::vector< data_size_t > &local_used_indices) override
Load from memory.
Definition sparse_bin.hpp:332
BinIterator * GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override
Get bin iterator of this bin for specific feature.
Definition sparse_bin.hpp:451
void ConstructHistogram(const data_size_t *, data_size_t, const score_t *, const score_t *, HistogramBinEntry *) const override
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
Definition sparse_bin.hpp:101
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Store data for one histogram bin.
Definition bin.h:29
An interface for writing files from buffers.
Definition file_io.h:15
virtual size_t Write(const void *data, size_t bytes) const =0
Append buffer to file.