Medial Code Documentation
Loading...
Searching...
No Matches
dense_nbits_bin.hpp
1#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
2#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
3
4#include <LightGBM/bin.h>
5
6#include <vector>
7#include <cstring>
8#include <cstdint>
9
10namespace LightGBM {
11
12class Dense4bitsBin;
13
15public:
16 explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
17 : bin_data_(bin_data), min_bin_(static_cast<uint8_t>(min_bin)),
18 max_bin_(static_cast<uint8_t>(max_bin)),
19 default_bin_(static_cast<uint8_t>(default_bin)) {
20 if (default_bin_ == 0) {
21 bias_ = 1;
22 } else {
23 bias_ = 0;
24 }
25 }
26 inline uint32_t RawGet(data_size_t idx) override;
27 inline uint32_t Get(data_size_t idx) override;
28 inline void Reset(data_size_t) override {}
29private:
30 const Dense4bitsBin* bin_data_;
31 uint8_t min_bin_;
32 uint8_t max_bin_;
33 uint8_t default_bin_;
34 uint8_t bias_;
35};
36
37class Dense4bitsBin : public Bin {
38public:
41 : num_data_(num_data) {
42 int len = (num_data_ + 1) / 2;
43 data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
44 buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
45 }
46
48 }
49
50 void Push(int, data_size_t idx, uint32_t value) override {
51 const int i1 = idx >> 1;
52 const int i2 = (idx & 1) << 2;
53 const uint8_t val = static_cast<uint8_t>(value) << i2;
54 if (i2 == 0) {
55 data_[i1] = val;
56 } else {
57 buf_[i1] = val;
58 }
59 }
60
61 void ReSize(data_size_t num_data) override {
62 if (num_data_ != num_data) {
63 num_data_ = num_data;
64 const int len = (num_data_ + 1) / 2;
65 data_.resize(len);
66 }
67 }
68
69 inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
70
72 const score_t* ordered_gradients, const score_t* ordered_hessians,
73 HistogramBinEntry* out) const override {
74 const data_size_t rest = num_data & 0x3;
75 data_size_t i = 0;
76 for (; i < num_data - rest; i += 4) {
77 const data_size_t idx0 = data_indices[i];
78 const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf;
79
80 const data_size_t idx1 = data_indices[i + 1];
81 const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf;
82
83 const data_size_t idx2 = data_indices[i + 2];
84 const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf;
85
86 const data_size_t idx3 = data_indices[i + 3];
87 const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf;
88
89 out[bin0].sum_gradients += ordered_gradients[i];
90 out[bin1].sum_gradients += ordered_gradients[i + 1];
91 out[bin2].sum_gradients += ordered_gradients[i + 2];
92 out[bin3].sum_gradients += ordered_gradients[i + 3];
93
94 out[bin0].sum_hessians += ordered_hessians[i];
95 out[bin1].sum_hessians += ordered_hessians[i + 1];
96 out[bin2].sum_hessians += ordered_hessians[i + 2];
97 out[bin3].sum_hessians += ordered_hessians[i + 3];
98
99 ++out[bin0].cnt;
100 ++out[bin1].cnt;
101 ++out[bin2].cnt;
102 ++out[bin3].cnt;
103 }
104
105 for (; i < num_data; ++i) {
106 const data_size_t idx = data_indices[i];
107 const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
108 out[bin].sum_gradients += ordered_gradients[i];
109 out[bin].sum_hessians += ordered_hessians[i];
110 ++out[bin].cnt;
111 }
112 }
113
115 const score_t* ordered_gradients, const score_t* ordered_hessians,
116 HistogramBinEntry* out) const override {
117 const data_size_t rest = num_data & 0x3;
118 data_size_t i = 0;
119 for (; i < num_data - rest; i += 4) {
120 const auto bin0 = (data_[i >> 1]) & 0xf;
121 const auto bin1 = (data_[i >> 1] >> 4) & 0xf;
122 const auto bin2 = (data_[(i >> 1) + 1]) & 0xf;
123 const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf;
124
125 out[bin0].sum_gradients += ordered_gradients[i];
126 out[bin1].sum_gradients += ordered_gradients[i + 1];
127 out[bin2].sum_gradients += ordered_gradients[i + 2];
128 out[bin3].sum_gradients += ordered_gradients[i + 3];
129
130 out[bin0].sum_hessians += ordered_hessians[i];
131 out[bin1].sum_hessians += ordered_hessians[i + 1];
132 out[bin2].sum_hessians += ordered_hessians[i + 2];
133 out[bin3].sum_hessians += ordered_hessians[i + 3];
134
135 ++out[bin0].cnt;
136 ++out[bin1].cnt;
137 ++out[bin2].cnt;
138 ++out[bin3].cnt;
139 }
140 for (; i < num_data; ++i) {
141 const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
142 out[bin].sum_gradients += ordered_gradients[i];
143 out[bin].sum_hessians += ordered_hessians[i];
144 ++out[bin].cnt;
145 }
146 }
147
149 const score_t* ordered_gradients,
150 HistogramBinEntry* out) const override {
151 const data_size_t rest = num_data & 0x3;
152 data_size_t i = 0;
153 for (; i < num_data - rest; i += 4) {
154 const data_size_t idx0 = data_indices[i];
155 const auto bin0 = (data_[idx0 >> 1] >> ((idx0 & 1) << 2)) & 0xf;
156
157 const data_size_t idx1 = data_indices[i + 1];
158 const auto bin1 = (data_[idx1 >> 1] >> ((idx1 & 1) << 2)) & 0xf;
159
160 const data_size_t idx2 = data_indices[i + 2];
161 const auto bin2 = (data_[idx2 >> 1] >> ((idx2 & 1) << 2)) & 0xf;
162
163 const data_size_t idx3 = data_indices[i + 3];
164 const auto bin3 = (data_[idx3 >> 1] >> ((idx3 & 1) << 2)) & 0xf;
165
166 out[bin0].sum_gradients += ordered_gradients[i];
167 out[bin1].sum_gradients += ordered_gradients[i + 1];
168 out[bin2].sum_gradients += ordered_gradients[i + 2];
169 out[bin3].sum_gradients += ordered_gradients[i + 3];
170
171 ++out[bin0].cnt;
172 ++out[bin1].cnt;
173 ++out[bin2].cnt;
174 ++out[bin3].cnt;
175 }
176
177 for (; i < num_data; ++i) {
178 const data_size_t idx = data_indices[i];
179 const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
180 out[bin].sum_gradients += ordered_gradients[i];
181 ++out[bin].cnt;
182 }
183 }
184
186 const score_t* ordered_gradients,
187 HistogramBinEntry* out) const override {
188 const data_size_t rest = num_data & 0x3;
189 data_size_t i = 0;
190 for (; i < num_data - rest; i += 4) {
191 const auto bin0 = (data_[i >> 1]) & 0xf;
192 const auto bin1 = (data_[i >> 1] >> 4) & 0xf;
193 const auto bin2 = (data_[(i >> 1) + 1]) & 0xf;
194 const auto bin3 = (data_[(i >> 1) + 1] >> 4) & 0xf;
195
196 out[bin0].sum_gradients += ordered_gradients[i];
197 out[bin1].sum_gradients += ordered_gradients[i + 1];
198 out[bin2].sum_gradients += ordered_gradients[i + 2];
199 out[bin3].sum_gradients += ordered_gradients[i + 3];
200
201 ++out[bin0].cnt;
202 ++out[bin1].cnt;
203 ++out[bin2].cnt;
204 ++out[bin3].cnt;
205 }
206 for (; i < num_data; ++i) {
207 const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
208 out[bin].sum_gradients += ordered_gradients[i];
209 ++out[bin].cnt;
210 }
211 }
212
214 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
215 uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
216 data_size_t* lte_indices, data_size_t* gt_indices) const override {
217 if (num_data <= 0) { return 0; }
218 uint8_t th = static_cast<uint8_t>(threshold + min_bin);
219 const uint8_t minb = static_cast<uint8_t>(min_bin);
220 const uint8_t maxb = static_cast<uint8_t>(max_bin);
221 uint8_t t_default_bin = static_cast<uint8_t>(min_bin + default_bin);
222 if (default_bin == 0) {
223 th -= 1;
224 t_default_bin -= 1;
225 }
226 data_size_t lte_count = 0;
227 data_size_t gt_count = 0;
228 data_size_t* default_indices = gt_indices;
229 data_size_t* default_count = &gt_count;
230 if (missing_type == MissingType::NaN) {
231 if (default_bin <= threshold) {
232 default_indices = lte_indices;
233 default_count = &lte_count;
234 }
235 data_size_t* missing_default_indices = gt_indices;
236 data_size_t* missing_default_count = &gt_count;
237 if (default_left) {
238 missing_default_indices = lte_indices;
239 missing_default_count = &lte_count;
240 }
241 for (data_size_t i = 0; i < num_data; ++i) {
242 const data_size_t idx = data_indices[i];
243 const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
244 if (bin < minb || bin > maxb || t_default_bin == bin) {
245 default_indices[(*default_count)++] = idx;
246 } else if (bin == maxb) {
247 missing_default_indices[(*missing_default_count)++] = idx;
248 } else if (bin > th) {
249 gt_indices[gt_count++] = idx;
250 } else {
251 lte_indices[lte_count++] = idx;
252 }
253 }
254 } else {
255 if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
256 default_indices = lte_indices;
257 default_count = &lte_count;
258 }
259 for (data_size_t i = 0; i < num_data; ++i) {
260 const data_size_t idx = data_indices[i];
261 const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
262 if (bin < minb || bin > maxb || t_default_bin == bin) {
263 default_indices[(*default_count)++] = idx;
264 } else if (bin > th) {
265 gt_indices[gt_count++] = idx;
266 } else {
267 lte_indices[lte_count++] = idx;
268 }
269 }
270 }
271 return lte_count;
272 }
273
275 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
276 const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data,
277 data_size_t* lte_indices, data_size_t* gt_indices) const override {
278 if (num_data <= 0) { return 0; }
279 data_size_t lte_count = 0;
280 data_size_t gt_count = 0;
281 data_size_t* default_indices = gt_indices;
282 data_size_t* default_count = &gt_count;
283 if (Common::FindInBitset(threshold, num_threahold, default_bin)) {
284 default_indices = lte_indices;
285 default_count = &lte_count;
286 }
287 for (data_size_t i = 0; i < num_data; ++i) {
288 const data_size_t idx = data_indices[i];
289 const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
290 if (bin < min_bin || bin > max_bin) {
291 default_indices[(*default_count)++] = idx;
292 } else if (Common::FindInBitset(threshold, num_threahold, bin - min_bin)) {
293 lte_indices[lte_count++] = idx;
294 } else {
295 gt_indices[gt_count++] = idx;
296 }
297 }
298 return lte_count;
299 }
300
301 data_size_t num_data() const override { return num_data_; }
302
304 OrderedBin* CreateOrderedBin() const override { return nullptr; }
305
306 void FinishLoad() override {
307 if (buf_.empty()) { return; }
308 int len = (num_data_ + 1) / 2;
309 for (int i = 0; i < len; ++i) {
310 data_[i] |= buf_[i];
311 }
312 buf_.clear();
313 }
314
315 void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
316 const uint8_t* mem_data = reinterpret_cast<const uint8_t*>(memory);
317 if (!local_used_indices.empty()) {
318 const data_size_t rest = num_data_ & 1;
319 for (int i = 0; i < num_data_ - rest; i += 2) {
320 // get old bins
321 data_size_t idx = local_used_indices[i];
322 const auto bin1 = static_cast<uint8_t>((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
323 idx = local_used_indices[i + 1];
324 const auto bin2 = static_cast<uint8_t>((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
325 // add
326 const int i1 = i >> 1;
327 data_[i1] = (bin1 | (bin2 << 4));
328 }
329 if (rest) {
330 data_size_t idx = local_used_indices[num_data_ - 1];
331 data_[num_data_ >> 1] = (mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
332 }
333 } else {
334 for (size_t i = 0; i < data_.size(); ++i) {
335 data_[i] = mem_data[i];
336 }
337 }
338 }
339
340 void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
341 auto other_bin = dynamic_cast<const Dense4bitsBin*>(full_bin);
342 const data_size_t rest = num_used_indices & 1;
343 for (int i = 0; i < num_used_indices - rest; i += 2) {
344 data_size_t idx = used_indices[i];
345 const auto bin1 = static_cast<uint8_t>((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
346 idx = used_indices[i + 1];
347 const auto bin2 = static_cast<uint8_t>((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
348 const int i1 = i >> 1;
349 data_[i1] = (bin1 | (bin2 << 4));
350 }
351 if (rest) {
352 data_size_t idx = used_indices[num_used_indices - 1];
353 data_[num_used_indices >> 1] = (other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
354 }
355 }
356
357 void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
358 writer->Write(data_.data(), sizeof(uint8_t) * data_.size());
359 }
360
361 size_t SizesInByte() const override {
362 return sizeof(uint8_t) * data_.size();
363 }
364
365protected:
366 data_size_t num_data_;
367 std::vector<uint8_t> data_;
368 std::vector<uint8_t> buf_;
369};
370
372 const auto bin = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
373 if (bin >= min_bin_ && bin <= max_bin_) {
374 return bin - min_bin_ + bias_;
375 } else {
376 return default_bin_;
377 }
378}
379
380uint32_t Dense4bitsBinIterator::RawGet(data_size_t idx) {
381 return (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
382}
383
384inline BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
385 return new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin);
386}
387
388} // namespace LightGBM
389#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
Iterator for one bin column.
Definition bin.h:267
Interface for bin data. This class will store bin data for one feature. unlike OrderedBin,...
Definition bin.h:286
Definition dense_nbits_bin.hpp:14
uint32_t Get(data_size_t idx) override
Get bin data on specific row index.
Definition dense_nbits_bin.hpp:371
Definition dense_nbits_bin.hpp:37
data_size_t num_data() const override
Number of all data.
Definition dense_nbits_bin.hpp:301
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
Definition dense_nbits_bin.hpp:213
void Push(int, data_size_t idx, uint32_t value) override
Push one record \pram tid Thread id.
Definition dense_nbits_bin.hpp:50
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, const uint32_t *threshold, int num_threahold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold, if bin <= threshold, will put into left(lte_indices),...
Definition dense_nbits_bin.hpp:274
void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, HistogramBinEntry *out) const override
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
Definition dense_nbits_bin.hpp:148
size_t SizesInByte() const override
Get sizes in byte of this object.
Definition dense_nbits_bin.hpp:361
void FinishLoad() override
After pushed all feature data, call this could have better refactor for bin data.
Definition dense_nbits_bin.hpp:306
void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, const score_t *ordered_hessians, HistogramBinEntry *out) const override
Construct histogram of this feature, Note: We use ordered_gradients and ordered_hessians to improve c...
Definition dense_nbits_bin.hpp:71
BinIterator * GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override
Get bin iterator of this bin for specific feature.
Definition dense_nbits_bin.hpp:384
OrderedBin * CreateOrderedBin() const override
not ordered bin for dense feature
Definition dense_nbits_bin.hpp:304
void LoadFromMemory(const void *memory, const std::vector< data_size_t > &local_used_indices) override
Load from memory.
Definition dense_nbits_bin.hpp:315
void SaveBinaryToFile(const VirtualFileWriter *writer) const override
Save binary data to file.
Definition dense_nbits_bin.hpp:357
Interface for ordered bin data. efficient for construct histogram, especially for sparse bin There ar...
Definition bin.h:219
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Store data for one histogram bin.
Definition bin.h:29
An interface for writing files from buffers.
Definition file_io.h:15
virtual size_t Write(const void *data, size_t bytes) const =0
Append buffer to file.