Medial Code Documentation
Loading...
Searching...
No Matches
dense_bin.hpp
1#ifndef LIGHTGBM_IO_DENSE_BIN_HPP_
2#define LIGHTGBM_IO_DENSE_BIN_HPP_
3
4#include <LightGBM/bin.h>
5
6#include <vector>
7#include <cstring>
8#include <cstdint>
9
10namespace LightGBM {
11
12template <typename VAL_T>
13class DenseBin;
14
15template <typename VAL_T>
17public:
18 explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
19 : bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
20 max_bin_(static_cast<VAL_T>(max_bin)),
21 default_bin_(static_cast<VAL_T>(default_bin)) {
22 if (default_bin_ == 0) {
23 bias_ = 1;
24 } else {
25 bias_ = 0;
26 }
27 }
28 inline uint32_t RawGet(data_size_t idx) override;
29 inline uint32_t Get(data_size_t idx) override;
30 inline void Reset(data_size_t) override { }
31private:
32 const DenseBin<VAL_T>* bin_data_;
33 VAL_T min_bin_;
34 VAL_T max_bin_;
35 VAL_T default_bin_;
36 uint8_t bias_;
37};
42template <typename VAL_T>
43class DenseBin: public Bin {
44public:
47 : num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
48 }
49
50 ~DenseBin() {
51 }
52
53 void Push(int, data_size_t idx, uint32_t value) override {
54 data_[idx] = static_cast<VAL_T>(value);
55 }
56
57 void ReSize(data_size_t num_data) override {
58 if (num_data_ != num_data) {
59 num_data_ = num_data;
60 data_.resize(num_data_);
61 }
62 }
63
64 BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
65
67 const score_t* ordered_gradients, const score_t* ordered_hessians,
68 HistogramBinEntry* out) const override {
69 const data_size_t rest = num_data & 0x3;
70 data_size_t i = 0;
71 for (; i < num_data - rest; i += 4) {
72 const VAL_T bin0 = data_[data_indices[i]];
73 const VAL_T bin1 = data_[data_indices[i + 1]];
74 const VAL_T bin2 = data_[data_indices[i + 2]];
75 const VAL_T bin3 = data_[data_indices[i + 3]];
76
77 out[bin0].sum_gradients += ordered_gradients[i];
78 out[bin1].sum_gradients += ordered_gradients[i + 1];
79 out[bin2].sum_gradients += ordered_gradients[i + 2];
80 out[bin3].sum_gradients += ordered_gradients[i + 3];
81
82 out[bin0].sum_hessians += ordered_hessians[i];
83 out[bin1].sum_hessians += ordered_hessians[i + 1];
84 out[bin2].sum_hessians += ordered_hessians[i + 2];
85 out[bin3].sum_hessians += ordered_hessians[i + 3];
86
87 ++out[bin0].cnt;
88 ++out[bin1].cnt;
89 ++out[bin2].cnt;
90 ++out[bin3].cnt;
91 }
92 for (; i < num_data; ++i) {
93 const VAL_T bin = data_[data_indices[i]];
94 out[bin].sum_gradients += ordered_gradients[i];
95 out[bin].sum_hessians += ordered_hessians[i];
96 ++out[bin].cnt;
97 }
98 }
99
101 const score_t* ordered_gradients, const score_t* ordered_hessians,
102 HistogramBinEntry* out) const override {
103 const data_size_t rest = num_data & 0x3;
104 data_size_t i = 0;
105 for (; i < num_data - rest; i += 4) {
106 const VAL_T bin0 = data_[i];
107 const VAL_T bin1 = data_[i + 1];
108 const VAL_T bin2 = data_[i + 2];
109 const VAL_T bin3 = data_[i + 3];
110
111 out[bin0].sum_gradients += ordered_gradients[i];
112 out[bin1].sum_gradients += ordered_gradients[i + 1];
113 out[bin2].sum_gradients += ordered_gradients[i + 2];
114 out[bin3].sum_gradients += ordered_gradients[i + 3];
115
116 out[bin0].sum_hessians += ordered_hessians[i];
117 out[bin1].sum_hessians += ordered_hessians[i + 1];
118 out[bin2].sum_hessians += ordered_hessians[i + 2];
119 out[bin3].sum_hessians += ordered_hessians[i + 3];
120
121 ++out[bin0].cnt;
122 ++out[bin1].cnt;
123 ++out[bin2].cnt;
124 ++out[bin3].cnt;
125 }
126 for (; i < num_data; ++i) {
127 const VAL_T bin = data_[i];
128 out[bin].sum_gradients += ordered_gradients[i];
129 out[bin].sum_hessians += ordered_hessians[i];
130 ++out[bin].cnt;
131 }
132 }
133
135 const score_t* ordered_gradients,
136 HistogramBinEntry* out) const override {
137 const data_size_t rest = num_data & 0x3;
138 data_size_t i = 0;
139 for (; i < num_data - rest; i += 4) {
140 const VAL_T bin0 = data_[data_indices[i]];
141 const VAL_T bin1 = data_[data_indices[i + 1]];
142 const VAL_T bin2 = data_[data_indices[i + 2]];
143 const VAL_T bin3 = data_[data_indices[i + 3]];
144
145 out[bin0].sum_gradients += ordered_gradients[i];
146 out[bin1].sum_gradients += ordered_gradients[i + 1];
147 out[bin2].sum_gradients += ordered_gradients[i + 2];
148 out[bin3].sum_gradients += ordered_gradients[i + 3];
149
150 ++out[bin0].cnt;
151 ++out[bin1].cnt;
152 ++out[bin2].cnt;
153 ++out[bin3].cnt;
154 }
155 for (; i < num_data; ++i) {
156 const VAL_T bin = data_[data_indices[i]];
157 out[bin].sum_gradients += ordered_gradients[i];
158 ++out[bin].cnt;
159 }
160 }
161
163 const score_t* ordered_gradients,
164 HistogramBinEntry* out) const override {
165 const data_size_t rest = num_data & 0x3;
166 data_size_t i = 0;
167 for (; i < num_data - rest; i += 4) {
168 const VAL_T bin0 = data_[i];
169 const VAL_T bin1 = data_[i + 1];
170 const VAL_T bin2 = data_[i + 2];
171 const VAL_T bin3 = data_[i + 3];
172
173 out[bin0].sum_gradients += ordered_gradients[i];
174 out[bin1].sum_gradients += ordered_gradients[i + 1];
175 out[bin2].sum_gradients += ordered_gradients[i + 2];
176 out[bin3].sum_gradients += ordered_gradients[i + 3];
177
178 ++out[bin0].cnt;
179 ++out[bin1].cnt;
180 ++out[bin2].cnt;
181 ++out[bin3].cnt;
182 }
183 for (; i < num_data; ++i) {
184 const VAL_T bin = data_[i];
185 out[bin].sum_gradients += ordered_gradients[i];
186 ++out[bin].cnt;
187 }
188 }
189
191 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
192 uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
193 data_size_t* lte_indices, data_size_t* gt_indices) const override {
194 if (num_data <= 0) { return 0; }
195 VAL_T th = static_cast<VAL_T>(threshold + min_bin);
196 const VAL_T minb = static_cast<VAL_T>(min_bin);
197 const VAL_T maxb = static_cast<VAL_T>(max_bin);
198 VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
199 if (default_bin == 0) {
200 th -= 1;
201 t_default_bin -= 1;
202 }
203 data_size_t lte_count = 0;
204 data_size_t gt_count = 0;
205 data_size_t* default_indices = gt_indices;
206 data_size_t* default_count = &gt_count;
207 if (missing_type == MissingType::NaN) {
208 if (default_bin <= threshold) {
209 default_indices = lte_indices;
210 default_count = &lte_count;
211 }
212 data_size_t* missing_default_indices = gt_indices;
213 data_size_t* missing_default_count = &gt_count;
214 if (default_left) {
215 missing_default_indices = lte_indices;
216 missing_default_count = &lte_count;
217 }
218 for (data_size_t i = 0; i < num_data; ++i) {
219 const data_size_t idx = data_indices[i];
220 const VAL_T bin = data_[idx];
221 if (bin < minb || bin > maxb || t_default_bin == bin) {
222 default_indices[(*default_count)++] = idx;
223 } else if (bin == maxb) {
224 missing_default_indices[(*missing_default_count)++] = idx;
225 } else if (bin > th) {
226 gt_indices[gt_count++] = idx;
227 } else {
228 lte_indices[lte_count++] = idx;
229 }
230 }
231 } else {
232 if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
233 default_indices = lte_indices;
234 default_count = &lte_count;
235 }
236 for (data_size_t i = 0; i < num_data; ++i) {
237 const data_size_t idx = data_indices[i];
238 const VAL_T bin = data_[idx];
239 if (bin < minb || bin > maxb || t_default_bin == bin) {
240 default_indices[(*default_count)++] = idx;
241 } else if (bin > th) {
242 gt_indices[gt_count++] = idx;
243 } else {
244 lte_indices[lte_count++] = idx;
245 }
246 }
247 }
248 return lte_count;
249 }
250
252 uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
253 const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data,
254 data_size_t* lte_indices, data_size_t* gt_indices) const override {
255 if (num_data <= 0) { return 0; }
256 data_size_t lte_count = 0;
257 data_size_t gt_count = 0;
258 data_size_t* default_indices = gt_indices;
259 data_size_t* default_count = &gt_count;
260 if (Common::FindInBitset(threshold, num_threahold, default_bin)) {
261 default_indices = lte_indices;
262 default_count = &lte_count;
263 }
264 for (data_size_t i = 0; i < num_data; ++i) {
265 const data_size_t idx = data_indices[i];
266 const uint32_t bin = data_[idx];
267 if (bin < min_bin || bin > max_bin) {
268 default_indices[(*default_count)++] = idx;
269 } else if (Common::FindInBitset(threshold, num_threahold, bin - min_bin)) {
270 lte_indices[lte_count++] = idx;
271 } else {
272 gt_indices[gt_count++] = idx;
273 }
274 }
275 return lte_count;
276 }
277
278 data_size_t num_data() const override { return num_data_; }
279
281 OrderedBin* CreateOrderedBin() const override { return nullptr; }
282
283 void FinishLoad() override {}
284
285 void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
286 const VAL_T* mem_data = reinterpret_cast<const VAL_T*>(memory);
287 if (!local_used_indices.empty()) {
288 for (int i = 0; i < num_data_; ++i) {
289 data_[i] = mem_data[local_used_indices[i]];
290 }
291 } else {
292 for (int i = 0; i < num_data_; ++i) {
293 data_[i] = mem_data[i];
294 }
295 }
296 }
297
298 void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
299 auto other_bin = dynamic_cast<const DenseBin<VAL_T>*>(full_bin);
300 for (int i = 0; i < num_used_indices; ++i) {
301 data_[i] = other_bin->data_[used_indices[i]];
302 }
303 }
304
305 void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
306 writer->Write(data_.data(), sizeof(VAL_T) * num_data_);
307 }
308
309 size_t SizesInByte() const override {
310 return sizeof(VAL_T) * num_data_;
311 }
312
313protected:
314 data_size_t num_data_;
315 std::vector<VAL_T> data_;
316};
317
318template <typename VAL_T>
320 auto ret = bin_data_->data_[idx];
321 if (ret >= min_bin_ && ret <= max_bin_) {
322 return ret - min_bin_ + bias_;
323 } else {
324 return default_bin_;
325 }
326}
327
// Returns the bin value stored at row `idx` of the underlying DenseBin exactly
// as stored (implicitly widened from VAL_T to uint32_t). No min_bin_/max_bin_
// range check and no bias_ adjustment is applied here.
328template <typename VAL_T>
329inline uint32_t DenseBinIterator<VAL_T>::RawGet(data_size_t idx) {
330 return bin_data_->data_[idx];
331}
332
// Creates a DenseBinIterator over this bin for the given bin range
// [min_bin, max_bin] and default bin, and returns it as a BinIterator*.
// The iterator is allocated with a raw `new` and nothing in this function
// releases it.
// NOTE(review): presumably the caller takes ownership and must delete the
// returned pointer -- confirm against call sites.
333template <typename VAL_T>
334BinIterator* DenseBin<VAL_T>::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
335 return new DenseBinIterator<VAL_T>(this, min_bin, max_bin, default_bin);
336}
337
338} // namespace LightGBM
339#endif // LIGHTGBM_IO_DENSE_BIN_HPP_
Iterator for one bin column.
Definition bin.h:267
Interface for bin data. This class will store bin data for one feature. Unlike OrderedBin,...
Definition bin.h:286
Definition dense_bin.hpp:16
uint32_t Get(data_size_t idx) override
Get bin data on specific row index.
Definition dense_bin.hpp:319
Used to store bins for a dense feature. Uses a template to reduce memory cost.
Definition dense_bin.hpp:43
BinIterator * GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override
Get bin iterator of this bin for specific feature.
Definition dense_bin.hpp:334
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, const uint32_t *threshold, int num_threahold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold; if bin <= threshold, the row is put into left (lte_indices),...
Definition dense_bin.hpp:251
void Push(int, data_size_t idx, uint32_t value) override
Push one record. \param tid Thread id.
Definition dense_bin.hpp:53
void LoadFromMemory(const void *memory, const std::vector< data_size_t > &local_used_indices) override
Load from memory.
Definition dense_bin.hpp:285
void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, const score_t *ordered_hessians, HistogramBinEntry *out) const override
Construct histogram of this feature. Note: We use ordered_gradients and ordered_hessians to improve c...
Definition dense_bin.hpp:66
OrderedBin * CreateOrderedBin() const override
Dense features have no ordered bin.
Definition dense_bin.hpp:281
data_size_t num_data() const override
Number of all data.
Definition dense_bin.hpp:278
void SaveBinaryToFile(const VirtualFileWriter *writer) const override
Save binary data to file.
Definition dense_bin.hpp:305
void FinishLoad() override
After all feature data has been pushed, call this to allow the bin data to be refactored into a better form.
Definition dense_bin.hpp:283
size_t SizesInByte() const override
Get the size in bytes of this object.
Definition dense_bin.hpp:309
void ConstructHistogram(const data_size_t *data_indices, data_size_t num_data, const score_t *ordered_gradients, HistogramBinEntry *out) const override
Construct histogram of this feature. Note: We use ordered_gradients and ordered_hessians to improve c...
Definition dense_bin.hpp:134
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold, data_size_t *data_indices, data_size_t num_data, data_size_t *lte_indices, data_size_t *gt_indices) const override
Split data according to threshold; if bin <= threshold, the row is put into left (lte_indices),...
Definition dense_bin.hpp:190
Interface for ordered bin data. Efficient for constructing histograms, especially for sparse bins. There ar...
Definition bin.h:219
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
int32_t data_size_t
Type of data size; it is better to use a signed type.
Definition meta.h:14
Store data for one histogram bin.
Definition bin.h:29
An interface for writing files from buffers.
Definition file_io.h:15
virtual size_t Write(const void *data, size_t bytes) const =0
Append buffer to file.