Medial Code Documentation
Loading...
Searching...
No Matches
feature_group.h
1#ifndef LIGHTGBM_FEATURE_GROUP_H_
2#define LIGHTGBM_FEATURE_GROUP_H_
3
4#include <LightGBM/utils/random.h>
5
6#include <LightGBM/meta.h>
7#include <LightGBM/bin.h>
8
9#include <cstdio>
10#include <memory>
11#include <vector>
12
13namespace LightGBM {
14
15class Dataset;
16class DatasetLoader;
19public:
20 friend Dataset;
21 friend DatasetLoader;
30 FeatureGroup(int num_feature,
31 std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
32 data_size_t num_data, double sparse_threshold, bool is_enable_sparse) : num_feature_(num_feature) {
33 CHECK(static_cast<int>(bin_mappers.size()) == num_feature);
34 // use bin at zero to store default_bin
35 num_total_bin_ = 1;
36 bin_offsets_.emplace_back(num_total_bin_);
37 int cnt_non_zero = 0;
38 for (int i = 0; i < num_feature_; ++i) {
39 bin_mappers_.emplace_back(bin_mappers[i].release());
40 auto num_bin = bin_mappers_[i]->num_bin();
41 if (bin_mappers_[i]->GetDefaultBin() == 0) {
42 num_bin -= 1;
43 }
44 num_total_bin_ += num_bin;
45 bin_offsets_.emplace_back(num_total_bin_);
46 cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
47 }
48 double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
49 bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
50 sparse_rate, is_enable_sparse, sparse_threshold, &is_sparse_));
51 }
52
53 FeatureGroup(int num_feature,
54 std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
55 data_size_t num_data, bool is_sparse) : num_feature_(num_feature) {
56 CHECK(static_cast<int>(bin_mappers.size()) == num_feature);
57 // use bin at zero to store default_bin
58 num_total_bin_ = 1;
59 bin_offsets_.emplace_back(num_total_bin_);
60 for (int i = 0; i < num_feature_; ++i) {
61 bin_mappers_.emplace_back(bin_mappers[i].release());
62 auto num_bin = bin_mappers_[i]->num_bin();
63 if (bin_mappers_[i]->GetDefaultBin() == 0) {
64 num_bin -= 1;
65 }
66 num_total_bin_ += num_bin;
67 bin_offsets_.emplace_back(num_total_bin_);
68 }
69 is_sparse_ = is_sparse;
70 if (is_sparse_) {
71 bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
72 } else {
73 bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
74 }
75 }
82 FeatureGroup(const void* memory, data_size_t num_all_data,
83 const std::vector<data_size_t>& local_used_indices) {
84 const char* memory_ptr = reinterpret_cast<const char*>(memory);
85 // get is_sparse
86 is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
87 memory_ptr += sizeof(is_sparse_);
88 num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
89 memory_ptr += sizeof(num_feature_);
90 // get bin mapper
91 bin_mappers_.clear();
92 bin_offsets_.clear();
93 // start from 1, due to need to store zero bin in this slot
94 num_total_bin_ = 1;
95 bin_offsets_.emplace_back(num_total_bin_);
96 for (int i = 0; i < num_feature_; ++i) {
97 bin_mappers_.emplace_back(new BinMapper(memory_ptr));
98 auto num_bin = bin_mappers_[i]->num_bin();
99 if (bin_mappers_[i]->GetDefaultBin() == 0) {
100 num_bin -= 1;
101 }
102 num_total_bin_ += num_bin;
103 bin_offsets_.emplace_back(num_total_bin_);
104 memory_ptr += bin_mappers_[i]->SizesInByte();
105 }
106 data_size_t num_data = num_all_data;
107 if (!local_used_indices.empty()) {
108 num_data = static_cast<data_size_t>(local_used_indices.size());
109 }
110 if (is_sparse_) {
111 bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
112 } else {
113 bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
114 }
115 // get bin data
116 bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
117 }
120 }
121
128 inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
129 uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
130 if (bin == bin_mappers_[sub_feature_idx]->GetDefaultBin()) { return; }
131 bin += bin_offsets_[sub_feature_idx];
132 if (bin_mappers_[sub_feature_idx]->GetDefaultBin() == 0) {
133 bin -= 1;
134 }
135 bin_data_->Push(tid, line_idx, bin);
136 }
137
138 inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
139 bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
140 }
141
142 inline BinIterator* SubFeatureIterator(int sub_feature) {
143 uint32_t min_bin = bin_offsets_[sub_feature];
144 uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
145 uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
146 return bin_data_->GetIterator(min_bin, max_bin, default_bin);
147 }
148
155 uint32_t min_bin = bin_offsets_[0];
156 uint32_t max_bin = bin_offsets_.back() - 1;
157 uint32_t default_bin = 0;
158 return bin_data_->GetIterator(min_bin, max_bin, default_bin);
159 }
160
161 inline data_size_t Split(
162 int sub_feature,
163 const uint32_t* threshold,
164 int num_threshold,
165 bool default_left,
166 data_size_t* data_indices, data_size_t num_data,
167 data_size_t* lte_indices, data_size_t* gt_indices) const {
168
169 uint32_t min_bin = bin_offsets_[sub_feature];
170 uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
171 uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
172 if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
173 auto missing_type = bin_mappers_[sub_feature]->missing_type();
174 return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
175 *threshold, data_indices, num_data, lte_indices, gt_indices);
176 } else {
177 return bin_data_->SplitCategorical(min_bin, max_bin, default_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
178 }
179 }
185 inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
186 return bin_mappers_[sub_feature_idx]->BinToValue(bin);
187 }
188
193 void SaveBinaryToFile(const VirtualFileWriter* writer) const {
194 writer->Write(&is_sparse_, sizeof(is_sparse_));
195 writer->Write(&num_feature_, sizeof(num_feature_));
196 for (int i = 0; i < num_feature_; ++i) {
197 bin_mappers_[i]->SaveBinaryToFile(writer);
198 }
199 bin_data_->SaveBinaryToFile(writer);
200 }
204 size_t SizesInByte() const {
205 size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
206 for (int i = 0; i < num_feature_; ++i) {
207 ret += bin_mappers_[i]->SizesInByte();
208 }
209 ret += bin_data_->SizesInByte();
210 return ret;
211 }
/*! \brief Disable copy */
215 FeatureGroup(const FeatureGroup&) = delete;
216
217private:
/*! \brief Number of features in this group */
219 int num_feature_;
/*! \brief Bin mapper of each feature in this group */
221 std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
/*! \brief Bin offsets as built by the constructors: entry i is the first group-level bin of feature i; the first entry is 1 and one extra trailing entry holds the total */
223 std::vector<uint32_t> bin_offsets_;
/*! \brief Bin data holding the values of all features in this group */
225 std::unique_ptr<Bin> bin_data_;
/*! \brief Whether the bin data uses sparse storage */
227 bool is_sparse_;
/*! \brief Total number of bins in this group, counting the reserved slot 0 */
228 int num_total_bin_;
229};
230
231
232} // namespace LightGBM
233
234#endif // LIGHTGBM_FEATURE_GROUP_H_
Iterator for one bin column.
Definition bin.h:267
This class is used to convert feature values into bins and to store some meta information for the bins.
Definition bin.h:61
static Bin * CreateDenseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for dense feature.
Definition bin.cpp:534
static Bin * CreateSparseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for sparse feature.
Definition bin.cpp:546
static Bin * CreateBin(data_size_t num_data, int num_bin, double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool *is_sparse)
Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "...
Definition bin.cpp:522
Definition dataset_loader.h:8
The main class of the data set, which is used for training or validation.
Definition dataset.h:278
Used to store data and to provide some operations on one feature group.
Definition feature_group.h:18
FeatureGroup & operator=(const FeatureGroup &)=delete
Disable copy.
FeatureGroup(int num_feature, std::vector< std::unique_ptr< BinMapper > > &bin_mappers, data_size_t num_data, double sparse_threshold, bool is_enable_sparse)
Constructor.
Definition feature_group.h:30
void SaveBinaryToFile(const VirtualFileWriter *writer) const
Save binary data to file.
Definition feature_group.h:193
FeatureGroup(const FeatureGroup &)=delete
Disable copy.
void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value)
Push one record, will auto convert to bin and push to bin data.
Definition feature_group.h:128
BinIterator * FeatureGroupIterator()
Returns a BinIterator that can access the entire feature group's raw data. The RawGet() function of t...
Definition feature_group.h:154
FeatureGroup(const void *memory, data_size_t num_all_data, const std::vector< data_size_t > &local_used_indices)
Constructor from memory.
Definition feature_group.h:82
~FeatureGroup()
Destructor.
Definition feature_group.h:119
size_t SizesInByte() const
Get sizes in byte of this object.
Definition feature_group.h:204
double BinToValue(int sub_feature_idx, uint32_t bin) const
From bin to feature value.
Definition feature_group.h:185
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
-artifacts
An interface for writing files from buffers.
Definition file_io.h:15
virtual size_t Write(const void *data, size_t bytes) const =0
Append buffer to file.