1#ifndef LIGHTGBM_FEATURE_GROUP_H_
2#define LIGHTGBM_FEATURE_GROUP_H_
4#include <LightGBM/utils/random.h>
6#include <LightGBM/meta.h>
7#include <LightGBM/bin.h>
31 std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
32 data_size_t num_data,
double sparse_threshold,
bool is_enable_sparse) : num_feature_(num_feature) {
33 CHECK(
static_cast<int>(bin_mappers.size()) == num_feature);
36 bin_offsets_.emplace_back(num_total_bin_);
38 for (
int i = 0; i < num_feature_; ++i) {
39 bin_mappers_.emplace_back(bin_mappers[i].
release());
40 auto num_bin = bin_mappers_[i]->num_bin();
41 if (bin_mappers_[i]->GetDefaultBin() == 0) {
44 num_total_bin_ += num_bin;
45 bin_offsets_.emplace_back(num_total_bin_);
46 cnt_non_zero +=
static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
48 double sparse_rate = 1.0f -
static_cast<double>(cnt_non_zero) / (num_data);
50 sparse_rate, is_enable_sparse, sparse_threshold, &is_sparse_));
54 std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
55 data_size_t num_data,
bool is_sparse) : num_feature_(num_feature) {
56 CHECK(
static_cast<int>(bin_mappers.size()) == num_feature);
59 bin_offsets_.emplace_back(num_total_bin_);
60 for (
int i = 0; i < num_feature_; ++i) {
61 bin_mappers_.emplace_back(bin_mappers[i].
release());
62 auto num_bin = bin_mappers_[i]->num_bin();
63 if (bin_mappers_[i]->GetDefaultBin() == 0) {
66 num_total_bin_ += num_bin;
67 bin_offsets_.emplace_back(num_total_bin_);
69 is_sparse_ = is_sparse;
83 const std::vector<data_size_t>& local_used_indices) {
84 const char* memory_ptr =
reinterpret_cast<const char*
>(memory);
86 is_sparse_ = *(
reinterpret_cast<const bool*
>(memory_ptr));
87 memory_ptr +=
sizeof(is_sparse_);
88 num_feature_ = *(
reinterpret_cast<const int*
>(memory_ptr));
89 memory_ptr +=
sizeof(num_feature_);
95 bin_offsets_.emplace_back(num_total_bin_);
96 for (
int i = 0; i < num_feature_; ++i) {
97 bin_mappers_.emplace_back(
new BinMapper(memory_ptr));
98 auto num_bin = bin_mappers_[i]->num_bin();
99 if (bin_mappers_[i]->GetDefaultBin() == 0) {
102 num_total_bin_ += num_bin;
103 bin_offsets_.emplace_back(num_total_bin_);
104 memory_ptr += bin_mappers_[i]->SizesInByte();
107 if (!local_used_indices.empty()) {
108 num_data =
static_cast<data_size_t>(local_used_indices.size());
116 bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
129 uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
130 if (bin == bin_mappers_[sub_feature_idx]->GetDefaultBin()) {
return; }
131 bin += bin_offsets_[sub_feature_idx];
132 if (bin_mappers_[sub_feature_idx]->GetDefaultBin() == 0) {
135 bin_data_->Push(tid, line_idx, bin);
139 bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
142 inline BinIterator* SubFeatureIterator(
int sub_feature) {
143 uint32_t min_bin = bin_offsets_[sub_feature];
144 uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
145 uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
146 return bin_data_->GetIterator(min_bin, max_bin, default_bin);
155 uint32_t min_bin = bin_offsets_[0];
156 uint32_t max_bin = bin_offsets_.back() - 1;
157 uint32_t default_bin = 0;
158 return bin_data_->GetIterator(min_bin, max_bin, default_bin);
163 const uint32_t* threshold,
169 uint32_t min_bin = bin_offsets_[sub_feature];
170 uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
171 uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
172 if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
173 auto missing_type = bin_mappers_[sub_feature]->missing_type();
174 return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
175 *threshold, data_indices, num_data, lte_indices, gt_indices);
177 return bin_data_->SplitCategorical(min_bin, max_bin, default_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
185 inline double BinToValue(
int sub_feature_idx, uint32_t bin)
const {
186 return bin_mappers_[sub_feature_idx]->BinToValue(bin);
194 writer->
Write(&is_sparse_,
sizeof(is_sparse_));
195 writer->
Write(&num_feature_,
sizeof(num_feature_));
196 for (
int i = 0; i < num_feature_; ++i) {
197 bin_mappers_[i]->SaveBinaryToFile(writer);
199 bin_data_->SaveBinaryToFile(writer);
205 size_t ret =
sizeof(is_sparse_) +
sizeof(num_feature_);
206 for (
int i = 0; i < num_feature_; ++i) {
207 ret += bin_mappers_[i]->SizesInByte();
209 ret += bin_data_->SizesInByte();
221 std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
223 std::vector<uint32_t> bin_offsets_;
225 std::unique_ptr<Bin> bin_data_;
Iterator for one bin column.
Definition bin.h:267
This class is used to convert feature values into bins and to store some meta information for the bins.
Definition bin.h:61
static Bin * CreateDenseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for dense feature.
Definition bin.cpp:534
static Bin * CreateSparseBin(data_size_t num_data, int num_bin)
Create object for bin data of one feature, used for sparse feature.
Definition bin.cpp:546
static Bin * CreateBin(data_size_t num_data, int num_bin, double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool *is_sparse)
Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "...
Definition bin.cpp:522
Definition dataset_loader.h:8
The main class of the data set, which is used for training or validation.
Definition dataset.h:278
Used to store data and provide some operations on one feature group.
Definition feature_group.h:18
FeatureGroup & operator=(const FeatureGroup &)=delete
Disable copy.
FeatureGroup(int num_feature, std::vector< std::unique_ptr< BinMapper > > &bin_mappers, data_size_t num_data, double sparse_threshold, bool is_enable_sparse)
Constructor.
Definition feature_group.h:30
void SaveBinaryToFile(const VirtualFileWriter *writer) const
Save binary data to file.
Definition feature_group.h:193
FeatureGroup(const FeatureGroup &)=delete
Disable copy.
void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value)
Push one record; the value is automatically converted to a bin and pushed to the bin data.
Definition feature_group.h:128
BinIterator * FeatureGroupIterator()
Returns a BinIterator that can access the entire feature group's raw data. The RawGet() function of t...
Definition feature_group.h:154
FeatureGroup(const void *memory, data_size_t num_all_data, const std::vector< data_size_t > &local_used_indices)
Constructor from memory.
Definition feature_group.h:82
~FeatureGroup()
Destructor.
Definition feature_group.h:119
size_t SizesInByte() const
Get the size in bytes of this object.
Definition feature_group.h:204
double BinToValue(int sub_feature_idx, uint32_t bin) const
From bin to feature value.
Definition feature_group.h:185
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
int32_t data_size_t
Type of data size; it is better to use a signed type.
Definition meta.h:14
An interface for writing files from buffers.
Definition file_io.h:15
virtual size_t Write(const void *data, size_t bytes) const =0
Append buffer to file.