287 std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
288 int** sample_non_zero_indices,
289 const int* num_per_col,
290 size_t total_sample_cnt,
// Checks whether this dataset's feature layout matches `other`.
// Visible comparisons: num_features_, num_total_features_, label_idx_, and a
// per-feature BinMapper::CheckAlign over all num_features_ features.
// NOTE(review): the branch bodies and the final return are not visible in this
// excerpt; presumably every mismatch yields false -- confirm.
296 LIGHTGBM_EXPORT
bool CheckAlign(
const Dataset& other)
const {
297 if (num_features_ != other.num_features_) {
300 if (num_total_features_ != other.num_total_features_) {
303 if (label_idx_ != other.label_idx_) {
306 for (
int i = 0; i < num_features_; ++i) {
307 if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
// Pushes one dense row into the feature groups.
//   tid, row_idx   : forwarded unchanged to FeatureGroup::PushData.
//   feature_values : values addressed by column index i.
// Returns immediately, pushing nothing, when is_finish_load_ is set.
314 inline void PushOneRow(
int tid,
data_size_t row_idx,
const std::vector<double>& feature_values) {
315 if (is_finish_load_) {
return; }
// Columns at index >= num_total_features_ are never visited.
316 for (
size_t i = 0; i < feature_values.size() && i <
static_cast<size_t>(num_total_features_); ++i) {
317 int feature_idx = used_feature_map_[i];
// Only columns whose mapped inner index is non-negative are pushed.
318 if (feature_idx >= 0) {
319 const int group = feature2group_[feature_idx];
320 const int sub_feature = feature2subfeature_[feature_idx];
321 feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
// Pushes one sparse row, given as (column index, value) pairs, into the
// feature groups.
//   tid, row_idx   : forwarded unchanged to FeatureGroup::PushData.
//   feature_values : pairs whose .first is the column index and .second the value.
// Returns immediately, pushing nothing, when is_finish_load_ is set.
326 inline void PushOneRow(
int tid,
data_size_t row_idx,
const std::vector<std::pair<int, double>>& feature_values) {
327 if (is_finish_load_) {
return; }
328 for (
auto& inner_data : feature_values) {
// Pairs whose column index is >= num_total_features_ are skipped.
// NOTE(review): there is no lower-bound check, so a negative column index
// would index used_feature_map_ out of range -- confirm that callers
// guarantee non-negative indices.
329 if (inner_data.first >= num_total_features_) {
continue; }
330 int feature_idx = used_feature_map_[inner_data.first];
// Only columns whose mapped inner index is non-negative are pushed.
331 if (feature_idx >= 0) {
332 const int group = feature2group_[feature_idx];
333 const int sub_feature = feature2subfeature_[feature_idx];
334 feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
// Pushes a single value straight into feature_groups_[group], with the group
// and sub-feature already resolved by the caller.
// Unlike the PushOneRow overloads, the visible lines do not consult
// is_finish_load_ and do no index mapping or range checking.
339 inline void PushOneData(
int tid,
data_size_t row_idx,
int group,
int sub_feature,
double value) {
340 feature_groups_[group]->PushData(tid, sub_feature, row_idx, value);
// Returns real_feature_idx_[fidx]; the index is not range-checked.
// NOTE(review): presumably maps an inner feature index back to its original
// column index -- confirm.
343 inline int RealFeatureIndex(
int fidx)
const {
344 return real_feature_idx_[fidx];
// Returns used_feature_map_[col_idx]; the index is not range-checked.
// Other code in this excerpt treats a negative result as "do not use this
// column" and a non-negative result as the inner feature index.
347 inline int InnerFeatureIndex(
int col_idx)
const {
348 return used_feature_map_[col_idx];
// Returns the group index stored for inner feature `feature_idx`
// (feature2group_[feature_idx]); the index is not range-checked.
350 inline int Feature2Group(
int feature_idx)
const {
351 return feature2group_[feature_idx];
// Returns the sub-feature index stored for inner feature `feature_idx`
// (feature2subfeature_[feature_idx]); the index is not range-checked.
// NOTE(review): the name is spelled "Feture"; it is left as-is because
// renaming a public member would break existing callers.
353 inline int Feture2SubFeature(
int feature_idx)
const {
354 return feature2subfeature_[feature_idx];
// Returns group_bin_boundaries_[group_idx]; the index is not range-checked.
356 inline uint64_t GroupBinBoundary(
int group_idx)
const {
357 return group_bin_boundaries_[group_idx];
// Returns the last entry of group_bin_boundaries_.
// NOTE(review): back() on an empty vector is undefined behavior; presumably
// the vector is always populated before this is called -- confirm.
359 inline uint64_t NumTotalBin()
const {
360 return group_bin_boundaries_.back();
// Scans column indices 0 .. num_total_features_ - 1 and tests whether
// used_feature_map_[i] is non-negative, building a std::vector<int>.
// NOTE(review): the loop body and return statement are not visible in this
// excerpt; presumably the matching column indices are appended to `ret` and
// `ret` is returned -- confirm.
362 inline std::vector<int> ValidFeatureIndices()
const {
363 std::vector<int> ret;
364 for (
int i = 0; i < num_total_features_; ++i) {
365 if (used_feature_map_[i] >= 0) {
// Declaration only; the definition is not in this excerpt.
// NOTE(review): presumably this is what sets is_finish_load_, after which the
// PushOneRow overloads become no-ops -- confirm against the definition.
375 LIGHTGBM_EXPORT
void FinishLoad();
// Declaration only. Takes a field name, a pointer to float data and an
// element count.
// NOTE(review): the meaning of the bool result is defined elsewhere;
// presumably true when the field name is recognized -- confirm.
377 LIGHTGBM_EXPORT
bool SetFloatField(
const char* field_name,
const float* field_data,
data_size_t num_element);
// Declaration only. Takes a field name, a pointer to double data and an
// element count.
// NOTE(review): the meaning of the bool result is defined elsewhere;
// presumably true when the field name is recognized -- confirm.
379 LIGHTGBM_EXPORT
bool SetDoubleField(
const char* field_name,
const double* field_data,
data_size_t num_element);
// Declaration only. Takes a field name, a pointer to int data and an
// element count.
// NOTE(review): the meaning of the bool result is defined elsewhere;
// presumably true when the field name is recognized -- confirm.
381 LIGHTGBM_EXPORT
bool SetIntField(
const char* field_name,
const int* field_data,
data_size_t num_element);
// Declaration only. Looks up a float field by name; results are delivered
// through the out-parameters `out_len` and `out_ptr`.
// NOTE(review): the lifetime/ownership of *out_ptr and the meaning of the bool
// result are defined elsewhere -- confirm before holding on to the pointer.
383 LIGHTGBM_EXPORT
bool GetFloatField(
const char* field_name,
data_size_t* out_len,
const float** out_ptr);
// Declaration only. Looks up a double field by name; results are delivered
// through the out-parameters `out_len` and `out_ptr`.
// NOTE(review): the lifetime/ownership of *out_ptr and the meaning of the bool
// result are defined elsewhere -- confirm before holding on to the pointer.
385 LIGHTGBM_EXPORT
bool GetDoubleField(
const char* field_name,
data_size_t* out_len,
const double** out_ptr);
// Declaration only. Looks up an int field by name; results are delivered
// through the out-parameters `out_len` and `out_ptr`.
// NOTE(review): the lifetime/ownership of *out_ptr and the meaning of the bool
// result are defined elsewhere -- confirm before holding on to the pointer.
387 LIGHTGBM_EXPORT
bool GetIntField(
const char* field_name,
data_size_t* out_len,
const int** out_ptr);
// Declaration only; takes another dataset by const pointer.
// NOTE(review): going by the name, this copies the feature/bin mapping from
// `dataset` into this object -- confirm against the definition.
394 LIGHTGBM_EXPORT
void CopyFeatureMapperFrom(
const Dataset* dataset);
// Declaration only; takes another dataset by const pointer.
// NOTE(review): going by the name, this sets this object up as a validation
// dataset derived from `dataset` -- confirm against the definition.
396 LIGHTGBM_EXPORT
void CreateValid(
const Dataset* dataset);
// Declaration of ConstructHistograms.
// NOTE(review): only part of the parameter list is visible in this excerpt
// (is_feature_used, ordered_bins, is_constant_hessian); the remaining
// parameters and the closing of the declaration are not shown.
398 void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
401 std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
404 bool is_constant_hessian,
// Declaration of FixHistogram.
// NOTE(review): the parameter list is cut off after num_data in this excerpt;
// any remaining parameters are not shown.
407 void FixHistogram(
int feature_idx,
double sum_gradient,
double sum_hessian,
data_size_t num_data,
// Remainder of a split routine whose signature line is not in this excerpt.
// The visible body resolves `feature` to its (group, sub_feature) pair and
// returns the result of FeatureGroup::Split, passing the thresholds, the
// default direction and the index buffers through unchanged.
411 const uint32_t* threshold,
int num_threshold,
bool default_left,
414 const int group = feature2group_[feature];
415 const int sub_feature = feature2subfeature_[feature];
416 return feature_groups_[group]->Split(sub_feature, threshold, num_threshold, default_left, data_indices,
num_data, lte_indices, gt_indices);
// Computes a bin offset for inner feature `i` based on its sub-feature index.
// NOTE(review): only the lookup and the `sub_feature == 0` test are visible;
// the values returned on either branch are not shown in this excerpt.
419 inline int SubFeatureBinOffset(
int i)
const {
420 const int sub_feature = feature2subfeature_[i];
421 if (sub_feature == 0) {
// Returns num_bin() of the bin mapper that belongs to inner feature `i`,
// located via feature2group_ / feature2subfeature_. No range checking.
428 inline int FeatureNumBin(
int i)
const {
429 const int group = feature2group_[i];
430 const int sub_feature = feature2subfeature_[i];
431 return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
// Returns monotone_types_[i] when monotone_types_ is populated.
// NOTE(review): the branch taken when monotone_types_ is empty is not visible
// in this excerpt; presumably it returns a neutral default -- confirm.
434 inline int8_t FeatureMonotone(
int i)
const {
435 if (monotone_types_.empty()) {
438 return monotone_types_[i];
// Returns feature_penalty_[i] when feature_penalty_ is populated.
// NOTE(review): the branch taken when feature_penalty_ is empty is not visible
// in this excerpt; presumably it returns a neutral default -- confirm.
// NOTE(review): the name is spelled "Penalte"; it is left as-is because
// renaming a public member would break existing callers.
442 inline double FeaturePenalte(
int i)
const {
443 if (feature_penalty_.empty()) {
446 return feature_penalty_[i];
// Tests monotone_types_: first whether it is empty, then each entry for a
// non-zero value.
// NOTE(review): the branch bodies and final return are not visible in this
// excerpt; presumably the result is true iff some entry is non-zero -- confirm.
450 bool HasMonotone()
const {
451 if (monotone_types_.empty()) {
454 for (
size_t i = 0; i < monotone_types_.size(); ++i) {
455 if (monotone_types_[i] != 0) {
// Returns num_total_bin_ of feature group `group`; the index is not
// range-checked.
463 inline int FeatureGroupNumBin(
int group)
const {
464 return feature_groups_[group]->num_total_bin_;
// Returns a pointer to the BinMapper of inner feature `i`, located via
// feature2group_ / feature2subfeature_. No range checking.
// The pointer comes from .get(), so the caller must not delete it
// (presumably bin_mappers_ holds smart pointers -- confirm).
467 inline const BinMapper* FeatureBinMapper(
int i)
const {
468 const int group = feature2group_[i];
469 const int sub_feature = feature2subfeature_[i];
470 return feature_groups_[group]->bin_mappers_[sub_feature].get();
// Returns the bin_data_ pointer of the group that contains inner feature `i`.
// This is the whole group's Bin object, i.e. the same pointer
// FeatureGroupBin(feature2group_[i]) would return.
473 inline const Bin* FeatureBin(
int i)
const {
474 const int group = feature2group_[i];
475 return feature_groups_[group]->bin_data_.get();
// Returns the bin_data_ pointer of feature group `group`; the index is not
// range-checked. The pointer comes from .get(), so the caller must not
// delete it.
478 inline const Bin* FeatureGroupBin(
int group)
const {
479 return feature_groups_[group]->bin_data_.get();
// Returns the is_sparse_ flag of feature group `group`; the index is not
// range-checked.
482 inline bool FeatureGroupIsSparse(
int group)
const {
483 return feature_groups_[group]->is_sparse_;
// Body of an accessor whose signature line is not in this excerpt: resolves
// inner feature `i` to (group, sub_feature) and returns that group's
// SubFeatureIterator(sub_feature).
// NOTE(review): ownership of the returned iterator is not visible here --
// confirm whether the caller must release it.
487 const int group = feature2group_[i];
488 const int sub_feature = feature2subfeature_[i];
489 return feature_groups_[group]->SubFeatureIterator(sub_feature);
// Returns the result of FeatureGroupIterator() on feature group `group`; the
// index is not range-checked.
// NOTE(review): ownership of the returned BinIterator* is not visible here --
// confirm whether the caller must release it.
492 inline BinIterator* FeatureGroupIterator(
int group)
const {
493 return feature_groups_[group]->FeatureGroupIterator();
// Converts bin index `threshold` of inner feature `i` to a double by calling
// BinToValue on that feature's bin mapper. No range checking.
496 inline double RealThreshold(
int i, uint32_t threshold)
const {
497 const int group = feature2group_[i];
498 const int sub_feature = feature2subfeature_[i];
499 return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
// Converts `threshold_double` to a bin index for inner feature `i` by calling
// ValueToBin on that feature's bin mapper. No range checking.
503 inline uint32_t BinThreshold(
int i,
double threshold_double)
const {
504 const int group = feature2group_[i];
505 const int sub_feature = feature2subfeature_[i];
506 return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
// Fills *ordered_bins with one entry per feature group.
// The vector is resized to num_groups_, then slot i takes ownership (via
// unique_ptr::reset) of whatever feature_groups_[i]->bin_data_->CreateOrderedBin()
// returns.
// NOTE(review): some lines around the loop are missing from this excerpt
// (possibly exception-handling helpers for the OpenMP region) -- confirm.
509 inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins)
const {
510 ordered_bins->resize(num_groups_);
// In the visible code each iteration writes only its own slot i.
512 #pragma omp parallel for schedule(guided)
513 for (
int i = 0; i < num_groups_; ++i) {
515 ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
// Returns a const reference to the stored feature_names_ vector (no copy).
540 inline const std::vector<std::string>&
feature_names()
const {
return feature_names_; }
// Installs new feature names after validating their count.
// Calls Log::Fatal when feature_names.size() != num_total_features_.
// Afterwards every space character in the stored names is replaced with '_',
// and a single warning is logged if any replacement happened.
// NOTE(review): the loop iterates the member feature_names_, not the
// parameter; the statement copying the parameter into the member is not
// visible in this excerpt -- confirm it sits between the size check and the loop.
542 inline void set_feature_names(
const std::vector<std::string>&
feature_names) {
543 if (
feature_names.size() !=
static_cast<size_t>(num_total_features_)) {
544 Log::Fatal(
"Size of feature_names error, should equal with total number of features");
548 bool spaceInFeatureName =
false;
549 for (
auto& feature_name : feature_names_) {
// Only the space character ' ' is matched here; tabs and other whitespace
// are left alone even though the warning text says "whitespaces".
550 if (feature_name.find(
' ') != std::string::npos) {
551 spaceInFeatureName =
true;
552 std::replace(feature_name.begin(), feature_name.end(),
' ',
'_');
555 if (spaceInFeatureName) {
556 Log::Warning(
"Find whitespaces in feature_names, replace with underlines");
// Builds one descriptive string per column index 0 .. num_total_features_ - 1:
// either the literal "none" or the bin_info() of the column's bin mapper.
// NOTE(review): the condition choosing between the two pushes and the return
// statement are not visible in this excerpt; presumably "none" is used when
// the mapped inner index fidx is negative -- confirm.
560 inline std::vector<std::string> feature_infos()
const {
561 std::vector<std::string> bufs;
562 for (
int i = 0; i < num_total_features_; i++) {
563 int fidx = used_feature_map_[i];
565 bufs.push_back(
"none");
567 const auto bin_mapper = FeatureBinMapper(fidx);
568 bufs.push_back(bin_mapper->bin_info());
// Declaration only; takes a C string of parameters.
// NOTE(review): the parameter string format and which settings are affected
// are defined elsewhere -- confirm against the definition.
574 void ResetConfig(
const char* parameters);
// Data members. Descriptions below are limited to what the accessors in this
// excerpt show; members without a note are not used in the visible code.
585 std::string data_filename_;
// Owned feature groups, indexed by group index.
587 std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
// Indexed by column index. A non-negative entry is the inner feature index;
// negative entries are skipped by PushOneRow.
589 std::vector<int> used_feature_map_;
// Upper bound for column indices; also the required size of feature_names_.
593 int num_total_features_;
601 double sparse_threshold_;
// One name per column; set_feature_names replaces spaces with underscores.
603 std::vector<std::string> feature_names_;
605 static const char* binary_file_token;
// Read by RealFeatureIndex().
607 std::vector<int> real_feature_idx_;
// Inner feature index -> group index.
608 std::vector<int> feature2group_;
// Inner feature index -> sub-feature index within its group.
609 std::vector<int> feature2subfeature_;
// Indexed by group; the last entry is what NumTotalBin() returns.
610 std::vector<uint64_t> group_bin_boundaries_;
611 std::vector<int> group_feature_start_;
612 std::vector<int> group_feature_cnt_;
// May be empty; FeatureMonotone() and HasMonotone() check for that first.
613 std::vector<int8_t> monotone_types_;
// May be empty; FeaturePenalte() checks for that first.
614 std::vector<double> feature_penalty_;
// When true, the PushOneRow overloads return without pushing anything.
615 bool is_finish_load_;
617 int bin_construct_sample_cnt_;
618 int min_data_in_bin_;
620 bool zero_as_missing_;