Medial Code Documentation
Loading...
Searching...
No Matches
dataset.h
1#ifndef LIGHTGBM_DATASET_H_
2#define LIGHTGBM_DATASET_H_
3
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/utils/openmp_wrapper.h>

#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/feature_group.h>

#include <algorithm>
#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
18
19namespace LightGBM {
20
22class DatasetLoader;
36class Metadata {
37public:
41 Metadata();
47 void Init(const char* data_filename, const char* initscore_file);
54 void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
59 void LoadFromMemory(const void* memory);
61 ~Metadata();
62
69 void Init(data_size_t num_data, int weight_idx, int query_idx);
70
75 void PartitionLabel(const std::vector<data_size_t>& used_indices);
76
82 void CheckOrPartition(data_size_t num_all_data,
83 const std::vector<data_size_t>& used_data_indices);
84
85 void SetLabel(const label_t* label, data_size_t len);
86
87 void SetWeights(const label_t* weights, data_size_t len);
88
89 void SetQuery(const data_size_t* query, data_size_t len);
90
95 void SetInitScore(const double* init_score, data_size_t len);
96
97
102 void SaveBinaryToFile(const VirtualFileWriter* writer) const;
103
107 size_t SizesInByte() const;
108
113 inline const label_t* label() const { return label_.data(); }
114
120 inline void SetLabelAt(data_size_t idx, label_t value) {
121 label_[idx] = value;
122 }
123
129 inline void SetWeightAt(data_size_t idx, label_t value) {
130 weights_[idx] = value;
131 }
132
138 inline void SetQueryAt(data_size_t idx, data_size_t value) {
139 queries_[idx] = static_cast<data_size_t>(value);
140 }
141
146 inline const label_t* weights() const {
147 if (!weights_.empty()) {
148 return weights_.data();
149 } else {
150 return nullptr;
151 }
152 }
153
161 inline const data_size_t* query_boundaries() const {
162 if (!query_boundaries_.empty()) {
163 return query_boundaries_.data();
164 } else {
165 return nullptr;
166 }
167 }
168
173 inline data_size_t num_queries() const { return num_queries_; }
174
179 inline const label_t* query_weights() const {
180 if (!query_weights_.empty()) {
181 return query_weights_.data();
182 } else {
183 return nullptr;
184 }
185 }
186
191 inline const double* init_score() const {
192 if (!init_score_.empty()) {
193 return init_score_.data();
194 } else {
195 return nullptr;
196 }
197 }
198
202 inline int64_t num_init_score() const { return num_init_score_; }
203
205 Metadata& operator=(const Metadata&) = delete;
207 Metadata(const Metadata&) = delete;
208
209private:
211 void LoadInitialScore(const char* initscore_file);
213 void LoadWeights();
215 void LoadQueryBoundaries();
217 void LoadQueryWeights();
219 std::string data_filename_;
221 data_size_t num_data_;
223 data_size_t num_weights_;
225 std::vector<label_t> label_;
227 std::vector<label_t> weights_;
229 std::vector<data_size_t> query_boundaries_;
231 std::vector<label_t> query_weights_;
233 data_size_t num_queries_;
235 int64_t num_init_score_;
237 std::vector<double> init_score_;
239 std::vector<data_size_t> queries_;
241 std::mutex mutex_;
242 bool weight_load_from_file_;
243 bool query_load_from_file_;
244 bool init_score_load_from_file_;
245};
246
247
/*! \brief Interface for Parser */
class Parser {
public:
  /*! \brief Virtual destructor, so implementations can be destroyed through a Parser pointer */
  virtual ~Parser() = default;

  /*!
   * \brief Parse one line with label
   * \param str Text of the line to parse
   * \param out_features Output: (int, double) pairs parsed from the line
   *        (NOTE(review): presumably column index and value -- confirm against implementations)
   * \param out_label Output: label parsed from the line
   */
  virtual void ParseOneLine(
      const char* str,
      std::vector<std::pair<int, double>>* out_features,
      double* out_label) const = 0;

  /*! \brief Total number of columns, as reported by the concrete parser */
  virtual int TotalColumns() const = 0;

  /*!
   * \brief Create a parser object; the format is chosen automatically depending on the file
   * \param filename Name of the file the parser is created for
   * \param header Whether the file has a header line
   * \param num_features Number of features
   * \param label_idx Index of the label column
   * \return Pointer to the created parser (NOTE(review): ownership not visible from this header)
   */
  static Parser* CreateParser(
      const char* filename,
      bool header,
      int num_features,
      int label_idx);
};
274
278class Dataset {
279public:
280 friend DatasetLoader;
281
282 LIGHTGBM_EXPORT Dataset();
283
284 LIGHTGBM_EXPORT Dataset(data_size_t num_data);
285
286 void Construct(
287 std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
288 int** sample_non_zero_indices,
289 const int* num_per_col,
290 size_t total_sample_cnt,
291 const Config& io_config);
292
294 LIGHTGBM_EXPORT ~Dataset();
295
296 LIGHTGBM_EXPORT bool CheckAlign(const Dataset& other) const {
297 if (num_features_ != other.num_features_) {
298 return false;
299 }
300 if (num_total_features_ != other.num_total_features_) {
301 return false;
302 }
303 if (label_idx_ != other.label_idx_) {
304 return false;
305 }
306 for (int i = 0; i < num_features_; ++i) {
307 if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
308 return false;
309 }
310 }
311 return true;
312 }
313
314 inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
315 if (is_finish_load_) { return; }
316 for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
317 int feature_idx = used_feature_map_[i];
318 if (feature_idx >= 0) {
319 const int group = feature2group_[feature_idx];
320 const int sub_feature = feature2subfeature_[feature_idx];
321 feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
322 }
323 }
324 }
325
326 inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
327 if (is_finish_load_) { return; }
328 for (auto& inner_data : feature_values) {
329 if (inner_data.first >= num_total_features_) { continue; }
330 int feature_idx = used_feature_map_[inner_data.first];
331 if (feature_idx >= 0) {
332 const int group = feature2group_[feature_idx];
333 const int sub_feature = feature2subfeature_[feature_idx];
334 feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
335 }
336 }
337 }
338
339 inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
340 feature_groups_[group]->PushData(tid, sub_feature, row_idx, value);
341 }
342
343 inline int RealFeatureIndex(int fidx) const {
344 return real_feature_idx_[fidx];
345 }
346
347 inline int InnerFeatureIndex(int col_idx) const {
348 return used_feature_map_[col_idx];
349 }
350 inline int Feature2Group(int feature_idx) const {
351 return feature2group_[feature_idx];
352 }
353 inline int Feture2SubFeature(int feature_idx) const {
354 return feature2subfeature_[feature_idx];
355 }
356 inline uint64_t GroupBinBoundary(int group_idx) const {
357 return group_bin_boundaries_[group_idx];
358 }
359 inline uint64_t NumTotalBin() const {
360 return group_bin_boundaries_.back();
361 }
362 inline std::vector<int> ValidFeatureIndices() const {
363 std::vector<int> ret;
364 for (int i = 0; i < num_total_features_; ++i) {
365 if (used_feature_map_[i] >= 0) {
366 ret.push_back(i);
367 }
368 }
369 return ret;
370 }
371 void ReSize(data_size_t num_data);
372
373 void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
374
375 LIGHTGBM_EXPORT void FinishLoad();
376
377 LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
378
379 LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
380
381 LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
382
383 LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
384
385 LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
386
387 LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
388
392 LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
393
394 LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
395
396 LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
397
398 void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
399 const data_size_t* data_indices, data_size_t num_data,
400 int leaf_idx,
401 std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
402 const score_t* gradients, const score_t* hessians,
403 score_t* ordered_gradients, score_t* ordered_hessians,
404 bool is_constant_hessian,
405 HistogramBinEntry* histogram_data) const;
406
407 void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
408 HistogramBinEntry* data) const;
409
410 inline data_size_t Split(int feature,
411 const uint32_t* threshold, int num_threshold, bool default_left,
412 data_size_t* data_indices, data_size_t num_data,
413 data_size_t* lte_indices, data_size_t* gt_indices) const {
414 const int group = feature2group_[feature];
415 const int sub_feature = feature2subfeature_[feature];
416 return feature_groups_[group]->Split(sub_feature, threshold, num_threshold, default_left, data_indices, num_data, lte_indices, gt_indices);
417 }
418
419 inline int SubFeatureBinOffset(int i) const {
420 const int sub_feature = feature2subfeature_[i];
421 if (sub_feature == 0) {
422 return 1;
423 } else {
424 return 0;
425 }
426 }
427
428 inline int FeatureNumBin(int i) const {
429 const int group = feature2group_[i];
430 const int sub_feature = feature2subfeature_[i];
431 return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
432 }
433
434 inline int8_t FeatureMonotone(int i) const {
435 if (monotone_types_.empty()) {
436 return 0;
437 } else {
438 return monotone_types_[i];
439 }
440 }
441
442 inline double FeaturePenalte(int i) const {
443 if (feature_penalty_.empty()) {
444 return 1;
445 } else {
446 return feature_penalty_[i];
447 }
448 }
449
450 bool HasMonotone() const {
451 if (monotone_types_.empty()) {
452 return false;
453 } else {
454 for (size_t i = 0; i < monotone_types_.size(); ++i) {
455 if (monotone_types_[i] != 0) {
456 return true;
457 }
458 }
459 return false;
460 }
461 }
462
463 inline int FeatureGroupNumBin(int group) const {
464 return feature_groups_[group]->num_total_bin_;
465 }
466
467 inline const BinMapper* FeatureBinMapper(int i) const {
468 const int group = feature2group_[i];
469 const int sub_feature = feature2subfeature_[i];
470 return feature_groups_[group]->bin_mappers_[sub_feature].get();
471 }
472
473 inline const Bin* FeatureBin(int i) const {
474 const int group = feature2group_[i];
475 return feature_groups_[group]->bin_data_.get();
476 }
477
478 inline const Bin* FeatureGroupBin(int group) const {
479 return feature_groups_[group]->bin_data_.get();
480 }
481
482 inline bool FeatureGroupIsSparse(int group) const {
483 return feature_groups_[group]->is_sparse_;
484 }
485
486 inline BinIterator* FeatureIterator(int i) const {
487 const int group = feature2group_[i];
488 const int sub_feature = feature2subfeature_[i];
489 return feature_groups_[group]->SubFeatureIterator(sub_feature);
490 }
491
492 inline BinIterator* FeatureGroupIterator(int group) const {
493 return feature_groups_[group]->FeatureGroupIterator();
494 }
495
496 inline double RealThreshold(int i, uint32_t threshold) const {
497 const int group = feature2group_[i];
498 const int sub_feature = feature2subfeature_[i];
499 return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
500 }
501
502 // given a real threshold, find the closest threshold bin
503 inline uint32_t BinThreshold(int i, double threshold_double) const {
504 const int group = feature2group_[i];
505 const int sub_feature = feature2subfeature_[i];
506 return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
507 }
508
509 inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
510 ordered_bins->resize(num_groups_);
511 OMP_INIT_EX();
512 #pragma omp parallel for schedule(guided)
513 for (int i = 0; i < num_groups_; ++i) {
514 OMP_LOOP_EX_BEGIN();
515 ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
516 OMP_LOOP_EX_END();
517 }
518 OMP_THROW_EX();
519 }
520
525 inline const Metadata& metadata() const { return metadata_; }
526
528 inline int num_features() const { return num_features_; }
529
531 inline int num_feature_groups() const { return num_groups_;}
532
534 inline int num_total_features() const { return num_total_features_; }
535
537 inline int label_idx() const { return label_idx_; }
538
540 inline const std::vector<std::string>& feature_names() const { return feature_names_; }
541
542 inline void set_feature_names(const std::vector<std::string>& feature_names) {
543 if (feature_names.size() != static_cast<size_t>(num_total_features_)) {
544 Log::Fatal("Size of feature_names error, should equal with total number of features");
545 }
546 feature_names_ = std::vector<std::string>(feature_names);
547 // replace ' ' in feature_names with '_'
548 bool spaceInFeatureName = false;
549 for (auto& feature_name : feature_names_) {
550 if (feature_name.find(' ') != std::string::npos) {
551 spaceInFeatureName = true;
552 std::replace(feature_name.begin(), feature_name.end(), ' ', '_');
553 }
554 }
555 if (spaceInFeatureName) {
556 Log::Warning("Find whitespaces in feature_names, replace with underlines");
557 }
558 }
559
560 inline std::vector<std::string> feature_infos() const {
561 std::vector<std::string> bufs;
562 for (int i = 0; i < num_total_features_; i++) {
563 int fidx = used_feature_map_[i];
564 if (fidx == -1) {
565 bufs.push_back("none");
566 } else {
567 const auto bin_mapper = FeatureBinMapper(fidx);
568 bufs.push_back(bin_mapper->bin_info());
569 }
570 }
571 return bufs;
572 }
573
574 void ResetConfig(const char* parameters);
575
577 inline data_size_t num_data() const { return num_data_; }
578
580 Dataset& operator=(const Dataset&) = delete;
582 Dataset(const Dataset&) = delete;
583
584private:
585 std::string data_filename_;
587 std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
589 std::vector<int> used_feature_map_;
591 int num_features_;
593 int num_total_features_;
595 data_size_t num_data_;
597 Metadata metadata_;
599 int label_idx_ = 0;
601 double sparse_threshold_;
603 std::vector<std::string> feature_names_;
605 static const char* binary_file_token;
606 int num_groups_;
607 std::vector<int> real_feature_idx_;
608 std::vector<int> feature2group_;
609 std::vector<int> feature2subfeature_;
610 std::vector<uint64_t> group_bin_boundaries_;
611 std::vector<int> group_feature_start_;
612 std::vector<int> group_feature_cnt_;
613 std::vector<int8_t> monotone_types_;
614 std::vector<double> feature_penalty_;
615 bool is_finish_load_;
616 int max_bin_;
617 int bin_construct_sample_cnt_;
618 int min_data_in_bin_;
619 bool use_missing_;
620 bool zero_as_missing_;
621};
622
623} // namespace LightGBM
624
625#endif // LIGHTGBM_DATASET_H_
Iterator for one bin column.
Definition bin.h:267
This class is used to convert feature values into bins and to store some meta information for each bin.
Definition bin.h:61
Interface for bin data. This class will store bin data for one feature. Unlike OrderedBin,...
Definition bin.h:286
Definition dataset_loader.h:8
The main class of the data set, which is used for training or validation.
Definition dataset.h:278
data_size_t num_data() const
Get Number of data.
Definition dataset.h:577
LIGHTGBM_EXPORT ~Dataset()
Destructor.
Definition dataset.cpp:35
const Metadata & metadata() const
Get a const reference to the meta data.
Definition dataset.h:525
int num_total_features() const
Get Number of total features.
Definition dataset.h:534
const std::vector< std::string > & feature_names() const
Get names of current data set.
Definition dataset.h:540
Dataset & operator=(const Dataset &)=delete
Disable copy.
LIGHTGBM_EXPORT void SaveBinaryFile(const char *bin_filename)
Save current dataset into binary file, will save to "filename.bin".
Definition dataset.cpp:598
int num_features() const
Get Number of used features.
Definition dataset.h:528
Dataset(const Dataset &)=delete
Disable copy.
int num_feature_groups() const
Get Number of feature groups.
Definition dataset.h:531
int label_idx() const
Get the index of label column.
Definition dataset.h:537
This class is used to store some meta(non-feature) data for training data, e.g. labels,...
Definition dataset.h:36
const label_t * label() const
Get pointer of label.
Definition dataset.h:113
void Init(const char *data_filename, const char *initscore_file)
Initialization will load query-level information, since it is needed for sampling data.
Definition metadata.cpp:20
void LoadFromMemory(const void *memory)
Initial with binary memory.
Definition metadata.cpp:475
void CheckOrPartition(data_size_t num_all_data, const std::vector< data_size_t > &used_data_indices)
Partition meta data according to local used indices if need.
Definition metadata.cpp:144
const label_t * query_weights() const
Get weights for queries; returns nullptr if they do not exist.
Definition dataset.h:179
void SetInitScore(const double *init_score, data_size_t len)
Set initial scores.
Definition metadata.cpp:283
Metadata & operator=(const Metadata &)=delete
Disable copy.
void PartitionLabel(const std::vector< data_size_t > &used_indices)
Partition label by used indices.
Definition metadata.cpp:130
void SetWeightAt(data_size_t idx, label_t value)
Set Weight for one record.
Definition dataset.h:129
void SetLabelAt(data_size_t idx, label_t value)
Set label for one record.
Definition dataset.h:120
const data_size_t * query_boundaries() const
Get data boundaries on queries; returns nullptr if they do not exist. We assume data will be ordered by query...
Definition dataset.h:161
Metadata(const Metadata &)=delete
Disable copy.
size_t SizesInByte() const
Get sizes in byte of this object.
Definition metadata.cpp:520
const double * init_score() const
Get initial scores; returns nullptr if they do not exist.
Definition dataset.h:191
void SaveBinaryToFile(const VirtualFileWriter *writer) const
Save binary data to file.
Definition metadata.cpp:507
int64_t num_init_score() const
Get size of initial scores.
Definition dataset.h:202
Metadata()
Null constructor.
Definition metadata.cpp:10
const label_t * weights() const
Get weights; returns nullptr if they do not exist.
Definition dataset.h:146
void SetQueryAt(data_size_t idx, data_size_t value)
Set Query Id for one record.
Definition dataset.h:138
~Metadata()
Destructor.
Definition metadata.cpp:29
data_size_t num_queries() const
Get Number of queries.
Definition dataset.h:173
Interface for Parser.
Definition dataset.h:249
static Parser * CreateParser(const char *filename, bool header, int num_features, int label_idx)
Create a parser object; the format is chosen automatically depending on the file.
Definition parser.cpp:87
virtual ~Parser()
virtual destructor
Definition dataset.h:252
virtual void ParseOneLine(const char *str, std::vector< std::pair< int, double > > *out_features, double *out_label) const =0
Parse one line with label.
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
float score_t
Type of score, and gradients.
Definition meta.h:26
float label_t
Type of metadata, including weight and label.
Definition meta.h:33
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14
Definition config.h:27
Store data for one histogram bin.
Definition bin.h:29
An interface for writing files from buffers.
Definition file_io.h:15