Medial Code Documentation
Loading...
Searching...
No Matches
data_partition.hpp
1#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
2#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
3
4#include <LightGBM/meta.h>
5#include <LightGBM/dataset.h>
6
7#include <LightGBM/utils/openmp_wrapper.h>
8
9#include <cstring>
10
11#include <vector>
12
13namespace LightGBM {
18public:
20 :num_data_(num_data), num_leaves_(num_leaves) {
21 leaf_begin_.resize(num_leaves_);
22 leaf_count_.resize(num_leaves_);
23 indices_.resize(num_data_);
24 temp_left_indices_.resize(num_data_);
25 temp_right_indices_.resize(num_data_);
26 used_data_indices_ = nullptr;
27 #pragma omp parallel
28 #pragma omp master
29 {
30 num_threads_ = omp_get_num_threads();
31 }
32 offsets_buf_.resize(num_threads_);
33 left_cnts_buf_.resize(num_threads_);
34 right_cnts_buf_.resize(num_threads_);
35 left_write_pos_buf_.resize(num_threads_);
36 right_write_pos_buf_.resize(num_threads_);
37 }
38
39 void ResetLeaves(int num_leaves) {
40 num_leaves_ = num_leaves;
41 leaf_begin_.resize(num_leaves_);
42 leaf_count_.resize(num_leaves_);
43 }
44 void ResetNumData(int num_data) {
45 num_data_ = num_data;
46 indices_.resize(num_data_);
47 temp_left_indices_.resize(num_data_);
48 temp_right_indices_.resize(num_data_);
49 }
51 }
52
56 void Init() {
57 std::fill(leaf_begin_.begin(), leaf_begin_.end(), 0);
58 std::fill(leaf_count_.begin(), leaf_count_.end(), 0);
59 if (used_data_indices_ == nullptr) {
60 // if using all data
61 leaf_count_[0] = num_data_;
62 #pragma omp parallel for schedule(static)
63 for (data_size_t i = 0; i < num_data_; ++i) {
64 indices_[i] = i;
65 }
66 } else {
67 // if bagging
68 leaf_count_[0] = used_data_count_;
69 std::memcpy(indices_.data(), used_data_indices_, used_data_count_ * sizeof(data_size_t));
70 }
71 }
72
73 void ResetByLeafPred(const std::vector<int>& leaf_pred, int num_leaves) {
74 ResetLeaves(num_leaves);
75 std::vector<std::vector<data_size_t>> indices_per_leaf(num_leaves_);
76 for (data_size_t i = 0; i < static_cast<data_size_t>(leaf_pred.size()); ++i) {
77 indices_per_leaf[leaf_pred[i]].push_back(i);
78 }
79 data_size_t offset = 0;
80 for (int i = 0; i < num_leaves_; ++i) {
81 leaf_begin_[i] = offset;
82 leaf_count_[i] = static_cast<data_size_t>(indices_per_leaf[i].size());
83 std::copy(indices_per_leaf[i].begin(), indices_per_leaf[i].end(), indices_.begin() + leaf_begin_[i]);
84 offset += leaf_count_[i];
85 }
86 }
87
94 const data_size_t* GetIndexOnLeaf(int leaf, data_size_t* out_len) const {
95 // copy reference, maybe unsafe, but faster
96 data_size_t begin = leaf_begin_[leaf];
97 *out_len = leaf_count_[leaf];
98 return indices_.data() + begin;
99 }
100
108 void Split(int leaf, const Dataset* dataset, int feature, const uint32_t* threshold, int num_threshold, bool default_left, int right_leaf) {
109 const data_size_t min_inner_size = 512;
110 // get leaf boundary
111 const data_size_t begin = leaf_begin_[leaf];
112 const data_size_t cnt = leaf_count_[leaf];
113
114 data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
115 if (inner_size < min_inner_size) { inner_size = min_inner_size; }
116 // split data multi-threading
117 OMP_INIT_EX();
118 #pragma omp parallel for schedule(static, 1)
119 for (int i = 0; i < num_threads_; ++i) {
120 OMP_LOOP_EX_BEGIN();
121 left_cnts_buf_[i] = 0;
122 right_cnts_buf_[i] = 0;
123 data_size_t cur_start = i * inner_size;
124 if (cur_start > cnt) { continue; }
125 data_size_t cur_cnt = inner_size;
126 if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
127 // split data inner, reduce the times of function called
128 data_size_t cur_left_count = dataset->Split(feature, threshold, num_threshold, default_left, indices_.data() + begin + cur_start, cur_cnt,
129 temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
130 offsets_buf_[i] = cur_start;
131 left_cnts_buf_[i] = cur_left_count;
132 right_cnts_buf_[i] = cur_cnt - cur_left_count;
133 OMP_LOOP_EX_END();
134 }
135 OMP_THROW_EX();
136 data_size_t left_cnt = 0;
137 left_write_pos_buf_[0] = 0;
138 right_write_pos_buf_[0] = 0;
139 for (int i = 1; i < num_threads_; ++i) {
140 left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
141 right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
142 }
143 left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
144 // copy back indices of right leaf to indices_
145 #pragma omp parallel for schedule(static, 1)
146 for (int i = 0; i < num_threads_; ++i) {
147 if (left_cnts_buf_[i] > 0) {
148 std::memcpy(indices_.data() + begin + left_write_pos_buf_[i],
149 temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
150 }
151 if (right_cnts_buf_[i] > 0) {
152 std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i],
153 temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
154 }
155 }
156 // update leaf boundary
157 leaf_count_[leaf] = left_cnt;
158 leaf_begin_[right_leaf] = left_cnt + begin;
159 leaf_count_[right_leaf] = cnt - left_cnt;
160 }
161
167 void SetUsedDataIndices(const data_size_t* used_data_indices, data_size_t num_used_data) {
168 used_data_indices_ = used_data_indices;
169 used_data_count_ = num_used_data;
170 }
171
177 data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }
178
184 data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }
185
186 const data_size_t* indices() const { return indices_.data(); }
187
189 int num_leaves() const { return num_leaves_; }
190
191private:
193 data_size_t num_data_;
195 int num_leaves_;
197 std::vector<data_size_t> leaf_begin_;
199 std::vector<data_size_t> leaf_count_;
201 std::vector<data_size_t> indices_;
203 std::vector<data_size_t> temp_left_indices_;
205 std::vector<data_size_t> temp_right_indices_;
207 const data_size_t* used_data_indices_;
209 data_size_t used_data_count_;
211 int num_threads_;
213 std::vector<data_size_t> offsets_buf_;
215 std::vector<data_size_t> left_cnts_buf_;
217 std::vector<data_size_t> right_cnts_buf_;
219 std::vector<data_size_t> left_write_pos_buf_;
221 std::vector<data_size_t> right_write_pos_buf_;
222};
223
224} // namespace LightGBM
225#endif // LightGBM_TREELEARNER_DATA_PARTITION_HPP_
DataPartition is used to store the the partition of data on tree.
Definition data_partition.hpp:17
data_size_t leaf_begin(int leaf) const
Get leaf begin.
Definition data_partition.hpp:184
void Split(int leaf, const Dataset *dataset, int feature, const uint32_t *threshold, int num_threshold, bool default_left, int right_leaf)
Split the data.
Definition data_partition.hpp:108
void SetUsedDataIndices(const data_size_t *used_data_indices, data_size_t num_used_data)
SetLabelAt used data indices before training, used for bagging.
Definition data_partition.hpp:167
data_size_t leaf_count(int leaf) const
Get number of data on one leaf.
Definition data_partition.hpp:177
void Init()
Init, will put all data on the root(leaf_idx = 0)
Definition data_partition.hpp:56
const data_size_t * GetIndexOnLeaf(int leaf, data_size_t *out_len) const
Get the data indices of one leaf.
Definition data_partition.hpp:94
int num_leaves() const
Get number of leaves.
Definition data_partition.hpp:189
The main class of data set, which are used to traning or validation.
Definition dataset.h:278
desc and descl2 fields must be written in reStructuredText format
Definition application.h:10
int32_t data_size_t
Type of data size, it is better to use signed type.
Definition meta.h:14