config.h
#ifndef LIGHTGBM_CONFIG_H_
#define LIGHTGBM_CONFIG_H_

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>

#include <LightGBM/meta.h>
#include <LightGBM/export.h>

#include <vector>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <algorithm>
#include <memory>

namespace LightGBM {

/*! \brief Types of tasks */
enum TaskType {
  kTrain, kPredict, kConvertModel, kRefitTree
};
const int kDefaultNumLeaves = 31;

struct Config {
public:
  std::string ToString() const;

  /*! \brief Get string value by specific name of key */
  inline static bool GetString(
    const std::unordered_map<std::string, std::string>& params,
    const std::string& name, std::string* out);

  /*! \brief Get int value by specific name of key */
  inline static bool GetInt(
    const std::unordered_map<std::string, std::string>& params,
    const std::string& name, int* out);

  /*! \brief Get double value by specific name of key */
  inline static bool GetDouble(
    const std::unordered_map<std::string, std::string>& params,
    const std::string& name, double* out);

  /*! \brief Get bool value by specific name of key */
  inline static bool GetBool(
    const std::unordered_map<std::string, std::string>& params,
    const std::string& name, bool* out);

  static void KV2Map(std::unordered_map<std::string, std::string>& params, const char* kv);
  static std::unordered_map<std::string, std::string> Str2Map(const char* parameters);
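
  // Example (a minimal sketch, not part of this header): Str2Map() turns a
  // string of ``key=value`` tokens into a map (KV2Map() handles a single
  // pair), after which the typed getters above read individual values:
  //
  //   auto params = Config::Str2Map("num_leaves=63 learning_rate=0.05");
  //   int num_leaves = kDefaultNumLeaves;
  //   if (Config::GetInt(params, "num_leaves", &num_leaves)) {
  //     // num_leaves is now 63; GetInt() returns false if the key is absent
  //   }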

  #pragma region Parameters
  // desc and descl2 fields must be written in reStructuredText format

  #pragma region Core Parameters

  // [doc-only]
  // alias = config_file
  // desc = path of config file
  // desc = **Note**: can be used only in CLI version
  std::string config = "";

  // [doc-only]
  // type = enum
  // default = train
  // options = train, predict, convert_model, refit
  // alias = task_type
  // desc = ``train``, for training, aliases: ``training``
  // desc = ``predict``, for prediction, aliases: ``prediction``, ``test``
  // desc = ``convert_model``, for converting model file into if-else format, see more information in `IO Parameters <#io-parameters>`__
  // desc = ``refit``, for refitting existing models with new data, aliases: ``refit_tree``
  // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the corresponding functions
  TaskType task = TaskType::kTrain;

  // [doc-only]
  // type = enum
  // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, xentropy, xentlambda, lambdarank
  // alias = objective_type, app, application
  // desc = regression application
  // descl2 = ``regression_l2``, L2 loss, aliases: ``regression``, ``mean_squared_error``, ``mse``, ``l2_root``, ``root_mean_squared_error``, ``rmse``
  // descl2 = ``regression_l1``, L1 loss, aliases: ``mean_absolute_error``, ``mae``
  // descl2 = ``huber``, `Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`__
  // descl2 = ``fair``, `Fair loss <https://www.kaggle.com/c/allstate-claims-severity/discussion/24520>`__
  // descl2 = ``poisson``, `Poisson regression <https://en.wikipedia.org/wiki/Poisson_regression>`__
  // descl2 = ``quantile``, `Quantile regression <https://en.wikipedia.org/wiki/Quantile_regression>`__
  // descl2 = ``mape``, `MAPE loss <https://en.wikipedia.org/wiki/Mean_absolute_percentage_error>`__, aliases: ``mean_absolute_percentage_error``
  // descl2 = ``gamma``, Gamma regression with log-link. It might be useful, e.g., for modeling insurance claims severity, or for any target that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Applications>`__
  // descl2 = ``tweedie``, Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Applications>`__
  // desc = ``binary``, binary `log loss <https://en.wikipedia.org/wiki/Cross_entropy>`__ classification (or logistic regression). Requires labels in {0, 1}; see ``cross-entropy`` application for general probability labels in [0, 1]
  // desc = multi-class classification application
  // descl2 = ``multiclass``, `softmax <https://en.wikipedia.org/wiki/Softmax_function>`__ objective function, aliases: ``softmax``
  // descl2 = ``multiclassova``, `One-vs-All <https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest>`__ binary objective function, aliases: ``multiclass_ova``, ``ova``, ``ovr``
  // descl2 = ``num_class`` should be set as well
  // desc = cross-entropy application
  // descl2 = ``xentropy``, objective function for cross-entropy (with optional linear weights), aliases: ``cross_entropy``
  // descl2 = ``xentlambda``, alternative parameterization of cross-entropy, aliases: ``cross_entropy_lambda``
  // descl2 = label is anything in interval [0, 1]
  // desc = ``lambdarank``, `lambdarank <https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf>`__ application
  // descl2 = label should be ``int`` type in lambdarank tasks, and larger numbers represent higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
  // descl2 = `label_gain <#objective-parameters>`__ can be used to set the gain (weight) of ``int`` label
  // descl2 = all values in ``label`` must be smaller than number of elements in ``label_gain``
  std::string objective = "regression";

  // [doc-only]
  // type = enum
  // alias = boosting_type, boost
  // options = gbdt, gbrt, rf, random_forest, dart, goss
  // desc = ``gbdt``, traditional Gradient Boosting Decision Tree, aliases: ``gbrt``
  // desc = ``rf``, Random Forest, aliases: ``random_forest``
  // desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees <https://arxiv.org/abs/1505.01866>`__
  // desc = ``goss``, Gradient-based One-Side Sampling
  std::string boosting = "gbdt";

  // alias = train, train_data, train_data_file, data_filename
  // desc = path of training data, LightGBM will train from this data
  // desc = **Note**: can be used only in CLI version
  std::string data = "";

  // alias = test, valid_data, valid_data_file, test_data, test_data_file, valid_filenames
  // default = ""
  // desc = path(s) of validation/test data, LightGBM will output metrics for these data
  // desc = supports multiple validation data, separated by ``,``
  // desc = **Note**: can be used only in CLI version
  std::vector<std::string> valid;

  // alias = num_iteration, n_iter, num_tree, num_trees, num_round, num_rounds, num_boost_round, n_estimators
  // check = >=0
  // desc = number of boosting iterations
  // desc = **Note**: internally, LightGBM constructs ``num_class * num_iterations`` trees for multi-class classification problems
  int num_iterations = 100;

  // alias = shrinkage_rate, eta
  // check = >0.0
  // desc = shrinkage rate
  // desc = in ``dart``, it also affects the normalization weights of dropped trees
  double learning_rate = 0.1;

  // default = 31
  // alias = num_leaf, max_leaves, max_leaf
  // check = >1
  // desc = max number of leaves in one tree
  int num_leaves = kDefaultNumLeaves;

  // [doc-only]
  // type = enum
  // options = serial, feature, data, voting
  // alias = tree, tree_type, tree_learner_type
  // desc = ``serial``, single machine tree learner
  // desc = ``feature``, feature parallel tree learner, aliases: ``feature_parallel``
  // desc = ``data``, data parallel tree learner, aliases: ``data_parallel``
  // desc = ``voting``, voting parallel tree learner, aliases: ``voting_parallel``
  // desc = refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details
  std::string tree_learner = "serial";

  // alias = num_thread, nthread, nthreads, n_jobs
  // desc = number of threads for LightGBM
  // desc = ``0`` means default number of threads in OpenMP
  // desc = for the best speed, set this to the number of **real CPU cores**, not the number of threads (most CPUs use `hyper-threading <https://en.wikipedia.org/wiki/Hyper-threading>`__ to generate 2 threads per CPU core)
  // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
  // desc = be aware that a task manager or any similar CPU monitoring tool might report cores not being fully utilized. **This is normal**
  // desc = for parallel learning, do not use all CPU cores, because this will cause poor performance for the network communication
  int num_threads = 0;

  // [doc-only]
  // type = enum
  // options = cpu, gpu
  // alias = device
  // desc = device for the tree learning, you can use GPU to achieve faster learning
  // desc = **Note**: it is recommended to use a smaller ``max_bin`` (e.g. 63) to get a better speedup
  // desc = **Note**: for faster speed, GPU uses 32-bit floating point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit floating point, but it will slow down training
  // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
  std::string device_type = "cpu";

  // [doc-only]
  // alias = random_seed, random_state
  // default = None
  // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc.
  // desc = by default, this seed is unused in favor of default values of other seeds
  // desc = this seed has lower priority in comparison with other seeds, which means that it will be overridden, if you set other seeds explicitly
  int seed = 0;

  #pragma endregion

  #pragma region Learning Control Parameters

  // desc = limit the max depth for the tree model. This is used to deal with over-fitting when ``#data`` is small. The tree still grows leaf-wise
  // desc = ``< 0`` means no limit
  int max_depth = -1;

  // alias = min_data_per_leaf, min_data, min_child_samples
  // check = >=0
  // desc = minimal number of data in one leaf. Can be used to deal with over-fitting
  int min_data_in_leaf = 20;

  // alias = min_sum_hessian_per_leaf, min_sum_hessian, min_hessian, min_child_weight
  // check = >=0.0
  // desc = minimal sum hessian in one leaf. Like ``min_data_in_leaf``, it can be used to deal with over-fitting
  double min_sum_hessian_in_leaf = 1e-3;

  // alias = sub_row, subsample, bagging
  // check = >0.0
  // check = <=1.0
  // desc = like ``feature_fraction``, but this will randomly select part of data without resampling
  // desc = can be used to speed up training
  // desc = can be used to deal with over-fitting
  // desc = **Note**: to enable bagging, ``bagging_freq`` should be set to a non-zero value as well
  double bagging_fraction = 1.0;

  // alias = subsample_freq
  // desc = frequency for bagging
  // desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iterations
  // desc = **Note**: to enable bagging, ``bagging_fraction`` should be set to a value smaller than ``1.0`` as well
  int bagging_freq = 0;
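
  // Example (a minimal sketch of the interaction noted above): bagging takes
  // effect only when both parameters are set, e.g. in a CLI config file:
  //
  //   bagging_fraction = 0.8   # draw 80% of the data for each bagging round
  //   bagging_freq = 5         # re-sample the bag every 5 iterations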

  // alias = bagging_fraction_seed
  // desc = random seed for bagging
  int bagging_seed = 3;

  // alias = sub_feature, colsample_bytree
  // check = >0.0
  // check = <=1.0
  // desc = LightGBM will randomly select part of the features on each iteration if ``feature_fraction`` is smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
  // desc = can be used to speed up training
  // desc = can be used to deal with over-fitting
  double feature_fraction = 1.0;

  // desc = random seed for ``feature_fraction``
  int feature_fraction_seed = 2;

  // alias = early_stopping_rounds, early_stopping
  // desc = will stop training if one metric of one validation data doesn't improve in the last ``early_stopping_round`` rounds
  // desc = ``<= 0`` means disable
  int early_stopping_round = 0;

  // alias = max_tree_output, max_leaf_output
  // desc = used to limit the max output of tree leaves
  // desc = ``<= 0`` means no constraint
  // desc = the final max output of leaves is ``learning_rate * max_delta_step``
  double max_delta_step = 0.0;

  // alias = reg_alpha
  // check = >=0.0
  // desc = L1 regularization
  double lambda_l1 = 0.0;

  // alias = reg_lambda, lambda
  // check = >=0.0
  // desc = L2 regularization
  double lambda_l2 = 0.0;

  // alias = min_split_gain
  // check = >=0.0
  // desc = the minimal gain to perform a split
  double min_gain_to_split = 0.0;

  // alias = rate_drop
  // check = >=0.0
  // check = <=1.0
  // desc = used only in ``dart``
  // desc = dropout rate: a fraction of previous trees to drop during the dropout
  double drop_rate = 0.1;

  // desc = used only in ``dart``
  // desc = max number of dropped trees during one boosting iteration
  // desc = ``<=0`` means no limit
  int max_drop = 50;

  // check = >=0.0
  // check = <=1.0
  // desc = used only in ``dart``
  // desc = probability of skipping the dropout procedure during a boosting iteration
  double skip_drop = 0.5;

  // desc = used only in ``dart``
  // desc = set this to ``true``, if you want to use xgboost dart mode
  bool xgboost_dart_mode = false;

  // desc = used only in ``dart``
  // desc = set this to ``true``, if you want to use uniform drop
  bool uniform_drop = false;

  // desc = used only in ``dart``
  // desc = random seed to choose dropping models
  int drop_seed = 4;

  // check = >=0.0
  // check = <=1.0
  // desc = used only in ``goss``
  // desc = the retain ratio of large gradient data
  double top_rate = 0.2;

  // check = >=0.0
  // check = <=1.0
  // desc = used only in ``goss``
  // desc = the retain ratio of small gradient data
  double other_rate = 0.1;

  // check = >0
  // desc = minimal number of data per categorical group
  int min_data_per_group = 100;

  // check = >0
  // desc = used for the categorical features
  // desc = limit the max threshold points in categorical features
  int max_cat_threshold = 32;

  // check = >=0.0
  // desc = used for the categorical features
  // desc = L2 regularization in categorical split
  double cat_l2 = 10.0;

  // check = >=0.0
  // desc = used for the categorical features
  // desc = this can reduce the effect of noise in categorical features, especially for categories with few data
  double cat_smooth = 10.0;

  // check = >0
  // desc = when the number of categories of one feature is smaller than or equal to ``max_cat_to_onehot``, the one-vs-other split algorithm will be used
  int max_cat_to_onehot = 4;

  // alias = topk
  // check = >0
  // desc = used in `Voting parallel <./Parallel-Learning-Guide.rst#choose-appropriate-parallel-algorithm>`__
  // desc = set this to a larger value for a more accurate result, but it will slow down the training speed
  int top_k = 20;

  // type = multi-int
  // alias = mc, monotone_constraint
  // default = None
  // desc = used for constraints of monotonic features
  // desc = ``1`` means increasing, ``-1`` means decreasing, ``0`` means non-constraint
  // desc = you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for the 1st feature, non-constraint for the 2nd feature and increasing for the 3rd feature
  std::vector<int8_t> monotone_constraints;

  // type = multi-double
  // alias = feature_contrib, fc, fp, feature_penalty
  // default = None
  // desc = used to control a feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of the i-th feature
  // desc = you need to specify all features in order
  std::vector<double> feature_contri;

  // alias = fs, forced_splits_filename, forced_splits_file, forced_splits
  // desc = path to a ``.json`` file that specifies splits to force at the top of every decision tree before best-first learning commences
  // desc = ``.json`` file can be arbitrarily nested, and each split contains ``feature``, ``threshold`` fields, as well as ``left`` and ``right`` fields representing subsplits
  // desc = categorical splits are forced in a one-hot fashion, with ``left`` representing the split containing the feature value and ``right`` representing other values
  // desc = **Note**: the forced split logic will be ignored, if the split makes gain worse
  // desc = see `this file <https://github.com/Microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
  std::string forcedsplits_filename = "";
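
  // Example (a hedged sketch; field names follow the description above): a
  // forced-splits ``.json`` file forcing feature 2 at threshold 0.5 at the
  // root, then feature 0 at threshold 10.0 on the left child:
  //
  //   {
  //     "feature": 2,
  //     "threshold": 0.5,
  //     "left": {
  //       "feature": 0,
  //       "threshold": 10.0
  //     }
  //   }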

  // check = >=0.0
  // check = <=1.0
  // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
  // desc = used only in ``refit`` task in CLI version or as argument in ``refit`` function in language-specific package
  double refit_decay_rate = 0.9;

  #pragma endregion

  #pragma region IO Parameters

  // alias = verbose
  // desc = controls the level of LightGBM's verbosity
  // desc = ``< 0``: Fatal, ``= 0``: Error (Warning), ``= 1``: Info, ``> 1``: Debug
  int verbosity = 1;

  // check = >1
  // desc = max number of bins that feature values will be bucketed in
  // desc = a small number of bins may reduce training accuracy but may increase generalization power (deal with over-fitting)
  // desc = LightGBM will auto compress memory according to ``max_bin``. For example, LightGBM will use ``uint8_t`` for feature values if ``max_bin=255``
  int max_bin = 255;

  // check = >0
  // desc = minimal number of data inside one bin
  // desc = use this to avoid one-data-one-bin (potential over-fitting)
  int min_data_in_bin = 3;

  // alias = subsample_for_bin
  // check = >0
  // desc = number of sampled data points used to construct histogram bins
  // desc = setting this to a larger value will give a better training result, but will increase data loading time
  // desc = set this to a larger value if data is very sparse
  int bin_construct_sample_cnt = 200000;

  // alias = hist_pool_size
  // desc = max cache size in MB for historical histogram
  // desc = ``< 0`` means no limit
  double histogram_pool_size = -1.0;

  // alias = data_seed
  // desc = random seed for data partition in parallel learning (excluding the ``feature_parallel`` mode)
  int data_random_seed = 1;

  // alias = model_output, model_out
  // desc = filename of output model in training
  // desc = **Note**: can be used only in CLI version
  std::string output_model = "LightGBM_model.txt";

  // alias = save_period
  // desc = frequency of saving model file snapshot
  // desc = set this to a positive value to enable this function. For example, the model file will be snapshotted at each iteration if ``snapshot_freq=1``
  // desc = **Note**: can be used only in CLI version
  int snapshot_freq = -1;

  // alias = model_input, model_in
  // desc = filename of input model
  // desc = for ``prediction`` task, this model will be applied to prediction data
  // desc = for ``train`` task, training will be continued from this model
  // desc = **Note**: can be used only in CLI version
  std::string input_model = "";

  // alias = predict_result, prediction_result, predict_name, prediction_name, pred_name, name_pred
  // desc = filename of prediction result in ``prediction`` task
  // desc = **Note**: can be used only in CLI version
  std::string output_result = "LightGBM_predict_result.txt";

  // alias = init_score_filename, init_score_file, init_score, input_init_score
  // desc = path of file with training initial scores
  // desc = if ``""``, will use ``train_data_file`` + ``.init`` (if exists)
  // desc = **Note**: can be used only in CLI version
  std::string initscore_filename = "";

  // alias = valid_data_init_scores, valid_init_score_file, valid_init_score
  // default = ""
  // desc = path(s) of file(s) with validation initial scores
  // desc = if ``""``, will use ``valid_data_file`` + ``.init`` (if exists)
  // desc = separated by ``,`` for multi-validation data
  // desc = **Note**: can be used only in CLI version
  std::vector<std::string> valid_data_initscores;

  // alias = is_pre_partition
  // desc = used for parallel learning (excluding the ``feature_parallel`` mode)
  // desc = ``true`` if training data are pre-partitioned, and different machines use different partitions
  bool pre_partition = false;

  // alias = is_enable_bundle, bundle
  // desc = set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree <https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree>`__
  // desc = **Note**: disabling this may cause slow training speed for sparse datasets
  bool enable_bundle = true;

  // check = >=0.0
  // check = <1.0
  // desc = max conflict rate for bundles in EFB
  // desc = set this to ``0.0`` to disallow the conflict and provide more accurate results
  // desc = set this to a larger value to achieve faster speed
  double max_conflict_rate = 0.0;

  // alias = is_sparse, enable_sparse, sparse
  // desc = used to enable/disable sparse optimization
  bool is_enable_sparse = true;

  // check = >0.0
  // check = <=1.0
  // desc = the threshold of zero elements percentage for treating a feature as a sparse one
  double sparse_threshold = 0.8;

  // desc = set this to ``false`` to disable the special handling of missing values
  bool use_missing = true;

  // desc = set this to ``true`` to treat all zeros as missing values (including the unshown values in libsvm/sparse matrices)
  // desc = set this to ``false`` to use ``na`` for representing missing values
  bool zero_as_missing = false;

  // alias = two_round_loading, use_two_round_loading
  // desc = set this to ``true`` if the data file is too big to fit in memory
  // desc = by default, LightGBM will map the data file to memory and load features from memory. This will provide faster data loading speed, but may cause an out-of-memory error when the data file is very big
  bool two_round = false;

  // alias = is_save_binary, is_save_binary_file
  // desc = if ``true``, LightGBM will save the dataset (including validation data) to a binary file. This speeds up data loading next time
  bool save_binary = false;

  // alias = load_from_binary_file, binary_load, load_binary
  // desc = set this to ``true`` to enable autoloading from previously saved binary datasets
  // desc = set this to ``false`` to ignore binary datasets
  bool enable_load_from_binary_file = true;

  // alias = has_header
  // desc = set this to ``true`` if input data has a header
  bool header = false;

  // type = int or string
  // alias = label
  // desc = used to specify the label column
  // desc = use number for index, e.g. ``label=0`` means column\_0 is the label
  // desc = add a prefix ``name:`` for column name, e.g. ``label=name:is_click``
  std::string label_column = "";

  // type = int or string
  // alias = weight
  // desc = used to specify the weight column
  // desc = use number for index, e.g. ``weight=0`` means column\_0 is the weight
  // desc = add a prefix ``name:`` for column name, e.g. ``weight=name:weight``
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0``
  std::string weight_column = "";

  // type = int or string
  // alias = group, group_id, query_column, query, query_id
  // desc = used to specify the query/group id column
  // desc = use number for index, e.g. ``query=0`` means column\_0 is the query id
  // desc = add a prefix ``name:`` for column name, e.g. ``query=name:query_id``
  // desc = **Note**: data should be grouped by query\_id
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0``
  std::string group_column = "";

  // type = multi-int or string
  // alias = ignore_feature, blacklist
  // desc = used to specify columns to ignore in training
  // desc = use number for index, e.g. ``ignore_column=0,1,2`` means column\_0, column\_1 and column\_2 will be ignored
  // desc = add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored
  // desc = **Note**: works only when loading data directly from a file
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``
  // desc = **Note**: despite the fact that specified columns will be completely ignored during the training, they still should have a valid format allowing LightGBM to load the file successfully
  std::string ignore_column = "";

  // type = multi-int or string
  // alias = cat_feature, categorical_column, cat_column
  // desc = used to specify categorical features
  // desc = use number for index, e.g. ``categorical_feature=0,1,2`` means column\_0, column\_1 and column\_2 are categorical features
  // desc = add a prefix ``name:`` for column name, e.g. ``categorical_feature=name:c1,c2,c3`` means c1, c2 and c3 are categorical features
  // desc = **Note**: only supports categorical features with ``int`` type
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``
  // desc = **Note**: all values should be less than ``Int32.MaxValue`` (2147483647)
  // desc = **Note**: using large values could be memory consuming. Tree decision rules work best when categorical features are represented by consecutive integers starting from zero
  // desc = **Note**: all negative values will be treated as **missing values**
  std::string categorical_feature = "";

  // alias = is_predict_raw_score, predict_rawscore, raw_score
  // desc = used only in ``prediction`` task
  // desc = set this to ``true`` to predict only the raw scores
  // desc = set this to ``false`` to predict transformed scores
  bool predict_raw_score = false;

  // alias = is_predict_leaf_index, leaf_index
  // desc = used only in ``prediction`` task
  // desc = set this to ``true`` to predict with leaf index of all trees
  bool predict_leaf_index = false;

  // alias = is_predict_contrib, contrib
  // desc = used only in ``prediction`` task
  // desc = set this to ``true`` to estimate `SHAP values <https://arxiv.org/abs/1706.06060>`__, which represent how each feature contributes to each prediction
  // desc = produces ``#features + 1`` values where the last value is the expected value of the model output over the training data
  // desc = **Note**: if you want to get more explanation for your model's predictions using SHAP values, like SHAP interaction values, you can install the `shap package <https://github.com/slundberg/shap>`__
  bool predict_contrib = false;

  // desc = used only in ``prediction`` task
  // desc = used to specify how many trained iterations will be used in prediction
  // desc = ``<= 0`` means no limit
  int num_iteration_predict = -1;

  // desc = used only in ``prediction`` task
  // desc = if ``true``, will use early-stopping to speed up the prediction. May affect the accuracy
  bool pred_early_stop = false;

  // desc = used only in ``prediction`` task
  // desc = the frequency of checking early-stopping prediction
  int pred_early_stop_freq = 10;

  // desc = used only in ``prediction`` task
  // desc = the threshold of margin in early-stopping prediction
  double pred_early_stop_margin = 10.0;

  // desc = used only in ``convert_model`` task
  // desc = only ``cpp`` is supported so far
  // desc = if ``convert_model_language`` is set and ``task=train``, the model will also be converted
  // desc = **Note**: can be used only in CLI version
  std::string convert_model_language = "";

  // alias = convert_model_file
  // desc = used only in ``convert_model`` task
  // desc = output filename of converted model
  // desc = **Note**: can be used only in CLI version
  std::string convert_model = "gbdt_prediction.cpp";

  #pragma endregion

  #pragma region Objective Parameters

  // check = >0
  // alias = num_classes
  // desc = used only in ``multi-class`` classification application
  int num_class = 1;

  // alias = unbalance, unbalanced_sets
  // desc = used only in ``binary`` application
  // desc = set this to ``true`` if training data are unbalanced
  // desc = **Note**: this parameter cannot be used at the same time with ``scale_pos_weight``, choose only **one** of them
  bool is_unbalance = false;

  // check = >0.0
  // desc = used only in ``binary`` application
  // desc = weight of labels with positive class
  // desc = **Note**: this parameter cannot be used at the same time with ``is_unbalance``, choose only **one** of them
  double scale_pos_weight = 1.0;

  // check = >0.0
  // desc = used only in ``binary`` and ``multiclassova`` classification and in ``lambdarank`` applications
  // desc = parameter for the sigmoid function
  double sigmoid = 1.0;

  // desc = used only in ``regression``, ``binary`` and ``cross-entropy`` applications
  // desc = adjusts initial score to the mean of labels for faster convergence
  bool boost_from_average = true;

  // desc = used only in ``regression`` application
  // desc = used to fit ``sqrt(label)`` instead of original values; the prediction result will also be automatically converted back via ``prediction^2``
  // desc = might be useful in case of large-range labels
  bool reg_sqrt = false;

  // check = >0.0
  // desc = used only in ``huber`` and ``quantile`` ``regression`` applications
  // desc = parameter for `Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`__ and `Quantile regression <https://en.wikipedia.org/wiki/Quantile_regression>`__
  double alpha = 0.9;

  // check = >0.0
  // desc = used only in ``fair`` ``regression`` application
  // desc = parameter for `Fair loss <https://www.kaggle.com/c/allstate-claims-severity/discussion/24520>`__
  double fair_c = 1.0;

  // check = >0.0
  // desc = used only in ``poisson`` ``regression`` application
  // desc = parameter for `Poisson regression <https://en.wikipedia.org/wiki/Poisson_regression>`__ to safeguard optimization
  double poisson_max_delta_step = 0.7;

  // check = >=1.0
  // check = <2.0
  // desc = used only in ``tweedie`` ``regression`` application
  // desc = used to control the variance of the tweedie distribution
  // desc = set this closer to ``2`` to shift towards a **Gamma** distribution
  // desc = set this closer to ``1`` to shift towards a **Poisson** distribution
  double tweedie_variance_power = 1.5;
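
  // For reference (a general fact about the Tweedie family, not stated in the
  // docs above): the Tweedie variance function is Var(Y) = phi * mu^p with
  // p = tweedie_variance_power, so p -> 1 recovers Poisson-like variance and
  // p -> 2 recovers Gamma-like variance, matching the guidance above.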

  // check = >0
  // desc = used only in ``lambdarank`` application
  // desc = optimizes `NDCG <https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG>`__ at this position
  int max_position = 20;

  // type = multi-double
  // default = 0,1,3,7,15,31,63,...,2^30-1
  // desc = used only in ``lambdarank`` application
  // desc = relevant gain for labels. For example, the gain of label ``2`` is ``3`` in case of default label gains
  // desc = separated by ``,``
  std::vector<double> label_gain;

  #pragma endregion

  #pragma region Metric Parameters

  // [doc-only]
  // alias = metrics, metric_types
  // default = ""
  // type = multi-enum
  // desc = metric(s) to be evaluated on the evaluation set(s)
  // descl2 = ``""`` (empty string or not specified) means that the metric corresponding to the specified ``objective`` will be used (this is possible only for pre-defined objective functions, otherwise no evaluation metric will be added)
  // descl2 = ``"None"`` (string, **not** a ``None`` value) means that no metric will be registered, aliases: ``na``, ``null``, ``custom``
  // descl2 = ``l1``, absolute loss, aliases: ``mean_absolute_error``, ``mae``, ``regression_l1``
  // descl2 = ``l2``, square loss, aliases: ``mean_squared_error``, ``mse``, ``regression_l2``, ``regression``
  // descl2 = ``l2_root``, root square loss, aliases: ``root_mean_squared_error``, ``rmse``
  // descl2 = ``quantile``, `Quantile regression <https://en.wikipedia.org/wiki/Quantile_regression>`__
  // descl2 = ``mape``, `MAPE loss <https://en.wikipedia.org/wiki/Mean_absolute_percentage_error>`__, aliases: ``mean_absolute_percentage_error``
  // descl2 = ``huber``, `Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`__
  // descl2 = ``fair``, `Fair loss <https://www.kaggle.com/c/allstate-claims-severity/discussion/24520>`__
  // descl2 = ``poisson``, negative log-likelihood for `Poisson regression <https://en.wikipedia.org/wiki/Poisson_regression>`__
  // descl2 = ``gamma``, negative log-likelihood for **Gamma** regression
  // descl2 = ``gamma_deviance``, residual deviance for **Gamma** regression
  // descl2 = ``tweedie``, negative log-likelihood for **Tweedie** regression
  // descl2 = ``ndcg``, `NDCG <https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG>`__, aliases: ``lambdarank``
  // descl2 = ``map``, `MAP <https://makarandtapaswi.wordpress.com/2012/07/02/intuition-behind-average-precision-and-map/>`__, aliases: ``mean_average_precision``
  // descl2 = ``auc``, `AUC <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`__
  // descl2 = ``binary_logloss``, `log loss <https://en.wikipedia.org/wiki/Cross_entropy>`__, aliases: ``binary``
  // descl2 = ``binary_error``, for one sample: ``0`` for correct classification, ``1`` for incorrect classification
  // descl2 = ``multi_logloss``, log loss for multi-class classification, aliases: ``multiclass``, ``softmax``, ``multiclassova``, ``multiclass_ova``, ``ova``, ``ovr``
  // descl2 = ``multi_error``, error rate for multi-class classification
  // descl2 = ``xentropy``, cross-entropy (with optional linear weights), aliases: ``cross_entropy``
  // descl2 = ``xentlambda``, "intensity-weighted" cross-entropy, aliases: ``cross_entropy_lambda``
  // descl2 = ``kldiv``, `Kullback-Leibler divergence <https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence>`__, aliases: ``kullback_leibler``
  // desc = supports multiple metrics, separated by ``,``
  std::vector<std::string> metric;

  // check = >0
  // alias = output_freq
  // desc = frequency for metric output
  int metric_freq = 1;

  // alias = training_metric, is_training_metric, train_metric
  // desc = set this to ``true`` to output metric results over the training dataset
  // desc = **Note**: can be used only in CLI version
  bool is_provide_training_metric = false;

  // type = multi-int
  // default = 1,2,3,4,5
  // alias = ndcg_eval_at, ndcg_at, map_eval_at, map_at
  // desc = used only with ``ndcg`` and ``map`` metrics
  // desc = `NDCG <https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG>`__ and `MAP <https://makarandtapaswi.wordpress.com/2012/07/02/intuition-behind-average-precision-and-map/>`__ evaluation positions, separated by ``,``
  std::vector<int> eval_at;

  #pragma endregion

  #pragma region Network Parameters

  // check = >0
  // alias = num_machine
  // desc = the number of machines for parallel learning application
  // desc = this parameter needs to be set in both **socket** and **MPI** versions
  int num_machines = 1;

  // check = >0
  // alias = local_port, port
  // desc = TCP listen port for local machines
  // desc = **Note**: don't forget to allow this port in firewall settings before training
  int local_listen_port = 12400;

  // check = >0
  // desc = socket time-out in minutes
  int time_out = 120;

  // alias = machine_list_file, machine_list, mlist
  // desc = path of file that lists machines for this parallel learning application
  // desc = each line contains one IP and one port for one machine. The format is ``ip port`` (space as a separator)
  std::string machine_list_filename = "";

  // alias = workers, nodes
  // desc = list of machines in the following format: ``ip1:port1,ip2:port2``
  std::string machines = "";

  #pragma endregion

  #pragma region GPU Parameters

  // desc = OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
  // desc = ``-1`` means the system-wide default platform
  // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
  int gpu_platform_id = -1;

  // desc = OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
  // desc = ``-1`` means the default device in the selected platform
  // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
  int gpu_device_id = -1;

  // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
  bool gpu_use_dp = false;

  #pragma endregion

  #pragma endregion

  bool is_parallel = false;
  bool is_parallel_find_bin = false;
  LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params);
  static std::unordered_map<std::string, std::string> alias_table;
  static std::unordered_set<std::string> parameter_set;

private:
  void CheckParamConflict();
  void GetMembersFromString(const std::unordered_map<std::string, std::string>& params);
  std::string SaveMembersToString() const;
};
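
// Example (a minimal sketch, not part of this header): the usual flow is to
// parse a parameter string into a map and let Set() populate the members; the
// claim that Set() also runs the private CheckParamConflict() is an assumption
// based on the declarations above, not stated here:
//
//   auto params = Config::Str2Map("task=train objective=binary num_leaves=63");
//   Config config;
//   config.Set(params);  // fills members from the map, validates values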

inline bool Config::GetString(
  const std::unordered_map<std::string, std::string>& params,
  const std::string& name, std::string* out) {
  if (params.count(name) > 0) {
    *out = params.at(name);
    return true;
  }
  return false;
}

inline bool Config::GetInt(
  const std::unordered_map<std::string, std::string>& params,
  const std::string& name, int* out) {
  if (params.count(name) > 0) {
    if (!Common::AtoiAndCheck(params.at(name).c_str(), out)) {
      Log::Fatal("Parameter %s should be of type int, got \"%s\"",
                 name.c_str(), params.at(name).c_str());
    }
    return true;
  }
  return false;
}

inline bool Config::GetDouble(
  const std::unordered_map<std::string, std::string>& params,
  const std::string& name, double* out) {
  if (params.count(name) > 0) {
    if (!Common::AtofAndCheck(params.at(name).c_str(), out)) {
      Log::Fatal("Parameter %s should be of type double, got \"%s\"",
                 name.c_str(), params.at(name).c_str());
    }
    return true;
  }
  return false;
}

inline bool Config::GetBool(
  const std::unordered_map<std::string, std::string>& params,
  const std::string& name, bool* out) {
  if (params.count(name) > 0) {
    std::string value = params.at(name);
    std::transform(value.begin(), value.end(), value.begin(), Common::tolower);
    if (value == std::string("false") || value == std::string("-")) {
      *out = false;
    } else if (value == std::string("true") || value == std::string("+")) {
      *out = true;
    } else {
      Log::Fatal("Parameter %s should be \"true\"/\"+\" or \"false\"/\"-\", got \"%s\"",
                 name.c_str(), params.at(name).c_str());
    }
    return true;
  }
  return false;
}
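
// Example (a minimal sketch): GetBool() accepts "true"/"+" and "false"/"-"
// case-insensitively (via the std::transform above) and calls Log::Fatal()
// on anything else, so all of the following parse to the same result:
//
//   is_unbalance=true    is_unbalance=True    is_unbalance=+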

struct ParameterAlias {
  static void KeyAliasTransform(std::unordered_map<std::string, std::string>* params) {
    std::unordered_map<std::string, std::string> tmp_map;
    for (const auto& pair : *params) {
      auto alias = Config::alias_table.find(pair.first);
      if (alias != Config::alias_table.end()) {  // found an alias
        auto alias_set = tmp_map.find(alias->second);
        if (alias_set != tmp_map.end()) {  // alias already set
          // set priority by length & alphabetically to ensure reproducible behavior
          if (alias_set->second.size() < pair.first.size() ||
              (alias_set->second.size() == pair.first.size() && alias_set->second < pair.first)) {
            Log::Warning("%s is set with %s=%s, %s=%s will be ignored. Current value: %s=%s",
                         alias->second.c_str(), alias_set->second.c_str(), params->at(alias_set->second).c_str(),
                         pair.first.c_str(), pair.second.c_str(), alias->second.c_str(), params->at(alias_set->second).c_str());
          } else {
            Log::Warning("%s is set with %s=%s, will be overridden by %s=%s. Current value: %s=%s",
                         alias->second.c_str(), alias_set->second.c_str(), params->at(alias_set->second).c_str(),
                         pair.first.c_str(), pair.second.c_str(), alias->second.c_str(), pair.second.c_str());
            tmp_map[alias->second] = pair.first;
          }
        } else {  // alias not set
          tmp_map.emplace(alias->second, pair.first);
        }
      } else if (Config::parameter_set.find(pair.first) == Config::parameter_set.end()) {
        Log::Warning("Unknown parameter: %s", pair.first.c_str());
      }
    }
    for (const auto& pair : tmp_map) {
      auto alias = params->find(pair.first);
      if (alias == params->end()) {  // not found
        params->emplace(pair.first, params->at(pair.second));
        params->erase(pair.second);
      } else {
        Log::Warning("%s is set=%s, %s=%s will be ignored. Current value: %s=%s",
                     pair.first.c_str(), alias->second.c_str(), pair.second.c_str(), params->at(pair.second).c_str(),
                     pair.first.c_str(), alias->second.c_str());
      }
    }
  }
};
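
// Example (a minimal sketch of the behavior above): KeyAliasTransform()
// rewrites alias keys to their canonical names in place; running it before
// Config::Set() is an assumption here, not stated in this header. Given
// {"num_leaf": "63"}, the entry becomes {"num_leaves": "63"}; when both an
// alias and its canonical key are present, the canonical key wins and a
// warning is logged for the ignored alias.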

}  // namespace LightGBM

#endif  // LIGHTGBM_CONFIG_H_