|
Medial Code Documentation
|
DMatrix is the basic data storage for XGBoost used by all XGBoost algorithms including training, prediction and explanation. More...
Modules | |
| Streaming | |
| Quantile DMatrix and external memory DMatrix can be created from batches of data. | |
Functions | |
| XGB_DLL int | XGDMatrixCreateFromFile (const char *fname, int silent, DMatrixHandle *out) |
| load a data matrix | |
| XGB_DLL int | XGDMatrixCreateFromURI (char const *config, DMatrixHandle *out) |
| load a data matrix | |
| XGB_DLL int | XGDMatrixCreateFromCSREx (const size_t *indptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_col, DMatrixHandle *out) |
| create a matrix content from CSR format | |
| XGB_DLL int | XGDMatrixCreateFromCSR (char const *indptr, char const *indices, char const *data, bst_ulong ncol, char const *config, DMatrixHandle *out) |
| Create a matrix from CSR matrix. | |
| XGB_DLL int | XGDMatrixCreateFromDense (char const *data, char const *config, DMatrixHandle *out) |
| Create a matrix from dense array. | |
| XGB_DLL int | XGDMatrixCreateFromCSC (char const *indptr, char const *indices, char const *data, bst_ulong nrow, char const *config, DMatrixHandle *out) |
| Create a matrix from a CSC matrix. | |
| XGB_DLL int | XGDMatrixCreateFromCSCEx (const size_t *col_ptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_row, DMatrixHandle *out) |
| create a matrix content from CSC format | |
| XGB_DLL int | XGDMatrixCreateFromMat (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out) |
| create matrix content from dense matrix | |
| XGB_DLL int | XGDMatrixCreateFromMat_omp (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out, int nthread) |
| create matrix content from dense matrix | |
| XGB_DLL int | XGDMatrixCreateFromDT (void **data, const char **feature_stypes, bst_ulong nrow, bst_ulong ncol, DMatrixHandle *out, int nthread) |
| create matrix content from python data table | |
| XGB_DLL int | XGDMatrixCreateFromCudaColumnar (char const *data, char const *config, DMatrixHandle *out) |
| Create DMatrix from CUDA columnar format. (cuDF) | |
| XGB_DLL int | XGDMatrixCreateFromCudaArrayInterface (char const *data, char const *config, DMatrixHandle *out) |
| Create DMatrix from CUDA array. | |
| XGB_DLL int | XGImportArrowRecordBatch (DataIterHandle data_handle, void *ptr_array, void *ptr_schema) |
| XGB_DLL int | XGDMatrixCreateFromArrowCallback (XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out) |
| Construct DMatrix from arrow using callbacks. Arrow related C API is not stable and subject to change in the future. | |
| XGB_DLL int | XGDMatrixSliceDMatrix (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out) |
| create a new dmatrix from sliced content of existing matrix | |
| XGB_DLL int | XGDMatrixSliceDMatrixEx (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out, int allow_groups) |
| create a new dmatrix from sliced content of existing matrix | |
| XGB_DLL int | XGDMatrixFree (DMatrixHandle handle) |
| free space in data matrix | |
| XGB_DLL int | XGDMatrixSaveBinary (DMatrixHandle handle, const char *fname, int silent) |
| save a data matrix into binary file | |
| XGB_DLL int | XGDMatrixSetInfoFromInterface (DMatrixHandle handle, char const *field, char const *c_interface_str) |
| Set content in array interface to a content in info. | |
| XGB_DLL int | XGDMatrixSetFloatInfo (DMatrixHandle handle, const char *field, const float *array, bst_ulong len) |
| set float vector to a content in info | |
| XGB_DLL int | XGDMatrixSetUIntInfo (DMatrixHandle handle, const char *field, const unsigned *array, bst_ulong len) |
| set uint32 vector to a content in info | |
| XGB_DLL int | XGDMatrixSetStrFeatureInfo (DMatrixHandle handle, const char *field, const char **features, const bst_ulong size) |
| Set string encoded information of all features. | |
| XGB_DLL int | XGDMatrixGetStrFeatureInfo (DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features) |
| Get string encoded information of all features. | |
| XGB_DLL int | XGDMatrixSetDenseInfo (DMatrixHandle handle, const char *field, void const *data, bst_ulong size, int type) |
| Set meta info from dense matrix. Valid field names are: | |
| XGB_DLL int | XGDMatrixSetGroup (DMatrixHandle handle, const unsigned *group, bst_ulong len) |
| (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix | |
| XGB_DLL int | XGDMatrixGetFloatInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const float **out_dptr) |
| get float info vector from matrix. | |
| XGB_DLL int | XGDMatrixGetUIntInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const unsigned **out_dptr) |
| get uint32 info vector from matrix | |
| XGB_DLL int | XGDMatrixNumRow (DMatrixHandle handle, bst_ulong *out) |
| get number of rows. | |
| XGB_DLL int | XGDMatrixNumCol (DMatrixHandle handle, bst_ulong *out) |
| get number of columns | |
| XGB_DLL int | XGDMatrixNumNonMissing (DMatrixHandle handle, bst_ulong *out) |
| Get number of valid values from DMatrix. | |
| XGB_DLL int | XGDMatrixGetDataAsCSR (DMatrixHandle const handle, char const *config, bst_ulong *out_indptr, unsigned *out_indices, float *out_data) |
| Get the predictors from DMatrix as CSR matrix for testing. | |
| XGB_DLL int | XGDMatrixGetQuantileCut (DMatrixHandle const handle, char const *config, char const **out_indptr, char const **out_data) |
| Export the quantile cuts used for training histogram-based models like hist and approx. | |
DMatrix is the basic data storage for XGBoost used by all XGBoost algorithms including training, prediction and explanation.
There are a few variants of DMatrix including normal DMatrix, which is a CSR matrix, QuantileDMatrix, which is used by histogram-based tree methods for saving memory, and lastly the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.
| XGB_DLL int XGDMatrixCreateFromArrowCallback | ( | XGDMatrixCallbackNext * | next, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Construct DMatrix from arrow using callbacks. Arrow related C API is not stable and subject to change in the future.
| next | Callback function for fetching arrow records. |
| config | JSON encoded configuration. Required values are:
|
| out | The created DMatrix. |
| XGB_DLL int XGDMatrixCreateFromCSC | ( | char const * | indptr, |
| char const * | indices, | ||
| char const * | data, | ||
| bst_ulong | nrow, | ||
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create a matrix from a CSC matrix.
| indptr | JSON encoded array_interface to column pointers in CSC. |
| indices | JSON encoded array_interface to row indices in CSC. |
| data | JSON encoded array_interface to values in CSC. |
| nrow | number of rows in the matrix. |
| config | JSON encoded configuration. Supported values are:
|
| out | created dmatrix |
| XGB_DLL int XGDMatrixCreateFromCSCEx | ( | const size_t * | col_ptr, |
| const unsigned * | indices, | ||
| const float * | data, | ||
| size_t | nindptr, | ||
| size_t | nelem, | ||
| size_t | num_row, | ||
| DMatrixHandle * | out | ||
| ) |
| XGB_DLL int XGDMatrixCreateFromCSR | ( | char const * | indptr, |
| char const * | indices, | ||
| char const * | data, | ||
| bst_ulong | ncol, | ||
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create a matrix from CSR matrix.
| indptr | JSON encoded array_interface to row pointers in CSR. |
| indices | JSON encoded array_interface to column indices in CSR. |
| data | JSON encoded array_interface to values in CSR. |
| ncol | Number of columns. |
| config | JSON encoded configuration. Required values are:
|
| out | created dmatrix |
| XGB_DLL int XGDMatrixCreateFromCSREx | ( | const size_t * | indptr, |
| const unsigned * | indices, | ||
| const float * | data, | ||
| size_t | nindptr, | ||
| size_t | nelem, | ||
| size_t | num_col, | ||
| DMatrixHandle * | out | ||
| ) |
| XGB_DLL int XGDMatrixCreateFromCudaArrayInterface | ( | char const * | data, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create DMatrix from CUDA array.
| data | JSON encoded cuda_array_interface for array data. |
| config | JSON encoded configuration. Required values are:
|
| out | created dmatrix |
| XGB_DLL int XGDMatrixCreateFromCudaColumnar | ( | char const * | data, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create DMatrix from CUDA columnar format. (cuDF)
| data | Array of JSON encoded cuda_array_interface for each column. |
| config | JSON encoded configuration. Required values are:
|
| out | created dmatrix |
| XGB_DLL int XGDMatrixCreateFromDense | ( | char const * | data, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create a matrix from dense array.
| data | JSON encoded array_interface to array values. |
| config | JSON encoded configuration. Required values are:
|
| out | created dmatrix |
| XGB_DLL int XGDMatrixCreateFromDT | ( | void ** | data, |
| const char ** | feature_stypes, | ||
| bst_ulong | nrow, | ||
| bst_ulong | ncol, | ||
| DMatrixHandle * | out, | ||
| int | nthread | ||
| ) |
create matrix content from python data table
| data | pointer to pointer to column data |
| feature_stypes | pointer to strings |
| nrow | number of rows |
| ncol | number of columns |
| out | created dmatrix |
| nthread | number of threads (up to maximum cores available, if <=0 use all cores) |
| XGB_DLL int XGDMatrixCreateFromFile | ( | const char * | fname, |
| int | silent, | ||
| DMatrixHandle * | out | ||
| ) |
load a data matrix
| fname | the name of the file |
| silent | whether to print messages during loading |
| out | a loaded data matrix |
| XGB_DLL int XGDMatrixCreateFromMat | ( | const float * | data, |
| bst_ulong | nrow, | ||
| bst_ulong | ncol, | ||
| float | missing, | ||
| DMatrixHandle * | out | ||
| ) |
create matrix content from dense matrix
| data | pointer to the data space |
| nrow | number of rows |
| ncol | number of columns |
| missing | which value to represent missing value |
| out | created dmatrix |
| XGB_DLL int XGDMatrixCreateFromMat_omp | ( | const float * | data, |
| bst_ulong | nrow, | ||
| bst_ulong | ncol, | ||
| float | missing, | ||
| DMatrixHandle * | out, | ||
| int | nthread | ||
| ) |
create matrix content from dense matrix
| data | pointer to the data space |
| nrow | number of rows |
| ncol | number of columns |
| missing | which value to represent missing value |
| out | created dmatrix |
| nthread | number of threads (up to maximum cores available, if <=0 use all cores) |
| XGB_DLL int XGDMatrixCreateFromURI | ( | char const * | config, |
| DMatrixHandle * | out | ||
| ) |
load a data matrix
| config | JSON encoded parameters for DMatrix construction. Accepted fields are: |
format is required when loading text data. See the /tutorials/input_format tutorial for more info.
| out | a loaded data matrix |
| XGB_DLL int XGDMatrixFree | ( | DMatrixHandle | handle | ) |
free space in data matrix
| XGB_DLL int XGDMatrixGetDataAsCSR | ( | DMatrixHandle const | handle, |
| char const * | config, | ||
| bst_ulong * | out_indptr, | ||
| unsigned * | out_indices, | ||
| float * | out_data | ||
| ) |
Get the predictors from DMatrix as CSR matrix for testing.
If this is a quantized DMatrix, quantized values are returned instead.
Unlike most XGBoost C functions, the caller of XGDMatrixGetDataAsCSR is required to allocate the memory for the return buffer instead of using thread local memory from XGBoost. This is to avoid allocating a huge memory buffer that cannot be freed until exiting the thread.
| handle | the handle to the DMatrix |
| config | JSON configuration string. At the moment it should be an empty document, preserved for future use. |
| out_indptr | indptr of output CSR matrix. |
| out_indices | Column index of output CSR matrix. |
| out_data | Data value of CSR matrix. |
| XGB_DLL int XGDMatrixGetFloatInfo | ( | const DMatrixHandle | handle, |
| const char * | field, | ||
| bst_ulong * | out_len, | ||
| const float ** | out_dptr | ||
| ) |
get float info vector from matrix.
| handle | an instance of data matrix |
| field | field name |
| out_len | used to set result length |
| out_dptr | pointer to the result |
| XGB_DLL int XGDMatrixGetQuantileCut | ( | DMatrixHandle const | handle, |
| char const * | config, | ||
| char const ** | out_indptr, | ||
| char const ** | out_data | ||
| ) |
Export the quantile cuts used for training histogram-based models like hist and approx.
Useful for model compression.
| handle | the handle to the DMatrix |
| config | JSON configuration string. At the moment it should be an empty document, preserved for future use. |
| out_indptr | indptr of output CSC matrix represented by a JSON encoded __(cuda_)array_interface__. |
| out_data | Data value of CSC matrix represented by a JSON encoded __(cuda_)array_interface__. |
| XGB_DLL int XGDMatrixGetStrFeatureInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| bst_ulong * | size, | ||
| const char *** | out_features | ||
| ) |
Get string encoded information of all features.
Accepted fields are:
Caller is responsible for copying out the data, before next call to any API function of XGBoost.
| handle | An instance of data matrix |
| field | Field name |
| size | Size of output pointer features (number of strings returned). |
| out_features | Address of a pointer to array of strings. Result is stored in thread local memory. |
| XGB_DLL int XGDMatrixGetUIntInfo | ( | const DMatrixHandle | handle, |
| const char * | field, | ||
| bst_ulong * | out_len, | ||
| const unsigned ** | out_dptr | ||
| ) |
get uint32 info vector from matrix
| handle | an instance of data matrix |
| field | field name |
| out_len | The length of the field. |
| out_dptr | pointer to the result |
| XGB_DLL int XGDMatrixNumCol | ( | DMatrixHandle | handle, |
| bst_ulong * | out | ||
| ) |
get number of columns
| handle | the handle to the DMatrix |
| out | The output of number of columns |
| XGB_DLL int XGDMatrixNumNonMissing | ( | DMatrixHandle | handle, |
| bst_ulong * | out | ||
| ) |
Get number of valid values from DMatrix.
| handle | the handle to the DMatrix |
| out | The output of number of non-missing values |
| XGB_DLL int XGDMatrixNumRow | ( | DMatrixHandle | handle, |
| bst_ulong * | out | ||
| ) |
get number of rows.
| handle | the handle to the DMatrix |
| out | The address to hold number of rows. |
| XGB_DLL int XGDMatrixSaveBinary | ( | DMatrixHandle | handle, |
| const char * | fname, | ||
| int | silent | ||
| ) |
save a data matrix into binary file
| handle | an instance of data matrix |
| fname | file name |
| silent | print statistics when saving |
| XGB_DLL int XGDMatrixSetDenseInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| void const * | data, | ||
| bst_ulong | size, | ||
| int | type | ||
| ) |
Set meta info from dense matrix. Valid field names are:
| handle | An instance of data matrix |
| field | Field name |
| data | Pointer to consecutive memory storing data. |
| size | Size of the data, counted in elements of the given type (NOT the number of bytes). |
| type | Indicator of data type. This is defined in xgboost::DataType enum class.
|
| XGB_DLL int XGDMatrixSetFloatInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| const float * | array, | ||
| bst_ulong | len | ||
| ) |
set float vector to a content in info
| handle | an instance of data matrix |
| field | field name, can be label, weight |
| array | pointer to float vector |
| len | length of array |
| XGB_DLL int XGDMatrixSetGroup | ( | DMatrixHandle | handle, |
| const unsigned * | group, | ||
| bst_ulong | len | ||
| ) |
(deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
| handle | an instance of data matrix |
| group | pointer to group size |
| len | length of array |
| XGB_DLL int XGDMatrixSetInfoFromInterface | ( | DMatrixHandle | handle, |
| char const * | field, | ||
| char const * | c_interface_str | ||
| ) |
Set content in array interface to a content in info.
| handle | an instance of data matrix |
| field | field name. |
| c_interface_str | JSON string representation of array interface. |
| XGB_DLL int XGDMatrixSetStrFeatureInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| const char ** | features, | ||
| const bst_ulong | size | ||
| ) |
Set string encoded information of all features.
Accepted fields are:
| handle | An instance of data matrix |
| field | Field name |
| features | Pointer to array of strings. |
| size | Size of features pointer (number of strings passed in). |
| XGB_DLL int XGDMatrixSetUIntInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| const unsigned * | array, | ||
| bst_ulong | len | ||
| ) |
set uint32 vector to a content in info
| handle | an instance of data matrix |
| field | field name |
| array | pointer to unsigned int vector |
| len | length of array |
| XGB_DLL int XGDMatrixSliceDMatrix | ( | DMatrixHandle | handle, |
| const int * | idxset, | ||
| bst_ulong | len, | ||
| DMatrixHandle * | out | ||
| ) |
create a new dmatrix from sliced content of existing matrix
| handle | instance of data matrix to be sliced |
| idxset | index set |
| len | length of index set |
| out | a sliced new matrix |
| XGB_DLL int XGDMatrixSliceDMatrixEx | ( | DMatrixHandle | handle, |
| const int * | idxset, | ||
| bst_ulong | len, | ||
| DMatrixHandle * | out, | ||
| int | allow_groups | ||
| ) |
create a new dmatrix from sliced content of existing matrix
| handle | instance of data matrix to be sliced |
| idxset | index set |
| len | length of index set |
| out | a sliced new matrix |
| allow_groups | allow slicing of an array with groups |