threading_utils.h
#ifndef XGBOOST_COMMON_THREADING_UTILS_H_
#define XGBOOST_COMMON_THREADING_UTILS_H_

#include <dmlc/common.h>
#include <dmlc/omp.h>

#include <algorithm>    // for min
#include <cstddef>      // for size_t
#include <cstdint>      // for int32_t
#include <cstdlib>      // for malloc, free
#include <functional>   // for function
#include <new>          // for bad_alloc
#include <type_traits>  // for is_signed, conditional_t, is_integral_v, invoke_result_t
#include <vector>       // for vector

#include "xgboost/logging.h"

#if !defined(_OPENMP)
extern "C" {
inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; }  // NOLINT
}
#endif  // !defined(_OPENMP)

// MSVC doesn't implement the thread limit.
#if defined(_OPENMP) && defined(_MSC_VER)
#include <limits>

extern "C" {
inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); }  // NOLINT
}
#endif  // defined(_MSC_VER)
namespace xgboost::common {
// Represents a simple range of indices [begin, end).
// Inspired by tbb::blocked_range.
class Range1d {
 public:
  Range1d(size_t begin, size_t end) : begin_(begin), end_(end) {
    CHECK_LT(begin, end);
  }

  size_t begin() const {  // NOLINT
    return begin_;
  }

  size_t end() const {  // NOLINT
    return end_;
  }

 private:
  size_t begin_;
  size_t end_;
};

// Splits a 2D space into balanced blocks.
// The implementation is inspired by tbb::blocked_range2d. However, TBB only provides an
// (n x m) 2D range (a matrix) split into blocks. Example:
// [ 1,2,3 ]
// [ 4,5,6 ]
// [ 7,8,9 ]
// This class, in contrast, can handle 'rows' of different sizes. Example:
// [ 1,2 ]
// [ 3,4,5,6 ]
// [ 7,8,9 ]
// With grain_size = 2 it produces the following blocks:
// [1,2], [3,4], [5,6], [7,8], [9]
// The class helps to process data from several (usually unbalanced) tree nodes in parallel.
// Nested parallelism (over nodes and over the data within each node) improves CPU
// utilization. A usage sketch follows the class definition.
class BlockedSpace2d {
 public:
  // Example of a space:
  // [ 1,2 ]
  // [ 3,4,5,6 ]
  // [ 7,8,9 ]
  // BlockedSpace2d creates the following blocks (tasks) if grain_size = 2:
  // 1st block: first_dimension = 0, range of indices in the 'row' = [0,2) (covers values [1,2])
  // 2nd block: first_dimension = 1, range of indices in the 'row' = [0,2) (covers values [3,4])
  // 3rd block: first_dimension = 1, range of indices in the 'row' = [2,4) (covers values [5,6])
  // 4th block: first_dimension = 2, range of indices in the 'row' = [0,2) (covers values [7,8])
  // 5th block: first_dimension = 2, range of indices in the 'row' = [2,3) (covers value [9])
  // Arguments:
  // dim1             - size of the first dimension of the space
  // getter_size_dim2 - functor returning the size of the second dimension for a given 'row' index
  // grain_size       - maximum size of the produced blocks
  template <typename Getter>
  BlockedSpace2d(std::size_t dim1, Getter&& getter_size_dim2, std::size_t grain_size) {
    static_assert(std::is_integral_v<std::invoke_result_t<Getter, std::size_t>>);
    for (std::size_t i = 0; i < dim1; ++i) {
      std::size_t size = getter_size_dim2(i);
      // Each row (second dimension) is divided into n_blocks.
      std::size_t n_blocks = size / grain_size + !!(size % grain_size);
      for (std::size_t iblock = 0; iblock < n_blocks; ++iblock) {
        std::size_t begin = iblock * grain_size;
        std::size_t end = std::min(begin + grain_size, size);
        AddBlock(i, begin, end);
      }
    }
  }

  // Number of blocks (tasks) in the space.
  [[nodiscard]] std::size_t Size() const {
    return ranges_.size();
  }

  // Get the index of the first dimension of the i-th block (task).
  [[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const {
    CHECK_LT(i, first_dimension_.size());
    return first_dimension_[i];
  }

  // Get the range of indices for the second dimension of the i-th block (task).
  [[nodiscard]] Range1d GetRange(std::size_t i) const {
    CHECK_LT(i, ranges_.size());
    return ranges_[i];
  }

 private:
  void AddBlock(std::size_t first_dim, std::size_t begin, std::size_t end) {
    first_dimension_.push_back(first_dim);
    ranges_.emplace_back(begin, end);
  }

  std::vector<Range1d> ranges_;
  std::vector<std::size_t> first_dimension_;
};
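// Usage sketch (illustrative only, not part of the original header): enumerate the blocks
// produced for the unbalanced example above. `row_sizes` is a hypothetical name; assumes
// <iostream> and <vector> are available.
//
//   std::vector<std::size_t> row_sizes{2, 4, 3};
//   BlockedSpace2d space{row_sizes.size(),
//                        [&](std::size_t i) { return row_sizes[i]; },
//                        /*grain_size=*/2};
//   for (std::size_t b = 0; b < space.Size(); ++b) {
//     Range1d r = space.GetRange(b);
//     // Prints: (0,[0,2)) (1,[0,2)) (1,[2,4)) (2,[0,2)) (2,[2,3))
//     std::cout << "(" << space.GetFirstDimension(b) << ",[" << r.begin() << ","
//               << r.end() << ")) ";
//   }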
// Wrapper to implement nested parallelism with a simple `omp parallel for`.
template <typename Func>
void ParallelFor2d(const BlockedSpace2d& space, int n_threads, Func&& func) {
  static_assert(std::is_void_v<std::invoke_result_t<Func, std::size_t, Range1d>>);
  std::size_t n_blocks_in_space = space.Size();
  CHECK_GE(n_threads, 1);

  dmlc::OMPException exc;
#pragma omp parallel num_threads(n_threads)
  {
    exc.Run([&]() {
      std::size_t tid = omp_get_thread_num();
      std::size_t chunk_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads);

      std::size_t begin = chunk_size * tid;
      std::size_t end = std::min(begin + chunk_size, n_blocks_in_space);
      for (auto i = begin; i < end; i++) {
        func(space.GetFirstDimension(i), space.GetRange(i));
      }
    });
  }
  exc.Rethrow();
}
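// Usage sketch (illustrative only): nested parallelism over the blocks of a BlockedSpace2d.
// `row_sizes` and `ProcessElement` are hypothetical names.
//
//   std::vector<std::size_t> row_sizes{2, 4, 3};
//   BlockedSpace2d space{row_sizes.size(),
//                        [&](std::size_t i) { return row_sizes[i]; },
//                        /*grain_size=*/2};
//   ParallelFor2d(space, /*n_threads=*/4, [&](std::size_t node, Range1d range) {
//     for (std::size_t i = range.begin(); i < range.end(); ++i) {
//       ProcessElement(node, i);  // hypothetical per-element work
//     }
//   });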
/**
 * \brief OpenMP schedule.
 */
struct Sched {
  enum {
    kAuto,
    kDynamic,
    kStatic,
    kGuided,
  } sched;
  size_t chunk{0};

  Sched static Auto() { return Sched{kAuto}; }
  Sched static Dyn(size_t n = 0) { return Sched{kDynamic, n}; }
  Sched static Static(size_t n = 0) { return Sched{kStatic, n}; }
  Sched static Guided() { return Sched{kGuided}; }
};
template <typename Index, typename Func>
void ParallelFor(Index size, int32_t n_threads, Sched sched, Func fn) {
#if defined(_MSC_VER)
  // msvc doesn't support unsigned integer as openmp index.
  using OmpInd = std::conditional_t<std::is_signed<Index>::value, Index, omp_ulong>;
#else
  using OmpInd = Index;
#endif
  OmpInd length = static_cast<OmpInd>(size);
  CHECK_GE(n_threads, 1);

  dmlc::OMPException exc;
  switch (sched.sched) {
    case Sched::kAuto: {
#pragma omp parallel for num_threads(n_threads)
      for (OmpInd i = 0; i < length; ++i) {
        exc.Run(fn, i);
      }
      break;
    }
    case Sched::kDynamic: {
      if (sched.chunk == 0) {
#pragma omp parallel for num_threads(n_threads) schedule(dynamic)
        for (OmpInd i = 0; i < length; ++i) {
          exc.Run(fn, i);
        }
      } else {
#pragma omp parallel for num_threads(n_threads) schedule(dynamic, sched.chunk)
        for (OmpInd i = 0; i < length; ++i) {
          exc.Run(fn, i);
        }
      }
      break;
    }
    case Sched::kStatic: {
      if (sched.chunk == 0) {
#pragma omp parallel for num_threads(n_threads) schedule(static)
        for (OmpInd i = 0; i < length; ++i) {
          exc.Run(fn, i);
        }
      } else {
#pragma omp parallel for num_threads(n_threads) schedule(static, sched.chunk)
        for (OmpInd i = 0; i < length; ++i) {
          exc.Run(fn, i);
        }
      }
      break;
    }
    case Sched::kGuided: {
#pragma omp parallel for num_threads(n_threads) schedule(guided)
      for (OmpInd i = 0; i < length; ++i) {
        exc.Run(fn, i);
      }
      break;
    }
  }
  exc.Rethrow();
}
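// Usage sketch (illustrative only): fill a buffer in parallel with an explicit OpenMP
// schedule. `n`, `n_threads`, `out`, and `Compute` are hypothetical; dynamic scheduling
// with a chunk of 64 is an arbitrary choice for the sketch.
//
//   std::vector<double> out(n);
//   ParallelFor(out.size(), n_threads, Sched::Dyn(64), [&](std::size_t i) {
//     out[i] = Compute(i);
//   });
//   // The overload below, without a Sched argument, defaults to Sched::Static():
//   ParallelFor(out.size(), n_threads, [&](std::size_t i) { out[i] = Compute(i); });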
template <typename Index, typename Func>
void ParallelFor(Index size, int32_t n_threads, Func fn) {
  ParallelFor(size, n_threads, Sched::Static(), fn);
}
inline std::int32_t OmpGetThreadLimit() {
  std::int32_t limit = omp_get_thread_limit();
  CHECK_GE(limit, 1) << "Invalid thread limit for OpenMP.";
  return limit;
}

/**
 * \brief Get thread limit from CFS (the Linux Completely Fair Scheduler).
 */
std::int32_t GetCfsCPUCount() noexcept;

/**
 * \brief Get the number of available threads based on n_threads specified by users.
 */
std::int32_t OmpGetNumThreads(std::int32_t n_threads);
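// Usage sketch (illustrative only): resolve a user-supplied thread count into the number of
// threads actually used before entering a parallel region. `user_n_threads` and `n` are
// hypothetical; the exact resolution rules are implemented in threading_utils.cc.
//
//   std::int32_t n_threads = OmpGetNumThreads(user_n_threads);
//   ParallelFor(n, n_threads, [&](std::size_t i) { /* per-index work */ });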
/**
 * \brief A C-style array with in-stack allocation. As long as the array is smaller than
 *        MaxStackSize, it is allocated on the stack; otherwise the storage is obtained
 *        from the heap via malloc.
 */
template <typename T, std::size_t MaxStackSize>
class MemStackAllocator {
 public:
  explicit MemStackAllocator(size_t required_size) : required_size_(required_size) {
    if (MaxStackSize >= required_size_) {
      ptr_ = stack_mem_;
    } else {
      ptr_ = reinterpret_cast<T*>(std::malloc(required_size_ * sizeof(T)));
    }
    if (!ptr_) {
      throw std::bad_alloc{};
    }
  }
  MemStackAllocator(size_t required_size, T init) : MemStackAllocator{required_size} {
    std::fill_n(ptr_, required_size_, init);
  }

  ~MemStackAllocator() {
    if (required_size_ > MaxStackSize) {
      std::free(ptr_);
    }
  }
  T& operator[](size_t i) { return ptr_[i]; }
  T const& operator[](size_t i) const { return ptr_[i]; }

  auto data() const { return ptr_; }                    // NOLINT
  auto data() { return ptr_; }                          // NOLINT
  std::size_t size() const { return required_size_; }   // NOLINT

  auto cbegin() const { return data(); }          // NOLINT
  auto cend() const { return data() + size(); }   // NOLINT

 private:
  T* ptr_ = nullptr;
  size_t required_size_;
  T stack_mem_[MaxStackSize];
};
/**
 * \brief Constant that can be used for initializing static thread local memory.
 */
std::int32_t constexpr DefaultMaxThreads() { return 128; }
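// Usage sketch (illustrative only): a per-thread accumulation buffer that stays on the stack
// as long as no more than DefaultMaxThreads() threads are used, falling back to the heap
// otherwise. `n` and `n_threads` are hypothetical.
//
//   MemStackAllocator<double, DefaultMaxThreads()> partial_sums(n_threads, 0.0);
//   ParallelFor(n, n_threads, [&](std::size_t i) {
//     partial_sums[omp_get_thread_num()] += 1.0;  // each thread writes only its own slot
//   });
//   double total = 0.0;
//   for (std::size_t t = 0; t < partial_sums.size(); ++t) { total += partial_sums[t]; }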
}  // namespace xgboost::common

#endif  // XGBOOST_COMMON_THREADING_UTILS_H_