Medial Code Documentation
numeric.h
#ifndef XGBOOST_COMMON_NUMERIC_H_
#define XGBOOST_COMMON_NUMERIC_H_

#include <dmlc/common.h>  // OMPException

#include <algorithm>  // for std::max
#include <cstddef>    // for size_t
#include <cstdint>    // for int32_t
#include <iterator>   // for iterator_traits
#include <numeric>    // for accumulate
#include <vector>

#include "common.h"                      // AssertGPUSupport
#include "threading_utils.h"             // MemStackAllocator, DefaultMaxThreads
#include "xgboost/context.h"             // Context
#include "xgboost/host_device_vector.h"  // HostDeviceVector

namespace xgboost::common {

/**
 * \brief Run length encode on CPU, input must be sorted.
 */
template <typename Iter, typename Idx>
void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
  auto& out = *p_out;
  out = std::vector<Idx>{0};
  size_t n = std::distance(begin, end);
  for (size_t i = 1; i < n; ++i) {
    if (begin[i] != begin[i - 1]) {
      out.push_back(i);
    }
  }
  if (out.back() != n) {
    out.push_back(n);
  }
}
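// Usage sketch (illustrative; not part of the original header): RunLengthEncode turns a
// sorted range into CSR-style run boundaries.
//
//   std::vector<int> sorted{1, 1, 2, 2, 2, 5};
//   std::vector<std::size_t> offsets;
//   xgboost::common::RunLengthEncode(sorted.cbegin(), sorted.cend(), &offsets);
//   // offsets == {0, 2, 5, 6}; run i spans [offsets[i], offsets[i + 1]).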

/**
 * \brief Variant of std::partial_sum; out_it should point to a container that has n + 1
 *        elements.
 */
template <typename InIt, typename OutIt, typename T>
void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
  static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value);
  static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value);
  // The number of threads is pegged to the batch size. If the OMP block is parallelized
  // on anything other than the batch/block size, it should be reassigned.
  auto n = static_cast<size_t>(std::distance(begin, end));
  const size_t batch_threads =
      std::max(static_cast<size_t>(1), std::min(n, static_cast<size_t>(n_threads)));
  MemStackAllocator<T, DefaultMaxThreads()> partial_sums(batch_threads);

  size_t block_size = n / batch_threads;

  dmlc::OMPException exc;
#pragma omp parallel num_threads(batch_threads)
  {
#pragma omp for
    for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
      exc.Run([&]() {
        size_t ibegin = block_size * tid;
        size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1)));

        // Inclusive prefix sum within this thread's block, written one slot ahead.
        T running_sum = 0;
        for (size_t ridx = ibegin; ridx < iend; ++ridx) {
          running_sum += *(begin + ridx);
          *(out_it + 1 + ridx) = running_sum;
        }
      });
    }

#pragma omp single
    {
      exc.Run([&]() {
        // Per-block offsets: init plus the last local sum of each preceding block.
        partial_sums[0] = init;
        for (size_t i = 1; i < batch_threads; ++i) {
          partial_sums[i] = partial_sums[i - 1] + *(out_it + i * block_size);
        }
      });
    }

#pragma omp for
    for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
      exc.Run([&]() {
        size_t ibegin = block_size * tid;
        size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1)));

        // Shift each block's local sums by its block offset to get global sums.
        for (size_t i = ibegin; i < iend; ++i) {
          *(out_it + 1 + i) += partial_sums[tid];
        }
      });
    }
  }
  exc.Rethrow();
}
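// Usage sketch (illustrative; not part of the original header). With init = 0 and an
// output buffer of n + 1 elements, PartialSum builds a CSR-style row pointer. Note that
// the first output slot is never written by PartialSum, so it should be pre-set:
//
//   std::vector<std::size_t> sizes{3, 1, 4};
//   std::vector<std::size_t> indptr(sizes.size() + 1, 0);  // n + 1 elements, indptr[0] == 0
//   xgboost::common::PartialSum(/*n_threads=*/2, sizes.cbegin(), sizes.cend(),
//                               std::size_t{0}, indptr.begin());
//   // indptr == {0, 3, 4, 8}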

namespace cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
#if !defined(XGBOOST_USE_CUDA)
// Stub for CPU-only builds: calling the CUDA reduction without GPU support fails loudly.
inline double Reduce(Context const*, HostDeviceVector<float> const&) {
  AssertGPUSupport();
  return 0;
}
#endif  // !defined(XGBOOST_USE_CUDA)
}  // namespace cuda_impl

namespace cpu_impl {
// CPU reduction: one accumulator per thread, then a serial sum over the per-thread
// results. Since `init` seeds every accumulator as well as the final accumulate, it
// should be the additive identity (i.e. 0).
template <typename It, typename V = typename It::value_type>
V Reduce(Context const* ctx, It first, It second, V const& init) {
  std::size_t n = std::distance(first, second);
  auto n_threads = static_cast<std::size_t>(std::min(n, static_cast<std::size_t>(ctx->Threads())));
  common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(n_threads, init);
  common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
  auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init);
  return result;
}
}  // namespace cpu_impl
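// Usage sketch (illustrative; not part of the original header). A default-constructed
// Context is assumed here purely for exposition:
//
//   xgboost::Context ctx;
//   std::vector<double> values{1.0, 2.0, 3.0};
//   double total = xgboost::common::cpu_impl::Reduce(&ctx, values.cbegin(), values.cend(), 0.0);
//   // total == 6.0; pass 0 as `init` since it is added once per thread.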

/**
 * \brief Reduction on host device vector.
 */
double Reduce(Context const* ctx, HostDeviceVector<float> const& values);

template <typename It>
void Iota(Context const* ctx, It first, It last,
          typename std::iterator_traits<It>::value_type const& value) {
  auto n = std::distance(first, last);
  std::int32_t n_threads = ctx->Threads();
  const size_t block_size = n / n_threads + !!(n % n_threads);
  dmlc::OMPException exc;
#pragma omp parallel num_threads(n_threads)
  {
    exc.Run([&]() {
      // Each thread fills one contiguous block with consecutive values starting at
      // `value`, i.e. a parallel counterpart of std::iota.
      const size_t tid = omp_get_thread_num();
      const size_t ibegin = tid * block_size;
      const size_t iend = std::min(ibegin + block_size, static_cast<size_t>(n));
      for (size_t i = ibegin; i < iend; ++i) {
        first[i] = i + value;
      }
    });
  }
}
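// Usage sketch (illustrative; not part of the original header):
//
//   xgboost::Context ctx;
//   std::vector<std::size_t> ridx(4);
//   xgboost::common::Iota(&ctx, ridx.begin(), ridx.end(), std::size_t{1});
//   // ridx == {1, 2, 3, 4}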
}  // namespace xgboost::common

#endif  // XGBOOST_COMMON_NUMERIC_H_
Referenced definitions

dmlc::OMPException: OMP exception class; catches, saves and rethrows exceptions from OMP blocks. Definition common.h:53
dmlc::OMPException::Run(Function f, Parameters... params): parallel OMP blocks should be placed within Run to save the exception. Definition common.h:65
dmlc::OMPException::Rethrow(): should be called from the main thread to rethrow the saved exception. Definition common.h:84
HostDeviceVector: a device-and-host vector abstraction layer. Definition host_device_vector.h:87
MemStackAllocator: a C-style array with in-stack allocation; as long as the array is smaller than MaxStackSize, ... Definition threading_utils.h:270
Span: Definition span.h:77
PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it): variant of std::partial_sum; out_it should point to a container that has n + 1 elements. Definition numeric.h:46
DefaultMaxThreads() (std::int32_t constexpr): constant that can be used for initializing static thread-local memory. Definition threading_utils.h:310
Reduce(Context const* ctx, HostDeviceVector<float> const& values): reduction on a host device vector. Definition numeric.cc:13
RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out): run length encode on CPU; input must be sorted. Definition numeric.h:27
omp_ulong (dmlc::omp_ulong): unsigned long type for OpenMP loops. Definition base.h:322
Context: runtime context for XGBoost. Definition context.h:84
Context::Threads(): returns the automatically chosen number of threads based on the nthread parameter and the system setting. Definition context.cc:203
common.h: defines some common utility functions.