release/cpp_api/_parallel_for_8h_source.html

 // ----------------------------------------------------------------------------

 // -                        Open3D: www.open3d.org                            -

 // ----------------------------------------------------------------------------

 // Copyright (c) 2018-2024 www.open3d.org

 // SPDX-License-Identifier: MIT

 // ----------------------------------------------------------------------------


 #pragma once


 #include <algorithm>
 #include <cstdint>
 #include <type_traits>
 #include <typeinfo>


 #include "open3d/core/Device.h"

 #include "open3d/utility/Logging.h"

 #include "open3d/utility/Overload.h"

 #include "open3d/utility/Parallel.h"

 #include "open3d/utility/Preprocessor.h"


 #ifdef __CUDACC__

 #include <cuda.h>

 #include <cuda_runtime.h>


 #include "open3d/core/CUDAUtils.h"

 #endif


 namespace open3d {

 namespace core {


 #ifdef __CUDACC__


 // Number of CUDA threads per block used when ParallelForCUDA_ launches
 // ElementWiseKernel_ (passed both as the block dimension of the launch and as
 // the kernel's block_size template argument).
 static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;

 // Maximum number of work items handled by each CUDA thread (the kernel's
 // thread_size template argument). One block therefore covers
 // OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD items.
 static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;


 /// CUDA kernel that applies \p f to work-item indices in [0, n).
 ///
 /// Each block covers block_size * thread_size consecutive indices. A thread
 /// starts at (blockIdx.x * block_size * thread_size + threadIdx.x) and then
 /// advances in steps of block_size, handling at most thread_size items and
 /// stopping at the first index that is not smaller than \p n.
 ///
 /// \tparam block_size Step between consecutive items of one thread; the
 /// launcher in this file passes the same value as the block dimension.
 /// \tparam thread_size Maximum number of items processed per thread.
 /// \param n Total number of work items.
 /// \param f Callable invoked on the device as f(int64_t index).
 template <int64_t block_size, int64_t thread_size, typename func_t>
 __global__ void ElementWiseKernel_(int64_t n, func_t f) {
     constexpr int64_t kItemsPerBlock = block_size * thread_size;
     // First item owned by this thread; int64_t arithmetic keeps the product
     // from wrapping at 32 bits.
     const int64_t first_item = blockIdx.x * kItemsPerBlock + threadIdx.x;
 #pragma unroll
     for (int64_t step = 0; step < thread_size; ++step) {
         const int64_t item = first_item + step * block_size;
         if (item >= n) {
             // Once an item is out of range, no further item of this thread
             // is processed.
             break;
         }
         f(item);
     }
 }


 /// Run a function in parallel on a CUDA device.
 ///
 /// Launches ElementWiseKernel_ on the stream returned by
 /// core::cuda::GetStream() with OPEN3D_PARFOR_BLOCK threads per block, each
 /// thread handling up to OPEN3D_PARFOR_THREAD indices. The launch is not
 /// followed by a synchronization in this function.
 ///
 /// \param device Device to run on. An error is reported through
 /// utility::LogError if its type is not CUDA.
 /// \param n Number of work items. Non-positive values are a no-op, matching
 /// the CPU implementation whose loop body never runs for n <= 0.
 /// \param func Callable passed by value to the kernel and invoked there as
 /// func(int64_t index) for every index in [0, n).
 template <typename func_t>
 void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
     if (device.GetType() != Device::DeviceType::CUDA) {
         utility::LogError("ParallelFor for CUDA cannot run on device {}.",
                           device.ToString());
     }
     // A negative n previously produced a zero or negative grid size and thus
     // an invalid launch; treat it like the empty range instead.
     if (n <= 0) {
         return;
     }

     CUDAScopedDevice scoped_device(device);
     constexpr int64_t items_per_block =
             OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
     // Ceiling division written so that it cannot overflow for large n
     // (n >= 1 is guaranteed here).
     const int64_t grid_size = (n - 1) / items_per_block + 1;

     // The launch configuration stores the grid x-dimension in 32 bits and
     // CUDA limits it to 2^31 - 1 blocks; without this check a larger value
     // would be silently narrowed and only part of the range processed.
     // (Like the device check above, this relies on LogError not returning.)
     if (grid_size > static_cast<int64_t>(INT32_MAX)) {
         utility::LogError(
                 "ParallelFor for CUDA cannot launch {} blocks for {} items.",
                 grid_size, n);
     }

     ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
             <<<static_cast<unsigned int>(grid_size),
                static_cast<unsigned int>(OPEN3D_PARFOR_BLOCK), 0,
                core::cuda::GetStream()>>>(n, func);
     // Kernel launches do not return an error code; query it explicitly.
     OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");
 }


 #else


 /// Run a function in parallel on CPU.
 ///
 /// \param device Device to run on. An error is reported through
 /// utility::LogError if it is not a CPU device.
 /// \param n Number of work items; nothing is executed when n is 0.
 /// \param func Callable invoked as func(int64_t index) for every index in
 /// [0, n). When OpenMP is enabled the iterations are distributed over
 /// utility::EstimateMaxThreads() threads, so calls may run concurrently.
 template <typename func_t>
 void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
     const bool runs_on_cpu = device.IsCPU();
     if (!runs_on_cpu) {
         utility::LogError("ParallelFor for CPU cannot run on device {}.",
                           device.ToString());
     }

     if (n != 0) {
 #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
         for (int64_t item = 0; item < n; ++item) {
             func(item);
         }
     }
 }


 #endif


 /// Run a function in parallel on CPU or CUDA.
 ///
 /// The backend is chosen at compile time: translation units compiled with
 /// __CUDACC__ defined forward to ParallelForCUDA_, all others forward to
 /// ParallelForCPU_. The selected backend validates \p device itself.
 ///
 /// \param device Device to run on.
 /// \param n Number of work items.
 /// \param func Callable invoked as func(int64_t index).
 template <typename func_t>
 void ParallelFor(const Device& device, int64_t n, const func_t& func) {
 #if defined(__CUDACC__)
     ParallelForCUDA_(device, n, func);
 #else
     ParallelForCPU_(device, n, func);
 #endif
 }


 /// Run a function, or its vectorized counterpart, in parallel.
 ///
 /// Compile-time dispatch:
 /// - __CUDACC__ defined: ParallelForCUDA_ runs \p func; \p vec_func is
 ///   unused.
 /// - otherwise, BUILD_ISPC_MODULE defined: [0, n) is cut into
 ///   utility::EstimateMaxThreads() chunks and \p vec_func is called once per
 ///   chunk through ParallelForCPU_; \p func is unused.
 /// - otherwise: ParallelForCPU_ runs \p func; \p vec_func is unused.
 ///
 /// \param device Device to run on (validated by the selected backend).
 /// \param n Number of work items.
 /// \param func Callable invoked as func(int64_t index).
 /// \param vec_func Callable invoked as vec_func(int64_t start, int64_t end)
 /// with start = n * i / num_threads and
 /// end = min(n * (i + 1) / num_threads, n) for chunk i.
 template <typename vec_func_t, typename func_t>
 void ParallelFor(const Device& device,
                  int64_t n,
                  const func_t& func,
                  const vec_func_t& vec_func) {
 #if defined(__CUDACC__)
     ParallelForCUDA_(device, n, func);
 #elif defined(BUILD_ISPC_MODULE)
     const int num_threads = utility::EstimateMaxThreads();
     // One chunk per estimated thread; each chunk is handed to the
     // vectorized function as a [start, end) pair.
     const auto run_chunk = [&](int64_t chunk) {
         const int64_t start = n * chunk / num_threads;
         const int64_t end =
                 std::min<int64_t>(n * (chunk + 1) / num_threads, n);
         vec_func(start, end);
     };
     ParallelForCPU_(device, num_threads, run_chunk);
 #else
     ParallelForCPU_(device, n, func);
 #endif
 }


 #ifdef BUILD_ISPC_MODULE

 // Internal helper macro.
 // ISPC-enabled variant: expands to a using-directive for namespace ispc
 // followed by the call ISPCKernel(start, end, <extra arguments>). Because the
 // expansion consists of two statements, it has to be used inside a braced
 // scope (the lambdas defined by the macros in this file provide one).
 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
     using namespace ispc;                                     \
     ISPCKernel(start, end, __VA_ARGS__);

 #else

 // Internal helper macro.
 // ISPC-disabled variant: the kernel is never called; start, end and the
 // extra arguments are discarded and an error naming the kernel is reported
 // through utility::LogError instead.
 // NOTE(review): OPEN3D_STRINGIFY comes from Preprocessor.h and presumably
 // turns the kernel name into a string literal - confirm there.
 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...)            \
     utility::LogError(                                                   \
             "ISPC module disabled. Unable to call vectorized kernel {}", \
             OPEN3D_STRINGIFY(ISPCKernel));

 #endif


 // Internal helper macro.
 // Expands to a by-reference-capturing lambda with the signature
 // (T, int64_t start, int64_t end). The first parameter is unnamed and only
 // serves to distinguish this lambda by type. The body forwards start, end and
 // the extra arguments to OPEN3D_CALL_ISPC_KERNEL_ with a kernel name built
 // from ISPCKernel, "_" and T.
 // NOTE(review): OPEN3D_CONCAT comes from Preprocessor.h and presumably
 // token-pastes its arguments (e.g. Kernel + _ + float -> Kernel_float) -
 // confirm there.
 #define OPEN3D_OVERLOADED_LAMBDA_(T, ISPCKernel, ...)                       \
     [&](T, int64_t start, int64_t end) {                                    \
         OPEN3D_CALL_ISPC_KERNEL_(                                           \
                 OPEN3D_CONCAT(ISPCKernel, OPEN3D_CONCAT(_, T)), start, end, \
                 __VA_ARGS__);                                               \
     }


 // Expands to a by-reference-capturing lambda (int64_t start, int64_t end)
 // that forwards start, end and the extra arguments to
 // OPEN3D_CALL_ISPC_KERNEL_ for the given kernel. The lambda has the
 // (start, end) call shape that the vectorized ParallelFor overload uses for
 // its vec_func argument.
 #define OPEN3D_VECTORIZED(ISPCKernel, ...)                             \
     [&](int64_t start, int64_t end) {                                  \
         OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \
     }


 // Expands to a by-reference-capturing lambda (int64_t start, int64_t end)
 // that calls the type-specific variant of an ISPC kernel for data type T.
 //
 // The lambda first static_asserts that T is an arithmetic type. It then
 // passes one OPEN3D_OVERLOADED_LAMBDA_ per listed type (bool, the 8/16/32/64
 // bit signed and unsigned integers, float, double) plus a generic catch-all
 // lambda to utility::Overload, and immediately invokes the result with
 // (T{}, start, end). The catch-all reports an error through
 // utility::LogError that contains typeid(generic).name() and the kernel
 // name.
 // NOTE(review): utility::Overload is declared in Overload.h; presumably it
 // merges the lambdas into one overload set so that the value-initialized T{}
 // selects the lambda for ISPCKernel_<T> - confirm there.
 #define OPEN3D_TEMPLATE_VECTORIZED(T, ISPCKernel, ...)                        \
     [&](int64_t start, int64_t end) {                                         \
         static_assert(std::is_arithmetic<T>::value,                           \
                       "Data type is not an arithmetic type");                 \
         utility::Overload(                                                    \
                 OPEN3D_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__),     \
                 OPEN3D_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, __VA_ARGS__),  \
                 OPEN3D_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, __VA_ARGS__),   \
                 OPEN3D_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, __VA_ARGS__), \
                 OPEN3D_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, __VA_ARGS__),  \
                 OPEN3D_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, __VA_ARGS__), \
                 OPEN3D_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, __VA_ARGS__),  \
                 OPEN3D_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, __VA_ARGS__), \
                 OPEN3D_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, __VA_ARGS__),  \
                 OPEN3D_OVERLOADED_LAMBDA_(float, ISPCKernel, __VA_ARGS__),    \
                 OPEN3D_OVERLOADED_LAMBDA_(double, ISPCKernel, __VA_ARGS__),   \
                 [&](auto&& generic, int64_t start, int64_t end) {             \
                     utility::LogError(                                        \
                             "Unsupported data type {} for calling "           \
                             "vectorized kernel {}",                           \
                             typeid(generic).name(),                           \
                             OPEN3D_STRINGIFY(ISPCKernel));                    \
                 })(T{}, start, end);                                          \
     }


 }  // namespace core

 }  // namespace open3d

CUDAUtils.h
Common CUDA utilities.

OPEN3D_GET_LAST_CUDA_ERROR
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:48

Device.h

Logging.h

LogError
#define LogError(...)
Definition: Logging.h:51

Overload.h

Parallel.h

Preprocessor.h

open3d::core::Device
Definition: Device.h:18

open3d::core::Device::DeviceType::CUDA
@ CUDA

open3d::core::Device::IsCPU
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:46

open3d::core::Device::ToString
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:88

open3d::core::ParallelForCPU_
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:73

open3d::core::ParallelFor
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:108

open3d::utility::EstimateMaxThreads
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:31

open3d
Definition: PinholeCameraIntrinsic.cpp:16