Open3D (C++ API)  0.18.0
ParallelFor.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2023 www.open3d.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <cstdint>
11 #include <type_traits>
12 
13 #include "open3d/core/Device.h"
14 #include "open3d/utility/Logging.h"
18 
19 #ifdef __CUDACC__
20 #include <cuda.h>
21 #include <cuda_runtime.h>
22 
23 #include "open3d/core/CUDAUtils.h"
24 #endif
25 
26 namespace open3d {
27 namespace core {
28 
29 #ifdef __CUDACC__
30 
// Launch parameters used by ParallelForCUDA_ when instantiating and launching
// ElementWiseKernel_.
// Number of threads per block (passed as the block dimension of the launch).
static constexpr int64_t OPEN3D_PARFOR_BLOCK{128};
// Maximum number of indices each thread processes inside the kernel.
static constexpr int64_t OPEN3D_PARFOR_THREAD{4};
33 
35 template <int64_t block_size, int64_t thread_size, typename func_t>
36 __global__ void ElementWiseKernel_(int64_t n, func_t f) {
37  int64_t items_per_block = block_size * thread_size;
38  int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
39 #pragma unroll
40  for (int64_t i = 0; i < thread_size; ++i) {
41  if (idx < n) {
42  f(idx);
43  idx += block_size;
44  }
45  }
46 }
47 
49 template <typename func_t>
50 void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
51  if (device.GetType() != Device::DeviceType::CUDA) {
52  utility::LogError("ParallelFor for CUDA cannot run on device {}.",
53  device.ToString());
54  }
55  if (n == 0) {
56  return;
57  }
58 
59  CUDAScopedDevice scoped_device(device);
60  int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
61  int64_t grid_size = (n + items_per_block - 1) / items_per_block;
62 
63  ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
64  <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
65  n, func);
66  OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");
67 }
68 
69 #else
70 
72 template <typename func_t>
73 void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
74  if (!device.IsCPU()) {
75  utility::LogError("ParallelFor for CPU cannot run on device {}.",
76  device.ToString());
77  }
78  if (n == 0) {
79  return;
80  }
81 
82 #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
83  for (int64_t i = 0; i < n; ++i) {
84  func(i);
85  }
86 }
87 
88 #endif
89 
102 template <typename func_t>
103 void ParallelFor(const Device& device, int64_t n, const func_t& func) {
104 #ifdef __CUDACC__
105  ParallelForCUDA_(device, n, func);
106 #else
107  ParallelForCPU_(device, n, func);
108 #endif
109 }
110 
157 template <typename vec_func_t, typename func_t>
158 void ParallelFor(const Device& device,
159  int64_t n,
160  const func_t& func,
161  const vec_func_t& vec_func) {
162 #ifdef BUILD_ISPC_MODULE
163 
164 #ifdef __CUDACC__
165  ParallelForCUDA_(device, n, func);
166 #else
167  int num_threads = utility::EstimateMaxThreads();
168  ParallelForCPU_(device, num_threads, [&](int64_t i) {
169  int64_t start = n * i / num_threads;
170  int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
171  vec_func(start, end);
172  });
173 #endif
174 
175 #else
176 
177 #ifdef __CUDACC__
178  ParallelForCUDA_(device, n, func);
179 #else
180  ParallelForCPU_(device, n, func);
181 #endif
182 
183 #endif
184 }
185 
#ifndef BUILD_ISPC_MODULE

// Internal helper macro (ISPC disabled): instead of calling the kernel, logs
// an error that names it. The range and extra arguments are ignored.
#define OPEN3D_CALL_ISPC_KERNEL_(Kernel, first, last, ...)               \
    utility::LogError(                                                    \
            "ISPC module disabled. Unable to call vectorized kernel {}", \
            OPEN3D_STRINGIFY(Kernel));

#else

// Internal helper macro (ISPC enabled): brings namespace ispc into scope and
// calls the kernel with the range followed by the extra arguments.
#define OPEN3D_CALL_ISPC_KERNEL_(Kernel, first, last, ...) \
    using namespace ispc;                                  \
    Kernel(first, last, __VA_ARGS__);

#endif
202 
// Internal helper macro: yields a lambda whose first (unnamed) parameter of
// type Scalar serves only for overload selection. The lambda forwards
// (start, end, extra arguments) to the kernel named by
// OPEN3D_CONCAT(KernelBase, OPEN3D_CONCAT(_, Scalar)) -- presumably
// KernelBase followed by "_" and the type name; confirm against the
// definition of OPEN3D_CONCAT.
#define OPEN3D_OVERLOADED_LAMBDA_(Scalar, KernelBase, ...)                \
    [&](Scalar, int64_t start, int64_t end) {                             \
        OPEN3D_CALL_ISPC_KERNEL_(                                         \
                OPEN3D_CONCAT(KernelBase, OPEN3D_CONCAT(_, Scalar)),      \
                start, end, __VA_ARGS__);                                 \
    }
210 
// Wraps a kernel into a by-reference-capturing lambda taking a
// (start, end) index range. The lambda hands the range and the extra macro
// arguments to OPEN3D_CALL_ISPC_KERNEL_.
#define OPEN3D_VECTORIZED(Kernel, ...)                              \
    [&](int64_t start, int64_t end) {                               \
        OPEN3D_CALL_ISPC_KERNEL_(Kernel, start, end, __VA_ARGS__);  \
    }
224 
// Builds a by-reference-capturing lambda taking a (start, end) index range
// that dispatches on the type Scalar.
//
// Scalar must satisfy std::is_arithmetic (checked with static_assert). A
// value-initialized Scalar selects one of the per-type lambdas produced by
// OPEN3D_OVERLOADED_LAMBDA_; any type without a dedicated overload falls
// through to the generic lambda, which logs an error naming the type and the
// kernel.
#define OPEN3D_TEMPLATE_VECTORIZED(Scalar, KernelBase, ...)                   \
    [&](int64_t start, int64_t end) {                                         \
        static_assert(std::is_arithmetic<Scalar>::value,                      \
                      "Data type is not an arithmetic type");                 \
        utility::Overload(                                                    \
                OPEN3D_OVERLOADED_LAMBDA_(bool, KernelBase, __VA_ARGS__),     \
                OPEN3D_OVERLOADED_LAMBDA_(uint8_t, KernelBase, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(int8_t, KernelBase, __VA_ARGS__),   \
                OPEN3D_OVERLOADED_LAMBDA_(uint16_t, KernelBase, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int16_t, KernelBase, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(uint32_t, KernelBase, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int32_t, KernelBase, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(uint64_t, KernelBase, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int64_t, KernelBase, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(float, KernelBase, __VA_ARGS__),    \
                OPEN3D_OVERLOADED_LAMBDA_(double, KernelBase, __VA_ARGS__),   \
                [&](auto&& unmatched, int64_t start, int64_t end) {           \
                    utility::LogError(                                        \
                            "Unsupported data type {} for calling "           \
                            "vectorized kernel {}",                           \
                            typeid(unmatched).name(),                         \
                            OPEN3D_STRINGIFY(KernelBase));                    \
                })(Scalar{}, start, end);                                     \
    }
262 
263 } // namespace core
264 } // namespace open3d
Common CUDA utilities.
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:48
#define LogError(...)
Definition: Logging.h:48
Definition: Device.h:18
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:46
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:88
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:73
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:103
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:31
Definition: PinholeCameraIntrinsic.cpp:16