Open3D (C++ API)  0.19.0
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
ParallelFor.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.open3d.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <cstdint>
11 #include <type_traits>
12 
13 #include "open3d/core/Device.h"
14 #include "open3d/utility/Logging.h"
18 
19 #ifdef __CUDACC__
20 #include <cuda.h>
21 #include <cuda_runtime.h>
22 
23 #include "open3d/core/CUDAUtils.h"
24 #endif
25 
26 namespace open3d {
27 namespace core {
28 
29 #ifdef __CUDACC__
30 
/// Number of CUDA threads per block used when ParallelForCUDA_ launches
/// ElementWiseKernel_.
static constexpr int64_t OPEN3D_PARFOR_BLOCK{128};
/// Maximum number of indices each CUDA thread processes inside
/// ElementWiseKernel_ (the kernel's per-thread loop trip count).
static constexpr int64_t OPEN3D_PARFOR_THREAD{4};
33 
35 template <int64_t block_size, int64_t thread_size, typename func_t>
36 __global__ void ElementWiseKernel_(int64_t n, func_t f) {
37  int64_t items_per_block = block_size * thread_size;
38  int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
39 #pragma unroll
40  for (int64_t i = 0; i < thread_size; ++i) {
41  if (idx < n) {
42  f(idx);
43  idx += block_size;
44  }
45  }
46 }
47 
49 template <typename func_t>
50 void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
51  if (device.GetType() != Device::DeviceType::CUDA) {
52  utility::LogError("ParallelFor for CUDA cannot run on device {}.",
53  device.ToString());
54  }
55  if (n == 0) {
56  return;
57  }
58 
59  CUDAScopedDevice scoped_device(device);
60  int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
61  int64_t grid_size = (n + items_per_block - 1) / items_per_block;
62 
63  ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
64  <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
65  n, func);
66  OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");
67 }
68 
69 #else
70 
72 template <typename func_t>
73 void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
74  if (!device.IsCPU()) {
75  utility::LogError("ParallelFor for CPU cannot run on device {}.",
76  device.ToString());
77  }
78  if (n == 0) {
79  return;
80  }
81 
82 #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
83  for (int64_t i = 0; i < n; ++i) {
84  func(i);
85  }
86 }
87 
88 #endif
89 
107 template <typename func_t>
108 void ParallelFor(const Device& device, int64_t n, const func_t& func) {
109 #ifdef __CUDACC__
110  ParallelForCUDA_(device, n, func);
111 #else
112  ParallelForCPU_(device, n, func);
113 #endif
114 }
115 
162 template <typename vec_func_t, typename func_t>
163 void ParallelFor(const Device& device,
164  int64_t n,
165  const func_t& func,
166  const vec_func_t& vec_func) {
167 #ifdef BUILD_ISPC_MODULE
168 
169 #ifdef __CUDACC__
170  ParallelForCUDA_(device, n, func);
171 #else
172  int num_threads = utility::EstimateMaxThreads();
173  ParallelForCPU_(device, num_threads, [&](int64_t i) {
174  int64_t start = n * i / num_threads;
175  int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
176  vec_func(start, end);
177  });
178 #endif
179 
180 #else
181 
182 #ifdef __CUDACC__
183  ParallelForCUDA_(device, n, func);
184 #else
185  ParallelForCPU_(device, n, func);
186 #endif
187 
188 #endif
189 }
190 
// Internal helper macro.
//
// With BUILD_ISPC_MODULE defined it expands to a call of ISPCKernel (looked up
// with namespace ispc in scope) with the arguments start, end and any extra
// arguments. Without BUILD_ISPC_MODULE it expands to a utility::LogError call
// naming the kernel; start, end and the extra arguments are then not
// evaluated.
//
// The expansion already ends with a semicolon and, in the ISPC case, consists
// of two statements, so it is meant to be used as a statement inside a braced
// scope.
#ifndef BUILD_ISPC_MODULE

#define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...)            \
    utility::LogError(                                                   \
            "ISPC module disabled. Unable to call vectorized kernel {}", \
            OPEN3D_STRINGIFY(ISPCKernel));

#else

#define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
    using namespace ispc;                                     \
    ISPCKernel(start, end, __VA_ARGS__);

#endif
207 
// Internal helper macro.
//
// Expands to a by-reference-capturing lambda whose first (unnamed) parameter
// is a tag of type T and whose remaining parameters are the index range
// start, end. The lambda forwards the range and the extra arguments to the
// kernel whose name is ISPCKernel with the suffix _T appended.
#define OPEN3D_OVERLOADED_LAMBDA_(T, ISPCKernel, ...)             \
    [&](T, int64_t start, int64_t end) {                          \
        OPEN3D_CALL_ISPC_KERNEL_(                                 \
                OPEN3D_CONCAT(ISPCKernel, OPEN3D_CONCAT(_, T)),   \
                start, end, __VA_ARGS__);                         \
    }
215 
/// Expands to a lambda (capturing by reference) that takes an index range
/// start, end and passes it, followed by the extra macro arguments, to
/// \p ISPCKernel via OPEN3D_CALL_ISPC_KERNEL_.
///
/// The lambda has the (int64_t, int64_t) call signature that the vectorized
/// ParallelFor overload uses for its vec_func argument.
#define OPEN3D_VECTORIZED(ISPCKernel, ...)                              \
    [&](int64_t start, int64_t end) {                                   \
        OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__);  \
    }
229 
/// Type-dispatching variant of OPEN3D_VECTORIZED.
///
/// Expands to a lambda (capturing by reference) over an index range
/// start, end. The lambda statically asserts that \p T is an arithmetic type
/// and then builds a utility::Overload set with one entry per supported type
/// (bool, the 8/16/32/64-bit signed and unsigned integers, float, double).
/// Each entry calls the kernel named ISPCKernel with the suffix _<type>.
/// The set is invoked with a value-initialized T as tag, which selects the
/// entry for T. Any other type ends up in the generic entry, which logs an
/// error containing typeid(...).name() and the kernel name.
#define OPEN3D_TEMPLATE_VECTORIZED(T, ISPCKernel, ...)                        \
    [&](int64_t start, int64_t end) {                                         \
        static_assert(std::is_arithmetic<T>::value,                           \
                      "Data type is not an arithmetic type");                 \
        utility::Overload(                                                    \
                OPEN3D_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__),     \
                OPEN3D_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, __VA_ARGS__),   \
                OPEN3D_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(float, ISPCKernel, __VA_ARGS__),    \
                OPEN3D_OVERLOADED_LAMBDA_(double, ISPCKernel, __VA_ARGS__),   \
                [&](auto&& generic, int64_t start, int64_t end) {             \
                    utility::LogError(                                        \
                            "Unsupported data type {} for calling "           \
                            "vectorized kernel {}",                           \
                            typeid(generic).name(),                           \
                            OPEN3D_STRINGIFY(ISPCKernel));                    \
                })(T{}, start, end);                                          \
    }
267 
268 } // namespace core
269 } // namespace open3d
Common CUDA utilities.
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:48
#define LogError(...)
Definition: Logging.h:51
Definition: Device.h:18
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:46
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:88
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:73
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:108
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:31
Definition: PinholeCameraIntrinsic.cpp:16