11 #include <type_traits>
21 #include <cuda_runtime.h>
31 static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;
32 static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;
35 template <
int64_t block_size,
int64_t thread_size,
typename func_t>
36 __global__
void ElementWiseKernel_(int64_t n, func_t f) {
37 int64_t items_per_block = block_size * thread_size;
38 int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
40 for (int64_t i = 0; i < thread_size; ++i) {
49 template <
typename func_t>
50 void ParallelForCUDA_(
const Device& device, int64_t n,
const func_t& func) {
59 CUDAScopedDevice scoped_device(device);
60 int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
61 int64_t grid_size = (n + items_per_block - 1) / items_per_block;
63 ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
64 <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
72 template <
typename func_t>
74 if (!device.
IsCPU()) {
82 #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
83 for (int64_t i = 0; i < n; ++i) {
102 template <
typename func_t>
105 ParallelForCUDA_(device, n, func);
157 template <
typename vec_func_t,
typename func_t>
161 const vec_func_t& vec_func) {
162 #ifdef BUILD_ISPC_MODULE
165 ParallelForCUDA_(device, n, func);
169 int64_t start = n * i / num_threads;
170 int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
171 vec_func(start, end);
178 ParallelForCUDA_(device, n, func);
186 #ifdef BUILD_ISPC_MODULE
189 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
190 using namespace ispc; \
191 ISPCKernel(start, end, __VA_ARGS__);
196 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
198 "ISPC module disabled. Unable to call vectorized kernel {}", \
199 OPEN3D_STRINGIFY(ISPCKernel));
204 #define OPEN3D_OVERLOADED_LAMBDA_(T, ISPCKernel, ...) \
205 [&](T, int64_t start, int64_t end) { \
206 OPEN3D_CALL_ISPC_KERNEL_( \
207 OPEN3D_CONCAT(ISPCKernel, OPEN3D_CONCAT(_, T)), start, end, \
220 #define OPEN3D_VECTORIZED(ISPCKernel, ...) \
221 [&](int64_t start, int64_t end) { \
222 OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \
238 #define OPEN3D_TEMPLATE_VECTORIZED(T, ISPCKernel, ...) \
239 [&](int64_t start, int64_t end) { \
240 static_assert(std::is_arithmetic<T>::value, \
241 "Data type is not an arithmetic type"); \
243 OPEN3D_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__), \
244 OPEN3D_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, __VA_ARGS__), \
245 OPEN3D_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, __VA_ARGS__), \
246 OPEN3D_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, __VA_ARGS__), \
247 OPEN3D_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, __VA_ARGS__), \
248 OPEN3D_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, __VA_ARGS__), \
249 OPEN3D_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, __VA_ARGS__), \
250 OPEN3D_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, __VA_ARGS__), \
251 OPEN3D_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, __VA_ARGS__), \
252 OPEN3D_OVERLOADED_LAMBDA_(float, ISPCKernel, __VA_ARGS__), \
253 OPEN3D_OVERLOADED_LAMBDA_(double, ISPCKernel, __VA_ARGS__), \
254 [&](auto&& generic, int64_t start, int64_t end) { \
256 "Unsupported data type {} for calling " \
257 "vectorized kernel {}", \
258 typeid(generic).name(), \
259 OPEN3D_STRINGIFY(ISPCKernel)); \
260 })(T{}, start, end); \
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:48
#define LogError(...)
Definition: Logging.h:48
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:46
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:88
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:73
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:103
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:31
Definition: PinholeCameraIntrinsic.cpp:16