// LaunchIndexFillKernel (name taken from the cross-reference entry
// "Definition: CPULauncher.h:52" in this listing).
// NOTE(review): this is an extracted excerpt; the signature line and the
// loop-increment / closing-brace lines are not shown here.
// Visible behavior: an OpenMP parallel-for with static scheduling over
// workload indices starting at 0 and bounded by indexer.NumWorkloads().
// Each iteration calls element_kernel with the pointer returned by
// indexer.GetInputPtr(0, workload_idx) and the workload index itself.
51 template <
typename func_t>
53 func_t element_kernel) {
54 #pragma omp parallel for schedule(static) 55 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
57 element_kernel(indexer.
GetInputPtr(0, workload_idx), workload_idx);
// LaunchUnaryEWKernel (name taken from the cross-reference entry
// "Definition: CPULauncher.h:62" in this listing).
// NOTE(review): extracted excerpt; the signature line, the remaining
// element_kernel argument(s) and the closing lines are not shown.
// Visible behavior: an OpenMP parallel-for with static scheduling over
// workload indices starting at 0 and bounded by indexer.NumWorkloads().
// The first argument passed to element_kernel is
// indexer.GetInputPtr(0, workload_idx).
// Presumably the omitted argument is the matching output pointer — TODO
// confirm against the full header.
61 template <
typename func_t>
63 func_t element_kernel) {
64 #pragma omp parallel for schedule(static) 65 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
67 element_kernel(indexer.
GetInputPtr(0, workload_idx),
// LaunchBinaryEWKernel (name taken from the cross-reference entry
// "Definition: CPULauncher.h:73" in this listing).
// NOTE(review): extracted excerpt; the signature line, the remaining
// element_kernel arguments and the closing lines are not shown.
// Visible behavior: an OpenMP parallel-for with static scheduling over
// workload indices starting at 0 and bounded by indexer.NumWorkloads().
// The first argument passed to element_kernel is
// indexer.GetInputPtr(0, workload_idx).
// Presumably the omitted arguments are the second input pointer and the
// output pointer — TODO confirm against the full header.
72 template <
typename func_t>
74 func_t element_kernel) {
75 #pragma omp parallel for schedule(static) 76 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
78 element_kernel(indexer.
GetInputPtr(0, workload_idx),
// LaunchAdvancedIndexerKernel (name taken from the cross-reference entry
// "Definition: CPULauncher.h:85" in this listing; that entry lists the
// indexer parameter as `const AdvancedIndexer &`).
// NOTE(review): extracted excerpt; the signature line and the whole loop
// body are not shown, so how element_kernel is invoked cannot be stated
// from this listing.
// Visible behavior: an OpenMP parallel-for with static scheduling over
// workload indices starting at 0 and bounded by indexer.NumWorkloads().
84 template <
typename func_t>
86 func_t element_kernel) {
87 #pragma omp parallel for schedule(static) 88 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
// LaunchReductionKernelSerial (name taken from the cross-reference entry
// "Definition: CPULauncher.h:96" in this listing).
// NOTE(review): extracted excerpt; the signature line, the remaining
// element_kernel argument(s) and the closing lines are not shown.
// Visible behavior: a plain loop with no OpenMP pragma over workload
// indices starting at 0 and bounded by indexer.NumWorkloads().
// The first argument passed to element_kernel is
// indexer.GetInputPtr(0, workload_idx).
// scalar_t is a template parameter; it is not used in the visible lines.
95 template <
typename scalar_t,
typename func_t>
97 func_t element_kernel) {
98 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
100 element_kernel(indexer.
GetInputPtr(0, workload_idx),
// LaunchReductionKernelTwoPass (name taken from the cross-reference entry
// "Definition: CPULauncher.h:108" in this listing, whose signature is
// (const Indexer &indexer, func_t element_kernel, scalar_t identity)).
// NOTE(review): extracted excerpt; the signature line, the guard
// condition for the error message, and the definitions of num_workloads,
// num_threads and output_ptr are not shown.
// Visible behavior:
//   1. An "Internal error" message states that this routine only
//      supports single-output reduction ops.
//   2. Workloads are split into contiguous chunks of
//      workload_per_thread, one chunk per thread index.
//   3. First pass (OpenMP parallel-for, static schedule): each thread
//      index reduces its chunk [start, end) into its own slot of
//      thread_results, which starts out filled with `identity`.
//   4. Second pass (serial): each slot of thread_results is folded into
//      output_ptr using the same element_kernel.
107 template <
typename scalar_t,
typename func_t>
109 func_t element_kernel,
113 "Internal error: two-pass reduction only works for " 114 "single-output reduction ops.");
// Ceiling division, so num_threads chunks cover all num_workloads items.
118 int64_t workload_per_thread =
119 (num_workloads + num_threads - 1) / num_threads;
// One partial-result slot per thread index, initialized to `identity`.
120 std::vector<scalar_t> thread_results(num_threads, identity);
// First pass: each thread index accumulates into its own slot only.
122 #pragma omp parallel for schedule(static) 123 for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
124 int64_t start = thread_idx * workload_per_thread;
// Clamp so the last chunk does not run past num_workloads.
125 int64_t end = std::min(start + workload_per_thread, num_workloads);
126 for (int64_t workload_idx = start; workload_idx < end;
128 element_kernel(indexer.
GetInputPtr(0, workload_idx),
129 &thread_results[thread_idx]);
// Second pass: combine the per-thread partial results into the output.
133 for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
134 element_kernel(&thread_results[thread_idx], output_ptr);
// LaunchReductionParallelDim (name taken from the cross-reference entry
// "Definition: CPULauncher.h:139" in this listing).
// NOTE(review): extracted excerpt; the signature line, the definitions
// of indexer_shape and num_threads, the statements inside the if/else
// branches, and the construction of sub_indexer are not shown.
// Visible behavior:
//   - best_dim starts at the last dimension (num_dims - 1).
//   - A loop walks dimensions downward from best_dim and stops at
//     dim < 0 or at the first dimension that is a reduction dimension.
//   - Inside the loop, a dimension whose extent is >= num_threads is
//     tested first; otherwise a dimension with a larger extent than the
//     current best_dim is tested. The branch bodies are not shown;
//     presumably they update best_dim — TODO confirm.
//   - best_dim == -1 is reported as an internal error ("all dims are
//     reduction dims"), pointing callers to LaunchReductionKernelTwoPass.
//   - Finally an OpenMP parallel-for with static scheduling iterates
//     over the extent of best_dim and runs
//     LaunchReductionKernelSerial<scalar_t> on sub_indexer per index.
// NOTE(review): no visible line can make best_dim equal -1 after it is
// initialized to num_dims - 1 (unless num_dims is 0); the statement that
// does so is presumably among the omitted lines — verify against the
// full header.
138 template <
typename scalar_t,
typename func_t>
140 func_t element_kernel) {
143 const int64_t num_dims = indexer.
NumDims();
147 int64_t best_dim = num_dims - 1;
151 for (int64_t dim = best_dim; dim >= 0 && !indexer.
IsReductionDim(dim);
153 if (indexer_shape[dim] >= num_threads) {
156 }
else if (indexer_shape[dim] > indexer_shape[best_dim]) {
160 if (best_dim == -1) {
162 "Internal error: all dims are reduction dims, use " 163 "LaunchReductionKernelTwoPass instead.");
166 #pragma omp parallel for schedule(static) 167 for (int64_t i = 0; i < indexer_shape[best_dim]; ++i) {
170 LaunchReductionKernelSerial<scalar_t>(sub_indexer, element_kernel);
// LaunchGeneralKernel (name taken from the cross-reference entry
// "Definition: CPULauncher.h:176" in this listing, whose signature is
// (int64_t n, func_t element_kernel) and which is described there as
// "General kernels with non-conventional indexers").
// NOTE(review): extracted excerpt; the signature line and the closing
// braces are not shown.
// Visible behavior: an OpenMP parallel-for with static scheduling over
// workload_idx in [0, n), calling element_kernel(workload_idx) each time.
175 template <
typename func_t>
177 #pragma omp parallel for schedule(static) 178 for (int64_t workload_idx = 0; workload_idx < n; ++workload_idx) {
179 element_kernel(workload_idx);
OPEN3D_HOST_DEVICE char * GetOutputPtr(int64_t workload_idx) const
Definition: Indexer.h:414
int64_t NumWorkloads() const
Definition: Indexer.cpp:371
int64_t NumWorkloads() const
Definition: AdvancedIndexing.h:213
bool IsReductionDim(int64_t dim) const
Returns true if the dim-th dimension is reduced.
Definition: Indexer.h:390
Definition: CPULauncher.h:42
OPEN3D_HOST_DEVICE char * GetOutputPtr(int64_t workload_idx) const
Definition: AdvancedIndexing.h:192
void LogError(const char *format, const Args &... args)
Definition: Console.h:176
static void LaunchReductionKernelTwoPass(const Indexer &indexer, func_t element_kernel, scalar_t identity)
Definition: CPULauncher.h:108
OPEN3D_HOST_DEVICE char * GetInputPtr(int64_t input_idx, int64_t workload_idx) const
Definition: Indexer.h:402
int GetMaxThreads()
Definition: ParallelUtil.h:33
Definition: AdvancedIndexing.h:135
static void LaunchReductionParallelDim(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:139
static void LaunchIndexFillKernel(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:52
static void LaunchGeneralKernel(int64_t n, func_t element_kernel)
General kernels with non-conventional indexers.
Definition: CPULauncher.h:176
int64_t NumDims() const
Returns number of dimensions of the Indexer.
Definition: Indexer.h:311
Definition: PinholeCameraIntrinsic.cpp:35
int64_t NumOutputElements() const
Returns the number of output elements.
Definition: Indexer.cpp:379
const int64_t * GetMasterShape() const
Definition: Indexer.h:315
Definition: Indexer.h:261
void ShrinkDim(int64_t dim, int64_t start, int64_t size)
Definition: Indexer.cpp:338
OPEN3D_HOST_DEVICE char * GetInputPtr(int64_t workload_idx) const
Definition: AdvancedIndexing.h:185
static void LaunchReductionKernelSerial(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:96
static void LaunchUnaryEWKernel(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:62
static void LaunchAdvancedIndexerKernel(const AdvancedIndexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:85
static void LaunchBinaryEWKernel(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:73