43 template <
typename func_t>
45 func_t element_kernel) {
47 #pragma omp parallel for schedule(static) 49 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
51 element_kernel(indexer.
GetInputPtr(0, workload_idx),
56 template <
typename func_t>
58 func_t element_kernel) {
60 #pragma omp parallel for schedule(static) 62 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
64 element_kernel(indexer.
GetInputPtr(0, workload_idx),
70 template <
typename func_t>
72 func_t element_kernel) {
74 #pragma omp parallel for schedule(static) 76 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
83 template <
typename scalar_t,
typename func_t>
85 func_t element_kernel) {
86 for (int64_t workload_idx = 0; workload_idx < indexer.
NumWorkloads();
88 element_kernel(indexer.
GetInputPtr(0, workload_idx),
95 template <
typename scalar_t,
typename func_t>
97 func_t element_kernel,
101 "Internal error: two-pass reduction only works for " 102 "single-output reduction ops.");
106 int64_t workload_per_thread =
107 (num_workloads + num_threads - 1) / num_threads;
108 std::vector<scalar_t> thread_results(num_threads, identity);
111 #pragma omp parallel for schedule(static) 113 for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
114 int64_t start = thread_idx * workload_per_thread;
115 int64_t end = std::min(start + workload_per_thread, num_workloads);
116 for (int64_t workload_idx = start; workload_idx < end;
118 element_kernel(indexer.
GetInputPtr(0, workload_idx),
119 &thread_results[thread_idx]);
123 for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
124 element_kernel(&thread_results[thread_idx], output_ptr);
128 template <
typename scalar_t,
typename func_t>
130 func_t element_kernel) {
133 const int64_t num_dims = indexer.
NumDims();
137 int64_t best_dim = num_dims - 1;
141 for (int64_t dim = best_dim; dim >= 0 && !indexer.
IsReductionDim(dim);
143 if (indexer_shape[dim] >= num_threads) {
146 }
else if (indexer_shape[dim] > indexer_shape[best_dim]) {
150 if (best_dim == -1) {
152 "Internal error: all dims are reduction dims, use " 153 "LaunchReductionKernelTwoPass instead.");
157 #pragma omp parallel for schedule(static) 159 for (int64_t i = 0; i < indexer_shape[best_dim]; ++i) {
162 LaunchReductionKernelSerial<scalar_t>(sub_indexer, element_kernel);
Definition: CPULauncher.h:41
bool IsReductionDim(int64_t dim) const
Returns true if the dim-th dimension is reduced.
Definition: Indexer.h:389
static void LaunchUnaryEWKernel(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:44
void LogError(const char *format, const Args &... args)
Definition: Console.h:174
int64_t NumWorkloads() const
Definition: Indexer.cpp:370
int GetMaxThreads()
Definition: ParallelUtil.h:33
static void LaunchReductionKernelTwoPass(const Indexer &indexer, func_t element_kernel, scalar_t identity)
Definition: CPULauncher.h:96
OPEN3D_HOST_DEVICE char * GetOutputPtr(int64_t workload_idx) const
Definition: Indexer.h:413
int64_t NumWorkloads() const
Definition: AdvancedIndexing.h:213
Definition: AdvancedIndexing.h:134
void ShrinkDim(int64_t dim, int64_t start, int64_t size)
Definition: Indexer.cpp:337
static void LaunchBinaryEWKernel(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:57
const int64_t * GetMasterShape() const
Definition: Indexer.h:314
int64_t NumOutputElements() const
Returns the number of output elements.
Definition: Indexer.cpp:378
int64_t NumDims() const
Returns number of dimensions of the Indexer.
Definition: Indexer.h:310
OPEN3D_HOST_DEVICE char * GetInputPtr(int64_t workload_idx) const
Definition: AdvancedIndexing.h:185
Definition: Open3DViewer.h:29
static void LaunchAdvancedIndexerKernel(const AdvancedIndexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:71
Definition: Indexer.h:260
OPEN3D_HOST_DEVICE char * GetOutputPtr(int64_t workload_idx) const
Definition: AdvancedIndexing.h:192
static void LaunchReductionParallelDim(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:129
static void LaunchReductionKernelSerial(const Indexer &indexer, func_t element_kernel)
Definition: CPULauncher.h:84
OPEN3D_HOST_DEVICE char * GetInputPtr(int64_t input_idx, int64_t workload_idx) const
Definition: Indexer.h:401