#include <tbb/parallel_for.h>
#include <tbb/parallel_sort.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

// Open3D helpers used below. The header paths are assumed from the Open3D
// source tree (MiniVec and the relaxed atomic add used in this file).
#include "open3d/core/Atomic.h"
#include "open3d/utility/MiniVec.h"

namespace open3d {
namespace ml {
namespace impl {

/// Voxelizes a batch of point clouds on the CPU.
///
/// \param num_points            Total number of points over all batch items.
/// \param points                Point positions with shape [num_points, NDIM].
/// \param batch_size            Number of batch items (point clouds).
/// \param row_splits            Prefix sum of size batch_size + 1 defining the
///                              range of points for each batch item.
/// \param voxel_size            Edge lengths of a voxel (size NDIM).
/// \param points_range_min      Lower bound of the region to voxelize.
/// \param points_range_max      Upper bound of the region to voxelize.
/// \param max_points_per_voxel  Maximum number of point indices stored per voxel.
/// \param max_voxels            Maximum number of voxels per batch item.
/// \param output_allocator      Object providing the Alloc* functions used
///                              below for allocating the output arrays.
template <class T, int NDIM, class OUTPUT_ALLOCATOR>
void VoxelizeCPU(const size_t num_points,
                 const T* const points,
                 const size_t batch_size,
                 const int64_t* const row_splits,
                 const T* const voxel_size,
                 const T* const points_range_min,
                 const T* const points_range_max,
                 const int64_t max_points_per_voxel,
                 const int64_t max_voxels,
                 OUTPUT_ALLOCATOR& output_allocator) {
    using namespace open3d::utility;
    typedef MiniVec<T, NDIM> Vec_t;

    const Vec_t inv_voxel_size = T(1) / Vec_t(voxel_size);
    const Vec_t points_range_min_vec(points_range_min);
    const Vec_t points_range_max_vec(points_range_max);

    // Number of voxels along each dimension and the strides for converting
    // integer voxel coordinates to a linear index.
    MiniVec<int32_t, NDIM> extents =
            ceil((points_range_max_vec - points_range_min_vec) * inv_voxel_size)
                    .template cast<int32_t>();
    MiniVec<int64_t, NDIM> strides;
    for (int i = 0; i < NDIM; ++i) {
        strides[i] = 1;
        for (int j = 0; j < i; ++j) {
            strides[i] *= extents[j];
        }
    }
    // Each batch item gets its own hash range; invalid_hash marks points that
    // lie outside the voxelization region and sorts them to the end.
    const int64_t batch_hash = strides[NDIM - 1] * extents[NDIM - 1];
    const int64_t invalid_hash = batch_hash * batch_size;

    // Map each point index to its batch item.
    std::vector<int64_t> indices_batches(num_points, 0);
    tbb::parallel_for(tbb::blocked_range<int64_t>(0, batch_size),
                      [&](const tbb::blocked_range<int64_t>& r) {
                          for (int64_t i = r.begin(); i != r.end(); ++i) {
                              for (int64_t idx = row_splits[i];
                                   idx < row_splits[i + 1]; ++idx) {
                                  indices_batches[idx] = i;
                              }
                          }
                      });

    // Integer voxel coordinates of a point.
    auto CoordFn = [&](const Vec_t& point) {
        auto coords = ((point - points_range_min_vec) * inv_voxel_size)
                              .template cast<int64_t>();
        return coords;
    };

    // Linear voxel hash of a point, offset by its batch item. Points outside
    // the range are mapped to invalid_hash.
    auto HashFn = [&](const Vec_t& point, const int64_t& idx) {
        if ((point >= points_range_min_vec && point <= points_range_max_vec)
                    .all()) {
            auto coords = CoordFn(point);
            int64_t hash = coords.dot(strides);
            hash += indices_batches[idx] * batch_hash;
            return hash;
        }
        return invalid_hash;
    };

    std::vector<std::pair<int64_t, int64_t>> hashes_indices(num_points);
    std::vector<uint64_t> num_voxels(batch_size, 0);

    // Compute the hash for every point and remember its original index.
    tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_points),
                      [&](const tbb::blocked_range<int64_t>& r) {
                          for (int64_t i = r.begin(); i != r.end(); ++i) {
                              Vec_t pos(points + NDIM * i);
                              hashes_indices[i].first = HashFn(pos, i);
                              hashes_indices[i].second = i;
                          }
                      });

    // Sorting groups points of the same voxel together and orders voxels by
    // batch item; points with invalid_hash end up at the end.
    tbb::parallel_sort(hashes_indices);

    // Count the number of voxels per batch item.
    tbb::parallel_for(
            tbb::blocked_range<int64_t>(0, hashes_indices.size()),
            [&](const tbb::blocked_range<int64_t>& r) {
                for (int64_t i = r.begin(); i != r.end(); ++i) {
                    int64_t batch_id = hashes_indices[i].first / batch_hash;
                    if (batch_id >= int64_t(batch_size)) break;
                    if (i == 0) {
                        core::AtomicFetchAddRelaxed(&num_voxels[batch_id], 1);
                    } else {
                        int64_t batch_id_prev =
                                hashes_indices[i - 1].first / batch_hash;
                        if ((batch_id != batch_id_prev) ||
                            (hashes_indices[i].first !=
                             hashes_indices[i - 1].first)) {
                            core::AtomicFetchAddRelaxed(&num_voxels[batch_id],
                                                        1);
                        }
                    }
                }
            });

    // Clip the number of voxels per batch item to max_voxels.
    tbb::parallel_for(tbb::blocked_range<int64_t>(0, batch_size),
                      [&](const tbb::blocked_range<int64_t>& r) {
                          for (int64_t i = r.begin(); i != r.end(); ++i) {
                              num_voxels[i] = std::min(int64_t(num_voxels[i]),
                                                       max_voxels);
                          }
                      });

    // Prefix sum over the number of voxels per batch item.
    int64_t* out_batch_splits = nullptr;
    output_allocator.AllocVoxelBatchSplits(&out_batch_splits, batch_size + 1);
    out_batch_splits[0] = 0;
    for (int64_t i = 1; i < int64_t(batch_size) + 1; ++i) {
        out_batch_splits[i] = out_batch_splits[i - 1] + num_voxels[i - 1];
    }

    uint64_t total_voxels = out_batch_splits[batch_size];

    int32_t* out_voxel_coords = nullptr;
    output_allocator.AllocVoxelCoords(&out_voxel_coords, total_voxels, NDIM);

    int64_t* out_voxel_row_splits = nullptr;
    output_allocator.AllocVoxelPointRowSplits(&out_voxel_row_splits,
                                              total_voxels + 1);

    std::vector<int64_t> tmp_point_indices;

    // Write the voxel coordinates and collect the point indices per voxel.
    int64_t hash_i = 0;  // Index into the sorted hashes_indices vector.
    for (int64_t voxel_i = 0; voxel_i < int64_t(total_voxels); ++voxel_i) {
        // Voxel coordinates and row split for this voxel.
        auto coord = CoordFn(
                Vec_t(points + hashes_indices[hash_i].second * NDIM));
        for (int d = 0; d < NDIM; ++d) {
            out_voxel_coords[voxel_i * NDIM + d] = coord[d];
        }
        out_voxel_row_splits[voxel_i] = tmp_point_indices.size();

        // Collect up to max_points_per_voxel point indices for this voxel.
        int64_t points_per_voxel = 0;
        const int64_t current_hash = hashes_indices[hash_i].first;
        int64_t batch_id = current_hash / batch_hash;
        num_voxels[batch_id]--;
        for (; hash_i < int64_t(hashes_indices.size()); ++hash_i) {
            if (current_hash != hashes_indices[hash_i].first) {
                break;  // The next voxel starts here.
            }
            if (points_per_voxel < max_points_per_voxel) {
                tmp_point_indices.push_back(hashes_indices[hash_i].second);
                ++points_per_voxel;
            }
        }

        // If the voxel budget of this batch item is exhausted, skip its
        // remaining points.
        if (num_voxels[batch_id] == 0) {
            for (; hash_i < int64_t(hashes_indices.size()); ++hash_i) {
                if ((int64_t)(hashes_indices[hash_i].first / batch_hash) !=
                    batch_id) {
                    break;
                }
            }
        }
    }
    out_voxel_row_splits[total_voxels] = tmp_point_indices.size();

    // Copy the collected point indices to the output.
    int64_t* out_point_indices = nullptr;
    output_allocator.AllocVoxelPointIndices(&out_point_indices,
                                            tmp_point_indices.size());
    memcpy(out_point_indices, tmp_point_indices.data(),
           tmp_point_indices.size() * sizeof(int64_t));
}

}  // namespace impl
}  // namespace ml
}  // namespace open3d
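// ---------------------------------------------------------------------------
// Usage sketch (not part of Voxelize.h): VoxelizeCPU writes its results
// through the OUTPUT_ALLOCATOR argument, so a caller only has to provide the
// four Alloc* functions used above. The VectorAllocator struct and the
// VoxelizeCPUExample function below are illustrative assumptions that back
// the outputs with std::vector; the real Open3D op wrappers use tensor-backed
// allocators instead.
struct VectorAllocator {
    std::vector<int64_t> batch_splits;         // Voxel prefix sum per batch item.
    std::vector<int32_t> voxel_coords;         // Integer coordinates, [num_voxels, NDIM].
    std::vector<int64_t> voxel_row_splits;     // Prefix sum of points per voxel.
    std::vector<int64_t> voxel_point_indices;  // Point indices grouped by voxel.

    void AllocVoxelBatchSplits(int64_t** ptr, int64_t num) {
        batch_splits.resize(num);
        *ptr = batch_splits.data();
    }
    void AllocVoxelCoords(int32_t** ptr, int64_t num, int64_t ndim) {
        voxel_coords.resize(num * ndim);
        *ptr = voxel_coords.data();
    }
    void AllocVoxelPointRowSplits(int64_t** ptr, int64_t num) {
        voxel_row_splits.resize(num);
        *ptr = voxel_row_splits.data();
    }
    void AllocVoxelPointIndices(int64_t** ptr, int64_t num) {
        voxel_point_indices.resize(num);
        *ptr = voxel_point_indices.data();
    }
};

// Voxelize a single batch item with four 3D points and a 1x1x1 voxel size.
inline void VoxelizeCPUExample() {
    const std::vector<float> points = {0.1f, 0.1f, 0.1f,   // voxel (0,0,0)
                                       0.2f, 0.2f, 0.2f,   // voxel (0,0,0)
                                       1.5f, 1.5f, 1.5f,   // voxel (1,1,1)
                                       9.0f, 9.0f, 9.0f};  // outside the range
    const std::vector<int64_t> row_splits = {0, 4};
    const float voxel_size[3] = {1.f, 1.f, 1.f};
    const float range_min[3] = {0.f, 0.f, 0.f};
    const float range_max[3] = {8.f, 8.f, 8.f};

    VectorAllocator alloc;
    open3d::ml::impl::VoxelizeCPU<float, 3>(
            points.size() / 3, points.data(), 1, row_splits.data(), voxel_size,
            range_min, range_max,
            /*max_points_per_voxel=*/64, /*max_voxels=*/1024, alloc);

    // alloc.voxel_coords now holds the coordinates of the two occupied voxels,
    // and alloc.voxel_row_splits / alloc.voxel_point_indices map each voxel to
    // the indices of the points it contains. The point at (9,9,9) lies outside
    // [range_min, range_max] and is dropped.
}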