52 template <
typename Hash,
typename KeyEq>
56 int64_t init_capacity,
63 void Rehash(int64_t buckets)
override;
65 void Insert(
const void* input_keys,
66 const void* input_values,
69 int64_t
count)
override;
71 void Activate(
const void* input_keys,
74 int64_t count)
override;
76 void Find(
const void* input_keys,
79 int64_t count)
override;
81 void Erase(
const void* input_keys,
83 int64_t count)
override;
87 int64_t
Size()
const override;
103 const void* input_values,
108 void Allocate(int64_t bucket_count, int64_t capacity);
112 template <
typename Hash,
typename KeyEq>
114 int64_t init_capacity,
119 init_buckets, init_capacity, dsize_key, dsize_value, device) {
120 Allocate(init_buckets, init_capacity);
123 template <
typename Hash,
typename KeyEq>
128 template <
typename Hash,
typename KeyEq>
130 int64_t iterator_count =
Size();
135 if (iterator_count > 0) {
141 active_keys = this->
buffer_->GetKeyBuffer().IndexGet({active_indices});
143 this->
buffer_->GetValueBuffer().IndexGet({active_indices});
146 float avg_capacity_per_bucket =
154 if (iterator_count > 0) {
159 static_cast<addr_t*
>(output_addrs.GetDataPtr()),
160 static_cast<bool*>(output_masks.GetDataPtr()),
166 template <
typename Hash,
typename KeyEq>
168 const void* input_values,
174 float avg_capacity_per_bucket =
176 int64_t expected_buckets = std::max(
178 int64_t(
std::ceil(new_size / avg_capacity_per_bucket)));
182 InsertImpl(input_keys, input_values, output_addrs, output_masks, count);
185 template <
typename Hash,
typename KeyEq>
192 float avg_capacity_per_bucket =
194 int64_t expected_buckets = std::max(
196 int64_t(
std::ceil(new_size / avg_capacity_per_bucket)));
200 InsertImpl(input_keys,
nullptr, output_addrs, output_masks, count);
203 template <
typename Hash,
typename KeyEq>
208 if (count == 0)
return;
212 const int64_t num_blocks =
213 (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
214 FindKernel<<<num_blocks, kThreadsPerBlock>>>(
220 template <
typename Hash,
typename KeyEq>
224 if (count == 0)
return;
227 auto iterator_addrs =
static_cast<addr_t*
>(
230 const int64_t num_blocks =
231 (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
232 EraseKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
234 EraseKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
242 template <
typename Hash,
typename KeyEq>
246 cudaMemset(iterator_count, 0,
sizeof(
uint32_t));
248 const int64_t num_blocks =
249 (
gpu_context_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
251 GetActiveIndicesKernel<<<num_blocks, kThreadsPerBlock>>>(
261 return static_cast<int64_t
>(ret);
264 template <
typename Hash,
typename KeyEq>
269 template <
typename Hash,
typename KeyEq>
271 thrust::device_vector<int64_t> elems_per_bucket(
gpu_context_.bucket_count_);
272 thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);
274 const int64_t num_blocks =
275 (
gpu_context_.capacity_ + kThreadsPerBlock - 1) / kThreadsPerBlock;
276 CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock>>>(
277 gpu_context_, thrust::raw_pointer_cast(elems_per_bucket.data()));
281 std::vector<int64_t> result(gpu_context_.bucket_count_);
282 thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
284 return std::move(result);
287 template <
typename Hash,
typename KeyEq>
292 template <
typename Hash,
typename KeyEq>
294 const void* input_values,
298 if (count == 0)
return;
303 *thrust::device_ptr<int>(
gpu_context_.kv_mgr_ctx_.heap_counter_) =
304 prev_heap_counter + count;
306 const int64_t num_blocks =
307 (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
308 InsertKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
310 InsertKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
312 InsertKernelPass2<<<num_blocks, kThreadsPerBlock>>>(
318 template <
typename Hash,
typename KeyEq>
329 buffer_ctx_.Setup(this->capacity_, this->dsize_key_, this->dsize_value_,
331 this->
buffer_->GetValueBuffer(),
342 sizeof(
Slab) * this->bucket_count_));
344 gpu_context_.Setup(this->bucket_count_, this->capacity_, this->dsize_key_,
345 this->dsize_value_,
node_mgr_->gpu_context_,
349 template <
typename Hash,
typename KeyEq>
~CUDAHashmap()
Definition: HashmapCUDA.h:124
int64_t Size() const override
Definition: HashmapCUDA.h:265
void ReleaseCache()
Definition: CUDAUtils.cpp:55
void Free()
Definition: HashmapCUDA.h:350
int64_t dsize_value_
Definition: DeviceHashmap.h:175
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
void Allocate(int64_t bucket_count, int64_t capacity)
Definition: HashmapCUDA.h:319
void * GetDataPtr()
Definition: Tensor.h:961
static void Free(void *ptr, const Device &device)
Definition: MemoryManager.cpp:44
void Find(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: HashmapCUDA.h:204
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:57
std::shared_ptr< InternalNodeManager > node_mgr_
Definition: HashmapCUDA.h:98
static void MemcpyToHost(void *host_ptr, const void *src_ptr, const Device &src_device, size_t num_bytes)
Same as Memcpy, but with host (CPU:0) as default dst_device.
Definition: MemoryManager.cpp:88
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:40
CUDAHashmap(int64_t init_buckets, int64_t init_capacity, int64_t dsize_key, int64_t dsize_value, const Device &device)
Definition: HashmapCUDA.h:113
Definition: HashmapCUDAImpl.h:37
FN_SPECIFIERS MiniVec< float, N > ceil(const MiniVec< float, N > &a)
Definition: MiniVec.h:108
Base class: shared interface.
Definition: DeviceHashmap.h:101
int64_t capacity_
Definition: DeviceHashmap.h:173
static const Dtype Int32
Definition: Dtype.h:44
void Activate(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Definition: HashmapCUDA.h:186
void Rehash(int64_t buckets) override
Definition: HashmapCUDA.h:129
std::shared_ptr< HashmapBuffer > buffer_
Definition: DeviceHashmap.h:179
Tensor To(Dtype dtype, bool copy=false) const
Definition: Tensor.cpp:453
void Insert(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: HashmapCUDA.h:167
CUDAHashmapBufferContext buffer_ctx_
Definition: HashmapCUDA.h:97
int64_t dsize_key_
Definition: DeviceHashmap.h:174
Definition: HashmapCUDA.h:53
int count
Definition: FilePCD.cpp:61
void InsertImpl(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: HashmapCUDA.h:293
Device device_
Definition: DeviceHashmap.h:177
static const Dtype Int64
Definition: Dtype.h:45
Definition: PinholeCameraIntrinsic.cpp:35
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: HashmapCUDA.h:221
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle float
Definition: K4aPlugin.cpp:465
uint32_t addr_t
Definition: HashmapBuffer.h:58
float LoadFactor() const override
Return size / bucket_count.
Definition: HashmapCUDA.h:288
std::vector< int64_t > BucketSizes() const override
Definition: HashmapCUDA.h:270
int64_t GetActiveIndices(addr_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: HashmapCUDA.h:243
Definition: InternalNodeManager.h:58
CUDAHashmapImplContext< Hash, KeyEq > gpu_context_
Definition: HashmapCUDA.h:95
static const Dtype Bool
Definition: Dtype.h:48
int64_t bucket_count_
Definition: DeviceHashmap.h:172