0.12.0/cpp_api/_hashmap_c_u_d_a_8h_source.html

 // ----------------------------------------------------------------------------
 // -                        Open3D: www.open3d.org                            -
 // ----------------------------------------------------------------------------
 // The MIT License (MIT)
 //
 // Copyright (c) 2018 www.open3d.org
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 // IN THE SOFTWARE.
 // ----------------------------------------------------------------------------

 // Copyright 2019 Saman Ashkiani
 // Rewritten by Wei Dong 2019 - 2020
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 // implied. See the License for the specific language governing permissions
 // and limitations under the License.

 #pragma once

 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdint>
 #include <memory>
 #include <vector>

 #include "open3d/core/CUDAUtils.h"
 #include "open3d/core/hashmap/CUDA/HashmapCUDAImpl.h"
 #include "open3d/core/hashmap/DeviceHashmap.h"

 namespace open3d {
 namespace core {
 /// CUDA implementation of the DeviceHashmap interface.
 ///
 /// The batch operations (Insert, Activate, Find, Erase, GetActiveIndices,
 /// BucketSizes) are implemented below in this header by launching CUDA
 /// kernels with gpu_context_ and synchronizing the device afterwards.
 ///
 /// \tparam Hash  Hash functor type, forwarded to DeviceHashmap and to
 ///               CUDAHashmapImplContext.
 /// \tparam KeyEq Key-equality functor type, forwarded the same way.
 template <typename Hash, typename KeyEq>
 class CUDAHashmap : public DeviceHashmap<Hash, KeyEq> {
 public:
     /// Forwards all arguments to the DeviceHashmap base and then calls
     /// Allocate(init_buckets, init_capacity).
     CUDAHashmap(int64_t init_buckets,
                 int64_t init_capacity,
                 int64_t dsize_key,
                 int64_t dsize_value,
                 const Device& device);

     /// Calls Free().
     ~CUDAHashmap();

     /// Rebuilds the table with \p buckets buckets: gathers the active keys and
     /// values, frees and reallocates the storage (the new capacity is derived
     /// from \p buckets and the current capacity-per-bucket ratio) and inserts
     /// the gathered entries again.
     void Rehash(int64_t buckets) override;

     /// Parallel insert contiguous arrays of keys and values.
     /// Calls Rehash() first when Size() + count exceeds the current capacity.
     /// \p output_masks and \p output_addrs are filled by the insert kernels.
     /// NOTE(review): all pointers are presumably device pointers, since they
     /// are handed directly to kernels -- confirm against callers.
     void Insert(const void* input_keys,
                 const void* input_values,
                 addr_t* output_addrs,
                 bool* output_masks,
                 int64_t count) override;

     /// Same flow as Insert(), but no values are supplied: the implementation
     /// passes nullptr as the value array to InsertImpl().
     void Activate(const void* input_keys,
                   addr_t* output_addrs,
                   bool* output_masks,
                   int64_t count) override;

     /// Parallel find a contiguous array of keys.
     /// \p output_masks is zeroed with cudaMemset before the kernel runs.
     /// Does nothing when \p count is 0.
     void Find(const void* input_keys,
               addr_t* output_addrs,
               bool* output_masks,
               int64_t count) override;

     /// Parallel erase a contiguous array of keys.
     /// \p output_masks is zeroed with cudaMemset before the kernels run.
     /// Does nothing when \p count is 0.
     void Erase(const void* input_keys,
                bool* output_masks,
                int64_t count) override;

     /// Parallel collect all iterators in the hash table.
     /// \param output_indices Destination array written by the kernel (named
     ///        output_addrs in the definition below).
     /// \return The counter value accumulated on the device by the kernel.
     int64_t GetActiveIndices(addr_t* output_indices) override;

     /// Returns buffer_ctx_.HeapCounter() for this map's device.
     int64_t Size() const override;

     /// Returns one entry per bucket, as filled by CountElemsPerBucketKernel.
     std::vector<int64_t> BucketSizes() const override;
     /// Return size / bucket_count.
     float LoadFactor() const override;

 protected:
     /// Context object passed by value to every kernel launch; it holds the
     /// bucket list heads and is configured in Allocate().
     CUDAHashmapImplContext<Hash, KeyEq> gpu_context_;

     /// Context for the key/value buffer and its heap counter; set up in
     /// Allocate() and released in Free().
     CUDAHashmapBufferContext buffer_ctx_;
     /// Manager for the linked-list nodes; recreated on every Allocate().
     std::shared_ptr<InternalNodeManager> node_mgr_;

     /// Shared implementation behind Insert(), Activate() and Rehash(): runs
     /// the three insert kernel passes without any capacity check.
     void InsertImpl(const void* input_keys,
                     const void* input_values,
                     addr_t* output_addrs,
                     bool* output_masks,
                     int64_t count);

     /// Allocates and initializes all storage for the given bucket count and
     /// capacity.
     void Allocate(int64_t bucket_count, int64_t capacity);
     /// Releases what Allocate() obtained through buffer_ctx_.HostAllocate()
     /// and MemoryManager::Malloc().
     void Free();
 };

 /// Builds the hashmap: every argument is handed to the DeviceHashmap base,
 /// after which the CUDA-side storage is allocated for the initial bucket
 /// count and capacity.
 template <typename Hash, typename KeyEq>
 CUDAHashmap<Hash, KeyEq>::CUDAHashmap(int64_t init_buckets,
                                       int64_t init_capacity,
                                       int64_t dsize_key,
                                       int64_t dsize_value,
                                       const Device& device)
     : DeviceHashmap<Hash, KeyEq>(
               init_buckets, init_capacity, dsize_key, dsize_value, device) {
     // Storage is created eagerly so the map is usable right after
     // construction.
     this->Allocate(init_buckets, init_capacity);
 }

 /// Releases the storage obtained by the most recent Allocate() call.
 template <typename Hash, typename KeyEq>
 CUDAHashmap<Hash, KeyEq>::~CUDAHashmap() {
     this->Free();
 }

 /// Rebuilds the hashmap with \p buckets buckets while keeping every active
 /// entry.
 ///
 /// Steps: (1) gather the active keys/values into temporary tensors,
 /// (2) free the current storage and release the cached CUDA memory,
 /// (3) allocate new storage, (4) insert the gathered entries again.
 ///
 /// The new capacity keeps the current capacity-per-bucket ratio, but is never
 /// smaller than the number of active entries. Without that lower bound,
 /// rehashing to fewer buckets could allocate a buffer too small for the
 /// entries that InsertImpl() is about to put back.
 ///
 /// \param buckets Number of buckets of the rebuilt table.
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::Rehash(int64_t buckets) {
     const int64_t iterator_count = Size();

     Tensor active_keys;
     Tensor active_values;

     if (iterator_count > 0) {
         Tensor active_addrs =
                 Tensor({iterator_count}, Dtype::Int32, this->device_);
         GetActiveIndices(static_cast<addr_t*>(active_addrs.GetDataPtr()));

         // IndexGet is called with Int64 indices, so widen the addresses.
         Tensor active_indices = active_addrs.To(Dtype::Int64);
         active_keys = this->buffer_->GetKeyBuffer().IndexGet({active_indices});
         active_values =
                 this->buffer_->GetValueBuffer().IndexGet({active_indices});
     }

     // Must be read before Allocate() overwrites capacity_ / bucket_count_.
     const float avg_capacity_per_bucket =
             float(this->capacity_) / float(this->bucket_count_);

     Free();
     CUDACachedMemoryManager::ReleaseCache();

     // Scale the capacity with the bucket count, but always leave room for the
     // entries that are re-inserted below.
     const int64_t scaled_capacity =
             int64_t(std::ceil(buckets * avg_capacity_per_bucket));
     Allocate(buckets, std::max(scaled_capacity, iterator_count));

     if (iterator_count > 0) {
         // The per-key results of the re-insertion are not needed; these
         // tensors only provide the output storage InsertImpl() writes to.
         Tensor output_addrs({iterator_count}, Dtype::Int32, this->device_);
         Tensor output_masks({iterator_count}, Dtype::Bool, this->device_);

         InsertImpl(active_keys.GetDataPtr(), active_values.GetDataPtr(),
                    static_cast<addr_t*>(output_addrs.GetDataPtr()),
                    static_cast<bool*>(output_masks.GetDataPtr()),
                    iterator_count);
     }
     CUDACachedMemoryManager::ReleaseCache();
 }

 /// Parallel insert of \p count keys together with their values.
 ///
 /// When the current size plus \p count would exceed the capacity, the table
 /// is rehashed first. The new bucket count is the larger of twice the current
 /// bucket count and the count needed to hold the new size at the current
 /// capacity-per-bucket ratio.
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::Insert(const void* input_keys,
                                       const void* input_values,
                                       addr_t* output_addrs,
                                       bool* output_masks,
                                       int64_t count) {
     const int64_t required_size = Size() + count;
     const bool needs_growth = required_size > this->capacity_;

     if (needs_growth) {
         const float slots_per_bucket = static_cast<float>(this->capacity_) /
                                        static_cast<float>(this->bucket_count_);
         const int64_t doubled_buckets = this->bucket_count_ * 2;
         const int64_t fitted_buckets = static_cast<int64_t>(
                 std::ceil(required_size / slots_per_bucket));
         Rehash(std::max(doubled_buckets, fitted_buckets));
     }

     InsertImpl(input_keys, input_values, output_addrs, output_masks, count);
 }

 /// Parallel insert of \p count keys without values: nullptr is passed as the
 /// value array to InsertImpl().
 ///
 /// The table is rehashed first when the current size plus \p count would
 /// exceed the capacity, using the larger of twice the current bucket count
 /// and the count needed at the current capacity-per-bucket ratio.
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::Activate(const void* input_keys,
                                         addr_t* output_addrs,
                                         bool* output_masks,
                                         int64_t count) {
     const int64_t required_size = Size() + count;
     const bool needs_growth = required_size > this->capacity_;

     if (needs_growth) {
         const float slots_per_bucket = static_cast<float>(this->capacity_) /
                                        static_cast<float>(this->bucket_count_);
         const int64_t doubled_buckets = this->bucket_count_ * 2;
         const int64_t fitted_buckets = static_cast<int64_t>(
                 std::ceil(required_size / slots_per_bucket));
         Rehash(std::max(doubled_buckets, fitted_buckets));
     }

     InsertImpl(input_keys, nullptr, output_addrs, output_masks, count);
 }

 /// Parallel lookup of \p count keys.
 ///
 /// \p output_masks is cleared to zero before the kernel runs; the kernel then
 /// writes \p output_addrs and \p output_masks. Returns immediately when
 /// \p count is 0.
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::Find(const void* input_keys,
                                     addr_t* output_addrs,
                                     bool* output_masks,
                                     int64_t count) {
     if (count == 0) {
         return;
     }

     // One thread per key, rounded up to whole blocks.
     const int64_t grid_size =
             (count + kThreadsPerBlock - 1) / kThreadsPerBlock;

     OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));

     FindKernel<<<grid_size, kThreadsPerBlock>>>(
             gpu_context_, input_keys, output_addrs, output_masks, count);
     OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
     OPEN3D_CUDA_CHECK(cudaGetLastError());
 }

 /// Parallel erase of \p count keys.
 ///
 /// \p output_masks is cleared to zero first. The two erase kernel passes
 /// share a temporary per-key address array that is allocated here and freed
 /// once the device has been synchronized. Returns immediately when \p count
 /// is 0.
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::Erase(const void* input_keys,
                                      bool* output_masks,
                                      int64_t count) {
     if (count == 0) {
         return;
     }

     OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));

     // Scratch buffer written by pass 0 and read by pass 1.
     void* scratch_mem =
             MemoryManager::Malloc(sizeof(addr_t) * count, this->device_);
     addr_t* scratch_addrs = static_cast<addr_t*>(scratch_mem);

     // One thread per key, rounded up to whole blocks.
     const int64_t grid_size =
             (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
     EraseKernelPass0<<<grid_size, kThreadsPerBlock>>>(
             gpu_context_, input_keys, scratch_addrs, output_masks, count);
     EraseKernelPass1<<<grid_size, kThreadsPerBlock>>>(
             gpu_context_, scratch_addrs, output_masks, count);
     OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
     OPEN3D_CUDA_CHECK(cudaGetLastError());

     MemoryManager::Free(scratch_addrs, this->device_);
 }

 /// Parallel collect all iterators in the hash table.
 ///
 /// A single uint32_t counter is allocated on the device, zeroed, updated by
 /// GetActiveIndicesKernel and copied back to the host.
 ///
 /// \param output_addrs Destination array handed to the kernel. It is not
 ///        bounds-checked here; Rehash() sizes it with Size() entries.
 /// \return The final value of the device counter.
 template <typename Hash, typename KeyEq>
 int64_t CUDAHashmap<Hash, KeyEq>::GetActiveIndices(addr_t* output_addrs) {
     uint32_t* iterator_count = static_cast<uint32_t*>(
             MemoryManager::Malloc(sizeof(uint32_t), this->device_));
     // Checked like every other CUDA runtime call in this file: a failed reset
     // would otherwise leave the counter uninitialized without any report.
     OPEN3D_CUDA_CHECK(cudaMemset(iterator_count, 0, sizeof(uint32_t)));

     // The launch provides kWarpSize threads per bucket, rounded up to whole
     // blocks.
     const int64_t num_blocks =
             (gpu_context_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
             kThreadsPerBlock;
     GetActiveIndicesKernel<<<num_blocks, kThreadsPerBlock>>>(
             gpu_context_, output_addrs, iterator_count);
     OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
     OPEN3D_CUDA_CHECK(cudaGetLastError());

     uint32_t ret;
     MemoryManager::MemcpyToHost(&ret, iterator_count, this->device_,
                                 sizeof(uint32_t));
     MemoryManager::Free(iterator_count, this->device_);

     return static_cast<int64_t>(ret);
 }

 /// Returns the heap counter reported by the buffer context for this map's
 /// device.
 template <typename Hash, typename KeyEq>
 int64_t CUDAHashmap<Hash, KeyEq>::Size() const {
     const int64_t heap_counter = buffer_ctx_.HeapCounter(this->device_);
     return heap_counter;
 }

 /// Returns one counter per bucket, as computed by CountElemsPerBucketKernel.
 ///
 /// The counters are accumulated in a zero-filled device vector with
 /// gpu_context_.bucket_count_ entries and then copied to the host.
 ///
 /// \return Host vector with gpu_context_.bucket_count_ entries.
 template <typename Hash, typename KeyEq>
 std::vector<int64_t> CUDAHashmap<Hash, KeyEq>::BucketSizes() const {
     thrust::device_vector<int64_t> elems_per_bucket(gpu_context_.bucket_count_);
     thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);

     // The grid is sized from the capacity, rounded up to whole blocks.
     const int64_t num_blocks =
             (gpu_context_.capacity_ + kThreadsPerBlock - 1) / kThreadsPerBlock;
     CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock>>>(
             gpu_context_, thrust::raw_pointer_cast(elems_per_bucket.data()));
     OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
     OPEN3D_CUDA_CHECK(cudaGetLastError());

     std::vector<int64_t> result(gpu_context_.bucket_count_);
     thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
                  result.begin());
     // Return the local by name: wrapping it in std::move would prevent copy
     // elision and is never needed for a returned local.
     return result;
 }

 /// Return size / bucket_count, computed in single precision.
 template <typename Hash, typename KeyEq>
 float CUDAHashmap<Hash, KeyEq>::LoadFactor() const {
     const float active_entries = static_cast<float>(Size());
     const float bucket_total = static_cast<float>(this->bucket_count_);
     return active_entries / bucket_total;
 }

 /// Inserts \p count keys (and optionally values) without any capacity check.
 ///
 /// Callers in this file (Insert, Activate, Rehash) are responsible for making
 /// sure the storage is large enough before calling this function.
 ///
 /// \param input_keys   Key array handed to insert passes 0 and 1.
 /// \param input_values Value array handed to insert pass 2; Activate() passes
 ///                     nullptr here.
 /// \param output_addrs Per-key address array shared by all three passes.
 /// \param output_masks Per-key mask array handed to passes 1 and 2.
 /// \param count        Number of keys; nothing happens when it is 0.
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::InsertImpl(const void* input_keys,
                                           const void* input_values,
                                           addr_t* output_addrs,
                                           bool* output_masks,
                                           int64_t count) {
     if (count == 0) return;

     // Advance the device-side heap counter by `count` up front, from the
     // host. Pass 0 receives the previous counter value.
     // NOTE(review): presumably pass 0 uses it to hand out the reserved slots
     // to the keys -- confirm in HashmapCUDAImpl.h.
     int prev_heap_counter = buffer_ctx_.HeapCounter(this->device_);
     *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_) =
             prev_heap_counter + count;

     // One thread per key, rounded up to whole blocks.
     const int64_t num_blocks =
             (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
     // The three passes are launched back to back and must stay in this
     // order; the device is synchronized only once, after the last pass.
     InsertKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
             gpu_context_, input_keys, output_addrs, prev_heap_counter, count);
     InsertKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
             gpu_context_, input_keys, output_addrs, output_masks, count);
     InsertKernelPass2<<<num_blocks, kThreadsPerBlock>>>(
             gpu_context_, input_values, output_addrs, output_masks, count);
     OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
     OPEN3D_CUDA_CHECK(cudaGetLastError());
 }

 /// Allocates and initializes all storage for a table with \p bucket_count
 /// buckets and room for \p capacity entries.
 ///
 /// Records both sizes in the base-class members, creates the key/value
 /// buffer and its context, creates the linked-list node manager, allocates
 /// the bucket heads (every byte set to 0xFF) and finally configures
 /// gpu_context_ with all of the above.
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::Allocate(int64_t bucket_count,
                                         int64_t capacity) {
     this->bucket_count_ = bucket_count;
     this->capacity_ = capacity;

     // Key/value storage and the context that manages it.
     this->buffer_ =
             std::make_shared<HashmapBuffer>(this->capacity_, this->dsize_key_,
                                             this->dsize_value_, this->device_);
     buffer_ctx_.HostAllocate(this->device_);
     buffer_ctx_.Setup(this->capacity_, this->dsize_key_, this->dsize_value_,
                       this->buffer_->GetKeyBuffer(),
                       this->buffer_->GetValueBuffer(),
                       this->buffer_->GetHeap());
     buffer_ctx_.Reset(this->device_);

     // Storage for the linked-list nodes.
     node_mgr_ = std::make_shared<InternalNodeManager>(this->device_);

     // One Slab per bucket as list head, with all bytes initialized to 0xFF.
     const size_t head_bytes = sizeof(Slab) * this->bucket_count_;
     void* head_mem = MemoryManager::Malloc(head_bytes, this->device_);
     gpu_context_.bucket_list_head_ = static_cast<Slab*>(head_mem);
     OPEN3D_CUDA_CHECK(
             cudaMemset(gpu_context_.bucket_list_head_, 0xFF, head_bytes));

     gpu_context_.Setup(this->bucket_count_, this->capacity_, this->dsize_key_,
                        this->dsize_value_, node_mgr_->gpu_context_,
                        buffer_ctx_);
 }

 /// Releases the buffer context's host-side allocation and the bucket head
 /// array obtained in Allocate().
 template <typename Hash, typename KeyEq>
 void CUDAHashmap<Hash, KeyEq>::Free() {
     buffer_ctx_.HostFree(this->device_);

     auto* bucket_heads = gpu_context_.bucket_list_head_;
     MemoryManager::Free(bucket_heads, this->device_);
 }
 }  // namespace core
 }  // namespace open3d
open3d::core::CUDAHashmap::~CUDAHashmap
~CUDAHashmap()
Definition: HashmapCUDA.h:124

open3d::core::CUDAHashmap::Size
int64_t Size() const override
Definition: HashmapCUDA.h:265

open3d::core::cuda::ReleaseCache
void ReleaseCache()
Definition: CUDAUtils.cpp:55

open3d::core::CUDAHashmap::Free
void Free()
Definition: HashmapCUDA.h:350

open3d::core::DeviceHashmap::dsize_value_
int64_t dsize_value_
Definition: DeviceHashmap.h:175

open3d::io::k4a_plugin::uint32_t
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557

open3d::core::CUDAHashmap::Allocate
void Allocate(int64_t bucket_count, int64_t capacity)
Definition: HashmapCUDA.h:319

open3d::core::Tensor::GetDataPtr
void * GetDataPtr()
Definition: Tensor.h:961

open3d::core::MemoryManager::Free
static void Free(void *ptr, const Device &device)
Definition: MemoryManager.cpp:44

open3d::core::CUDAHashmap::Find
void Find(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: HashmapCUDA.h:204

OPEN3D_CUDA_CHECK
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:57

open3d::core::CUDAHashmap::node_mgr_
std::shared_ptr< InternalNodeManager > node_mgr_
Definition: HashmapCUDA.h:98

open3d::core::MemoryManager::MemcpyToHost
static void MemcpyToHost(void *host_ptr, const void *src_ptr, const Device &src_device, size_t num_bytes)
Same as Memcpy, but with host (CPU:0) as default dst_device.
Definition: MemoryManager.cpp:88

open3d::core::MemoryManager::Malloc
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:40

open3d::core::CUDAHashmap::CUDAHashmap
CUDAHashmap(int64_t init_buckets, int64_t init_capacity, int64_t dsize_key, int64_t dsize_value, const Device &device)
Definition: HashmapCUDA.h:113

open3d::core::CUDAHashmapImplContext
Definition: HashmapCUDAImpl.h:37

HashmapCUDAImpl.h

open3d::utility::ceil
FN_SPECIFIERS MiniVec< float, N > ceil(const MiniVec< float, N > &a)
Definition: MiniVec.h:108

open3d::core::DeviceHashmap
Base class: shared interface.
Definition: DeviceHashmap.h:101

open3d::core::DeviceHashmap::capacity_
int64_t capacity_
Definition: DeviceHashmap.h:173

open3d::core::Dtype::Int32
static const Dtype Int32
Definition: Dtype.h:44

open3d::core::CUDAHashmap::Activate
void Activate(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Definition: HashmapCUDA.h:186

open3d::core::CUDAHashmap::Rehash
void Rehash(int64_t buckets) override
Definition: HashmapCUDA.h:129

open3d::core::DeviceHashmap::buffer_
std::shared_ptr< HashmapBuffer > buffer_
Definition: DeviceHashmap.h:179

open3d::core::Tensor::To
Tensor To(Dtype dtype, bool copy=false) const
Definition: Tensor.cpp:453

open3d::core::Device
Definition: Device.h:39

open3d::core::CUDAHashmap::Insert
void Insert(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: HashmapCUDA.h:167

open3d::core::CUDAHashmap::buffer_ctx_
CUDAHashmapBufferContext buffer_ctx_
Definition: HashmapCUDA.h:97

open3d::core::DeviceHashmap::dsize_key_
int64_t dsize_key_
Definition: DeviceHashmap.h:174

open3d::core::CUDAHashmap
Definition: HashmapCUDA.h:53

count
int count
Definition: FilePCD.cpp:61

open3d::core::CUDAHashmap::InsertImpl
void InsertImpl(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: HashmapCUDA.h:293

open3d::core::DeviceHashmap::device_
Device device_
Definition: DeviceHashmap.h:177

open3d::core::Dtype::Int64
static const Dtype Int64
Definition: Dtype.h:45

open3d
Definition: PinholeCameraIntrinsic.cpp:35

open3d::core::Tensor
Definition: Tensor.h:48

open3d::core::CUDAHashmap::Erase
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: HashmapCUDA.h:221

open3d::io::k4a_plugin::float
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle float
Definition: K4aPlugin.cpp:465

open3d::core::addr_t
uint32_t addr_t
Definition: HashmapBuffer.h:58

open3d::core::CUDAHashmap::LoadFactor
float LoadFactor() const override
Return size / bucket_count.
Definition: HashmapCUDA.h:288

open3d::core::CUDAHashmap::BucketSizes
std::vector< int64_t > BucketSizes() const override
Definition: HashmapCUDA.h:270

DeviceHashmap.h

open3d::core::CUDAHashmap::GetActiveIndices
int64_t GetActiveIndices(addr_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: HashmapCUDA.h:243

open3d::core::Slab
Definition: InternalNodeManager.h:58

open3d::core::CUDAHashmap::gpu_context_
CUDAHashmapImplContext< Hash, KeyEq > gpu_context_
Definition: HashmapCUDA.h:95

open3d::core::Dtype::Bool
static const Dtype Bool
Definition: Dtype.h:48

CUDAUtils.h
Common CUDA utilities.

open3d::core::DeviceHashmap::bucket_count_
int64_t bucket_count_
Definition: DeviceHashmap.h:172