Open3D (C++ API)  0.18.0
SlabNodeManager.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2023 www.open3d.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 // Copyright 2019 Saman Ashkiani
9 // Rewritten by Wei Dong 2019 - 2020
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 //
14 // http://www.apache.org/licenses/LICENSE-2.0
15 //
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
19 // implied. See the License for the specific language governing permissions
20 // and limitations under the License.
21 
22 #pragma once
23 
24 #include <thrust/device_vector.h>
25 
26 #include <memory>
27 
28 #include "open3d/core/CUDAUtils.h"
32 #include "open3d/utility/Random.h"
33 
34 namespace open3d {
35 namespace core {
36 
39 class Slab {
40 public:
43  buf_index_t kv_pair_ptrs[kWarpSize - 1];
46 };
47 
49 public:
51  : super_blocks_(nullptr),
52  hash_coef_(0),
53  num_attempts_(0),
54  memory_block_index_(0),
55  super_block_index_(0) {}
56 
57  __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab(
58  const buf_index_t& next_slab_ptr, const uint32_t& lane_id) {
59  return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
60  }
61  __device__ __forceinline__ uint32_t* get_ptr_for_bitmap(
62  const uint32_t super_block_idx, const uint32_t bitmap_idx) {
63  return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
64  bitmap_idx;
65  }
66 
67  // Objective: each warp selects its own memory_block warp allocator.
68  __device__ void Init(uint32_t& tid, uint32_t& lane_id) {
69  // Hashing the memory block to be used.
70  createMemBlockIndex(tid >> 5);
71 
72  // Loading the assigned memory block.
73  memory_block_bitmap_ =
74  super_blocks_[super_block_index_ * kUIntsPerSuperBlock +
75  memory_block_index_ * kSlabsPerBlock + lane_id];
76  }
77 
78  __device__ uint32_t WarpAllocate(const uint32_t& lane_id) {
79  // Try and allocate a new memory units within the memory_block memory
80  // block if it returns 0xFFFFFFFF, then there was not any empty memory
81  // unit a new memory_block block should be chosen, and repeat again
82  // allocated result: 5 bits: super_block_index
83  // 17 bits: memory block index
84  // 5 bits: memory unit index (hi-bits of 10bit)
85  // 5 bits: memory unit index (lo-bits of 10bit)
86  int empty_lane = -1;
87  uint32_t free_lane;
88  uint32_t read_bitmap = memory_block_bitmap_;
89  uint32_t allocated_result = kNotFoundFlag;
90  // Works as long as <31 bit are used in the allocated_result
91  // in other words, if there are 32 super blocks and at most 64k blocks
92  // per super block.
93 
94  while (allocated_result == kNotFoundFlag) {
95  empty_lane = __ffs(~memory_block_bitmap_) - 1;
96  free_lane = __ballot_sync(kSyncLanesMask, empty_lane >= 0);
97  if (free_lane == 0) {
98  // all bitmaps are full: need to be rehashed again.
99  updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
100  5);
101  read_bitmap = memory_block_bitmap_;
102  continue;
103  }
104  uint32_t src_lane = __ffs(free_lane) - 1;
105  if (src_lane == lane_id) {
106  read_bitmap = atomicCAS(
107  super_blocks_ +
108  super_block_index_ * kUIntsPerSuperBlock +
109  memory_block_index_ * kSlabsPerBlock + lane_id,
110  memory_block_bitmap_,
111  memory_block_bitmap_ | (1 << empty_lane));
112  if (read_bitmap == memory_block_bitmap_) {
113  // Successful attempt.
114  memory_block_bitmap_ |= (1 << empty_lane);
115  allocated_result =
116  (super_block_index_ << kSuperBlockMaskBits) |
117  (memory_block_index_ << kBlockMaskBits) |
118  (lane_id << kSlabMaskBits) | empty_lane;
119  } else {
120  // Not successful: updating the current bitmap.
121  memory_block_bitmap_ = read_bitmap;
122  }
123  }
124  // Asking for the allocated result.
125  allocated_result =
126  __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
127  }
128  return allocated_result;
129  }
130 
131  // This function, frees a recently allocated memory unit by a single thread.
132  // Since it is untouched, there shouldn't be any worries for the actual
133  // memory contents to be reset again.
134  __device__ void FreeUntouched(buf_index_t ptr) {
135  atomicAnd(super_blocks_ +
136  getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
137  getMemBlockIndex(ptr) * kSlabsPerBlock +
138  (getMemUnitIndex(ptr) >> 5),
139  ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
140  }
141 
142 private:
143  __device__ __host__ __forceinline__ uint32_t
144  getSuperBlockIndex(buf_index_t address) const {
145  return address >> kSuperBlockMaskBits;
146  }
147  __device__ __host__ __forceinline__ uint32_t
148  getMemBlockIndex(buf_index_t address) const {
149  return ((address >> kBlockMaskBits) & 0x1FFFF);
150  }
151  __device__ __host__ __forceinline__ buf_index_t
152  getMemBlockAddress(buf_index_t address) const {
153  return (kBitmapsPerSuperBlock +
154  getMemBlockIndex(address) * kUIntsPerBlock);
155  }
156  __device__ __host__ __forceinline__ uint32_t
157  getMemUnitIndex(buf_index_t address) const {
158  return address & 0x3FF;
159  }
160  __device__ __host__ __forceinline__ buf_index_t
161  getMemUnitAddress(buf_index_t address) {
162  return getMemUnitIndex(address) * kWarpSize;
163  }
164 
165  // Called at the beginning of the kernel.
166  __device__ void createMemBlockIndex(uint32_t global_warp_id) {
167  super_block_index_ = global_warp_id % kSuperBlocks;
168  memory_block_index_ = (hash_coef_ * global_warp_id) >>
169  (32 - kBlocksPerSuperBlockInBits);
170  }
171 
172  // Called when the allocator fails to find an empty unit to allocate.
173  __device__ void updateMemBlockIndex(uint32_t global_warp_id) {
174  num_attempts_++;
175  super_block_index_++;
176  super_block_index_ =
177  (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
178  memory_block_index_ = (hash_coef_ * (global_warp_id + num_attempts_)) >>
179  (32 - kBlocksPerSuperBlockInBits);
180  // Loading the assigned memory block.
181  memory_block_bitmap_ =
182  *((super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
183  memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1f));
184  }
185 
186  __host__ __device__ buf_index_t
187  addressDecoder(buf_index_t address_ptr_index) {
188  return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
189  getMemBlockAddress(address_ptr_index) +
190  getMemUnitIndex(address_ptr_index) * kWarpSize;
191  }
192 
193  __host__ __device__ void print_address(buf_index_t address_ptr_index) {
194  printf("Super block Index: %d, Memory block index: %d, Memory unit "
195  "index: "
196  "%d\n",
197  getSuperBlockIndex(address_ptr_index),
198  getMemBlockIndex(address_ptr_index),
199  getMemUnitIndex(address_ptr_index));
200  }
201 
202 public:
206  uint32_t hash_coef_; // A random 32-bit.
207 
208 private:
210  uint32_t num_attempts_;
211  uint32_t memory_block_index_;
212  uint32_t memory_block_bitmap_;
213  uint32_t super_block_index_;
214 };
215 
217  uint32_t* slabs_per_superblock);
218 
220 public:
221  SlabNodeManager(const Device& device) : device_(device) {
224 
228  kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t),
229  device_));
230  Reset();
231  }
232 
234 
235  void Reset() {
236  OPEN3D_CUDA_CHECK(cudaMemset(
237  impl_.super_blocks_, 0xFF,
238  kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t)));
239 
240  for (uint32_t i = 0; i < kSuperBlocks; i++) {
241  // setting bitmaps into zeros:
242  OPEN3D_CUDA_CHECK(cudaMemset(
243  impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
244  kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t)));
245  }
247  OPEN3D_CUDA_CHECK(cudaGetLastError());
248  }
249 
250  std::vector<int> CountSlabsPerSuperblock() {
251  const uint32_t num_super_blocks = kSuperBlocks;
252 
253  thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
254  thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
255  0);
256 
257  // Counting total number of allocated memory units.
258  int num_mem_units = kBlocksPerSuperBlock * 32;
259  int num_cuda_blocks =
260  (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
261  CountSlabsPerSuperblockKernel<<<num_cuda_blocks, kThreadsPerBlock, 0,
262  core::cuda::GetStream()>>>(
263  impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
265  OPEN3D_CUDA_CHECK(cudaGetLastError());
266 
267  std::vector<int> result(num_super_blocks);
268  thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
269  result.begin());
270 
271  return result;
272  }
273 
274 public:
277 };
278 } // namespace core
279 } // namespace open3d
Common CUDA utilities.
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:47
core::Tensor result
Definition: VtkUtils.cpp:75
bool copy
Definition: VtkUtils.cpp:73
Definition: Device.h:18
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:22
static void Free(void *ptr, const Device &device)
Frees previously allocated memory at address ptr on device device.
Definition: MemoryManager.cpp:28
Definition: SlabNodeManager.h:39
buf_index_t kv_pair_ptrs[kWarpSize - 1]
Definition: SlabNodeManager.h:43
buf_index_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
Definition: SlabNodeManager.h:45
Definition: SlabNodeManager.h:219
SlabNodeManager(const Device &device)
Definition: SlabNodeManager.h:221
void Reset()
Definition: SlabNodeManager.h:235
~SlabNodeManager()
Definition: SlabNodeManager.h:233
SlabNodeManagerImpl impl_
Definition: SlabNodeManager.h:275
Device device_
Definition: SlabNodeManager.h:276
std::vector< int > CountSlabsPerSuperblock()
Definition: SlabNodeManager.h:250
Definition: SlabNodeManager.h:48
uint32_t * super_blocks_
A pointer to each super-block.
Definition: SlabNodeManager.h:204
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
Definition: SlabNodeManager.h:61
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const buf_index_t &next_slab_ptr, const uint32_t &lane_id)
Definition: SlabNodeManager.h:57
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
Definition: SlabNodeManager.h:68
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
Definition: SlabNodeManager.h:206
__device__ void FreeUntouched(buf_index_t ptr)
Definition: SlabNodeManager.h:134
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
Definition: SlabNodeManager.h:78
SlabNodeManagerImpl()
Definition: SlabNodeManager.h:50
void Synchronize()
Definition: CUDAUtils.cpp:58
uint32_t buf_index_t
Definition: HashBackendBuffer.h:44
__global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl, uint32_t *slabs_per_superblock)
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:548
uint32_t RandUint32()
Definition: Random.cpp:59
Definition: PinholeCameraIntrinsic.cpp:16