43 #include <thrust/device_vector.h> 70 : super_blocks_(nullptr),
73 memory_block_index_(0),
74 super_block_index_(0) {}
78 return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
82 return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
89 createMemBlockIndex(tid >> 5);
92 memory_block_bitmap_ =
93 super_blocks_[super_block_index_ * kUIntsPerSuperBlock +
94 memory_block_index_ * kSlabsPerBlock + lane_id];
107 uint32_t read_bitmap = memory_block_bitmap_;
108 uint32_t allocated_result = kNotFoundFlag;
113 while (allocated_result == kNotFoundFlag) {
114 empty_lane = __ffs(~memory_block_bitmap_) - 1;
115 free_lane = __ballot_sync(kSyncLanesMask, empty_lane >= 0);
116 if (free_lane == 0) {
118 updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
120 read_bitmap = memory_block_bitmap_;
123 uint32_t src_lane = __ffs(free_lane) - 1;
124 if (src_lane == lane_id) {
125 read_bitmap = atomicCAS(
127 super_block_index_ * kUIntsPerSuperBlock +
128 memory_block_index_ * kSlabsPerBlock + lane_id,
129 memory_block_bitmap_,
130 memory_block_bitmap_ | (1 << empty_lane));
131 if (read_bitmap == memory_block_bitmap_) {
133 memory_block_bitmap_ |= (1 << empty_lane);
135 (super_block_index_ << kSuperBlockMaskBits) |
136 (memory_block_index_ << kBlockMaskBits) |
137 (lane_id << kSlabMaskBits) | empty_lane;
140 memory_block_bitmap_ = read_bitmap;
145 __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
147 return allocated_result;
154 atomicAnd(super_blocks_ +
155 getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
156 getMemBlockIndex(ptr) * kSlabsPerBlock +
157 (getMemUnitIndex(ptr) >> 5),
158 ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
162 __device__ __host__ __forceinline__
uint32_t 163 getSuperBlockIndex(
addr_t address)
const {
164 return address >> kSuperBlockMaskBits;
166 __device__ __host__ __forceinline__
uint32_t 167 getMemBlockIndex(
addr_t address)
const {
168 return ((address >> kBlockMaskBits) & 0x1FFFF);
170 __device__ __host__ __forceinline__
addr_t 171 getMemBlockAddress(
addr_t address)
const {
172 return (kBitmapsPerSuperBlock +
173 getMemBlockIndex(address) * kUIntsPerBlock);
175 __device__ __host__ __forceinline__
uint32_t 176 getMemUnitIndex(
addr_t address)
const {
177 return address & 0x3FF;
179 __device__ __host__ __forceinline__
addr_t 180 getMemUnitAddress(
addr_t address) {
181 return getMemUnitIndex(address) * kWarpSize;
185 __device__
void createMemBlockIndex(
uint32_t global_warp_id) {
186 super_block_index_ = global_warp_id % kSuperBlocks;
187 memory_block_index_ = (hash_coef_ * global_warp_id) >>
188 (32 - kBlocksPerSuperBlockInBits);
192 __device__
void updateMemBlockIndex(
uint32_t global_warp_id) {
194 super_block_index_++;
196 (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
197 memory_block_index_ = (hash_coef_ * (global_warp_id + num_attempts_)) >>
198 (32 - kBlocksPerSuperBlockInBits);
200 memory_block_bitmap_ =
201 *((super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
202 memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1f));
205 __host__ __device__
addr_t addressDecoder(
addr_t address_ptr_index) {
206 return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
207 getMemBlockAddress(address_ptr_index) +
208 getMemUnitIndex(address_ptr_index) * kWarpSize;
211 __host__ __device__
void print_address(
addr_t address_ptr_index) {
212 printf(
"Super block Index: %d, Memory block index: %d, Memory unit " 215 getSuperBlockIndex(address_ptr_index),
216 getMemBlockIndex(address_ptr_index),
217 getMemUnitIndex(address_ptr_index));
241 std::mt19937 rng(time(0));
242 gpu_context_.hash_coef_ = rng();
246 gpu_context_.super_blocks_ =
248 kUIntsPerSuperBlock * kSuperBlocks *
sizeof(
uint32_t),
252 gpu_context_.super_blocks_, 0xFF,
253 kUIntsPerSuperBlock * kSuperBlocks *
sizeof(
uint32_t)));
255 for (
uint32_t i = 0; i < kSuperBlocks; i++) {
258 gpu_context_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
259 kBlocksPerSuperBlock * kSlabsPerBlock *
sizeof(
uint32_t)));
268 const uint32_t num_super_blocks = kSuperBlocks;
270 thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
271 thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
275 int num_mem_units = kBlocksPerSuperBlock * 32;
276 int num_cuda_blocks =
277 (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
278 CountSlabsPerSuperblockKernel<<<num_cuda_blocks, kThreadsPerBlock>>>(
280 thrust::raw_pointer_cast(slabs_per_superblock.data()));
284 std::vector<int> result(num_super_blocks);
285 thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
288 return std::move(result);
298 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
300 int num_bitmaps = kBlocksPerSuperBlock * 32;
301 if (tid >= num_bitmaps) {
305 for (
uint32_t i = 0; i < kSuperBlocks; i++) {
307 atomicAdd(&slabs_per_superblock[i], __popc(read_bitmap));
uint32_t * super_blocks_
A pointer to each super-block.
Definition: InternalNodeManager.h:222
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
Definition: InternalNodeManager.h:97
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
static void Free(void *ptr, const Device &device)
Definition: MemoryManager.cpp:44
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:57
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:40
__device__ void FreeUntouched(addr_t ptr)
Definition: InternalNodeManager.h:153
Device device_
Definition: InternalNodeManager.h:293
addr_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
Definition: InternalNodeManager.h:64
std::vector< int > CountSlabsPerSuperblock()
Definition: InternalNodeManager.h:267
InternalNodeManager(const Device &device)
Definition: InternalNodeManager.h:239
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
Definition: InternalNodeManager.h:87
Definition: InternalNodeManager.h:67
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
Definition: InternalNodeManager.h:80
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
Definition: InternalNodeManager.h:224
InternalNodeManagerContext gpu_context_
Definition: InternalNodeManager.h:292
Definition: PinholeCameraIntrinsic.cpp:35
uint32_t addr_t
Definition: HashmapBuffer.h:58
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const addr_t &next_slab_ptr, const uint32_t &lane_id)
Definition: InternalNodeManager.h:76
Definition: InternalNodeManager.h:58
~InternalNodeManager()
Definition: InternalNodeManager.h:263
addr_t kv_pair_ptrs[kWarpSize - 1]
Definition: InternalNodeManager.h:62
InternalNodeManagerContext()
Definition: InternalNodeManager.h:69
__global__ void CountSlabsPerSuperblockKernel(InternalNodeManagerContext context, uint32_t *slabs_per_superblock)
Definition: InternalNodeManager.h:296
Definition: InternalNodeManager.h:237